## findnewlines 函数：输入两个 csv，输出增量文件 PCG_delta.csv

In [16]:
import pandas as pd
def findnewlines(file_path_new, file_path_old, out_path="PCG_delta.csv"):
    """Write the rows that appear in one CSV export but not in the other.

    A row is identified by the raw text of its first four columns.  The file
    holding more rows is treated as the newer export regardless of argument
    order (on a tie ``file_path_new`` wins); the delta is every row of that
    file whose key does not occur in the other one.  This size-based rule is
    kept for backward compatibility -- note that if the genuinely newer
    export has *fewer* rows, the rows reported come from the older file.

    Args:
        file_path_new: Path of the CSV expected to be the newer export.
        file_path_old: Path of the CSV expected to be the older export.
        out_path: Where the delta CSV is written.

    Returns:
        The delta rows as a DataFrame (all values read as text); the same
        rows are written to ``out_path``.
    """
    # `display` is only injected as a builtin inside IPython/Jupyter; fall
    # back to print so the function also runs as a plain script.
    try:
        from IPython.display import display
    except ImportError:
        display = print

    # Read every cell as text: with dtype inference the same year can become
    # "2024" in one file and "2024.0" in the other (when only one of them has
    # a blank in that column), which made identical rows look different.
    frame_new = pd.read_csv(file_path_new, dtype=str)
    frame_old = pd.read_csv(file_path_old, dtype=str)

    # The larger export supplies the candidate rows, the smaller one the
    # already-known keys.
    if frame_new.shape[0] >= frame_old.shape[0]:
        reference, candidate = frame_old, frame_new
    else:
        reference, candidate = frame_new, frame_old

    def row_keys(frame):
        # itertuples (unlike apply(tuple, axis=1)) also behaves on an empty frame.
        key_frame = frame[frame.columns[:4]].fillna("")
        return list(key_frame.itertuples(index=False, name=None))

    known_keys = set(row_keys(reference))
    is_new = pd.Series(
        [key not in known_keys for key in row_keys(candidate)],
        index=candidate.index,
        dtype=bool,
    )

    delta = candidate[is_new].copy()
    print("新增行数:", len(delta))
    display(delta)

    delta.to_csv(out_path, index=False)
    print(f"Saved to {out_path}")
    return delta

In [17]:
# Compare the two exports; the rows found to be new are written to PCG_delta.csv.
findnewlines("PCG1.csv","PCG2.csv")

新增行数: 2


Unnamed: 0,Key,Item Type,Publication Year,Author,Title,Publication Title,ISBN,ISSN,DOI,Url,...,Programming Language,Version,System,Code,Code Number,Section,Session,Committee,History,Legislative Body
15,9Q8Z9AE3,preprint,2025,"Xu, Chenyang; Li, Siming; Wang, Hao",H-LDM: Hierarchical Latent Diffusion Models fo...,,,,10.48550/arXiv.2511.14312,http://arxiv.org/abs/2511.14312,...,,,,,,,,,,
16,J8DDHQUU,preprint,2025,"Ibrahim, Mustafa Fuad Rifet; Alkanat, Tunc; Me...",Prototyping an End-to-End Multi-Modal Tiny-CNN...,,,,10.48550/arXiv.2510.18668,http://arxiv.org/abs/2510.18668,...,,,,,,,,,,


Saved to PCG_delta.csv


## embedding函数，输入增量PCG_delta.csv，函数返回embedding好的dataFrame

In [None]:
import re
from typing import Optional, Dict, Any
from sentence_transformers import SentenceTransformer
from hsfs.embedding import EmbeddingIndex
from functions.zotero_parser import ZoteroCSVParser


# A four-digit 19xx/20xx year that is not part of a longer digit run
# (so "12019" in an identifier is not read as the year 2019).
_YEAR_PATTERN = re.compile(r"(?<!\d)(?:19|20)\d{2}(?!\d)")


def _as_text(value: Any) -> str:
    """Return ``value`` as a stripped string; None and float NaN become ''."""
    if value is None:
        return ""
    if isinstance(value, float) and value != value:  # NaN is the only float unequal to itself
        return ""
    return str(value).strip()


def sanitize_paper_metadata(paper: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Defensive metadata sanitation.

    The dict is repaired in place and returned; missing (None / NaN) values
    are tolerated in every field instead of raising.

    Args:
        paper: Parsed paper record. The keys read are ``title``, ``year``,
            ``url``, ``abstract``, ``authors`` and ``file_attachments``.

    Returns:
        The same ``paper`` dict after repair, or None if the paper is
        considered invalid (no usable title).
    """

    # ---- 1. Mandatory Field Validation ----
    title = _as_text(paper.get("title"))
    if not title:
        return None  # Equivalent to RDF version: discard if title is missing

    paper["title"] = title

    # ---- 2. Year Repair (Reusing regex logic from RDF) ----
    # A None, NaN or blank year counts as missing.
    if not _as_text(paper.get("year")):
        # Attempt to recover year from other metadata fields
        for field in ("url", "abstract"):
            match = _YEAR_PATTERN.search(_as_text(paper.get(field)))
            if match:
                paper["year"] = int(match.group())
                break

    # ---- 3. Authors Fallback ----
    authors = _as_text(paper.get("authors"))
    if not authors or authors.lower() == "nan":
        paper["authors"] = "Unknown"

    # ---- 4. Abstract Normalization ----
    # Only touch the key when it exists, so an absent abstract stays absent.
    if "abstract" in paper:
        abstract = _as_text(paper["abstract"])
        if not abstract or abstract.lower() in {"nan", "none"}:
            paper["abstract"] = ""

    # ---- 5. Attachments Handling ----
    # Preserve original state but ensure the value is a string
    attachments = paper.get("file_attachments")
    is_missing = attachments is None or (
        isinstance(attachments, float) and attachments != attachments
    )
    paper["file_attachments"] = "" if is_missing else str(attachments)

    return paper

# === Cell 3: Parse Zotero CSV ===
# Parse the delta export, then keep only the records that survive sanitation
# (sanitize_paper_metadata returns None for records it rejects).
parser = ZoteroCSVParser("PCG_delta.csv")
raw_papers = parser.parse()

papers = [
    cleaned
    for cleaned in map(sanitize_paper_metadata, raw_papers)
    if cleaned is not None
]

print(f"Parsed {len(papers)} papers.")
print(papers[:2])


# ====== Configuration ======
PROJECT_NAME = "airconditiondetection"   # Hopsworks project name (the one shown in the login logs)
FG_NAME = "zotero_csv_meta_fg"
FG_VERSION = 1

CSV_PATH = "PCG_delta.csv"   # the delta CSV saved earlier (the original full CSV can be used as well)
MODEL_NAME = "all-MiniLM-L6-v2"

ID_COL = "Key"               # column that appears to hold the unique per-paper key in the CSV
TITLE_COL = "Title"
ABSTRACT_COL = "Abstract"    # Zotero CSVs usually carry an "Abstract Note"/"Abstract" field; if the name differs, the code below falls back automatically. NOTE(review): this constant is not referenced by the code visible here.
AUTHOR_COL = "Author"
YEAR_COL = "Publication Year"

def pick_col(df, candidates):
    """Return the first name in ``candidates`` that is a column of ``df``, or None if none is."""
    available = df.columns
    return next((name for name in candidates if name in available), None)





def embedding_new_lines(file_path_PCG_delta):
    """Load a delta CSV and attach a sentence embedding to every row.

    For each row a text of the form
    ``Title: ...\\nAuthors: ...\\nYear: ...\\nAbstract: ...`` is built from the
    columns that can be located (several alternative column names are tried
    per field) and encoded with the ``MODEL_NAME`` sentence-transformer.

    Args:
        file_path_PCG_delta: Path of the CSV to embed.

    Returns:
        Tuple ``(df, id_col)``: the loaded frame with two added columns,
        ``combined_text`` (str) and ``embedding`` (list of floats per row),
        and the name of the column selected as the row identifier.

    Raises:
        ValueError: If no identifier column or no title column is found.
    """
    df = pd.read_csv(file_path_PCG_delta)
    print("Loaded:", df.shape)

    # Tolerate the different column names Zotero CSV exports use.
    abstract_col = pick_col(df, ["Abstract", "Abstract Note", "abstract", "AbstractNote"])
    title_col = pick_col(df, [TITLE_COL, "title"])
    author_col = pick_col(df, [AUTHOR_COL, "Authors", "author"])
    year_col = pick_col(df, [YEAR_COL, "Year", "year"])
    id_col = pick_col(df, [ID_COL, "paper_id", "id", "ID"])

    if id_col is None or title_col is None:
        raise ValueError(f"找不到主键列或标题列。当前列名: {list(df.columns)[:30]} ...")

    def cell_text(row, col):
        """Text of one cell; a missing column or missing value yields ''."""
        if col is None:
            return ""
        value = row.get(col, "")
        # NaN is truthy, so `value or ""` would let it through as the text "nan".
        if pd.isna(value):
            return ""
        # A numeric column containing blanks is read as float: show 2025, not 2025.0.
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        return str(value)

    # Assemble combined_text (fields that are unavailable are left empty).
    def build_text(row):
        t = cell_text(row, title_col)
        a = cell_text(row, abstract_col)
        au = cell_text(row, author_col)
        y = cell_text(row, year_col)
        return f"Title: {t}\nAuthors: {au}\nYear: {y}\nAbstract: {a}"

    texts = [build_text(row) for _, row in df.iterrows()]
    df["combined_text"] = pd.Series(texts, index=df.index, dtype=object)

    # Compute the embeddings in one batched call rather than one call per row.
    model = SentenceTransformer(MODEL_NAME)
    vectors = model.encode(texts) if texts else []
    df["embedding"] = pd.Series(
        [vector.tolist() for vector in vectors], index=df.index, dtype=object
    )
    return df,id_col







Parsed 2 papers.
[{'paper_id': '9Q8Z9AE3', 'title': 'H-LDM: Hierarchical Latent Diffusion Models for Controllable and Interpretable PCG Synthesis from Clinical Metadata', 'authors': 'Xu, Chenyang; Li, Siming; Wang, Hao', 'year': 2025, 'abstract': 'Phonocardiogram (PCG) analysis is vital for cardiovascular disease diagnosis, yet the scarcity of labeled pathological data hinders the capability of AI systems. To bridge this, we introduce H-LDM, a Hierarchical Latent Diffusion Model for generating clinically accurate and controllable PCG signals from structured metadata. Our approach features: (1) a multi-scale VAE that learns a physiologically-disentangled latent space, separating rhythm, heart sounds, and murmurs; (2) a hierarchical text-to-biosignal pipeline that leverages rich clinical metadata for fine-grained control over 17 distinct conditions; and (3) an interpretable diffusion process guided by a novel Medical Attention module. Experiments on the PhysioNet CirCor dataset demonstra

In [24]:
# Embed the delta CSV; keep the enriched frame and the detected id column for the upload step.
dfnewline,idcol = embedding_new_lines(CSV_PATH)
# Sanity check: print the embedding vector of the first row.
print(dfnewline['embedding'][0])

Loaded: (2, 87)
[-0.001899355323985219, -0.10555146634578705, -0.00418861722573638, 0.004847000818699598, 0.017331000417470932, -0.007304350845515728, -0.0669705867767334, -0.021474093198776245, 0.09013232588768005, -0.07281465083360672, -0.024480782449245453, -0.06061367690563202, -0.039333831518888474, 0.016197163611650467, -0.002743748016655445, -0.02723805420100689, 0.012326937168836594, 0.04321884363889694, -0.07688336074352264, -0.017152979969978333, 0.03107169270515442, 0.12440228462219238, 0.02820478193461895, 0.03167025372385979, -0.007569717708975077, 0.10119252651929855, 0.01305475179105997, -0.06521143764257431, -0.006049715913832188, -0.0006925948546268046, 0.10838209837675095, 0.020417004823684692, 0.05237722396850586, 0.0843726396560669, -0.11379626393318176, 0.07160447537899017, -0.008913003839552402, -0.007200879976153374, -0.06425443291664124, -0.003002593293786049, 0.0987556055188179, 0.04239178076386452, -0.011345172300934792, 0.05875229835510254, 0.0529686994850635

## 把dataframe 上传到hopsworks的函数

In [None]:
def sanitize_for_hopsworks(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` in which NaN entries are replaced by None.

    Intended to prepare a frame for writing to Hopsworks; the caller's frame
    is left untouched and no other conversion is performed.
    """
    # NaN -> None (null), applied to a private copy of the input.
    cleaned = df.copy().replace({np.nan: None})
    return cleaned


def upload_new_lines(df,id_col,):
    """Insert the rows of ``df`` into the Hopsworks feature group FG_NAME / FG_VERSION.

    The API key is taken from the ``HOPSWORKS_API_KEY`` environment variable
    instead of being embedded in the notebook.  The key that used to be
    hard-coded here has been exposed in source and should be revoked and
    replaced.

    Args:
        df: Frame to upload; NaN values are turned into None first.
        id_col: Name of the column used as the feature group's primary key.
    """
    import os  # local import so this cell does not depend on another cell importing it

    df = sanitize_for_hopsworks(df)  # null handling before the write

    # Write to Hopsworks. If the variable is unset, None is passed and
    # hopsworks.login() falls back to its own credential lookup.
    project = hopsworks.login(
        api_key_value=os.environ.get("HOPSWORKS_API_KEY")
    )
    fs = project.get_feature_store()

    # NOTE(review): the EmbeddingIndex is created without registering any
    # embedding column on it -- confirm this is what the feature group needs.
    fg = fs.get_or_create_feature_group(
        name=FG_NAME,
        version=FG_VERSION,
        description="Zotero CSV metadata embeddings (no PDF)",
        primary_key=[id_col],
        embedding_index=EmbeddingIndex(),
    )

    fg.insert(df)
    print(f"Inserted {len(df)} rows into {FG_NAME}:{FG_VERSION}")

In [28]:
# numpy (as np) and hopsworks are looked up by sanitize_for_hopsworks / upload_new_lines
# when they run, so both must be imported before the call below.
import numpy as np
import hopsworks
# Push the embedded delta rows to the feature group.
upload_new_lines(dfnewline,idcol)

2026-01-11 10:42:38,489 INFO: Initializing external client
2026-01-11 10:42:38,489 INFO: Base URL: https://c.app.hopsworks.ai:443
2026-01-11 10:42:40,162 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1282196




Uploading Dataframe: 100.00% |██████████| Rows 2/2 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: zotero_csv_meta_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1282196/jobs/named/zotero_csv_meta_fg_1_offline_fg_materialization/executions
Inserted 2 rows into zotero_csv_meta_fg:1


## 从 Hopsworks 下载已有内容为 df，并保存为 csv

In [47]:
def download_fgtocsv(project_name = PROJECT_NAME, fg_name = FG_NAME , fg_version =FG_VERSION ,
                       out_path: str = "PCG_old.csv"):
    """Read a Hopsworks feature group and save it as a CSV without the derived columns.

    The API key is taken from the ``HOPSWORKS_API_KEY`` environment variable
    instead of being embedded in the notebook.  The key that used to be
    hard-coded here has been exposed in source and should be revoked and
    replaced.

    Args:
        project_name: Hopsworks project to log in to.
        fg_name: Name of the feature group to read.
        fg_version: Version of the feature group to read.
        out_path: Destination of the CSV file.

    Returns:
        The downloaded DataFrame with the ``embedding`` and ``combined_text``
        columns removed when present.
    """
    import os  # local import so this cell does not depend on another cell importing it

    # If the variable is unset, None is passed and hopsworks.login() falls
    # back to its own credential lookup.
    project = hopsworks.login(
        project=project_name,
        api_key_value=os.environ.get("HOPSWORKS_API_KEY"),
    )
    fs = project.get_feature_store()
    fg = fs.get_feature_group(fg_name, fg_version)
    df = fg.read()

    # Drop the columns added by the embedding step; ignore those that are absent.
    df = df.drop(columns=["embedding", "combined_text"], errors="ignore")

    df.to_csv(out_path, index=False)
    print(f"Downloaded {len(df)} rows from {fg_name} v{fg_version} -> {out_path}")
    return df
    


In [48]:
# Show the project name that download_fgtocsv uses as its default.
print(PROJECT_NAME)

airconditiondetection


In [49]:
# Download with all defaults: the CSV is written to PCG_old.csv and the frame is kept for inspection.
df_fg = download_fgtocsv()



2026-01-11 11:08:46,565 INFO: Closing external client and cleaning up certificates.
Connection closed.
2026-01-11 11:08:46,569 INFO: Initializing external client
2026-01-11 11:08:46,570 INFO: Base URL: https://c.app.hopsworks.ai:443
2026-01-11 11:08:47,922 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1282196
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.76s) 
Downloaded 4 rows from zotero_csv_meta_fg v1 -> PCG_old.csv


In [50]:
# Check the (rows, columns) size of the downloaded frame.
print(df_fg.shape)

(4, 87)
