In [2]:
dir = 'D:/DataforPractice/ContentNovelty/'

In [None]:
### Convert .pasrquet to .csv

import pandas as pd

df = pd.read_parquet(dir+'3_external_with_novelty.parquet')
df.to_csv(dir+'3_external_with_novelty.csv', index=False, encoding='utf-8-sig')

df = pd.read_parquet(dir+'3_internal_with_novelty.parquet')
df.to_csv(dir+'3_internal_with_novelty.csv', index=False, encoding='utf-8-sig')

In [2]:
import pandas as pd 
affinst = pd.read_csv(dir+'affinst_ed.csv')
affinst

Unnamed: 0.1,Unnamed: 0,pubid,EU_NUTS_ID,period,type,subject,ID,count
0,1,2010.0,ES415,1,Internal,Pharmacology & Pharmacy,1-ES415-Pharmacology & Pharmacy,55
1,2,2013.0,ES111,1,Internal,Pharmacology & Pharmacy,1-ES111-Pharmacology & Pharmacy,80
2,3,2015.0,UKJ36,1,Internal,Pharmacology & Pharmacy,1-UKJ36-Pharmacology & Pharmacy,22
3,4,3080.0,NL230,1,External,Immunology,1-NL230-Immunology,32
4,5,3080.0,NL230,1,External,Research & Experimental Medicine,1-NL230-Research & Experimental Medicine,13
...,...,...,...,...,...,...,...,...
23668902,23668903,46608429.0,PT11A,6,Internal,Microbiology,6-PT11A-Microbiology,81
23668903,23668904,46608438.0,NL337,6,External,Rheumatology,6-NL337-Rheumatology,52
23668904,23668905,46608438.0,FR101,6,External,Rheumatology,6-FR101-Rheumatology,82
23668905,23668906,46608438.0,PT170,6,External,Rheumatology,6-PT170-Rheumatology,21


In [2]:
publication = pd.read_csv(dir+'publication_ed.csv')
publication

Unnamed: 0.1,Unnamed: 0,pubid,abstract
0,1,2010.0,The histamine H-2 receptor antagonistic activi...
1,2,2012.0,We have investigated the ability of several co...
2,3,2013.0,For several years we have been working on the ...
3,4,2015.0,The structural and ionic requirements for pote...
4,5,2019.0,The effect of taxol on selected lysosomal enzy...
...,...,...,...
10032662,10032663,46608401.0,A hallmark of chronic bacterial infections is ...
10032663,10032664,46608410.0,Purpose We sought to describe a disorder clini...
10032664,10032665,46608423.0,Background For patients with early American Jo...
10032665,10032666,46608429.0,Aim of this study The major aim of this work w...


In [9]:
import os
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_distances

### path settings
SAVE_PATH = r"D:/LLM/specter"                          
DATA_DIR = r"D:/DataforPractice/ContentNovelty/"       
OUT_KNOW = os.path.join(DATA_DIR, "2_knowledge_spaces.csv")
OUT_CENT = os.path.join(DATA_DIR, "2_centroids.csv")
OUT_EXT  = os.path.join(DATA_DIR, "3_external_with_novelty.csv")

### Load LLM
tokenizer = AutoTokenizer.from_pretrained(SAVE_PATH)
model = AutoModel.from_pretrained(SAVE_PATH)
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Device:", device)
print("Model on:", next(model.parameters()).device)

# ========= Utils =========
def encode_texts(texts, batch_size=16, max_length=512):
    """SPECTER CLS 임베딩을 배치로 반환 (np.ndarray, shape=[N, D])."""
    all_vecs = []
    for s in tqdm(range(0, len(texts), batch_size), desc="Embedding"):
        batch = texts[s:s+batch_size]
        inputs = tokenizer(
            batch, return_tensors="pt", truncation=True,
            padding=True, max_length=max_length
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            cls = outputs.last_hidden_state[:, 0, :]        # [CLS]
        all_vecs.append(cls.detach().cpu().numpy())
    return np.vstack(all_vecs) if all_vecs else np.empty((0, model.config.hidden_size))

def vec_to_str(v: np.ndarray) -> str:
    """CSV 저장용: 공백으로 join (읽을 때는 str.split 후 float 변환)."""
    return " ".join(f"{x:.6f}" for x in v.tolist())

def str_to_vec(s: str) -> np.ndarray:
    return np.array([float(x) for x in s.split(" ")])

# ========= Data =========
affinst = pd.read_csv(os.path.join(DATA_DIR, "affinst_ed.csv"))
publication = pd.read_csv(os.path.join(DATA_DIR, "publication_ed.csv"))

ID_list = affinst['ID'].unique().tolist()

# ========= Pass 1: INTERNAL 임베딩 & 센트로이드 =========
rows_know = []   # 세부 관측치(knowledge spaces)
rows_cent = []   # (EU_NUTS_ID, period, subject)별 센트로이드

for the_id in ID_list:
    print("\n=== INTERNAL | ID:", the_id, "===")
    df_int = (
        affinst.loc[affinst["ID"] == the_id]
        .query('type == "Internal"')
        .merge(publication, on="pubid", how="inner")
        .copy()
    )
    # 제목/초록 정리
    df_int = df_int[df_int["abstract"].notna() & (df_int["abstract"].str.strip() != "")]
    if df_int.empty:
        print("  -> No Internal rows after filtering. Skipping.")
        continue

    # df_int["input_text"] = df_int["title"].fillna("") + " " + df_int["abstract"].fillna("")
    df_int["input_text"] = df_int["abstract"].fillna("")
    # 임베딩
    vecs = encode_texts(df_int["input_text"].tolist(), batch_size=16)
    df_int["__emb_vec"] = list(vecs)

    # summary of knowledge_spaces
    for _, r in df_int.iterrows():
        rows_know.append({
            "ID": the_id,
            "EU_NUTS_ID": r["EU_NUTS_ID"],
            "period": r["period"],
            "subject": r["subject"],
            "pubid": r["pubid"],
            "embedding": vec_to_str(r["__emb_vec"]),   # 직렬화
        })

    # period-region-subject level centroid
    grp = df_int.groupby(["EU_NUTS_ID", "period", "subject"], dropna=False)["__emb_vec"].apply(list)
    for (nuts, per, subj), vec_list in grp.items():
        mat = np.vstack(vec_list) if len(vec_list) else np.empty((0, model.config.hidden_size))
        # Remove NaN
        if mat.size == 0:
            continue
        mat = mat[~np.isnan(mat).any(axis=1)]
        if mat.shape[0] == 0:
            continue
        centroid = mat.mean(axis=0)
        rows_cent.append({
            "ID": the_id,
            "EU_NUTS_ID": nuts,
            "period": per,
            "subject": subj,
            "centroid": vec_to_str(centroid),
            "n_docs": mat.shape[0],
        })

df_know = pd.DataFrame(rows_know)
df_cent = pd.DataFrame(rows_cent)
if not df_know.empty:
    df_know.to_csv(OUT_KNOW, index=False)
if not df_cent.empty:
    df_cent.to_csv(OUT_CENT, index=False)
print(f"\nSaved:\n - {OUT_KNOW} ({len(df_know)} rows)\n - {OUT_CENT} ({len(df_cent)} rows)")

# ========= Pass 2: EXTERNAL 임베딩 & novelty(centroid와의 코사인 거리) =========
# centroid 딕셔너리: (EU_NUTS_ID, period, subject) → np.ndarray
centroids = {
    (row["EU_NUTS_ID"], row["period"], row["subject"]): str_to_vec(row["centroid"])
    for _, row in df_cent.iterrows()
}

rows_ext = []
for the_id in ID_list:
    print("\n=== EXTERNAL | ID:", the_id, "===")
    df_ext = (
        affinst.loc[affinst["ID"] == the_id]
        .query('type == "External"')
        .merge(publication, on="pubid", how="inner")
        .copy()
    )
    df_ext = df_ext[df_ext["abstract"].notna() & (df_ext["abstract"].str.strip() != "")]
    if df_ext.empty:
        print("  -> No External rows after filtering. Skipping.")
        continue

    # df_ext["input_text"] = df_ext["title"].fillna("") + " " + df_ext["abstract"].fillna("")
    df_ext["input_text"] = df_ext["abstract"].fillna("")
    vecs_ext = encode_texts(df_ext["input_text"].tolist(), batch_size=16)
    df_ext["__emb_vec"] = list(vecs_ext)

    def compute_novelty(row):
        key = (row['EU_NUTS_ID'], row['period'], row['subject'])
        centroid = centroids.get(key)
        if centroid is None:
            return np.nan
        v = row["__emb_vec"]
        if np.isnan(v).any():
            return np.nan
        return float(cosine_distances(v.reshape(1, -1), centroid.reshape(1, -1))[0][0])

    df_ext["content_novelty"] = df_ext.apply(compute_novelty, axis=1)

    rows_ext.extend(df_ext[[
        "ID","EU_NUTS_ID","period","subject","pubid","content_novelty"
    ]].to_dict("records"))

df_ext_out = pd.DataFrame(rows_ext)
if not df_ext_out.empty:
    df_ext_out.to_csv(OUT_EXT, index=False)
    print(f"\nSaved:\n - {OUT_EXT} ({len(df_ext_out)} rows)")
else:
    print("\nNo External novelty rows to save.")


Device: cuda
Model on: cuda:0

=== INTERNAL | ID: 1-ES415-Pharmacology & Pharmacy ===


  return forward_call(*args, **kwargs)
Embedding: 100%|██████████| 4/4 [02:21<00:00, 35.30s/it]



=== INTERNAL | ID: 1-NL327-Pharmacology & Pharmacy ===


  return forward_call(*args, **kwargs)
Embedding: 100%|██████████| 1/1 [00:11<00:00, 11.25s/it]



Saved:
 - D:/DataforPractice/ContentNovelty/2_knowledge_spaces.csv (61 rows)
 - D:/DataforPractice/ContentNovelty/2_centroids.csv (2 rows)

=== EXTERNAL | ID: 1-ES415-Pharmacology & Pharmacy ===


  return forward_call(*args, **kwargs)
Embedding: 100%|██████████| 3/3 [02:02<00:00, 40.75s/it]



=== EXTERNAL | ID: 1-NL327-Pharmacology & Pharmacy ===


  return forward_call(*args, **kwargs)
Embedding: 100%|██████████| 3/3 [01:46<00:00, 35.43s/it]


Saved:
 - D:/DataforPractice/ContentNovelty/3_external_with_novelty.csv (70 rows)





In [None]:
import pandas as pd
from transformers import AutoModel, AutoTokenizer
import torch
from tqdm import tqdm
from sklearn.preprocessing import normalize
import numpy as np

### Load model
# SAVE_PATH = r"D:/LLM/Llama4-Scout"             
SAVE_PATH = r"D:/LLM/specter"             

tokenizer = AutoTokenizer.from_pretrained(SAVE_PATH)
model = AutoModel.from_pretrained(SAVE_PATH) #, torch_dtype=torch.float16, device_map="auto"

model.eval()

# GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)  
print(next(model.parameters()).device)  

# Function to get SPECTER embedding
def get_specter_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token
    return embedding.squeeze().cpu().numpy()

affinst = pd.read_csv(dir+'affinst_ed.csv')
publication = pd.read_csv(dir+'publication_ed.csv')

ID_list = ['1-ES415-Pharmacology & Pharmacy','1-NL327-Pharmacology & Pharmacy']

for i in ID_list:
    
    print(i)  

    df = affinst[affinst['ID'] == i].copy()
    df = df[df.type=="Internal"]
    df = pd.merge(df, publication, on='pubid', how='inner')
    df = df[df['abstract'].notna()]

    embeddings_test = []
    for text in tqdm(df['abstract'], desc="Embedding test group"):
        emb = get_specter_embedding(text)
        embeddings_test.append(emb)

    # Store embeddings
    df['embedding'] = embeddings_test

    # Compute centroid
    test_vectors = np.stack(df['embedding'].to_numpy())
    test_centroid = np.mean(test_vectors, axis=0)
    
    from collections import defaultdict
    import numpy as np

    knowledge_spaces = defaultdict(list)

    for _, row in df.iterrows():
        key = (row['EU_NUTS_ID'], row['period'], row['subject'])
        knowledge_spaces[key].append(row['embedding'])

    # Compute centroids for each group
    centroids = {k: np.mean(vectors, axis=0) for k, vectors in knowledge_spaces.items()}

    knowledge_spaces.to_csv('2_knowledge_spaces.csv', index=False)
    centroids.to_csv('2_centroids.csv', index=False)

# Output
print("\n✅ Embedding and centroid calculation complete.")
print("Number of documents in group:", len(df))
print("Centroid shape:", test_centroid.shape) # 768-dimensional vector
print("Sample of centroid values:", test_centroid[:10]) # the first 10 values of the 768-dimensional centroid vector


  from .autonotebook import tqdm as notebook_tqdm
NVIDIA GeForce RTX 5090 with CUDA capability sm_120 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_61 sm_70 sm_75 sm_80 sm_86 sm_90 compute_37.
If you want to use the NVIDIA GeForce RTX 5090 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



cuda
cuda:0


TypeError: unsupported operand type(s) for +: 'builtin_function_or_method' and 'str'

In [12]:
df['abstract']

0     The histamine H-2 receptor antagonistic activi...
1     Cyclosporin A (CsA) is a potent immunosuppress...
2     Neurotrophins are molecules that regulate the ...
3     Starting with 12-acetoxy-7,9(11)-drimadiene, p...
4                                                   NaN
                            ...                        
90    1 An oxazolo(3,2-a)pyridine derivative P5, des...
91    A new GC-MS method for monitoring lignans was ...
92    Two new diterpcnic acids with an ent-halimane ...
93    Taking the natural cardenolides as a model for...
94    The cyclolignan family of natural products inc...
Name: abstract, Length: 95, dtype: object