In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import sklearn as sk

In [5]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"

import pandas as pd
import re
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
import spacy
import ast

# Load models: KeyBERT wrapped around a sentence-transformer encoder, plus the
# SciSpacy pipeline used for scientific NER.
print("🔄 Loading models...")
sentence_encoder = SentenceTransformer("all-MiniLM-L6-v2")
kw_model = KeyBERT(model=sentence_encoder)
nlp = spacy.load("en_core_sci_sm")

# Load data: the papers table is read from a path relative to the working directory.
df = pd.read_csv("papers.csv")
print(f"📄 Loaded {len(df)} papers.")

# ----------------------------
# STEP 1: CLEANING TEXT
# ----------------------------
def clean_text(text):
    """Normalize raw text to lowercase alphanumeric words separated by single spaces.

    Steps, in order: lowercase, delete every character that is not ``a-z``,
    ``0-9`` or whitespace, then collapse whitespace runs and trim both ends.
    Whitespace is collapsed *after* punctuation removal so that a
    free-standing symbol (e.g. the dash in ``"a - b"``) cannot leave a
    double space behind.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``;
            ``None`` and float NaN are treated as empty text.

    Returns:
        str: The cleaned text (possibly empty).
    """
    # `text != text` is only true for NaN; this keeps missing values from
    # being turned into the literal words "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    # Strip punctuation/symbols first ...
    text = re.sub(r"[^a-z0-9\s]", "", text)
    # ... then squeeze whatever whitespace runs remain into single spaces.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Missing abstracts become "" before cleaning, so clean_text only ever sees strings.
raw_abstracts = df["abstract"].fillna("")
df["clean_abstract"] = raw_abstracts.apply(clean_text)

# ----------------------------
# STEP 2: EXTRACT KEYWORDS
# ----------------------------
def extract_keywords(text, top_n=5, min_chars=50):
    """Return the top keyphrases that ``kw_model`` extracts from ``text``.

    Args:
        text: Text to extract keyphrases from.
        top_n: Number of keyphrases to request from the model. Defaults to 5,
            the value that used to be hard-coded.
        min_chars: Texts shorter than this many characters are skipped
            without calling the model. Defaults to 50, the value that used
            to be hard-coded.

    Returns:
        list: The first element of every item returned by
        ``kw_model.extract_keywords`` (presumably ``(phrase, score)`` pairs --
        confirm against the KeyBERT version in use). Empty when ``text`` is
        empty/falsy or shorter than ``min_chars``.
    """
    # Guard first so empty or very short texts never reach the model.
    if not text or len(text) < min_chars:
        return []
    scored_phrases = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 2),  # single words and two-word phrases
        stop_words="english",
        top_n=top_n,
    )
    return [item[0] for item in scored_phrases]

print("🔍 Extracting keywords...")
# One extract_keywords call per cleaned abstract; each cell of the new column holds a list.
abstracts_for_keywords = df["clean_abstract"]
df["keywords"] = abstracts_for_keywords.apply(extract_keywords)

# ----------------------------
# STEP 3: ENTITY RECOGNITION (SciSpacy)
# ----------------------------
def extract_entities(text):
    """Return the distinct entity strings the ``nlp`` pipeline finds in ``text``.

    Entities whose text is two characters or shorter are dropped. Duplicates
    are removed while keeping first-occurrence order, so the result is the
    same on every run (iterating a ``set`` of strings is not, because string
    hashing is randomized per interpreter process).

    Args:
        text: Text to run through the pipeline. ``None`` and empty or
            whitespace-only strings are skipped without calling the pipeline.

    Returns:
        list: Unique entity strings in order of first appearance; empty when
        nothing qualifies or the input was skipped.
    """
    if text is None or (isinstance(text, str) and not text.strip()):
        return []
    doc = nlp(text)
    long_enough = (ent.text for ent in doc.ents if len(ent.text) > 2)
    # dict keys are unique and keep insertion order -> deterministic de-dup.
    return list(dict.fromkeys(long_enough))

print("🔬 Extracting scientific entities...")
# One extract_entities call per cleaned abstract; each cell of the new column holds a list.
abstracts_for_entities = df["clean_abstract"]
df["entities"] = abstracts_for_entities.apply(extract_entities)

# ----------------------------
# STEP 4: SAVE ENRICHED DATA
# ----------------------------
# Persist the enriched table; index=False keeps the row index out of the file.
output_csv = "papers_enriched.csv"
df.to_csv(output_csv, index=False)
print("✅ Enriched metadata saved to papers_enriched.csv")

# Optional: preview the first row, restricted to the columns of interest.
print("\n🔹 Sample enriched record:")
preview_columns = ["title", "keywords", "entities"]
sample_record = df[preview_columns].head(1)
print(sample_record.to_string(index=False))


🔄 Loading models...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

RuntimeError: Failed to import transformers.models.chameleon.configuration_chameleon because of the following error (look up to see its traceback):
No module named 'transformers.models.chameleon.configuration_chameleon'