In [1]:
import pandas as pd

# Load and select a small batch for fine-tuning
df = pd.read_csv("clustered_papers.csv").dropna(subset=["title", "clean_abstract"])
df = df.head(20)  # fine-tune on just 20 samples

# Format input/output pairs
def build_input(row):
    return f"Title: {row['title']}\nAuthors: {row.get('authors', 'Unknown')}\nKeywords: {row.get('keywords', '')}\nAbstract: {row['clean_abstract']}"

def build_target(row):
    return f"This paper discusses {row.get('keywords', 'key topics')} and contributes to cluster {row.get('gmm_cluster', 'X')} research."

dataset = [{"input": build_input(row), "target": build_target(row)} for _, row in df.iterrows()]


In [None]:
import requests

CORE_API_KEY = "ZLQojgG1uJDYRdprWS8UhEzsIPM03cNi"  # 🔁 Replace with your actual key

query = "deep learning"

url = "https://api.core.ac.uk/v3/search/works"

params = {
    "q": query,             # ✅ Required search query
    "page": 1,
    "pageSize": 5,
    "metadata": True,
    "fulltext": True,
    "apiKey": CORE_API_KEY,
}

response = requests.get(url, params=params)

print("Status code:", response.status_code)
print("Response text:", response.text[:])  # Show only first 500 chars


Status code: 200
Response text: {"totalHits":1832062,"limit":10,"offset":0,"results":[{"acceptedDate":"","arxivId":null,"authors":[{"name":"Benito Picazo, Jes\u00fas"},{"name":"Dom\u00ednguez-Merino, Enrique"},{"name":"L\u00f3pez-Rubio, Ezequiel"},{"name":"Ortiz-de-lazcano-Lobato, Juan Miguel"},{"name":"Palomo, Esteban J."}],"citationCount":0,"contributors":["Jesus"],"outputs":["https:\/\/api.core.ac.uk\/v3\/outputs\/468593434","https:\/\/api.core.ac.uk\/v3\/outputs\/214840920","https:\/\/api.core.ac.uk\/v3\/outputs\/323333340"],"createdDate":"2019-07-09T14:23:56","dataProviders":[{"id":4786,"name":"","url":"https:\/\/api.core.ac.uk\/v3\/data-providers\/4786","logo":"https:\/\/api.core.ac.uk\/data-providers\/4786\/logo"},{"id":2072,"name":"","url":"https:\/\/api.core.ac.uk\/v3\/data-providers\/2072","logo":"https:\/\/api.core.ac.uk\/data-providers\/2072\/logo"},{"id":11082,"name":"","url":"https:\/\/api.core.ac.uk\/v3\/data-providers\/11082","logo":"https:\/\/api.core.ac.uk\/data-provi

In [7]:
# fetch_paper.py

import requests
import pandas as pd
import os

CORE_API_KEY = "ZLQojgG1uJDYRdprWS8UhEzsIPM03cNi"  # 🔁 Replace this with your actual CORE API key

def fetch_core_papers(query, max_results=10):
    url = "https://api.core.ac.uk/v3/search/works"
    params = {
        "q": query,
        "page": 1,
        "pageSize": max_results,
        "metadata": True,
        "fulltext": True,
        "apiKey": CORE_API_KEY,
    }

    try:
        res = requests.get(url, params=params)
        res.raise_for_status()
        data = res.json()
    except Exception as e:
        print(f"Error fetching data: {e}")
        return pd.DataFrame()  # Return empty DataFrame on error

    records = []
    for paper in data.get("data", []):
        records.append({
            "title": paper.get("title", ""),
            "authors": [author.get("name", "") for author in paper.get("authors", [])] if paper.get("authors") else [],
            "full_text": paper.get("fullText", "") or paper.get("description", ""),
            "year": paper.get("publishedDate", "")[:4] if paper.get("publishedDate") else "",
            "doi": paper.get("doi", ""),
            "url": paper.get("downloadUrl") or paper.get("identifier", ""),
        })

    return pd.DataFrame(records)

def save_to_csv(df, filename):
    """
    Saves the DataFrame to a CSV file.

    Args:
        df (pd.DataFrame): DataFrame to save.
        filename (str): Output CSV file path.
    """
    df.to_csv(filename, index=False)


In [9]:
print(fetch_core_papers('Natural Langage Processing'))

Empty DataFrame
Columns: []
Index: []


In [2]:
from transformers import EncoderDecoderModel, BertTokenizer, GPT2Tokenizer

# Load hybrid model
model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "mistral")
model.config.decoder_start_token_id = model.config.pad_token_id = 50256
model.config.eos_token_id = 50256

# Load tokenizers
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token


OSError: mistral is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
import torch
from torch.utils.data import Dataset

# Tokenization function
def encode(example):
    input_enc = bert_tokenizer(example["input"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    target_enc = gpt2_tokenizer(example["target"], padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    labels = target_enc.input_ids.clone()
    labels[labels == gpt2_tokenizer.pad_token_id] = -100  # ignore pad tokens in loss

    return {
        "input_ids": input_enc.input_ids[0],
        "attention_mask": input_enc.attention_mask[0],
        "decoder_input_ids": target_enc.input_ids[0],
        "labels": labels[0]
    }

class BERTGPTDataset(Dataset):
    def __init__(self, data):
        self.data = [encode(d) for d in data]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

train_dataset = BERTGPTDataset(dataset)


In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
optimizer = AdamW(model.parameters(), lr=5e-5)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.train()

for epoch in range(2):
    total_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()
    print(f"✅ Epoch {epoch+1} — Avg Loss: {total_loss / len(train_loader):.4f}")




✅ Epoch 1 — Avg Loss: 3.5536
✅ Epoch 2 — Avg Loss: 0.1720


In [None]:
model.eval()
test_input = build_input(df.iloc[0])

encoded = bert_tokenizer(test_input, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

generated_ids = model.generate(
    input_ids=encoded.input_ids,
    attention_mask=encoded.attention_mask,
    max_length=150,
    num_beams=4,
    repetition_penalty=2.0,
    early_stopping=True
)

summary = gpt2_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print("\n🧠 Generated Summary:\n", summary)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



🧠 Generated Summary:
 This is a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd

# Load the clusters CSV file (make sure to provide the correct file path)
clusters_df = pd.read_csv('clustered_papers.csv')

# Load Mistral model and tokenizer using Auto classes
mistral_tokenizer = AutoTokenizer.from_pretrained('mistral-7b')
mistral_model = AutoModelForSeq2SeqLM.from_pretrained('mistral-7b')

# Function to generate summary with Mistral
def generate_summary_with_mistral(text):
    mistral_inputs = mistral_tokenizer(text, return_tensors="pt")
    generated_output = mistral_model.generate(mistral_inputs['input_ids'], max_length=300, num_return_sequences=1)
    summary = mistral_tokenizer.decode(generated_output[0], skip_special_tokens=True)
    return summary

# Loop through each cluster and generate summaries with bibliometric insights
for cluster_id in clusters_df['cluster_id'].unique():
    # Filter articles for the current cluster
    cluster_articles = clusters_df[clusters_df['cluster_id'] == cluster_id]
    
    # Combine all abstracts from the current cluster for summarization
    combined_abstracts = " ".join(cluster_articles['abstract'].tolist())
    
    # Generate a summary for the combined abstracts
    generated_summary = generate_summary_with_mistral(combined_abstracts)
    
    # Combine bibliometric metrics
    avg_citation_count = cluster_articles['citation_count'].mean()
    avg_h_index = cluster_articles['h_index'].mean()
    
    # Format output to make it more human-friendly
    print(f"Research Trends for Cluster {cluster_id}:")
    print(f"------------------------------------------------")
    print(f"**Summary of Research**: {generated_summary}")
    print(f"\n**Bibliometric Insights**:")
    print(f"- Average Citation Count: {avg_citation_count:.2f}")
    print(f"- Average H-index: {avg_h_index:.2f}")
    print(f"- Total Number of Papers in Cluster: {len(cluster_articles)}")
    print(f"- Most Cited Paper: {cluster_articles.loc[cluster_articles['citation_count'].idxmax()]['title']}")
    print(f"- Top Author: {cluster_articles.groupby('author')['citation_count'].sum().idxmax()}")
    print(f"------------------------------------------------\n")




OSError: mistral-7b is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import pandas as pd

# Load Mistral model
mistral_id = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(mistral_id)
model = AutoModelForCausalLM.from_pretrained(
    mistral_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Load your clustered data
df = pd.read_csv("clustered_papers.csv")  # replace with your actual file

# Assume you have a function or mapping that returns keywords for each cluster
cluster_keywords = {
    0: ["crowdsourced data", "urban mobility", "transportation modeling"],
    1: ["machine learning", "citation analysis", "bibliometric networks"],
    # Add all your cluster -> keyword mappings
}

# Generate and print summary for each cluster
for cluster_id in sorted(df['cluster_id'].unique()):
    cluster_df = df[df['cluster_id'] == cluster_id]
    abstracts = cluster_df['abstract'].dropna().tolist()
    if not abstracts:
        continue

    combined_text = " ".join(abstracts)[:2000]  # Truncate if too long
    keywords = cluster_keywords.get(cluster_id, ["science", "research"])

    # Build instruct prompt
    prompt = f"<s>[INST] Generate a bibliometric summary using the topics: {', '.join(keywords)}. Text:\n{combined_text}\n[/INST]"
    
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=300)

    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    print(f"\n📘 Cluster {cluster_id} Summary")
    print("--------------------------------------------------")
    print(summary)
    print("--------------------------------------------------")


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1.
401 Client Error. (Request ID: Root=1-6824cb2c-71c6583710d6c0a9237dd1a4;0157bd78-bb45-49bb-b097-0c78997e0478)

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json.
Access to model mistralai/Mistral-7B-Instruct-v0.1 is restricted. You must have access to it and be authenticated to access it. Please log in.

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Example input text (replace with your actual article or text)
input_text = "Research trends in urban mobility using crowd-sourced data."

# Tokenize input text and convert it to input IDs
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Get the model's output (embeddings)
with torch.no_grad():
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state  # Embeddings of the input text

# Optional: If you want a single vector representation (e.g., using the [CLS] token)
sentence_embedding = embeddings[:, 0, :].squeeze().numpy()  # [CLS] token representation
print("Sentence Embedding:", sentence_embedding)


Sentence Embedding: [-3.03823858e-01 -2.94588953e-01 -3.76790494e-01  5.01689222e-03
 -3.54408994e-02  1.06439918e-01 -3.23471457e-01  2.17532516e-01
  2.92097963e-03 -7.44312882e-01 -4.35850620e-01 -3.50625992e-01
  2.00050063e-02 -2.46841222e-01 -1.59674272e-01  5.49915060e-02
 -2.40113243e-01  6.04893386e-01  4.65398401e-01  5.87469265e-02
 -3.19653422e-01 -7.89481580e-01  1.41855806e-01  2.59826928e-01
 -1.24172002e-01 -3.95769417e-01 -7.99091160e-02  4.51406538e-01
 -7.91762024e-02  1.85417414e-01 -4.29062903e-01  3.09045583e-01
 -5.20451665e-01 -8.73078465e-01  7.41924167e-01 -7.56629258e-02
 -5.66660017e-02 -4.74565268e-01 -5.19070625e-01  4.38661039e-01
 -3.53880256e-01  3.36756796e-01  4.73318577e-01 -3.30739498e-01
 -2.30438918e-01  6.87498897e-02 -2.97834635e+00 -2.10847571e-01
 -8.75047624e-01 -1.28071606e-01  3.81216854e-01  4.08959866e-01
  3.36281300e-01  1.29074126e-01  6.62514269e-01  7.79323637e-01
  2.08158702e-01 -5.16489595e-02  3.43803972e-01  1.69889957e-01
  5.5

In [13]:
import requests

# Grok API URL (replace with your actual endpoint)
grok_api_url = "https://api.groq.com/openai/v1/chat/completions"

# Prepare the payload with BERT embeddings
data = {
    "text_embedding": sentence_embedding.tolist(),  # Convert numpy array to list for JSON compatibility
    "additional_info": "Research trends in urban mobility"  # Optional additional info
}

# Send the POST request to Grok API
response = requests.post(grok_api_url, json=data)

# Check the response
if response.status_code == 200:
    result = response.json()
    print("Grok API Response:", result)
else:
    print("Error with Grok API:", response.status_code)


Error with Grok API: 401


In [14]:
pip install openai

Collecting openai
  Downloading openai-1.78.1-py3-none-any.whl.metadata (25 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.9.0-cp312-cp312-win_amd64.whl.metadata (5.3 kB)
Downloading openai-1.78.1-py3-none-any.whl (680 kB)
   ---------------------------------------- 0.0/680.9 kB ? eta -:--:--
   ---------------------------------------- 680.9/680.9 kB 5.4 MB/s eta 0:00:00
Downloading distro-1.9.0-py3-none-any.whl (20 kB)
Downloading jiter-0.9.0-cp312-cp312-win_amd64.whl (207 kB)
Installing collected packages: jiter, distro, openai

   -------------------------- ------------- 2/3 [openai]
   -------------------------- ------------- 2/3 [openai]
   -------------------------- ------------- 2/3 [openai]
   -------------------------- ------------- 2/3 [openai]
   -------------------------- ------------- 2/3 [openai]
   -------------------------- ------------- 2/3 [opena

In [18]:
pip install pandas torch transformers keybert openai


Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from keybert import KeyBERT
import openai
import json

# === Config ===
CSV_FILE = "clustered_papers.csv"  # ← Update this to your clustered file
TEXT_COLUMN = "clean_abstract"  # Changed to "clean_abstract" based on your columns
CLUSTER_COLUMN = "gmm_cluster"  # Updated to "gmm_cluster" since that's the correct column
GROQ_API_KEY = "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"  # ← Replace this

# === Groq Setup ===
openai.api_key = GROQ_API_KEY
openai.api_base = "https://api.groq.com/openai/v1"

# === Load Models ===
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")
bert_model.eval()
kw_model = KeyBERT(model='bert-base-uncased')

# === Step 1: Load and Group Clusters ===
def load_clusters(file_path):
    df = pd.read_csv(file_path)
    print("Column names:", df.columns.tolist())  # Show column names explicitly
    df.columns = df.columns.str.strip()  # Clean any extra spaces in column names
    if CLUSTER_COLUMN not in df.columns:
        raise KeyError(f"Column '{CLUSTER_COLUMN}' not found in the CSV file.")
    return df.groupby(CLUSTER_COLUMN)[TEXT_COLUMN].apply(list).to_dict()

# === Step 2: Extract Keywords from BERT Input ===
def embedding_to_keywords(texts, num_keywords=10):
    combined_text = " ".join(texts)
# Adjust the number of keywords
    keywords = kw_model.extract_keywords(combined_text, top_n=20)
    return [kw[0] for kw in keywords]

# === Step 3: Build Prompt for Groq ===
def build_prompt(keywords):
    return (
        "You are a research assistant. Generate a detailed research summary "
        "focusing on the research methodology and key findings based on the following concepts:\n\n"
        + ", ".join(keywords) + "\n\nSummary:"
    )


# === Step 4: Groq API Call ===
def query_groq(prompt):
    response = openai.ChatCompletion.create(
        model="llama3-70b-8192",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.7,
        max_tokens=800
    )
    return response["choices"][0]["message"]["content"]

# === Full Pipeline for One Cluster ===
def process_cluster(texts):
    try:
        keywords = embedding_to_keywords(texts)
        prompt = build_prompt(keywords)
        return query_groq(prompt)
    except Exception as e:
        print(f"⚠️ Error processing cluster: {e}")
        return "ERROR"


# === Run for All Clusters ===
def run_pipeline():
    try:
        clusters = load_clusters(CSV_FILE)
        all_summaries = {}

        for label, texts in clusters.items():
            print(f"\n🔹 Cluster {label} 🔹")
            try:
                summary = process_cluster(texts)
                all_summaries[label] = summary
                print(summary)
            except Exception as e:
                print(f"⚠️ Error processing cluster {label}: {e}")
                all_summaries[label] = "ERROR"

        # Optional: Save output
        with open("cluster_summaries.json", "w") as f:
            json.dump(all_summaries, f, indent=4)
        print("\n✅ All summaries saved to 'cluster_summaries.json'")

    except Exception as e:
        print(f"⚠️ Error loading clusters: {e}")

if __name__ == "__main__":
    run_pipeline()


No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.


Column names: ['id', 'title', 'doi', 'publication_year', 'authors', 'abstract', 'open_access', 'host_venue', 'clean_abstract', 'keywords', 'entities', 'gmm_cluster', 'gmm_probs']

🔹 Cluster 0 🔹
⚠️ Error processing cluster: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742

ERROR

🔹 Cluster 1 🔹
⚠️ Error processing cluster: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interfa

In [7]:
from rouge_score import rouge_scorer

# Example generated summary and ground truth summary
generated_summary = "AI models like GatorTron face challenges in plagiarism detection..."
ground_truth_summary = "The GatorTron model faces plagiarism risks and challenges in content detection..."

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"])

# Calculate ROUGE scores
scores = scorer.score(ground_truth_summary, generated_summary)
print(scores)


{'rouge1': Score(precision=0.5555555555555556, recall=0.45454545454545453, fmeasure=0.5), 'rouge2': Score(precision=0.125, recall=0.1, fmeasure=0.11111111111111112), 'rougeL': Score(precision=0.4444444444444444, recall=0.36363636363636365, fmeasure=0.39999999999999997)}


In [3]:
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz


Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz
  Downloading https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz (15.9 MB)
     ---------------------------------------- 0.0/15.9 MB ? eta -:--:--
     ---------------------------------------- 0.0/15.9 MB ? eta -:--:--
     ---------------------------------------- 0.0/15.9 MB ? eta -:--:--
     ---------------------------------------- 0.0/15.9 MB ? eta -:--:--
      --------------------------------------- 0.3/15.9 MB ? eta -:--:--
     - -------------------------------------- 0.5/15.9 MB 1.1 MB/s eta 0:00:14
     - -------------------------------------- 0.8/15.9 MB 1.1 MB/s eta 0:00:14
     - -------------------------------------- 0.8/15.9 MB 1.1 MB/s eta 0:00:14
     -- ------------------------------------- 1.0/15.9 MB 1.1 MB/s eta 0:00:14
     --- ------------------------------------ 1.3/15.9 MB 1.1 MB/s eta 0:00:14
     --- -----------

  error: subprocess-exited-with-error
  
  × pip subprocess to install build dependencies did not run successfully.
  │ exit code: 1
  ╰─> [664 lines of output]
      Collecting setuptools
        Downloading setuptools-80.7.0-py3-none-any.whl.metadata (6.6 kB)
      Collecting cython<3.0,>=0.25
        Using cached Cython-0.29.37-py2.py3-none-any.whl.metadata (3.1 kB)
      Collecting cymem<2.1.0,>=2.0.2
        Using cached cymem-2.0.11-cp312-cp312-win_amd64.whl.metadata (8.8 kB)
      Collecting preshed<3.1.0,>=3.0.2
        Using cached preshed-3.0.9-cp312-cp312-win_amd64.whl.metadata (2.2 kB)
      Collecting murmurhash<1.1.0,>=0.28.0
        Using cached murmurhash-1.0.12-cp312-cp312-win_amd64.whl.metadata (2.2 kB)
      Collecting thinc<8.1.0,>=8.0.12
        Downloading thinc-8.0.17.tar.gz (189 kB)
        Installing build dependencies: started
        Installing build dependencies: finished with status 'done'
        Getting requirements to build wheel: started
        Getting

In [3]:
# === Import all modules ===
import pandas as pd
import re, json, ast, torch
from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer, BertModel
from keybert import KeyBERT
from py2neo import Graph, Node, Relationship
from sklearn.mixture import GaussianMixture
import openai
import gradio as gr
from fetch_papers import fetch_openalex_papers, save_to_csv  # Replace with your actual function

# === Model/Loaders ===
model = SentenceTransformer("all-MiniLM-L6-v2")
kw_model = KeyBERT(model='bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")
bert_model.eval()

# === Groq API Setup ===
openai.api_key = "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"  # Replace with your actual key
openai.api_base = "https://api.groq.com/openai/v1"

# === Neo4j Setup ===
graph = Graph("bolt://localhost:7687", auth=("neo4j", "qwertyuiop"))


# === Step 1: Fetch Papers ===
def fetch_and_save_papers(query, max_results=50):
    df = fetch_openalex_papers(query, max_results=max_results)
    save_to_csv(df, "papers.csv")


# === Step 2: Enrichment ===
def enrich_papers():
    df = pd.read_csv("papers.csv")
    def clean_text(text):
        text = str(text).lower()
        return re.sub(r'[^a-zA-Z0-9\s]', '', text).strip()
    df["clean_abstract"] = df["abstract"].fillna("").apply(clean_text)
    df["keywords"] = df["keywords"].fillna("[]").apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])
    df["entities"] = df["entities"].fillna("[]").apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])
    df.to_csv("papers_enriched.csv", index=False)


# === Step 3: Build Knowledge Graph ===
def build_knowledge_graph():
    df = pd.read_csv("papers_enriched.csv")
    graph.delete_all()
    for _, row in df.iterrows():
        paper_node = Node("Paper", title=row["title"], year=row["publication_year"], doi=row["doi"])
        graph.create(paper_node)
        for author in eval(str(row["authors"])):
            author_node = Node("Author", name=author)
            graph.merge(author_node, "Author", "name")
            graph.create(Relationship(author_node, "WROTE", paper_node))
        for keyword in row["keywords"]:
            keyword_node = Node("Keyword", name=keyword)
            graph.merge(keyword_node, "Keyword", "name")
            graph.create(Relationship(paper_node, "HAS_KEYWORD", keyword_node))
        for entity in row["entities"]:
            entity_node = Node("Entity", name=entity)
            graph.merge(entity_node, "Entity", "name")
            graph.create(Relationship(paper_node, "MENTIONS", entity_node))


# === Step 4: Cluster Abstracts ===
def cluster_abstracts():
    df = pd.read_csv("papers_enriched.csv").dropna(subset=["clean_abstract"])
    if df.empty:
        raise ValueError("No valid abstracts to cluster.")
    embeddings = model.encode(df["clean_abstract"].tolist(), show_progress_bar=True)
    gmm = GaussianMixture(n_components=5, covariance_type='full', random_state=42)
    gmm.fit(embeddings)
    df["gmm_cluster"] = gmm.predict(embeddings)
    df["gmm_probs"] = gmm.predict_proba(embeddings).tolist()
    df.to_csv("clustered_papers.csv", index=False)


# === Step 5: Summarization with Groq ===
def summarize_clusters():
    df = pd.read_csv("clustered_papers.csv")
    df.columns = df.columns.str.strip()
    if "gmm_cluster" not in df.columns:
        raise ValueError("Missing 'gmm_cluster' column.")
    clusters = df.groupby("gmm_cluster")["clean_abstract"].apply(list).to_dict()

    def embedding_to_keywords(texts):
        combined = " ".join(texts)
        return [kw[0] for kw in kw_model.extract_keywords(combined, top_n=20)]

    def query_groq(prompt):
        response = openai.ChatCompletion.create(
            model="llama3-70b-8192",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
            max_tokens=800
        )
        return response["choices"][0]["message"]["content"]

    summaries = {}
    for cluster_id, texts in clusters.items():
        try:
            keywords = embedding_to_keywords(texts)
            prompt = (
                "You are a research assistant. Generate a detailed research summary "
                "focusing on research methodology and key findings for:\n\n"
                + ", ".join(keywords) + "\n\nSummary:"
            )
            summaries[cluster_id] = query_groq(prompt)
        except Exception as e:
            summaries[cluster_id] = f"ERROR: {str(e)}"

    with open("cluster_summaries.json", "w") as f:
        json.dump(summaries, f, indent=4)


# === Unified Runner ===
def run_pipeline(query):
    try:
        fetch_and_save_papers(query)
        enrich_papers()
        build_knowledge_graph()
        cluster_abstracts()
        summarize_clusters()
        return "✅ Pipeline executed successfully!"
    except Exception as e:
        return f"❌ Pipeline failed: {e}"


# === UI Wrapper ===
def run_pipeline_ui(query):
    status = run_pipeline(query)

    try:
        df_enriched = pd.read_csv("papers_enriched.csv")
    except:
        df_enriched = pd.DataFrame()

    try:
        df_clustered = pd.read_csv("clustered_papers.csv")
    except:
        df_clustered = pd.DataFrame()

    try:
        with open("cluster_summaries.json") as f:
            summaries = json.load(f)
        summaries_str = "\n\n".join([f"🔹 Cluster {k}:\n{v}" for k, v in summaries.items()])
    except:
        summaries_str = "No summary generated."

    return (
        status,
        df_enriched.head(10) if not df_enriched.empty else "No enriched data available.",
        df_clustered[["title", "gmm_cluster"]].head(10) if not df_clustered.empty else "No clustered data available.",
        summaries_str,
        "papers_enriched.csv" if not df_enriched.empty else None,
        "clustered_papers.csv" if not df_clustered.empty else None,
        "cluster_summaries.json" if summaries_str else None
    )


# === GRADIO UI ===
gr.Interface(
    fn=run_pipeline_ui,
    inputs=gr.Textbox(lines=2, label="Enter Research Topic", placeholder="e.g., LLMs in medicine"),
    outputs=[
        gr.Textbox(label="Pipeline Status"),
        gr.Dataframe(label="Enriched Papers Preview"),
        gr.Dataframe(label="Clustered Papers Preview"),
        gr.Textbox(label="Cluster Summaries"),
        gr.File(label="📄 Enriched CSV"),
        gr.File(label="📄 Clustered CSV"),
        gr.File(label="📄 Summary JSON")
    ],
    title="Unified Hybrid NLP Research Assistant",
    description="End-to-end research pipeline: OpenAlex → Enrichment → Neo4j KG → Clustering → Groq Summarization"
).launch()


No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.


* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.




In [None]:
# === Imports ===
import pandas as pd
import re, json, ast, torch, os
import matplotlib.pyplot as plt
import networkx as nx
from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer, BertModel
from keybert import KeyBERT
from py2neo import Graph, Node, Relationship
from sklearn.mixture import GaussianMixture
import openai
import gradio as gr
from fetch_papers import fetch_openalex_papers, save_to_csv

# === Model/Loaders ===
model = SentenceTransformer("all-MiniLM-L6-v2")
kw_model = KeyBERT(model='bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")
bert_model.eval()

# === Groq API Setup ===
openai.api_key = "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"  # Replace with your actual key
openai.api_base = "https://api.groq.com/openai/v1"

# === Neo4j Setup ===
graph = Graph("bolt://localhost:7687", auth=("neo4j", "qwertyuiop"))


# === Step 1: Fetch Papers ===
def fetch_and_save_papers(query, max_results=50):
    df = fetch_openalex_papers(query, max_results=max_results)
    save_to_csv(df, "papers.csv")


# === Step 2: Enrichment ===
def enrich_papers():
    df = pd.read_csv("papers.csv")
    def clean_text(text):
        text = str(text).lower()
        return re.sub(r'[^a-zA-Z0-9\s]', '', text).strip()
    df["clean_abstract"] = df["abstract"].fillna("").apply(clean_text)
    df["keywords"] = df["keywords"].fillna("[]").apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])
    df["entities"] = df["entities"].fillna("[]").apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])
    df.to_csv("papers_enriched.csv", index=False)


# === Step 3: Build Knowledge Graph ===
def build_knowledge_graph():
    df = pd.read_csv("papers_enriched.csv")
    graph.delete_all()
    for _, row in df.iterrows():
        paper_node = Node("Paper", title=row["title"], year=row["publication_year"], doi=row["doi"])
        graph.create(paper_node)
        for author in eval(str(row["authors"])):
            author_node = Node("Author", name=author)
            graph.merge(author_node, "Author", "name")
            graph.create(Relationship(author_node, "WROTE", paper_node))
        for keyword in row["keywords"]:
            keyword_node = Node("Keyword", name=keyword)
            graph.merge(keyword_node, "Keyword", "name")
            graph.create(Relationship(paper_node, "HAS_KEYWORD", keyword_node))
        for entity in row["entities"]:
            entity_node = Node("Entity", name=entity)
            graph.merge(entity_node, "Entity", "name")
            graph.create(Relationship(paper_node, "MENTIONS", entity_node))


# === Step 4: Cluster Abstracts ===
def cluster_abstracts():
    df = pd.read_csv("papers_enriched.csv").dropna(subset=["clean_abstract"])
    if df.empty:
        raise ValueError("No valid abstracts to cluster.")
    embeddings = model.encode(df["clean_abstract"].tolist(), show_progress_bar=True)
    gmm = GaussianMixture(n_components=5, covariance_type='full', random_state=42)
    gmm.fit(embeddings)
    df["gmm_cluster"] = gmm.predict(embeddings)
    df["gmm_probs"] = gmm.predict_proba(embeddings).tolist()
    df.to_csv("clustered_papers.csv", index=False)


# === Step 5: Summarization with Groq ===
def summarize_clusters():
    df = pd.read_csv("clustered_papers.csv")
    df.columns = df.columns.str.strip()
    if "gmm_cluster" not in df.columns:
        raise ValueError("Missing 'gmm_cluster' column.")
    clusters = df.groupby("gmm_cluster")["clean_abstract"].apply(list).to_dict()

    def embedding_to_keywords(texts):
        combined = " ".join(texts)
        return [kw[0] for kw in kw_model.extract_keywords(combined, top_n=20)]

    def query_groq(prompt):
        response = openai.ChatCompletion.create(
            model="llama3-70b-8192",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
            max_tokens=800
        )
        return response["choices"][0]["message"]["content"]

    summaries = {}
    for cluster_id, texts in clusters.items():
        try:
            keywords = embedding_to_keywords(texts)
            prompt = (
                "You are a research assistant. Generate a detailed research summary "
                "focusing on research methodology and key findings for:\n\n"
                + ", ".join(keywords) + "\n\nSummary:"
            )
            summaries[cluster_id] = query_groq(prompt)
        except Exception as e:
            summaries[cluster_id] = f"ERROR: {str(e)}"

    with open("cluster_summaries.json", "w") as f:
        json.dump(summaries, f, indent=4)


# === Visualize Neo4j Graph ===
def visualize_neo4j_graph():
    query = """
    MATCH (n)-[r]->(m)
    RETURN n.name AS source, type(r) AS relation, m.name AS target
    """
    result = graph.run(query).data()

    G = nx.DiGraph()
    for row in result:
        G.add_edge(row["source"], row["target"], label=row["relation"])

    plt.figure(figsize=(12, 10))
    pos = nx.spring_layout(G, k=0.5)
    nx.draw(G, pos, with_labels=True, node_size=800, node_color="skyblue", font_size=10, font_weight='bold', edge_color="gray")
    edge_labels = nx.get_edge_attributes(G, 'label')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
    plt.tight_layout()
    plt.savefig("graph.png")
    plt.close()


# === Unified Runner ===
def run_pipeline(query):
    try:
        fetch_and_save_papers(query)
        enrich_papers()
        build_knowledge_graph()
        cluster_abstracts()
        summarize_clusters()
        visualize_neo4j_graph()
        return "✅ Pipeline executed successfully!"
    except Exception as e:
        return f"❌ Pipeline failed: {e}"


# === UI Wrapper ===
def run_pipeline_ui(query):
    status = run_pipeline(query)

    try:
        df_enriched = pd.read_csv("papers_enriched.csv")
    except:
        df_enriched = pd.DataFrame()

    try:
        df_clustered = pd.read_csv("clustered_papers.csv")
    except:
        df_clustered = pd.DataFrame()

    try:
        with open("cluster_summaries.json") as f:
            summaries = json.load(f)
        summaries_str = "\n\n".join([f"🔹 Cluster {k}:\n{v}" for k, v in summaries.items()])
    except:
        summaries_str = "No summary generated."

    graph_file = "graph.png" if os.path.exists("graph.png") else None

    return (
        status,
        df_enriched.head(10) if not df_enriched.empty else "No enriched data available.",
        df_clustered[["title", "gmm_cluster"]].head(10) if not df_clustered.empty else "No clustered data available.",
        summaries_str,
        "papers_enriched.csv" if not df_enriched.empty else None,
        "clustered_papers.csv" if not df_clustered.empty else None,
        "cluster_summaries.json" if summaries_str else None,
        graph_file,   # for Image (display)
        graph_file    # for File (download)
    )


# === GRADIO UI ===
gr.Interface(
    fn=run_pipeline_ui,
    inputs=gr.Textbox(lines=2, label="Enter Research Topic", placeholder="e.g., LLMs in medicine"),
    outputs=[
        gr.Textbox(label="Pipeline Status"),
        gr.Dataframe(label="Enriched Papers Preview"),
        gr.Dataframe(label="Clustered Papers Preview"),
        gr.Textbox(label="Cluster Summaries"),
        gr.File(label="📄 Enriched CSV"),
        gr.File(label="📄 Clustered CSV"),
        gr.File(label="📄 Summary JSON"),
        gr.Image(label="📷 Graph Preview"),    # Show graph inline
        gr.File(label="📥 Download Graph PNG")  # Allow file download
    ],
    title="Unified Hybrid NLP Research Assistant",
    description="End-to-end research pipeline: OpenAlex → Enrichment → Neo4j KG → Clustering → Groq Summarization → Graph Visualization"
).launch()


No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.


* Running on local URL:  http://127.0.0.1:7862
* To create a public link, set `share=True` in `launch()`.




Searching OpenAlex for: transformers
Request failed: HTTPSConnectionPool(host='api.openalex.org', port=443): Max retries exceeded with url: /works?filter=title.search:transformers&per-page=25&cursor=* (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate is not yet valid (_ssl.c:1000)')))
✅ Retrieved 0 papers.
Saved metadata to papers.csv
