In [None]:
# Install dependencies
!pip install -q arxiv pandas numpy tqdm sentence-transformers umap-learn hdbscan plotly streamlit transformers torch scikit-learn

# Mount Google Drive (optional - uncomment if you want to save results)


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import os
import sys
sys.path.append('src')  # relative path from current dir

# Replace with the name of your folder
folder_path = '/content/drive/My Drive/research_insight_engine/src'

os.chdir(folder_path)
!ls  # lists files in the current directory

1_data_ingestion.py  2_embedding_clustering.py	3_summarization_novelty.py  4_visualization.py


In [7]:
import os
import sys
from IPython.display import HTML, display
import pandas as pd
import plotly.express as px
import json
from tqdm.notebook import tqdm


# Import our modules
from data_ingestion import fetch_papers, setup_directory
from embedding_clustering import create_embeddings, reduce_dimensions, cluster_papers
from summarization_novelty import setup_model, create_cluster_prompt, generate_summary, compute_novelty_score

In [17]:
from datetime import datetime, timedelta
import arxiv
def setup_directory():
    """Create and return the path to the data directory."""
    base_path = "data"
    os.makedirs(base_path, exist_ok=True)
    return base_path

def format_paper_title(title):
    """Clean and format paper title."""
    return title.strip().replace('\n', ' ').replace('  ', ' ')

# Create data directory
def fetch_papers(days=7, category='cs.LG'):
    """Fetch papers from arXiv published in the last n days."""
    # Calculate date range with timezone awareness
    from datetime import timezone
    end_date = datetime.now(timezone.utc)
    start_date = end_date - timedelta(days=days)
    print(f"Fetching papers from {start_date.date()} to {end_date.date()}")

    # Create search query
    search = arxiv.Search(
        query=f"cat:{category}",
        max_results=1000,  # We'll filter by date later
        sort_by=arxiv.SortCriterion.SubmittedDate
    )

    # Fetch and process papers
    papers = []
    for result in tqdm(search.results(), desc="Fetching papers"):
        # The result.published is timezone-aware, so we can compare directly
        if start_date <= result.published <= end_date:
            paper = {
                'id': result.entry_id.split('/')[-1],
                'title': format_paper_title(result.title),
                'abstract': result.summary,
                'authors': [author.name for author in result.authors],
                'date': result.published.strftime('%Y-%m-%d'),
                'categories': result.categories,
                'comment': result.comment if result.comment else '',
                'pdf_url': result.pdf_url
            }
            papers.append(paper)

    return papers

data_path = "data"
os.makedirs(data_path, exist_ok=True)

# Step 1: Data Ingestion
print("📚 Step 1: Fetching Papers")
papers = fetch_papers(days=7, category='cs.LG')
df = pd.DataFrame(papers)
df.to_csv(os.path.join(data_path, 'recent_papers.csv'), index=False)
print(f"Found {len(papers)} papers\n")

# Display sample
display(df[['title', 'date', 'authors']].head())


📚 Step 1: Fetching Papers
Fetching papers from 2025-03-16 to 2025-03-23


  for result in tqdm(search.results(), desc="Fetching papers"):


Fetching papers: 0it [00:00, ?it/s]

Found 483 papers



Unnamed: 0,title,date,authors
0,GAEA: A Geolocation Aware Conversational Model,2025-03-20,"[Ron Campos, Ashmal Vayani, Parth Parag Kulkar..."
1,MagicMotion: Controllable Video Generation wit...,2025-03-20,"[Quanhao Li, Zhen Xing, Rui Wang, Hui Zhang, Q..."
2,InfiniteYou: Flexible Photo Recrafting While P...,2025-03-20,"[Liming Jiang, Qing Yan, Yumin Jia, Zichuan Li..."
3,Survey on Evaluation of LLM-based Agents,2025-03-20,"[Asaf Yehudai, Lilach Eden, Alan Li, Guy Uziel..."
4,DreamTexture: Shape from Virtual Texture with ...,2025-03-20,"[Ananta R. Bhattarai, Xingzhe He, Alla Sheffer..."


In [18]:
# Step 2: Embedding & Clustering
print("\n🧬 Step 2: Creating Embeddings & Clusters")
# Create embeddings
texts = df['title'] + " " + df['abstract']
embeddings = create_embeddings(texts)

# Reduce dimensions
embeddings_2d = reduce_dimensions(embeddings)

# Cluster papers
cluster_labels = cluster_papers(embeddings)

# Add results to dataframe
df['cluster'] = cluster_labels
df['x'] = embeddings_2d[:, 0]
df['y'] = embeddings_2d[:, 1]

# Save intermediate results
df.to_csv(os.path.join(data_path, 'papers_with_clusters.csv'), index=False)



🧬 Step 2: Creating Embeddings & Clusters
Loading model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating embeddings...


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Reducing dimensions...


  warn(


Clustering papers...




In [19]:
# Visualize clusters
fig = px.scatter(
    df,
    x='x',
    y='y',
    color='cluster',
    hover_data=['title'],
    title='Paper Clusters',
    width=1000,
    height=600
)
fig.show()

In [22]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
from tqdm import tqdm
from sklearn.metrics.pairwise import euclidean_distances
import json

# Step 3: Summarization & Novelty
print("\n🎯 Step 3: Generating Summaries & Computing Novelty")

def setup_model():
    """Setup Deepseek model for inference."""
    print("Loading Deepseek model...")
    model_name = "deepseek-ai/deepseek-coder-6.7b-instruct"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto"
    )
    return model, tokenizer

def create_cluster_prompt(titles):
    """Create prompt for cluster summarization."""
    # Limit number of titles if there are too many
    if len(titles) > 20:
        titles = titles[:20]  # Take first 20 titles
        titles.append("... and more papers")

    titles_str = '\n'.join([f"- {title}" for title in titles])
    return f"""You are a research assistant helping to analyze machine learning papers.
    Below are titles of related research papers. Please provide:
    1. A short theme that connects these papers (1 sentence)
    2. Key research directions or trends (2-3 bullet points)

    Papers:
    {titles_str}

    Response:"""

def generate_summary(model, tokenizer, prompt):
    """Generate summary using Deepseek model."""
    # Truncate input if too long
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=3072)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,  # Control length of generated summary
            temperature=0.7,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            top_p=0.9,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def compute_novelty_score(embedding, cluster_embeddings):
    """Compute novelty score as average distance to other papers in cluster."""
    distances = euclidean_distances([embedding], cluster_embeddings)[0]
    return float(np.mean(distances))

def generate_summary(model, tokenizer, prompt):
    """Generate summary using Deepseek model."""
    # Truncate input if too long
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=3072)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,  # Control length of generated summary
            temperature=0.7,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            top_p=0.9,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Setup model
model, tokenizer = setup_model()

# Generate summaries for each cluster
cluster_summaries = {}
for cluster in tqdm(df['cluster'].unique(), desc="Summarizing clusters"):
    if cluster >= 0:  # Skip noise cluster (-1)
        cluster_papers = df[df['cluster'] == cluster]
        prompt = create_cluster_prompt(cluster_papers['title'].tolist())
        summary = generate_summary(model, tokenizer, prompt)
        cluster_summaries[str(cluster)] = summary

        # Display summary as we go
        print(f"\nCluster {cluster} Summary:")
        print(summary)

# Compute novelty scores
print("\nComputing novelty scores...")
novelty_scores = []
for idx, row in tqdm(df.iterrows(), total=len(df)):
    cluster = row['cluster']
    if cluster >= 0:
        cluster_papers = df[df['cluster'] == cluster]
        cluster_embeddings = embeddings[cluster_papers.index]
        score = compute_novelty_score(embeddings[idx], cluster_embeddings)
    else:
        score = 1.0  # High novelty for outliers
    novelty_scores.append(score)

df['novelty_score'] = novelty_scores

# Flag standout papers
df['is_novel'] = False
for cluster in df['cluster'].unique():
    if cluster >= 0:
        cluster_mask = df['cluster'] == cluster
        threshold = df[cluster_mask]['novelty_score'].quantile(0.9)
        df.loc[cluster_mask & (df['novelty_score'] >= threshold), 'is_novel'] = True

# Save final results
df.to_csv(os.path.join(data_path, 'papers_analyzed.csv'), index=False)
with open(os.path.join(data_path, 'cluster_summaries.json'), 'w') as f:
    json.dump(cluster_summaries, f, indent=2)

# Display novel papers
novel_papers = df[df['is_novel']].sort_values('novelty_score', ascending=False)
print(f"\nFound {len(novel_papers)} novel papers")
print("\nTop 5 most novel papers:")
display(novel_papers[['title', 'cluster', 'novelty_score']].head())


🎯 Step 3: Generating Summaries & Computing Novelty
Loading Deepseek model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Summarizing clusters:  33%|███▎      | 1/3 [00:20<00:41, 20.78s/it]


Cluster 0 Summary:
You are a research assistant helping to analyze machine learning papers.
    Below are titles of related research papers. Please provide:
    1. A short theme that connects these papers (1 sentence)
    2. Key research directions or trends (2-3 bullet points)
    
    Papers:
    - GAEA: A Geolocation Aware Conversational Model
- MagicMotion: Controllable Video Generation with Dense-to-Sparse Trajectory Guidance
- InfiniteYou: Flexible Photo Recrafting While Preserving Your Identity
- Survey on Evaluation of LLM-based Agents
- DreamTexture: Shape from Virtual Texture with Analysis by Augmentation
- RoboFactory: Exploring Embodied Agent Collaboration with Compositional Constraints
- The Emperor's New Clothes in Benchmarking? A Rigorous Examination of Mitigation Strategies for LLM Benchmark Data Contamination
- Exploring the Hidden Reasoning Process of Large Language Models by Misleading Them
- ScalingNoise: Scaling Inference-Time Search for Generating Infinite Videos

Summarizing clusters: 100%|██████████| 3/3 [00:33<00:00, 11.10s/it]



Cluster 1 Summary:
You are a research assistant helping to analyze machine learning papers.
    Below are titles of related research papers. Please provide:
    1. A short theme that connects these papers (1 sentence)
    2. Key research directions or trends (2-3 bullet points)
    
    Papers:
    - NeuralFoil: An Airfoil Aerodynamics Analysis Tool Using Physics-Informed Machine Learning
- Accelerating Transient CFD through Machine Learning-Based Flow Initialization
- Localized Physics-informed Gaussian Processes with Curriculum Training for Topology Optimization
- C(NN)FD -- Deep Learning Modelling of Multi-Stage Axial Compressors Aerodynamics
- High-entropy Advantage in Neural Networks' Generalizability
    
    Response:
1. The theme that connects these papers is the integration of machine learning (ML) and computational fluid dynamics (CFD) to improve the efficiency and accuracy of aerodynamic analysis.
2. Key research directions or trends include:
   - Exploring the use of ML in

100%|██████████| 483/483 [00:00<00:00, 1376.25it/s]



Found 28 novel papers

Top 5 most novel papers:


Unnamed: 0,title,cluster,novelty_score
204,A Comprehensive Study of LLM Secure Code Gener...,0,1.31221
363,Strain Problems got you in a Twist? Try Strain...,0,1.264747
338,SRBB-Based Quantum State Preparation,0,1.261714
212,MusicInfuser: Making Video Diffusion Listen an...,0,1.251776
88,Approximation properties of neural ODEs,0,1.251656


In [23]:


# Step 4: Interactive Exploration
print("\n📊 Step 4: Interactive Exploration")

def explore_cluster(cluster_id):
    """Display information about a specific cluster."""
    cluster_papers = df[df['cluster'] == cluster_id].sort_values('novelty_score', ascending=False)

    print(f"\nCluster {cluster_id} Summary:")
    print(cluster_summaries[str(cluster_id)])

    print(f"\nPapers in Cluster {cluster_id}:")
    for _, paper in cluster_papers.iterrows():
        print(f"\n{'🌟 ' if paper['is_novel'] else ''}Title: {paper['title']}")
        print(f"Authors: {paper['authors']}")
        print(f"Novelty Score: {paper['novelty_score']:.3f}")
        print(f"Abstract: {paper['abstract'][:200]}...")

def search_papers(query):
    """Search papers by title or abstract."""
    mask = df['title'].str.contains(query, case=False) | \
           df['abstract'].str.contains(query, case=False)
    results = df[mask].sort_values('novelty_score', ascending=False)

    print(f"\nFound {len(results)} papers matching '{query}':")
    for _, paper in results.iterrows():
        print(f"\n{'🌟 ' if paper['is_novel'] else ''}Title: {paper['title']}")
        print(f"Cluster: {paper['cluster']}")
        print(f"Novelty Score: {paper['novelty_score']:.3f}")
        print(f"Abstract: {paper['abstract'][:200]}...")

# Example usage:
print("To explore a cluster:")
print("explore_cluster(0)  # Replace 0 with any cluster number")
print("\nTo search papers:")
print("search_papers('reinforcement learning')")


📊 Step 4: Interactive Exploration
To explore a cluster:
explore_cluster(0)  # Replace 0 with any cluster number

To search papers:
search_papers('reinforcement learning')


In [30]:
explore_cluster(0)


Cluster 2 Summary:


KeyError: '2'