In [12]:
!pip install -q sentence-transformers faiss-cpu scikit-learn pandas matplotlib jinja2


In [13]:
%%writefile src/tools/retriever.py
from dataclasses import dataclass
from typing import List, Optional

import pandas as pd

@dataclass
class Paper:
    """A single paper record handled by the retriever.

    Attributes:
        paper_id: Identifier of the paper, stored as a string.
        title: Paper title.
        abstract: Abstract text.
        year: Publication year, or None when it is not known.
        authors: Author string, or None when it is not provided.
    """

    paper_id: str
    title: str
    abstract: str
    # Both fields default to None, so they are annotated as optional.
    year: Optional[int] = None
    authors: Optional[str] = None

class Retriever:
    """Case-insensitive substring lookup over papers loaded from a CSV file.

    The CSV is read once at construction time. If it cannot be read or
    parsed, a two-paper built-in sample corpus is used instead so the rest
    of the pipeline can still run.
    """

    def __init__(self, csv_path="data/abstracts.csv"):
        """Remember ``csv_path`` and load the papers from it."""
        self.csv_path = csv_path
        self.papers = []
        self._load()

    @staticmethod
    def _parse_year(value):
        """Return ``value`` as an int, or None when it is blank or not numeric."""
        if value is None or (isinstance(value, str) and not value.strip()):
            return None
        try:
            # float() first so that cells such as "2020.0" are accepted too.
            return int(float(value))
        except (TypeError, ValueError, OverflowError):
            return None

    def _load(self):
        """Populate ``self.papers`` from ``self.csv_path``.

        Missing cells are treated as empty strings. A row whose ``year``
        cannot be parsed keeps ``year=None`` instead of invalidating the
        whole file, and a blank ``paper_id`` falls back to ``id_<row index>``.
        """
        try:
            df = pd.read_csv(self.csv_path).fillna("")
            loaded = []
            for i, r in df.iterrows():
                raw_id = str(r.get("paper_id", ""))
                loaded.append(
                    Paper(
                        paper_id=raw_id if raw_id.strip() else f"id_{i}",
                        # str() guards against numeric-looking cells, which
                        # would break the string concatenation in retrieve().
                        title=str(r.get("title", "")),
                        abstract=str(r.get("abstract", "")),
                        year=self._parse_year(r.get("year", "")),
                        authors=str(r.get("authors", "")),
                    )
                )
        except Exception as exc:
            # Deliberate best-effort: fall back to the sample corpus, but say
            # so instead of failing silently. Catching Exception (not a bare
            # except) lets KeyboardInterrupt/SystemExit propagate.
            import warnings

            warnings.warn(
                f"Could not load papers from {self.csv_path!r} ({exc}); "
                "using the built-in sample papers instead."
            )
            loaded = [
                Paper("1","FastNet","We propose FastNet using CIFAR-10 dataset."),
                Paper("2","GNNs","We use GNNs for QM9 dataset.")
            ]
        self.papers = loaded

    def retrieve(self, query, max_results=10) -> List[Paper]:
        """Return up to ``max_results`` papers whose title or abstract contains ``query``.

        Matching is a case-insensitive substring test on
        ``title + " " + abstract``. When nothing matches, the first
        ``max_results`` papers of the corpus are returned instead.
        """
        q = (query or "").lower()
        hits = [p for p in self.papers if q in (p.title + " " + p.abstract).lower()]
        return hits[:max_results] if hits else self.papers[:max_results]


Overwriting src/tools/retriever.py


In [14]:
%%writefile src/tools/vectorstore.py
from sentence_transformers import SentenceTransformer
import numpy as np

try:
    import faiss
    _FAISS = True
except Exception:
    # faiss is optional: any failure to import it switches the store to the
    # NumPy fallback. Exception (not a bare except) keeps KeyboardInterrupt
    # and SystemExit propagating.
    faiss = None
    _FAISS = False

class VectorStore:
    """Embeds texts with a SentenceTransformer and finds nearest neighbours.

    When faiss is importable an ``IndexFlatL2`` is used; otherwise the
    embedding matrix is kept in memory and searched with NumPy. Both
    backends rank by (squared) L2 distance, so they return the same order.
    """

    def __init__(self, model="all-MiniLM-L6-v2"):
        """Load the embedding model named ``model``; the store starts empty."""
        self.model = SentenceTransformer(model)
        self.index = None
        self.meta = []

    def index_texts(self, items):
        """Replace the index with embeddings of ``items``.

        Args:
            items: Iterable of ``(text, metadata)`` pairs. The metadata
                objects are what ``search`` returns.
        """
        items = list(items)
        self.meta = [m for _, m in items]
        if not items:
            # Nothing to embed; leave the store empty instead of failing on
            # the shape of an empty embedding array.
            self.index = None
            return

        texts = [t for t, _ in items]
        vecs = self.model.encode(texts, convert_to_numpy=True).astype("float32")

        if _FAISS:
            index = faiss.IndexFlatL2(vecs.shape[1])
            index.add(vecs)
            self.index = index
        else:
            self.index = vecs

    def search(self, query, k=5):
        """Return the metadata of the ``k`` indexed texts closest to ``query``.

        Returns an empty list when nothing has been indexed or ``k <= 0``.
        ``k`` is clamped to the number of indexed items.
        """
        if self.index is None or not self.meta:
            return []
        k = min(int(k), len(self.meta))
        if k <= 0:
            return []

        qv = self.model.encode([query], convert_to_numpy=True).astype("float32")

        if _FAISS:
            _, I = self.index.search(qv, k)
            # faiss pads missing results with -1, which would otherwise be
            # read as meta[-1]; require a non-negative, in-range label.
            return [self.meta[idx] for idx in I[0] if 0 <= idx < len(self.meta)]

        # Fallback mirrors IndexFlatL2: rank by squared L2 distance, closest
        # first. (For unit-length embeddings this is the same order as
        # ranking by dot product.)
        dists = ((self.index - qv[0]) ** 2).sum(axis=1)
        order = np.argsort(dists, kind="stable")[:k]
        return [self.meta[i] for i in order]


Writing src/tools/vectorstore.py


In [15]:
%%writefile src/tools/summarizer_extractive.py
from sentence_transformers import SentenceTransformer
import numpy as np

class ExtractiveSummarizer:
    """Picks the sentences of a text that are closest to its mean embedding."""

    def __init__(self, model="all-MiniLM-L6-v2"):
        """Load the sentence-embedding model named ``model``."""
        self.model = SentenceTransformer(model)

    def summarize(self, text, top_k=3):
        """Return up to ``top_k`` sentences of ``text``, most central first.

        Sentences are split at a period (or run of periods) that is followed
        by whitespace or the end of the text, so decimals such as "3.5" stay
        intact. Each sentence is scored by cosine similarity between its
        embedding and the centroid of all sentence embeddings.

        Returns an empty list for empty text or ``top_k <= 0``. The returned
        sentences do not include the terminating period.
        """
        import re  # local import: only this method needs it

        if not text or top_k <= 0:
            return []

        sents = [s.strip() for s in re.split(r"\.+(?:\s+|$)", text) if s.strip()]
        if not sents:
            return []

        emb = self.model.encode(sents, convert_to_numpy=True)
        centroid = emb.mean(axis=0)
        denom = np.linalg.norm(emb, axis=1) * np.linalg.norm(centroid)
        # A zero-norm embedding or centroid would give 0/0 = NaN; the
        # numerator is 0 in that case, so dividing by 1 scores it 0.
        denom[denom == 0] = 1.0
        sims = (emb @ centroid) / denom

        idx = np.argsort(-sims, kind="stable")[:top_k]
        return [sents[i] for i in idx]


Writing src/tools/summarizer_extractive.py


In [16]:
%%writefile src/tools/keywords.py
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

class KeywordExtractor:
    """TF-IDF based keyword extraction over a fitted vocabulary."""

    def __init__(self):
        """Create the vectorizer (English stop words removed, at most 5000 features)."""
        self.v = TfidfVectorizer(stop_words="english", max_features=5000)

    def fit(self, docs):
        """Fit the TF-IDF vocabulary and weights on ``docs``."""
        self.v.fit(docs)

    def extract(self, text, top_k=5):
        """Return up to ``top_k`` vocabulary terms of ``text``, highest TF-IDF first.

        Only terms with a positive weight are returned, so fewer than
        ``top_k`` keywords come back when ``text`` contains fewer known
        terms. Results are plain ``str`` objects. ``fit`` must have been
        called first.
        """
        if top_k <= 0:
            return []
        weights = self.v.transform([text]).toarray()[0]
        names = self.v.get_feature_names_out()
        # Stable sort keeps ties in vocabulary order, making output deterministic.
        order = np.argsort(-weights, kind="stable")[:top_k]
        # Skip zero-weight features: they do not occur in the text at all.
        return [str(names[i]) for i in order if weights[i] > 0]


Writing src/tools/keywords.py


In [17]:
%%writefile src/tools/tablegen.py
import re

def extract_method_and_dataset(abstract):
    """Tag an abstract with coarse method/dataset labels.

    The check is a case-insensitive phrase lookup:

    * method is "Proposed new method" when the abstract contains
      "we propose" or "we present", otherwise "".
    * dataset is "Dataset mentioned" when the abstract contains
      "dataset", otherwise "".

    Returns:
        A ``(method, dataset)`` tuple of strings.
    """
    text = abstract.lower()
    announces_method = any(cue in text for cue in ("we propose", "we present"))
    method_label = "Proposed new method" if announces_method else ""
    dataset_label = "Dataset mentioned" if "dataset" in text else ""
    return method_label, dataset_label


Writing src/tools/tablegen.py


In [18]:
%%writefile src/tools/report_writer.py
import os

class ReportWriter:
    """Writes the survey results to ``<outdir>/report.md`` as Markdown."""

    def __init__(self, outdir="outputs"):
        """Remember the output directory; it is created on the first write."""
        self.outdir = outdir

    def write(self, query, papers, topics):
        """Write the report and return its path.

        Args:
            query: The survey query, shown in the report title.
            papers: Iterable of dicts with the keys ``title``, ``paper_id``,
                ``year``, ``authors``, ``keywords`` (iterable of strings) and
                ``summary`` (iterable of sentences).
            topics: Iterable of topic strings listed under "Topics".

        Returns:
            Path of the written ``report.md``. An existing file is overwritten.
        """
        os.makedirs(self.outdir, exist_ok=True)
        path = os.path.join(self.outdir, "report.md")

        lines = []
        lines.append(f"# Literature Survey on '{query}'\n")
        lines.append("## Topics\n")
        lines.append(", ".join(map(str, topics)) + "\n\n")

        for p in papers:
            lines.append(f"### {p['title']} ({p['paper_id']})\n")
            lines.append(f"- Year: {p['year']}\n")
            lines.append(f"- Authors: {p['authors']}\n")
            lines.append(f"- Keywords: {', '.join(map(str, p['keywords']))}\n")
            lines.append("Summary:\n")
            for s in p["summary"]:
                lines.append(f"> {s}\n")
            lines.append("\n---\n")

        # Explicit UTF-8 so non-ASCII titles/authors do not depend on the
        # platform default encoding; write() because the payload is a single
        # string (writelines() would iterate it character by character).
        with open(path, "w", encoding="utf-8") as f:
            f.write("\n".join(lines))

        return path


Writing src/tools/report_writer.py


In [19]:
%%writefile src/agent.py
import json
from src.tools.retriever import Retriever
from src.tools.vectorstore import VectorStore
from src.tools.summarizer_extractive import ExtractiveSummarizer
from src.tools.keywords import KeywordExtractor
from src.tools.tablegen import extract_method_and_dataset
from src.tools.report_writer import ReportWriter

class AutoLitSurveyAgent:
    """Runs the survey pipeline: retrieve, rank, summarize, tag, and report."""

    def __init__(self, csv_path="data/abstracts.csv"):
        """Build all pipeline components and fit the keyword model.

        The keyword extractor is fitted on ``title + " " + abstract`` of
        every paper the retriever loaded from ``csv_path``.
        """
        self.retriever = Retriever(csv_path)
        self.vector = VectorStore()
        self.summarizer = ExtractiveSummarizer()
        self.kx = KeywordExtractor()
        docs = [p.title + " " + p.abstract for p in self.retriever.papers]
        # Skip fitting on an empty corpus; there is nothing to learn from and
        # run() never extracts keywords when no papers are available.
        if docs:
            self.kx.fit(docs)
        self.report = ReportWriter()

    def run(self, query):
        """Produce a literature survey for ``query``.

        Steps: retrieve candidate papers, rank them by embedding similarity
        of their abstracts to the query, then summarize each paper, extract
        keywords and method/dataset tags, and write the Markdown report.

        Returns:
            ``(papers, topics, path)`` where ``papers`` is a list of dicts
            (``paper_id``, ``title``, ``year``, ``authors``, ``keywords``,
            ``summary``, ``method``, ``dataset``) in ranked order, ``topics``
            holds up to 10 distinct keywords in first-seen order, and
            ``path`` is the report path returned by the report writer.
        """
        papers = self.retriever.retrieve(query)
        if not papers:
            # Nothing to rank or summarize: still emit an (empty) report so
            # the return shape stays the same for callers.
            path = self.report.write(query, [], [])
            return [], [], path

        items = [
            (p.abstract, {"paper_id":p.paper_id, "title":p.title, "text":p.abstract})
            for p in papers
        ]

        self.vector.index_texts(items)
        ranked = self.vector.search(query, k=len(papers))

        paper_map = {p.paper_id:p for p in papers}
        seen_ids = set()
        selected = []

        # Keep ranked order but drop repeated paper ids.
        for m in ranked:
            pid = m["paper_id"]
            if pid in seen_ids:
                continue
            seen_ids.add(pid)
            selected.append(paper_map[pid])

        final = []
        topics_all = []

        for p in selected:
            summary = self.summarizer.summarize(p.abstract, top_k=2)
            keywords = self.kx.extract(p.title + " " + p.abstract)
            topics_all.extend(keywords)

            method, dataset = extract_method_and_dataset(p.abstract)

            final.append({
                "paper_id": p.paper_id,
                "title": p.title,
                "year": p.year,
                "authors": p.authors,
                "keywords": keywords,
                "summary": summary,
                "method": method,
                "dataset": dataset
            })

        # dict.fromkeys de-duplicates while preserving first-seen order.
        topics = list(dict.fromkeys(topics_all))[:10]

        path = self.report.write(query, final, topics)
        return final, topics, path


Overwriting src/agent.py


In [20]:
# create sample CSV
import os

import pandas as pd

data = [
    {"paper_id":"1","title":"FastNet","abstract":"We propose FastNet using CIFAR-10 dataset for image classification.","year":2020,"authors":"A"},
    {"paper_id":"2","title":"GNN Molecules","abstract":"We use graph neural networks to predict molecular properties using QM9 dataset.","year":2019,"authors":"B"},
    {"paper_id":"3","title":"Contrastive Learning","abstract":"We present a self-supervised contrastive learning method for representation learning.","year":2021,"authors":"C"},
]
# to_csv does not create missing parent directories, so make sure data/
# exists; this keeps the cell runnable on a fresh kernel.
os.makedirs("data", exist_ok=True)
pd.DataFrame(data).to_csv("data/abstracts.csv", index=False)

# Run the full pipeline on the sample corpus.
from src.agent import AutoLitSurveyAgent
agent = AutoLitSurveyAgent("data/abstracts.csv")

papers, topics, report_path = agent.run("learning")

print("Report:", report_path)
print("Topics:", topics)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Report: outputs/report.md
Topics: ['learning', 'contrastive', 'method', 'present', 'representation']
