<a href="https://colab.research.google.com/github/betamaan/Final/blob/main/file.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
### requirements

In [None]:
langchain
langchain-core

langchain-openai
openai

langchain-anthropic

langchain-google-genai
google-generativeai

langchain-huggingface
transformers
huggingface-hub

python-dotenv

numpy
scikit-learn

# Needed by the pipeline cell's imports (langchain_community, langchain_text_splitters,
# langchain_ollama, matplotlib); pypdf is the PDF backend used by PyPDFLoader.
langchain-community
langchain-text-splitters
langchain-ollama
pypdf
matplotlib


All the agents are included together in the single code cell below.

In [None]:
import os
import re
import json
import time
import matplotlib.pyplot as plt
from pathlib import Path

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama import ChatOllama
from dotenv import load_dotenv

# Load environment settings through python-dotenv before any client is created.
load_dotenv()

# One chat model and one plain-string parser are shared by every agent chain below.
model = ChatOllama(model="mistral-large:123b")
parser = StrOutputParser()

# Reference point for the per-agent and overall timing reports.
t_start = time.time()

# Every artifact produced by the agents is written under ./results.
Path("results").mkdir(parents=True, exist_ok=True)

# ========== AGENT 1: Citation Numbers ==========
# Asks the model for citation numbers chunk by chunk, then keeps only numbers that
# literally occur as "[n]" or "(n)" somewhere in the PDF text.
print("\n--- Agent 1: Extracting Citation Numbers ---")
# NOTE(review): absolute, machine-specific path — consider moving it to a config value.
loader = PyPDFLoader("/home/skumar/Langchain/file/paper81.pdf")
docs = loader.load()
splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
chunks = splitter.split_documents(docs)

citation_prompt = PromptTemplate(
    template="""
Extract ONLY citation numbers from the academic text.

Citations look like [1], [2], (3), (Smith et al., 2021), [Touvron et al., 2023], (Research, 2022), [2024], (2011)

Return a plain JSON list of integers like:
[1, 2, 3]

Do not return extra text or markdown.

Text:
{text}
""",
    input_variables=["text"]
)
chain = citation_prompt | model | parser


def _to_citation_ints(raw):
    """Reduce a decoded model reply to a flat list of ints.

    The prompt asks for a JSON list of integers, but the reply may decode to
    strings, nested lists, a bare number or an object. Only integer values are
    kept; strings are scanned with the same 1-4 digit pattern used by the
    non-JSON fallback; everything else is dropped.
    """
    items = raw if isinstance(raw, (list, tuple)) else [raw]
    numbers = []
    for item in items:
        if isinstance(item, bool):
            # bool is a subclass of int; true/false are never citation numbers.
            continue
        if isinstance(item, int):
            numbers.append(item)
        elif isinstance(item, float) and item.is_integer():
            numbers.append(int(item))
        elif isinstance(item, str):
            numbers.extend(int(tok) for tok in re.findall(r"\b\d{1,4}\b", item))
        elif isinstance(item, (list, tuple)):
            numbers.extend(_to_citation_ints(item))
    return numbers


all_citation_numbers = set()

for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1} (citations)")
    try:
        result = chain.invoke({"text": chunk.page_content})
        try:
            numbers = _to_citation_ints(json.loads(result))
        except json.JSONDecodeError:
            # Reply was not valid JSON: fall back to every 1-4 digit token in it.
            numbers = [int(tok) for tok in re.findall(r"\b\d{1,4}\b", result)]
        all_citation_numbers.update(numbers)
    except Exception as e:
        # Still best-effort per chunk, but report the failure instead of hiding it.
        print(f"Error in chunk {i+1}: {e}")

# Discard values outside the accepted citation range (1..300).
candidate_citations = sorted(n for n in all_citation_numbers if 1 <= n <= 300)

# Keep a candidate only if "[n]" or "(n)" appears verbatim in at least one chunk.
verified_citations = set()
for n in candidate_citations:
    marker = re.compile(rf"\[{n}\]|\({n}\)")
    if any(marker.search(doc_chunk.page_content) for doc_chunk in chunks):
        verified_citations.add(n)

filtered_citation_numbers = sorted(verified_citations)

with open("results/citation_numbers.json", "w") as f:
    json.dump(filtered_citation_numbers, f)

# Persist the chunk text so later agents can work from plain dicts.
with open("results/chunks.json", "w") as f:
    json.dump([{"page_content": chunk.page_content} for chunk in chunks], f)

print("Total citations found:", len(filtered_citation_numbers))
t1 = time.time()
print(f"Agent 1 time: {t1 - t_start:.2f} sec")

# ========== AGENT 2: Citation Authors ==========
print("\n--- Agent 2: Extracting Citation Authors ---")

# Work from Agent 1's saved artifacts; each chunk is now a dict with a
# "page_content" key instead of a loader document.
with open("results/citation_numbers.json") as f:
    citation_numbers = json.load(f)
with open("results/chunks.json") as f:
    chunks = json.load(f)

author_prompt = PromptTemplate(
    input_variables=["text"],
    template="""
You are given a section of an academic paper. Extract a list of properly formatted citation entries.

Each entry must have:
- The citation number
- The full author list

Format each citation like this:
Citation No. 1: Kumar R., Sharma V.

Return as JSON list:
[
  {{"citation_no": 1, "author": "Kumar R., Sharma V."}},
  ...
]

Only include entries with author. Do not return anything else.

Text:
{text}
""",
)

chain = author_prompt | model | parser
final_citations = {}

for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1} (authors)")
    try:
        result = chain.invoke({"text": chunk["page_content"]})
        # Greedy, multi-line match: from the first "[" to the last "]" of the reply.
        match = re.search(r"\[.*\]", result.strip(), re.DOTALL)
        if match:
            data = json.loads(match.group())
            for entry in data:
                cnum = entry.get("citation_no")
                author = entry.get("author")
                # Only citations verified by Agent 1, and only with a non-empty author.
                if cnum not in citation_numbers or not author:
                    continue
                # Drop placeholder answers.
                if "not available" in author.lower() or "n/a" in author.lower():
                    continue
                final_citations[int(cnum)] = author
    except Exception as e:
        print(f"Error in chunk {i+1}: {e}")

with open("results/clean_citations.json", "w") as f:
    json.dump(final_citations, f, indent=2)

print("\nExtracted Citation Authors:")
for cnum, author_names in sorted(final_citations.items()):
    print(f"Citation {cnum}: {author_names}")

t2 = time.time()
print(f"Agent 2 time: {t2 - t1:.2f} sec")

# ========== AGENT 3: Citation Years + Timeline ==========
# Maps each verified citation number to a four-digit year string, or "Unknown".
print("\n--- Agent 3: Extracting Citation Years ---")

year_prompt = PromptTemplate(
    template="""
You are given a chunk of an academic research paper.

Your task is to find the publication year for each in-text citation.
Only return years if they are clearly associated with a citation.

Return the result in this JSON format:
[
  {{"citation_no": 1, "year": "2023"}},
  {{"citation_no": 2, "year": "2020"}}
]

Do not include citations without a year. Skip missing or unclear entries.

Text:
{text}
""",
    input_variables=["text"]
)

chain = year_prompt | model | parser
citation_years = {}

for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1} (years)")
    try:
        result = chain.invoke({"text": chunk["page_content"]})
        # Greedy, multi-line match: from the first "[" to the last "]" of the reply.
        match = re.search(r"\[.*\]", result.strip(), re.DOTALL)
        if not match:
            continue
        data = json.loads(match.group())

        for entry in data:
            cnum = entry.get("citation_no")
            year = entry.get("year")

            if cnum not in citation_numbers:
                continue

            # Normalise to text so "2023", 2023 and " 2023\n" are treated alike.
            year_text = str(year).strip() if year else ""
            if re.fullmatch(r"(19|20)\d{2}", year_text):
                citation_years[int(cnum)] = year_text
            else:
                # Placeholder only: never replace a valid year that an earlier
                # chunk already supplied for this citation.
                citation_years.setdefault(int(cnum), "Unknown")

    except Exception as e:
        print(f"Error in chunk {i+1}: {e}")

# Every verified citation gets an entry, even if no chunk yielded a year for it.
for cnum in citation_numbers:
    citation_years.setdefault(int(cnum), "Unknown")

with open("results/years.json", "w") as f:
    json.dump(citation_years, f, indent=2)

print(f"\nTotal citation years extracted: {len(citation_years)}")

# Timeline plotting
def draw_basic_timeline(years_json_path, figSize=(15, 10)):
    """Plot citations on a horizontal year axis and save the figure as a JPEG.

    Args:
        years_json_path: path to a JSON object mapping citation number to year.
            Entries whose year is not purely digits (e.g. "Unknown") are skipped.
        figSize: matplotlib figure size in inches, as (width, height).

    Returns:
        None. The image is written next to ``years_json_path`` as
        ``Basic_Citations_Timeline.jpeg`` and the figure is shown. If no entry has
        a numeric year, a message is printed and nothing is drawn or saved.
    """
    with open(years_json_path, 'r') as f:
        years_map = json.load(f)

    citations_with_years = [
        {"number": num_str, "year": int(year_str)}
        for num_str, year_str in years_map.items()
        if str(year_str).isdigit()
    ]

    # min()/max() below need at least one dated citation.
    if not citations_with_years:
        print("No citations with a numeric year; timeline not drawn.")
        return

    citations_with_years.sort(key=lambda x: x['year'])

    fig, ax = plt.subplots(figsize=figSize)
    # Pad the axis by one year on each side.
    min_year = min(c['year'] for c in citations_with_years) - 1
    max_year = max(c['year'] for c in citations_with_years) + 1

    ax.hlines(0, min_year, max_year, color='gray', linestyle='-', linewidth=1.5)
    ax.set_xticks(range(min_year, max_year + 1))
    ax.tick_params(axis='x', rotation=45)
    ax.set_xlim(min_year, max_year)
    ax.yaxis.set_visible(False)
    ax.spines[['left', 'right', 'top']].set_visible(False)
    ax.spines['bottom'].set_linewidth(1.5)

    y_offset_factor = 0.1
    # year -> number of labels already placed for that year
    y_positions = {}

    for citation in citations_with_years:
        year = citation['year']
        label = f"#{citation['number']}"
        stacked = y_positions.get(year, 0)

        # Labels for one year alternate above/below the axis and move outward.
        # Odd years start one level further out than even years.
        # NOTE(review): the previous expressions contained `0.5 // 2`, which is
        # 0.0 by operator precedence, so the first label of an even year lands
        # at y = 0. The resulting positions are kept unchanged — confirm intent.
        level = stacked if year % 2 == 0 else stacked + 1
        direction = 1 if stacked % 2 == 0 else -1
        y_pos = y_offset_factor * (level * direction)
        y_positions[year] = stacked + 1

        ax.plot(year, 0, 'o', color='darkblue', markersize=6)
        ax.plot([year, year], [0, y_pos], color='skyblue', linestyle='--', linewidth=0.8)

        ax.annotate(
            label,
            xy=(year, y_pos),
            xytext=(year, y_pos + (0.02 if y_pos > 0 else -0.02)),
            fontsize=9,
            ha='center',
            va='bottom' if y_pos > 0 else 'top',
            bbox=dict(boxstyle="round,pad=0.2", fc="white", ec="gray", alpha=0.8),
            arrowprops=dict(arrowstyle="-", color='gray', linewidth=0.5)
        )

    plt.title("Citation Timeline", fontsize=14, pad=20)
    plt.tight_layout()

    # Save before showing so the file does not depend on the figure surviving show().
    try:
        output_path = Path(years_json_path).with_name("Basic_Citations_Timeline.jpeg")
        fig.savefig(output_path, format="jpeg", bbox_inches="tight", dpi=300)
        print(f"Timeline image saved to {output_path}")
    except Exception as e:
        print(f"Could not save image: {e}")

    plt.show()

# Render the timeline from Agent 3's output, then report timings.
draw_basic_timeline("results/years.json")
t3 = time.time()
print(f"Agent 3 time: {t3 - t2:.2f} sec")
# Agent 4 still runs after this point, so this total covers Agents 1-3 only.
print(f"\n Agents 1-3 completed in {t3 - t_start:.2f} seconds.")


# ========== AGENT 4: Citation Summaries and Clustering ==========
print("\n--- Agent 4: Summarizing and Clustering Citations ---")

from sklearn.cluster import KMeans
from langchain_ollama import OllamaEmbeddings
from collections import defaultdict

# Reload the verified citation numbers and the chunk texts saved by Agent 1.
with open("results/citation_numbers.json") as f:
    citation_numbers = json.load(f)
with open("results/chunks.json") as f:
    chunks = json.load(f)

summary_prompt = PromptTemplate(
    input_variables=["text"],
    template="""
Given this chunk of a research paper, generate a unique 2-line summary for each citation.

Format as a JSON list:
[
  {{"citation_no": 1, "summary": "This study explored ..."}},
  {{"citation_no": 2, "summary": "Authors evaluated ..."}}
]

Do not repeat the same summary.

Text:
{text}
""",
)

chain = summary_prompt | model | parser

# citation number -> first acceptable summary seen for it
citation_summaries = {}

for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1} (summaries)")
    try:
        result = chain.invoke({"text": chunk["page_content"]})
        # Greedy, multi-line match: from the first "[" to the last "]" of the reply.
        match = re.search(r"\[.*\]", result.strip(), re.DOTALL)
        if match is None:
            continue
        data = json.loads(match.group())
        for entry in data:
            cnum = entry.get("citation_no")
            summary = entry.get("summary")
            # Only citations verified by Agent 1, and only with a non-empty summary.
            if cnum not in citation_numbers or not summary:
                continue
            # Drop placeholder answers.
            if "not available" in summary.lower() or "n/a" in summary.lower():
                continue
            # First summary wins; later chunks do not replace it.
            if int(cnum) in citation_summaries:
                continue
            citation_summaries[int(cnum)] = summary.strip()
    except Exception as e:
        print(f"Error in chunk {i+1}: {e}")

# Save summaries
with open("results/summaries.json", "w") as f:
    json.dump(citation_summaries, f, indent=2)

# Embedding and clustering
citation_ids = list(citation_summaries.keys())
citation_texts = list(citation_summaries.values())

# Fail with a clear message instead of an opaque error from the embedder or KMeans.
if len(citation_texts) < 2:
    raise ValueError(
        f"Need at least 2 citation summaries to cluster, got {len(citation_texts)}."
    )

embedding_model = OllamaEmbeddings(model="nomic-embed-text:latest")
embeddings = embedding_model.embed_documents(citation_texts)

# Elbow method: record the KMeans inertia for each candidate k.
# KMeans requires n_clusters <= n_samples, so the scan stops at the number of
# summaries when there are fewer than 10 of them.
k_range = range(2, min(10, len(embeddings)) + 1)
inertias = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(embeddings)
    inertias.append(kmeans.inertia_)

def find_elbow_point(inertias, k_start=2):
    """Pick the elbow k from inertias measured at consecutive cluster counts.

    Args:
        inertias: inertia values where ``inertias[j]`` belongs to
            ``k = k_start + j``.
        k_start: cluster count of the first inertia value. Defaults to 2, which
            matches the scan ``range(2, 11)`` used in this file.

    Returns:
        The k at which the drop in inertia slows down the most, i.e. where the
        second difference ``I[k-1] - 2*I[k] + I[k+1]`` is largest. With fewer
        than three values no second difference exists and ``k_start`` is returned.
    """
    if len(inertias) < 3:
        return k_start

    # drops[j]: improvement gained by going from k_start + j to k_start + j + 1.
    drops = [left - right for left, right in zip(inertias, inertias[1:])]
    # curvature[j] is centred on inertias[j + 1], hence the "+ 1" below.
    curvature = [left - right for left, right in zip(drops, drops[1:])]
    return k_start + curvature.index(max(curvature)) + 1

optimal_k = find_elbow_point(inertias)
print(f"Optimal k (by elbow method): {optimal_k}")

# Cluster the summary embeddings with the chosen k.
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
labels = kmeans.fit_predict(embeddings)

# cluster index -> [(citation id, summary text), ...]
clusters_raw = defaultdict(list)
for idx, cluster_idx in enumerate(labels):
    clusters_raw[cluster_idx].append((citation_ids[idx], citation_texts[idx]))

# One (label, text) pair per cluster; each summary is prefixed with "[id]".
cluster_summaries = []
for i, entries in clusters_raw.items():
    cluster_text = "\n".join([f"[{cid}] {text}" for cid, text in entries])
    cluster_summaries.append((f"Cluster {int(i) + 1}", cluster_text))

# Subgrouping via LLM
cluster_prompt = PromptTemplate(
    template="""
You are a helpful assistant. You are given a group of research summaries, each prefixed by a citation number like [22].

Your task is to analyze this group and organize it into labeled subgroups based on theme, technique, or topic.
Return the result as a JSON object with structure like:

{{
  "Main Theme of This Cluster": {{
    "Subgroup A": [22, 23],
    "Subgroup B": [24, 25]
  }}
}}

Only use citation numbers in the output. Do not include summaries or explanations.

Cluster label: {label}
Entries:
{text}
""",
    input_variables=["label", "text"]
)

# Build the chain once instead of on every iteration.
cluster_chain = cluster_prompt | model | parser
json_decoder = json.JSONDecoder()

final_clusters = {}
for label, cluster_text in cluster_summaries:
    try:
        result = cluster_chain.invoke({"label": label, "text": cluster_text})
        start = result.find("{")
        if start == -1:
            raise ValueError("no JSON object found in model reply")
        # raw_decode parses the first JSON value at `start` and ignores whatever
        # follows it (closing code fences, trailing commentary), which would make
        # json.loads fail with "Extra data".
        parsed, _ = json_decoder.raw_decode(result, start)
        final_clusters[label] = parsed
    except Exception as e:
        print(f"Failed to process {label}: {e}")

with open("results/hybrid_llm_embedding_clusters.json", "w") as f:
    json.dump(final_clusters, f, indent=2)
print("Hybrid clustering complete.")

### ========== Output Representation ==========

# D3 Hierarchy Builder
def build_hierarchy(data):
    """Turn the per-cluster grouping into a nested name/children tree.

    Args:
        data: mapping of cluster label to ``{theme: {subgroup: [citation, ...]}}``.

    Returns:
        ``{"name": "Root", "children": [...]}`` with one child per cluster, one
        grandchild per subgroup and one ``"Citation <n>"`` leaf per citation.
        Theme names themselves do not appear in the tree.

    Every theme of a cluster contributes its subgroups (not only the first one).
    A cluster with no themes yields a node without children, and a theme that maps
    straight to a list of citations is treated as a single subgroup named after
    the theme.
    """
    children = []
    for cluster, structure in data.items():
        cluster_node = {"name": cluster, "children": []}
        themes = structure.items() if isinstance(structure, dict) else []
        for theme, subgroups in themes:
            if isinstance(subgroups, dict):
                named_groups = subgroups.items()
            elif isinstance(subgroups, (list, tuple)):
                named_groups = [(theme, subgroups)]
            else:
                # Unusable shape for this theme; skip it rather than abort the tree.
                continue
            for subgroup, citations in named_groups:
                if not isinstance(citations, (list, tuple)):
                    citations = [citations]
                leaves = [{"name": f"Citation {c}"} for c in citations]
                cluster_node["children"].append({"name": subgroup, "children": leaves})
        children.append(cluster_node)
    return {"name": "Root", "children": children}

# Convert the cluster grouping to a tree and store it for the D3 page to load.
hierarchy = build_hierarchy(final_clusters)

with open("results/d3_citation_tree.json", "w") as f:
    f.write(json.dumps(hierarchy, indent=2))

print("D3-compatible JSON saved to results/d3_citation_tree.json")




For visualization (in HTML format):
the page below renders `d3_citation_tree.json` as a D3.js tree.

In [None]:
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <title>Citation Cluster Visualization</title>
  <style>
    .node circle {
      fill: #1f77b4;
    }

    .node text {
      font: 12px sans-serif;
    }

    .link {
      fill: none;
      stroke: #ccc;
      stroke-width: 2px;
    }
  </style>
</head>
<body>
  <h2>Citation Clustering Hierarchy (D3.js)</h2>
  <svg width="1800" height="800"></svg>

  <script src="https://d3js.org/d3.v7.min.js"></script>
  <script>
    // Read the drawing area from the <svg> element's own width/height attributes.
    const svg = d3.select("svg"),
          width = +svg.attr("width"),
          height = +svg.attr("height");

    // Offset the drawing so labels of the outer nodes have room.
    const g = svg.append("g").attr("transform", "translate(120, 80)");

    // Horizontal tree: layout x is used as screen y and layout y as screen x.
    const tree = d3.tree().size([height - 100, width - 300]);

    // The JSON is fetched relative to this page's URL.
    d3.json("d3_citation_tree.json").then(data => {
      const root = d3.hierarchy(data);
      tree(root);

      // Draw links
      g.selectAll(".link")
          .data(root.links())
          .enter().append("path")
          .attr("class", "link")
          .attr("d", d3.linkHorizontal()
                      .x(d => d.y)
                      .y(d => d.x));

      // Draw nodes
      const node = g.selectAll(".node")
          .data(root.descendants())
          .enter().append("g")
          .attr("class", "node")
          .attr("transform", d => `translate(${d.y},${d.x})`);

      node.append("circle")
          .attr("r", 6);

      // Inner nodes are labelled on the left, leaves on the right.
      node.append("text")
          .attr("dy", 3)
          .attr("x", d => d.children ? -10 : 10)
          .style("text-anchor", d => d.children ? "end" : "start")
          .text(d => d.data.name);
    }).catch(error => {
      // Show load/parse failures on the page instead of leaving a blank SVG.
      console.error("Failed to load d3_citation_tree.json:", error);
      d3.select("body").append("p")
          .style("color", "crimson")
          .text("Could not load d3_citation_tree.json: " + error.message);
    });
  </script>
</body>
</html>




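<!-- To view the page, serve it over HTTP from the folder that contains both this
HTML file and d3_citation_tree.json (the notebook writes the JSON to results/):
cd your_project
python3 -m http.server
then open http://localhost:8000 in a browser
 -->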
