<a href="https://colab.research.google.com/github/betamaan/Demo3/blob/main/file.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
### requirements

In [None]:
langchain
langchain-core

langchain-openai
openai

langchain-anthropic

langchain-google-genai
google-generativeai

langchain-huggingface
transformers
huggingface-hub

python-dotenv

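numpy
scikit-learn

langchain-community
langchain-text-splitters
langchain-ollama
pypdf
tqdm
tabulate
matplotlib
plotly
graphviz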


In [None]:
#### Agent 1

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama import ChatOllama
from dotenv import load_dotenv
import json
import re
import os

# Load settings from a local .env file, if present.
load_dotenv()

# Read the PDF and split it into overlapping chunks sized for the LLM.
loader = PyPDFLoader("/home/skumar/Langchain/file/paper81.pdf")
docs = loader.load()

splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
chunks = splitter.split_documents(docs)

# Local Ollama chat model; StrOutputParser reduces the reply to a plain string.
model = ChatOllama(model="mistral:latest")
parser = StrOutputParser()

citation_prompt = PromptTemplate(
    template="""
Extract ONLY citation numbers from the academic text.

Citations look like [1], [2], (3), etc.

Return a plain JSON list of integers like:
[1, 2, 3]

Do not return extra text or markdown.

Text:
{text}
""",
    input_variables=["text"]
)

chain = citation_prompt | model | parser

all_citation_numbers = set()

for i, chunk in enumerate(chunks):
    try:
        result = chain.invoke({"text": chunk.page_content})
    except Exception as e:
        # Report the failure instead of silently dropping the chunk, so an
        # unreachable model does not look like "no citations found".
        print(f"Error in chunk {i}: {e}")
        continue

    try:
        numbers = json.loads(result)
    except json.JSONDecodeError:
        numbers = None

    if isinstance(numbers, list):
        # Keep only scalar items; nested structures are unhashable and are
        # not citation numbers anyway.
        numbers = [n for n in numbers if isinstance(n, (int, str))]
    else:
        # Reply was not a JSON list: fall back to scraping 1-4 digit numbers
        # from the raw text.
        numbers = [int(n) for n in re.findall(r"\b\d{1,4}\b", result)]
    all_citation_numbers.update(numbers)

# Keep only plausible citation numbers (1..300), as sorted integers.
filtered_citation_numbers = sorted([
    int(n) for n in all_citation_numbers
    if str(n).isdigit() and 1 <= int(n) <= 300
])

# Persist the results for the downstream agent cells.
os.makedirs("results", exist_ok=True)
with open("results/citation_numbers.json", "w") as f:
    json.dump(filtered_citation_numbers, f)

with open("results/chunks.json", "w") as f:
    json.dump([{"page_content": chunk.page_content} for chunk in chunks], f)

print("Total citation numbers found:", len(filtered_citation_numbers))
print("Saved to results/citation_numbers.json")


In [None]:
!pip install tqdm
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama import ChatOllama
from dotenv import load_dotenv
from tqdm import tqdm
import json
import os

# Pull settings from a local .env file, if one exists.
load_dotenv()

# Read the PDF and break it into overlapping pieces the LLM can digest.
loader = PyPDFLoader("/home/skumar/Langchain/file/paper66.pdf")
docs = loader.load()
splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=2000)
chunks = splitter.split_documents(docs)

# Local Ollama chat model; its reply is reduced to a plain string.
parser = StrOutputParser()
model = ChatOllama(model="mistral:latest")

citation_prompt = PromptTemplate(
    input_variables=["text"],
    template="""
Extract all citation references from the academic text.

Citations can look like:
- [1], [2]
- (Smith et al., 2021)
- [Touvron et al., 2023]
- (Research, 2022)

Return a plain JSON list of citation strings that exactly match what’s in the text, like:
[
  "[Touvron et al., 2023]",
  "(Research, 2022)",
  "[1]",
  "[2]"
]

Do not add extra commentary or markdown.

Text:
{text}
""",
)
chain = citation_prompt | model | parser

# Ask the model for citations chunk by chunk, de-duplicating as we go.
all_citation_strings = set()

print("🔍 Extracting citations from chunks...")
for i, chunk in enumerate(tqdm(chunks, desc="Processing chunks")):
    try:
        result = chain.invoke({"text": chunk.page_content})
        citations = []
        for candidate in json.loads(result):
            if not isinstance(candidate, str):
                continue
            trimmed = candidate.strip()
            # Anything of two characters or fewer cannot be a real citation.
            if len(trimmed) > 2:
                citations.append(trimmed)
        all_citation_strings.update(citations)
    except Exception as e:
        print(f"Error in chunk {i}: {e}")

# Keep only citations that literally occur in the chunk text.
print("🔍 Verifying extracted citations exist in text...")
all_text = " ".join(chunk.page_content for chunk in chunks)
verified_citations = {cite for cite in all_citation_strings if cite in all_text}

# Number the verified citations 1..N in sorted order.
sorted_citations = sorted(verified_citations)
citation_number_map = dict(enumerate(sorted_citations, start=1))
citation_numbers = list(citation_number_map)

# Write the numbers, the number -> citation map, and the chunks to disk.
os.makedirs("results", exist_ok=True)

for out_path, payload in (
    ("results/citation_numbers.json", citation_numbers),
    ("results/citation_map.json", citation_number_map),
    ("results/chunks.json", [{"page_content": chunk.page_content} for chunk in chunks]),
):
    with open(out_path, "w") as f:
        json.dump(payload, f, indent=2)

# Final summary
print("Citation extraction complete.")
print(f"Total unique citations found: {len(citation_number_map)}")
print("Results saved to: results/citation_numbers.json and citation_map.json")
print("Sample citation map:", list(citation_number_map.items())[:5])


In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama import ChatOllama
from dotenv import load_dotenv
import json
import re
import os

# Load settings from a local .env file, if present.
load_dotenv()

# Read the PDF and split it into overlapping chunks sized for the LLM.
loader = PyPDFLoader("/home/skumar/Langchain/file/paper81.pdf")
docs = loader.load()

splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
chunks = splitter.split_documents(docs)

# Local Ollama chat model; StrOutputParser reduces the reply to a plain string.
model = ChatOllama(model="mistral:latest")
parser = StrOutputParser()

citation_prompt = PromptTemplate(
    template="""
Extract ONLY citation numbers from the academic text.

Citations look like [1], [2], (3), etc.

Return a plain JSON list of integers like:
[1, 2, 3]

Do not return extra text or markdown.

Text:
{text}
""",
    input_variables=["text"]
)

chain = citation_prompt | model | parser

all_citation_numbers = set()

for i, chunk in enumerate(chunks):
    try:
        result = chain.invoke({"text": chunk.page_content})
    except Exception as e:
        # Report the failure instead of silently dropping the chunk.
        print(f"Error in chunk {i}: {e}")
        continue

    try:
        numbers = json.loads(result)
    except json.JSONDecodeError:
        numbers = None

    if isinstance(numbers, list):
        # Keep only scalar items; nested structures are unhashable.
        numbers = [n for n in numbers if isinstance(n, (int, str))]
    else:
        # Reply was not a JSON list: scrape 1-4 digit numbers from the raw text.
        numbers = [int(n) for n in re.findall(r"\b\d{1,4}\b", result)]
    all_citation_numbers.update(numbers)

# Plausible citation numbers (1..300) reported by the model.
candidate_citations = sorted([
    int(n) for n in all_citation_numbers
    if str(n).isdigit() and 1 <= int(n) <= 300
])

# Keep a candidate only if "[n]" or "(n)" literally occurs in some chunk.
verified_citations = set()

for n in candidate_citations:
    marker = re.compile(rf"\[{n}\]|\({n}\)")
    if any(marker.search(chunk.page_content) for chunk in chunks):
        verified_citations.add(n)

filtered_citation_numbers = sorted(verified_citations)

# Persist the results for the downstream agent cells.
os.makedirs("results", exist_ok=True)
with open("results/citation_numbers.json", "w") as f:
    json.dump(filtered_citation_numbers, f)

with open("results/chunks.json", "w") as f:
    json.dump([{"page_content": chunk.page_content} for chunk in chunks], f)

print("Total citation numbers found:", len(filtered_citation_numbers))
print("Saved to results/citation_numbers.json")


In [None]:
### Agent2


from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama import ChatOllama
import json
import re
import os

# Load model
model = ChatOllama(model="mistral:latest")
parser = StrOutputParser()

# Load the citation numbers and text chunks saved by the Agent 1 cell.
with open("results/citation_numbers.json") as f:
    citation_numbers = json.load(f)

with open("results/chunks.json") as f:
    chunks = json.load(f)

# Prompt asking for the author list of each numbered citation.
author_prompt = PromptTemplate(
    template="""
You are given a section of an academic paper. Extract a list of properly formatted citation entries.

Each entry must have:
- The citation number
- The full author list

Format each citation like this:
Citation No. 1: Kumar R., Sharma V.

Return as JSON list:
[
  {{"citation_no": 1, "author": "Kumar R., Sharma V."}},
  ...
]

Only include entries with author . Do not anything else except this.

Text:
{text}
""",
    input_variables=["text"]
)

chain = author_prompt | model | parser

# citation number -> author string; a later chunk overwrites an earlier one.
final_citations = {}

for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1}")
    try:
        result = chain.invoke({"text": chunk["page_content"]})
        # Take the outermost [...] span so surrounding chatter is ignored.
        match = re.search(r"\[.*\]", result.strip(), re.DOTALL)
        if not match:
            continue
        data = json.loads(match.group())

        for entry in data:
            cnum = entry.get("citation_no")
            author = entry.get("author")
            if (
                cnum in citation_numbers and
                author and
                "not available" not in author.lower() and
                "n/a" not in author.lower()
            ):
                final_citations[int(cnum)] = f"{author}"
    except Exception as e:
        print(f"Error in chunk {i+1}: {e}")

# Save the result. The Agent 3, Agent 4 and Verifier cells read
# results/authors.json, so write that name as well as clean_citations.json.
os.makedirs("results", exist_ok=True)
for out_path in ("results/clean_citations.json", "results/authors.json"):
    with open(out_path, "w") as f:
        json.dump(final_citations, f, indent=2)

print("\n Clean Citations Extracted:")
for cnum in sorted(final_citations):
    print(f"Citation {cnum}: {final_citations[cnum]}")


In [None]:
#### Agent3


from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama import ChatOllama
import json
import re
import os

model = ChatOllama(model="mistral:latest")
parser = StrOutputParser()

# Load the citation numbers and text chunks saved by the Agent 1 cell.
with open("results/citation_numbers.json") as f:
    citation_numbers = json.load(f)

with open("results/chunks.json") as f:
    chunks = json.load(f)

# author_map is not used below, so a missing results/authors.json must not
# stop this cell; load it only when the file exists.
author_map = {}
if os.path.exists("results/authors.json"):
    with open("results/authors.json") as f:
        author_map = json.load(f)

year_prompt = PromptTemplate(
    template="""
You are given a chunk of an academic research paper.

Your task is to find the publication year for each in-text citation.
Only return years if they are clearly associated with a citation.

Return the result in this JSON format:
[
  {{"citation_no": 1, "year": "2023"}},
  {{"citation_no": 2, "year": "2020"}}
]

Do not include citations without a year. Skip missing or unclear entries.

Text:
{text}
""",
    input_variables=["text"]
)

chain = year_prompt | model | parser

# citation number -> year; a later chunk overwrites an earlier one.
citation_years = {}

for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1}")
    try:
        result = chain.invoke({"text": chunk["page_content"]})
        # Take the outermost [...] span so surrounding chatter is ignored.
        match = re.search(r"\[.*\]", result.strip(), re.DOTALL)
        if not match:
            continue
        data = json.loads(match.group())

        for entry in data:
            cnum = entry.get("citation_no")
            year = entry.get("year")
            # Accept only known citation numbers with a four-digit 19xx/20xx year.
            if (
                cnum in citation_numbers and
                year and
                re.match(r"^(19|20)\d{2}$", str(year))
            ):
                citation_years[int(cnum)] = year
    except Exception as e:
        print(f"Error in chunk {i+1}: {e}")

os.makedirs("results", exist_ok=True)
with open("results/years.json", "w") as f:
    json.dump(citation_years, f, indent=2)

print("\n Verified Citation Years:")
for cnum in sorted(citation_years):
    print(f"Citation {cnum}: {citation_years[cnum]}")


In [None]:
### Agent4

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama import ChatOllama
import json
import re
import os

model = ChatOllama(model="mistral:latest")
parser = StrOutputParser()

# Load the citation numbers and text chunks saved by the Agent 1 cell.
with open("results/citation_numbers.json") as f:
    citation_numbers = json.load(f)

with open("results/chunks.json") as f:
    chunks = json.load(f)

# author_map is not used below, so a missing results/authors.json must not
# stop this cell; load it only when the file exists.
author_map = {}
if os.path.exists("results/authors.json"):
    with open("results/authors.json") as f:
        author_map = json.load(f)

summary_prompt = PromptTemplate(
    template="""
Given this chunk of a research paper, generate a unique 2-line summary for each citation.

Format as a JSON list:
[
  {{"citation_no": 1, "summary": "This study explored ..."}},
  {{"citation_no": 2, "summary": "Authors evaluated ..."}}
]

Do not repeat the same summary.

Text:
{text}
""",
    input_variables=["text"]
)

chain = summary_prompt | model | parser

# citation number -> summary; the first summary seen for a citation is kept.
citation_summaries = {}

for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1}")
    try:
        result = chain.invoke({"text": chunk["page_content"]})
        # Take the outermost [...] span so surrounding chatter is ignored.
        match = re.search(r"\[.*\]", result.strip(), re.DOTALL)
        if not match:
            continue
        data = json.loads(match.group())

        for entry in data:
            cnum = entry.get("citation_no")
            summary = entry.get("summary")
            if (
                cnum in citation_numbers and
                summary and
                "not available" not in summary.lower() and
                "n/a" not in summary.lower() and
                cnum not in citation_summaries
            ):
                citation_summaries[int(cnum)] = summary.strip()
    except Exception as e:
        print(f"Error in chunk {i+1}: {e}")

os.makedirs("results", exist_ok=True)
with open("results/summaries.json", "w") as f:
    json.dump(citation_summaries, f, indent=2)

print("\n Citation Summaries Extracted:")
for cnum in sorted(citation_summaries):
    print(f"Citation {cnum}: {citation_summaries[cnum]}")


In [None]:
### Verifier

import json
import os

with open("results/citation_numbers.json") as f:
    citation_numbers = json.load(f)

# The Agent 2 cell saves authors as results/clean_citations.json; fall back to
# that file when results/authors.json has not been produced.
authors_path = "results/authors.json"
if not os.path.exists(authors_path):
    authors_path = "results/clean_citations.json"
with open(authors_path) as f:
    authors = json.load(f)

with open("results/years.json") as f:
    years = json.load(f)

with open("results/summaries.json") as f:
    summaries = json.load(f)

# A citation is "verified" only when author, year and summary are all present.
verified_citations = {}

for cnum in citation_numbers:
    cnum = int(cnum)
    key = str(cnum)  # JSON object keys are always strings
    if key in authors and key in years and key in summaries:
        verified_citations[cnum] = {
            "author": authors[key],
            "year": years[key],
            "summary": summaries[key]
        }

os.makedirs("results", exist_ok=True)
with open("results/verified_citations.json", "w") as f:
    json.dump(verified_citations, f, indent=2)

print("\n Verified Citations (All Conditions Met):")
for cnum in sorted(verified_citations):
    entry = verified_citations[cnum]
    print(f"Citation {cnum}: {entry['author']} ({entry['year']}) - {entry['summary']}")


In [None]:
import json
import csv
from tabulate import tabulate
import os

# Load the verified citation data
with open("results/verified_citations.json") as f:
    verified = json.load(f)

# Prepare tabular data. JSON keys are strings, so sort numerically; a plain
# sorted() would place "10" before "2".
table = []
for cnum in sorted(verified, key=int):
    entry = verified[cnum]
    table.append([
        cnum,
        entry["author"],
        entry["year"],
        entry["summary"]
    ])

# Print table to console
headers = ["Citation No.", "Author(s)", "Year", "Summary"]
print("\n Verified Citation Table:\n")
print(tabulate(table, headers=headers, tablefmt="grid"))

# Save as CSV
os.makedirs("results", exist_ok=True)
csv_file = "results/verified_citations.csv"
with open(csv_file, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(table)

print(f"\n Saved to {csv_file}")


In [None]:
#### Agent 1
# (The cell label must be a comment: a bare "Agent 1" line is a SyntaxError.)


import json
import re
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama import ChatOllama

# Load the citation-number -> summary map.
with open("/home/skumar/Langchain/.venv/results/summaries.json", "r") as f:
    data = json.load(f)

# One "[n] summary" line per citation, so the model can refer to it by number.
items = [(k, v) for k, v in data.items()]
string_list = [f"[{k}] {v}" for k, v in items]

model = ChatOllama(model="mistral:latest")
parser = StrOutputParser()

cluster_prompt = PromptTemplate(
    template="""
You are a helpful assistant. Group the following strings into **semantically meaningful clusters**.

Each string is prefixed with its citation number like [1], [2], etc.

Clusters may be flat (simple lists of citation numbers), or hierarchical (with subcategories like "part a", "subgroup 1.1", etc.).

Your output must be a valid JSON object. Examples of valid formats:

Flat:
{{
  "Cluster 1": [1, 4, 5],
  "Cluster 2": [2, 3]
}}

Hierarchical:
{{
  "Cluster 1": {{
    "part a": [1, 2, 16, 17],
    "part b": [15, 18, 19]
  }},
  "Cluster 2": [3, 4, 5, 6],
  "Cluster 3": {{
    "deep subcluster": {{
      "type x": [22, 39],
      "type y": [40]
    }}
  }}
}}

Only include citation numbers in each list. Do not include summaries or explanations.

Strings:
{text}
""",
    input_variables=["text"]
)

chain = cluster_prompt | model | parser

# All summaries go to the model in a single request.
result = chain.invoke({"text": "\n".join(string_list)})

# Take the outermost {...} span so chatter around the JSON is ignored.
match = re.search(r"\{.*\}", result.strip(), re.DOTALL)
clusters = json.loads(match.group()) if match else {}

def print_clusters(clusters, indent=0):
    """Print a possibly nested cluster mapping as an indented outline.

    Args:
        clusters: Mapping of cluster name to either a nested mapping of the
            same shape or a leaf value (printed as-is after the name).
        indent: Number of leading spaces for this level; each nested level
            adds two more.
    """
    pad = " " * indent
    for name, members in clusters.items():
        if not isinstance(members, dict):
            # Leaf: show the name and its value on one line.
            print(f"{pad}{name}: {members}")
            continue
        # Branch: print the heading, then recurse one level deeper.
        print(f"{pad}{name}:")
        print_clusters(members, indent + 2)

# Show the cluster outline, then persist the raw structure as JSON.
print("Clusters:")
print_clusters(clusters)

with open("results/clustered_citations_nested.json", "w") as out_file:
    out_file.write(json.dumps(clusters, indent=2))


In [None]:
#### clustering


import json
from sklearn.cluster import KMeans
from langchain_ollama import OllamaEmbeddings
from langchain_ollama import ChatOllama
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from collections import defaultdict

# Load the citation-number -> summary map.
with open("/home/skumar/Langchain/.venv/results/summaries.json", "r") as f:
    citation_data = json.load(f)

citation_ids = list(citation_data.keys())
citation_texts = list(citation_data.values())

embedding_model = OllamaEmbeddings(model="nomic-embed-text:latest")  # use "llama3" if you want full LLM embedding
embeddings = embedding_model.embed_documents(citation_texts)

# KMeans raises when asked for more clusters than samples, so cap k at the
# number of summaries available.
k = min(6, len(citation_texts))
kmeans = KMeans(n_clusters=k, random_state=42)
labels = kmeans.fit_predict(embeddings)

# Group (citation id, summary) pairs by their KMeans label.
clusters_raw = defaultdict(list)
for idx, label in enumerate(labels):
    clusters_raw[label].append((citation_ids[idx], citation_texts[idx]))

# One "[id] summary" text block per cluster, named "Cluster <label + 1>".
cluster_summaries = []
for i, entries in clusters_raw.items():
    cluster_text = "\n".join([f"[{cid}] {text}" for cid, text in entries])
    cluster_summaries.append((f"Cluster {i+1}", cluster_text))

model = ChatOllama(model="mistral:latest")
parser = StrOutputParser()

cluster_prompt = PromptTemplate(
    template="""
You are a helpful assistant. You are given a group of research summaries, each prefixed by a citation number like [22].

Your task is to analyze this group and organize it into labeled subgroups based on theme, technique, or topic.
Return the result as a JSON object with structure like:

{{
  "Main Theme of This Cluster": {{
    "Subgroup A": [22, 23],
    "Subgroup B": [24, 25]
  }}
}}

Only use citation numbers in the output. Do not include summaries or explanations.

Cluster label: {label}
Entries:
{text}
""",
    input_variables=["label", "text"]
)

# Build the pipeline once instead of on every loop iteration.
cluster_chain = cluster_prompt | model | parser

final_clusters = {}

for label, cluster_text in cluster_summaries:
    try:
        result = cluster_chain.invoke({"label": label, "text": cluster_text})
        start = result.find("{")
        if start == -1:
            raise ValueError("no JSON object found in model output")
        # raw_decode parses the first JSON value and ignores what follows
        # (closing code fences, commentary), which json.loads would reject.
        parsed = json.JSONDecoder().raw_decode(result[start:])[0]
        final_clusters[label] = parsed
    except Exception as e:
        print(f"Failed to process {label}: {e}")

with open("results/hybrid_llm_embedding_clusters.json", "w") as f:
    json.dump(final_clusters, f, indent=2)
print("Hybrid clustering complete. Output saved to results/hybrid_llm_embedding_clusters.json")


In [None]:
%pip install matplotlib
import json
from sklearn.cluster import KMeans
from langchain_ollama import OllamaEmbeddings
from langchain_ollama import ChatOllama
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from collections import defaultdict

# Load the citation-number -> summary map.
with open("/home/skumar/Langchain/.venv/results/summaries.json", "r") as f:
    citation_data = json.load(f)

citation_ids = list(citation_data.keys())
citation_texts = list(citation_data.values())

embedding_model = OllamaEmbeddings(model="nomic-embed-text:latest")  # use "llama3" if you want full LLM embedding
embeddings = embedding_model.embed_documents(citation_texts)

import matplotlib.pyplot as plt

# Range of k values to test. KMeans raises when k exceeds the number of
# samples, so stop at 10 or the number of summaries, whichever is smaller.
k_range = range(2, min(10, len(citation_texts)) + 1)
inertias = []

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(embeddings)
    inertias.append(kmeans.inertia_)

# Optional: Plot the elbow curve to visualize
plt.figure(figsize=(8, 5))
plt.plot(k_range, inertias, marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia (sum of squared distances)')
plt.title('Elbow Method For Optimal k')
plt.grid(True)
plt.savefig("results/elbow_curve.png")  # Save plot instead of showing it (since this is often run headless)
plt.close()

# Automatically find "elbow point" using slope change (simple heuristic)
def find_elbow_point(inertias, k_min=2):
    """Return the k at the elbow of an inertia curve.

    The elbow is the point where the drop in inertia going into it exceeds
    the drop going out of it by the largest margin (largest second
    difference).

    Args:
        inertias: Inertia values for consecutive k, starting at ``k_min``.
        k_min: The k that ``inertias[0]`` corresponds to.

    Returns:
        The k of the elbow. With fewer than three points no second
        difference exists, so ``k_min`` is returned.
    """
    if len(inertias) < 3:
        return k_min
    # drops[i] is the improvement from point i to point i + 1.
    drops = [a - b for a, b in zip(inertias, inertias[1:])]
    # bends[i] compares the drop into point i + 1 with the drop out of it,
    # so it describes inertias[i + 1], hence the "+ 1" below.
    bends = [a - b for a, b in zip(drops, drops[1:])]
    return bends.index(max(bends)) + 1 + k_min

optimal_k = find_elbow_point(inertias)
print(f"Optimal k (by elbow method): {optimal_k}")


# Final clustering with the k chosen above.
k = optimal_k
kmeans = KMeans(n_clusters=k, random_state=42)
labels = kmeans.fit_predict(embeddings)

# Group (citation id, summary) pairs by their KMeans label.
clusters_raw = defaultdict(list)
for idx, label in enumerate(labels):
    clusters_raw[label].append((citation_ids[idx], citation_texts[idx]))

# One "[id] summary" text block per cluster, named "Cluster <label + 1>".
cluster_summaries = []
for i, entries in clusters_raw.items():
    cluster_text = "\n".join([f"[{cid}] {text}" for cid, text in entries])
    cluster_summaries.append((f"Cluster {i+1}", cluster_text))

model = ChatOllama(model="mistral:latest")
parser = StrOutputParser()

cluster_prompt = PromptTemplate(
    template="""
You are a helpful assistant. You are given a group of research summaries, each prefixed by a citation number like [22].

Your task is to analyze this group and organize it into labeled subgroups based on theme, technique, or topic.
Return the result as a JSON object with structure like:

{{
  "Main Theme of This Cluster": {{
    "Subgroup A": [22, 23],
    "Subgroup B": [24, 25]
  }}
}}

Only use citation numbers in the output. Do not include summaries or explanations.

Cluster label: {label}
Entries:
{text}
""",
    input_variables=["label", "text"]
)

# Build the pipeline once instead of on every loop iteration.
cluster_chain = cluster_prompt | model | parser

final_clusters = {}

for label, cluster_text in cluster_summaries:
    try:
        result = cluster_chain.invoke({"label": label, "text": cluster_text})
        start = result.find("{")
        if start == -1:
            raise ValueError("no JSON object found in model output")
        # raw_decode parses the first JSON value and ignores what follows
        # (closing code fences, commentary), which json.loads would reject.
        parsed = json.JSONDecoder().raw_decode(result[start:])[0]
        final_clusters[label] = parsed
    except Exception as e:
        print(f"Failed to process {label}: {e}")

with open("results/hybrid_llm_embedding_clusters.json", "w") as f:
    json.dump(final_clusters, f, indent=2)
print("Hybrid clustering complete. Output saved to results/hybrid_llm_embedding_clusters.json")


In [None]:
#### Visualization



import json
import plotly.graph_objects as go
import os

# Path to your cluster JSON file
json_path = "results/hybrid_llm_embedding_clusters.json"

# Ensure the file exists
if not os.path.exists(json_path):
    raise FileNotFoundError("The clustering JSON file does not exist.")

# Load the data
with open(json_path, "r") as f:
    data = json.load(f)

# Prepare the treemap hierarchy.
# Labels such as "Subgroup A" or "Citation 22" can repeat across clusters.
# Plotly resolves `parents` against `labels` only when labels are unique, so
# every node gets a unique path-style id and parents refer to those ids.
ids = ["Root"]
labels = ["Root"]
parents = [""]

for cluster, cluster_content in data.items():
    cluster_id = f"Root/{cluster}"
    ids.append(cluster_id)
    labels.append(cluster)
    parents.append("Root")

    for theme, subgroups in cluster_content.items():
        theme_label = f"{cluster} - {theme}"
        theme_id = f"{cluster_id}/{theme}"
        ids.append(theme_id)
        labels.append(theme_label)
        parents.append(cluster_id)

        if isinstance(subgroups, dict):
            for subgroup, citations in subgroups.items():
                subgroup_id = f"{theme_id}/{subgroup}"
                ids.append(subgroup_id)
                labels.append(subgroup)
                parents.append(theme_id)
                for citation in citations:
                    ids.append(f"{subgroup_id}/Citation {citation}")
                    labels.append(f"Citation {citation}")
                    parents.append(subgroup_id)
        elif isinstance(subgroups, list):
            # Theme holds citations directly, with no subgroup level.
            for citation in subgroups:
                ids.append(f"{theme_id}/Citation {citation}")
                labels.append(f"Citation {citation}")
                parents.append(theme_id)

# Create the treemap
fig = go.Figure(go.Treemap(
    ids=ids,
    labels=labels,
    parents=parents,
    marker=dict(colorscale="Blues"),
    root=dict(color="white"),
    branchvalues="total"
))

fig.update_layout(
    title="Citation Clustering Hierarchy",
    margin=dict(t=50, l=25, r=25, b=25),
)

# Save and display
fig.write_html("results/citation_cluster_tree_horizontal.html")
fig.show()


In [None]:
from graphviz import Digraph
import json
import os

# Read the cluster hierarchy written by the clustering cell.
with open("results/hybrid_llm_embedding_clusters.json") as cluster_file:
    data = json.loads(cluster_file.read())

# Top-to-bottom directed graph that will be rendered as a PNG.
dot = Digraph(format='png', comment="Citation Tree Diagram")
dot.attr('graph', rankdir='TB', size='10')

# Recursive function to add nodes and edges
def add_nodes_recursive(parent_id, content, prefix):
    """Attach `content` beneath `parent_id` in the module-level `dot` graph.

    Args:
        parent_id: Node id that the new nodes are linked from.
        content: A dict (each key becomes an orange box and its value is
            processed recursively) or a list (each item becomes a light-blue
            ellipse labelled "[item]"). Any other type adds nothing.
        prefix: String prepended to generated node ids.
    """
    if isinstance(content, dict):
        # Internal level: one box per key, then descend into its value.
        for key, value in content.items():
            # Spaces become underscores; '>' and ':' are dropped from the id.
            safe_key = key.replace(' ', '_').replace('>', '').replace(':', '')
            child_id = f"{prefix}_{safe_key}"
            dot.node(child_id, key, shape='box', style='filled', color='orange')
            dot.edge(parent_id, child_id)
            add_nodes_recursive(child_id, value, prefix=child_id)
    elif isinstance(content, list):
        # Leaf level: one ellipse per citation id.
        for cid in content:
            leaf_id = f"{prefix}_cid_{cid}"
            dot.node(leaf_id, f"[{cid}]", shape='ellipse', style='filled', color='lightblue')
            dot.edge(parent_id, leaf_id)

# Start from clusters: one red box per cluster, then everything beneath it.
for cluster_name, themes in data.items():
    cluster_id = f"cluster_{cluster_name.replace(' ', '_')}"
    dot.node(cluster_id, cluster_name, shape='box', style='filled', color='red')
    add_nodes_recursive(cluster_id, themes, prefix=cluster_id)

# Render the diagram to a file.
# render() saves the DOT source under `output_path` itself and the image as
# `output_path + ".png"`. The path must therefore not be the clusters JSON
# file, or the data that later cells load would be overwritten.
os.makedirs("results", exist_ok=True)
output_path = "results/citation_tree_diagram"
dot.render(output_path, format="png", cleanup=True)  # cleanup removes the DOT source

output_path + ".png"  # Return path for viewing


In [None]:
!pip install nbformat
import json
import plotly.express as px

with open("results/hybrid_llm_embedding_clusters.json") as f:
    data = json.load(f)

# Subgroup and citation names can repeat across clusters. Plotly matches
# `parents` against `names` only when names are unique, so every node gets a
# unique path-style id and parents refer to those ids.
ids = []
labels = []
parents = []

for cluster, structure in data.items():
    ids.append(cluster)
    labels.append(cluster)
    parents.append("")  # root
    # Only the first theme of each cluster is drawn.
    theme = list(structure.keys())[0]
    for subgroup, citations in structure[theme].items():
        subgroup_id = f"{cluster}/{subgroup}"
        ids.append(subgroup_id)
        labels.append(subgroup)
        parents.append(cluster)
        for cid in citations:
            ids.append(f"{subgroup_id}/Citation {cid}")
            labels.append(f"Citation {cid}")
            parents.append(subgroup_id)

fig = px.sunburst(
    ids=ids,
    names=labels,
    parents=parents,
    title="Citation Clusters and Subgroups",
)
fig.show()


In [None]:
#### citation_tree


import json

# Read the cluster hierarchy written by the clustering cell.
with open("results/hybrid_llm_embedding_clusters.json") as cluster_file:
    data = json.loads(cluster_file.read())

def build_hierarchy(data):
    """Convert the cluster mapping into a name/children tree.

    Args:
        data: Mapping of cluster name -> {theme: {subgroup: [citation, ...]}}.
            Only the first theme of each cluster is used.

    Returns:
        A dict ``{"name": "Root", "children": [...]}`` with one child per
        cluster, each holding its subgroups, each holding "Citation <n>" leaves.
    """
    cluster_nodes = []
    for cluster, structure in data.items():
        first_theme = [*structure.keys()][0]
        subgroup_nodes = [
            {
                "name": subgroup,
                "children": [{"name": f"Citation {c}"} for c in citations],
            }
            for subgroup, citations in structure[first_theme].items()
        ]
        cluster_nodes.append({"name": cluster, "children": subgroup_nodes})
    return {"name": "Root", "children": cluster_nodes}

# Build the tree and write it out for the D3 front end.
hierarchy = build_hierarchy(data)

with open("results/d3_citation_tree.json", "w") as tree_file:
    tree_file.write(json.dumps(hierarchy, indent=2))

print("D3-compatible JSON saved to results/d3_citation_tree.json")


In [None]:
import json
from pathlib import Path

# Read the cluster hierarchy written by the clustering cell.
json_path = Path("results/hybrid_llm_embedding_clusters.json")
with json_path.open("r") as cluster_file:
    data = json.loads(cluster_file.read())

# Build nested tree for D3.js
def build_d3_tree(data):
    """Convert the cluster mapping into a nested name/children tree.

    Args:
        data: Mapping of cluster name -> {theme: subgroups}. ``subgroups`` is
            either a mapping of subgroup name -> list of citations, or a flat
            list of citations attached directly to the theme.

    Returns:
        A dict ``{"name": "Root", "children": [...]}`` nested as
        cluster -> theme -> (subgroup ->) "Citation <n>" leaves. A theme whose
        value is neither a dict nor a list gets no children.
    """
    root = {"name": "Root", "children": []}
    for cluster, content in data.items():
        cluster_node = {"name": cluster, "children": []}
        for theme, subgroups in content.items():
            theme_node = {"name": theme, "children": []}
            if isinstance(subgroups, dict):
                for subgroup, citations in subgroups.items():
                    theme_node["children"].append({
                        "name": subgroup,
                        "children": [{"name": f"Citation {cid}"} for cid in citations]
                    })
            elif isinstance(subgroups, list):
                # Flat theme: the treemap cell accepts this shape too, so
                # handle it here instead of failing on `.items()`.
                theme_node["children"] = [
                    {"name": f"Citation {cid}"} for cid in subgroups
                ]
            cluster_node["children"].append(theme_node)
        root["children"].append(cluster_node)
    return root

# Convert the clusters into the nested structure D3 expects.
d3_data = build_d3_tree(data)

# Write it next to the other results, creating the folder if needed.
output_path = Path("results/d3_tree_data.json")
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open("w") as tree_file:
    tree_file.write(json.dumps(d3_data, indent=2))

print("✅ D3.js data saved to:", output_path)


In [None]:
##### Timeline


import matplotlib.pyplot as plt
import json
from pathlib import Path

def draw_basic_timeline(years_json_path, figSize=(15, 10)):
    """Draw a timeline of citations by publication year and save it as a JPEG.

    Args:
        years_json_path: Path to a JSON object mapping citation number to
            year. Entries whose year cannot be converted to ``int`` are skipped.
        figSize: Matplotlib figure size in inches as ``(width, height)``.

    Returns:
        None. The image is written as ``Basic_Citations_Timeline.jpeg`` next
        to ``years_json_path`` and the figure is shown. If no entry has a
        usable year, a message is printed and nothing is drawn.
    """
    with open(years_json_path, 'r', encoding='utf-8') as f:
        years_map = json.load(f)

    citations_with_years = []

    for num_str, year_str in years_map.items():
        try:
            year = int(year_str)
        except (ValueError, TypeError):
            # Non-numeric text or a non-string value such as null: skip it.
            continue
        citations_with_years.append({
            "number": num_str,
            "year": year
        })

    if not citations_with_years:
        # min()/max() below would raise on an empty sequence.
        print("No citations with a valid year; nothing to plot.")
        return None

    citations_with_years.sort(key=lambda x: x['year'])

    # Plot
    fig, ax = plt.subplots(figsize=figSize)
    min_year = min(c['year'] for c in citations_with_years) - 1
    max_year = max(c['year'] for c in citations_with_years) + 1

    ax.hlines(0, min_year, max_year, color='gray', linestyle='-', linewidth=1.5)

    ax.set_xlabel("Year", fontsize=12)
    ax.set_xticks(range(min_year, max_year + 1))
    ax.tick_params(axis='x', rotation=45)
    ax.set_xlim(min_year, max_year)
    ax.yaxis.set_visible(False)
    ax.spines[['left', 'right', 'top']].set_visible(False)
    ax.spines['bottom'].set_linewidth(1.5)

    y_offset_factor = 0.1
    # year -> number of labels already placed for that year
    y_positions = {}

    for citation in citations_with_years:
        year = citation['year']
        label = f"#{citation['number']}"
        current_y_offset = y_positions.get(year, 0)

        # Labels for the same year alternate above and below the axis.
        # NOTE(review): `0.5 // 2` evaluates to 0.0, so it contributes nothing
        # and the first label of an even year sits at y = 0, on the axis.
        # Kept as-is to preserve the layout; confirm the intended formula.
        if year % 2 == 0:
            y_pos = y_offset_factor * ((current_y_offset  + 0.5 // 2) * (1 if current_y_offset % 2 == 0 else -1))
            y_positions[year] = current_y_offset + 1
        else:
            y_pos = y_offset_factor * ((current_y_offset - 0.5 // 2 + 1) * (1 if current_y_offset % 2 == 0 else -1))
            y_positions[year] = current_y_offset + 1

        ax.plot(year, 0, 'o', color='darkblue', markersize=6)
        ax.plot([year, year], [0, y_pos], color='skyblue', linestyle='--', linewidth=0.8)

        ax.annotate(
            label,
            xy=(year, y_pos),
            xytext=(year, y_pos + (0.02 if y_pos > 0 else -0.02)),
            fontsize=9,
            ha='center',
            va='bottom' if y_pos > 0 else 'top',
            bbox=dict(boxstyle="round,pad=0.2", fc="white", ec="gray", alpha=0.8),
            arrowprops=dict(arrowstyle="-", color='gray', linewidth=0.5)
        )

    plt.title("Citation Timeline (Using Only years.json)", fontsize=14, pad=20)
    plt.tight_layout()

    # Save before showing: with an interactive backend the figure may already
    # be closed when plt.show() returns.
    try:
        output_path = Path(years_json_path).with_name("Basic_Citations_Timeline.jpeg")
        fig.savefig(output_path, format="jpeg", bbox_inches="tight", dpi=300)
        print(f"Timeline image saved to {output_path}")
    except Exception as e:
        print(f"Could not save image: {e}")

    plt.show()
    return None

# ---------- Entry point ----------
if __name__ == "__main__":
    years_file = "/home/skumar/Langchain/.venv/results/years.json"
    draw_basic_timeline(years_file)
