# LangGraph + Tree-of-Thought + RAG — Internship Project


In [None]:
# Step 2 — Sample Data & Chunking Strategies (fixed sliding + semantic)
# `sample_text` is the small in-memory corpus used by the rest of the demo:
# it is the input to both chunkers below, and the resulting chunks are what
# the retriever indexes. The blank line inside the literal separates two
# paragraphs.
sample_text = """Artificial Intelligence (AI) is transforming industries through automation and insight.
From recommendation systems that suggest content to autonomous vehicles making decisions, AI is everywhere.
Retrieval-Augmented Generation (RAG) improves a language model's answers by retrieving external context
before generating responses. Chunking breaks large documents into smaller pieces so retrieval can find
relevant passages efficiently.

In this demo we implement two chunkers:
1) Fixed-size sliding window (with overlap)
2) Semantic-style chunking (by sentences/paragraphs)
"""

def fixed_sliding_chunks(text, max_chars=200, overlap=50):
    """Split *text* into fixed-size character windows that overlap.

    Each window is at most ``max_chars`` characters long and begins
    ``max_chars - overlap`` characters after the previous one. Windows are
    whitespace-stripped, and windows that are empty after stripping are
    dropped.

    Args:
        text: The string to split.
        max_chars: Maximum window length in characters; must be positive.
        overlap: Number of characters shared by consecutive windows; must be
            smaller than ``max_chars`` so that the window always advances.

    Returns:
        A list of non-empty chunk strings in document order.

    Raises:
        ValueError: If ``max_chars`` is not positive or ``overlap`` is not
            smaller than ``max_chars``.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")
    step = max_chars - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than max_chars")

    length = len(text)
    chunks = []
    start = 0
    while start < length:
        end = min(length, start + max_chars)
        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)
        # Stop once the window has reached the end of the text. Stepping back
        # by `overlap` from here would keep `start` below `length` forever.
        if end == length:
            break
        start += step
    return chunks

def semantic_chunks(text, max_sent_per_chunk=3):
    """Group the sentences of *text* into chunks of a few sentences each.

    Sentence detection is deliberately naive: newlines are turned into
    spaces and the text is split on every '.' character. Each non-empty
    fragment gets its '.' restored, and fragments are joined with single
    spaces.

    Args:
        text: The string to split.
        max_sent_per_chunk: A chunk is closed as soon as it holds at least
            this many sentences.

    Returns:
        A list of chunk strings in document order; the final chunk may hold
        fewer sentences than the others.
    """
    flattened = text.replace("\n", " ")

    sentences = []
    for fragment in flattened.split('.'):
        fragment = fragment.strip()
        if fragment:
            sentences.append(fragment + '.')

    grouped = []
    bucket = []
    for sentence in sentences:
        bucket.append(sentence)
        if len(bucket) >= max_sent_per_chunk:
            grouped.append(' '.join(bucket).strip())
            bucket = []

    # Flush whatever is left over after the last full group.
    if bucket:
        grouped.append(' '.join(bucket).strip())
    return grouped

# create chunks
# Run both chunkers over the sample text. `fixed` and `semantic` are
# module-level lists that the index-building cell reads later.
fixed = fixed_sliding_chunks(sample_text)
semantic = semantic_chunks(sample_text)

# Print every chunk with a 1-based index; each chunk is truncated to its
# first 200 characters for display.
print('Fixed chunks count:', len(fixed))
for i,c in enumerate(fixed,1):
    print(f'[{i}]', c[:200])
print('\nSemantic chunks count:', len(semantic))
for i,c in enumerate(semantic,1):
    print(f'[{i}]', c[:200])

In [None]:
# Step 3 — Simple Retriever Agent (token-count vectors + cosine similarity)
import math
from collections import Counter

def tokenize(text):
    """Lowercase *text* and return its word tokens in order.

    A token is a maximal run of regex word characters (``\\w``), so
    punctuation and whitespace act as separators while digits and
    underscores stay inside tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase token strings (empty for empty input).
    """
    import re

    lowered = text.lower()
    return re.findall(r"\w+", lowered)

def tf_vector(text, vocabulary=None):
    """Count how often each token occurs in *text*.

    Args:
        text: The string to tokenize and count.
        vocabulary: Optional sequence of terms. When given, the counts are
            projected onto it, in vocabulary order.

    Returns:
        A ``Counter`` mapping token -> count when ``vocabulary`` is None;
        otherwise a list of counts aligned with ``vocabulary`` (0 for terms
        that do not occur in the text).
    """
    counts = Counter(tokenize(text))
    if vocabulary is not None:
        return [counts.get(term, 0) for term in vocabulary]
    return counts

def build_corpus_vectors(documents, max_vocab=500):
    """Build a shared vocabulary and one term-count vector per document.

    The vocabulary is capped at ``max_vocab`` terms to keep vectors small.
    When the corpus has more distinct tokens than the cap, the most frequent
    terms are kept (ties go to the term seen first), rather than whichever
    terms happen to sort first alphabetically. The kept terms are then
    sorted so vector positions are deterministic.

    Args:
        documents: List of dicts, each with an 'id' and a 'text' entry.
        max_vocab: Maximum number of vocabulary terms to keep.

    Returns:
        A ``(vocab, vectors)`` tuple: ``vocab`` is the sorted list of kept
        terms and ``vectors`` maps each document id to its list of counts
        aligned with ``vocab``.
    """
    corpus_counts = Counter()
    for doc in documents:
        corpus_counts.update(tokenize(doc['text']))

    # Choose by corpus frequency first, then sort for a stable column order.
    kept_terms = [term for term, _ in corpus_counts.most_common(max_vocab)]
    vocab = sorted(kept_terms)

    vectors = {
        doc['id']: tf_vector(doc['text'], vocabulary=vocab)
        for doc in documents
    }
    return vocab, vectors

def cosine_sim(vec1, vec2):
    """Return the cosine similarity of two numeric vectors.

    Args:
        vec1: First vector, as a list of numbers.
        vec2: Second vector, as a list of numbers.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        0.0 when either vector has zero norm. The dot product pairs elements
        with ``zip``, so it only covers the common prefix of the two lists.
    """
    dot = sum(x * y for x, y in zip(vec1, vec2))
    norm1 = math.sqrt(sum(x * x for x in vec1))
    norm2 = math.sqrt(sum(y * y for y in vec2))

    # A zero vector has no direction; report no similarity instead of
    # dividing by zero.
    if not (norm1 and norm2):
        return 0.0

    denominator = norm1 * norm2
    return dot / denominator

# Build a small index combining fixed+semantic chunks
# Every chunk becomes a document dict with a unique id ('fixed_<n>' or
# 'sem_<n>', numbered from 0), its text, and a 'meta' dict recording which
# chunker produced it.
documents = []
for i,c in enumerate(fixed):
    documents.append({'id':f'fixed_{i}','text':c,'meta':{'type':'fixed'}})
for i,c in enumerate(semantic):
    documents.append({'id':f'sem_{i}','text':c,'meta':{'type':'semantic'}})

# `vocab` and `vectors` are module-level state that retrieve() reads.
vocab, vectors = build_corpus_vectors(documents)

def retrieve(query, top_k=3):
    """Rank the indexed documents by cosine similarity to *query*.

    Reads the module-level ``documents``, ``vocab`` and ``vectors`` index.

    Args:
        query: The query string.
        top_k: Maximum number of results to return.

    Returns:
        A list of ``(score, document)`` tuples, highest score first. Ties
        keep the order in which the documents were indexed.
    """
    query_vec = tf_vector(query, vocabulary=vocab)
    scored = [
        (cosine_sim(query_vec, vectors[doc['id']]), doc)
        for doc in documents
    ]
    # Sort on the score alone; the document dicts are never compared.
    ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)
    return ranked[:top_k]

# quick test
# Smoke-test the retriever: fetch the 4 best-matching chunks for one query
# and print each hit's score (rounded to 3 decimals), id, chunker type, and
# the first 200 characters of its text.
print('Retrieve test for query: "What is Retrieval-Augmented Generation?"')
res = retrieve('What is Retrieval-Augmented Generation?', top_k=4)
for score, d in res:
    print('score', round(score,3), d['id'], d['meta']['type'], d['text'][:200])

In [None]:
# Step 4 — Query Planner Agent (decides vector vs web)
def plan_query(query):
    """Decide how a query should be answered and split it into sub-queries.

    Args:
        query: The raw user query string.

    Returns:
        A dict with two entries:
            'subqueries': the query split on ';' and newlines, with
                surrounding whitespace stripped and empty parts dropped.
            'use_web': True when the lowercased query contains any of the
                freshness trigger strings, otherwise False.
    """
    # Imported locally so the function does not depend on a module-level
    # `import re` having already run before it is called.
    import re

    # Triggers are matched as plain substrings of the lowercased query, so
    # e.g. 'when' also matches inside 'whenever'.
    web_triggers = ['latest', 'today', '2025', '2024', 'news', 'price', 'score', 'who is', 'when']
    ql = query.lower()
    use_web = any(tok in ql for tok in web_triggers)

    # Naive decomposition for the demo: split on ';' or a newline only.
    # The word 'and' is not a separator.
    subqueries = [s.strip() for s in re.split(r'[;\n]', query) if s.strip()]
    return {'subqueries': subqueries, 'use_web': use_web}

# Demo:
# `re` is imported at module level here, before the demo calls below run.
import re
# Print the plan dict ('subqueries' and 'use_web') for two sample queries.
print('Planner demo:')
print(plan_query('Benchmark the in-house DL framework vs top 3 market leaders'))
print(plan_query('What is Retrieval-Augmented Generation?'))


In [None]:
# Step 5 — Synthesizer + Writer Agent
def synthesize_evidence(retrieved):
    """Flatten retrieval hits into a list of evidence records.

    Args:
        retrieved: List of ``(score, doc)`` tuples, where each doc is a dict
            with 'id', 'text' and 'meta' entries.

    Returns:
        A list of dicts with 'id', 'score', 'text' and 'meta' entries, one
        per hit and in the same order as the input.
    """
    return [
        {'id': doc['id'], 'score': score, 'text': doc['text'], 'meta': doc['meta']}
        for score, doc in retrieved
    ]

def compute_confidence(pieces):
    """Return a confidence value in [0, 1] for a set of evidence records.

    Args:
        pieces: List of dicts that each carry a numeric 'score' entry.

    Returns:
        The mean score clamped to the range 0.0-1.0, or 0.0 when there are
        no pieces.
    """
    if pieces:
        total = sum(piece['score'] for piece in pieces)
        mean_score = total / len(pieces)
        # Clamp from above first, then from below.
        capped = min(1.0, mean_score)
        return max(0.0, capped)
    return 0.0

def writer_generate(query, retrieved):
    """Assemble an answer object from retrieval hits.

    Args:
        query: The user query. It is accepted for interface symmetry but is
            not used when building the answer.
        retrieved: List of ``(score, doc)`` tuples from the retriever.

    Returns:
        A dict with:
            'answer': the texts of the first two evidence pieces joined by
                a space,
            'citations': the ids of all evidence pieces,
            'confidence': the evidence confidence rounded to 3 decimals,
            'pieces': the full list of evidence records.
    """
    evidence = synthesize_evidence(retrieved)
    confidence = compute_confidence(evidence)
    leading_texts = [item['text'] for item in evidence[:2]]
    return {
        'answer': ' '.join(leading_texts),
        'citations': [item['id'] for item in evidence],
        'confidence': round(confidence, 3),
        'pieces': evidence,
    }

# Demo writer:
# Retrieve the top 3 chunks for one query and turn them into an answer
# object. `out` is kept at module level and is passed to the reviewer demo
# in a later cell.
q = 'What is Retrieval-Augmented Generation?'
res = retrieve(q, top_k=3)
out = writer_generate(q, res)
# Show at most the first 300 characters of the answer.
print('Answer:', out['answer'][:300])
print('Citations:', out['citations'], 'Confidence:', out['confidence'])


In [None]:
# Step 6 — Tree of Thought (simulated): generate multiple reasoning paths and score them
def propose_expansions(query, n=3):
    """Produce mock variations of *query* to act as candidate claims.

    Only three variations exist: the query itself, the query followed by
    ' definition', and the query followed by ' explanation'.

    Args:
        query: The base query string.
        n: Slice bound applied to the variation list; values above 3 still
            yield three items.

    Returns:
        A list holding the first ``n`` variations.
    """
    suffixes = ('', ' definition', ' explanation')
    variants = [query + suffix for suffix in suffixes]
    return variants[:n]

def evaluate_path(path_claim, retriever_func):
    """Score a candidate claim by how well the retriever can support it.

    Args:
        path_claim: The claim text to look up.
        retriever_func: Callable invoked as
            ``retriever_func(path_claim, top_k=3)``; it should return a
            sequence of ``(score, doc)`` tuples, best first.

    Returns:
        A ``(score, results)`` tuple where ``score`` is the score of the
        first result (0.0 when there are no results) and ``results`` is the
        retriever output, unchanged.
    """
    hits = retriever_func(path_claim, top_k=3)
    top_score = hits[0][0] if hits else 0.0
    return top_score, hits

def tree_of_thought(query, beam=3, depth=2):
    """Run a small beam search over mock reasoning paths for *query*.

    At each level every surviving node is extended with each candidate
    claim from ``propose_expansions``. A node's score is the sum of the
    retrieval scores of its claims, and only the best ``beam`` nodes
    survive to the next level.

    Args:
        query: The user query to reason about.
        beam: Number of expansions to propose and number of nodes kept per
            level.
        depth: Number of expansion levels.

    Returns:
        The best node: a dict with 'claims' (list of claim strings along
        the path), 'score' (accumulated path score) and 'retrieved' (the
        retrieval results for the last claim). When nothing can be expanded
        (``depth <= 0`` or no expansions are proposed), the empty root node
        is returned.
    """
    root = {'claims': [], 'score': 0.0, 'retrieved': []}
    if depth <= 0:
        return root

    # The expansions and their retrieval scores depend only on `query`, so
    # evaluate each one once instead of once per node per level.
    scored_expansions = []
    for claim in propose_expansions(query, n=beam):
        score, retrieved = evaluate_path(claim, retrieve)
        scored_expansions.append((claim, score, retrieved))
    if not scored_expansions:
        return root

    nodes = [root]
    for _ in range(depth):
        candidates = []
        for node in nodes:
            for claim, score, retrieved in scored_expansions:
                candidates.append({
                    'claims': node['claims'] + [claim],
                    'score': node['score'] + score,
                    # Copy so that nodes never share one mutable result list.
                    'retrieved': list(retrieved),
                })
        candidates.sort(key=lambda cand: cand['score'], reverse=True)
        survivors = candidates[:beam]
        # An empty beam would make `nodes[0]` fail below; keep the best
        # nodes found so far instead.
        if not survivors:
            break
        nodes = survivors
    return nodes[0]

# Demo ToT:
# Run the search with 3 expansions per node over 2 levels, then print the
# winning path: its accumulated score, its claims, and the retrieval hits
# stored on the best node (scores rounded to 3 decimals, text cut to 200
# characters).
best = tree_of_thought('What is Retrieval-Augmented Generation?', beam=3, depth=2)
print('Best path score:', best['score'])
print('Path claims:', best['claims'])
print('Top retrieved for final claim:')
for s,d in best['retrieved']:
    print(round(s,3), d['id'], d['text'][:200])


In [None]:
# Step 7 — Reviewer Agent & RAGAS-like metrics (simple)
def reviewer_assess(answer_obj, gold_text=None):
    """Compute simple word-overlap quality metrics for an answer object.

    Args:
        answer_obj: Dict as produced by ``writer_generate``; its 'answer'
            and 'pieces' entries are read.
        gold_text: Optional reference answer used to compute recall.

    Returns:
        A dict with:
            'faithfulness': fraction of distinct answer tokens that also
                occur in the supporting pieces, rounded to 3 decimals,
            'recall': fraction of distinct gold tokens that occur in the
                supporting pieces, rounded to 3 decimals, or None when no
                ``gold_text`` is given.
    """
    answer_vocab = set(tokenize(answer_obj['answer']))

    evidence_vocab = set()
    for piece in answer_obj['pieces']:
        evidence_vocab.update(tokenize(piece['text']))

    # The small epsilon keeps the division defined for an empty answer.
    shared = answer_vocab & evidence_vocab
    faithfulness = len(shared) / (len(answer_vocab) + 1e-6)

    if gold_text:
        gold_vocab = set(tokenize(gold_text))
        covered = evidence_vocab & gold_vocab
        recall = round(len(covered) / (len(gold_vocab) + 1e-6), 3)
    else:
        recall = None

    return {'faithfulness': round(faithfulness, 3), 'recall': recall}

# Demo reviewer:
# Score the module-level answer object `out` against a hand-written
# reference sentence and print the resulting metrics dict.
gold = 'RAG is a technique that retrieves context to improve generation quality.'
print(reviewer_assess(out, gold))


In [None]:
# Step 8 — Example Queries & Final Checklist
# Sample queries pushed through the whole pipeline below.
queries = [
    'What is Retrieval-Augmented Generation?',
    'How does chunking affect retrieval?',
    'Benchmark the in-house DL framework against top 3 leaders'
]

# For each query: plan -> retrieve -> write -> review, printing every stage.
# Note that this loop rebinds the module-level names `q`, `res` and `out`.
for q in queries:
    print('\n=== QUERY:', q)
    plan = plan_query(q)
    print('Plan:', plan)
    if plan['use_web']:
        # No web retriever is called here; the message is printed and the
        # vector retriever below runs regardless of the plan.
        print('Planner decided to use web retrieval (mock) — using vector retrieval as fallback.')
    res = retrieve(q, top_k=4)
    out = writer_generate(q, res)
    print('Answer (snippet):', out['answer'][:200])
    print('Citations:', out['citations'], 'Confidence:', out['confidence'])
    # No gold text is passed, so the reviewer reports 'recall' as None.
    print('Reviewer:', reviewer_assess(out))

# Static reminder text printed at the end of the run.
print('\n--- Submission checklist ---')
print('1) Notebook named Anwesha_Guha_Internship_25.ipynb')
print('2) Contains chunking, retrieval, ToT demo, and reviewer')
print('3) Ready to upload to Colab and run on CPU-only runtime')
