In [26]:

!pip uninstall -y langchain langchain-core langchain-community langchain-experimental langchain-openai langchainhub langchain-classic langchain-text-splitters langgraph langgraph-prebuilt langgraph-checkpoint langgraph-sdk langsmith

!pip install -U "langchain==0.1.16" "langchain-core==0.1.52" "langchain-community==0.0.34" "langchain-openai==0.0.8"


Found existing installation: langchain 0.1.16
Uninstalling langchain-0.1.16:
  Successfully uninstalled langchain-0.1.16
Found existing installation: langchain-core 0.1.52
Uninstalling langchain-core-0.1.52:
  Successfully uninstalled langchain-core-0.1.52
Found existing installation: langchain-community 0.0.34
Uninstalling langchain-community-0.0.34:
  Successfully uninstalled langchain-community-0.0.34
[0mFound existing installation: langchain-openai 0.0.8
Uninstalling langchain-openai-0.0.8:
  Successfully uninstalled langchain-openai-0.0.8
Found existing installation: langchainhub 0.1.21
Uninstalling langchainhub-0.1.21:
  Successfully uninstalled langchainhub-0.1.21
Found existing installation: langchain-classic 1.0.0
Uninstalling langchain-classic-1.0.0:
  Successfully uninstalled langchain-classic-1.0.0
Found existing installation: langchain-text-splitters 0.0.2
Uninstalling langchain-text-splitters-0.0.2:
  Successfully uninstalled langchain-text-splitters-0.0.2
Found existing

In [36]:

# Load environment variables (API keys)

import os
from dotenv import load_dotenv

# Load keys from the .env 
load_dotenv()

# Check if the keys were read correctly (should print partially masked keys)
print(" OpenAI key found:", os.getenv("OPENAI_API_KEY")[:10] + "...")
print(" Hugging Face token found:", os.getenv("HUGGINGFACEHUB_API_TOKEN")[:10] + "...")


 OpenAI key found: sk-proj-Pj...
 Hugging Face token found: hf_eDmWSIP...


In [37]:

# Step 3: Initialize LLMs (GPT-4 from OpenAI and LLaMA-3 from Hugging Face)

from langchain_openai import ChatOpenAI
from langchain_community.llms import HuggingFaceHub

# GPT-4  →  primary model for text-to-SQL conversion
gpt4 = ChatOpenAI(
    model="gpt-4-turbo",     
    temperature=0.2
)

# LLaMA-3  →  backup model for open-source processing or offline RAG use
llama3 = HuggingFaceHub(
    repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
    model_kwargs={"temperature": 0.3, "max_new_tokens": 256}
)

print(" GPT-4 model initialized")
print(" LLaMA-3 model initialized")


 GPT-4 model initialized
 LLaMA-3 model initialized


In [38]:
#  IMDb Database Connection 

from langchain_community.utilities import SQLDatabase
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnableMap, RunnableSequence

#  Connect IMDb SQLite database
db_path = "data/imdb.db"
db = SQLDatabase.from_uri(f"sqlite:///{db_path}")
print("IMDb database connected successfully!")
print("Database path:", db_path)
print("Tables available:", db.get_usable_table_names())

# Initialize GPT-4o model ---
llm = ChatOpenAI(model="gpt-4o", temperature=0)

#  Prompt template for SQL generation
prompt = ChatPromptTemplate.from_template("""
You are an expert SQL data analyst.
Given the following question, generate a valid SQL query for the provided database schema.

Database schema:
{schema}

Question: {question}

SQL Query:
""")

#  Wrap prompt inside a RunnableLambda manually 
prompt_runnable = RunnableLambda(
    lambda x: prompt.format(schema=x["schema"], question=x["question"])
)

#  Build LCEL pipeline
sql_chain = RunnableSequence(
    RunnableMap({
        "schema": RunnableLambda(lambda _: db.get_table_info()),
        "question": RunnableLambda(lambda x: x["question"])
    }),
    prompt_runnable,
    llm,
    StrOutputParser()
)

print("IMDb database connected and GPT-4o text-to-SQL engine ready!")


IMDb database connected successfully!
Database path: data/imdb.db
Tables available: ['name_basics', 'title_basics', 'title_principals', 'title_ratings']
IMDb database connected and GPT-4o text-to-SQL engine ready!


In [39]:
query = "List top 5 movies released after 2015 with their IMDb ratings."
sql = sql_chain.invoke({"question": query})
print(" Generated SQL:\n", sql)


 Generated SQL:
 ```sql
SELECT tb.primaryTitle, tb.startYear, tr.averageRating
FROM title_basics tb
JOIN title_ratings tr ON tb.tconst = tr.tconst
WHERE tb.startYear > 2015 AND tb.titleType = 'movie'
ORDER BY tr.averageRating DESC
LIMIT 5;
```


In [40]:
from sqlalchemy import text

# Execute SQL query on the IMDb database
result = db.run(sql.replace("```sql", "").replace("```", "").strip())
print("Query Results:\n", result)


Query Results:
 [('Silenced as mercury rises', 2021.0, 10.0), ('Ami Soumitra', 2022.0, 10.0), ('The Art Whisperer', 2023.0, 10.0), ('Shwaas', 2025.0, 10.0), ('Damas', 2025.0, 10.0)]


In [12]:
!pip uninstall -y langchain langchain-core langchain-community langchain-openai langchain-huggingface langchain-text-splitters langchain-experimental langchain-classic langsmith


Found existing installation: langchain 0.2.14
Uninstalling langchain-0.2.14:
  Successfully uninstalled langchain-0.2.14
Found existing installation: langchain-core 0.2.35
Uninstalling langchain-core-0.2.35:
  Successfully uninstalled langchain-core-0.2.35
Found existing installation: langchain-community 0.2.12
Uninstalling langchain-community-0.2.12:
  Successfully uninstalled langchain-community-0.2.12
Found existing installation: langchain-openai 0.1.15
Uninstalling langchain-openai-0.1.15:
  Successfully uninstalled langchain-openai-0.1.15
Found existing installation: langchain-huggingface 0.0.3
Uninstalling langchain-huggingface-0.0.3:
  Successfully uninstalled langchain-huggingface-0.0.3
Found existing installation: langchain-text-splitters 0.2.2
Uninstalling langchain-text-splitters-0.2.2:
  Successfully uninstalled langchain-text-splitters-0.2.2
[0mFound existing installation: langchain-classic 1.0.0
Uninstalling langchain-classic-1.0.0:
  Successfully uninstalled langchain-c

In [13]:
!pip install -U "langchain==0.2.14" "langchain-core==0.2.35" "langchain-community==0.2.12" "langchain-openai==0.1.15" "langchain-huggingface==0.0.3" "langchain-text-splitters==0.2.2" sentence-transformers faiss-cpu python-dotenv



Collecting langchain==0.2.14
  Using cached langchain-0.2.14-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core==0.2.35
  Using cached langchain_core-0.2.35-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-community==0.2.12
  Using cached langchain_community-0.2.12-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain-openai==0.1.15
  Using cached langchain_openai-0.1.15-py3-none-any.whl.metadata (2.5 kB)
Collecting langchain-huggingface==0.0.3
  Using cached langchain_huggingface-0.0.3-py3-none-any.whl.metadata (1.2 kB)
Collecting langchain-text-splitters==0.2.2
  Using cached langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain==0.2.14)
  Using cached langsmith-0.1.147-py3-none-any.whl.metadata (14 kB)
Using cached langchain-0.2.14-py3-none-any.whl (997 kB)
Using cached langchain_core-0.2.35-py3-none-any.whl (394 kB)
Using cached langchain_community-0.2.12-py3-none-any.whl (2.3 MB)
Using cached langch

In [41]:

# RAG Retriever for IMDb DB


# Imports
import os
import json
import pickle
import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text

import faiss
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()  # assumes you already have OPENAI_API_KEY in .env

# Paths / config
DB_PATH = "data/imdb.db"                
INDEX_DIR = "rag_imdb"
os.makedirs(INDEX_DIR, exist_ok=True)

INDEX_BIN = os.path.join(INDEX_DIR, "faiss_index.bin")
META_PKL  = os.path.join(INDEX_DIR, "meta.pkl")
TEXTS_JSONL = os.path.join(INDEX_DIR, "texts.jsonl")

EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2" 
TOP_K = 8  # retrieve this many passages

# Load data from SQLite 
engine = create_engine(f"sqlite:///{DB_PATH}")

sql = """
SELECT tb.tconst, tb.primaryTitle, tb.startYear, tb.genres, tr.averageRating
FROM title_basics tb
LEFT JOIN title_ratings tr ON tb.tconst = tr.tconst
WHERE tb.titleType = 'movie'
  AND tb.startYear >= 2015          -- only recent movies
  AND tr.averageRating >= 7.0       -- better quality
LIMIT 100000;
"""
df = pd.read_sql(text(sql), engine)
print(f"Loaded {len(df):,} rows from IMDb for RAG")

# Build small text passages (documents)
def row_to_text(r):
    parts = [str(r["primaryTitle"]).strip()]
    if pd.notna(r.get("startYear")): parts.append(f"({int(r['startYear'])})")
    if pd.notna(r.get("genres")):    parts.append(f"Genres: {r['genres']}")
    if pd.notna(r.get("averageRating")): parts.append(f"Rating: {r['averageRating']}")
    return " ".join(parts)

docs = [
    {
        "id": r["tconst"],
        "text": row_to_text(r),
        "meta": {
            "title": r["primaryTitle"],
            "year": None if pd.isna(r["startYear"]) else int(r["startYear"]),
            "genres": None if pd.isna(r["genres"]) else r["genres"],
            "rating": None if pd.isna(r["averageRating"]) else float(r["averageRating"]),
        },
    }
    for _, r in df.iterrows()
]

# Embed with sentence-transformers
model = SentenceTransformer(EMBED_MODEL)
emb = model.encode([d["text"] for d in docs], batch_size=256, show_progress_bar=True, normalize_embeddings=True)
emb = np.asarray(emb).astype("float32")
dim = emb.shape[1]
print("Embedding shape:", emb.shape)

# Build a FAISS index (cosine similarity via normalized vectors + inner product)
index = faiss.IndexFlatIP(dim)
index.add(emb)
faiss.write_index(index, INDEX_BIN)

# Persist metadata + texts to disk
with open(META_PKL, "wb") as f:
    pickle.dump(
        {
            "embed_model": EMBED_MODEL,
            "dim": dim,
            "ids": [d["id"] for d in docs],
            "meta": [d["meta"] for d in docs],
        },
        f,
    )

with open(TEXTS_JSONL, "w", encoding="utf-8") as f:
    for d in docs:
        f.write(json.dumps({"id": d["id"], "text": d["text"], "meta": d["meta"]}, ensure_ascii=False) + "\n")

print(f"Saved index ➜ {INDEX_BIN}")
print(f"Saved meta  ➜ {META_PKL}")
print(f"Saved texts ➜ {TEXTS_JSONL}")

# Simple retriever
def load_retriever(index_bin=INDEX_BIN, meta_pkl=META_PKL, texts_jsonl=TEXTS_JSONL):
    idx = faiss.read_index(index_bin)
    with open(meta_pkl, "rb") as f:
        meta = pickle.load(f)
    # lazy-load model used for embeddings
    sbert = SentenceTransformer(meta["embed_model"])
    # load texts
    texts = []
    with open(texts_jsonl, "r", encoding="utf-8") as f:
        for line in f:
            texts.append(json.loads(line))
    id2text = {t["id"]: t for t in texts}
    return idx, sbert, meta, id2text

index, sbert, meta, id2text = load_retriever()

def retrieve(query: str, k: int = TOP_K):
    q = sbert.encode([query], normalize_embeddings=True)
    D, I = index.search(np.asarray(q).astype("float32"), k)
    results = []
    ids = meta["ids"]
    for rank, (score, idx_i) in enumerate(zip(D[0], I[0]), start=1):
        mid = ids[idx_i]
        entry = id2text[mid]
        results.append({"rank": rank, "score": float(score), "id": mid, **entry})
    return results

#  Ask OpenAI with retrieved context
client = OpenAI()  # uses OPENAI_API_KEY from env

def answer_with_rag(question: str, k: int = TOP_K, model_name: str = "gpt-4o-mini"):
    ctx = retrieve(question, k=k)
    context_block = "\n".join([f"- {c['text']}" for c in ctx])

    system = (
        "You are a helpful movie assistant. Use ONLY the provided CONTEXT to answer the user.\n"
        "If the context does not contain the answer, say you don't know."
    )
    user = f"QUESTION: {question}\n\nCONTEXT:\n{context_block}\n\nAnswer clearly with titles and years."

    resp = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        temperature=0.2,
    )
    return ctx, resp.choices[0].message.content

# Demo query
q = "Recommend 5 high-rated sci-fi movies after 2018 about time travel or alternate timelines."
ctx, ans = answer_with_rag(q)

print("Top retrieved passages:\n")
for c in ctx:
    print(f"{c['rank']:>2}. {c['text']}  (score={c['score']:.3f})")
print("\nModel answer:\n")
print(ans)



Loaded 77,826 rows from IMDb for RAG


Batches:   0%|          | 0/305 [00:00<?, ?it/s]

Embedding shape: (77826, 384)
Saved index ➜ rag_imdb/faiss_index.bin
Saved meta  ➜ rag_imdb/meta.pkl
Saved texts ➜ rag_imdb/texts.jsonl
Top retrieved passages:

 1. Time Rewind (2025) Genres: Adventure,Family,Sci-Fi Rating: 7.3  (score=0.702)
 2. Time Rewind (2025) Genres: Adventure,Family,Sci-Fi Rating: 7.3  (score=0.702)
 3. How to Time Travel (2016) Genres: Comedy,Drama,Sci-Fi Rating: 7.1  (score=0.690)
 4. How to Time Travel (2016) Genres: Comedy,Drama,Sci-Fi Rating: 7.1  (score=0.690)
 5. Time's Paradox (2023) Genres: History,Sci-Fi Rating: 7.8  (score=0.658)
 6. Time's Paradox (2023) Genres: History,Sci-Fi Rating: 7.8  (score=0.658)
 7. Back to 2005 (2024) Genres: Comedy,Sci-Fi Rating: 8.3  (score=0.646)
 8. Back to 2005 (2024) Genres: Comedy,Sci-Fi Rating: 8.3  (score=0.646)

Model answer:

1. Back to 2005 (2024) - Rating: 8.3
2. Time's Paradox (2023) - Rating: 7.8
3. Time Rewind (2025) - Rating: 7.3
4. Time Rewind (2025) - Rating: 7.3
5. How to Time Travel (2016) - Rating: 7.1


In [28]:
q = "List top 5 action movies released after 2019 with IMDb rating above 7.5."
ctx, ans = answer_with_rag(q)

print("Top retrieved passages:\n")
for c in ctx:
    print(f"{c['rank']:>2}. {c['text']}  (score={c['score']:.3f})")

print("\nModel answer:\n")
print(ans)



Top retrieved passages:

 1. G the Film (2020) Genres: Action Rating: 7.6  (score=0.671)
 2. G the Film (2020) Genres: Action Rating: 7.6  (score=0.671)
 3. Showed Up (2019) Genres: Action Rating: 7.7  (score=0.665)
 4. Showed Up (2019) Genres: Action Rating: 7.7  (score=0.665)
 5. The Answer (2018) Genres: Action Rating: 7.0  (score=0.663)
 6. The Answer (2018) Genres: Action Rating: 7.0  (score=0.663)
 7. For Clues (2019) Genres: Action Rating: 7.4  (score=0.640)
 8. For Clues (2019) Genres: Action Rating: 7.4  (score=0.640)

Model answer:

1. G the Film (2020) - Rating: 7.6


In [29]:
q = "Recommend a few romantic comedies after 2020 with decent IMDb ratings."
ctx, ans = answer_with_rag(q)

print("Top retrieved passages:\n")
for c in ctx:
    print(f"{c['rank']:>2}. {c['text']}  (score={c['score']:.3f})")

print("\nModel answer:\n")
print(ans)


Top retrieved passages:

 1. Comedy Couple (2020) Genres: Comedy,Romance Rating: 7.3  (score=0.728)
 2. Comedy Couple (2020) Genres: Comedy,Romance Rating: 7.3  (score=0.728)
 3. Love Happens (2020) Genres: Comedy,Horror,Romance Rating: 9.0  (score=0.700)
 4. Love Happens (2020) Genres: Comedy,Horror,Romance Rating: 9.0  (score=0.700)
 5. Lovers (2020) Genres: Action,Comedy,Drama Rating: 7.8  (score=0.690)
 6. Lovers (2020) Genres: Action,Comedy,Drama Rating: 7.8  (score=0.690)
 7. Untitled Project (2021) Genres: Comedy,Drama,Romance Rating: 7.0  (score=0.683)
 8. Untitled Project (2021) Genres: Comedy,Drama,Romance Rating: 7.0  (score=0.683)

Model answer:

Here are a few romantic comedies after 2020 with decent IMDb ratings:

1. Comedy Couple (2020) - Rating: 7.3
2. Love Happens (2020) - Rating: 9.0
3. Lovers (2020) - Rating: 7.8
4. Untitled Project (2021) - Rating: 7.0


In [30]:
q = "List a few underrated drama movies released after 2020 with IMDb ratings between 6.5 and 7.5."
ctx, ans = answer_with_rag(q)

print("Top retrieved passages:\n")
for c in ctx:
    print(f"{c['rank']:>2}. {c['text']}  (score={c['score']:.3f})")

print("\nModel answer:\n")
print(ans)


Top retrieved passages:

 1. Unsuspected (2020) Genres: Action,Comedy,Drama Rating: 7.2  (score=0.722)
 2. Unsuspected (2020) Genres: Action,Comedy,Drama Rating: 7.2  (score=0.722)
 3. Out of Order (2025) Genres: Drama Rating: 7.5  (score=0.712)
 4. Out of Order (2025) Genres: Drama Rating: 7.5  (score=0.712)
 5. The Ones from Below (2022) Genres: Drama Rating: 7.0  (score=0.700)
 6. The Ones from Below (2022) Genres: Drama Rating: 7.0  (score=0.700)
 7. Over/Under (2022) Genres: Drama Rating: 7.7  (score=0.700)
 8. Over/Under (2022) Genres: Drama Rating: 7.7  (score=0.700)

Model answer:

1. Unsuspected (2020) - Rating: 7.2
2. Out of Order (2025) - Rating: 7.5
3. The Ones from Below (2022) - Rating: 7.0
