In [None]:
# Core libraries for data handling and RAG (pandas, FAISS, LangChain packages).
# NOTE(review): versions are unpinned, so a fresh run may resolve different releases
# than the ones shown in the install log; consider pinning once a working set is confirmed.
%pip install pandas faiss-cpu langchain langchain-community langchain-huggingface

# Libraries for the local LLM (Hugging Face).
# NOTE(review): accelerate and bitsandbytes are not imported directly by the visible
# cells — confirm they are still required.
%pip install transformers accelerate bitsandbytes

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.1-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-1.2.0-py3-none-any.whl.metadata (2.8 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.0-py3-none-any.whl.metadata (3.9 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langchain-core<2.0.0,>=1.1.2 (from langchain)
  Downloading langchain_core-1.2.2-py3-none-any.whl.metadata (3.7 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7.0,>=0.6.7->langchain-community)
  Downloading marshma

In [None]:
# Upload the behavior-log CSV through Colab's file-upload helper.
# The data-loading cell reads "student_behavior_log.csv" by relative path, so the
# file has to be uploaded under exactly that name before that cell is run.
from google.colab import files

# `uploaded` keeps the return value of the upload call; none of the visible cells read it.
uploaded = files.upload()

Saving student_behavior_log.csv to student_behavior_log.csv


In [None]:
# Convert the behavior log into LangChain documents (one document per CSV row).
# The candidate can fold other columns (e.g. Subject) into the combined text the same way.

import pandas as pd
from langchain_community.document_loaders import DataFrameLoader

# 1. Load the dummy data (the CSV must already be in the working directory).
df = pd.read_csv("student_behavior_log.csv")

# 2. Pre-processing: build one text field that carries the full context for a row.
# Every column is coerced to str before concatenation: pandas may parse an ID or
# class column as numeric, and `str + numeric Series` raises TypeError.
# Missing notes become an empty string; otherwise a single NaN would turn the whole
# concatenated value into NaN and yield a non-string page_content.
# NOTE(review): the label says "Student Name" but the value comes from Student_ID —
# confirm which label is intended.
df['combined_text'] = (
    "Student Name: " + df['Student_ID'].astype(str) +
    "; Class: " + df['Class'].astype(str) +
    "; Notes: " + df['Teacher_Notes'].fillna("").astype(str)
)

# 3. Convert to LangChain documents, using the combined text as the page content.
loader = DataFrameLoader(df, page_content_column="combined_text")
documents = loader.load()

print(f"Loaded {len(documents)} documents. Here is one example:")
print(documents[0].page_content)



FileNotFoundError: [Errno 2] No such file or directory: 'student_behavior_log.csv'

In [None]:
# Embedding and indexing: encode each document with a small, fast sentence-transformer
# (all-MiniLM-L6-v2) and keep the vectors in a FAISS store.
# Knowledge check for the candidate: no chunking step is needed in this use case —
# the data is tabular, the notes column holds short text, and each row acts as a chunk.

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# 1. Embedding model (light enough to run on CPU).
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# 2. Vector store: this call indexes the documents.
# Interview prompt: "Why does this step take time?" (answer: text is converted to numbers)
vector_db = FAISS.from_documents(
    documents,
    embedding_model,
)

print("Vector Database created successfully!")

Vector Database created successfully!


In [None]:
# The LLM & RAG chain (the "generation" engine).
# For the interview we use google/flan-t5-large, chosen by the notebook author as a
# model that runs on CPU and is capable enough for this task without a GPU.

from langchain_huggingface import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# RetrievalQA is a legacy chain and is not exported by `langchain_community.chains`.
# On LangChain 1.x the legacy chains live in the separate `langchain-classic` package;
# on earlier releases they are importable from `langchain.chains`.
try:
    from langchain_classic.chains import RetrievalQA
except ImportError:
    from langchain.chains import RetrievalQA

# 1. Load a small, local LLM (Flan-T5) together with its tokenizer.
model_id = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Wrap model + tokenizer in a text2text pipeline and expose it to LangChain.
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
local_llm = HuggingFacePipeline(pipeline=pipe)

# 2. Build the retrieval chain on top of the vector store created earlier.
rag_chain = RetrievalQA.from_chain_type(
    llm=local_llm,
    chain_type="stuff",  # "stuff" = put all retrieved docs into a single prompt
    retriever=vector_db.as_retriever(search_kwargs={"k": 3}),  # retrieve the top 3 matches
    return_source_documents=True,  # lets us see WHICH docs were used for an answer
)

print("RAG Chain is ready to answer questions!")

ModuleNotFoundError: No module named 'langchain_huggingface'