In [None]:
%pip install -q langchain langchain-community chromadb sentence-transformers openai tiktoken python-dotenv langchain_openai

In [None]:
pip install pypdf

In [None]:
import os
from getpass import getpass

from openai import OpenAI

# Never store the key in the notebook: notebooks get committed and shared.
# Read it from the environment, and fall back to a hidden interactive prompt.
api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# The project ID can be overridden through the environment; the previous value
# is kept as the default so existing setups keep working.
project_id = os.environ.get("OPENAI_PROJECT_ID", "proj_fHRnVJY0Oyfm1ufG1sffxa6W")

client = OpenAI(api_key=api_key, project=project_id)

In [None]:
import os
from getpass import getpass

# Do not hardcode the key here. If OPENAI_API_KEY is already exported in the
# environment it is used as-is; otherwise ask for it with a hidden prompt.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
import os
import pandas as pd
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# Where the unified Chroma DB will live
#persist_dir = "unified_chroma_db"
#os.makedirs(persist_dir, exist_ok=True)

#embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# --- 1. Load PDFs ---
from pathlib import Path

# Build the paths with pathlib instead of "..\Data\..." literals: the backslash
# sequences (\D, \S, \A) are invalid string escapes and the literals only resolve
# on Windows. On Windows str(Path(...)) yields the same strings as before.
data_dir = Path("..") / "Data"
pdf_folder = [
    str(data_dir / "SQL slides" / "Advanced SQL I copy.pdf"),
    str(data_dir / "SQL slides" / "Advanced SQL II copy.pptx.pdf"),
    str(data_dir / "Assignment_prompt.pdf"),
    str(data_dir / "Syllabus.pdf"),
]

# Maps each PDF path to the list of page documents loaded from it.
pdf_docs = {}

for file in pdf_folder:
    loader = PyPDFLoader(file)
    pages = loader.load_and_split()

    # Tag every page with the file it came from. This and the store below must
    # happen inside the loop; previously they ran once after it, so only the
    # last PDF was tagged and kept.
    for p in pages:
        p.metadata["source_name"] = file

    pdf_docs[file] = pages

# Flatten dictionary into a list for embedding
flat_docs = [doc for pages in pdf_docs.values() for doc in pages]


In [None]:
# Show the pages loaded from one PDF. flat_docs is a plain list and cannot be
# indexed by a path string, so look the pages up through pdf_docs (path -> pages).
# Matching on the file name keeps this independent of the path separator.
[
    page
    for source_path, source_pages in pdf_docs.items()
    if source_path.endswith("Advanced SQL I copy.pdf")
    for page in source_pages
]

In [None]:
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# No key is passed explicitly; presumably it is picked up from the
# OPENAI_API_KEY environment variable set in an earlier cell.
embeddings = OpenAIEmbeddings()

# Embed the flattened PDF pages and store them in an on-disk Chroma database.
solutions_db = Chroma.from_documents(
    persist_directory="unified_chroma_db",
    embedding=embeddings,
    documents=flat_docs,
)
solutions_db.persist()


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# --- 2. Load CSVs ---
csv_docs = []
csv_folder = ["..\Data\Student_rubric_feedback.csv", "..\Data\General_Rubric.csv"]
for file in csv_folder:
    df = pd.read_csv(file)
    df.columns = df.columns.str.strip()  # clean headers

    if file == "..\Data\Student_rubric_feedback.csv":
      expected = ["Student", "Criteria", "Ratings", "Points", "Points Awarded", "Final Grade", "Feedback"]
      missing = [c for c in expected if c not in df.columns]
      if missing:
          raise ValueError(f"Missing expected columns after normalization: {missing}")

      # --- Clean text columns ---
      for col in ["Student", "Criteria", "Ratings", "Feedback"]:
          df[col] = df[col].astype(str).str.strip()

        # --- Convert numeric columns ---
      df["Points"] = pd.to_numeric(df["Points"], errors="coerce").fillna(25)
      df["Points Awarded"] = pd.to_numeric(df["Points Awarded"], errors="coerce").fillna(0)
      df["Final Grade"] = pd.to_numeric(df["Final Grade"], errors="coerce")

      # --- Add unique ID for each row ---
      df["id"] = [f"row-{i}" for i in range(len(df))]

    #Inspect DataFrame
    print("First 10 rows:")
    display(df.head())

    print("\nColumn types:")
    display(df.dtypes)



In [None]:
# Make a document string per row
for _, row in df.iterrows():
  text = " | ".join(f"{col}: {row[col]}" for col in df.columns if not pd.isna(row[col]))
  csv_docs.append({"page_content": text, "metadata": {"source": file}})

In [None]:
# Convert CSV rows into LangChain documents
from langchain.schema import Document

# Entries that are already Document objects are passed through unchanged, so
# re-running this cell does not fail on its own previous output.
csv_docs = [
    d if isinstance(d, Document) else Document(page_content=d["page_content"], metadata=d["metadata"])
    for d in csv_docs
]

# --- 3. Combine everything ---
# pdf_docs is a dict (path -> list of pages) and cannot be added to a list,
# so flatten it into one list of pages first.
pdf_pages = [page for pages in pdf_docs.values() for page in pages]
all_docs = pdf_pages + csv_docs

# --- 4. Create / load vectorstore ---
# persist_dir was only defined in a commented-out line earlier; define it here.
# NOTE(review): this is the same directory the earlier solutions_db cell writes to.
# If both use the same default collection, the PDF pages are likely stored twice - confirm.
persist_dir = "unified_chroma_db"
vectorstore = Chroma.from_documents(all_docs, embeddings, persist_directory=persist_dir)

print("Total documents in unified store:", vectorstore._collection.count())

In [None]:
# Reach into the Chroma collection that backs the vectorstore
collection = vectorstore._collection

# Fetch up to 10 stored entries, with their vectors and source text
results = collection.get(limit=10, include=["embeddings", "documents"])

# For each entry, report the vector length, its leading values, and a text preview
stored_pairs = zip(results["embeddings"], results["documents"])
for position, (vector, document) in enumerate(stored_pairs, start=1):
    print(f"Vector {position}: length={len(vector)}")
    print(f"First 5 dims: {vector[:5]}")
    print(f"Doc snippet: {document[:120]}...\n")

In [None]:
from langchain.chains import RetrievalQA
# ChatOpenAI comes from langchain_openai, the package this notebook installs and
# already uses for OpenAIEmbeddings; the langchain.chat_models path is deprecated.
from langchain_openai import ChatOpenAI

# --- 5. Create retriever ---
# Return the 3 most similar documents for each query
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# --- 6. Connect LLM (ChatOpenAI) ---
# temperature=0 keeps answers as repeatable as the model allows
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# --- 7. Create a RetrievalQA chain ---
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff"  # simple chain that stuffs docs together
)

# --- 8. Ask a question ---
query = "What feedback did Student_1 get?"
# invoke() replaces the deprecated run(); RetrievalQA takes the question under
# "query" and returns the answer text under "result".
result = qa.invoke({"query": query})["result"]
print(result)