# Assignment 2

In [1]:
!pip install langchain_google_genai faiss-cpu pandas numpy tqdm scikit-learn



In [2]:
# Load API keys from .env file
from dotenv import load_dotenv

# override=True lets values from .env replace variables already set in the environment.
load_dotenv(override=True)

True

In [3]:
import requests
import pandas as pd
import numpy as np
from typing import List, Tuple
from sklearn.preprocessing import normalize
import faiss
from langchain_google_genai import GoogleGenerativeAIEmbeddings

## Embedding Model

In [4]:
# Dataset URL
DATA_URL = "https://raw.githubusercontent.com/Bluedata-Consulting/GAAPB01-training-code-base/main/Assignments/assignment2dataset.csv"
# Local path the downloaded CSV is written to and later read from.
LOCAL_FILE = "assignment2dataset.csv"

# Embedding model
embedding_model_name = "models/embedding-001"
# NOTE(review): the client presumably picks up its API key from the environment
# populated by load_dotenv() — confirm which variable it expects.
embeddings = GoogleGenerativeAIEmbeddings(model=embedding_model_name)

In [5]:
# Load Dataset

def load_dataset():
    """Download the course catalog CSV (best effort) and load it as a DataFrame.

    The file at ``DATA_URL`` is fetched and written to ``LOCAL_FILE``. A failed
    download is reported but not fatal: whatever copy of ``LOCAL_FILE`` already
    exists on disk is loaded instead.

    Returns:
        pandas.DataFrame: the parsed contents of ``LOCAL_FILE``.

    Raises:
        FileNotFoundError: if the download failed and no local copy exists.
    """
    try:
        r = requests.get(DATA_URL, timeout=15)
        if r.status_code == 200:
            with open(LOCAL_FILE, "wb") as f:
                f.write(r.content)
            print(f"Downloaded dataset to {LOCAL_FILE}")
        else:
            print(f"Failed to fetch from URL, status {r.status_code}")
    except (requests.RequestException, OSError) as e:
        # Only network and file-system failures are expected here; anything
        # else is a programming error and should surface.
        print("Download error:", e)

    # Load CSV
    try:
        df = pd.read_csv(LOCAL_FILE)
    except FileNotFoundError as e:
        # Make the root cause explicit instead of a bare pandas error.
        raise FileNotFoundError(
            f"{LOCAL_FILE} is not available locally and could not be downloaded from {DATA_URL}"
        ) from e
    print("Loaded dataset:", df.shape)
    return df

df_catalog = load_dataset()

Downloaded dataset to assignment2dataset.csv
Loaded dataset: (25, 3)


In [6]:
df_catalog.head()

Unnamed: 0,course_id,title,description
0,C001,Foundations of Machine Learning,Understand foundational machine learning algor...
1,C002,Deep Learning with TensorFlow and Keras,Explore neural network architectures using Ten...
2,C003,Natural Language Processing Fundamentals,Dive into NLP techniques for processing and un...
3,C004,Computer Vision and Image Processing,Learn the principles of computer vision and im...
4,C005,Reinforcement Learning Basics,Get introduced to reinforcement learning parad...


In [7]:
# Prepare Text for Embedding
def build_text(row):
    """Combine a course's title and description into one string for embedding.

    Args:
        row: Any object exposing ``.get`` (a DataFrame row or a dict) with
            optional ``"title"`` and ``"description"`` entries.

    Returns:
        str: ``"<title>. <description>"`` with each part stripped of
        surrounding whitespace. A part that is absent, ``None``, NaN/NA or
        otherwise falsy contributes an empty string.
    """
    def _as_text(value) -> str:
        # NaN is truthy, so `value or ""` alone would let a missing CSV cell
        # through as the literal string "nan"; test for it explicitly.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ""
        return str(value or "").strip()

    return _as_text(row.get("title", "")) + ". " + _as_text(row.get("description", ""))

# Attach the text to embed for each course, then renumber rows 0..n-1.
df_catalog = df_catalog.assign(
    text=df_catalog.apply(build_text, axis=1)
).reset_index(drop=True)

In [8]:
# Show a preview rather than dumping the whole catalog into the notebook output.
df_catalog.head(10)

Unnamed: 0,course_id,title,description,text
0,C001,Foundations of Machine Learning,Understand foundational machine learning algor...,Foundations of Machine Learning. Understand fo...
1,C002,Deep Learning with TensorFlow and Keras,Explore neural network architectures using Ten...,Deep Learning with TensorFlow and Keras. Explo...
2,C003,Natural Language Processing Fundamentals,Dive into NLP techniques for processing and un...,Natural Language Processing Fundamentals. Dive...
3,C004,Computer Vision and Image Processing,Learn the principles of computer vision and im...,Computer Vision and Image Processing. Learn th...
4,C005,Reinforcement Learning Basics,Get introduced to reinforcement learning parad...,Reinforcement Learning Basics. Get introduced ...
5,C006,Data Engineering on AWS,Build scalable data pipelines using AWS servic...,Data Engineering on AWS. Build scalable data p...
6,C007,Cloud Computing with Azure,Master Microsoft Azure’s core services: virtua...,Cloud Computing with Azure. Master Microsoft A...
7,C008,DevOps Practices and CI/CD,Adopt DevOps methodologies to accelerate softw...,DevOps Practices and CI/CD. Adopt DevOps metho...
8,C009,Containerization with Docker and Kubernetes,Learn container fundamentals with Docker: imag...,Containerization with Docker and Kubernetes. L...
9,C010,APIs and Microservices Architecture,Design and implement RESTful and GraphQL APIs ...,APIs and Microservices Architecture. Design an...


In [9]:
# Embed Courses
def embed_texts(texts: List[str]) -> np.ndarray:
    """Embed ``texts`` with the module-level ``embeddings`` client.

    Args:
        texts: Strings to embed, one vector per string.

    Returns:
        A float32 array with one L2-normalized row per input text, so that an
        inner product between rows equals their cosine similarity.
    """
    raw_vectors = embeddings.embed_documents(texts)  # one list of floats per text
    matrix = np.asarray(raw_vectors, dtype="float32")
    # Scale every row to unit length.
    return normalize(matrix, norm="l2", axis=1)

# Embed every catalog entry once up front; the printed shape is a quick sanity
# check that one vector came back per course.
course_texts = df_catalog["text"].tolist()
course_embs = embed_texts(course_texts)
print("Course embeddings shape:", course_embs.shape)

Course embeddings shape: (25, 768)


In [10]:
# Build FAISS Index
def build_faiss_index(embs: np.ndarray):
    """Build a flat inner-product FAISS index over the rows of ``embs``.

    Args:
        embs: 2-D array with one embedding per row. Rows should already be
            L2-normalized if the inner-product scores are meant to be read as
            cosine similarities.

    Returns:
        A ``faiss.IndexFlatIP`` containing every row of ``embs``, in order.
    """
    # FAISS expects C-contiguous float32 input; coerce so float64 or sliced
    # arrays are accepted as well (no copy is made when already compliant).
    vectors = np.ascontiguousarray(embs, dtype="float32")
    d = vectors.shape[1]
    index = faiss.IndexFlatIP(d)  # inner product == cosine similarity on unit-norm rows
    index.add(vectors)
    return index

index = build_faiss_index(course_embs)
# Row label -> row dict. The catalog was reset to a 0..n-1 index, so these keys
# line up with the positions of the vectors added to the FAISS index.
metadata = df_catalog.to_dict(orient="index")

In [11]:
# Recommendation Function
def recommend_courses(profile: str, completed_ids: List[str], topn: int = 5) -> List[Tuple[str, float]]:
    """Recommend catalog courses for a learner, skipping ones already completed.

    The text embedded for the search is ``profile`` alone, or, when any of
    ``completed_ids`` match a catalog row,
    ``"Completed: <titles>. Interests: <profile>"``.

    Args:
        profile: Free-text description of the learner's background and goals.
        completed_ids: Course ids already taken. They are compared as strings
            against the catalog's ``course_id`` column; ids not present in the
            catalog are ignored when building the query text.
        topn: Maximum number of recommendations to return (default 5).

    Returns:
        Up to ``topn`` ``(course_id, score)`` tuples in the order the index
        search returned them, where ``score`` is the similarity reported by
        the module-level FAISS ``index``.
    """
    if topn <= 0:
        return []

    # Normalize ids to str once so the exclusion check below agrees with the
    # str-based catalog lookup, whatever type the caller passed.
    completed = {str(cid) for cid in completed_ids}

    # Build query string
    catalog_ids = df_catalog["course_id"].astype(str)  # hoisted out of the loop
    completed_titles = []
    for cid in completed_ids:
        match = df_catalog[catalog_ids == str(cid)]
        if len(match) > 0:
            completed_titles.append(str(match.iloc[0]["title"]))
    if completed_titles:
        query_text = "Completed: " + "; ".join(completed_titles) + ". Interests: " + profile
    else:
        query_text = profile

    # Embed query
    q_emb = np.array(embeddings.embed_query(query_text), dtype="float32").reshape(1, -1)
    q_emb = normalize(q_emb, norm="l2", axis=1)

    # Search FAISS. Over-fetch so that dropping completed courses can still
    # leave topn candidates, but never request more neighbours than the index
    # holds (FAISS pads the surplus with index -1).
    k = min(index.ntotal, max(20, topn + len(completed)))
    if k == 0:
        return []
    distances, indices = index.search(q_emb, k)

    # Filter results
    results = []
    seen = set()
    for idx, score in zip(indices[0], distances[0]):
        if idx < 0:  # padding entry, not a real catalog row
            continue
        cid = str(metadata[int(idx)]["course_id"])
        if cid in completed or cid in seen:
            continue
        seen.add(cid)
        results.append((cid, float(score)))
        if len(results) >= topn:
            break
    return results

In [12]:
# Helper to Get Course Info
def get_course_info(course_id):
    """Look up a course in ``df_catalog`` by id (compared as strings).

    Returns:
        A dict with the ``course_id``, ``title`` and ``description`` of the
        first matching row, or ``None`` when no row matches.
    """
    wanted = str(course_id)
    hits = df_catalog.loc[df_catalog["course_id"].astype(str) == wanted]
    if hits.shape[0] == 0:
        return None
    first = hits.iloc[0]
    return {field: first[field] for field in ("course_id", "title", "description")}

In [15]:
# Test with Sample Profiles
# Each entry pairs a free-text "query" with the "completed_ids" that must be
# excluded from that learner's recommendations.
sample_profiles = [
    {"query": "I've completed the 'Python Programming for Data Science' course and enjoy data visualization. What should I take next?",
     "completed_ids": ["C016"]},
    {"query": "I know Azure basics and want to manage containers and build CI/CD pipelines. Recommend courses.",
     "completed_ids": []},
    {"query": "My background is in ML fundamentals; I'd like to specialize in neural networks and production workflows.",
     "completed_ids": []},
    {"query": "I want to learn to build and deploy microservices with Kubernetes—what courses fit best?",
     "completed_ids": []},
    {"query": "I'm interested in blockchain and smart contracts but have no prior experience. Which courses do you suggest?",
     "completed_ids": []}
]

# Print each profile's recommendations as "course id | title | score".
for i, profile in enumerate(sample_profiles, 1):
    print(f"\n=== Profile {i}: {profile['query']}")
    recs = recommend_courses(profile["query"], profile["completed_ids"])
    for cid, score in recs:
        # Recommended ids come from the catalog itself, so the lookup is
        # expected to find a row here.
        info = get_course_info(cid)
        print(f" -> {cid} | {info['title']} | Score: {score:.4f}")


=== Profile 1: I've completed the 'Python Programming for Data Science' course and enjoy data visualization. What should I take next?
 -> C017 | R Programming and Statistical Analysis | Score: 0.8230
 -> C001 | Foundations of Machine Learning | Score: 0.8162
 -> C002 | Deep Learning with TensorFlow and Keras | Score: 0.8127
 -> C011 | Big Data Analytics with Spark | Score: 0.8118
 -> C014 | Data Visualization with Tableau | Score: 0.8100

=== Profile 2: I know Azure basics and want to manage containers and build CI/CD pipelines. Recommend courses.
 -> C009 | Containerization with Docker and Kubernetes | Score: 0.8192
 -> C007 | Cloud Computing with Azure | Score: 0.8190
 -> C008 | DevOps Practices and CI/CD | Score: 0.8123
 -> C010 | APIs and Microservices Architecture | Score: 0.7806
 -> C002 | Deep Learning with TensorFlow and Keras | Score: 0.7706

=== Profile 3: My background is in ML fundamentals; I'd like to specialize in neural networks and production workflows.
 -> C002 | Deep