In [None]:
#installing packages
!pip install openai faiss-cpu pandas tqdm --quiet


In [None]:
# Importing libraries
import os
import pandas as pd
import numpy as np
import faiss
from tqdm import tqdm
from typing import List, Tuple
from openai import AzureOpenAI


In [None]:
# Environment setup and variables
#
# SECURITY: never commit a real API key to the notebook. OPENAI_API_KEY must be
# supplied from outside (shell export, .env loader, secrets manager). The key
# that used to be embedded in this cell must be treated as leaked and rotated.
#
# setdefault() keeps any value that is already exported in the environment and
# only falls back to the notebook defaults below.
os.environ.setdefault("OPENAI_ENDPOINT", "https://swedencentral.api.cognitive.microsoft.com/")
os.environ.setdefault("OPENAI_API_VERSION", "2024-02-15-preview")
os.environ.setdefault("OPENAI_DEPLOYMENT_NAME", "text-embedding-ada-002")

# The Azure client in this notebook reads OPENAI_ENDPOINT. Keep the older
# OPENAI_API_BASE name pointing at the same URL (instead of a placeholder) so
# the two variables cannot disagree.
os.environ.setdefault("OPENAI_API_BASE", os.environ["OPENAI_ENDPOINT"])

if not os.getenv("OPENAI_API_KEY"):
    print("WARNING: OPENAI_API_KEY is not set; export it before creating the Azure client.")

In [None]:

# Name of the Azure deployment passed as `model` on embeddings requests.
AZURE_DEPLOYMENT_NAME = os.environ.get("OPENAI_DEPLOYMENT_NAME")

# Shared Azure OpenAI client; each setting is None when its variable is unset.
client = AzureOpenAI(
    azure_endpoint=os.environ.get("OPENAI_ENDPOINT"),
    api_version=os.environ.get("OPENAI_API_VERSION"),
    api_key=os.environ.get("OPENAI_API_KEY"),
)


In [None]:
# Load the course catalog and prepare the text column that gets embedded.
DATA_URL = "https://raw.githubusercontent.com/Bluedata-Consulting/GAAPB01-training-code-base/refs/heads/main/Assignments/assignment2dataset.csv"
df = (
    pd.read_csv(DATA_URL)
    .fillna("")  # blank out missing cells so every row has a string to embed
    .assign(full_text=lambda frame: frame["description"])
)
df.head(10)


Unnamed: 0,course_id,title,description,full_text
0,C001,Foundations of Machine Learning,Understand foundational machine learning algor...,Understand foundational machine learning algor...
1,C002,Deep Learning with TensorFlow and Keras,Explore neural network architectures using Ten...,Explore neural network architectures using Ten...
2,C003,Natural Language Processing Fundamentals,Dive into NLP techniques for processing and un...,Dive into NLP techniques for processing and un...
3,C004,Computer Vision and Image Processing,Learn the principles of computer vision and im...,Learn the principles of computer vision and im...
4,C005,Reinforcement Learning Basics,Get introduced to reinforcement learning parad...,Get introduced to reinforcement learning parad...
5,C006,Data Engineering on AWS,Build scalable data pipelines using AWS servic...,Build scalable data pipelines using AWS servic...
6,C007,Cloud Computing with Azure,Master Microsoft Azure’s core services: virtua...,Master Microsoft Azure’s core services: virtua...
7,C008,DevOps Practices and CI/CD,Adopt DevOps methodologies to accelerate softw...,Adopt DevOps methodologies to accelerate softw...
8,C009,Containerization with Docker and Kubernetes,Learn container fundamentals with Docker: imag...,Learn container fundamentals with Docker: imag...
9,C010,APIs and Microservices Architecture,Design and implement RESTful and GraphQL APIs ...,Design and implement RESTful and GraphQL APIs ...


In [None]:
# Function to compute embeddings
def compute_embeddings(texts: List[str], batch_size: int = 10) -> np.ndarray:
    """Embed ``texts`` with the Azure OpenAI deployment, one batch at a time.

    Args:
        texts: Strings to embed, in the order the rows should come back.
        batch_size: Number of texts sent per embeddings request. Defaults to
            10, the value that used to be hard-coded.

    Returns:
        A 2-D ``float32`` array with one row per input text. An empty input
        yields an array of shape ``(0, 0)`` without calling the API.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if len(texts) == 0:
        # Stay 2-D so callers that read `.shape[1]` do not hit an IndexError.
        return np.empty((0, 0), dtype="float32")

    all_embeddings = []
    for start in tqdm(range(0, len(texts), batch_size)):
        batch = texts[start:start + batch_size]
        response = client.embeddings.create(model=AZURE_DEPLOYMENT_NAME, input=batch)
        # Each item carries the position of its input within the batch; sort on
        # it so rows stay aligned with `texts` regardless of response order.
        ordered = sorted(response.data, key=lambda item: item.index)
        all_embeddings.extend(item.embedding for item in ordered)
    return np.array(all_embeddings, dtype="float32")

# Embed every catalog row once up front; rows come back in the same order as df.
course_texts = df["full_text"].tolist()
catalog_embeddings = compute_embeddings(course_texts)


100%|██████████| 3/3 [00:01<00:00,  1.75it/s]


In [None]:
# Indexing of embeddings
# Width of one embedding row; recommend_courses() reads this module-level value
# when it builds its own per-query index.
dimension = catalog_embeddings.shape[1]
# FAISS flat L2 index holding every catalog embedding.
# NOTE(review): nothing in the visible cells queries `index`; recommend_courses()
# builds a temporary index per call instead — confirm whether this one is needed.
index = faiss.IndexFlatL2(dimension)
index.add(catalog_embeddings)


In [None]:
# Main function: takes a user profile and the list of completed courses and
# recommends courses based on the profile.
# It also ensures that courses already completed are not recommended again.

def recommend_courses(profile: str, completed_ids: List[str], top_k=5) -> List[Tuple[str, float]]:
    """Recommend up to ``top_k`` not-yet-completed courses for a free-text profile.

    Args:
        profile: Free-text description of the user's background or interests.
        completed_ids: ``course_id`` values to exclude from the results.
            ``None`` is treated like an empty list.
        top_k: Maximum number of recommendations to return.

    Returns:
        ``(course_id, score)`` tuples, best match first. ``score`` is
        ``1 / (1 + distance)`` rounded to 4 decimals as a plain ``float``,
        where ``distance`` is the value reported by the FAISS L2 index.
        Fewer than ``top_k`` tuples are returned when fewer courses remain;
        an empty list is returned when nothing remains or ``top_k <= 0``.
    """
    if top_k <= 0:
        return []

    # Filter remaining courses
    mask = ~df["course_id"].isin(completed_ids or [])
    remaining_ids = df.loc[mask, "course_id"].tolist()
    if not remaining_ids:
        # Everything is already completed: skip the embeddings API call.
        return []
    remaining_embeddings = catalog_embeddings[mask.to_numpy()]

    # Embed the query
    response = client.embeddings.create(model=AZURE_DEPLOYMENT_NAME, input=[profile])
    query_vec = np.asarray(response.data[0].embedding, dtype="float32").reshape(1, -1)

    # Build temp FAISS index for filtered courses
    temp_index = faiss.IndexFlatL2(remaining_embeddings.shape[1])
    temp_index.add(remaining_embeddings)

    # Never ask for more neighbours than exist: FAISS pads missing results with
    # label -1, and indexing with -1 would silently pick the last course.
    k = min(top_k, len(remaining_ids))
    distances, indices = temp_index.search(query_vec, k)

    recommendations = []
    for idx, dist in zip(indices[0], distances[0]):
        if idx < 0:
            continue  # defensive: padded "no result" slot
        # float() first: rounding a numpy float32 keeps float32 and prints
        # values such as 0.7271000146865845 instead of 0.7271.
        score = 1.0 / (1.0 + float(dist))
        recommendations.append((remaining_ids[int(idx)], round(score, 4)))
    return recommendations


In [None]:
# Sample inputs for the recommender.
# Each entry is a tuple: (free-text user profile, list of course_ids already completed).
# An empty list means no courses are excluded for that profile.
test_profiles = [
    ("I’ve completed the ‘Python Programming for Data Science’ course and enjoy data visualization.", ["C016"]),
    ("I know Azure basics and want to manage containers and build CI/CD pipelines.", ["C007"]),
    ("My background is in ML fundamentals; I’d like to specialize in neural networks and production workflows.", ["C001"]),
    ("I want to learn to build and deploy microservices with Kubernetes—what courses fit best?", []),
    ("I’m interested in blockchain and smart contracts but have no prior experience.", [])
]



In [None]:
# Run the recommender on each sample profile and print the ranked courses.
for i, (query, completed) in enumerate(test_profiles):
    print(f"\n=== Test Case {i+1} ===")
    print(f"User Profile: {query}")
    results = recommend_courses(profile=query, completed_ids=completed)
    for cid, score in results:
        # Look the title up in the catalog by course_id for display.
        is_match = df["course_id"] == cid
        title = df.loc[is_match, "title"].iloc[0]
        print(f"{cid} - {title} (Score: {score})")


=== Test Case 1 ===
User Profile: I’ve completed the ‘Python Programming for Data Science’ course and enjoy data visualization.
C011 - Big Data Analytics with Spark (Score: 0.7271000146865845)
C014 - Data Visualization with Tableau (Score: 0.7181000113487244)
C017 - R Programming and Statistical Analysis (Score: 0.7095000147819519)
C001 - Foundations of Machine Learning (Score: 0.7092000246047974)
C004 - Computer Vision and Image Processing (Score: 0.7085000276565552)

=== Test Case 2 ===
User Profile: I know Azure basics and want to manage containers and build CI/CD pipelines.
C009 - Containerization with Docker and Kubernetes (Score: 0.7436000108718872)
C008 - DevOps Practices and CI/CD (Score: 0.7426999807357788)
C025 - MLOps: Productionizing Machine Learning (Score: 0.7289999723434448)
C006 - Data Engineering on AWS (Score: 0.7093999981880188)
C010 - APIs and Microservices Architecture (Score: 0.7077000141143799)

=== Test Case 3 ===
User Profile: My background is in ML fundamenta