#### Load libraries

In [39]:
import os
import dotenv
from openai import AzureOpenAI
import pandas as pd
import faiss
import numpy as np
import pickle

#### Setup embedding model

In [40]:
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-12-01-preview",  # 
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

def get_embedding(text: str, deployment="text-embedding-ada-002") -> list:
    response = client.embeddings.create(
        input=[text],
        model=deployment  # this is your *deployment name*, not base model
    )
    return response.data[0].embedding

#### Create embeddings using openai and persist it using faiss. Store metadata separately.

In [41]:
# Load data
df = pd.read_csv("/home/user/Desktop/GAAPB01-training-code-base/Assignments/assignment2dataset.csv")
df['full_text'] = df['title'] + ": " + df['description']
df['embedding'] = df['full_text'].apply(get_embedding)

# Create FAISS index
index = faiss.IndexFlatL2(1536)
embedding_matrix = np.vstack(df['embedding'].values)
index.add(embedding_matrix)

# Save FAISS index
faiss.write_index(index, "course_index.faiss")

# Create ID -> metadata mapping (course_id, title, etc.)
metadata = df[['course_id', 'title', 'description']].to_dict(orient='records')

# Save metadata to match FAISS index
with open("course_metadata.pkl", "wb") as f:
    pickle.dump(metadata, f)


#### Get recommendations through similarity search

In [42]:
from typing import List, Tuple
import numpy as np
import faiss
import pickle

# Load FAISS index
index = faiss.read_index("course_index.faiss")

# Load metadata (list of dicts with 'course_id', 'title', 'description')
with open("course_metadata.pkl", "rb") as f:
    metadata = pickle.load(f)

def recommend_courses(profile: str, completed_ids: List[str]) -> List[Tuple[str, float]]:
    input_text = profile

    # # Get full_texts of completed courses
    # completed_texts = [
    #     f"{item['title']}: {item['description']}"
    #     for item in metadata
    #     if item['course_id'] in completed_ids
    # ]

    # # Append completed course content to profile
    # if completed_texts:
    #     input_text += " " + " ".join(completed_texts)

    # Embed the final input
    query_embedding = np.array(get_embedding(input_text)).reshape(1, -1)

    # Query FAISS index
    distances, indices = index.search(query_embedding, k=10)

    # Prepare recommendations (exclude already completed)
    recommendations = [
        (metadata[i]['course_id'], float(1 - distances[0][j]))
        for j, i in enumerate(indices[0])
        if metadata[i]['course_id'] not in completed_ids
    ]
    return recommendations[:5]  # Return top 5 recommendations


#### Execution to get recommendations

In [38]:

questions_to_ask = [
    "I've completed the 'Python Programming for Data Science' course and enjoy data visualization. What should I take next?",
    "I know Azure basics and want to manage containers and build CI/CD pipelines. Recommend courses.",
    "My background is in ML fundamentals; I'd like to specialize in neural networks and production workflows.",
    "I want to learn to build and deploy microservices with Kubernetes—what courses fit best?",
    "I'm interested in blockchain and smart contracts but have no prior experience. Which courses do you suggest?"
]
completed_courses_list = ["C016","","","","",""]

for question in questions_to_ask:
    print("-" * 80)
    print(f"Input query:\n {question}")
    recommend_course_ids = recommend_courses(question, completed_courses_list[questions_to_ask.index(question)])
    print(f"\nRecommended Course IDs for:\n", recommend_course_ids)
    print("\nAdditional Information:")
    for course_id, score in recommend_course_ids:
        metadata_item = next((item for item in metadata if item['course_id'] == course_id), None)
        if metadata_item:
            title = metadata_item['title']
            print(f"{course_id}: {title}")

--------------------------------------------------------------------------------
Input query:
 I've completed the 'Python Programming for Data Science' course and enjoy data visualization. What should I take next?

Recommended Course IDs for:
 [('C011', 0.6178039312362671), ('C014', 0.6132631301879883), ('C017', 0.5913984775543213), ('C004', 0.5881150960922241), ('C001', 0.5750609040260315)]

Additional Information:
C011: Big Data Analytics with Spark
C014: Data Visualization with Tableau
C017: R Programming and Statistical Analysis
C004: Computer Vision and Image Processing
C001: Foundations of Machine Learning
--------------------------------------------------------------------------------
Input query:
 I know Azure basics and want to manage containers and build CI/CD pipelines. Recommend courses.

Recommended Course IDs for:
 [('C007', 0.6888490915298462), ('C009', 0.665590226650238), ('C008', 0.6512447595596313), ('C010', 0.5979080200195312), ('C025', 0.5947254300117493)]

Addition