In [None]:
# Mount Google Drive so files under /content/drive are readable (Colab-only API)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import warnings

# Silence DeprecationWarning globally for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.simplefilter("ignore", category=DeprecationWarning)

In [None]:
# spaCy plus its small English pipeline, loaded later via spacy.load("en_core_web_sm")
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Provides SentenceTransformer, used below to embed descriptions and queries
!pip install sentence-transformers



In [None]:
# CPU build of FAISS, used below to build and save a flat L2 index
!pip install faiss-cpu



In [None]:
# Qdrant Python client; QdrantClient is used below as the vector store
!pip install qdrant-client



In [None]:
# NOTE(review): qdrant-client is already installed by the previous cell; this
# unpinned upgrade looks redundant with it — confirm, then merge and pin a version.
!pip install --upgrade qdrant-client



In [None]:
# Google Generative AI SDK, imported below as `genai` for the Gemini model
!pip install google-generativeai



In [None]:
# NOTE(review): agentops is imported below but not otherwise used in the visible
# part of this notebook — confirm it is still needed.
!pip install agentops



In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import concurrent.futures
from tqdm import tqdm
import time
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import random
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import spacy
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, Distance, VectorParams
import google.generativeai as genai
from sklearn.metrics.pairwise import cosine_similarity
from langchain.schema.runnable import Runnable
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from qdrant_client.http.models import Filter, FieldCondition, MatchText
import agentops

# 5.1 Data Preparation & Vector Database Integration

Data Cleaning and Normalization:

In [None]:
# Load the raw Udemy course dataset from the mounted Drive (absolute Colab path)
df = pd.read_csv("/content/drive/MyDrive/udemy_course_data.csv")

In [None]:
# Publication-date parts and profit are not used by anything downstream
columns_to_drop = [
    'published_timestamp',
    'published_date',
    'published_time',
    'year',
    'month',
    'day',
    'profit',
]

df = df.drop(labels=columns_to_drop, axis="columns")

In [None]:
# Function to categorize the status of a course page
def categorize_course_status(url):
    """Fetch a course page and classify what kind of page came back.

    Checks are applied in order and the first match wins. Returns one of:
    "Non-English", "Error Page", "Course Unavailable", "Private Course",
    "Valid", "Alternate Description Location", "No Description Found",
    "Error <status code>" (HTTP error raised by ``raise_for_status``) or
    "Failed" (any other exception, e.g. timeout or connection error).
    """

    def has_paragraphs(container):
        # True only when the container exists and holds at least one <p>
        return bool(container and container.find_all('p'))

    def says_unavailable(tag):
        # True when the tag exists and its text announces closed enrollment
        return bool(tag and "no longer accepting enrollments" in tag.get_text(strip=True).lower())

    # NOTE(review): the second marker says "controller" where the first says
    # "container" — possibly a typo; verify against the live page markup.
    unavailable_markers = (
        'safely-set-inner-html:limited-access-container:title',
        'safely-set-inner-html:limited-access-controller:subtitle',
    )

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')

        # Non-English check: page must declare lang="en" somewhere in its markup
        if not ('<html lang="en"' in html or 'lang="en"' in html):
            return "Non-English"

        # Error page
        if soup.find('h1', string=lambda s: s and "we can’t find the page you’re looking for" in s.lower()):
            return "Error Page"

        # Course unavailable (title or subtitle banner)
        if any(says_unavailable(soup.find('div', {'data-purpose': marker})) for marker in unavailable_markers):
            return "Course Unavailable"

        # Private course
        if soup.find('div', string=lambda s: s and "this is a private course." in s.lower()):
            return "Private Course"

        # Standard description container
        if has_paragraphs(soup.find('div', {'data-purpose': 'safely-set-inner-html:description:description'})):
            return "Valid"

        # Alternate known container
        if has_paragraphs(soup.find('div', {'class': 'ud-component--clp--description'})):
            return "Alternate Description Location"

        # Nothing found
        return "No Description Found"

    except requests.exceptions.HTTPError as err:
        return f"Error {err.response.status_code}"
    except Exception:
        # Best-effort scrape: any other failure is recorded, not raised
        return "Failed"


In [None]:
urls = df['url'].tolist()

# Requests are network-bound, so classify the URLs on a pool of 10 threads;
# executor.map keeps results in the same order as `urls`
with ThreadPoolExecutor(max_workers=10) as executor:
    status_results = executor.map(categorize_course_status, urls)
    status_list = list(tqdm(status_results, total=len(urls)))

# Attach the per-URL status to the frame
df['status'] = status_list

# Show how many courses fell into each status
summary = df['status'].value_counts()
print("\n Summary of URL Statuses:\n")
print(summary)

100%|██████████| 3683/3683 [09:53<00:00,  6.21it/s]


 Summary of URL Statuses:

status
Valid                   1964
Course Unavailable       846
No Description Found     631
Non-English              194
Error 404                 45
Failed                     3
Name: count, dtype: int64





In [None]:
# Keep only pages classified "Valid", one row per URL, with a fresh 0..n-1 index
df = (
    df.loc[df['status'] == "Valid"]
    .drop_duplicates(subset='url', keep='first')
    .reset_index(drop=True)
)

In [None]:
# Function to extract the course description text from a course URL
def extract_course_description(url):
    """Return the description paragraphs of a course page joined by newlines.

    The standard description container is tried first, then the alternate
    one. When neither holds any <p> element the string "No Description Found"
    is returned; HTTP errors yield "Error <status code>" and any other
    exception yields "Failed".
    """
    container_selectors = (
        {'data-purpose': 'safely-set-inner-html:description:description'},  # standard
        {'class': 'ud-component--clp--description'},                        # alternate
    )

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        for attrs in container_selectors:
            container = soup.find('div', attrs)
            paragraphs = container.find_all('p') if container else []
            if paragraphs:
                # Join all paragraph texts into a single string
                return "\n".join(p.get_text(strip=True) for p in paragraphs)

        # If no description is found
        return "No Description Found"

    except requests.exceptions.HTTPError as err:
        return f"Error {err.response.status_code}"
    except Exception:
        # Best-effort scrape: any other failure is recorded, not raised
        return "Failed"

In [None]:
# Scrape descriptions on a pool of 10 threads; executor.map preserves row order
with ThreadPoolExecutor(max_workers=10) as executor:
    description_results = executor.map(extract_course_description, df['url'])
    description_list = list(tqdm(description_results, total=len(df)))

# Store the scraped text alongside each course
df['descriptions'] = description_list

print("\n Dataset with descriptions added:")
print(df[['url', 'descriptions']].head())

100%|██████████| 1960/1960 [06:11<00:00,  5.27it/s]


 Dataset with descriptions added:
                                                 url  \
0  https://www.udemy.com/ultimate-investment-bank...   
1      https://www.udemy.com/goods-and-services-tax/   
2  https://www.udemy.com/financial-modeling-for-b...   
3  https://www.udemy.com/complete-excel-finance-c...   
4  https://www.udemy.com/how-to-maximize-your-pro...   

                                        descriptions  
0  Our dedicated approach and solid methodology w...  
1  WHAT IS GST ?\nGST stands for “Goods and Servi...  
2  What is the aim of this course?\nAs a business...  
3  Why this course is for you\nYou want to increa...  
4  *****Join Over 1,680 Students Currently Enroll...  





In [None]:
# extract_course_description reports problems with sentinel strings instead of
# raising: "Failed", "No Description Found" and "Error <status code>". All of
# them must be removed, otherwise they are treated as real descriptions and
# embedded later on.
is_sentinel = (
    df['descriptions'].isin(["Failed", "No Description Found"])
    | df['descriptions'].str.fullmatch(r"Error \d+", na=False)
)

# Keep real descriptions only, one row per distinct description, fresh index
df = (
    df.loc[~is_sentinel]
    .drop_duplicates(subset='descriptions', keep='first')
    .reset_index(drop=True)
)

In [None]:
# Check the row count and non-null count of the scraped descriptions
df['descriptions'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1954 entries, 0 to 1953
Series name: descriptions
Non-Null Count  Dtype 
--------------  ----- 
1954 non-null   object
dtypes: object(1)
memory usage: 15.4+ KB


In [None]:
# 'status' was only needed to select valid pages; remove it from the frame
df = df.drop(columns=['status'])

In [None]:
# Preview the first two rows of the cleaned frame
df.head(2)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,subject,descriptions
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5 hours,Business Finance,Our dedicated approach and solid methodology w...
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39 hours,Business Finance,WHAT IS GST ?\nGST stands for “Goods and Servi...


In [None]:
# Save the final cleaned dataset to Drive (absolute Colab path; index column omitted)
output_file_path = "/content/drive/MyDrive/cleaned_udemy_course_data.csv"
df.to_csv(output_file_path, index=False)

print(f"\n Final cleaned dataset saved successfully to: {output_file_path}")


 Final cleaned dataset saved successfully to: /content/drive/MyDrive/cleaned_udemy_course_data.csv


Text Normalization:

In [None]:
# Function to normalize raw description text
def preprocess_text(text):
    """Lower-case ``text``, keep only a-z letters, and drop English stop words.

    Every character that is not a lowercase letter or whitespace is deleted
    (not replaced by a space), whitespace runs are collapsed, and words found
    in scikit-learn's ENGLISH_STOP_WORDS are removed. Returns the remaining
    words joined by single spaces.
    """
    lowered = text.lower()

    # Delete everything except letters and whitespace, then collapse whitespace
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    collapsed = re.sub(r'\s+', ' ', letters_only).strip()

    return " ".join(
        token for token in collapsed.split() if token not in ENGLISH_STOP_WORDS
    )

In [None]:
# Make sure every description is a str before text processing
df = df.astype({'descriptions': str})

In [None]:
# Normalize every entry of the 'descriptions' column into 'cleaned_descriptions'
df['cleaned_descriptions'] = df['descriptions'].map(preprocess_text)

# Show the first few cleaned descriptions
print(df[['cleaned_descriptions']].head())

                                cleaned_descriptions
0  dedicated approach solid methodology teach too...
1  gst gst stands goods services tax proposed com...
2  aim course business analyst consultant time ti...
3  course want increase excel skills efficient wo...
4  join students currently enrolled course course...


Tokenization and Lemmatization:

In [None]:
# Download the NLTK data used by the next cells:
# - 'punkt' / 'punkt_tab': tokenizer data for word_tokenize (which of the two is
#   required presumably depends on the installed NLTK version — confirm)
# - 'wordnet': lexical database behind WordNetLemmatizer
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# One shared WordNet lemmatizer for all rows
lemmatizer = WordNetLemmatizer()

# Function to tokenize and lemmatize text
def tokenize_and_lemmatize(text):
    """Split ``text`` into word tokens and return their lemmas joined by spaces.

    Each token is lemmatized with WordNetLemmatizer's default settings.
    """
    return " ".join(map(lemmatizer.lemmatize, word_tokenize(text)))

In [None]:
# Lemmatize every entry of the 'cleaned_descriptions' column
df['lemmatized_descriptions'] = df['cleaned_descriptions'].map(tokenize_and_lemmatize)

# Compare raw, cleaned and lemmatized text side by side
print(df[['descriptions', 'cleaned_descriptions', 'lemmatized_descriptions']].head())

                                        descriptions  \
0  Our dedicated approach and solid methodology w...   
1  WHAT IS GST ?\nGST stands for “Goods and Servi...   
2  What is the aim of this course?\nAs a business...   
3  Why this course is for you\nYou want to increa...   
4  *****Join Over 1,680 Students Currently Enroll...   

                                cleaned_descriptions  \
0  dedicated approach solid methodology teach too...   
1  gst gst stands goods services tax proposed com...   
2  aim course business analyst consultant time ti...   
3  course want increase excel skills efficient wo...   
4  join students currently enrolled course course...   

                             lemmatized_descriptions  
0  dedicated approach solid methodology teach too...  
1  gst gst stand good service tax proposed compre...  
2  aim course business analyst consultant time ti...  
3  course want increase excel skill efficient wor...  
4  join student currently enrolled course course ..

# Data Enrichment:

Semantic Tagging:

In [None]:
# Small English spaCy pipeline (installed in the setup cells)
nlp = spacy.load("en_core_web_sm")

# Function to extract semantic tags
def extract_semantic_tags(text):
    """Return the entity-type labels spaCy finds in ``text``, comma-separated.

    One label (e.g. PERSON, ORG) is emitted per detected entity, in document
    order, so a label repeats when several entities share a type.

    NOTE(review): this returns entity *types*, not the entity text, and the
    caller feeds it lower-cased, stop-word-stripped text — confirm that this
    is the intended tagging.
    """
    labels = []
    for entity in nlp(text).ents:
        labels.append(entity.label_)
    return ", ".join(labels)

In [None]:
# Tag every entry of the 'lemmatized_descriptions' column
df['semantic_tags'] = df['lemmatized_descriptions'].map(extract_semantic_tags)

# Show raw text, lemmatized text and the resulting tags
print(df[['descriptions', 'lemmatized_descriptions', 'semantic_tags']].head())

                                        descriptions  \
0  Our dedicated approach and solid methodology w...   
1  WHAT IS GST ?\nGST stands for “Goods and Servi...   
2  What is the aim of this course?\nAs a business...   
3  Why this course is for you\nYou want to increa...   
4  *****Join Over 1,680 Students Currently Enroll...   

                             lemmatized_descriptions  \
0  dedicated approach solid methodology teach too...   
1  gst gst stand good service tax proposed compre...   
2  aim course business analyst consultant time ti...   
3  course want increase excel skill efficient wor...   
4  join student currently enrolled course course ...   

                                       semantic_tags  
0                                               NORP  
1                                                     
2  CARDINAL, PERSON, PERSON, PERSON, PERSON, ORG,...  
3                                               DATE  
4                                                  

# Vectorization and Embedding:

In [None]:
# Load the pre-trained sentence-embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all descriptions in one batched call instead of one forward pass per
# row; encode() returns one row vector per input text, in input order
description_embeddings = model.encode(
    df['lemmatized_descriptions'].tolist(),
    show_progress_bar=True,
)

# Store one vector per row (list() splits the 2-D result into its rows)
df['embeddings'] = list(description_embeddings)

# Display the first few rows
print(df[['descriptions', 'lemmatized_descriptions', 'embeddings']].head())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


                                        descriptions  \
0  Our dedicated approach and solid methodology w...   
1  WHAT IS GST ?\nGST stands for “Goods and Servi...   
2  What is the aim of this course?\nAs a business...   
3  Why this course is for you\nYou want to increa...   
4  *****Join Over 1,680 Students Currently Enroll...   

                             lemmatized_descriptions  \
0  dedicated approach solid methodology teach too...   
1  gst gst stand good service tax proposed compre...   
2  aim course business analyst consultant time ti...   
3  course want increase excel skill efficient wor...   
4  join student currently enrolled course course ...   

                                          embeddings  
0  [0.041338906, -0.048048142, -0.06560842, 0.000...  
1  [-0.0643979, 0.0043843095, 0.07195871, -0.0847...  
2  [0.055106495, -0.03882285, -0.08019861, 0.0042...  
3  [-0.043257754, 0.040885597, -0.051284906, -0.0...  
4  [0.0033338307, -0.052462436, -0.043229874, 0.0..

# Indexing for Retrieval:

In [None]:
# Stack the per-row vectors into a single 2-D matrix (one row per course)
embeddings_matrix = np.vstack(df['embeddings'].to_numpy())

# Flat L2 index sized to the embedding width.
# NOTE(review): this index uses L2 distance while the Qdrant collection created
# later uses cosine distance — confirm the mismatch is intended.
dimension = embeddings_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_matrix)

# Persist the index next to the notebook
faiss.write_index(index, "course_embeddings.index")

Extract Metadata

In [None]:
# Inspect the columns of df (the label previously named a non-existent df_subset)
print("Columns in df:", df.columns)

Columns in df_subset: Index(['course_id', 'course_title', 'url', 'is_paid', 'price',
       'num_subscribers', 'num_reviews', 'num_lectures', 'level',
       'content_duration', 'subject', 'descriptions', 'cleaned_descriptions',
       'lemmatized_descriptions', 'semantic_tags', 'embeddings'],
      dtype='object')


In [None]:
# Select the metadata columns; .copy() makes metadata_df an independent frame
# so later column assignments do not act on a slice of df
metadata_df = df[
    ['course_id', 'course_title', 'descriptions', 'level', 'subject', 'url', 'is_paid']
].copy()

# Save metadata_df to a CSV file
metadata_df.to_csv("course_metadata.csv", index=False)
print("Metadata saved to 'course_metadata.csv'")

# Display the extracted metadata (must be the last expression to be rendered)
metadata_df.head()

Metadata saved to 'course_metadata.csv'


In [None]:
# Check row count, dtypes and non-null counts of the metadata table
metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1954 entries, 0 to 1953
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   course_id     1954 non-null   int64 
 1   course_title  1954 non-null   object
 2   descriptions  1954 non-null   object
 3   level         1954 non-null   object
 4   subject       1954 non-null   object
 5   url           1954 non-null   object
 6   is_paid       1954 non-null   bool  
dtypes: bool(1), int64(1), object(5)
memory usage: 93.6+ KB


# Vector Database Integration:

Embedding Storage:

In [None]:
# Reload the FAISS index written by the indexing cell above
index_path = "course_embeddings.index"
index = faiss.read_index(index_path)

# Read the vector width back from the index (index.d) and report it
dimension = index.d
print(f"Dimension of the embeddings: {dimension}")

Dimension of the embeddings: 384


In [None]:
# Give metadata_df a fresh 0..n-1 index so row i lines up with embedding row i
metadata_df = metadata_df.reset_index(drop=True)

# Show the resulting index
print("New indices of metadata_df:", metadata_df.index, sep="\n")

New indices of metadata_df:
RangeIndex(start=0, stop=1954, step=1)


In [None]:
# Pass every course_id through int()
metadata_df['course_id'] = metadata_df['course_id'].map(int)

# Report the element types now present in the column
print("Updated course_id types:", metadata_df['course_id'].apply(type).unique(), sep="\n")

Updated course_id types:
[<class 'int'>]


In [None]:
# Initialize Qdrant client (in-memory mode: contents are lost when the kernel stops)
client = QdrantClient(":memory:")

collection_name = "courses"
dimension = embeddings_matrix.shape[1]

# Drop a stale collection only when one exists. An explicit existence check
# replaces the former blanket `except Exception: pass`, which hid real errors.
if client.collection_exists(collection_name=collection_name):
    client.delete_collection(collection_name=collection_name)

# Create the collection: one vector per point, compared by cosine distance
client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=dimension, distance=Distance.COSINE),
    optimizers_config=None,
    shard_number=1,
    on_disk_payload=False
)

print("Collection recreated with support for integer IDs.")

Collection recreated with support for integer IDs.


In [None]:
# Rows and vectors are paired by position, so both sides must have equal length
assert len(metadata_df) == len(embeddings_matrix), (
    "metadata_df and embeddings_matrix are out of sync"
)

# Build every point first. zip() pairs by position, so this does not depend on
# metadata_df's index labels matching row numbers.
# NOTE(review): points are keyed by course_id, so two rows sharing a course_id
# would overwrite each other — earlier de-duplication used url/description only.
points = [
    PointStruct(
        id=int(row.course_id),      # Qdrant point ID: the integer course_id
        vector=vector.tolist(),     # plain list of floats
        payload={
            "title": row.course_title,
            "description": row.descriptions,
            "level": row.level,
            "subject": row.subject,
            "url": row.url,
        },
    )
    for row, vector in zip(metadata_df.itertuples(index=False), embeddings_matrix)
]

# One bulk upsert instead of one call per row
client.upsert(collection_name="courses", points=points)

print("Embeddings and metadata successfully uploaded to Qdrant!")

Embeddings and metadata successfully uploaded to Qdrant!


In [None]:
# Look up the first course by its ID to confirm the upload worked
sample_id = int(metadata_df['course_id'].iloc[0])
point = client.retrieve(collection_name="courses", ids=[sample_id])

# Print the retrieved point
print("Sample Point from Qdrant:", point, sep="\n")

Sample Point from Qdrant:
[Record(id=1070968, payload={'title': 'Ultimate Investment Banking Course', 'description': 'Our dedicated approach and solid methodology will teach you\xa0the tools and skills of a good investment banker.\nThis course is also for anyone who wants to work in\xa0Investment Banking, and wants to learn about the necessary skills required to get into this sector.\nNo prior technical knowledge is required.\nWe have\xa0DIRECT EXAMPLES \xa0for all of the concepts that we will be introducing, so you can practice directly as you go forward into the course.\nBanking Organization: Understand how a bank really works, the main actors and how they interact with each others.\nIntroduction to the Financial Markets: Introducing the different markets where the assets classes are exchanged. We will cover Equities, Bonds, Commodities and Forex.\nInterest Rates: Learn about the most common interest rates, from simple to periodic compound rates, those will not have any secrets for y

In [None]:
# Load the embedding model (same checkpoint that embedded the descriptions)
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to retrieve relevant courses from Qdrant
def retrieve_courses(query, top_k=3):
    """Return the ``top_k`` courses most similar to ``query``.

    The query is embedded with ``embedding_model`` and matched against the
    "courses" collection.

    Args:
        query: Free-text user query.
        top_k: Maximum number of courses to return.

    Returns:
        A list of dicts with keys "id", "title", "description", "level",
        "subject" and "url", ordered as returned by Qdrant.
    """
    query_embedding = embedding_model.encode(query)

    # query_points replaces the deprecated client.search; the hits are on `.points`
    hits = client.query_points(
        collection_name="courses",
        query=query_embedding.tolist(),
        limit=top_k,
    ).points

    payload_fields = ("title", "description", "level", "subject", "url")
    return [
        {"id": hit.id, **{field: hit.payload[field] for field in payload_fields}}
        for hit in hits
    ]

In [None]:
# Smoke-test retrieval with a sample query (default top_k of retrieve_courses)
user_query = "What are some beginner courses on finance?"
relevant_courses = retrieve_courses(user_query)

# Print ID, title and full description of each retrieved course
print("Retrieved Courses:")
for course in relevant_courses:
    print(f"ID: {course['id']}, Title: {course['title']}, Description: {course['description']}")

Retrieved Courses:
ID: 806122, Title: Learn and Master the Basics of Finance, Description: This course in Finance is meant for beginners and intermediate level students of finance who would like to get a good grip of the concepts of the subject and be able to comprehend the financial world.
Most of us read the news without really paying attention to the financial news. This is due to the fact that most of us don't comprehend the basics of the subject. By taking this course, one will be able to build a solid foundation and understand the complex topics by breaking it down to simple concepts.
In order to take this course, one does not need any prior knowledge. However, I would like one to keep an open and receptive mind.
In over 3 hours worth of course material , you will learn every thing from the basics of banking to complex derivative products. The course is structured in such a way that it will gradually build on complex topics yet keeping things simple.
This course is also a great h

# 5.2 LLM & RAG Pipeline

In [None]:
# Configure the Gemini API key.
# SECURITY: never commit the key to the notebook. It is read from the
# GEMINI_API_KEY environment variable, with an interactive prompt as fallback.
# The key that was previously hard-coded here must be revoked.
import os
from getpass import getpass

GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=GEMINI_API_KEY)

# NOTE: this rebinds `model`, which held the SentenceTransformer earlier on
model = genai.GenerativeModel('gemini-2.0-flash')

# Function to generate a response using the RAG pipeline
def rag_response(user_query):
    """Answer ``user_query`` with Gemini, grounded on the top-3 retrieved courses.

    Retrieves courses via ``retrieve_courses``, formats them into a context
    block, sends query + context + instruction to ``model`` and returns the
    generated text.
    """
    course_blocks = []
    for course in retrieve_courses(user_query, top_k=3):
        course_blocks.append(
            f"Course Title: {course['title']}\nDescription: {course['description']}\nLevel: {course['level']}\nSubject: {course['subject']}\nURL: {course['url']}"
        )
    context = "\n".join(course_blocks)

    # Prompt layout: query, retrieved courses, then the instruction
    prompt = f"""
    User Query: {user_query}

    Relevant Courses:
    {context}

    Generate a concise and helpful response based on the above information.
    """

    return model.generate_content(prompt).text

In [None]:
# Run the RAG pipeline end to end on a sample query
user_query = "What are some beginner courses on finance?"
response = rag_response(user_query)

# Show the model's answer
print("Generated Response:", response, sep="\n")

Generated Response:
Here are three beginner-friendly finance courses that can help you build a solid foundation:

*   **Learn and Master the Basics of Finance:** A comprehensive 3-hour course covering everything from basic banking to derivatives. No prior knowledge needed. ([https://www.udemy.com/finance-101/](https://www.udemy.com/finance-101/))
*   **Understand Core Finance Principles in 60 Minutes:** A practical overview of key corporate finance principles, including financial analysis, forecasting, and risk management. ([https://www.udemy.com/core-finance-principles-in-60-minutes/](https://www.udemy.com/core-finance-principles-in-60-minutes/))
*   **Surviving Introduction to Finance:** Designed to supplement an introductory finance course, covering topics like time value of money and discounted cash flow analysis. ([https://www.udemy.com/surviving-introduction-to-finance/](https://www.udemy.com/surviving-introduction-to-finance/))



In [None]:
# Function to compare embeddings
def inspect_embeddings(embeddings_matrix):
    """Print the cosine similarity of the first embedding to the first five rows.

    Row 0 is compared with rows 0-4 of ``embeddings_matrix`` (itself included)
    and one line is printed per row.
    """
    reference = embeddings_matrix[0].reshape(1, -1)
    scores = cosine_similarity(reference, embeddings_matrix[:5])[0]

    print("Cosine Similarities with the First Embedding:")
    for position, score in enumerate(scores):
        print(f"Course {position}: {score}")

# Inspect the embeddings
inspect_embeddings(embeddings_matrix)

Cosine Similarities with the First Embedding:
Course 0: 1.0000001192092896
Course 1: 0.36010098457336426
Course 2: 0.6701974272727966
Course 3: 0.48962515592575073
Course 4: 0.6475365161895752


# AI Agents for Context Handling:

Course recommendation & course details retrieval

In [None]:
# Adapter that exposes a Gemini model through LangChain's Runnable interface
class GeminiRunnable(Runnable):
    """Runnable wrapper around a Gemini ``GenerativeModel``."""

    def __init__(self, model):
        # The Gemini GenerativeModel instance that produces the responses
        self.model = model

    def invoke(self, input_data, config=None, **kwargs):
        """Send ``query`` and ``context`` from ``input_data`` to Gemini.

        ``input_data`` is read with ``.get``; missing "query"/"context" keys
        default to empty strings. The two parts are joined with a newline and
        the generated text is returned. ``config`` and extra keyword arguments
        are accepted and ignored.
        """
        query_part = input_data.get("query", "")
        context_part = input_data.get("context", "")
        reply = self.model.generate_content("\n".join((query_part, context_part)))
        return reply.text

In [None]:
# Initialize the custom wrapper for the Gemini API
gemini_llm = GeminiRunnable(model=model)

In [None]:
# Generate a response for a query from pre-formatted course context
def generate_response(query, context):
    """Ask Gemini to answer ``query`` using ``context``.

    The instruction text used to be built into a local that was never sent,
    so the model only received the raw query and context. It is now part of
    what reaches the model.

    Args:
        query: The user's question.
        context: Pre-formatted text describing the retrieved courses.

    Returns:
        The generated response text.
    """
    grounded_context = (
        "Relevant Courses:\n"
        f"{context}\n\n"
        "Generate a concise and helpful response based on the above information."
    )
    # GeminiRunnable.invoke joins "query" and "context" with a newline
    return gemini_llm.invoke({"query": f"User Query: {query}", "context": grounded_context})

In [None]:
# Example usage for course recommendation
user_query = "What are some beginner courses on data science?"
relevant_courses = retrieve_courses(user_query, top_k=3)

# One text block per retrieved course, joined into a single context string
context = "\n".join(
    f"Course Title: {course['title']}\nDescription: {course['description']}\nLevel: {course['level']}\nSubject: {course['subject']}\nURL: {course['url']}"
    for course in relevant_courses
)

# Generate and show the response
response = generate_response(user_query, context)
print("Generated Response:", response, sep="\n")

Generated Response:
The courses you listed are focused on web development. However, based on what you have provided, the course "Learn How To Build a Web Application Without Coding" is the only one for beginners.

Here are some **Data Science** courses for beginners, that would better fit your request:

**General Data Science Introductions:**

*   **DataCamp's "Introduction to Data Science":** This is a great starting point, covering foundational concepts like data analysis, visualization, and machine learning. DataCamp provides interactive coding environments, making it hands-on.
*   **Coursera's "What is Data Science?" (IBM):** A high-level overview of the field, its applications, and the roles within data science. Good for understanding the landscape.
*   **edX's "Data Science Basics":** A series of courses that introduce fundamental data science concepts, tools, and techniques.

**Python Focused (Common for Data Science):**

*   **DataCamp's "Introduction to Python for Data Science

Q&A flow

In [None]:
# Patterns for a quoted course name, tried in order. The single-quote pattern
# requires a non-word character (or string edge) outside each quote, so
# apostrophes inside words ("What's", "Beginner's") are not taken as quotes.
_QUOTED_NAME_PATTERNS = (
    re.compile(r'"(.+?)"'),
    re.compile(r'“(.+?)”'),
    re.compile(r"(?<!\w)['‘](.+?)['’](?!\w)"),
)

# Function to extract the course name from the query
def extract_course_name_from_query(query):
    """Return the quoted course name found in ``query``, or None.

    Straight double quotes, curly double quotes and single quotes (straight or
    curly) are recognised, in that order of preference.

    Args:
        query: The user's question, expected to contain the course title in quotes.

    Returns:
        The course name with surrounding whitespace stripped, or None when no
        non-empty quoted text is present.
    """
    for pattern in _QUOTED_NAME_PATTERNS:
        match = pattern.search(query)
        if match:
            name = match.group(1).strip()
            if name:
                return name
    return None

In [None]:
def answer_course_question(query):
    """Answer a question about one specific course named in quotes in ``query``.

    The quoted course name is extracted, the "courses" collection is searched
    for the closest point whose title matches that text, and Gemini is asked
    to answer the query from that course's details.

    Args:
        query: The user's question containing the course title in quotes.

    Returns:
        The generated answer, or an explanatory message when the course name
        cannot be extracted, the search fails, or no course matches.
    """
    # Identify the course name from the query
    course_name = extract_course_name_from_query(query)
    if not course_name:
        return "Sorry, I couldn't identify the course name from your query."

    # Embed the course name and restrict candidates to titles matching it
    query_embedding = embedding_model.encode(course_name)
    title_filter = Filter(
        must=[FieldCondition(key="title", match=MatchText(text=course_name))]
    )

    # query_points replaces the deprecated client.search; the hits are on `.points`
    try:
        hits = client.query_points(
            collection_name="courses",
            query=query_embedding.tolist(),
            query_filter=title_filter,
            limit=1,              # only the best matching course
            with_payload=True,
            with_vectors=False,
        ).points
    except Exception as e:
        return f"An error occurred during the search: {str(e)}"

    # Check if a course was found
    if not hits:
        return "Sorry, I couldn't find any course matching your query."

    payload = hits[0].payload

    # The query is passed separately below, so it is not repeated in the context
    context = "\n".join([
        "",
        "Course Details:",
        f"Title: {payload['title']}",
        f"Description: {payload['description']}",
        f"Level: {payload['level']}",
        f"Subject: {payload['subject']}",
        f"URL: {payload['url']}",
        "",
        "Generate a concise and helpful response based on the above information.",
    ])
    return gemini_llm.invoke({"query": f"User Query: {query}", "context": context})

In [None]:
# Ask a question about one specific course (its title is given in single quotes)
user_query = "What is the description of the course 'Ultimate Investment Banking Course'?"
response = answer_course_question(user_query)
print("Generated Response:", response, sep="\n")

Query Filter Successfully Constructed:
should=None min_should=None must=[FieldCondition(key='title', match=MatchText(text='Ultimate Investment Banking Course'), range=None, geo_bounding_box=None, geo_radius=None, geo_polygon=None, values_count=None)] must_not=None
Search Result:
[ScoredPoint(id=1070968, version=0, score=0.45796096324920654, payload={'title': 'Ultimate Investment Banking Course', 'description': 'Our dedicated approach and solid methodology will teach you\xa0the tools and skills of a good investment banker.\nThis course is also for anyone who wants to work in\xa0Investment Banking, and wants to learn about the necessary skills required to get into this sector.\nNo prior technical knowledge is required.\nWe have\xa0DIRECT EXAMPLES \xa0for all of the concepts that we will be introducing, so you can practice directly as you go forward into the course.\nBanking Organization: Understand how a bank really works, the main actors and how they interact with each others.\nIntroduc

Career coaching flow

In [None]:
def career_coaching_response(query):
    """
    Generate career coaching advice for the course named in the user's query.

    The course name is pulled from the query with
    ``extract_course_name_from_query``, embedded with ``embedding_model`` and
    looked up in the "courses" Qdrant collection (vector search restricted by
    a text match on the ``title`` payload field, top hit only). The hit's
    title, subject and description are placed in a prompt sent to
    ``gemini_llm``.

    Args:
        query: The user's question as free text.

    Returns:
        Whatever ``gemini_llm.invoke`` returns for the coaching prompt, or a
        fixed apology string when no course name can be extracted or when the
        search yields no hit.
    """
    # Extract the course name from the query
    course_name = extract_course_name_from_query(query)
    if not course_name:
        return "Sorry, I couldn't identify the course name from your query."

    # Generate an embedding for the course name
    query_embedding = embedding_model.encode(course_name)

    # Search Qdrant for the course
    search_result = client.search(
        collection_name="courses",
        query_vector=query_embedding,
        query_filter=Filter(
            must=[
                FieldCondition(
                    key="title",
                    match=MatchText(text=course_name)
                )
            ]
        ),
        limit=1,
        with_payload=True
    )

    # Check if the course exists
    if not search_result:
        return "Sorry, I couldn't find any course matching your query."

    # Extract course details. A stored point may carry no payload or lack one
    # of these fields; fall back to placeholders instead of raising so the
    # user still gets an answer built from whatever is available.
    course = search_result[0]
    payload = course.payload or {}
    course_title = payload.get("title", course_name)
    course_subject = payload.get("subject", "N/A")
    course_description = payload.get("description", "N/A")

    # Generate career coaching advice using the LLM
    prompt = f"""
    User Query: {query}

    Course Details:
    Title: {course_title}
    Subject: {course_subject}
    Description: {course_description}

    Generate career coaching advice for someone who has completed this course. Include:
    - Potential job roles
    - Industries where this knowledge is applicable
    - Next steps for career advancement
    - Skills gained from the course
    """
    response = gemini_llm.invoke({"query": query, "context": prompt})
    return response

In [None]:
# Demo: ask a career-path question about a specific course and show the
# coaching advice under a header line.
user_query = "What career paths can I pursue after completing the 'Ultimate Investment Banking Course'?"
response = career_coaching_response(user_query)
print("Generated Response:", response, sep="\n")

Generated Response:
Okay, here's career coaching advice tailored for someone who has completed the "Ultimate Investment Banking Course," based on the provided course details:

**Congratulations on completing the "Ultimate Investment Banking Course!"** This course provides a strong foundation, and you're now well-equipped to explore various exciting career paths. Let's map out some options:

**I. Potential Job Roles:**

This course focuses on the technical skills needed for front office and trading roles. As such, these jobs are all reasonably possible to obtain post-course:

*   **Investment Banking Analyst/Associate:** (Requires Networking and Additional Courses) Support senior bankers in deal execution, financial modeling, valuation, and due diligence.  *Entry-level roles in investment banking are highly competitive, but this course gives you a technical advantage.*
*   **Sales & Trading Analyst/Associate:** (Requires Networking)  Work on a trading floor, either selling financial pro

Define the AI Agent Framework

Intent classification agent

In [None]:
# Keyword phrases that signal each supported intent. A phrase counts as a hit
# when it occurs anywhere in the lower-cased query (substring match).
INTENTS = {
    "qa": ["what is", "describe", "explain", "tell me about"],
    "career_coaching": ["career", "job", "path", "opportunity", "after completing"]
}

def classify_intent(query):
    """
    Classify the intent of the user query based on predefined keywords.

    Every intent in ``INTENTS`` is scored by how many of its keyword phrases
    occur in the lower-cased query, and the highest-scoring intent wins. This
    keeps a career question that happens to start with "what is" from being
    routed to "qa" just because "qa" is declared first. On a tie the intent
    declared earlier in ``INTENTS`` is kept.

    Args:
        query: The user's question. Anything that is not a non-blank string
            is treated as unclassifiable.

    Returns:
        The name of the best-matching intent, or "unknown" when no keyword
        matches.
    """
    if not isinstance(query, str) or not query.strip():
        return "unknown"

    query_lower = query.lower()
    best_intent, best_hits = "unknown", 0  # Default intent if no match is found
    for intent, keywords in INTENTS.items():
        hits = sum(keyword in query_lower for keyword in keywords)
        # Strict comparison: an equal score never displaces an earlier intent.
        if hits > best_hits:
            best_intent, best_hits = intent, hits
    return best_intent

In [None]:
# Example usage
user_query = "What career paths can I pursue after completing the 'Ultimate Investment Banking Course'?"
intent = classify_intent(user_query)
print(f"Identified Intent: {intent}")

Identified Intent: career_coaching


Define Context Handling Logic

In [None]:
# Initialize a context dictionary to store conversation state
conversation_context = {
    "last_course_name": None,  # Track the last course name mentioned by the user
    "last_intent": None       # Track the last intent identified
}

def handle_context(query, intent):
    """
    Update and manage the conversation context based on the user query and intent.

    For the recognized intents ("qa" and "career_coaching") a course name
    found in the query is remembered in ``conversation_context``; when the
    query names no course, the remembered one (if any) is used instead. The
    recognized intent is stored as ``last_intent`` on every such turn,
    including follow-ups that rely on the remembered course. Any other intent
    leaves the stored context untouched.

    Args:
        query: The user's question as free text.
        intent: The intent label produced for this query.

    Returns:
        The course name extracted from the query, the remembered course name
        when the query has none, or the extractor's (falsy) result when
        neither is available.
    """
    # Extract course name from the query
    course_name = extract_course_name_from_query(query)

    # Unrecognized intents must not alter the stored conversation state.
    if intent not in ("qa", "career_coaching"):
        return course_name

    if course_name:
        conversation_context["last_course_name"] = course_name
    elif conversation_context["last_course_name"]:
        # Use the last mentioned course name if none is found in the query
        course_name = conversation_context["last_course_name"]

    # Record the intent for every recognized turn so that "last_intent" really
    # reflects the most recent one, not just turns that named a course.
    conversation_context["last_intent"] = intent

    return course_name

In [None]:
# Demo: classify a sample question, resolve its course name through the
# conversation context, and report both results.
user_query = "What is the description of the course 'Ultimate Investment Banking Course'?"
intent = classify_intent(user_query)
course_name = handle_context(user_query, intent)
print(
    f"Identified Intent: {intent}",
    f"Course Name from Context: {course_name}",
    sep="\n",
)

Identified Intent: qa
Course Name from Context: Ultimate Investment Banking Course


Integrate Everything into the Main Conversational Flow

In [None]:
def handle_conversation(query):
    """
    Handle the user's query by classifying intent, managing context, and routing to the appropriate flow.

    The course name resolved by ``handle_context`` may come from an earlier
    turn (e.g. a follow-up that only says "this course"). The downstream
    flows receive nothing but the query text, so when the resolved name does
    not appear in the query it is prepended in single quotes before routing;
    otherwise the remembered course would be lost.

    Args:
        query: The user's question as free text.

    Returns:
        The response of the Q&A flow or the career coaching flow, or a fixed
        clarification request when the intent is not recognized.
    """
    # Classify the intent of the query
    intent = classify_intent(query)
    print(f"Identified Intent: {intent}")

    # Handle context and extract course name if needed
    course_name = handle_context(query, intent)
    print(f"Resolved Course Name: {course_name}")

    # Carry a context-resolved course name into the text handed to the flows.
    # It is placed first so it is the first quoted span in the text.
    # NOTE(review): assumes extract_course_name_from_query picks up a
    # single-quoted course name, as the quoted examples in this notebook
    # suggest - confirm against its implementation.
    routed_query = query
    if course_name and course_name not in query:
        routed_query = f"[Course: '{course_name}'] {query}"

    # Route the query based on intent
    if intent == "qa":
        # Route to Q&A flow
        response = answer_course_question(routed_query)
    elif intent == "career_coaching":
        # Route to Career Coaching flow
        response = career_coaching_response(routed_query)
    else:
        # Default response for unknown intents
        response = "I'm sorry, I didn't understand your query. Could you please clarify?"

    return response

In [None]:
# Example Usage: a two-turn conversation. The first turn names the course
# explicitly; the second only refers to "this course".
user_query_1 = "What is the description of the 'Ultimate Investment Banking Course'?"
response_1 = handle_conversation(user_query_1)
print("Generated Response 1:", response_1, sep="\n")

user_query_2 = "What career paths can I pursue after completing this course?"
response_2 = handle_conversation(user_query_2)
print("Generated Response 2:", response_2, sep="\n")

Identified Intent: qa
Resolved Course Name: Ultimate Investment Banking Course
Query Filter Successfully Constructed:
should=None min_should=None must=[FieldCondition(key='title', match=MatchText(text='Ultimate Investment Banking Course'), range=None, geo_bounding_box=None, geo_radius=None, geo_polygon=None, values_count=None)] must_not=None
Search Result:
[ScoredPoint(id=1070968, version=0, score=0.45796096324920654, payload={'title': 'Ultimate Investment Banking Course', 'description': 'Our dedicated approach and solid methodology will teach you\xa0the tools and skills of a good investment banker.\nThis course is also for anyone who wants to work in\xa0Investment Banking, and wants to learn about the necessary skills required to get into this sector.\nNo prior technical knowledge is required.\nWe have\xa0DIRECT EXAMPLES \xa0for all of the concepts that we will be introducing, so you can practice directly as you go forward into the course.\nBanking Organization: Understand how a bank 

# Monitoring and Optimization

Set Up AgentOps for Monitoring

In [None]:
import agentops
# Exploratory check: list the names the installed agentops package exposes,
# to see which entry points (e.g. init, start_session, end_session) exist.
print(dir(agentops))

['ActionEvent', 'Any', 'Client', 'Dict', 'ErrorEvent', 'List', 'Optional', 'ToolEvent', 'Union', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '_client', 'client', 'config', 'configure', 'end_session', 'exceptions', 'get_client', 'helpers', 'init', 'instrumentation', 'legacy', 'logging', 'sdk', 'semconv', 'start_session']


In [None]:
# Initialize AgentOps. The API key is read from the AGENTOPS_API_KEY
# environment variable, with an interactive prompt as a fallback, so that no
# credential is stored in the notebook.
# SECURITY: a literal key was previously committed on this line; treat it as
# leaked and revoke/rotate it.
import getpass
import os

agentops.init(
    api_key=os.environ.get("AGENTOPS_API_KEY") or getpass.getpass("AgentOps API key: ")
)

# Define a function to log events and performance metrics
def log_event(event_name, metadata=None):
    """
    Logs an event to AgentOps for monitoring (best effort, never raises).

    The installed agentops package has no ``agent`` attribute (it is absent
    from the ``dir(agentops)`` listing in this notebook), so the previous
    ``agentops.agent.log`` call failed on every invocation. The event API is
    now resolved defensively: when the package exposes ``record`` and
    ``ActionEvent`` the event is recorded through them; otherwise a message
    stating that the event was not sent is printed.

    Args:
        event_name: Short name of the event.
        metadata: Optional extra data to attach to the event.

    Returns:
        None.
    """
    try:
        record = getattr(agentops, "record", None)
        event_cls = getattr(agentops, "ActionEvent", None)
        if callable(record) and callable(event_cls):
            # NOTE(review): assumes the legacy event signature
            # ActionEvent(action_type=..., params=...) - confirm against the
            # installed agentops version.
            record(event_cls(action_type=event_name, params=metadata))
            return
        print(f"AgentOps event API unavailable; event not sent: {event_name} {metadata}")
    except Exception as e:
        # Monitoring must never break the notebook flow; report and continue.
        print(f"Failed to log event: {str(e)}")

# Example Usage
log_event("System Initialized", {"status": "success"})

Failed to log event: module 'agentops' has no attribute 'agent'
