In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import concurrent.futures
from tqdm import tqdm
import time
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import random
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import spacy
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, Distance, VectorParams
import google.generativeai as genai
from sklearn.metrics.pairwise import cosine_similarity
from langchain.schema.runnable import Runnable
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from qdrant_client.http.models import Filter, FieldCondition, MatchText
import agentops
from agentops import ActionEvent
import firebase_admin
from firebase_admin import credentials, auth

#5.1 Data Preparation & Vector Database Integration

Data Cleaning and Normalization:

In [11]:
# Load dataset
df = pd.read_csv("/content/drive/MyDrive/udemy_course_data.csv")

In [12]:
# Drop irrelevant columns
columns_to_drop = [
    'published_timestamp', 'published_date',
    'published_time', 'year', 'month', 'day', 'profit'
]

df = df.drop(columns=columns_to_drop)

In [13]:
# function to categories course status
def categorize_course_status(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        html = response.text

        # Non-English check
        if '<html lang="en"' not in html and 'lang="en"' not in html:
            return "Non-English"

        # Error page
        error_tag = soup.find('h1', string=lambda x: x and "we can’t find the page you’re looking for" in x.lower())
        if error_tag:
            return "Error Page"

        # Course unavailable (either case)
        title_check = soup.find('div', {'data-purpose': 'safely-set-inner-html:limited-access-container:title'})
        subtitle_check = soup.find('div', {'data-purpose': 'safely-set-inner-html:limited-access-controller:subtitle'})
        if (title_check and "no longer accepting enrollments" in title_check.get_text(strip=True).lower()) or \
           (subtitle_check and "no longer accepting enrollments" in subtitle_check.get_text(strip=True).lower()):
            return "Course Unavailable"

        # Private course
        private_tag = soup.find('div', string=lambda x: x and "this is a private course." in x.lower())
        if private_tag:
            return "Private Course"

        # Standard description
        description_container = soup.find('div', {'data-purpose': 'safely-set-inner-html:description:description'})
        if description_container and description_container.find_all('p'):
            return "Valid"

        # Alternate known container
        alt_container = soup.find('div', {'class': 'ud-component--clp--description'})
        if alt_container and alt_container.find_all('p'):
            return "Alternate Description Location"

        # Nothing found
        return "No Description Found"

    except requests.exceptions.HTTPError as err:
        return f"Error {err.response.status_code}"
    except Exception:
        return "Failed"


In [14]:
urls = df['url'].tolist()

# Apply the categorization function in parallel
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    status_list = list(tqdm(executor.map(categorize_course_status, urls), total=len(urls)))

# Add status column
df['status'] = status_list

# Show summary
summary = df['status'].value_counts()
print("\n Summary of URL Statuses:\n")
print(summary)

100%|██████████| 3683/3683 [08:20<00:00,  7.37it/s]


 Summary of URL Statuses:

status
Valid                   1961
Course Unavailable       845
No Description Found     632
Non-English              193
Error 404                 45
Failed                     7
Name: count, dtype: int64





In [15]:
# Filter only rows with status == "Valid"
df = df[df['status'] == "Valid"].copy()

# Drop duplicate rows based on the 'url' column
df.drop_duplicates(subset='url', keep='first', inplace=True)

# Reset index for cleanliness
df.reset_index(drop=True, inplace=True)

In [16]:
# Function to extract course description from URL
def extract_course_description(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Standard description container
        description_container = soup.find('div', {'data-purpose': 'safely-set-inner-html:description:description'})
        if description_container and description_container.find_all('p'):
            # Join all paragraph texts into a single string
            return "\n".join([p.get_text(strip=True) for p in description_container.find_all('p')])

        # Alternate known container
        alt_container = soup.find('div', {'class': 'ud-component--clp--description'})
        if alt_container and alt_container.find_all('p'):
            # Join all paragraph texts into a single string
            return "\n".join([p.get_text(strip=True) for p in alt_container.find_all('p')])

        # If no description is found
        return "No Description Found"

    except requests.exceptions.HTTPError as err:
        return f"Error {err.response.status_code}"
    except Exception:
        return "Failed"

In [17]:
# Apply the description extraction function in parallel
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    description_list = list(tqdm(executor.map(extract_course_description, df['url']), total=len(df)))

# Add descriptions to a new column
df['descriptions'] = description_list

print("\n Dataset with descriptions added:")
print(df[['url', 'descriptions']].head())

100%|██████████| 1957/1957 [05:13<00:00,  6.24it/s]


 Dataset with descriptions added:
                                                 url  \
0  https://www.udemy.com/ultimate-investment-bank...   
1      https://www.udemy.com/goods-and-services-tax/   
2  https://www.udemy.com/financial-modeling-for-b...   
3  https://www.udemy.com/complete-excel-finance-c...   
4  https://www.udemy.com/how-to-maximize-your-pro...   

                                        descriptions  
0  Our dedicated approach and solid methodology w...  
1  WHAT IS GST ?\nGST stands for “Goods and Servi...  
2  What is the aim of this course?\nAs a business...  
3  Why this course is for you\nYou want to increa...  
4  *****Join Over 1,680 Students Currently Enroll...  





In [18]:
# Filter out rows where description extraction failed
df = df[df['descriptions'] != "Failed"].copy()

# Drop rows with duplicate descriptions
df.drop_duplicates(subset='descriptions', keep='first', inplace=True)

# Reset index for cleanliness after filtering and dropping duplicates
df.reset_index(drop=True, inplace=True)

In [19]:
df['descriptions'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1949 entries, 0 to 1948
Series name: descriptions
Non-Null Count  Dtype 
--------------  ----- 
1949 non-null   object
dtypes: object(1)
memory usage: 15.4+ KB


In [20]:
# Drop the 'status' column as it's no longer needed
df.drop(columns=['status'], inplace=True)

In [21]:
df.head(2)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,subject,descriptions
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5 hours,Business Finance,Our dedicated approach and solid methodology w...
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39 hours,Business Finance,WHAT IS GST ?\nGST stands for “Goods and Servi...


In [22]:
# Save the final cleaned dataset
output_file_path = "/content/drive/MyDrive/cleaned_udemy_course_data.csv"
df.to_csv(output_file_path, index=False)

print(f"\n Final cleaned dataset saved successfully to: {output_file_path}")


 Final cleaned dataset saved successfully to: /content/drive/MyDrive/cleaned_udemy_course_data.csv


Text Normalization:

In [23]:
# Function to preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters, numbers, and extra spaces
    text = re.sub(r'[^a-z\s]', '', text)  # Keep only letters and spaces
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

    # Remove stopwords (optional)
    stop_words = set(ENGLISH_STOP_WORDS)
    words = text.split()
    words = [word for word in words if word not in stop_words]

    # Join the cleaned words back into a single string
    return " ".join(words)

In [24]:
# Convert the column to string type
df['descriptions'] = df['descriptions'].astype(str)

In [25]:
# Apply preprocessing to the 'course_description' column
df['cleaned_descriptions'] = df['descriptions'].apply(preprocess_text)

# Display the first few rows with cleaned descriptions
print(df[['cleaned_descriptions']].head())

                                cleaned_descriptions
0  dedicated approach solid methodology teach too...
1  gst gst stands goods services tax proposed com...
2  aim course business analyst consultant time ti...
3  course want increase excel skills efficient wo...
4  join students currently enrolled course course...


Tokenization and Lemmatization:

In [26]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [27]:
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to tokenize and lemmatize text
def tokenize_and_lemmatize(text):
    tokens = word_tokenize(text)  # Tokenize text into words
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]  # Lemmatize each token
    return " ".join(lemmatized_tokens)

In [28]:
# Apply tokenization and lemmatization to the 'cleaned_description' column
df['lemmatized_descriptions'] = df['cleaned_descriptions'].apply(tokenize_and_lemmatize)

# Display the first few rows
print(df[['descriptions', 'cleaned_descriptions', 'lemmatized_descriptions']].head())

                                        descriptions  \
0  Our dedicated approach and solid methodology w...   
1  WHAT IS GST ?\nGST stands for “Goods and Servi...   
2  What is the aim of this course?\nAs a business...   
3  Why this course is for you\nYou want to increa...   
4  *****Join Over 1,680 Students Currently Enroll...   

                                cleaned_descriptions  \
0  dedicated approach solid methodology teach too...   
1  gst gst stands goods services tax proposed com...   
2  aim course business analyst consultant time ti...   
3  course want increase excel skills efficient wo...   
4  join students currently enrolled course course...   

                             lemmatized_descriptions  
0  dedicated approach solid methodology teach too...  
1  gst gst stand good service tax proposed compre...  
2  aim course business analyst consultant time ti...  
3  course want increase excel skill efficient wor...  
4  join student currently enrolled course course ..

#Data Enrichment:

Semantic Tagging:

In [29]:
# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Function to extract semantic tags
def extract_semantic_tags(text):
    doc = nlp(text)
    topics = [ent.label_ for ent in doc.ents]  # Extract named entities as semantic tags
    return ", ".join(topics)

In [30]:
# Apply semantic tagging to the 'lemmatized_descriptions' column
df['semantic_tags'] = df['lemmatized_descriptions'].apply(extract_semantic_tags)

# Display the first few rows
print(df[['descriptions', 'lemmatized_descriptions', 'semantic_tags']].head())

                                        descriptions  \
0  Our dedicated approach and solid methodology w...   
1  WHAT IS GST ?\nGST stands for “Goods and Servi...   
2  What is the aim of this course?\nAs a business...   
3  Why this course is for you\nYou want to increa...   
4  *****Join Over 1,680 Students Currently Enroll...   

                             lemmatized_descriptions  \
0  dedicated approach solid methodology teach too...   
1  gst gst stand good service tax proposed compre...   
2  aim course business analyst consultant time ti...   
3  course want increase excel skill efficient wor...   
4  join student currently enrolled course course ...   

                                       semantic_tags  
0                                               NORP  
1                                                     
2  CARDINAL, PERSON, PERSON, PERSON, PERSON, ORG,...  
3                                               DATE  
4                                                  

#Vectorization and Embedding:

In [31]:
# Load a pre-trained BERT-based model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for the 'lemmatized_descriptions' column
df['embeddings'] = df['lemmatized_descriptions'].apply(lambda x: model.encode(x))

# Display the first few rows
print(df[['descriptions', 'lemmatized_descriptions', 'embeddings']].head())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

                                        descriptions  \
0  Our dedicated approach and solid methodology w...   
1  WHAT IS GST ?\nGST stands for “Goods and Servi...   
2  What is the aim of this course?\nAs a business...   
3  Why this course is for you\nYou want to increa...   
4  *****Join Over 1,680 Students Currently Enroll...   

                             lemmatized_descriptions  \
0  dedicated approach solid methodology teach too...   
1  gst gst stand good service tax proposed compre...   
2  aim course business analyst consultant time ti...   
3  course want increase excel skill efficient wor...   
4  join student currently enrolled course course ...   

                                          embeddings  
0  [0.041338906, -0.04804816, -0.0656084, 0.00032...  
1  [-0.064397946, 0.004384262, 0.071958706, -0.08...  
2  [0.05510653, -0.03882285, -0.08019859, 0.00426...  
3  [-0.043257803, 0.040885605, -0.051284883, -0.0...  
4  [0.0033338252, -0.05246246, -0.043229945, 0.01..

#Indexing for Retrieval:

In [32]:
# Convert embeddings to a NumPy array
embeddings_matrix = np.vstack(df['embeddings'].values)

# Build a FAISS index
dimension = embeddings_matrix.shape[1]  # Dimensionality of the embeddings
index = faiss.IndexFlatL2(dimension)    # Use L2 distance for similarity search
index.add(embeddings_matrix)            # Add embeddings to the index

# Save the index
faiss.write_index(index, "course_embeddings.index")

Extract Metadata

In [33]:
# Inspect the columns of df_subset
print("Columns in df_subset:", df.columns)

Columns in df_subset: Index(['course_id', 'course_title', 'url', 'is_paid', 'price',
       'num_subscribers', 'num_reviews', 'num_lectures', 'level',
       'content_duration', 'subject', 'descriptions', 'cleaned_descriptions',
       'lemmatized_descriptions', 'semantic_tags', 'embeddings'],
      dtype='object')


In [34]:
# Select relevant metadata columns
metadata_df = df[['course_id', 'course_title', 'descriptions', 'level', 'subject', 'url', 'is_paid']]

# Display the extracted metadata
metadata_df.head()
# Save metadata_df to a CSV file
metadata_df.to_csv("course_metadata.csv", index=False)

print("Metadata saved to 'course_metadata.csv'")

Metadata saved to 'course_metadata.csv'


In [35]:
metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1949 entries, 0 to 1948
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   course_id     1949 non-null   int64 
 1   course_title  1949 non-null   object
 2   descriptions  1949 non-null   object
 3   level         1949 non-null   object
 4   subject       1949 non-null   object
 5   url           1949 non-null   object
 6   is_paid       1949 non-null   bool  
dtypes: bool(1), int64(1), object(5)
memory usage: 93.4+ KB


#Vector Database Integration:

Embedding Storage:

In [36]:
# Load the FAISS index
index_path = "course_embeddings.index"
index = faiss.read_index(index_path)

# Get the dimension of the embeddings
dimension = index.d
print(f"Dimension of the embeddings: {dimension}")

Dimension of the embeddings: 384


In [37]:
# Reset the indices of metadata_df to ensure alignment with embeddings
metadata_df = metadata_df.reset_index(drop=True)

# Verify the new indices
print("New indices of metadata_df:")
print(metadata_df.index)

New indices of metadata_df:
RangeIndex(start=0, stop=1949, step=1)


In [38]:
# Convert course_id to integers
metadata_df['course_id'] = metadata_df['course_id'].apply(int)

# Verify the conversion
print("Updated course_id types:")
print(metadata_df['course_id'].apply(type).unique())

Updated course_id types:
[<class 'int'>]


In [39]:
# Initialize Qdrant client (in-memory mode)
client = QdrantClient(":memory:")

# Drop the existing collection if it exists
try:
    client.delete_collection(collection_name="courses")
except Exception:
    pass  # Ignore if the collection doesn't exist

# Recreate the collection with support for integer IDs
dimension = embeddings_matrix.shape[1]
collection_name = "courses"
client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=dimension, distance=Distance.COSINE),
    optimizers_config=None,
    shard_number=1,
    on_disk_payload=False
)

print("Collection recreated with support for integer IDs.")

Collection recreated with support for integer IDs.


In [40]:
# Upload points to Qdrant using integer IDs
for i, row in metadata_df.iterrows():
    vector = embeddings_matrix[i]  # Use the embeddings matrix

    # Use the integer course_id as the ID
    point_id = int(row['course_id'])  # Ensure the ID is an integer

    # Create a PointStruct
    point = PointStruct(
        id=point_id,  # Use the integer ID
        vector=vector,
        payload={
            "title": row['course_title'],
            "description": row['descriptions'],
            "level": row['level'],
            "subject": row['subject'],
            "url": row['url']
        }
    )

    # Add the point to Qdrant
    client.upsert(collection_name="courses", points=[point])

print("Embeddings and metadata successfully uploaded to Qdrant!")

Embeddings and metadata successfully uploaded to Qdrant!


In [41]:
# Retrieve a sample point from Qdrant
sample_id = int(metadata_df.iloc[0]['course_id'])  # Get the ID of the first course
point = client.retrieve(
    collection_name="courses",
    ids=[sample_id]  # Query by the integer ID
)

# Print the retrieved point
print("Sample Point from Qdrant:")
print(point)

Sample Point from Qdrant:
[Record(id=1070968, payload={'title': 'Ultimate Investment Banking Course', 'description': 'Our dedicated approach and solid methodology will teach you\xa0the tools and skills of a good investment banker.\nThis course is also for anyone who wants to work in\xa0Investment Banking, and wants to learn about the necessary skills required to get into this sector.\nNo prior technical knowledge is required.\nWe have\xa0DIRECT EXAMPLES \xa0for all of the concepts that we will be introducing, so you can practice directly as you go forward into the course.\nBanking Organization: Understand how a bank really works, the main actors and how they interact with each others.\nIntroduction to the Financial Markets: Introducing the different markets where the assets classes are exchanged. We will cover Equities, Bonds, Commodities and Forex.\nInterest Rates: Learn about the most common interest rates, from simple to periodic compound rates, those will not have any secrets for y

In [42]:
# Load the embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to retrieve relevant courses from Qdrant
def retrieve_courses(query, top_k=3):
    # Generate an embedding for the query
    query_embedding = embedding_model.encode(query)

    # Search Qdrant for similar courses
    search_result = client.search(
        collection_name="courses",
        query_vector=query_embedding,
        limit=top_k  # Retrieve the top-k most similar courses
    )
    # Extract course titles and descriptions from the search result
    retrieved_courses = [
        {
            "id": hit.id,
            "title": hit.payload["title"],
            "description": hit.payload["description"],
            "level": hit.payload["level"],
            "subject": hit.payload["subject"],
            "url": hit.payload["url"]
        }
        for hit in search_result
    ]
    return retrieved_courses

In [43]:
# Test the retrieval function
user_query = "What are some beginner courses on finance?"
relevant_courses = retrieve_courses(user_query)

# Print the retrieved courses
print("Retrieved Courses:")
for course in relevant_courses:
    print(f"ID: {course['id']}, Title: {course['title']}, Description: {course['description']}")

Retrieved Courses:
ID: 806122, Title: Learn and Master the Basics of Finance, Description: This course in Finance is meant for beginners and intermediate level students of finance who would like to get a good grip of the concepts of the subject and be able to comprehend the financial world.
Most of us read the news without really paying attention to the financial news. This is due to the fact that most of us don't comprehend the basics of the subject. By taking this course, one will be able to build a solid foundation and understand the complex topics by breaking it down to simple concepts.
In order to take this course, one does not need any prior knowledge. However, I would like one to keep an open and receptive mind.
In over 3 hours worth of course material , you will learn every thing from the basics of banking to complex derivative products. The course is structured in such a way that it will gradually build on complex topics yet keeping things simple.
This course is also a great h

#5.2 LLM & RAG Pipeline

In [44]:
# Configure the Gemini API key
genai.configure(api_key="AIzaSyBZ9E4RG96F90hTvjZbD0hdc9E7Sm_nOk0")
model = genai.GenerativeModel('gemini-2.0-flash')

# Function to generate a response using the RAG pipeline
def rag_response(user_query):
    # Retrieve relevant courses from Qdrant
    relevant_courses = retrieve_courses(user_query, top_k=3)

    # Format the retrieved courses into a context string
    context = "\n".join(
        [
            f"Course Title: {course['title']}\nDescription: {course['description']}\nLevel: {course['level']}\nSubject: {course['subject']}\nURL: {course['url']}"
            for course in relevant_courses
        ]
    )

    # Create a prompt for the LLM
    prompt = f"""
    User Query: {user_query}

    Relevant Courses:
    {context}

    Generate a concise and helpful response based on the above information.
    """

    # Generate a response using the LLM
    response = model.generate_content(prompt)

    return response.text

In [45]:
# Test the RAG pipeline
user_query = "What are some beginner courses on finance?"
response = rag_response(user_query)

# Print the generated response
print("Generated Response:")
print(response)

Generated Response:
Here are three beginner-level finance courses to consider:

*   **Learn and Master the Basics of Finance:** (Udemy) A comprehensive course (3+ hours) designed for beginners with no prior knowledge. Covers a wide range of topics from basic banking to derivatives, aiming to build a solid foundation for understanding financial news and concepts. [https://www.udemy.com/finance-101/](https://www.udemy.com/finance-101/)
*   **Understand Core Finance Principles in 60 Minutes:** (Udemy) A practical and concise course covering key principles of corporate finance, including financial analysis, forecasting, and risk management. Good for a quick overview of core concepts. [https://www.udemy.com/core-finance-principles-in-60-minutes/](https://www.udemy.com/core-finance-principles-in-60-minutes/)
*   **Surviving Introduction to Finance:** (Udemy) Designed as a supplement to a formal Introduction to Finance course. Includes video presentations, PowerPoints, Excel templates, and qu

In [46]:
# Function to compare embeddings
def inspect_embeddings(embeddings_matrix):
    # Compare the first embedding with a few others
    sample_embedding = embeddings_matrix[0].reshape(1, -1)
    similarities = cosine_similarity(sample_embedding, embeddings_matrix[:5])

    print("Cosine Similarities with the First Embedding:")
    for i, sim in enumerate(similarities[0]):
        print(f"Course {i}: {sim}")

# Inspect the embeddings
inspect_embeddings(embeddings_matrix)

Cosine Similarities with the First Embedding:
Course 0: 1.000000238418579
Course 1: 0.36010074615478516
Course 2: 0.6701973676681519
Course 3: 0.48962464928627014
Course 4: 0.64753657579422


#AI Agents for Context Handling:

Course recommendation & course details retrieval

In [47]:
# Custom wrapper to make the Gemini API compatible with LangChain's Runnable interface
class GeminiRunnable(Runnable):
    def __init__(self, model):
        self.model = model  # The Gemini GenerativeModel instance

    def invoke(self, input_data, config=None, **kwargs):  # Add **kwargs to handle extra arguments
        # Extract the prompt from the input data
        prompt = input_data.get("query", "") + "\n" + input_data.get("context", "")
        # Generate a response using the Gemini API
        response = self.model.generate_content(prompt)
        return response.text

In [48]:
# Initialize the custom wrapper for the Gemini API
gemini_llm = GeminiRunnable(model=model)

In [49]:
# Fallback to generate_content for generating responses
def generate_response(query, context):
    # Combine query and context into a single prompt
    prompt = f"""
    User Query: {query}

    Relevant Courses:
    {context}

    Generate a concise and helpful response based on the above information.
    """
    # Use the Gemini API to generate a response
    response = gemini_llm.invoke({"query": query, "context": context})
    return response

In [50]:
# Example usage for course recommendation
user_query = "What are some beginner courses on data science?"
relevant_courses = retrieve_courses(user_query, top_k=3)
context = "\n".join(
    [
        f"Course Title: {course['title']}\nDescription: {course['description']}\nLevel: {course['level']}\nSubject: {course['subject']}\nURL: {course['url']}"
        for course in relevant_courses
    ]
)

# Generate the response
response = generate_response(user_query, context)

# Print the response
print("Generated Response:")
print(response)

Generated Response:
The courses you provided are not beginner courses on Data Science. They are beginner/intermediate courses on Web Development.

To recommend beginner courses on Data Science, I need to know what platform you prefer and what your current knowledge base is. However, here are some general recommendations for data science courses at the beginner level:

**General Beginner Data Science Courses (Often Assume Little to No Prior Programming):**

*   **"Data Science Specialization" by Johns Hopkins University on Coursera:** This is a very popular and comprehensive specialization that covers the fundamentals of data science using R. It includes courses on R programming, statistical inference, regression models, machine learning, and more. It is highly regarded, however, be aware that the entire specialization takes a significant time commitment.
*   **"Google Data Analytics Professional Certificate" on Coursera:** This is another highly-rated specialization that aims to equip 

Q&A flow

In [51]:
# Function to extract the course name from the query
def extract_course_name_from_query(query):
    # Use a simple regex to extract the course name enclosed in quotes
    match = re.search(r"'(.*?)'", query) # Look for text enclosed in single quotes
    if match:
        return match.group(1).strip()    # Extract and clean the course name
    return None                          # Return None if no course name is found

In [52]:
def answer_course_question(query):
    # Identify the course name or ID from the query
    course_name = extract_course_name_from_query(query)
    if not course_name:
        return "Sorry, I couldn't identify the course name from your query."

    # Generate an embedding for the query
    query_embedding = embedding_model.encode(course_name)

    # Construct the query filter using Qdrant's Filter and FieldCondition
    try:
        query_filter = Filter(
            must=[
                FieldCondition(
                    key="title",
                    match=MatchText(text=course_name)
                )
            ]
        )
        print("Query Filter Successfully Constructed:")
        print(query_filter)
    except Exception as e:
        return f"Failed to construct query filter: {str(e)}"

    # Search Qdrant for the course
    try:
        search_result = client.search(
            collection_name="courses",
            query_vector=query_embedding,  # Pass the query embedding
            query_filter=query_filter,     # Use the constructed filter
            limit=1,                       # Retrieve only one matching course
            with_payload=True,
            with_vectors=False
        )

        # Print the search result to verify its structure
        print("Search Result:")
        print(search_result)
    except Exception as e:
        return f"An error occurred during the search: {str(e)}"

    # Check if a course was found
    if not search_result:
        return "Sorry, I couldn't find any course matching your query."

    # Extract course details
    course = search_result[0]
    course_title = course.payload["title"]
    course_description = course.payload["description"]
    course_level = course.payload["level"]
    course_subject = course.payload["subject"]
    course_url = course.payload["url"]

    # Generate a response using the LLM
    prompt = f"""
    User Query: {query}

    Course Details:
    Title: {course_title}
    Description: {course_description}
    Level: {course_level}
    Subject: {course_subject}
    URL: {course_url}

    Generate a concise and helpful response based on the above information.
    """
    response = gemini_llm.invoke({"query": query, "context": prompt})
    return response

In [53]:
user_query = "What is the description of the course 'Ultimate Investment Banking Course'?"
response = answer_course_question(user_query)
print("Generated Response:")
print(response)

Query Filter Successfully Constructed:
should=None min_should=None must=[FieldCondition(key='title', match=MatchText(text='Ultimate Investment Banking Course'), range=None, geo_bounding_box=None, geo_radius=None, geo_polygon=None, values_count=None)] must_not=None
Search Result:
[ScoredPoint(id=1070968, version=0, score=0.45796072483062744, payload={'title': 'Ultimate Investment Banking Course', 'description': 'Our dedicated approach and solid methodology will teach you\xa0the tools and skills of a good investment banker.\nThis course is also for anyone who wants to work in\xa0Investment Banking, and wants to learn about the necessary skills required to get into this sector.\nNo prior technical knowledge is required.\nWe have\xa0DIRECT EXAMPLES \xa0for all of the concepts that we will be introducing, so you can practice directly as you go forward into the course.\nBanking Organization: Understand how a bank really works, the main actors and how they interact with each others.\nIntroduc

Career coaching flow

In [54]:
def career_coaching_response(query):
    # Extract the course name from the query
    course_name = extract_course_name_from_query(query)
    if not course_name:
        return "Sorry, I couldn't identify the course name from your query."

    # Generate an embedding for the course name
    query_embedding = embedding_model.encode(course_name)

    # Search Qdrant for the course
    search_result = client.search(
        collection_name="courses",
        query_vector=query_embedding,
        query_filter=Filter(
            must=[
                FieldCondition(
                    key="title",
                    match=MatchText(text=course_name)
                )
            ]
        ),
        limit=1,
        with_payload=True
    )

    # Check if the course exists
    if not search_result:
        return "Sorry, I couldn't find any course matching your query."

    # Extract course details
    course = search_result[0]
    course_title = course.payload["title"]
    course_subject = course.payload["subject"]
    course_description = course.payload["description"]

    # Generate career coaching advice using the LLM
    prompt = f"""
    User Query: {query}

    Course Details:
    Title: {course_title}
    Subject: {course_subject}
    Description: {course_description}

    Generate career coaching advice for someone who has completed this course. Include:
    - Potential job roles
    - Industries where this knowledge is applicable
    - Next steps for career advancement
    - Skills gained from the course
    """
    response = gemini_llm.invoke({"query": query, "context": prompt})
    return response

In [55]:
user_query = "What career paths can I pursue after completing the 'Ultimate Investment Banking Course'?"
response = career_coaching_response(user_query)
print("Generated Response:")
print(response)

Generated Response:
Okay, let's break down potential career paths and advice for someone who's completed the "Ultimate Investment Banking Course" based on the provided description.

**Career Coaching Advice: After Completing the "Ultimate Investment Banking Course"**

**Overall Impression:**

This course seems like a broad introduction to investment banking concepts and financial markets, rather than a deep dive into a specific area. It's designed to give a solid foundation and overview, making it suitable for career switchers, students, and those looking to move into more specialized roles within finance.  The emphasis on practical examples is a strong point.

**I. Potential Job Roles:**

Given the course content, here's a range of job roles the course graduate might be suitable for, categorized by level of direct applicability:

*   **Entry-Level/Junior Roles (Directly Applicable):**

    *   **Investment Banking Analyst/Associate (at smaller firms or in less specialized areas):** Th

Define the AI Agent Framework

Intent classification agent

In [56]:
# List of possible intents
INTENTS = {
    "qa": ["what is", "describe", "explain", "tell me about"],
    "career_coaching": ["career", "job", "path", "opportunity", "after completing"]
}

def classify_intent(query):
    """
    Classify the intent of the user query based on predefined keywords.
    """
    query_lower = query.lower()
    for intent, keywords in INTENTS.items():
        if any(keyword in query_lower for keyword in keywords):
            return intent
    return "unknown"  # Default intent if no match is found

In [57]:
# Example usage
user_query = "What career paths can I pursue after completing the 'Ultimate Investment Banking Course'?"
intent = classify_intent(user_query)
print(f"Identified Intent: {intent}")

Identified Intent: career_coaching


Define Context Handling Logic

In [58]:
# Initialize a context dictionary to store conversation state
conversation_context = {
    "last_course_name": None,  # Track the last course name mentioned by the user
    "last_intent": None       # Track the last intent identified
}

def handle_context(query, intent):
    """
    Update and manage the conversation context based on the user query and intent.
    """
    global conversation_context

    # Extract course name from the query
    course_name = extract_course_name_from_query(query)

    # Update context based on intent
    if intent == "qa":
        if course_name:
            conversation_context["last_course_name"] = course_name
            conversation_context["last_intent"] = "qa"
        elif conversation_context["last_course_name"]:
            # Use the last mentioned course name if none is found in the query
            course_name = conversation_context["last_course_name"]

    elif intent == "career_coaching":
        if course_name:
            conversation_context["last_course_name"] = course_name
            conversation_context["last_intent"] = "career_coaching"
        elif conversation_context["last_course_name"]:
            # Use the last mentioned course name if none is found in the query
            course_name = conversation_context["last_course_name"]

    return course_name

In [59]:
user_query = "What is the description of the course 'Ultimate Investment Banking Course'?"
intent = classify_intent(user_query)
course_name = handle_context(user_query, intent)
print(f"Identified Intent: {intent}")
print(f"Course Name from Context: {course_name}")

Identified Intent: qa
Course Name from Context: Ultimate Investment Banking Course


Integrate Everything into the Main Conversational Flow

In [60]:
def handle_conversation(query):
    """
    Handle the user's query by classifying intent, managing context, and routing to the appropriate flow.
    """
    global conversation_context

    # Classify the intent of the query
    intent = classify_intent(query)
    print(f"Identified Intent: {intent}")

    # Handle context and extract course name if needed
    course_name = handle_context(query, intent)
    print(f"Resolved Course Name: {course_name}")

    # Route the query based on intent
    if intent == "qa":
        # Route to Q&A flow
        response = answer_course_question(query)
    elif intent == "career_coaching":
        # Route to Career Coaching flow
        response = career_coaching_response(query)
    else:
        # Default response for unknown intents
        response = "I'm sorry, I didn't understand your query. Could you please clarify?"

    return response

In [61]:
# Example Usage
user_query_1 = "What is the description of the 'Ultimate Investment Banking Course'?"
response_1 = handle_conversation(user_query_1)
print("Generated Response 1:")
print(response_1)

user_query_2 = "What career paths can I pursue after completing this course?"
response_2 = handle_conversation(user_query_2)
print("Generated Response 2:")
print(response_2)

Identified Intent: qa
Resolved Course Name: Ultimate Investment Banking Course
Query Filter Successfully Constructed:
should=None min_should=None must=[FieldCondition(key='title', match=MatchText(text='Ultimate Investment Banking Course'), range=None, geo_bounding_box=None, geo_radius=None, geo_polygon=None, values_count=None)] must_not=None
Search Result:
[ScoredPoint(id=1070968, version=0, score=0.45796072483062744, payload={'title': 'Ultimate Investment Banking Course', 'description': 'Our dedicated approach and solid methodology will teach you\xa0the tools and skills of a good investment banker.\nThis course is also for anyone who wants to work in\xa0Investment Banking, and wants to learn about the necessary skills required to get into this sector.\nNo prior technical knowledge is required.\nWe have\xa0DIRECT EXAMPLES \xa0for all of the concepts that we will be introducing, so you can practice directly as you go forward into the course.\nBanking Organization: Understand how a bank 

#Monitoring and Optimization:

Set Up AgentOps for Monitoring

In [78]:
import agentops
print(dir(agentops.client))

['ApiClient', 'Client', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', 'api', 'client', 'http']


In [80]:
import agentops
from agentops import ActionEvent
from agentops.client import client  # <- import the actual client instance

agentops.init(api_key="ff9c5c0c-6bb2-46fa-9e4e-9f78c22a258b")

def log_event(action_type, logs=None, returns=None):
    try:
        client.log(
            event_name=action_type,
            metadata={
                "logs": logs,
                "returns": returns
            }
        )
        print("Logged successfully!")
    except Exception as e:
        print(f"Failed to log event: {e}")

# Example usage
log_event("System Initialized", logs="System booted", returns="Ready")

Failed to log event: module 'agentops.client.client' has no attribute 'log'


#5.3 User Authentication & Data Management

In [21]:
# Replace this with your Firebase service account key (JSON file)
cred = credentials.Certificate("/content/ai-chatbot-sprints-firebase-adminsdk-fbsvc-cfb5e04530.json")
firebase_admin.initialize_app(cred, name="my-app")
print("Firebase initialized successfully!")

ValueError: Firebase app named "my-app" already exists. This means you called initialize_app() more than once with the same app name as the second argument. Make sure you provide a unique name every time you call initialize_app().

Implement User Sign-Up and Sign-In

In [22]:
if not firebase_admin._apps:
    cred = credentials.Certificate("/content/ai-chatbot-sprints-firebase-adminsdk-fbsvc-cfb5e04530.json")
    firebase_admin.initialize_app(cred)

In [16]:
from pyrebase.pyrebase import Firebase

firebase_config = {
    "apiKey": "AIzaSyBpUwIfLgWk9qhuY2wQW-hCNXNmC7QJSOo",
    "authDomain": "ai-chatbot-sprints.firebaseapp.com",
    "projectId": "ai-chatbot-sprints",
    "storageBucket": "ai-chatbot-sprints.firebasestorage.app",
    "messagingSenderId": "1033394761864",
    "appId": "1:1033394761864:web:fe916b05fe54fe861eaf3d",
    'databaseURL': "https://ai-chatbot-sprints.firebaseio.com"
}

firebase = Firebase(firebase_config)
auth_client = firebase.auth()

# Sign-up
user = auth_client.create_user_with_email_and_password("test3@example.com", "password123")

# Sign-in
user = auth_client.sign_in_with_email_and_password("test@example.com", "password123")

Store User Data in Firestore

In [24]:
from firebase_admin import firestore
cred = credentials.Certificate("/content/ai-chatbot-sprints-firebase-adminsdk-fbsvc-cfb5e04530.json")
firebase_admin.initialize_app(cred)

db = firestore.client(firebase_admin.get_app())

In [25]:
def store_user_profile(user_id, email, name=None):
    doc_ref = db.collection('users').document(user_id)
    doc_ref.set({
        'email': email,
        'name': name,
        'learning_history': [],
        'chatbot_interactions': [],
        'logs': []
    })
    print(f"User profile stored for {user_id}")

In [26]:
user_id = "5KcXNRLmXnPqDfmFU97u999SOh82"  # Replace with actual user ID
store_user_profile(user_id, "test@example.com", "John Doe")

User profile stored for 5KcXNRLmXnPqDfmFU97u999SOh82


Store User Profiles :

In [None]:
db.collection('users').document(user_id).set({
    'email': email,
    'name': name,
    'learning_history': [],
    'chatbot_interactions': []
})

Log Learning History :

In [None]:
db.collection('learning_history').add({
    'user_id': user_id,
    'course_id': course_id,
    'progress': progress,
    'timestamp': firestore.SERVER_TIMESTAMP
})

Log Chatbot Interactions :

In [None]:
db.collection('chatbot_interactions').add({
    'user_id': user_id,
    'query': query,
    'response': response,
    'timestamp': firestore.SERVER_TIMESTAMP
})

Store Passive Logs :

In [None]:
db.collection('logs').add({
    'user_id': user_id,
    'action': action,  # e.g., "login", "query", "logout"
    'details': details,  # Additional context
    'timestamp': firestore.SERVER_TIMESTAMP
})

Implement Secure Session Management and Logout

In [27]:
def get_user_session(user_id):
    user = auth.get_user(user_id)
    return user.uid, user.custom_token  # Return UID and custom token

In [28]:
def logout_user(id_token):
    try:
        auth.revoke_refresh_tokens(id_token)
        print("User logged out successfully.")
    except Exception as e:
        print(f"Error during logout: {e}")

In [29]:
def oauth_login(provider_id, credential):
    user = auth.sign_in_with_oauth(provider_id, credential)
    return user

In [33]:
# Import required libraries
import firebase_admin
from firebase_admin import auth as admin_auth
from pyrebase.pyrebase import Firebase

# Initialize Firebase Admin SDK
if not firebase_admin._apps:
    cred = firebase_admin.credentials.Certificate("path/to/serviceAccountKey.json")
    firebase_admin.initialize_app(cred)

# Initialize Firebase Client SDK
firebase_config = {
    "apiKey": "AIzaSyBpUwIfLgWk9qhuY2wQW-hCNXNmC7QJSOo",
    "authDomain": "ai-chatbot-sprints.firebaseapp.com",
    "projectId": "ai-chatbot-sprints",
    "storageBucket": "ai-chatbot-sprints.firebasestorage.app",
    "messagingSenderId": "1033394761864",
    "appId": "1:1033394761864:web:fe916b05fe54fe861eaf3d",
    'databaseURL': "https://ai-chatbot-sprints.firebaseio.com"
}
firebase = Firebase(firebase_config)
auth_client = firebase.auth()

# Step 1: Simulate User Login
def test_login(email, password):
    try:
        # Use Firebase Admin SDK to get user by email
        user = admin_auth.get_user_by_email(email)
        custom_token = admin_auth.create_custom_token(user.uid)  # Create a custom token
        print(f"User logged in successfully: {user.uid}")
        return custom_token
    except Exception as e:
        print(f"Login failed: {e}")
        return None

# Step 2: Verify Active Session
def verify_session(id_token):
    try:
        # Decode the ID token to verify the session
        decoded_token = admin_auth.verify_id_token(id_token)
        print(f"Session verified for user: {decoded_token['uid']}")
        return True
    except Exception as e:
        print(f"Session verification failed: {e}")
        return False

# Step 3: Simulate Logout
def test_logout(user_id):
    try:
        # Revoke refresh tokens to log out the user
        admin_auth.revoke_refresh_tokens(user_id)
        print(f"User logged out successfully: {user_id}")
    except Exception as e:
        print(f"Logout failed: {e}")

# Test Session Management
email = "test@example.com"
password = "securepassword123"

# Step 1: Test Login
custom_token = test_login(email, password)

if custom_token:
    # Exchange custom token for ID token (simulating client-side behavior)
    try:
        user = auth_client.sign_in_with_custom_token(custom_token.decode("utf-8"))  # Use Firebase Client SDK
        id_token = user['idToken']
        print("ID Token generated successfully.")
    except Exception as e:
        print(f"Failed to generate ID token: {e}")
        id_token = None

    if id_token:
        # Step 2: Verify Session
        session_valid = verify_session(id_token)

        if session_valid:
            # Step 3: Test Logout
            try:
                user = admin_auth.get_user_by_email(email)
                test_logout(user.uid)
            except Exception as e:
                print(f"Error during logout: {e}")

User logged in successfully: FLglCXmOXFe4mfcSW07p6YIakaj2
ID Token generated successfully.
Session verified for user: FLglCXmOXFe4mfcSW07p6YIakaj2
User logged out successfully: FLglCXmOXFe4mfcSW07p6YIakaj2
