In [12]:
import pandas as pd
import pinecone
from pinecone import Pinecone, ServerlessSpec
import os
from configs import API_KEY, DEFAULT_MODEL, PINECONE_KEY
from sentence_transformers import SentenceTransformer

In [9]:
# Pinecone client authenticated with the key imported from configs.
pc = Pinecone(api_key = PINECONE_KEY)

In [2]:
# Load the course catalogue into a DataFrame.
# "ANSI" is only registered as a codec alias on Windows (where it means the
# machine's active code page), so the original call fails with LookupError on
# Linux/macOS. Naming the code page explicitly keeps the notebook portable.
# NOTE(review): cp1252 assumes the CSV was saved under a Western-European
# Windows locale — confirm against the actual file.
files = pd.read_csv("data/course_descriptions.csv", encoding="cp1252")

In [3]:
# Display the loaded course table.
files

Unnamed: 0,course_name,course_slug,course_technology,course_description,course_topic,course_description_short
0,Introduction to Tableau,tableau,tableau,Tableau is now one of the most popular busines...,data visualization,Teaching you how to tell compelling stories wi...
1,The Complete Data Visualization Course with Py...,data-visualization,python,The Data Visualization course is designed for ...,data visualization,Teaching you how to master the art of creating...
2,Introduction to R Programming,introduction-to-r-programming,r,R is one of the best programming languages spe...,programming,"Providing you with the skills to manipulate, a..."
3,Data Preprocessing with NumPy,data-preprocessing-numpy,python,This course is designed to show you how to wor...,data processing,This course will guide you through one of Pyth...
4,Introduction to Data and Data Science,intro-to-data-and-data-science,theory,Working with data is an essential part of main...,machine learning,Introducing you to the field of data science a...
...,...,...,...,...,...,...
101,Intro to NLP for AI,intro-to-nlp-for-ai,python,Natural language processing is an exciting and...,programming,Unlock the power of natural language processin...
102,Data Analysis with ChatGPT,data-analysis-with-chatgpt,chatgpt,Leverage ChatGPT's Advanced Data Analysis Code...,programming,Master ChatGPT for data analysis. Boost your p...
103,ChatGPT for Data Science,chatgpt-for-data-science,chatgpt,"In this course, you will learn how to utilize ...",machine learning,Learn how to increase your productivity using ...
104,Intro to LLMs,intro-to-llms,python,"In recent years, large language models (LLMs) ...",machine learning,This LLM course will guide you step-by-step th...


In [4]:
def create_course_description(row):
    """Compose a short textual summary of a course from its metadata.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by the keys ``course_name``, ``course_slug``,
        ``course_technology`` and ``course_topic`` (e.g. one DataFrame row).

    Returns
    -------
    str
        Two clauses: the first names the course, slug and technology, the
        second names the topic. They are separated by a newline followed by
        eight spaces.
    """
    name = row["course_name"]
    slug = row["course_slug"]
    technology = row["course_technology"]
    topic = row["course_topic"]

    head = f"The course name is {name}, the slug is {slug}, the technology is {technology}"
    tail = f"and the course topic is {topic}"
    # The separator reproduces the line break and indentation that the summary
    # text has always contained between its two clauses.
    return f"{head}\n        {tail}"

In [7]:
# Raise the display limit to 106 rows — presumably so the whole column below
# prints without truncation (confirm against the row count of the CSV).
pd.set_option("display.max_rows", 106)

# Build the summary sentence for every course (function applied row by row).
files["course_description_new"] = files.apply(create_course_description, axis="columns")

files["course_description_new"]

0      The course name is Introduction to Tableau, th...
1      The course name is The Complete Data Visualiza...
2      The course name is Introduction to R Programmi...
3      The course name is Data Preprocessing with Num...
4      The course name is Introduction to Data and Da...
5      The course name is Data Cleaning and Preproces...
6      The course name is Introduction to Business An...
7      The course name is Data Analysis with Excel Pi...
8      The course name is SQL, the slug is sql, the t...
9      The course name is Credit Risk Modeling in Pyt...
10     The course name is Python Programmer Bootcamp,...
11     The course name is SQL + Tableau + Python, the...
12     The course name is Introduction to Jupyter, th...
13     The course name is Statistics, the slug is sta...
14     The course name is Mathematics, the slug is ma...
15     The course name is Introduction to Excel, the ...
16     The course name is Probability, the slug is pr...
17     The course name is Start

In [10]:
# Settings for the Pinecone index; they are reused when the index is created.
index_name = "my-index"
dimensions = 384  # must equal the length of the vectors stored in the index
metric = "cosine"

# Start from a clean slate: remove an existing index of the same name, if any.
if index_name not in {idx.name for idx in pc.list_indexes()}:
    print(f"No matching index found for name {index_name}")
else:
    pc.delete_index(index_name)
    print(f"{index_name} successfully deleted!")

my-index successfully deleted!


In [11]:
# Create a serverless index (AWS, us-east-1) with the name, dimension and
# metric defined earlier in the notebook.
pc.create_index(
    name=index_name,
    dimension=dimensions,
    metric=metric,
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

{
    "name": "my-index",
    "metric": "cosine",
    "host": "my-index-j92o7jv.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "region": "us-east-1",
            "cloud": "aws",
            "read_capacity": {
                "mode": "OnDemand",
                "status": {
                    "state": "Ready",
                    "current_shards": null,
                    "current_replicas": null
                }
            }
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null,
    "_response_info": {
        "raw_headers": {
            "content-type": "application/json",
            "vary": "origin, access-control-request-method, access-control-request-headers",
            "access-control-allow-origin": "*",
            "access-control-expose-headers": "*",
            "x-pinecone-api-version": "2025-10",


## Embedding the data

In [35]:
# Sentence-embedding model used for both the course texts and the search query.
model = SentenceTransformer("all-MiniLM-L6-v2")
# Alternative checkpoint, currently disabled:
# model = SentenceTransformer("multi-qa-distilbert-cos-v1")

In [36]:
def create_embeddings(row):
    """Embed the concatenated description fields of one course row.

    The values of ``course_description``, ``course_description_new`` and
    ``course_description_short`` are joined with single spaces and passed to
    the notebook-level ``model``.

    Parameters
    ----------
    row : mapping-like
        Object indexable by the three field names above (e.g. a DataFrame row).

    Returns
    -------
    The value returned by ``model.encode`` for the combined text.
    """
    text_fields = ("course_description", "course_description_new", "course_description_short")
    # Missing cells arrive as NaN/None; str() would turn them into the literal
    # words "nan"/"None" and pollute the embedded text, so leave them out.
    parts = [str(row[field]) for field in text_fields if not pd.isna(row[field])]
    combined_text = " ".join(parts)
    return model.encode(combined_text, show_progress_bar=False)

In [37]:
# Compute one embedding per course row and store it in a new 'embedding' column.
files['embedding'] = files.apply(create_embeddings, axis = 1)

In [38]:
# Pair every course name (used as the vector ID) with its embedding converted
# to a plain list, producing the (id, values) tuples sent to the index.
vectors_to_upsert = [
    (str(course_name), embedding.tolist())
    for course_name, embedding in zip(files["course_name"], files["embedding"])
]

In [39]:
# Inspect the first five (id, values) tuples.
vectors_to_upsert[:5]

[('Introduction to Tableau',
  [0.036368731409311295,
   -0.027419976890087128,
   -0.08190659433603287,
   -0.008371344767510891,
   -0.010989622212946415,
   -0.05011501535773277,
   -0.02370070479810238,
   -0.0018080847803503275,
   -0.0797080248594284,
   -0.015680953860282898,
   -0.05798396095633507,
   0.012098845094442368,
   0.10549798607826233,
   0.0006489577936008573,
   -0.03221889212727547,
   -0.03068423457443714,
   8.821377559797838e-05,
   -0.05621439218521118,
   0.023939605802297592,
   -0.08303916454315186,
   -0.025227347388863564,
   0.039210882037878036,
   0.007230899762362242,
   -0.05357974022626877,
   -0.0013309798669070005,
   0.02714795432984829,
   0.06675729155540466,
   -0.0416998527944088,
   -0.01690690591931343,
   -0.08034186065196991,
   -0.07219676673412323,
   0.013649800792336464,
   -0.04371761903166771,
   0.04643731936812401,
   -0.1252748668193817,
   -0.06451993435621262,
   0.039433740079402924,
   0.05794467404484749,
   0.0306089464575

In [40]:
# Handle to the index named by index_name; used below for upsert and query.
index = pc.Index(name = index_name)

In [41]:
# Send every (id, values) pair to the index in a single upsert request.
index.upsert(vectors = vectors_to_upsert)

print("Data Upserted")

Data Upserted


## Semantic Search

In [42]:
# Free-text search phrase to look up in the course index.
query = "clustering"

In [43]:
# Encode the query with the same model used for the courses and convert the
# result to a plain list via .tolist().
query_embedding = model.encode(query, show_progress_bar=False).tolist()

In [44]:
# Retrieve the 12 stored vectors closest to the query embedding, including
# their stored values.
# `vector` is documented as one flat list of floats. `query_embedding` is
# already a list (it was produced with .tolist()), so it is passed directly
# instead of being wrapped in another list.
query_result = index.query(
    vector=query_embedding,
    top_k=12,
    include_values=True,
)

In [45]:
# Show the raw response, including the stored vector values of each match.
query_result

QueryResponse(matches=[{'id': 'Machine Learning in Excel',
 'score': 0.354953766,
 'values': [-0.0183002315,
            -0.0279485844,
            -0.0253203623,
            -0.0126938745,
            -0.024036685,
            -0.0219841097,
            -0.051123675,
            -0.0535800196,
            0.00997649692,
            0.0282286815,
            -0.040832445,
            -0.036268644,
            0.0683277175,
            -0.0348471217,
            -0.00728514744,
            0.0366662927,
            -0.0033102422,
            -0.00411817571,
            -4.75427878e-05,
            -0.0627969131,
            0.0846960172,
            0.0300104823,
            -0.0528305061,
            -0.0244680978,
            0.0403409116,
            0.0308714863,
            0.0611492395,
            0.00181206572,
            -0.0445214398,
            -0.0367099345,
            -0.0457854904,
            0.00930391904,
            0.0167219806,
            0.0564011857,
          

In [46]:
# Print each match's ID next to its similarity score.
# The key is 'score'; the previous 'scor' was a typo that fails on the first
# match because no such key exists.
for match in query_result["matches"]:
    print(f"ID - {match['id']} - Score:{match['score']}")

ID - Machine Learning in Excel - Score:0.354953766
ID - Machine Learning with K-Nearest Neighbors - Score:0.314138889
ID - Machine Learning in Python - Score:0.282952338
ID - Customer Churn Analysis with SQL and Tableau - Score:0.281318665
ID - Growth Analysis with SQL, Python, and Tableau   - Score:0.259747535
ID - Linear Algebra and Feature Selection - Score:0.259037018
ID - Customer Engagement Analysis with SQL and Tableau - Score:0.234294891
ID - Fashion Analytics with Tableau - Score:0.233248711
ID - Machine Learning with Naive Bayes - Score:0.22774601
ID - Machine Learning with Support Vector Machines - Score:0.225613594
ID - Data Preprocessing with NumPy - Score:0.219470978
ID - Data Analysis with Excel Pivot Tables - Score:0.216790199
