In [15]:
import pandas as pd
import os
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv, find_dotenv
import pinecone
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the course catalogue from the CSV file into a DataFrame.
# "ANSI" is only a Windows alias for the locale-dependent `mbcs` codec and is an
# unknown encoding on Linux/macOS, so the code page is named explicitly instead.
# NOTE(review): assumes the CSV was saved on a Western-locale Windows machine,
# where "ANSI" means cp1252 — confirm against the file's origin.
files = pd.read_csv("course_descriptions.csv", encoding="cp1252")

In [3]:
def create_course_description(row):
    """Build a two-line descriptive sentence for a single course.

    Args:
        row: Object indexable by the keys ``course_name``, ``course_slug``,
            ``course_technology`` and ``course_topic`` (a DataFrame row when
            used with ``files.apply(..., axis=1)``).

    Returns:
        str: The sentence. It contains a line break after the slug, and the
        second line starts with 12 spaces of indentation.
    """
    name = row["course_name"]
    slug = row["course_slug"]
    technology = row["course_technology"]
    topic = row["course_topic"]

    # The line break and the 12-space indent are part of the returned text.
    continuation_indent = " " * 12
    first_line = f"The course name is {name}, the slug is {slug},"
    second_line = f"the technology is {technology} and the course topic is {topic}"
    return first_line + "\n" + continuation_indent + second_line

In [5]:
# Let pandas display up to 106 rows before it truncates the printed output.
pd.options.display.max_rows = 106

# Apply create_course_description to every row and keep the result as a new
# column, then print that column.
files["course_description_new"] = files.apply(create_course_description, axis="columns")
print(files.loc[:, "course_description_new"])

0      The course name is Introduction to Tableau, th...
1      The course name is The Complete Data Visualiza...
2      The course name is Introduction to R Programmi...
3      The course name is Data Preprocessing with Num...
4      The course name is Introduction to Data and Da...
5      The course name is Data Cleaning and Preproces...
6      The course name is Introduction to Business An...
7      The course name is Data Analysis with Excel Pi...
8      The course name is SQL, the slug is sql,\n    ...
9      The course name is Credit Risk Modeling in Pyt...
10     The course name is Python Programmer Bootcamp,...
11     The course name is SQL + Tableau + Python, the...
12     The course name is Introduction to Jupyter, th...
13     The course name is Statistics, the slug is sta...
14     The course name is Mathematics, the slug is ma...
15     The course name is Introduction to Excel, the ...
16     The course name is Probability, the slug is pr...
17     The course name is Start

In [8]:
# Register the `dotenv` IPython extension, then run the %dotenv line magic it provides.
# Presumably this loads variables from a .env file into the process environment —
# confirm against the python-dotenv documentation.
%load_ext dotenv
%dotenv

In [9]:
# Pass whatever path find_dotenv() returns to load_dotenv(); override=True is
# requested so loaded values replace ones already present in the environment.
# As the last expression of the cell, the call's return value is displayed.
load_dotenv(dotenv_path=find_dotenv(), override=True)

True

In [11]:
# Create the Pinecone client from settings held in environment variables.
# os.getenv returns None when a variable is not set.
# NOTE(review): confirm that the installed Pinecone client still uses the
# `environment` keyword.
pc = Pinecone(
    api_key=os.getenv("PINECONE_API_KEY"),
    environment=os.getenv("PINECONE_ENV"),
)

In [12]:
# Settings for the Pinecone index used throughout the notebook: the index name,
# the length of the stored vectors, and the similarity metric.
# NOTE(review): `dimension` has to match the length of the vectors produced by
# the embedding model loaded later — confirm if the model changes.
index_name, dimension, metric = "my-index", 384, "cosine"

In [13]:
# Start from a clean slate: drop the index if one with this name already exists,
# and report which of the two cases applied.
if not any(existing.name == index_name for existing in pc.list_indexes()):
    print(f"{index_name} not in index list.")
else:
    pc.delete_index(index_name)
    print(f"{index_name} successfully deleted.")

my-index successfully deleted.


In [14]:
# Create the index with the settings chosen above, using a serverless spec for
# AWS us-east-1. As the last expression of the cell, the call's return value is
# displayed as the cell output.
pc.create_index(
    name=index_name,
    dimension=dimension,
    metric=metric,
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

{
    "name": "my-index",
    "metric": "cosine",
    "host": "my-index-by1ouyl.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "region": "us-east-1",
            "cloud": "aws",
            "read_capacity": {
                "mode": "OnDemand",
                "status": {
                    "state": "Ready",
                    "current_shards": null,
                    "current_replicas": null
                }
            }
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null,
    "_response_info": {
        "raw_headers": {
            "content-type": "application/json",
            "vary": "origin, access-control-request-method, access-control-request-headers",
            "access-control-allow-origin": "*",
            "access-control-expose-headers": "*",
            "x-pinecone-api-version": "2025-10",


In [16]:
# Get a handle on the index by name; later cells upsert into it and query it.
index = pc.Index(name=index_name)

## Embedding the data

In [17]:
# Load the sentence-embedding model that encodes both the course texts and the
# search query.
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

Loading weights: 100%|███████████████████████████| 103/103 [00:00<00:00, 362.25it/s, Materializing param=pooler.dense.weight]
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [18]:
def create_embeddings(row):
    """Embed the concatenated description fields of one course row.

    Joins the ``course_description``, ``course_description_new`` and
    ``course_description_short`` values with single spaces and encodes the
    result with the notebook-level ``model``.

    Missing values (``None`` / ``NaN``) are skipped, so the literal text
    "nan" or "None" is never fed to the model.

    Args:
        row: Object indexable by the three field names above (a DataFrame row
            when used with ``files.apply(..., axis=1)``).

    Returns:
        Whatever ``model.encode`` returns for the combined text.

    Raises:
        KeyError: If ``row`` lacks one of the three fields.
    """
    text_fields = ('course_description', 'course_description_new', 'course_description_short')
    values = [row[field] for field in text_fields]
    # str() on a missing value would yield "nan"/"None"; drop those instead.
    combined_text = ' '.join(str(value) for value in values if pd.notna(value))
    embedding = model.encode(combined_text, show_progress_bar = False)
    return embedding

In [19]:
# Compute one embedding per course (row-wise apply) and store it in a new column.
files["embedding"] = files.apply(func=create_embeddings, axis="columns")

In [20]:
# Pair every course name (as a string ID) with its embedding converted to a
# plain list, then send all pairs to the index in a single upsert call.
vectors_to_upsert = [
    (str(course_name), embedding.tolist())
    for course_name, embedding in zip(files["course_name"], files["embedding"])
]
index.upsert(vectors=vectors_to_upsert)

print("Data upserted")

Data upserted


## Semantic Search

In [25]:
# Search phrase, encoded with the same model as the course rows and converted
# from the encoder's output to a plain Python list.
query = "clustering"
query_embedding = model.encode(sentences=query, show_progress_bar=False).tolist()

In [26]:
# Ask the index for the 12 stored vectors closest to the query embedding.
# `query_embedding` is already a flat list (it comes from .tolist()), which is
# the documented shape for `vector`; wrapping it in another list sent a nested
# list instead.
# include_values=True makes every match carry its stored vector as well.
query_results = index.query(
    vector = query_embedding,
    top_k = 12,
    include_values = True
)

In [27]:
# Bare expression: the notebook renders the whole query response as this cell's output.
# The query requests include_values=True, so each match is shown with its full vector.
query_results

QueryResponse(matches=[{'id': '51-469',
 'score': 0.545601904,
 'values': [-0.00946476217,
            0.00367277185,
            -0.0133795412,
            0.00449558208,
            0.0349661149,
            0.00915576704,
            -0.0755281597,
            -0.0613898225,
            0.0055425209,
            0.00983239431,
            -0.0240187924,
            -0.0552587025,
            0.0478347614,
            -0.0520656407,
            -0.0418984331,
            -0.014948491,
            0.0168796014,
            -0.0476645939,
            -0.0258275978,
            -0.084545508,
            0.0654711649,
            -0.0139692193,
            -0.0716190338,
            -0.00104933651,
            0.0555511825,
            0.0659990236,
            0.0740754157,
            0.0102074752,
            -0.0579515,
            -0.0219535492,
            -0.0265662838,
            0.0239032917,
            0.0280975029,
            0.0529846251,
            -0.0965075418,
       

In [28]:
# Print a compact summary line (ID and similarity score) for every match.
for match in query_results["matches"]:
    match_id, match_score = match["id"], match["score"]
    print(f"Matched item ID: {match_id}, score: {match_score}")

Matched item ID: 51-469, score: 0.545601904
Matched item ID: 37-374, score: 0.543318868
Matched item ID: 51-470, score: 0.508051038
Matched item ID: 33-334, score: 0.501163542
Matched item ID: 33-335, score: 0.487876922
Matched item ID: 37-373, score: 0.479442656
Matched item ID: 37-372, score: 0.446531355
Matched item ID: 51-471, score: 0.422650367
Matched item ID: 33-336, score: 0.410655528
Matched item ID: 51-464, score: 0.406552821
Matched item ID: 43-417, score: 0.380612403
Matched item ID: 51-472, score: 0.376635581
