In [1]:
import pandas as pd
import pinecone
from pinecone import Pinecone, ServerlessSpec
import os
from configs import API_KEY, DEFAULT_MODEL, PINECONE_KEY
from sentence_transformers import SentenceTransformer

In [2]:
pc = Pinecone(api_key = PINECONE_KEY)
files = pd.read_csv("data/course_section_descriptions.csv", encoding = "ANSI")

In [3]:
files["unique_id"] = files["course_id"].astype(str) + "-" +  files["section_id"].astype(str)

In [4]:
files["metadata"] = files.apply(
    lambda row: {
        "course_name" : row["course_name"],
        "section_name" : row["section_name"],
        "section_description" : row["section_description"]
    }
    , axis=1)

In [6]:
def create_embedding(row):
    combined_text = f"""{row["course_name"]} row["course_technology"]
                row["course_description"] row["section_name"] row["section_description"]"""
    return model.encode(combined_text, show_progress_bar = False)

In [7]:
model = SentenceTransformer("all-MiniLM-L6-v2")
# model = SentenceTransformer("multi-qa-distilbert-cos-v1")

In [8]:
files['embedding'] = files.apply(create_embedding, axis = 1)

In [9]:
index_name = "my-index"
dimensions = 384
metric = "cosine"
if index_name in [idx.name for idx in pc.list_indexes()]:
    pc.delete_index(index_name)
    print(f"{index_name} successfully deleted!")
else:
    print(f"No matching index found for name {index_name}")

my-index successfully deleted!


In [10]:
pc.create_index(
    name = index_name,
    dimension = dimensions,
    metric = metric,
    spec = ServerlessSpec(
        cloud = "aws",
        region = "us-east-1",
    ))

{
    "name": "my-index",
    "metric": "cosine",
    "host": "my-index-j92o7jv.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "region": "us-east-1",
            "cloud": "aws",
            "read_capacity": {
                "mode": "OnDemand",
                "status": {
                    "state": "Ready",
                    "current_shards": null,
                    "current_replicas": null
                }
            }
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null,
    "_response_info": {
        "raw_headers": {
            "content-type": "application/json",
            "vary": "origin, access-control-request-method, access-control-request-headers",
            "access-control-allow-origin": "*",
            "access-control-expose-headers": "*",
            "x-pinecone-api-version": "2025-10",


In [11]:
index = pc.Index(name = index_name)

In [12]:
vectors_to_upsert = [(row['unique_id'], row['embedding'].tolist(), row['metadata']) for _,row in files.iterrows()]

In [13]:
vectors_to_upsert[0]

('2-9',
 [0.055960047990083694,
  0.04113643616437912,
  -0.03657137602567673,
  0.025163710117340088,
  -0.003083556890487671,
  0.047918085008859634,
  -0.03615285083651543,
  -0.009429008699953556,
  -0.0790412575006485,
  0.02554238960146904,
  0.00019737206457648426,
  -0.01650944910943508,
  0.08443764597177505,
  -0.04053952917456627,
  -0.03189553692936897,
  0.009972297586500645,
  -0.038822732865810394,
  -0.038964394479990005,
  0.013699844479560852,
  -0.01690327748656273,
  0.06397318840026855,
  0.10382235050201416,
  0.02047545649111271,
  0.01982797309756279,
  0.002811008831486106,
  0.09966733306646347,
  0.017810815945267677,
  0.07172717154026031,
  -0.046138934791088104,
  -0.02982327528297901,
  -0.06863654404878616,
  0.002206700388342142,
  -0.02528694085776806,
  0.09277422726154327,
  -0.00571075826883316,
  -0.06557586789131165,
  -0.05584512650966644,
  0.011284856125712395,
  0.0909770056605339,
  0.04767237603664398,
  -0.022756390273571014,
  -0.087661422

In [14]:
index.upsert(vectors = vectors_to_upsert)

print("Data Upserted")

Data Upserted


## Semantic Search

In [23]:
query = "regression in python"

query_embedding = model.encode(query, show_progress_bar=False).tolist()

query_result = index.query(
    vector = [query_embedding],
    top_k = 12,
    include_metadata = True
)

score_threshold = 0.3

In [24]:
for match in query_result["matches"]:
    # if match['score'] >= score_threshold:
        course_details = match.get("metadata", {})
        course_name = course_details.get("course_name", "NA")
        section_name = course_details.get("section_name", "NA")
        section_description = course_details.get("section_description", "NA")
        print(f"Matched item {match['id']} with score {match['score']}")
        print(f"Course - {course_name} \nSection - {section_name} \nsection_description {section_description}")
        print("-----------------------------------------------------------------")

Matched item 27-277 with score 0.312618256
Course - Time Series Analysis with Python 
Section - Business Case 
section_description In this final part of the course, we will examine how a real-life event like the Dieselgate scandal can alter the trends in time series data. 
-----------------------------------------------------------------
Matched item 27-263 with score 0.312618256
Course - Time Series Analysis with Python 
Section - Introduction 
section_description In this short section, we’ll tell you a bit more of what the course is about, how its structured and what our goal is.
-----------------------------------------------------------------
Matched item 27-264 with score 0.312618256
Course - Time Series Analysis with Python 
Section - Setting Up the Environment 
section_description In this part of the course, we will explain to you how to set up Python 3 and then load up Jupyter. We’ll also show you what the Anaconda Prompt is and how we use it to download and import new modules.