In [1]:
import pandas as pd
import pinecone
from pinecone import Pinecone, ServerlessSpec
import os
from configs import API_KEY, DEFAULT_MODEL, PINECONE_KEY
from sentence_transformers import SentenceTransformer

In [2]:
pc = Pinecone(api_key = PINECONE_KEY)
files = pd.read_csv("data/course_section_descriptions.csv", encoding = "ANSI")

In [3]:
files["unique_id"] = files["course_id"].astype(str) + "-" +  files["section_id"].astype(str)

In [4]:
files["metadata"] = files.apply(
    lambda row: {
        "course_name" : row["course_name"],
        "section_name" : row["section_name"],
        "section_description" : row["section_description"]
    }
    , axis=1)

In [5]:
def create_embedding(row):
    combined_text = f"""{row["course_name"]} row["course_technology"]
                row["course_description"] row["section_name"] row["section_description"]"""
    return model.encode(combined_text, show_progress_bar = False)

In [8]:
model = SentenceTransformer("multi-qa-distilbert-cos-v1")

In [9]:
files['embedding'] = files.apply(create_embedding, axis = 1)

In [10]:
index_name = "bert-index"
dimensions = 768
metric = "cosine"
if index_name in [idx.name for idx in pc.list_indexes()]:
    pc.delete_index(index_name)
    print(f"{index_name} successfully deleted!")
else:
    print(f"No matching index found for name {index_name}")

pc.create_index(
    name = index_name,
    dimension = dimensions,
    metric = metric,
    spec = ServerlessSpec(
        cloud = "aws",
        region = "us-east-1",
    ))

No matching index found for name bert-index


{
    "name": "bert-index",
    "metric": "cosine",
    "host": "bert-index-j92o7jv.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "region": "us-east-1",
            "cloud": "aws",
            "read_capacity": {
                "mode": "OnDemand",
                "status": {
                    "state": "Ready",
                    "current_shards": null,
                    "current_replicas": null
                }
            }
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 768,
    "deletion_protection": "disabled",
    "tags": null,
    "_response_info": {
        "raw_headers": {
            "content-type": "application/json",
            "vary": "origin, access-control-request-method, access-control-request-headers",
            "access-control-allow-origin": "*",
            "access-control-expose-headers": "*",
            "x-pinecone-api-version": "2025-1

In [11]:
index = pc.Index(name = index_name)

In [12]:
vectors_to_upsert = [(row['unique_id'], row['embedding'].tolist(), row['metadata']) for _,row in files.iterrows()]

In [13]:
vectors_to_upsert[0]

('2-9',
 [0.025897052139043808,
  0.05320138484239578,
  -0.026883617043495178,
  -0.002313697710633278,
  0.004942266270518303,
  0.05030316114425659,
  -0.05162053555250168,
  0.022284546867012978,
  -0.016394449397921562,
  -0.02136988937854767,
  -0.0025875233113765717,
  -0.013299588114023209,
  -0.0007340928423218429,
  -0.05483908951282501,
  0.013604690320789814,
  0.021110083907842636,
  -0.022657139226794243,
  0.03932056576013565,
  0.019192874431610107,
  0.052036087960004807,
  -0.02294592373073101,
  0.05404011532664299,
  -0.007421423215419054,
  -0.045059990137815475,
  0.057310886681079865,
  0.00771526200696826,
  -0.06795289367437363,
  0.007734532002359629,
  -0.051280826330184937,
  0.011682823300361633,
  -0.016210217028856277,
  0.03289413079619408,
  -0.05154828727245331,
  0.01447776798158884,
  0.009923041798174381,
  -0.07962267100811005,
  -0.08131255209445953,
  0.00935647264122963,
  -0.002877808641642332,
  -0.03512626141309738,
  -0.03334931284189224,
  

In [14]:
index.upsert(vectors = vectors_to_upsert)

print("Data Upserted")

Data Upserted


In [15]:
query = "regression in python"

query_embedding = model.encode(query, show_progress_bar=False).tolist()

query_result = index.query(
    vector = [query_embedding],
    top_k = 12,
    include_metadata = True
)

score_threshold = 0.3

In [17]:
for match in query_result["matches"]:
    if match['score'] >= score_threshold:
        course_details = match.get("metadata", {})
        course_name = course_details.get("course_name", "NA")
        section_name = course_details.get("section_name", "NA")
        section_description = course_details.get("section_description", "NA")
        print(f"Matched item {match['id']} with score {match['score']}")
        print(f"Course - {course_name} \nSection - {section_name} \nsection_description {section_description}")
        print("-----------------------------------------------------------------")

Matched item 15-184 with score 0.319005
Course - Credit Risk Modeling in Python 
Section - LGD model 
section_description LGD models are often estimated using a beta regression. To keep the modeling part simpler, we employ a two-step regression model, which aims to simulate a beta regression. We combine the predictions from a logistic regression with those from a linear regression to estimate the loss given default.

-----------------------------------------------------------------
Matched item 15-175 with score 0.319005
Course - Credit Risk Modeling in Python 
Section - Introduction 
section_description We start by explaining why credit risk is important for financial institutions. We also define ground 0 terms, such as expected loss, probability of default, loss given default and exposure at default.

-----------------------------------------------------------------
Matched item 15-185 with score 0.319005
Course - Credit Risk Modeling in Python 
Section - EAD model 
section_descripti