In [11]:
import pandas as pd
import requests
import numpy as np

In [12]:
# Download the FAQ corpus: a JSON list with one entry per course, each entry
# holding that course's name and its list of documents.
docs_url = 'https://raw.githubusercontent.com/bhushandeodhar/llm-zoomcamp/refs/heads/main/documents.json'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()

In [13]:
# Flatten the per-course groups into one list of records, tagging each record
# with the course it belongs to. Every record is copied rather than edited in
# place, so `documents_raw` stays exactly as it was downloaded.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in documents_raw
    for doc in course_dict['documents']
]

In [14]:
# Tabulate the flattened records, keeping these four columns in this order.
df = pd.DataFrame(
    data=documents,
    columns=['course', 'section', 'question', 'text'],
)

Vector Spaces
- turn the docs into vectors
- term-document matrix:
        - rows: documents
        - columns: words / tokens

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
# Text columns that each get their own TF-IDF index and feed the final score.
fields = [
    'section',
    'question',
    'text',
]

In [17]:
vectorizers = {}
matrices = {}

for field_name in fields:
    # min_df=5 drops terms that occur in fewer than 5 documents; English stop
    # words are removed as well.
    vectorizer = TfidfVectorizer(min_df=5, stop_words='english')
    # Learn this column's vocabulary and encode every row as a TF-IDF-weighted
    # term vector (rows are documents, columns are terms). TF-IDF weighs how
    # important a word is to a document relative to the whole corpus.
    tfidf_matrix = vectorizer.fit_transform(df[field_name])
    vectorizers[field_name] = vectorizer
    matrices[field_name] = tfidf_matrix

In [18]:
# Inspect the per-field TF-IDF matrices (field name -> document-term matrix).
matrices

{'section': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 3090 stored elements and shape (948, 66)>,
 'question': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 3431 stored elements and shape (948, 291)>,
 'text': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 23808 stored elements and shape (948, 1333)>}

In [19]:
# Inspect the fitted vectorizer kept for each field (field name -> vectorizer).
vectorizers

{'section': TfidfVectorizer(min_df=5, stop_words='english'),
 'question': TfidfVectorizer(min_df=5, stop_words='english'),
 'text': TfidfVectorizer(min_df=5, stop_words='english')}

In [20]:
query = "I just discovered the course, is it too late to join?"

# Per-field weights; any field not listed here counts with weight 1.0.
boosts = {'question': 3}

# Running total of the weighted per-field similarities, one entry per document.
score = np.zeros(len(df))

for field in fields:
    # Encode the query with the vocabulary learned for this field.
    query_vec = vectorizers[field].transform([query])
    doc_matrix = matrices[field]
    # Cosine similarity of every document row against the single query row.
    field_score = cosine_similarity(doc_matrix, query_vec).flatten()
    weight = boosts.get(field, 1.0)
    score = score + weight * field_score

In [21]:
# Post-processing: zero out the score of every document that fails a filter.
# Documents are not removed, only set to 0, so they can still appear in a
# top-k slice when fewer than k documents end up with a positive score.
filters = {
           'course' : 'data-engineering-zoomcamp'
          }
for field, value in filters.items():
    # Convert the mask to a plain NumPy array before multiplying; multiplying
    # by a pandas Series would silently turn `score` itself into a Series.
    mask = (df[field] == value).to_numpy().astype(int)
    score = score * mask

In [22]:
# np.argsort orders positions by ascending score, so the last ten entries are
# the ten best-scoring documents, with the best one last.
ranked_positions = np.argsort(score)
idx = ranked_positions[-10:]

In [23]:
# Show the selected rows; they are in ascending score order, so the best match is last.
df.iloc[idx]

Unnamed: 0,course,section,question,text
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
411,data-engineering-zoomcamp,Workshop 1 - dlthub,Edit Course Profile.,The display name listed on the leaderboard is ...
10,data-engineering-zoomcamp,General course-related questions,Course - ​​How many hours per week am I expect...,It depends on your background and previous exp...
9,data-engineering-zoomcamp,General course-related questions,Course - Which playlist on YouTube should I re...,All the main videos are stored in the Main “DA...
5,data-engineering-zoomcamp,General course-related questions,Course - how many Zoomcamps in a year?,"There are 3 Zoom Camps in a year, as of 2024. ..."
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
34,data-engineering-zoomcamp,General course-related questions,How can we contribute to the course?,Star the repo! Share it with friends if you fi...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
