In [2]:
!pip install jupyter scikit-learn pandas -qqq

In [2]:
import requests 

# Download the FAQ dataset: a JSON list of courses, each with its own list of documents.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten to one record per document, tagging each record with its course name.
# A new dict is built per record so documents_raw itself is never mutated.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

In [3]:
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
import pandas as pd

In [5]:
df = pd.DataFrame(documents, columns = ['course', 'section', 'question','text'])

In [6]:
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [7]:
df.tail()

Unnamed: 0,course,section,question,text
943,mlops-zoomcamp,Module 6: Best practices,Github actions: Permission denied error when e...,Problem description\nThis is the step in the c...
944,mlops-zoomcamp,Module 6: Best practices,Managing Multiple Docker Containers with docke...,Problem description\nWhen a docker-compose fil...
945,mlops-zoomcamp,Module 6: Best practices,AWS regions need to match docker-compose,Problem description\nIf you are having problem...
946,mlops-zoomcamp,Module 6: Best practices,Isort Pre-commit,Problem description\nPre-commit command was fa...
947,mlops-zoomcamp,Module 6: Best practices,How to destroy infrastructure created via GitH...,Problem description\nInfrastructure created in...


In [10]:
# NOTE(review): the comparison yields a boolean Series and the surrounding brackets wrap it
# in a one-element list. To select the matching rows, `df[df.course == ...]` was presumably
# intended -- confirm.
[df.course == 'data-engineering-zoomcamp']

[0       True
 1       True
 2       True
 3       True
 4       True
        ...  
 943    False
 944    False
 945    False
 946    False
 947    False
 Name: course, Length: 948, dtype: bool]

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# Five short toy sentences used as a miniature corpus for the vectorizer demo below.
docs_example = [
    "January course details, register now",
    "Course prerequisites listed in January catalog",
    "Submit January course homework by end of month",
    "Register for January course, no prerequisites",
    "January course setup: Python and Google Cloud"
]

# Vectorizer

- Turn the docs into vectors

- Document-term matrix:

    - Rows: documents
    - Columns: words/tokens

- Bag of words
  - Word order is lost
  - Sparse matrix

In [13]:
cv = CountVectorizer(stop_words = 'english')

In [14]:
cv.fit(docs_example)

In [15]:
cv.get_feature_names_out().shape

(15,)

In [16]:
cv.get_feature_names_out()

array(['catalog', 'cloud', 'course', 'details', 'end', 'google',
       'homework', 'january', 'listed', 'month', 'prerequisites',
       'python', 'register', 'setup', 'submit'], dtype=object)

In [17]:
X = cv.transform(docs_example)

In [18]:
pd.DataFrame(X.todense(), columns = cv.get_feature_names_out()).T

Unnamed: 0,0,1,2,3,4
catalog,0,1,0,0,0
cloud,0,0,0,0,1
course,1,1,1,1,1
details,1,0,0,0,0
end,0,0,1,0,0
google,0,0,0,0,1
homework,0,0,1,0,0
january,1,1,1,1,1
listed,0,1,0,0,0
month,0,0,1,0,0


In [19]:
# Bag-of-words counts over the `text` column of every FAQ record.
# English stop words are removed; min_df=5 drops terms that occur in fewer than 5 documents.
cv = CountVectorizer(stop_words='english', min_df= 5)
X = cv.fit_transform(df.text)

# Vocabulary terms, in the same order as the columns of X.
names = cv.get_feature_names_out()



In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Same settings as the count vectorizer, but with TF-IDF weighting.
# NOTE: this rebinds cv, X and names, replacing the count-based objects from the previous cell.
cv = TfidfVectorizer(stop_words='english', min_df= 5)
X = cv.fit_transform(df.text)

# Vocabulary terms, in the same order as the columns of X.
names = cv.get_feature_names_out()



In [21]:
df_docs = pd.DataFrame(X.toarray(), columns=names)
df_docs.round(2)

Unnamed: 0,01,02,03,04,05,06,09,10,100,11,...,y_val,yaml,year,yellow,yellow_tripdata_2021,yes,yml,youtube,zip,zoomcamp
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.00
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.43
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.28,0.00,0.0,0.0,0.00
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.00
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.11,0.0,0.0,0.00
944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.00
945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.17,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.00
946,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.00


In [22]:
query = "Do I need to know python to sign up for the January course?" 

q = cv.transform([query])
q.toarray()


array([[0., 0., 0., ..., 0., 0., 0.]])

In [23]:
# Pair every vocabulary term with its TF-IDF weight in the query vector.
query_dict = dict(zip(names, q.toarray()[0]))

# Print only the terms that actually received a weight, i.e. query words
# that are part of the fitted vocabulary.
for k, v in query_dict.items():
    if v > 0.0:
        print(f"{k}:{v}")

course:0.38148200594064524
know:0.5608269127690405
need:0.29796783250107517
python:0.31441356049301333
sign:0.5935519664108326


In [24]:
# Pair every vocabulary term with its TF-IDF weight in document 1.
# Slice the sparse row first so only that single row is densified,
# rather than converting the whole matrix just to read one row.
doc_dict = dict(zip(names, X[1].toarray()[0]))

# Print only the terms that occur in the document (non-zero weight).
for k, v in doc_dict.items():
    if v > 0:
        print(f"{k}:{v}")

data:0.3127766226016382
datatalksclub:0.5316383823591385
engineering:0.5401030373894639
github:0.3792268345828722
zoomcamp:0.4289605246306481


In [25]:
 q.shape

(1, 1333)

In [26]:
X.shape

(948, 1333)

In [27]:
X.dot(q.T).todense()

matrix([[0.19464486],
        [0.        ],
        [0.        ],
        [0.06011641],
        [0.04932915],
        [0.        ],
        [0.        ],
        [0.13477565],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.15899187],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.07431408],
        [0.        ],
        [0.        ],
        [0.05779673],
        [0.07243428],
        [0.        ],
        [0.05174293],
        [0.16373635],
        [0.08076031],
        [0.        ],
        [0.09755254],
        [0.        ],
        [0.21069625],
        [0.12067781],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.06381749],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.00910541],
        [0.02835681],
        [0.05480112],
        [0.        ],
        [0.        ],
        [0.        ],
        [0

In [28]:
from sklearn.metrics.pairwise import cosine_similarity

In [29]:
score = cosine_similarity(X,q).flatten()

In [30]:
import numpy as np

In [31]:
np.argsort(score)[-5:]

array([764,  27, 806, 577, 445])

In [32]:
df.iloc[0].text

"The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel."

In [33]:
fields = ['section', 'question', 'text']

In [34]:
# One independent TF-IDF vectorizer per searchable field ...
vectorizers = {
    field: TfidfVectorizer(stop_words='english', min_df=5)
    for field in fields
}

# ... and the matrix each of them produces when fitted on its own column.
matrices = {
    field: vectorizers[field].fit_transform(df[field])
    for field in fields
}
    

In [35]:
matrices

{'section': <948x66 sparse matrix of type '<class 'numpy.float64'>'
 	with 3090 stored elements in Compressed Sparse Row format>,
 'question': <948x291 sparse matrix of type '<class 'numpy.float64'>'
 	with 3431 stored elements in Compressed Sparse Row format>,
 'text': <948x1333 sparse matrix of type '<class 'numpy.float64'>'
 	with 23808 stored elements in Compressed Sparse Row format>}

In [36]:
n = len(df)

In [37]:
# Relative weight of each field in the combined score; fields not listed weigh 1.0.
boosts = {'question': 3, 'text': 0.5}

query = "Do I need to know python to sign up for the January course?"

# Start from zero and add, field by field, the boosted cosine similarity
# between the query and every document.
score = np.zeros(n)
for field in fields:
    query_vec = vectorizers[field].transform([query])
    field_score = cosine_similarity(matrices[field], query_vec).flatten()
    score = score + field_score * boosts.get(field, 1.0)


In [38]:
filters = {
    'course': 'data-engineering-zoomcamp'
}

In [39]:
# Multiply by a 0/1 indicator per filter so that every document failing
# any filter ends up with a score of zero.
for column, wanted in filters.items():
    keep = (df[column] == wanted).astype(int).values
    score = score * keep
score
    

array([1.71429811, 1.81276416, 1.46676396, 1.51895943, 1.83742874,
       1.81276416, 1.12836823, 1.88015199, 1.45142523, 1.81276416,
       1.64009203, 1.36833852, 0.49512426, 0.49512426, 0.49512426,
       0.5322813 , 0.49512426, 1.43618161, 2.31038823, 1.12387362,
       0.49512426, 0.52099572, 0.57699243, 1.28873661, 0.49512426,
       0.54390053, 0.49512426, 0.60047238, 0.55546316, 0.49512426,
       0.49512426, 0.49512426, 0.49512426, 1.07295977, 1.81276416,
       1.0334609 , 0.49512426, 0.49512426, 0.49512426, 0.49967696,
       0.50930266, 1.15263671, 0.49512426, 0.49512426, 0.        ,
       0.        , 0.        , 0.01234982, 0.02564693, 0.0300672 ,
       0.02626329, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.02084509, 0.        , 0.        , 0.        , 0.00376465,
       0.        , 0.        , 0.00985732, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [40]:
idx = np.argsort(-score)[:5]

In [41]:
df.iloc[idx]

Unnamed: 0,course,section,question,text
18,data-engineering-zoomcamp,General course-related questions,Leaderboard - I am not on the leaderboard / ho...,When you set up your account you are automatic...
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
5,data-engineering-zoomcamp,General course-related questions,Course - how many Zoomcamps in a year?,"There are 3 Zoom Camps in a year, as of 2024. ..."


In [42]:
class TextSearch:
    """TF-IDF text search over a collection of records.

    One TfidfVectorizer is fitted per text field. A query is compared with
    every field by cosine similarity, and the (optionally boosted) per-field
    similarities are summed into a single relevance score per record.
    """

    def __init__(self, text_fields):
        """Create an empty index.

        Parameters
        ----------
        text_fields : iterable of str
            Names of the record fields to vectorize and search.
        """
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params=None):
        """Index the given records.

        Parameters
        ----------
        records : anything accepted by ``pd.DataFrame``, e.g. a list of dicts
            The records to index; each must provide every field in ``text_fields``.
        vectorizer_params : dict, optional
            Keyword arguments forwarded to every ``TfidfVectorizer``.
        """
        # None rather than a mutable {} default, which would be shared between calls.
        params = {} if vectorizer_params is None else vectorizer_params
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**params)
            self.matrices[f] = cv.fit_transform(self.df[f])
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost=None, filters=None):
        """Return the records most similar to ``query``.

        Parameters
        ----------
        query : str
            Free-text query.
        n_results : int, default 10
            Maximum number of records to return.
        boost : dict, optional
            Maps a text field to the weight of its similarity in the total
            score; fields not listed use 1.0.
        filters : dict, optional
            Maps a column name to the exact value a record must have in that
            column to be eligible.

        Returns
        -------
        list of dict
            Up to ``n_results`` eligible records, best match first. Records
            rejected by a filter are never returned, so the list may be shorter
            than ``n_results``.
        """
        boost = {} if boost is None else boost
        filters = {} if filters is None else filters

        score = np.zeros(len(self.df))
        for f in self.text_fields:
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + boost.get(f, 1.0) * s

        # Track eligibility separately from the score. Zeroing the score of a
        # filtered-out record would still let it be returned whenever fewer than
        # n_results records match (or outrank matches with a negative score).
        keep = np.ones(len(self.df), dtype=bool)
        for field, value in filters.items():
            keep &= (self.df[field] == value).to_numpy()

        candidates = np.flatnonzero(keep)
        # Stable sort so that ties keep their original record order.
        order = np.argsort(-score[candidates], kind="stable")[:n_results]
        results = self.df.iloc[candidates[order]]
        return results.to_dict(orient='records')


In [43]:
index = TextSearch( 
    text_fields = ['section', 'question', 'text']
) 

In [44]:
index.fit(documents)

In [45]:
pd.DataFrame(index.search(
    query='I just singned up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
))

Unnamed: 0,text,section,question,course
0,"Yes, even if you don't register, you're still ...",General course-related questions,Course - Can I still join the course after the...,data-engineering-zoomcamp
1,The purpose of this document is to capture fre...,General course-related questions,Course - When will the course start?,data-engineering-zoomcamp
2,"Yes, we will keep all the materials after the ...",General course-related questions,Course - Can I follow the course after it fini...,data-engineering-zoomcamp
3,"No, late submissions are not allowed. But if t...",General course-related questions,Homework - Are late submissions of homework al...,data-engineering-zoomcamp
4,You can start by installing and setting up all...,General course-related questions,Course - What can I do before the course starts?,data-engineering-zoomcamp


In [46]:
from sklearn.decomposition import TruncatedSVD
X = matrices['text']
cv = vectorizers['text']


In [47]:
# TruncatedSVD uses a randomized solver by default; fix the seed so the embeddings
# (and every similarity computed from them) are reproducible across runs.
svd = TruncatedSVD(n_components=16, random_state=42)
X_emb = svd.fit_transform(X)

In [48]:
X_emb.shape

(948, 16)

In [49]:
X

<948x1333 sparse matrix of type '<class 'numpy.float64'>'
	with 23808 stored elements in Compressed Sparse Row format>

In [50]:
X_emb[0]

array([ 0.09653564, -0.08243188, -0.10198086, -0.0785878 ,  0.06986622,
       -0.06100586,  0.02118423, -0.16347753, -0.225107  ,  0.28338267,
        0.0628921 ,  0.06796645,  0.06489491, -0.09566487,  0.01932386,
        0.04758752])

In [51]:
query = 'I just singned up. Is it too late to join the course?'

Q = cv.transform([query])
Q_emb = svd.transform(Q)

In [52]:
Q_emb

array([[ 0.05790367, -0.03866924, -0.05660791, -0.02749105,  0.04199943,
        -0.06315435,  0.01022132, -0.10594982, -0.15532674,  0.1895775 ,
         0.04997699,  0.07337318,  0.05798845, -0.07268065,  0.03819224,
         0.03725183]])

In [53]:
np.dot(X_emb[0], Q_emb[0])


0.15107987153457617

In [54]:
# Rank all documents by cosine similarity to the query in the reduced SVD space.
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
# idx holds row positions produced by argsort, so select by position (iloc)
# rather than by index label (loc).
list(df.iloc[idx].text)


['Please choose the closest one to your answer. Also do not post your answer in the course slack channel.',
 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.',
 "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'If you have submitted two projects (and peer-reviewed at least 3 course-mates’ projects