In [55]:
import pandas as pd
import requests
import numpy as np

## Download the data

In [3]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [8]:
documents_df = pd.DataFrame(documents, columns = ['course', 'section', 'question', 'text'])

In [9]:
documents_df

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
...,...,...,...,...
943,mlops-zoomcamp,Module 6: Best practices,Github actions: Permission denied error when e...,Problem description\nThis is the step in the c...
944,mlops-zoomcamp,Module 6: Best practices,Managing Multiple Docker Containers with docke...,Problem description\nWhen a docker-compose fil...
945,mlops-zoomcamp,Module 6: Best practices,AWS regions need to match docker-compose,Problem description\nIf you are having problem...
946,mlops-zoomcamp,Module 6: Best practices,Isort Pre-commit,Problem description\nPre-commit command was fa...


In [10]:
# CountVectorizer turns text into vectors
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
# only the words that appear in at least 5 documents - min_df = 5
cv = CountVectorizer(min_df = 5)

In [17]:
# look at what words in our data set exist
cv.fit(documents_df.text)

In [19]:
# check how many distinct tokens/features there are in the dataset
# cv.get_feature_names_out().shape

# check the tokens
cv.get_feature_names_out()

array(['01', '02', '03', ..., 'youtube', 'zip', 'zoomcamp'], dtype=object)

## Vectorization example

How CountVectorizer turns text into vectors

In [21]:
docs_example = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

In [29]:
cv_ex = CountVectorizer(stop_words='english')   # remove the stop words
cv_ex.fit(docs_example)

In [30]:
cv_ex.get_feature_names_out()

array(['15th', '2024', 'cloud', 'course', 'date', 'github', 'google',
       'homeworks', 'jan', 'listed', 'participation', 'prerequisites',
       'python', 'registration', 'required', 'setup', 'start', 'starts',
       'submit'], dtype=object)

In [31]:
# transform documents into matrix
X = cv_ex.transform(docs_example)
X

<5x19 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [32]:
# see what is inside the matrix
X.todense()

matrix([[1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0],
        [0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0]])

In [33]:
# see which words/features are in which document, transposed for easier reading
pd.DataFrame(X.todense(), columns=cv_ex.get_feature_names_out()).T

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


The above is the **bag of words**.
- doens't take into consideration order of words inside the document (we lose this information with this kind of representation)
- in many cases this is sufficient and gives pretty good results already

# TF-IDF

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
# tf-idf takes into consideration how often the word appears within the documents - the often it appears, the less important it is

cv = TfidfVectorizer(stop_words='english', min_df=5)
X = cv.fit_transform(documents_df.text)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names)
# df_docs.round(2)

We will use the TF-IDF for scoring. We turn a document in vectors and each term in the set of documents has a weight, the higher the frequency, the smaller the weight/importance.

In [45]:
X

<948x1333 sparse matrix of type '<class 'numpy.float64'>'
	with 23808 stored elements in Compressed Sparse Row format>

In [46]:
query = "Do I need to know python to sign up for the January course?"

q =  cv.transform([query])      # apply our vectorizer to turn the query into a documtent
q.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

See which of them are non-zero, meaning

In [47]:
query_dict = dict(zip(names, q.toarray()[0]))
query_dict

{'00': 0.0,
 '00000000e': 0.0,
 '0002': 0.0,
 '00021': 0.0,
 '001': 0.0,
 '009s': 0.0,
 '01': 0.0,
 '02': 0.0,
 '020': 0.0,
 '028879': 0.0,
 '02d': 0.0,
 '03': 0.0,
 '0315': 0.0,
 '04': 0.0,
 '04d': 0.0,
 '05': 0.0,
 '051': 0.0,
 '054': 0.0,
 '06': 0.0,
 '06_spark_sql': 0.0,
 '07': 0.0,
 '07cd': 0.0,
 '08': 0.0,
 '09': 0.0,
 '0ms': 0.0,
 '0x3c947bc5': 0.0,
 '0x7efe331cf790': 0.0,
 '0x7f797010a590': 0.0,
 '0x7fbaf2666280': 0.0,
 '0x800701bc': 0.0,
 '0xa0': 0.0,
 '0xff': 0.0,
 '0zw04wdetqo': 0.0,
 '10': 0.0,
 '100': 0.0,
 '1000': 0.0,
 '100000': 0.0,
 '100k': 0.0,
 '100m': 0.0,
 '100mb': 0.0,
 '101': 0.0,
 '1010101': 0.0,
 '1049089': 0.0,
 '1053': 0.0,
 '107': 0.0,
 '1078': 0.0,
 '10gb': 0.0,
 '11': 0.0,
 '111': 0.0,
 '111111111': 0.0,
 '1111111111': 0.0,
 '1128': 0.0,
 '114': 0.0,
 '1186': 0.0,
 '12': 0.0,
 '120': 0.0,
 '121': 0.0,
 '122': 0.0,
 '1221': 0.0,
 '123': 0.0,
 '1234': 0.0,
 '124': 0.0,
 '126': 0.0,
 '127': 0.0,
 '128': 0.0,
 '13': 0.0,
 '130': 0.0,
 '131743': 0.0,
 '132': 0.

In [48]:
doc_dict = dict(zip(names, X.toarray()[1]))
doc_dict

{'00': 0.0,
 '00000000e': 0.0,
 '0002': 0.0,
 '00021': 0.0,
 '001': 0.0,
 '009s': 0.0,
 '01': 0.0,
 '02': 0.0,
 '020': 0.0,
 '028879': 0.0,
 '02d': 0.0,
 '03': 0.0,
 '0315': 0.0,
 '04': 0.0,
 '04d': 0.0,
 '05': 0.0,
 '051': 0.0,
 '054': 0.0,
 '06': 0.0,
 '06_spark_sql': 0.0,
 '07': 0.0,
 '07cd': 0.0,
 '08': 0.0,
 '09': 0.0,
 '0ms': 0.0,
 '0x3c947bc5': 0.0,
 '0x7efe331cf790': 0.0,
 '0x7f797010a590': 0.0,
 '0x7fbaf2666280': 0.0,
 '0x800701bc': 0.0,
 '0xa0': 0.0,
 '0xff': 0.0,
 '0zw04wdetqo': 0.0,
 '10': 0.0,
 '100': 0.0,
 '1000': 0.0,
 '100000': 0.0,
 '100k': 0.0,
 '100m': 0.0,
 '100mb': 0.0,
 '101': 0.0,
 '1010101': 0.0,
 '1049089': 0.0,
 '1053': 0.0,
 '107': 0.0,
 '1078': 0.0,
 '10gb': 0.0,
 '11': 0.0,
 '111': 0.0,
 '111111111': 0.0,
 '1111111111': 0.0,
 '1128': 0.0,
 '114': 0.0,
 '1186': 0.0,
 '12': 0.0,
 '120': 0.0,
 '121': 0.0,
 '122': 0.0,
 '1221': 0.0,
 '123': 0.0,
 '1234': 0.0,
 '124': 0.0,
 '126': 0.0,
 '127': 0.0,
 '128': 0.0,
 '13': 0.0,
 '130': 0.0,
 '131743': 0.0,
 '132': 0.

##### How do we do the search?

If a document and a query contain the same words, the docuemtn is relevant to the query and because we have the tf-idf weights, we know which documents are more improtant for this query. Then we can rank our documents according to the criteria: we can multiply the weight of the term in the query with the weight of the term in the document and sum across all the matching terms (TODO: search for proper explanation)

This will give us a measure of similarity - how similar the query is for the document. If we do this for all documents, we can score all the documents and we can rank the docuemtns by the relevance score.

This multiplying and summing is called dot product.

In [51]:
X.dot(q.T).todense() # computing the similarity between a document and a vector // cosine similarity

matrix([[0.19464486],
        [0.        ],
        [0.        ],
        [0.06011641],
        [0.04932915],
        [0.        ],
        [0.        ],
        [0.13477565],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.15899187],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.07431408],
        [0.        ],
        [0.        ],
        [0.05779673],
        [0.07243428],
        [0.        ],
        [0.05174293],
        [0.16373635],
        [0.08076031],
        [0.        ],
        [0.09755254],
        [0.        ],
        [0.21069625],
        [0.12067781],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.06381749],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.00910541],
        [0.02835681],
        [0.05480112],
        [0.        ],
        [0.        ],
        [0.        ],
        [0

In [54]:
from sklearn.metrics.pairwise import cosine_similarity

# computing the similarity between a document and a vector, with the builtin function
# .flatten() transforms the 1D matrix to a vector
score = cosine_similarity(X, q).flatten()

We can sort this vector. When we sort, we will pick the documents with the highest score

In [57]:
np.argsort(score)[-5:]

array([764,  27, 806, 577, 445])

In [58]:
documents_df.iloc[445].text

'Check this article. If you know everything in this article, you know enough. If you don’t, read the article and join the coursIntroduction to Pythone too :)\nIntroduction to Python – Machine Learning Bookcamp\nYou can follow this English course from the OpenClassrooms e-learning platform, which is free and covers the python basics for data analysis: Learn Python Basics for Data Analysis - OpenClassrooms . It is important to know some basics such as: how to run a Jupyter notebook, how to import libraries (and what libraries are), how to declare a variable (and what variables are) and some important operations regarding data analysis.\n(Mélanie Fouesnard)'

In [59]:
matrices = {}
vectorizers = {}
fields = ['section', 'question', 'text']

# go across all fields and fit a count vectorizer to that field

for f in fields: 
    cv = TfidfVectorizer(stop_words='english', min_df=5)        # create vectorizer
    X = cv.fit_transform(documents_df[f])                       # fit it
    matrices[f] = X
    vectorizers[f] = cv

In [60]:
matrices

{'section': <948x66 sparse matrix of type '<class 'numpy.float64'>'
 	with 3090 stored elements in Compressed Sparse Row format>,
 'question': <948x291 sparse matrix of type '<class 'numpy.float64'>'
 	with 3431 stored elements in Compressed Sparse Row format>,
 'text': <948x1333 sparse matrix of type '<class 'numpy.float64'>'
 	with 23808 stored elements in Compressed Sparse Row format>}

In [86]:
# look across all fields

n = len(documents_df)

score = np.zeros(n)
total_score = np.zeros(n)
query = "I just discovered the course, is it too late to join?"

for f in fields:
    q = vectorizers[f].transform([query])       # the representation of the query for this particular field
    X = matrices[f]

    f_score = cosine_similarity(X, q).flatten()

    total_score = score + f_score

In [87]:
# add filter for the course field, to get answers only from desired course
filters = {
    'course': 'data-engineering-zoomcamp'
}

In [88]:
for field, value in filters.items():
    mask = (documents_df[field] == value).astype(int).values
    total_score = total_score * mask

total_score

array([0.48049682, 0.        , 0.        , 0.2083882 , 0.        ,
       0.        , 0.        , 0.17557272, 0.        , 0.        ,
       0.        , 0.15870689, 0.        , 0.        , 0.        ,
       0.09680922, 0.        , 0.        , 0.07529201, 0.        ,
       0.        , 0.        , 0.29986763, 0.10520675, 0.        ,
       0.        , 0.        , 0.27447476, 0.12828407, 0.        ,
       0.        , 0.        , 0.        , 0.05163407, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.03156309,
       0.04914818, 0.07138962, 0.        , 0.04329773, 0.        ,
       0.        , 0.        , 0.        , 0.02804374, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.06739038, 0.        , 0.00980845,
       0.        , 0.        , 0.        , 0.        , 0.05820102,
       0.        , 0.        , 0.        , 0.        , 0.     

In [89]:
idx = np.argsort(total_score)[-5:]
documents_df.iloc[idx]

Unnamed: 0,course,section,question,text
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
287,data-engineering-zoomcamp,Module 4: analytics engineering with dbt,CREATE TABLE has columns with duplicate name l...,This error could result if you are using some ...
27,data-engineering-zoomcamp,General course-related questions,Environment - The GCP and other cloud provider...,You can do most of the course without a cloud....
22,data-engineering-zoomcamp,General course-related questions,Environment - Do we really have to use GitHub ...,It's up to you which platform and environment ...
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...


#### This is in-memory search - all the vectors and matrices are kept in memory.