In [44]:
import pandas as pd
import numpy as np

In [2]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [3]:
# View the document sample
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
# Convert to DataFrame with the enforced column ordering
df = pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [5]:
df.tail()

Unnamed: 0,course,section,question,text
943,mlops-zoomcamp,Module 6: Best practices,Github actions: Permission denied error when e...,Problem description\nThis is the step in the c...
944,mlops-zoomcamp,Module 6: Best practices,Managing Multiple Docker Containers with docke...,Problem description\nWhen a docker-compose fil...
945,mlops-zoomcamp,Module 6: Best practices,AWS regions need to match docker-compose,Problem description\nIf you are having problem...
946,mlops-zoomcamp,Module 6: Best practices,Isort Pre-commit,Problem description\nPre-commit command was fa...
947,mlops-zoomcamp,Module 6: Best practices,How to destroy infrastructure created via GitH...,Problem description\nInfrastructure created in...


In [64]:
# We want to limit the documents to a particular Zoomcamp
df[df.course == "data-engineering-zoomcamp"]

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
...,...,...,...,...
430,data-engineering-zoomcamp,Workshop 2 - RisingWave,Unable to Open Dashboard as xdg-open doesn’t o...,Refer to the solution given in the first solut...
431,data-engineering-zoomcamp,Workshop 2 - RisingWave,Resolving Python Interpreter Path Inconsistenc...,Example Error:\nWhen attempting to execute a P...
432,data-engineering-zoomcamp,Workshop 2 - RisingWave,How does windowing work in Sql?,Ans : Windowing in streaming SQL involves defi...
433,data-engineering-zoomcamp,Triggers in Mage via CLI,"Encountering the error ""ModuleNotFoundError: N...","Python 3.12.1, is not compatible with kafka-py..."


# Implementing Text Search Engine

- We take a document that we have
- We convert the document into vector
  - Term-document matrix, where the rows is the document and columns is words/token (implemented in scikit-learn as `Countvectorizer`). This is an example Bag-of-words: only care about presence of the word, not the order of the word

In [19]:
docs_example = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)

names = cv.get_feature_names_out()
print(names)

df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs

['15th' '2024' 'cloud' 'course' 'date' 'github' 'google' 'homeworks' 'jan'
 'listed' 'participation' 'prerequisites' 'python' 'registration'
 'required' 'setup' 'start' 'starts' 'submit']


Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


In [21]:
X = cv.transform(docs_example)
X.todense()

matrix([[1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0],
        [0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0]])

In [22]:
pd.DataFrame(X.todense(), columns=cv.get_feature_names_out())

Unnamed: 0,15th,2024,cloud,course,date,github,google,homeworks,jan,listed,participation,prerequisites,python,registration,required,setup,start,starts,submit
0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1
3,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0
4,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0


This representation is called "bag of words" - here we ignore the order of words, just focus on the words themselves. In many cases this is sufficient and gives pretty good results already.

Another method is to use TF-IDF Vectorizer: Let's use it to the df.text 

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer(stop_words='english', min_df=3)
X = cv.fit_transform(df.text)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names)
df_docs.round(2)

Unnamed: 0,001,01,02,03,04,05,06,06_spark_sql,09,10,...,yesipov,yml,youtube,yyyy,zero,zip,zone,zones,zoom,zoomcamp
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.34
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00
944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00
945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00
946,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00


Here, the logic is: If I look into the query containing "yml" and "yml" is infrequent word in the documents, the more important it is to surface the document containing the word "yml". TF-IDF is a way to give importance to such rare word.

## Query-Document Similarity

In [33]:
query = "Do I need to know python to sign up for the January course?"

q = cv.transform([query])
q.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]], shape=(1, 2118))

In [34]:
query_dict = dict(zip(names, q.toarray()[0]))
query_dict

{'001': np.float64(0.0),
 '01': np.float64(0.0),
 '02': np.float64(0.0),
 '03': np.float64(0.0),
 '04': np.float64(0.0),
 '05': np.float64(0.0),
 '06': np.float64(0.0),
 '06_spark_sql': np.float64(0.0),
 '09': np.float64(0.0),
 '10': np.float64(0.0),
 '100': np.float64(0.0),
 '1000': np.float64(0.0),
 '11': np.float64(0.0),
 '12': np.float64(0.0),
 '127': np.float64(0.0),
 '13': np.float64(0.0),
 '14': np.float64(0.0),
 '15': np.float64(0.0),
 '16': np.float64(0.0),
 '17': np.float64(0.0),
 '18': np.float64(0.0),
 '19': np.float64(0.0),
 '1h': np.float64(0.0),
 '1st': np.float64(0.0),
 '1vqcwqatkjl07mtw': np.float64(0.0),
 '20': np.float64(0.0),
 '2019': np.float64(0.0),
 '2020': np.float64(0.0),
 '2021': np.float64(0.0),
 '2022': np.float64(0.0),
 '2023': np.float64(0.0),
 '2024': np.float64(0.0),
 '21': np.float64(0.0),
 '22': np.float64(0.0),
 '24': np.float64(0.0),
 '25': np.float64(0.0),
 '29': np.float64(0.0),
 '2nd': np.float64(0.0),
 '2pacx': np.float64(0.0),
 '30': np.float64(

In [35]:
doc_dict = dict(zip(names, X.toarray()[1]))
doc_dict

{'001': np.float64(0.0),
 '01': np.float64(0.0),
 '02': np.float64(0.0),
 '03': np.float64(0.0),
 '04': np.float64(0.0),
 '05': np.float64(0.0),
 '06': np.float64(0.0),
 '06_spark_sql': np.float64(0.0),
 '09': np.float64(0.0),
 '10': np.float64(0.0),
 '100': np.float64(0.0),
 '1000': np.float64(0.0),
 '11': np.float64(0.0),
 '12': np.float64(0.0),
 '127': np.float64(0.0),
 '13': np.float64(0.0),
 '14': np.float64(0.0),
 '15': np.float64(0.0),
 '16': np.float64(0.0),
 '17': np.float64(0.0),
 '18': np.float64(0.0),
 '19': np.float64(0.0),
 '1h': np.float64(0.0),
 '1st': np.float64(0.0),
 '1vqcwqatkjl07mtw': np.float64(0.0),
 '20': np.float64(0.0),
 '2019': np.float64(0.0),
 '2020': np.float64(0.0),
 '2021': np.float64(0.0),
 '2022': np.float64(0.0),
 '2023': np.float64(0.0),
 '2024': np.float64(0.0),
 '21': np.float64(0.0),
 '22': np.float64(0.0),
 '24': np.float64(0.0),
 '25': np.float64(0.0),
 '29': np.float64(0.0),
 '2nd': np.float64(0.0),
 '2pacx': np.float64(0.0),
 '30': np.float64(

In [38]:
# To get the similarity between query and documents, we calculate the dot product
# This is the cosine similarity
X.dot(q.T).todense()

matrix([[0.17877261],
        [0.        ],
        [0.        ],
        [0.05071309],
        [0.04289222],
        [0.        ],
        [0.        ],
        [0.12919035],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.14353115],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.05919232],
        [0.        ],
        [0.        ],
        [0.04666469],
        [0.05965079],
        [0.        ],
        [0.04673617],
        [0.16373635],
        [0.08076031],
        [0.        ],
        [0.09294288],
        [0.        ],
        [0.19242852],
        [0.10342148],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.06089334],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.00883201],
        [0.02613483],
        [0.05254717],
        [0.        ],
        [0.        ],
        [0.        ],
        [0

In [41]:
# In practice, we use cosine similarity from scikit-learn
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(X, q).flatten()

array([0.17877261, 0.        , 0.        , 0.05071309, 0.04289222,
       0.        , 0.        , 0.12919035, 0.        , 0.        ,
       0.        , 0.14353115, 0.        , 0.        , 0.        ,
       0.05919232, 0.        , 0.        , 0.04666469, 0.05965079,
       0.        , 0.04673617, 0.16373635, 0.08076031, 0.        ,
       0.09294288, 0.        , 0.19242852, 0.10342148, 0.        ,
       0.        , 0.        , 0.        , 0.06089334, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.00883201,
       0.02613483, 0.05254717, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.02271958, 0.05075072, 0.05894162,
       0.04341122, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.03377575, 0.        , 0.        , 0.        , 0.00748783,
       0.        , 0.        , 0.01893359, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [43]:
# Let's call it score
score = cosine_similarity(X, q).flatten()

In [45]:
# Now sort by the highest score of similarity
np.argsort(score)[-5:]

array([764, 398, 806, 577, 445])

In [54]:
# Inspect the text
df.iloc[np.argsort(score)[-5:][0]].text

'If you have submitted two projects (and peer-reviewed at least 3 course-mates’ projects for each submission), you will get the certificate for the course. According to the course coordinator, Alexey Grigorev, only two projects are needed to get the course certificate.\n(optional) David Odimegwu'

## Vectorizing the Whole Documents

In [74]:
fields = ['section', 'question', 'text']
transformers = {}
matrices = {}

for field in fields:
    cv = TfidfVectorizer(stop_words='english', min_df=3)
    X = cv.fit_transform(df[field])

    transformers[field] = cv
    matrices[field] = X

transformers['text'].get_feature_names_out()
matrices['text']

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 26463 stored elements and shape (948, 2118)>

In [75]:
# Now we have the transformers (vectorizers)...
transformers

{'section': TfidfVectorizer(min_df=3, stop_words='english'),
 'question': TfidfVectorizer(min_df=3, stop_words='english'),
 'text': TfidfVectorizer(min_df=3, stop_words='english')}

In [76]:
# ... and matrices 
matrices

{'section': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 3094 stored elements and shape (948, 67)>,
 'question': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 4333 stored elements and shape (948, 562)>,
 'text': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 26463 stored elements and shape (948, 2118)>}

## Search

In [77]:
n = len(df)
score = np.zeros(n)

query = "I just signed up. Is it too late to join the course?"

for f in fields:
    q = transformers[f].transform([query])
    X = matrices[f]

    f_score = cosine_similarity(X, q).flatten()
    score = score + f_score

In [78]:
score

array([1.68013868, 1.49512426, 1.23253339, 0.98009105, 1.49512426,
       1.49512426, 0.75042379, 1.42539637, 1.22089233, 1.49512426,
       1.14731396, 0.87896774, 0.49512426, 0.49512426, 0.49512426,
       0.73042693, 0.49512426, 0.89147327, 0.54107765, 0.49512426,
       0.49512426, 0.49512426, 0.72180425, 0.57465357, 0.49512426,
       0.49512426, 0.49512426, 0.68461965, 0.57823165, 0.49512426,
       0.49512426, 0.49512426, 0.49512426, 0.94668939, 1.49512426,
       0.90368553, 0.49512426, 0.49512426, 0.65996855, 0.51826746,
       0.5293658 , 1.02508298, 0.49512426, 0.52680125, 0.        ,
       0.        , 0.        , 0.        , 0.02097473, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.04722243, 0.        , 0.0073737 ,
       0.        , 0.        , 0.        , 0.        , 0.04161211,
       0.        , 0.        , 0.        , 0.        , 0.     

In [79]:
idx = np.argsort(score)[-10:]
df.iloc[idx]

Unnamed: 0,course,section,question,text
448,machine-learning-zoomcamp,General course-related questions,I’m new to Slack and can’t find the course cha...,Here’s how you join a in Slack: https://slack....
452,machine-learning-zoomcamp,General course-related questions,I just joined. What should I do next? How can ...,Welcome to the course! Go to the course page (...
449,machine-learning-zoomcamp,General course-related questions,The course has already started. Can I still jo...,"Yes, you can. You won’t be able to submit some..."
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
5,data-engineering-zoomcamp,General course-related questions,Course - how many Zoomcamps in a year?,"There are 3 Zoom Camps in a year, as of 2024. ..."
34,data-engineering-zoomcamp,General course-related questions,How can we contribute to the course?,Star the repo! Share it with friends if you fi...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
9,data-engineering-zoomcamp,General course-related questions,Course - Which playlist on YouTube should I re...,All the main videos are stored in the Main “DA...
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...


Now we see that the top 10 answer containing answer across course. I want to filter this result by the course name.

In [80]:
filters = {
    'course' : 'data-engineering-zoomcamp'
}

In [81]:
for field, value in filters.items():
    mask = (df[field] == value).astype(int).values
    score = score * mask

score

array([1.68013868, 1.49512426, 1.23253339, 0.98009105, 1.49512426,
       1.49512426, 0.75042379, 1.42539637, 1.22089233, 1.49512426,
       1.14731396, 0.87896774, 0.49512426, 0.49512426, 0.49512426,
       0.73042693, 0.49512426, 0.89147327, 0.54107765, 0.49512426,
       0.49512426, 0.49512426, 0.72180425, 0.57465357, 0.49512426,
       0.49512426, 0.49512426, 0.68461965, 0.57823165, 0.49512426,
       0.49512426, 0.49512426, 0.49512426, 0.94668939, 1.49512426,
       0.90368553, 0.49512426, 0.49512426, 0.65996855, 0.51826746,
       0.5293658 , 1.02508298, 0.49512426, 0.52680125, 0.        ,
       0.        , 0.        , 0.        , 0.02097473, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.04722243, 0.        , 0.0073737 ,
       0.        , 0.        , 0.        , 0.        , 0.04161211,
       0.        , 0.        , 0.        , 0.        , 0.     

In [83]:
# So we can show filtered results
idx = np.argsort(score)[-10:]
df.iloc[idx]

Unnamed: 0,course,section,question,text
10,data-engineering-zoomcamp,General course-related questions,Course - ​​How many hours per week am I expect...,It depends on your background and previous exp...
8,data-engineering-zoomcamp,General course-related questions,Course - Can I get support if I take the cours...,"Yes, the slack channel remains open and you ca..."
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
34,data-engineering-zoomcamp,General course-related questions,How can we contribute to the course?,Star the repo! Share it with friends if you fi...
9,data-engineering-zoomcamp,General course-related questions,Course - Which playlist on YouTube should I re...,All the main videos are stored in the Main “DA...
5,data-engineering-zoomcamp,General course-related questions,Course - how many Zoomcamps in a year?,"There are 3 Zoom Camps in a year, as of 2024. ..."
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...


## Boosting and Filtering

Since we are nature of the query is closer to the `question` field, we can add "boost" factor to the `question` field, in order to prioritize documents with matching `question` from the query

In [87]:
boost = {'question': 3.0}

score = np.zeros(len(df))

for f in fields:
    b = boost.get(f, 1.0)
    q = transformers[f].transform([query])
    s = cosine_similarity(matrices[f], q).flatten()
    score = score + b * s

score

array([3.38295811, 3.49512426, 2.70735166, 1.68424985, 3.49512426,
       3.49512426, 1.26102286, 3.03149832, 2.67242848, 3.49512426,
       2.45169338, 1.43004364, 0.49512426, 0.49512426, 0.49512426,
       0.73042693, 0.49512426, 1.68417129, 0.54107765, 0.49512426,
       0.49512426, 0.49512426, 0.72180425, 0.57465357, 0.49512426,
       0.49512426, 0.49512426, 0.68461965, 0.57823165, 0.49512426,
       0.49512426, 0.49512426, 0.49512426, 1.77533273, 3.49512426,
       1.72080809, 0.49512426, 0.49512426, 0.65996855, 0.51826746,
       0.5293658 , 1.981508  , 0.49512426, 0.52680125, 0.        ,
       0.        , 0.        , 0.        , 0.02097473, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.04722243, 0.        , 0.0073737 ,
       0.        , 0.        , 0.        , 0.        , 0.04161211,
       0.        , 0.        , 0.        , 0.        , 0.     

In [88]:
# So we can show filtered results

filters = {
    'course' : 'data-engineering-zoomcamp'
}

for field, value in filters.items():
    mask = (df[field] == value).astype(int).values
    score = score * mask

score

idx = np.argsort(score)[-10:]
df.iloc[idx]

Unnamed: 0,course,section,question,text
10,data-engineering-zoomcamp,General course-related questions,Course - ​​How many hours per week am I expect...,It depends on your background and previous exp...
8,data-engineering-zoomcamp,General course-related questions,Course - Can I get support if I take the cours...,"Yes, the slack channel remains open and you ca..."
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
34,data-engineering-zoomcamp,General course-related questions,How can we contribute to the course?,Star the repo! Share it with friends if you fi...
9,data-engineering-zoomcamp,General course-related questions,Course - Which playlist on YouTube should I re...,All the main videos are stored in the Main “DA...
5,data-engineering-zoomcamp,General course-related questions,Course - how many Zoomcamps in a year?,"There are 3 Zoom Camps in a year, as of 2024. ..."
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...


## Putting it All Together

In [89]:
class TextSearch:

    def __init__(self, text_fields):
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params={}):
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost={}, filters={}):
        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        for field, value in filters.items():
            mask = (self.df[field] == value).values
            score = score * mask

        idx = np.argsort(-score)[:n_results]
        results = self.df.iloc[idx]
        return results.to_dict(orient='records')

In [91]:
index = TextSearch(
    text_fields=['section', 'question', 'text']
)
index.fit(documents)

index.search(
    query='I just signed up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin

# Embeddings and Vector Search

## Singular Value Decomposition

In [92]:
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 26463 stored elements and shape (948, 2118)>

In [93]:
from sklearn.decomposition import TruncatedSVD

X = matrices['text']
cv = transformers['text']

svd = TruncatedSVD(n_components=16)
X_emb = svd.fit_transform(X)

X_emb[0]

array([ 0.08800159, -0.07501303, -0.10073182,  0.05108478,  0.04995078,
       -0.05799374,  0.02480998,  0.04924813, -0.19336232,  0.32874571,
        0.05343162,  0.09926647, -0.14877072,  0.04979594, -0.01836442,
        0.02298565])

In [96]:
# Embedding from the original 2118 columns into 16 columns
X_emb.shape

(948, 16)

In [98]:
X_emb

array([[ 0.08800159, -0.07501303, -0.10073182, ...,  0.04979594,
        -0.01836442,  0.02298565],
       [ 0.12424586, -0.16335016, -0.22929235, ..., -0.02200407,
         0.11278409, -0.03503056],
       [ 0.03671008, -0.03682767, -0.03797144, ..., -0.00244113,
        -0.00231711,  0.01377815],
       ...,
       [ 0.23299289,  0.17904275, -0.0871594 , ..., -0.06630138,
        -0.02179186,  0.01617854],
       [ 0.16594629, -0.01881518,  0.07586982, ..., -0.07247091,
         0.18039587,  0.01178121],
       [ 0.15287567, -0.02588326, -0.02129166, ..., -0.04168962,
         0.07602417, -0.05052771]], shape=(948, 16))

In [109]:
# For query, do the same

query = 'I just signed up. Is it too late to join the course?'

Q = cv.transform([query])
Q_emb = svd.transform(Q)
Q_emb[0]

array([ 0.04353618, -0.03069201, -0.04407726,  0.01168237,  0.02507317,
       -0.05026926,  0.01295348,  0.02969973, -0.11436225,  0.18170732,
        0.04289212,  0.07253724, -0.08051859,  0.03614361, -0.03452402,
        0.02880554])

In [110]:
# Let's do cosine similarity for one document
np.dot(X_emb[0], Q_emb[0])

np.float64(0.12353799068119138)

In [111]:
# And do this for the whole documents
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
list(df.loc[idx].text)

['If you have submitted two projects (and peer-reviewed at least 3 course-mates’ projects for each submission), you will get the certificate for the course. According to the course coordinator, Alexey Grigorev, only two projects are needed to get the course certificate.\n(optional) David Odimegwu',
 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
 "No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.",
 'Please choose the closest one to y

## Non-Negative Matrix Factorization

In [116]:
from sklearn.decomposition import NMF as nmf

nmf = NMF(n_components=16)
X_emb = nmf.fit_transform(X)

X_emb[0]



array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 2.41183096e-05, 0.00000000e+00,
       0.00000000e+00, 3.14557342e-01, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

In [117]:
Q = cv.transform([query])
Q_emb = nmf.transform(Q)
Q_emb[0]

array([0.        , 0.00086105, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.17744803,
       0.        , 0.        , 0.        , 0.        , 0.0007611 ,
       0.        ])

In [118]:
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
list(df.loc[idx].text)

['Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.',
 'No, it’s not possible. The form is closed after the due date. But don’t worry, homework is not mandatory for finishing the course.',
 'If you have submitted two projects (and peer-reviewed at least 3 course-mates’ projects for each submission), you will get the certificate for the cour

## BERT

In [119]:
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # Set the model to evaluation mode if not training

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [120]:
texts = [
    "Yes, we will keep all the materials after the course finishes.",
    "You can follow the course at your own pace after it finishes"
]
encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

In [121]:
encoded_input

{'input_ids': tensor([[  101,  2748,  1010,  2057,  2097,  2562,  2035,  1996,  4475,  2044,
          1996,  2607, 12321,  1012,   102],
        [  101,  2017,  2064,  3582,  1996,  2607,  2012,  2115,  2219,  6393,
          2044,  2009, 12321,   102,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}

In [122]:
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model(**encoded_input)
    hidden_states = outputs.last_hidden_state

In [123]:
hidden_states

tensor([[[ 1.0103e-01,  1.8106e-02,  1.3034e-01,  ..., -2.9319e-01,
           1.8632e-01,  6.6145e-01],
         [ 1.0608e+00, -1.2425e-01,  1.3701e-01,  ..., -1.6050e-01,
           1.0429e+00,  3.5325e-01],
         [ 1.8022e-01,  7.7588e-02,  3.9414e-01,  ..., -1.3787e-01,
           5.9744e-01,  1.7035e-01],
         ...,
         [ 4.7383e-01, -1.8445e-02,  2.1863e-01,  ..., -1.2885e-03,
          -8.3294e-02, -2.1699e-01],
         [ 6.5164e-01,  1.2163e-01, -2.4941e-01,  ...,  1.5567e-01,
          -5.6319e-01, -4.3100e-01],
         [ 7.1638e-01,  2.1572e-01, -2.8087e-02,  ...,  2.2812e-01,
          -6.7250e-01, -3.2448e-01]],

        [[ 3.1965e-01, -2.4620e-01,  1.9934e-01,  ..., -2.4255e-01,
          -1.0942e-01,  5.8847e-01],
         [-6.9823e-01, -7.5619e-01,  1.0645e-01,  ..., -1.1348e-01,
           4.5499e-01,  4.0241e-01],
         [ 4.1643e-01, -4.7885e-01,  3.2889e-01,  ..., -5.1462e-01,
           6.5775e-02,  6.9717e-01],
         ...,
         [ 1.0277e-01, -8

In [124]:
hidden_states.shape

torch.Size([2, 15, 768])

In [125]:
# Compress the embedding by taking the average
sentence_embeddings = hidden_states.mean(dim=1)
sentence_embeddings.shape

torch.Size([2, 768])

In [127]:
X_emb = sentence_embeddings.numpy()
X_emb

array([[ 0.35999236, -0.16072296,  0.35452363, ...,  0.04289254,
         0.03482299, -0.03822253],
       [ 0.17849933, -0.5000251 ,  0.25277558, ..., -0.11413126,
        -0.33608493,  0.41095155]], shape=(2, 768), dtype=float32)