In [56]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [57]:
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [58]:
import pandas as pd

df = pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [59]:
de_course = df[df.course == 'data-engineering-zoomcamp']
de_course.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


**Vector space** - defination: 

- turn the document into vectors.
- term-document matrix
    - rows: documents
    - columns: words/tokens.
- *bags of words*
    - word order is lost.
    - *sparse matrix*

In [60]:
from sklearn.feature_extraction.text import CountVectorizer

In [61]:
cv = CountVectorizer()

In [62]:
cv.fit(de_course.text)

In [63]:
cv.get_feature_names_out()

array(['00', '00021', '009s', ..., '要了解键盘快捷键', '要启用屏幕阅读器支持', '请按ctrl'],
      dtype=object)

In [64]:
documents_example = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

In [65]:
cv = CountVectorizer(stop_words='english')
cv.fit(documents_example)

In [66]:
cv.get_feature_names_out()

array(['15th', '2024', 'cloud', 'course', 'date', 'github', 'google',
       'homeworks', 'jan', 'listed', 'participation', 'prerequisites',
       'python', 'registration', 'required', 'setup', 'start', 'starts',
       'submit'], dtype=object)

In [67]:
x = cv.transform(documents_example)

In [68]:
pd.DataFrame(x.todense(), columns=cv.get_feature_names_out()).T

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


In [69]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english', min_df=5)
X = cv.fit_transform(de_course.text)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,425,426,427,428,429,430,431,432,433,434
01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
04,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
05,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,2
100,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yes,0,0,1,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
yml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,0
youtube,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
zip,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [99]:
from sklearn.feature_extraction.text import TfidfVectorizer
''' find out the value of the text.
'''
tf = TfidfVectorizer(stop_words='english', min_df = 5)
x = tf.fit_transform(de_course.text)
names = cv.get_feature_names_out()
df_docs = pd.DataFrame(x.toarray(), columns = cv.get_feature_names_out()).T

In [108]:
query = "Do I need to know python to sign up for the January course?"
q = cv.transform([query])

In [101]:
from sklearn.metrics.pairwise import cosine_similarity
score = cosine_similarity(x, q).flatten()

In [102]:
import numpy as np

In [103]:
socre_index = np.argsort(score)[-5:]
socre_index

array([ 11,   7,  27,   0, 398])

In [107]:
df.iloc[398]

course                              data-engineering-zoomcamp
section                                               Project
question                How to run python as start up script?
text        You need to redefine the python environment va...
Name: 398, dtype: object

In [112]:
query2 = "I just discovered the course, is it too late to join?"
q2 = cv.transform([query2])

In [113]:
score = cosine_similarity(x, q2).flatten()

In [114]:
socre_index = np.argsort(score)[-5:]
socre_index

array([ 3,  7, 27,  0, 22])

In [118]:
for index in socre_index: 
    print(df.iloc[index].text)
    print('---')

You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.
---
Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.
---
You can do most of the course without a cloud. Almost everything we use (excluding BigQuery) can be run locally. We won’t be able to provide guidelines for some things, but most of the materials are runnable without GCP.
For everything in the course, there’s a local alternative. You could even do the whole course locally.
---
The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The co

In [119]:
fields = ['section', 'question', 'text']

In [120]:
matrices = {}
vectorizes = {}

for field in fields:
    cv = TfidfVectorizer(stop_words='english', min_df=5)
    x  = cv.fit_transform(de_course[field])
    matrices[field] = x
    vectorizes[field] = cv

In [121]:
matrices

{'section': <435x36 sparse matrix of type '<class 'numpy.float64'>'
 	with 1561 stored elements in Compressed Sparse Row format>,
 'question': <435x135 sparse matrix of type '<class 'numpy.float64'>'
 	with 1613 stored elements in Compressed Sparse Row format>,
 'text': <435x726 sparse matrix of type '<class 'numpy.float64'>'
 	with 10493 stored elements in Compressed Sparse Row format>}

In [122]:
n = len(de_course)

In [123]:
query = "I just discovered the course, is it too late to join?"

In [124]:
score = np.zeros(n)

for f in fields:
    q = vectorizes[f].transform([query])
    x = matrices[f]

    f_score = cosine_similarity(x,q).flatten()

    score = score + f_score

In [125]:
idx = np.argsort(score)[-5:]

In [126]:
de_course.iloc[idx]

Unnamed: 0,course,section,question,text
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."


In [129]:
score = np.zeros(n)

""" giving question more priority and text less priority """ 
boosts = {
    "question": 3,
    "text": 0.5
}

for f in fields:
    q = vectorizes[f].transform([query])
    x = matrices[f]

    f_score = cosine_similarity(x,q).flatten()

    boost = boosts.get(f,1.0)

    score = score + f_score * boost 

In [132]:
idx = np.argsort(score)[-5:]
de_course.iloc[idx]

Unnamed: 0,course,section,question,text
5,data-engineering-zoomcamp,General course-related questions,Course - how many Zoomcamps in a year?,"There are 3 Zoom Camps in a year, as of 2024. ..."
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
34,data-engineering-zoomcamp,General course-related questions,How can we contribute to the course?,Star the repo! Share it with friends if you fi...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
