In [1]:
import openai

In [2]:
from openai import OpenAI

In [3]:
client = OpenAI()

In [1]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [2]:
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
import pandas as pd

In [5]:
df = pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])

In [6]:
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [7]:
df.tail()

Unnamed: 0,course,section,question,text
943,mlops-zoomcamp,Module 6: Best practices,Github actions: Permission denied error when e...,Problem description\nThis is the step in the c...
944,mlops-zoomcamp,Module 6: Best practices,Managing Multiple Docker Containers with docke...,Problem description\nWhen a docker-compose fil...
945,mlops-zoomcamp,Module 6: Best practices,AWS regions need to match docker-compose,Problem description\nIf you are having problem...
946,mlops-zoomcamp,Module 6: Best practices,Isort Pre-commit,Problem description\nPre-commit command was fa...
947,mlops-zoomcamp,Module 6: Best practices,How to destroy infrastructure created via GitH...,Problem description\nInfrastructure created in...


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
cv = CountVectorizer(stop_words='english')

In [10]:
doc_examples = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

In [22]:
cv.fit(doc_examples)

In [23]:
cv.get_feature_names_out()

array(['15th', '2024', 'cloud', 'course', 'date', 'github', 'google',
       'homeworks', 'jan', 'listed', 'participation', 'prerequisites',
       'python', 'registration', 'required', 'setup', 'start', 'starts',
       'submit'], dtype=object)

In [24]:
X = cv.transform(doc_examples)

In [25]:
X.todense()

matrix([[1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0],
        [0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0]])

In [26]:
sample_vector_df = pd.DataFrame(X.todense(), columns=cv.get_feature_names_out())

In [27]:
sample_vector_df.T

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


In [93]:
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer(stop_words='english', min_df=5)
X = cv.fit_transform(df.text)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names)
df_docs.round(2)

Unnamed: 0,01,02,03,04,05,06,09,10,100,11,...,y_val,yaml,year,yellow,yellow_tripdata_2021,yes,yml,youtube,zip,zoomcamp
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.00
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.43
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.28,0.00,0.0,0.0,0.00
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.00
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.11,0.0,0.0,0.00
944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.00
945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.17,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.00
946,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.00


In [9]:
query = "Do I need to know python to sign up for the January course?"

q = cv.transform([query])
q.toarray()

NameError: name 'cv' is not defined

In [32]:
X

<948x1333 sparse matrix of type '<class 'numpy.float64'>'
	with 23808 stored elements in Compressed Sparse Row format>

In [34]:
X.dot(q.T).todense()

matrix([[0.19464486],
        [0.        ],
        [0.        ],
        [0.06011641],
        [0.04932915],
        [0.        ],
        [0.        ],
        [0.13477565],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.15899187],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.07431408],
        [0.        ],
        [0.        ],
        [0.05779673],
        [0.07243428],
        [0.        ],
        [0.05174293],
        [0.16373635],
        [0.08076031],
        [0.        ],
        [0.09755254],
        [0.        ],
        [0.21069625],
        [0.12067781],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.06381749],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.00910541],
        [0.02835681],
        [0.05480112],
        [0.        ],
        [0.        ],
        [0.        ],
        [0

In [36]:
q.T.todense()

matrix([[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]])

In [37]:
X.todense()

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.42896052],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.14842753]])

In [38]:
names

array(['01', '02', '03', ..., 'youtube', 'zip', 'zoomcamp'], dtype=object)

In [44]:
cv.get_feature_names_out()

array(['01', '02', '03', ..., 'youtube', 'zip', 'zoomcamp'], dtype=object)

In [46]:
q.T.todense()

matrix([[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]])

In [48]:
question_col_weights_df = pd.DataFrame(q.todense(), columns=names)

In [63]:
question_col_weights_df.iloc[0][question_col_weights_df.iloc[0] > 0]

course    0.381482
know      0.560827
need      0.297968
python    0.314414
sign      0.593552
Name: 0, dtype: float64

In [64]:
question_col_weights_df

Unnamed: 0,01,02,03,04,05,06,09,10,100,11,...,y_val,yaml,year,yellow,yellow_tripdata_2021,yes,yml,youtube,zip,zoomcamp
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
X.toarray()[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [69]:
df

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
...,...,...,...,...
943,mlops-zoomcamp,Module 6: Best practices,Github actions: Permission denied error when e...,Problem description\nThis is the step in the c...
944,mlops-zoomcamp,Module 6: Best practices,Managing Multiple Docker Containers with docke...,Problem description\nWhen a docker-compose fil...
945,mlops-zoomcamp,Module 6: Best practices,AWS regions need to match docker-compose,Problem description\nIf you are having problem...
946,mlops-zoomcamp,Module 6: Best practices,Isort Pre-commit,Problem description\nPre-commit command was fa...


In [77]:
not_tfidf_cv = CountVectorizer(stop_words='english')

In [79]:
not_tfidf_cv.fit_transform([query]).todense()

matrix([[1, 1, 1, 1, 1, 1]])

In [76]:
not_tfidf_cv.get_feature_names_out()

array(['course', 'do', 'for', 'january', 'know', 'need', 'python', 'sign',
       'the', 'to', 'up'], dtype=object)

In [80]:
not_tfidf_cv.get_feature_names_out()

array(['course', 'january', 'know', 'need', 'python', 'sign'],
      dtype=object)

In [7]:
query = "Do I need to know docker to sign up for the course"

In [83]:
not_tfidf_cv.transform([query]).todense()

matrix([[1, 0, 1, 1, 0, 1]])

In [84]:
cv

In [10]:
new_tfidf_cv = TfidfVectorizer(stop_words='english')

In [11]:
query_1 = "Do I need to know python to sign up for the January course?"

In [12]:
query_2 = "Do I need to know docker to sign up for the course"

In [16]:
new_X = new_tfidf_cv.fit_transform([query_1]).todense()

In [17]:
new_X_wo_dense = new_tfidf_cv.fit_transform([query_1])

In [93]:
new_tfidf_cv.get_feature_names_out()

array(['course', 'january', 'know', 'need', 'python', 'sign'],
      dtype=object)

In [14]:
new_q = new_tfidf_cv.transform([query_2]).todense()

In [18]:
new_q_wo_dense = new_tfidf_cv.transform([query_2])

In [19]:
new_X.dot(new_q.T)

matrix([[0.81649658]])

In [106]:
query_4 = "I need to know llms"

In [103]:
new_q3 = new_tfidf_cv.transform([query_3]).todense()

In [107]:
new_q4 = new_tfidf_cv.transform([query_4]).todense()

In [108]:
new_X.dot(new_q4.T)

matrix([[0.57735027]])

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
f_score = cosine_similarity(new_X_wo_dense, new_q_wo_dense).flatten()

In [23]:
import numpy as np

In [24]:
score = np.zeros(10)

In [25]:
score

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [27]:
score += f_score

In [28]:
score

array([0.81649658, 0.81649658, 0.81649658, 0.81649658, 0.81649658,
       0.81649658, 0.81649658, 0.81649658, 0.81649658, 0.81649658])

In [29]:
f_score

array([0.81649658])

In [34]:
new_array = np.array([0.568])

In [36]:
score += new_array

In [38]:
f_score_whole = cosine_similarity(X, q).flatten()

In [42]:
idx = np.argsort(f_score_whole)[-5:]

In [44]:
df.iloc[idx]

Unnamed: 0,course,section,question,text
764,machine-learning-zoomcamp,Projects (Midterm and Capstone),What If I submitted only two projects and fail...,If you have submitted two projects (and peer-r...
27,data-engineering-zoomcamp,General course-related questions,Environment - The GCP and other cloud provider...,You can do most of the course without a cloud....
806,machine-learning-zoomcamp,Miscellaneous,"Can I do the course in other languages, like R...","Technically, yes. Advisable? Not really. Reaso..."
577,machine-learning-zoomcamp,4. Evaluation Metrics for Classification,How to find the intercept between precision an...,You can find the intercept between these two c...
445,machine-learning-zoomcamp,General course-related questions,How much Python should I know?,Check this article. If you know everything in ...


In [10]:
query = "Do I need to know python to sign up for the January course?"

In [11]:
query

'Do I need to know python to sign up for the January course?'

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
fields = ['section', 'question', 'text']

In [17]:
matrices = {}
vectorizers = {}

for f in fields:
    cv = TfidfVectorizer(stop_words='english', min_df=5)
    X = cv.fit_transform(df.text)

    matrices[f] = X
    vectorizers[f] = cv

In [18]:
matrices, vectorizers

({'section': <948x1333 sparse matrix of type '<class 'numpy.float64'>'
  	with 23808 stored elements in Compressed Sparse Row format>,
  'question': <948x1333 sparse matrix of type '<class 'numpy.float64'>'
  	with 23808 stored elements in Compressed Sparse Row format>,
  'text': <948x1333 sparse matrix of type '<class 'numpy.float64'>'
  	with 23808 stored elements in Compressed Sparse Row format>},
 {'section': TfidfVectorizer(min_df=5, stop_words='english'),
  'question': TfidfVectorizer(min_df=5, stop_words='english'),
  'text': TfidfVectorizer(min_df=5, stop_words='english')})

In [19]:
len(df)

948

In [21]:
import numpy as np

In [22]:
n = len(df)
score = np.zeros(n)

In [24]:
n = len(df)

score = np.zeros(n)
query = "I just discovered the course, is it too late to join?"

boosts = {
    'question': 3,
    'text': 0.5
}

for f in fields:

    q = vectorizers[f].transform([query])
    X = matrices[f]

    f_score = cosine_similarity(X, q).flatten()
    boost_value = boosts.get(f, 1.0)

    score = score + boost_value * f_score
    print(f'field_value = {f}, boost_value = {boost_value}, score value = {score}')

field_value = section, boost_value = 1.0, score value = [0.48049682 0.         0.         0.2083882  0.         0.
 0.         0.17557272 0.         0.         0.         0.15870689
 0.         0.         0.         0.09680922 0.         0.
 0.07529201 0.         0.         0.         0.29986763 0.10520675
 0.         0.         0.         0.27447476 0.12828407 0.
 0.         0.         0.         0.05163407 0.         0.
 0.         0.         0.         0.03156309 0.04914818 0.07138962
 0.         0.04329773 0.         0.         0.         0.
 0.02804374 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.06739038 0.         0.00980845 0.
 0.         0.         0.         0.05820102 0.         0.
 0.         0.         0.         0.05020173 0.         0.
 0.0605701  0.         0.         0.         0.         0.05101782
 0.         0.         0.04327901 0.         0.03089898 0.
 0.         0.05106564 0.  

In [29]:
matrices['section'].todense()

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.42896052],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.14842753]])

In [75]:
result_idx = np.argsort(score)[-5:]

In [76]:
df.iloc[result_idx]

Unnamed: 0,course,section,question,text
22,data-engineering-zoomcamp,General course-related questions,Environment - Do we really have to use GitHub ...,It's up to you which platform and environment ...
448,machine-learning-zoomcamp,General course-related questions,I’m new to Slack and can’t find the course cha...,Here’s how you join a in Slack: https://slack....
449,machine-learning-zoomcamp,General course-related questions,The course has already started. Can I still jo...,"Yes, you can. You won’t be able to submit some..."
440,machine-learning-zoomcamp,General course-related questions,"I filled the form, but haven't received a conf...","The process is automated now, so you should re..."
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...


In [77]:
filters = {
    'course': 'data-engineering-zoomcamp'
}

for field, value in filters.items():
    mask = (df[field] == value).astype(int).values
    score = score * mask

In [78]:
result_idx = np.argsort(-score)[:5]
df.iloc[result_idx]

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
22,data-engineering-zoomcamp,General course-related questions,Environment - Do we really have to use GitHub ...,It's up to you which platform and environment ...
27,data-engineering-zoomcamp,General course-related questions,Environment - The GCP and other cloud provider...,You can do most of the course without a cloud....
287,data-engineering-zoomcamp,Module 4: analytics engineering with dbt,CREATE TABLE has columns with duplicate name l...,This error could result if you are using some ...
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...


In [69]:
class TextSearch:

    def __init__(self, text_fields):
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params={}):
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost={}, filters={}):
        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        for field, value in filters.items():
            mask = (self.df[field] == value).values
            score = score * mask

        idx = np.argsort(-score)[:n_results]
        results = self.df.iloc[idx]
        return results.to_dict(orient='records')

In [70]:
index = TextSearch(
    text_fields=['section', 'question', 'text']
)
index.fit(documents)

index.search(
    query='I just singned up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin

In [86]:
matrices['text'][0].todense()

matrix([[0., 0., 0., ..., 0., 0., 0.]])

In [87]:
from sklearn.decomposition import NMF

In [94]:
nmf = NMF(n_components=16)
X_emb = nmf.fit_transform(X)
X_emb[0]

array([0.13037566, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.0003798 , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])

In [95]:
Q = cv.transform([query])
Q_emb = nmf.transform(Q)
Q_emb[0]

array([0.08659353, 0.00228465, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.00190595, 0.        ,
       0.        ])

In [96]:
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
df.loc[idx]

Unnamed: 0,course,section,question,text
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
814,mlops-zoomcamp,+-General course questions,What if my answer is not exactly the same as t...,Please choose the closest one to your answer. ...
449,machine-learning-zoomcamp,General course-related questions,The course has already started. Can I still jo...,"Yes, you can. You won’t be able to submit some..."
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
11,data-engineering-zoomcamp,General course-related questions,Certificate - Can I follow the course in a sel...,"No, you can only get a certificate if you fini..."
764,machine-learning-zoomcamp,Projects (Midterm and Capstone),What If I submitted only two projects and fail...,If you have submitted two projects (and peer-r...
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
451,machine-learning-zoomcamp,General course-related questions,Can I submit the homework after the due date?,"No, it’s not possible. The form is closed afte..."
436,machine-learning-zoomcamp,General course-related questions,Is it going to be live? When?,"The course videos are pre-recorded, you can st..."
450,machine-learning-zoomcamp,General course-related questions,When does the next iteration start?,The course is available in the self-paced mode...
