In [1]:
import requests 
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# get the FAQ data -this returns a list of 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [3]:
print(f">>> data includes {len(documents)} documents")

>>> data includes 948 documents


In [4]:
documents[20]

{'text': 'You can set it up on your laptop or PC if you prefer to work locally from your laptop or PC.\nYou might face some challenges, especially for Windows users. If you face cnd2\nIf you prefer to work on the local machine, you may start with the week 1 Introduction to Docker and follow through.\nHowever, if you prefer to set up a virtual machine, you may start with these first:\nUsing GitHub Codespaces\nSetting up the environment on a cloudV Mcodespace\nI decided to work on a virtual machine because I have different laptops & PCs for my home & office, so I can work on this boot camp virtually anywhere.',
 'section': 'General course-related questions',
 'question': 'Environment - Should I use my local machine, GCP, or GitHub Codespaces for my environment?',
 'course': 'data-engineering-zoomcamp'}

In [5]:
# convert the documents list into a pandas dataframe
df = pd.DataFrame(documents,
                 columns=documents[0].keys())

In [6]:
df.tail()

Unnamed: 0,text,section,question,course
943,Problem description\nThis is the step in the c...,Module 6: Best practices,Github actions: Permission denied error when e...,mlops-zoomcamp
944,Problem description\nWhen a docker-compose fil...,Module 6: Best practices,Managing Multiple Docker Containers with docke...,mlops-zoomcamp
945,Problem description\nIf you are having problem...,Module 6: Best practices,AWS regions need to match docker-compose,mlops-zoomcamp
946,Problem description\nPre-commit command was fa...,Module 6: Best practices,Isort Pre-commit,mlops-zoomcamp
947,Problem description\nInfrastructure created in...,Module 6: Best practices,How to destroy infrastructure created via GitH...,mlops-zoomcamp


In [7]:
df.describe()

Unnamed: 0,text,section,question,course
count,948,948,948,948
unique,945,32,943,3
top,TODO,Module 1: Docker and Terraform,AttributeError: 'DataFrame' object has no attr...,data-engineering-zoomcamp
freq,3,116,2,435


In [8]:
df['course'].value_counts()

course
data-engineering-zoomcamp    435
machine-learning-zoomcamp    375
mlops-zoomcamp               138
Name: count, dtype: int64

In [9]:
# vectorize the text using the count vectorizer
# this is a bag-of-words approach which creates a sparse matrix 

cv = CountVectorizer(min_df=5, #keep only terms which appear in at least 5 documents (df = doc freq)
                    stop_words='english', # remove stop words
                    )

In [10]:
# Learn a vocabulary dictionary of all tokens in the raw documents
cv.fit(df['text'])

In [11]:
# check out the features

cv.get_feature_names_out()

array(['01', '02', '03', ..., 'youtube', 'zip', 'zoomcamp'],
      shape=(1333,), dtype=object)

In [12]:
# create document-term matrix
X = cv.transform(df['text'])

In [13]:
# check the term frequency matrix - this returns a very large df
# df_docs = pd.DataFrame(X.toarray(), columns=names)

In [14]:
# now using TF-IDF which is another way of vectorizing the text using term frequency
# but this  methods conciders the term count relative to the tern count per document 
# reducing the impact of terms that are common but not too important.


In [15]:
tfidf_vec = TfidfVectorizer(min_df=5,
                           stop_words='english')

In [16]:
X = tfidf_vec.fit_transform(df['text'])

In [17]:
# get the terms

names = tfidf_vec.get_feature_names_out()

In [18]:
df_docs = pd.DataFrame(X.toarray(),
                      columns=names).T

In [19]:
# check out the wieghts for each term (feature) for the documents
# the higher the wieght, the more important the feature is

df_docs.round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
01,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
02,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
03,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
04,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
05,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yes,0.0,0.00,0.28,0.0,0.0,0.00,0.21,0.2,0.15,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
yml,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.11,0.0,0.0,0.0,0.00
youtube,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.15,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
zip,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00


In [20]:
# find out how similar a user query is to a document

query = "Do I need to know python to sign up for the January course?"

In [21]:
# vectorize the query using the same vectorizer we calculated for the documents

q = cv.transform([query])
q.toarray()

array([[0, 0, 0, ..., 0, 0, 0]], shape=(1, 1333))

In [22]:
# check what features are common between the query and one of the documents

# query_dict = dict(zip(names, q.toarray()[0]))
# query_dict

# doc_dict = dict(zip(names, X.toarray()[1]))
# doc_dict

In [23]:
# measure the similarity using the cosine similarity (dot product)
cosine_similarity(X, q)

array([[0.22818331],
       [0.        ],
       [0.        ],
       [0.09022744],
       [0.0701645 ],
       [0.        ],
       [0.        ],
       [0.15799829],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.19859786],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.08711883],
       [0.        ],
       [0.        ],
       [0.06775545],
       [0.10302862],
       [0.        ],
       [0.07359779],
       [0.19194909],
       [0.09467579],
       [0.        ],
       [0.07350127],
       [0.        ],
       [0.24700045],
       [0.16326312],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.08275897],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.01366611],
       [0.04256013],
       [0.06424368],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.035

In [32]:
# vectorize all types of text in the documents

fields = ['section', 'question', 'text']
transformers = {}
matrices = {}

for field in fields:
    tfidf_vec = TfidfVectorizer(stop_words='english', min_df=5)
    X = tfidf_vec.fit_transform(df[field])

    transformers[field] = tfidf_vec
    matrices[field] = X

transformers['text'].get_feature_names_out()
matrices['text']

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 23808 stored elements and shape (948, 1333)>