In [1]:
import requests
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Get FAQ documents

In [2]:
# Download the FAQ dump. The timeout keeps the cell from hanging indefinitely
# on a stalled connection.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course nesting into a single list of records, tagging each
# record with the course it belongs to. A new dict is built per record so the
# parsed JSON in `documents_raw` is left unmodified.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

In [3]:
# Peek at the first flattened record; it carries the added 'course' key.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
# Peek at the last flattened record to confirm later courses were tagged too.
documents[-1]

{'text': 'Problem description\nInfrastructure created in AWS with CD-Deploy Action needs to be destroyed\nSolution description\nFrom local:\nterraform init -backend-config="key=mlops-zoomcamp-prod.tfstate" --reconfigure\nterraform destroy --var-file vars/prod.tfvars\nAdded by Erick Calderin',
 'section': 'Module 6: Best practices',
 'question': 'How to destroy infrastructure created via GitHub Actions',
 'course': 'mlops-zoomcamp'}

In [5]:
# Tabular view of the flattened records, with the course column placed first.
column_order = ['course', 'section', 'question', 'text']
documents_df = pd.DataFrame(data=documents, columns=column_order)
documents_df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [6]:
# Distinct course identifiers present in the data.
documents_df["course"].unique()

array(['data-engineering-zoomcamp', 'machine-learning-zoomcamp',
       'mlops-zoomcamp'], dtype=object)

# 2. Basics of Text Search

- **Information Retrieval** - The process of obtaining relevant information from large datasets based on user queries.
- **Vector Spaces** - A mathematical representation where text is converted into vectors (points in space) allowing for quantitative comparison.
- **Bag of Words** - A simple text representation model treating each document as a collection of words disregarding grammar and word order but keeping multiplicity.
- **TF-IDF (Term Frequency-Inverse Document Frequency)** - A statistical measure used to evaluate how important a word is to a document in a collection or corpus. It increases with the number of times a word appears in the document but is offset by the frequency of the word in the corpus.


# 3. Implementing Basic Text Search

## 3.1 Course filtering

First, keyword filtering:

In [7]:
# Keep only the Machine Learning Zoomcamp rows and renumber them from zero.
ml_docs_df = documents_df.loc[
    documents_df["course"] == 'machine-learning-zoomcamp'
].reset_index(drop=True)

In [8]:
# First rows of the filtered frame (index restarts at 0 after reset_index).
ml_docs_df.head()

Unnamed: 0,course,section,question,text
0,machine-learning-zoomcamp,General course-related questions,How do I sign up?,Machine Learning Zoomcamp FAQ\nThe purpose of ...
1,machine-learning-zoomcamp,General course-related questions,Is it going to be live? When?,"The course videos are pre-recorded, you can st..."
2,machine-learning-zoomcamp,General course-related questions,What if I miss a session?,"Everything is recorded, so you won’t miss anyt..."
3,machine-learning-zoomcamp,General course-related questions,How much theory will you cover?,The bare minimum. The focus is more on practic...
4,machine-learning-zoomcamp,General course-related questions,I don't know math. Can I take the course?,Yes! We'll cover some linear algebra in the co...


In [9]:
# Last rows of the filtered frame.
ml_docs_df.tail()

Unnamed: 0,course,section,question,text
370,machine-learning-zoomcamp,Miscellaneous,"How can I work with very large datasets, e.g. ...",You can consider several different approaches:...
371,machine-learning-zoomcamp,Miscellaneous,"Can I do the course in other languages, like R...","Technically, yes. Advisable? Not really. Reaso..."
372,machine-learning-zoomcamp,Miscellaneous,Is use of libraries like fast.ai or huggingfac...,"Yes, it’s allowed (as per Alexey).\nAdded By R..."
373,machine-learning-zoomcamp,Miscellaneous,"Flask image was built and tested successfully,...",The TF and TF Serving versions have to match (...
374,machine-learning-zoomcamp,Miscellaneous,Any advice for adding the Machine Learning Zoo...,I’ve seen LinkedIn users list DataTalksClub as...


In [10]:
# Distinct section names within the Machine Learning Zoomcamp FAQ.
ml_docs_df["section"].unique()

array(['General course-related questions',
       '1. Introduction to Machine Learning',
       '2. Machine Learning for Regression',
       '3. Machine Learning for Classification',
       '4. Evaluation Metrics for Classification',
       '5. Deploying Machine Learning Models',
       '6. Decision Trees and Ensemble Learning',
       '8. Neural Networks and Deep Learning',
       '9. Serverless Deep Learning',
       '10. Kubernetes and TensorFlow Serving', '11. KServe',
       'Projects (Midterm and Capstone)', 'Miscellaneous'], dtype=object)

## 3.2 Vectorization

For Count Vectorizer and TF-IDF we will first use a simple example.

Representation in vector spaces:

- turn the docs into vectors
- term-document matrix:
    - rows: documents
    - columns: words/tokens


References:
- [scikit-learn: Text feature extraction](https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction)
- [scikit-learn: CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)
- [scikit-learn: TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)

In [11]:
# Five short toy documents used to illustrate the vectorizers below.
docs_example = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

**_CountVectorizer_** converts a collection of text documents to a matrix of token counts.

This implementation produces a sparse representation of the counts using scipy.sparse.csr_matrix.

If you do not provide an a-priori dictionary and you do not use an analyzer that does some kind of feature selection then the number of features will be equal to the vocabulary size found by analyzing the data.

In [12]:
# Learn the vocabulary of the toy documents (English stop words removed) and
# count how often each remaining token occurs in each document.
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)

# Vocabulary terms, in the same order as the columns of X.
cols_name = cv.get_feature_names_out()

# Densify the sparse matrix only for display: one row per document,
# one column per term.
docs_df = pd.DataFrame(X.toarray(), columns=cols_name)
docs_df

Unnamed: 0,15th,2024,cloud,course,date,github,google,homeworks,jan,listed,participation,prerequisites,python,registration,required,setup,start,starts,submit
0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1
3,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0
4,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0


This representation is called "bag of words" - here we ignore the order of words, just focus on the words themselves. In many cases this is sufficient and gives pretty good results already.

**_TfidfVectorizer_** converts a collection of raw documents to a matrix of TF-IDF features.

Equivalent to CountVectorizer followed by TfidfTransformer. Tf means term-frequency while tf-idf means term-frequency times inverse document-frequency. This is a common term weighting scheme in information retrieval, that has also found good use in document classification.

The goal of using tf-idf instead of the raw frequencies of occurrence of a token in a given document is to scale down the impact of tokens that occur very frequently in a given corpus and that are hence empirically less informative than features that occur in a small fraction of the training corpus.

In [13]:
# NOTE: this rebinds `cv`, `X`, `cols_name` and `docs_df`, replacing the
# CountVectorizer versions from the previous cell. The query cells that follow
# rely on these TF-IDF versions.
cv = TfidfVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)

# Vocabulary terms, in the same order as the columns of X.
cols_name = cv.get_feature_names_out()

# Densify for display and round so the weights are easy to compare by eye.
docs_df = pd.DataFrame(X.toarray(), columns=cols_name)
docs_df.round(2)

Unnamed: 0,15th,2024,cloud,course,date,github,google,homeworks,jan,listed,participation,prerequisites,python,registration,required,setup,start,starts,submit
0,0.46,0.46,0.0,0.37,0.0,0.0,0.0,0.0,0.46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.46,0.0
1,0.0,0.0,0.0,0.0,0.0,0.58,0.0,0.0,0.0,0.58,0.0,0.58,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.58,0.0,0.0,0.58,0.58,0.0,0.0,0.0,0.0
4,0.0,0.0,0.46,0.37,0.0,0.0,0.46,0.0,0.0,0.0,0.0,0.0,0.46,0.0,0.0,0.46,0.0,0.0,0.0


## 3.3 Query-Document Similarity

We represent the query in the same vector space - i.e. using the same vectorizer:

In [14]:
query = "Do I need to know python to sign up for the January course?"

# Use transform (not fit_transform) so the query is mapped onto the vocabulary
# already learned from docs_example; the input must be a list of documents.
q = cv.transform([query])
# Show the single query row as a dense 1-D array.
q.toarray()[0]

array([0.        , 0.        , 0.        , 0.62791376, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.77828292, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        ])

We can see the word scores of the query:

In [15]:
# Pair every vocabulary term with its weight in the query vector.
query_dict = {
    term: weight
    for term, weight in zip(cols_name, q.toarray()[0])
}
query_dict

{'15th': 0.0,
 '2024': 0.0,
 'cloud': 0.0,
 'course': 0.6279137616509934,
 'date': 0.0,
 'github': 0.0,
 'google': 0.0,
 'homeworks': 0.0,
 'jan': 0.0,
 'listed': 0.0,
 'participation': 0.0,
 'prerequisites': 0.0,
 'python': 0.7782829228046183,
 'registration': 0.0,
 'required': 0.0,
 'setup': 0.0,
 'start': 0.0,
 'starts': 0.0,
 'submit': 0.0}

What is the most similar document in the sample to query?
The more words in common - the better the matching score. 

Let's calculate it: this is a dot-product. So we can use matrix multiplication to compute the score:

In [16]:
# (n_docs x n_terms) @ (n_terms x 1) -> one dot-product score per document.
(X @ q.T).toarray()

array([[0.23490553],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.59579005]])

In practice, we usually use [cosine similarity](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html):

In [17]:
# One similarity value per document row of X against the single query row.
cosine_similarity(X, q)

array([[0.23490553],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.59579005]])

The TF-IDF vectorizer already outputs L2-normalized vectors (its default is `norm='l2'`), so the dot product equals the cosine similarity and the results are identical.

## 3.4 Vectorize all documents

Let's now do it for all the documents:

In [18]:
fields = ['section', 'question', 'text']
transformers = {}
matrices = {}

# Fit an independent TF-IDF model per field, so each field gets its own
# vocabulary and weights. min_df=3 drops terms that occur in fewer than three
# documents.
# The loop deliberately uses its own variable names: reusing `cv` and `X`
# would overwrite the toy-example vectorizer and matrix, and re-running the
# earlier query cells would then silently use the wrong model.
for field in fields:
    vectorizer = TfidfVectorizer(stop_words='english', min_df=3)
    matrices[field] = vectorizer.fit_transform(documents_df[field])
    transformers[field] = vectorizer

### Search and calculate scores

Let's now do search with the `text` field:

In [19]:
query = "I just singned up. Is it too late to join the course?"

# Project the query into the `text` field's TF-IDF space, then compare it with
# every document; the (n_docs, 1) result is collapsed to a 1-D score vector.
q = transformers['text'].transform([query])
score = cosine_similarity(matrices['text'], q).ravel()

Let's do it only for the data engineering course:

In [20]:
mask = (documents_df.course == 'data-engineering-zoomcamp').values
# Send documents from other courses to -inf instead of multiplying by the
# mask. Multiplying would give them a score of 0, which ties with in-course
# documents that share no terms with the query, so excluded documents could
# still surface in the ranking. Cosine similarity on TF-IDF vectors is never
# negative, so -inf always ranks last.
score = np.where(mask, score, -np.inf)

And get the top results:

In [21]:
# argsort is ascending, so negate the scores to get the five best first.
idx = np.argsort(-score)[:5]
# idx holds row positions, hence iloc.
documents_df.iloc[idx]

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
15,data-engineering-zoomcamp,General course-related questions,Homework - Are late submissions of homework al...,"No, late submissions are not allowed. But if t..."
22,data-engineering-zoomcamp,General course-related questions,Environment - Do we really have to use GitHub ...,It's up to you which platform and environment ...
27,data-engineering-zoomcamp,General course-related questions,Environment - The GCP and other cloud provider...,You can do most of the course without a cloud....
38,data-engineering-zoomcamp,General course-related questions,Project - What is Project Attemp #1 and Projec...,You will have two attempts for a project. If t...


### Search with all the fields & boosting + filtering

We can do the same for all the fields. Let's also boost one of them - `question` - to give it more weight than the others.

In [22]:
boost = {'question': 3.0}

# Accumulate a weighted sum of per-field similarities; fields without an
# explicit boost count with weight 1.0.
score = np.zeros(len(documents_df))

for f in fields:
    q = transformers[f].transform([query])
    field_similarity = cosine_similarity(matrices[f], q).flatten()
    score = score + boost.get(f, 1.0) * field_similarity

And add filters (in this case, only one):

In [23]:
filters = {
    'course': 'data-engineering-zoomcamp'
}

# Every filter must match. Rows that fail any filter are sent to -inf rather
# than multiplied by the mask: multiplying would give them a score of 0, which
# ties with matching rows that share no terms with the query, so excluded rows
# could still appear in the results.
for field, value in filters.items():
    mask = (documents_df[field] == value).values
    score = np.where(mask, score, -np.inf)

Getting the results:

In [24]:
# argsort is ascending, so negate the scores to get the five best first.
idx = np.argsort(-score)[:5]
# idx holds row positions, hence iloc.
results = documents_df.iloc[idx]
# Return plain dicts (one per row) instead of a DataFrame.
results.to_dict(orient='records')

[{'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites'},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'How can we contribute to the course?',
  'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.'},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.'},
 {'course': 'data-eng

### Putting it all together

Let's create a class for us to use:

In [26]:
class TextSearch:
    """In-memory TF-IDF search over one or more text fields of a set of records.

    Each text field gets its own ``TfidfVectorizer``. A query is scored against
    every field with cosine similarity, the per-field scores are combined as a
    (boost-)weighted sum, and the best-scoring records are returned.
    """

    def __init__(self, text_fields):
        """
        Args:
            text_fields: names of the record fields to vectorize and search.
        """
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}
        # Set by fit(); None means the index has not been built yet.
        self.df = None

    def fit(self, records, vectorizer_params=None):
        """Build the index from ``records``.

        Args:
            records: anything ``pd.DataFrame`` accepts (e.g. a list of dicts);
                it must contain every field named in ``text_fields``.
            vectorizer_params: optional keyword arguments forwarded to every
                ``TfidfVectorizer``. Defaults to no extra arguments.
        """
        params = vectorizer_params or {}
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**params)
            self.matrices[f] = cv.fit_transform(self.df[f])
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost=None, filters=None):
        """Return up to ``n_results`` records ranked by similarity to ``query``.

        Args:
            query: free-text query string.
            n_results: maximum number of records to return.
            boost: optional ``{field: weight}`` mapping; fields not listed
                have weight 1.0.
            filters: optional ``{column: value}`` mapping; only records whose
                column equals the value for every entry are eligible.

        Returns:
            A list of record dicts, best match first. Records that fail a
            filter are never returned, even when fewer than ``n_results``
            records pass.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.df is None:
            raise RuntimeError("TextSearch.fit() must be called before search()")

        boost = boost or {}
        filters = filters or {}

        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        # Filter by selecting eligible rows rather than zeroing their scores:
        # a zeroed score ties with eligible rows that share no terms with the
        # query, which let filtered-out records leak into the results.
        keep = np.ones(len(self.df), dtype=bool)
        for field, value in filters.items():
            keep &= (self.df[field] == value).to_numpy()

        candidates = np.flatnonzero(keep)
        # Stable sort keeps the original record order among equal scores, so
        # results are deterministic.
        order = np.argsort(-score[candidates], kind='stable')[:n_results]
        results = self.df.iloc[candidates[order]]
        return results.to_dict(orient='records')

In [27]:
index = TextSearch(
    text_fields=['section', 'question', 'text']
)
# No vectorizer_params are passed, so every field gets a TfidfVectorizer with
# default settings (the step-by-step cells used stop_words='english', min_df=3).
index.fit(documents)

# Same query, boost and course filter as in the step-by-step cells.
index.search(
    query='I just singned up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin