In [42]:
import numpy as np
import pandas as pd
import requests 

from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

## Load data

source: https://github.com/alexeygrigorev/build-your-own-search-engine 

**Aim:** build a search engine for the FAQ document of DataTalks.Club courses, returning relevant matches on keywords present in a given section, question and text (text being the answer). Questions are all ordered under the different course names and should be filtered and sorted by their relevance to search words - same way as Lucene, Elastic Search or Google would do (in memory, not for prod).

In [2]:
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [3]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
df = pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [5]:
df.course.value_counts()

course
data-engineering-zoomcamp    435
machine-learning-zoomcamp    375
mlops-zoomcamp               138
Name: count, dtype: int64

## Create vector spaces

- Turn the docs into vectors.
- Generate term-document matrix over text fields (rows: documents, columns: words/tokens)
- Bag of words represenation (produced by CountVectorizer): we don't care about the order of the words, only their presence in the input documents -> typically represented in sparse matrices

### CountVectorizer example

In [6]:
docs_example = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

In [7]:
cv = CountVectorizer(stop_words='english') # discard words like 'and', 'the', 'a'
X = cv.fit(docs_example) # fit does the tokenisation, fit_transform generates the sparse matrix
X = cv.transform(docs_example)

names = cv.get_feature_names_out()
names

array(['15th', '2024', 'cloud', 'course', 'date', 'github', 'google',
       'homeworks', 'jan', 'listed', 'participation', 'prerequisites',
       'python', 'registration', 'required', 'setup', 'start', 'starts',
       'submit'], dtype=object)

A nice summary article on sparse vs dense data representations: https://medium.com/biased-algorithms/sparse-data-vs-dense-data-af8d66f931b7 

In [8]:
pd.DataFrame(X.todense(), columns=cv.get_feature_names_out())

Unnamed: 0,15th,2024,cloud,course,date,github,google,homeworks,jan,listed,participation,prerequisites,python,registration,required,setup,start,starts,submit
0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1
3,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0
4,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0


### Fit Vectorizer on the input data

In [9]:
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(df.text)

names = cv.get_feature_names_out()
names

array(['00', '00000000e', '0002', ..., '要了解键盘快捷键', '要启用屏幕阅读器支持', '请按ctrl'],
      shape=(6461,), dtype=object)

In [10]:
df_docs = pd.DataFrame(X.toarray(), columns=names).T # after transposing, we have the documents (questions) in columns and the terms in rows
df_docs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00000000e,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00021,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
斜杠,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
查找和替换,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
要了解键盘快捷键,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
要启用屏幕阅读器支持,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


There's a lot of noise and foreign language tokens, so let's say we're only interested in more important words that appear in at least x documents (e.g. 5).

In [11]:
tv = TfidfVectorizer(stop_words='english', min_df=5) # exclude terms that appear less than 5 times (doc freq) and use Tf-idf vectorizer to give less weight to words that appear more often across documents
X = tv.fit_transform(df.text)

names = tv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names).T # after transposing, we have the documents (questions) in columns and the terms in rows
df_docs.round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
01,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
02,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
03,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
04,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
05,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yes,0.0,0.00,0.28,0.0,0.0,0.00,0.21,0.2,0.15,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
yml,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.11,0.0,0.0,0.0,0.00
youtube,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.15,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
zip,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00


### Query-Document similarity

In [12]:
# query = "Do I need to know python to sign up for the January course?"
query = "I just discovered the course, is it too late to join?"

q = tv.transform([query])
q.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]], shape=(1, 1333))

In [13]:
# see which of the entries are non-zero (i.e. which words the query or a specific document contains)
query_dict = dict(zip(names, q.toarray()[0]))
doc_dict = dict(zip(names, X.toarray()[1]))

In [14]:
# now we want to identify the words in common to understand the similarity - essentially a dot product (cosine similarity)
X.dot(q.T)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 144 stored elements and shape (948, 1)>

In [15]:
# same as above, returns the similarity to each document
score = cosine_similarity(X, q).flatten()

In [16]:
# sort the entries and returns the indeces with the ordered scores 
# descending order is not possible by default, one would have to reverse/negate the array upfront or simply take the last n elements
np.argsort(score)[-5:]

array([ 22, 448, 449, 440,   0])

In [17]:
df.iloc[449].text

'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.'

### Implement search on DE zoomcamp

In [18]:
# now we'll go through all fields

In [19]:
fields = ['section', 'question', 'text']
vectorizers = {}
matrices = {}

for field in fields:
    cv = TfidfVectorizer(stop_words='english', min_df=5)
    X = cv.fit_transform(df[field])

    vectorizers[field] = cv
    matrices[field] = X

vectorizers['text'].get_feature_names_out()
matrices

{'section': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 3090 stored elements and shape (948, 66)>,
 'question': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 3431 stored elements and shape (948, 291)>,
 'text': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 23808 stored elements and shape (948, 1333)>}

In [20]:
# we will sum the similarities over all three fields to come to an overall document similarity
n = len(df)
score = np.zeros(len(df))

boosts = {'question': 3.0} # allows to put more importance on given fields

for f in fields:
    b = boosts.get(f, 1.0)
    q = vectorizers[f].transform([query])
    s = cosine_similarity(matrices[f], q).flatten()
    score = score + b * s

In [21]:
# implement filter for the results in the form of a mask
filters = {
    'course': 'data-engineering-zoomcamp'
}

for field, value in filters.items():
    mask = (df[field] == value).astype(int).values
    score *= mask

In [22]:
idx = np.argsort(score)[-5:]
df.iloc[idx]

Unnamed: 0,course,section,question,text
5,data-engineering-zoomcamp,General course-related questions,Course - how many Zoomcamps in a year?,"There are 3 Zoom Camps in a year, as of 2024. ..."
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."


## Putting it all together into a class

In [23]:
class TextSearch:

    def __init__(self, text_fields):
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params={}):
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost={}, filters={}):
        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        for field, value in filters.items():
            mask = (self.df[field] == value).values
            score = score * mask

        idx = np.argsort(-score)[:n_results]
        results = self.df.iloc[idx]
        return results.to_dict(orient='records')

source: https://github.com/alexeygrigorev/build-your-own-search-engine 

In [24]:
index = TextSearch(
    text_fields=['section', 'question', 'text']
)
index.fit(documents)

index.search(
    query='I just signed up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin

## Vector Search implementation

Getting rid of the reliance on exact word matches by vector similarity

**What are Embeddings?**

- Conversion to Numbers: Embeddings transform different words, sentences and documents into _dense_ vectors (arrays with numbers).
- Capturing Similarity: They ensure similar items have similar numerical vectors, illustrating their closeness in terms of characteristics.
- Dimensionality Reduction: Embeddings reduce complex characteristics into vectors.
- Use in Machine Learning: These numerical vectors are used in machine learning models for tasks such as recommendations, text analysis, and pattern recognition.

### SVD approach

Singular Value Decomposition is the simplest way to turn Bag-of-Words representation into embeddings

This way we still don't preserve the word order (because it wasn't in the Bag-of-Words representation) but we reduce dimensionality and capture synonyms.This compression is lossy compression - meaning that we won't be able to restore the 100% of the original vector, but the result is close enough.

In [25]:
X = matrices['text']
cv = vectorizers['text']

svd = TruncatedSVD(n_components=16)
X_emb = svd.fit_transform(X)

X_emb[0]

array([ 0.0965283 , -0.08205074, -0.10229286, -0.08037403,  0.06672466,
       -0.06093892,  0.0205677 , -0.13550268, -0.26410244,  0.2936571 ,
        0.05438688,  0.04882194, -0.07777704,  0.11825991, -0.0211176 ,
        0.00894683])

In [26]:
query = 'I just singned up. Is it too late to join the course?'

Q = cv.transform([query])
Q_emb = svd.transform(Q)
Q_emb[0]

array([ 0.05790095, -0.03845721, -0.05679895, -0.02911152,  0.03849191,
       -0.06235415,  0.01204172, -0.08795623, -0.18157405,  0.19466843,
        0.03963886,  0.05979196, -0.05609741,  0.07548796,  0.01628981,
        0.03229698])

In [27]:
# principle is the same as before, we may compute the cosine similarity, just the representation differs
np.dot(X_emb[0], Q_emb[0])

np.float64(0.1588587434458596)

In [28]:
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:5]
list(df.loc[idx].text)

['Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
 'If you have submitted two projects (and peer-reviewed at least 3 course-mates’ projects for each submission), you will get the certificate for the course. According to the course coordinator, Alexey Grigorev, only two projects are needed to get the course certificate.\n(optional) David Odimegwu',
 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on

### Non-negative Matrix Factorisation approach

Negative values from before are hard to interpret, even if we only take the top values. A different way of compressing comes from NNMF, similar to topic modelling. 

In [29]:
nmf = NMF(n_components=16)
X_emb = nmf.fit_transform(X)
X_emb[0]

array([0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
       0.       , 0.       , 0.2817466, 0.       , 0.       , 0.       ,
       0.       , 0.       , 0.       , 0.       ])

In [30]:
Q = cv.transform([query])
Q_emb = nmf.transform(Q)
Q_emb[0]

array([0.        , 0.0022334 , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.18621734, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])

In [31]:
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:5]
df.loc[idx]

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
436,machine-learning-zoomcamp,General course-related questions,Is it going to be live? When?,"The course videos are pre-recorded, you can st..."
449,machine-learning-zoomcamp,General course-related questions,The course has already started. Can I still jo...,"Yes, you can. You won’t be able to submit some..."
814,mlops-zoomcamp,+-General course questions,What if my answer is not exactly the same as t...,Please choose the closest one to your answer. ...


### BERT 

In [32]:
# !pip install transformers tqdm huggingface_hub[hf_xet] accelarate

In [33]:
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # Set the model to evaluation mode if not training

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

**Basic example**

In [35]:
texts = [
    "Yes, we will keep all the materials after the course finishes.",
    "You can follow the course at your own pace after it finishes"
]
encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model(**encoded_input)
    hidden_states = outputs.last_hidden_state

sentence_embeddings = hidden_states.mean(dim=1)
sentence_embeddings.shape

torch.Size([2, 768])

In [36]:
# sentence_embeddings_cpu = sentence_embeddings.cpu()
X_emb = sentence_embeddings.numpy()
X_emb

array([[ 0.35999233, -0.16072287,  0.35452315, ...,  0.04289217,
         0.0348225 , -0.03822239],
       [ 0.17849985, -0.50002515,  0.25277546, ..., -0.11413097,
        -0.33608466,  0.41095075]], shape=(2, 768), dtype=float32)

**Compute embeddings for the input data**

In [37]:
def make_batches(seq, n):
    result = []
    for i in range(0, len(seq), n):
        batch = seq[i:i+n]
        result.append(batch)
    return result

In [48]:
def compute_embeddings(texts, tokenizer=tokenizer, model=model, batch_size=8):
    text_batches = make_batches(texts, 8)
    
    all_embeddings = []
    
    for batch in tqdm(text_batches):
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
    
        with torch.no_grad():
            outputs = model(**encoded_input)
            hidden_states = outputs.last_hidden_state
            
            batch_embeddings = hidden_states.mean(dim=1)
            batch_embeddings_np = batch_embeddings.cpu().numpy()
            all_embeddings.append(batch_embeddings_np)
    
    final_embeddings = np.vstack(all_embeddings)
    return final_embeddings

In [43]:
X_text = compute_embeddings(df['text'].tolist())

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 119/119 [07:00<00:00,  3.53s/it]


In [51]:
class BERTTextSearch:

    def __init__(self, text_fields):
        self.text_fields = text_fields
        self.matrices = {}
        self.embedders = {}
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.model = BertModel.from_pretrained("bert-base-uncased")
        self.model.eval()

    # def fit(self, records, vectorizer_params={}):
    #     self.df = pd.DataFrame(records)

    #     for f in self.text_fields:
    #         X = compute_embeddings(self.df[f].tolist(), self.tokenizer, self.model)
    #         X_reduced = svd.fit_transform(X)
    #         self.matrices[f] = X_reduced
    #         self.embedders[f] = svd

    # def search(self, query, n_results=10, boost={}, filters={}):
    #     score = np.zeros(len(self.df))

    #     query_embedding = compute_embeddings([query], self.tokenizer, self.model)

    #     for f in self.text_fields:
    #         b = boost.get(f, 1.0)
    #         q = self.embedders[f].transform(query_embedding)
    #         s = cosine_similarity(self.matrices[f], q).flatten()
    #         score = score + b * s

    #     for field, value in filters.items():
    #         mask = (self.df[field] == value).values
    #         score = score * mask

    #     idx = np.argsort(-score)[:n_results]
    #     results = self.df.iloc[idx]
    #     return results.to_dict(orient='records')

    def fit(self, records):
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            X = compute_embeddings(self.df[f].tolist(), self.tokenizer, self.model)
            self.matrices[f] = X

    def search(self, query, n_results=10, boost={}, filters={}):
        score = np.zeros(len(self.df))

        query_embedding = compute_embeddings([query], self.tokenizer, self.model)

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            s = cosine_similarity(self.matrices[f], query_embedding).flatten()
            score = score + b * s

        for field, value in filters.items():
            mask = (self.df[field] == value).values
            score = score * mask

        idx = np.argsort(-score)[:n_results]
        results = self.df.iloc[idx]
        return results.to_dict(orient='records')

In [52]:
index = BERTTextSearch(
    text_fields=['section', 'question', 'text']
)
index.fit(documents)

results = index.search(
    query='I just signed up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 119/119 [00:15<00:00,  7.58it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 119/119 [00:46<00:00,  2.54it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 119/119 [06:49<00:00,  3.44s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.09it/s]


In [53]:
results

[{'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
  'section': 'General course-related questions',
  'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.",
  'section': 'General course-related questions',
  'question': 'Certificate - Can I follow the course in a self-paced mode and get a certificate?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "Yes, even if you don't register, you're still