In [None]:
import pandas as pd

In [None]:
import requests 

# Public FAQ dump: a list of courses, each carrying its own list of documents.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on 4xx/5xx instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten to one record per document, tagging each with its parent course.
# A new dict is built per record so `documents_raw` is left unmodified and
# the cell can be re-run without compounding side effects.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

In [None]:
documents[0]

In [None]:
# Tabulate the flattened records, fixing both the column selection and order.
df = pd.DataFrame(
    data=documents,
    columns=['course', 'section', 'question', 'text'],
)

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df[df.course == 'data-engineering-zoomcamp']

In [None]:
df

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
cv = CountVectorizer(min_df=5)

In [None]:
cv.fit(df.text)

In [None]:
cv.get_feature_names_out()

In [None]:
# Toy corpus of five short sentences; the following cells fit a
# CountVectorizer on it to show the resulting term-count table on a small input.
doc_examples = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

In [None]:
cv = CountVectorizer(stop_words="english")

In [None]:
cv.fit(doc_examples)

In [None]:
cv.get_feature_names_out()

In [None]:
X = cv.transform(doc_examples)

In [None]:
# Terms as rows, example documents as columns. `toarray()` yields a plain
# ndarray; `todense()` returns the legacy `np.matrix` type.
pd.DataFrame(X.toarray(), columns=cv.get_feature_names_out()).T

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights for the FAQ answer texts, with English stop words removed
# and `min_df=5` passed through to the vectorizer.
cv = TfidfVectorizer(min_df=5, stop_words='english')
X = cv.fit_transform(df.text)

names = cv.get_feature_names_out()

# Terms as rows, documents as columns.
df_docs = pd.DataFrame(X.toarray().T, index=names)
df_docs.round(2)

In [None]:
X

In [None]:
query = "I just discovered the course. Is it too late to join?"

# Project the query into the same TF-IDF space as the documents and show it
# as a single column indexed by vocabulary term.
q = cv.transform([query])
pd.DataFrame(q.toarray().T, index=cv.get_feature_names_out())

In [None]:
# Map every vocabulary term to its weight in the query vector.
query_dict = dict(zip(names, q.toarray()[0]))
# Show only the terms that actually occur in the query.
{term: weight for term, weight in query_dict.items() if weight > 0}

In [None]:
# Term weights of the document at row 2. Slicing the sparse row first avoids
# densifying the whole document-term matrix just to read a single row.
doc_dict = dict(zip(names, X[2:3].toarray()[0]))
# Show only the terms that actually occur in that document.
{term: weight for term, weight in doc_dict.items() if weight > 0}

In [None]:
# Dot product of every document vector with the query vector, returned as a
# plain ndarray rather than the legacy `np.matrix` produced by `todense()`.
X.dot(q.T).toarray()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
score = cosine_similarity(X, q).flatten()

In [None]:
import numpy as np

In [None]:
np.argsort(score)[-5:]

In [None]:
# Text of the best-scoring document, looked up from `score` instead of a
# hard-coded row number that silently goes stale when the data changes.
df.iloc[np.argsort(score)[-1]].text

In [None]:
df

In [None]:
fields = ['section', 'question', 'text']

In [None]:
# One independently fitted TF-IDF model per text field, plus the
# document-term matrix it produces, both keyed by field name.
matrices, vectorizers = {}, {}

for f in fields:
    cv = TfidfVectorizer()
    X = cv.fit_transform(df[f])
    vectorizers[f], matrices[f] = cv, X

In [None]:
matrices

In [None]:
vectorizers

In [None]:
n = len(df)

In [None]:
n

In [None]:
query = 'I just signed up. Is it too late to join the course?'

# Per-field weights; fields not listed here count with weight 1.0.
boosts = {'question': 3.0}

# Accumulate the boosted cosine similarity of the query against each field.
score = np.zeros(n)
for f in fields:
    X = matrices[f]
    q = vectorizers[f].transform([query])

    boost = boosts.get(f, 1.0)
    f_score = cosine_similarity(X, q).flatten()
    score = score + boost * f_score



In [None]:
score

In [None]:
filters = {
    'course': 'data-engineering-zoomcamp'
}

# Zero out the score of every row that fails a filter. The mask is converted
# to a plain ndarray so `score` stays a NumPy array instead of turning into a
# pandas Series on the first multiplication.
for field, value in filters.items():
    mask = (df[field] == value).to_numpy().astype(int)
    score = score * mask

score

In [None]:
idx = np.argsort(-score)[:5]

In [None]:
idx

In [None]:
df.iloc[idx]

In [None]:
class TextSearch:
    """TF-IDF search over records that have several text fields.

    Each field in `text_fields` gets its own `TfidfVectorizer`. A query is
    scored against every field with cosine similarity and the per-field
    scores are summed, optionally weighted by a per-field boost.
    """

    def __init__(self, text_fields):
        """Remember which record fields are indexed as text.

        Args:
            text_fields: names of the record fields to vectorize.
        """
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params=None):
        """Build the index from `records`.

        Args:
            records: data accepted by `pd.DataFrame` (e.g. a list of dicts);
                it must provide every field named in `text_fields`.
            vectorizer_params: optional keyword arguments forwarded to every
                `TfidfVectorizer`.
        """
        # `None` rather than `{}` as the default: a mutable default object is
        # shared between calls.
        params = vectorizer_params or {}
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**params)
            self.matrices[f] = cv.fit_transform(self.df[f])
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost=None, filters=None):
        """Return the best-matching records for `query`.

        Args:
            query: free-text query string.
            n_results: maximum number of records to return.
            boost: optional mapping of field name to score multiplier;
                fields not listed use 1.0.
            filters: optional mapping of column name to required value; only
                records equal to the value in every listed column are
                returned.

        Returns:
            A list of record dicts ordered by decreasing score. Records with
            equal scores keep their original order.
        """
        boost = boost or {}
        filters = filters or {}

        score = np.zeros(len(self.df))
        for f in self.text_fields:
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + boost.get(f, 1.0) * s

        # Filters must exclude rows, not merely zero their score: a zeroed row
        # would still be returned whenever fewer than `n_results` rows match.
        keep = np.ones(len(self.df), dtype=bool)
        for field, value in filters.items():
            keep &= (self.df[field] == value).to_numpy(dtype=bool)

        candidates = np.flatnonzero(keep)
        # Stable sort so ties are broken deterministically by row order.
        order = np.argsort(-score[candidates], kind='stable')[:n_results]
        results = self.df.iloc[candidates[order]]
        return results.to_dict(orient='records')

In [None]:
index = TextSearch(
    text_fields=['section', 'question', 'text']
)
index.fit(documents)

index.search(
    query='I just signed up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

In [None]:
X

In [None]:
from sklearn.decomposition import TruncatedSVD

X = matrices['text']
cv = vectorizers['text']

# TruncatedSVD's default solver is randomized; pinning `random_state` makes
# the embeddings, and every ranking derived from them, reproducible.
svd = TruncatedSVD(n_components=16, random_state=42)
X_emb = svd.fit_transform(X)

X_emb[0]

In [None]:
cv

In [None]:
# Only a cell's last expression is displayed, so show both shapes as one tuple.
X.shape, X_emb.shape

In [None]:
query = 'I just signed up. Is it too late to join the course?'

Q = cv.transform([query])
Q_emb = svd.transform(Q)
Q_emb[0]

In [None]:
np.dot(X_emb[0], Q_emb[0])

In [None]:
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
# `idx` holds row positions, so select with `iloc`; `loc` matches index labels
# and agrees only while `df` keeps its default 0..n-1 index.
list(df.iloc[idx].text)

In [None]:
# `idx` comes from `np.argsort`, i.e. row positions, so index positionally.
df.iloc[idx]

In [None]:
from sklearn.decomposition import NMF
# Pin the estimator's seed so the factorization is reproducible across runs.
nmf = NMF(n_components=16, random_state=42)
X_emb = nmf.fit_transform(X)
X_emb[0]

In [None]:
Q = cv.transform([query])
Q_emb = nmf.transform(Q)
Q_emb[0]

In [None]:
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
# `idx` holds row positions from `argsort`, so index positionally with `iloc`.
df.iloc[idx]

In [None]:
%pip install -U transformers

In [None]:
%pip install torch tqdm

In [None]:
%pip install -U protobuf

In [None]:
from huggingface_hub import snapshot_download

snapshot_download(repo_id="google-bert/bert-base-uncased", repo_type="model")

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-uncased")