In [16]:
import pandas as pd

In [17]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [18]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [19]:
df = pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])

In [20]:
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [21]:
df.tail()

Unnamed: 0,course,section,question,text
943,mlops-zoomcamp,Module 6: Best practices,Github actions: Permission denied error when e...,Problem description\nThis is the step in the c...
944,mlops-zoomcamp,Module 6: Best practices,Managing Multiple Docker Containers with docke...,Problem description\nWhen a docker-compose fil...
945,mlops-zoomcamp,Module 6: Best practices,AWS regions need to match docker-compose,Problem description\nIf you are having problem...
946,mlops-zoomcamp,Module 6: Best practices,Isort Pre-commit,Problem description\nPre-commit command was fa...
947,mlops-zoomcamp,Module 6: Best practices,How to destroy infrastructure created via GitH...,Problem description\nInfrastructure created in...


In [22]:
df[df.course == 'data-engineering-zoomcamp']

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
...,...,...,...,...
430,data-engineering-zoomcamp,Workshop 2 - RisingWave,Unable to Open Dashboard as xdg-open doesn’t o...,Refer to the solution given in the first solut...
431,data-engineering-zoomcamp,Workshop 2 - RisingWave,Resolving Python Interpreter Path Inconsistenc...,Example Error:\nWhen attempting to execute a P...
432,data-engineering-zoomcamp,Workshop 2 - RisingWave,How does windowing work in Sql?,Ans : Windowing in streaming SQL involves defi...
433,data-engineering-zoomcamp,Triggers in Mage via CLI,"Encountering the error ""ModuleNotFoundError: N...","Python 3.12.1, is not compatible with kafka-py..."


In [23]:
df

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
...,...,...,...,...
943,mlops-zoomcamp,Module 6: Best practices,Github actions: Permission denied error when e...,Problem description\nThis is the step in the c...
944,mlops-zoomcamp,Module 6: Best practices,Managing Multiple Docker Containers with docke...,Problem description\nWhen a docker-compose fil...
945,mlops-zoomcamp,Module 6: Best practices,AWS regions need to match docker-compose,Problem description\nIf you are having problem...
946,mlops-zoomcamp,Module 6: Best practices,Isort Pre-commit,Problem description\nPre-commit command was fa...


In [24]:
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
cv = CountVectorizer(min_df=5)

In [26]:
cv.fit(df.text)

In [27]:
cv.get_feature_names_out()

array(['01', '02', '03', ..., 'youtube', 'zip', 'zoomcamp'], dtype=object)

In [28]:
doc_examples = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

In [29]:
cv = CountVectorizer(stop_words="english")

In [30]:
cv.fit(doc_examples)

In [31]:
cv.get_feature_names_out()

array(['15th', '2024', 'cloud', 'course', 'date', 'github', 'google',
       'homeworks', 'jan', 'listed', 'participation', 'prerequisites',
       'python', 'registration', 'required', 'setup', 'start', 'starts',
       'submit'], dtype=object)

In [32]:
X = cv.transform(doc_examples)

In [33]:
pd.DataFrame(X.todense(), columns = cv.get_feature_names_out()).T

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer(stop_words='english', min_df=5)
X = cv.fit_transform(df.text)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs.round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
01,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
02,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
03,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
04,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
05,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yes,0.0,0.00,0.28,0.0,0.0,0.00,0.21,0.2,0.15,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
yml,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.11,0.0,0.0,0.0,0.00
youtube,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.15,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
zip,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00


In [35]:
X

<948x1333 sparse matrix of type '<class 'numpy.float64'>'
	with 23808 stored elements in Compressed Sparse Row format>

In [36]:
query = "I just discovered the course. Is it too late to join?"

q = cv.transform([query])
pd.DataFrame(q.toarray(), columns=cv.get_feature_names_out()).T

Unnamed: 0,0
01,0.0
02,0.0
03,0.0
04,0.0
05,0.0
...,...
yes,0.0
yml,0.0
youtube,0.0
zip,0.0


In [37]:
query_dict = dict(zip(names, q.toarray()[0]))
#[print(k, v) for k, v in query_dict.items() if v > 0]
#list(filter(lambda x: x > 0, [v for k, v in query_dict.items()]))

In [38]:
doc_dict = dict(zip(names, X.toarray()[2]))
#[print(k, v) for k, v in doc_dict.items() if v > 0]

In [39]:
X.dot(q.T).todense()

matrix([[0.48049682],
        [0.        ],
        [0.        ],
        [0.2083882 ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.17557272],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.15870689],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.09680922],
        [0.        ],
        [0.        ],
        [0.07529201],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.29986763],
        [0.10520675],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.27447476],
        [0.12828407],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.05163407],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.03156309],
        [0.04914818],
        [0.07138962],
        [0.        ],
        [0.04329773],
        [0.        ],
        [0

In [40]:
from sklearn.metrics.pairwise import cosine_similarity

In [41]:
score = cosine_similarity(X, q).flatten()

In [42]:
import numpy as np

In [43]:
np.argsort(score)[-5:]

array([ 22, 448, 449, 440,   0])

In [44]:
df.iloc[449].text

'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.'

In [45]:
df

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
...,...,...,...,...
943,mlops-zoomcamp,Module 6: Best practices,Github actions: Permission denied error when e...,Problem description\nThis is the step in the c...
944,mlops-zoomcamp,Module 6: Best practices,Managing Multiple Docker Containers with docke...,Problem description\nWhen a docker-compose fil...
945,mlops-zoomcamp,Module 6: Best practices,AWS regions need to match docker-compose,Problem description\nIf you are having problem...
946,mlops-zoomcamp,Module 6: Best practices,Isort Pre-commit,Problem description\nPre-commit command was fa...


In [46]:
fields = ['section', 'question', 'text']

In [47]:
matrices = {}
vectorizers = {}

for f in fields:
    cv = TfidfVectorizer()
    X = cv.fit_transform(df[f])
    matrices[f] = X
    vectorizers[f] = cv

In [48]:
matrices

{'section': <948x86 sparse matrix of type '<class 'numpy.float64'>'
 	with 3651 stored elements in Compressed Sparse Row format>,
 'question': <948x2051 sparse matrix of type '<class 'numpy.float64'>'
 	with 8938 stored elements in Compressed Sparse Row format>,
 'text': <948x6711 sparse matrix of type '<class 'numpy.float64'>'
 	with 47683 stored elements in Compressed Sparse Row format>}

In [49]:
vectorizers

{'section': TfidfVectorizer(),
 'question': TfidfVectorizer(),
 'text': TfidfVectorizer()}

In [50]:
n = len(df)

In [51]:
n

948

In [52]:
score = np.zeros(n)
query = 'I just signed up. Is it too late to join the course?'

boosts = {
    'question': 3.0
}

for f in fields:
    q = vectorizers[f].transform([query])
    X = matrices[f]

    f_score = cosine_similarity(X, q).flatten()

    boost = boosts.get(f, 1.0)

    score = score + boost * f_score



In [53]:
score

array([1.14094558, 0.81596966, 1.33056267, 0.65732068, 0.83576004,
       0.52476342, 0.66858864, 1.0787197 , 0.74706959, 0.55772975,
       0.68656319, 0.56112268, 0.53336373, 0.43478918, 0.27432056,
       0.95629274, 0.48958677, 0.68492481, 0.3829383 , 0.49858614,
       0.29339573, 0.47971766, 0.48682824, 0.32341637, 0.34790387,
       0.26031374, 0.23246462, 0.67339941, 0.40084129, 0.37116011,
       0.35832744, 0.49357393, 0.23246462, 0.81793683, 0.77687713,
       0.74140289, 0.2468553 , 0.23886212, 0.45739604, 0.41766658,
       0.3907986 , 0.61285357, 0.2384047 , 0.26177404, 0.26613661,
       0.13144557, 0.        , 0.03763082, 0.1684484 , 0.01471408,
       0.01769699, 0.08498023, 0.20756067, 0.17251289, 0.04697298,
       0.09783391, 0.14133976, 0.11178007, 0.09787604, 0.20922613,
       0.00892879, 0.22877672, 0.39785787, 0.03006711, 0.02359607,
       0.01409061, 0.01477017, 0.04512332, 0.01319535, 0.05420395,
       0.12621362, 0.00783901, 0.21820824, 0.03212773, 0.04674

In [54]:
filters = {
    'course': 'data-engineering-zoomcamp'
}

for field, value in filters.items():
    mask = (df[field] == value).astype(int)
    score = score * mask
mask
score

0      1.140946
1      0.815970
2      1.330563
3      0.657321
4      0.835760
         ...   
943    0.000000
944    0.000000
945    0.000000
946    0.000000
947    0.000000
Name: course, Length: 948, dtype: float64

In [55]:
idx = np.argsort(-score)[:5]

In [56]:
idx

0     2
1     0
2     7
3    15
4     4
Name: course, dtype: int64

In [57]:
df.iloc[idx]

Unnamed: 0,course,section,question,text
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
15,data-engineering-zoomcamp,General course-related questions,Homework - Are late submissions of homework al...,"No, late submissions are not allowed. But if t..."
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [58]:
class TextSearch:

    def __init__(self, text_fields):
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params={}):
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost={}, filters={}):
        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        for field, value in filters.items():
            mask = (self.df[field] == value).values
            score = score * mask

        idx = np.argsort(-score)[:n_results]
        results = self.df.iloc[idx]
        return results.to_dict(orient='records')

In [59]:
index = TextSearch(
    text_fields=['section', 'question', 'text']
)
index.fit(documents)

index.search(
    query='I just signed up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin

In [60]:
X

<948x6711 sparse matrix of type '<class 'numpy.float64'>'
	with 47683 stored elements in Compressed Sparse Row format>

In [61]:
from sklearn.decomposition import TruncatedSVD

X = matrices['text']
cv = vectorizers['text']

svd = TruncatedSVD(n_components=16)
X_emb = svd.fit_transform(X)

X_emb[0]

array([ 0.26785232, -0.1067674 , -0.02589185, -0.09180774, -0.05269819,
        0.09851349,  0.08351298,  0.03604572,  0.06628449, -0.04270265,
        0.14105227, -0.18407762, -0.0994693 , -0.03553409, -0.08181468,
       -0.0060881 ])

In [62]:
cv

In [63]:
X.shape
X_emb.shape

(948, 16)

In [64]:
query = 'I just signed up. Is it too late to join the course?'

Q = cv.transform([query])
Q_emb = svd.transform(Q)
Q_emb[0]

array([ 0.14032584, -0.03383334, -0.02826136, -0.05823009, -0.00288054,
        0.07550256,  0.04440695,  0.02317672,  0.02242819, -0.00059374,
        0.04512556, -0.07268187, -0.03029977, -0.00359279, -0.05873588,
       -0.02988463])

In [65]:
np.dot(X_emb[0], Q_emb[0])

0.08879553607597235

In [66]:
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
list(df.loc[idx].text)

['No, late submissions are not allowed. But if the form is still not closed and it’s after the due date, you can still submit the homework. confirm your submission by the date-timestamp on the Course page.y\nOlder news:[source1] [source2]',
 'The zoom link is only published to instructors/presenters/TAs.\nStudents participate via Youtube Live and submit questions to Slido (link would be pinned in the chat when Alexey goes Live). The video URL should be posted in the announcements channel on Telegram & Slack before it begins. Also, you will see it live on the DataTalksClub YouTube Channel.\nDon’t post your questions in chat as it would be off-screen before the instructors/moderators have a chance to answer it if the room is very active.',
 "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Cale

In [67]:
df.loc[idx]

Unnamed: 0,course,section,question,text
15,data-engineering-zoomcamp,General course-related questions,Homework - Are late submissions of homework al...,"No, late submissions are not allowed. But if t..."
12,data-engineering-zoomcamp,General course-related questions,Office Hours - What is the video/zoom link to ...,The zoom link is only published to instructors...
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
451,machine-learning-zoomcamp,General course-related questions,Can I submit the homework after the due date?,"No, it’s not possible. The form is closed afte..."
8,data-engineering-zoomcamp,General course-related questions,Course - Can I get support if I take the cours...,"Yes, the slack channel remains open and you ca..."
450,machine-learning-zoomcamp,General course-related questions,When does the next iteration start?,The course is available in the self-paced mode...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
764,machine-learning-zoomcamp,Projects (Midterm and Capstone),What If I submitted only two projects and fail...,If you have submitted two projects (and peer-r...
436,machine-learning-zoomcamp,General course-related questions,Is it going to be live? When?,"The course videos are pre-recorded, you can st..."
814,mlops-zoomcamp,+-General course questions,What if my answer is not exactly the same as t...,Please choose the closest one to your answer. ...


In [68]:
from sklearn.decomposition import NMF
nmf = NMF(n_components=16)
X_emb = nmf.fit_transform(X)
X_emb[0]

array([0.06934826, 0.00031826, 0.02779713, 0.        , 0.0671051 ,
       0.        , 0.00160578, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])

In [69]:
Q = cv.transform([query])
Q_emb = nmf.transform(Q)
Q_emb[0]

array([0.04869905, 0.00968287, 0.00291193, 0.        , 0.02507939,
       0.        , 0.00075438, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])

In [70]:
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
df.loc[idx]

Unnamed: 0,course,section,question,text
456,machine-learning-zoomcamp,General course-related questions,Submitting learning in public links,When you post about what you learned from the ...
450,machine-learning-zoomcamp,General course-related questions,When does the next iteration start?,The course is available in the self-paced mode...
444,machine-learning-zoomcamp,General course-related questions,Will I get a certificate if I missed the midte...,"Yes, it's possible. See the previous answer."
404,data-engineering-zoomcamp,Project,Project evaluation - Reproducibility,The slack thread : thttps://datatalks-club.sla...
29,data-engineering-zoomcamp,General course-related questions,Besides the “Office Hour” which are the live z...,We will probably have some calls during the Ca...
35,data-engineering-zoomcamp,General course-related questions,Environment - Is the course [Windows/mac/Linux...,Yes! Linux is ideal but technically it should ...
768,machine-learning-zoomcamp,Miscellaneous,Do you pass a project based on the average of ...,Alexey Grigorev: “It’s based on all the scores...
27,data-engineering-zoomcamp,General course-related questions,Environment - The GCP and other cloud provider...,You can do most of the course without a cloud....
34,data-engineering-zoomcamp,General course-related questions,How can we contribute to the course?,Star the repo! Share it with friends if you fi...
40,data-engineering-zoomcamp,General course-related questions,How to ask questions,When the troubleshooting guide above does not ...


In [71]:
%pip install -U transformers

Note: you may need to restart the kernel to use updated packages.


In [72]:
%pip install torch tqdm

Note: you may need to restart the kernel to use updated packages.


In [73]:
%pip install -U protobuf

Note: you may need to restart the kernel to use updated packages.


In [74]:
from huggingface_hub import snapshot_download

snapshot_download(repo_id="google-bert/bert-base-uncased", repo_type="model")

Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

'/Users/zoryana.paliy/.cache/huggingface/hub/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594'

In [75]:
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # Set the model to evaluation mode if not training

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [76]:
texts = [
    "Yes, we will keep all the materials after the course finishes.",
    "You can follow the course at your own pace after it finishes"
]
encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
print(encoded_input)

{'input_ids': tensor([[  101,  2748,  1010,  2057,  2097,  2562,  2035,  1996,  4475,  2044,
          1996,  2607, 12321,  1012,   102],
        [  101,  2017,  2064,  3582,  1996,  2607,  2012,  2115,  2219,  6393,
          2044,  2009, 12321,   102,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}


In [77]:
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model(**encoded_input)
    hidden_states = outputs.last_hidden_state

In [78]:
hidden_states.shape

torch.Size([2, 15, 768])

In [79]:
sentence_embeddings = hidden_states.mean(dim=1)
sentence_embeddings.shape

torch.Size([2, 768])

In [80]:
sentence_embeddings[0]

tensor([ 3.5999e-01, -1.6072e-01,  3.5452e-01,  2.0895e-01,  2.6956e-01,
        -3.5563e-01, -9.7660e-02,  5.6438e-01, -4.5281e-03, -9.2505e-02,
        -1.7007e-03,  1.9995e-02,  6.5502e-02,  9.3444e-02, -3.0502e-01,
        -4.5216e-02, -1.0339e-01, -3.1108e-02,  2.4581e-01,  9.2440e-02,
        -8.1905e-05, -2.0250e-01,  3.6914e-02,  6.7139e-01,  2.3419e-01,
         1.0647e-01, -1.6311e-01,  4.0824e-01, -2.2219e-01, -2.0644e-01,
         9.7081e-02, -8.0909e-02, -2.2127e-01, -2.1673e-02,  9.8591e-02,
        -7.9521e-02,  5.1483e-03,  1.9826e-01, -3.5654e-01, -1.6401e-02,
        -5.7301e-01,  2.1067e-01,  4.7548e-02, -1.0806e-01,  1.1387e-01,
        -2.2524e-01,  6.4228e-01, -2.4001e-02,  9.5572e-02, -4.6907e-01,
        -4.4579e-01,  4.7136e-01,  8.3200e-02,  2.1551e-01,  3.2217e-01,
         3.9328e-01, -4.3379e-01, -3.0512e-01, -6.7886e-01,  1.1483e-01,
        -1.4814e-01, -1.6699e-02, -1.5396e-01, -6.9516e-02,  3.5819e-01,
        -7.8530e-03,  4.6437e-02,  3.8914e-02, -4.3

In [81]:
X_emb = sentence_embeddings.numpy()

In [82]:
X_emb

array([[ 0.35999233, -0.16072303,  0.35452372, ...,  0.04289259,
         0.03482312, -0.0382223 ],
       [ 0.17849918, -0.5000251 ,  0.2527758 , ..., -0.11413123,
        -0.3360849 ,  0.41095117]], dtype=float32)

In [83]:
next(model.parameters()).is_cuda

False

In [84]:
def make_batches(seq, n):
    result = []
    for i in range(0, len(seq), n):
        batch = seq[i:i+n]
        result.append(batch)
    return result

In [85]:
from tqdm.auto import tqdm
texts = df['text'].tolist()
text_batches = make_batches(texts, 8)

all_embeddings = []

for batch in tqdm(text_batches):
    encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        outputs = model(**encoded_input)
        hidden_states = outputs.last_hidden_state
        
        batch_embeddings = hidden_states.mean(dim=1)
        batch_embeddings_np = batch_embeddings.cpu().numpy()
        all_embeddings.append(batch_embeddings_np)

final_embeddings = np.vstack(all_embeddings)

  0%|          | 0/119 [00:00<?, ?it/s]

In [86]:
def compute_embeddings(texts, batch_size=8):
    text_batches = make_batches(texts, 8)
    
    all_embeddings = []
    
    for batch in tqdm(text_batches):
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
    
        with torch.no_grad():
            outputs = model(**encoded_input)
            hidden_states = outputs.last_hidden_state
            
            batch_embeddings = hidden_states.mean(dim=1)
            batch_embeddings_np = batch_embeddings.cpu().numpy()
            all_embeddings.append(batch_embeddings_np)
    
    final_embeddings = np.vstack(all_embeddings)
    return final_embeddings

In [87]:
X_text = compute_embeddings(df['text'].tolist())

  0%|          | 0/119 [00:00<?, ?it/s]

In [89]:
X_text

array([[-0.00456306, -0.11667507,  0.62747175, ..., -0.0365919 ,
         0.10031683,  0.02927123],
       [-0.14233609, -0.19853906,  0.28455415, ..., -0.01139044,
        -0.1539977 ,  0.09535074],
       [ 0.1967225 , -0.08461303,  0.28200504, ...,  0.11395877,
        -0.06448036, -0.0128261 ],
       ...,
       [-0.28217438, -0.33324364,  0.29785   , ..., -0.35042733,
         0.03266051,  0.09537267],
       [-0.42807105, -0.3946874 ,  0.3094198 , ..., -0.05943286,
        -0.12965173,  0.0788705 ],
       [-0.16892146, -0.2514627 ,  0.47843295, ..., -0.18535408,
        -0.16108921,  0.27272928]], dtype=float32)