In [None]:
import os

if "COLAB_GPU" in os.environ:
    print("[INFO] Running in Google Colab, installing requirements.")
    !pip install -U torch # requires torch 2.1.1+ (for efficient sdpa implementation)
    !pip install PyMuPDF # for reading PDFs with Python
    !pip install tqdm # for progress bars
    !pip install sentence-transformers # for embedding models
    !pip install accelerate # for quantization model loading
    !pip install bitsandbytes # for quantizing models (less storage space)
    !pip install flash-attn --no-build-isolation # for faster attention mechanism = faster LLM inference

[INFO] Running in Google Colab, installing requirements.
Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.3
Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.47.0
Collecting flash-attn
  Downloading flash_attn-2.8.3.tar.gz (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m51.4 MB/s[0m e

In [None]:
import os
import requests

pdf_path = "Hands_on_ml_new_file.pdf"

if not os.path.exists(pdf_path):
    print("File doesn't exist.. Downloading PDF...")

    url = "https://www.clc.hcmus.edu.vn/wp-content/uploads/2017/11/Hands_On_Machine_Learning_with_Scikit_Learn_and_TensorFlow.pdf"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/91.0.4472.124 Safari/537.36"
    }

    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        with open(pdf_path, "wb") as file:
            file.write(response.content)
        print(f"The file has been downloaded and saved as {pdf_path}")
    else:
        print(f"Failed to download the file. Status code: {response.status_code}")
else:
    print(f"File {pdf_path} exists.")


File doesn't exist.. Downloading PDF...
The file has been downloaded and saved as Hands_on_ml_new_file.pdf


In [None]:
import fitz
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
    return text.replace("\n", " ").strip()

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []

    for page_number, page in tqdm(enumerate(doc)):
        page_text = page.get_text()
        text = text_formatter(page_text)

        pages_and_texts.append({
            "page_number": page_number,  # or page_number - 15 if needed
            "page_char_count": len(text),
            "page_word_count": len(text.split(" ")),
            "page_sentence_count_raw": len(text.split(". ")),
            "page_token_count": len(text) / 4,
            "text": text
        })

    return pages_and_texts   # now outside the loop

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[20]


0it [00:00, ?it/s]

{'page_number': 20,
 'page_char_count': 1904,
 'page_word_count': 273,
 'page_sentence_count_raw': 13,
 'page_token_count': 476.0,
 'text': 'John Wiley & Sons, Syngress, Morgan Kaufmann, IBM Redbooks, Packt, Adobe Press, FT Press, Apress, Manning, New Riders, McGraw-Hill, Jones & Bartlett, and Course Technology, among others. For more information, please visit http://oreilly.com/safari. How to Contact Us Please address comments and questions concerning this book to the publisher: O’Reilly Media, Inc. 1005 Gravenstein Highway North Sebastopol, CA 95472 800-998-9938 (in the United States or Canada) 707-829-0515 (international or local) 707-829-0104 (fax) We have a web page for this book, where we list errata, examples, and any additional information. You can access this page at http://bit.ly/hands-on-machine-learning- with-scikit-learn-and-tensorflow. To comment or ask technical questions about this book, send email to bookques‐ tions@oreilly.com. For more information about our books, co

In [None]:
import random
random.sample(pages_and_texts, k=2)

[{'page_number': 151,
  'page_char_count': 2037,
  'page_word_count': 364,
  'page_sentence_count_raw': 18,
  'page_token_count': 509.25,
  'text': '11 It is common to use the notation J(θ) for cost functions that don’t have a short name; we will often use this notation throughout the rest of this book. The context will make it clear which cost function is being dis‐ cussed. 12 Norms are discussed in Chapter 2. 13 A square matrix full of 0s except for 1s on the main diagonal (top-left to bottom-right). up very close to zero and the result is a flat line going through the data’s mean. Equa‐ tion 4-8 presents the Ridge Regression cost function.11 Equation 4-8. Ridge Regression cost function J θ = MSE θ + α1 2 ∑ i = 1 n θi 2 Note that the bias term θ0 is not regularized (the sum starts at i = 1, not 0). If we define w as the vector of feature weights (θ1 to θn), then the regularization term is simply equal to ½(∥ w ∥2)2, where ∥ · ∥2 represents the ℓ2 norm of the weight vector.12 For Grad

In [None]:
import pandas as pd
df = pd.DataFrame(pages_and_texts)
df.head(10)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,0,153,29,1,38.25,Aurélien Géron Hands-On Machine Learning w...
1,1,0,1,1,0.0,
2,2,214,27,1,53.5,Aurélien Géron Hands-On Machine Learning with ...
3,3,1934,266,13,483.5,978-1-491-96229-9 [M] Hands-On Machine Learnin...
4,4,2798,2010,159,699.5,Table of Contents Preface. . . . . . . . . . ....
5,5,4866,3795,63,1216.5,Check the Assumptions ...
6,6,4743,3804,111,1185.75,Exercises ...
7,7,4839,3792,149,1209.75,6. Decision Trees. . . . . . . . . . . . . . ....
8,8,4335,3199,82,1083.75,Kernel PCA ...
9,9,4698,3614,79,1174.5,Exercises ...


In [None]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,570.0,570.0,570.0,570.0,570.0
mean,284.5,1875.84,355.31,14.86,468.96
std,164.69,717.54,386.15,19.09,179.38
min,0.0,0.0,1.0,1.0,0.0
25%,142.25,1468.5,244.25,9.0,367.12
50%,284.5,1935.5,320.0,13.0,483.88
75%,426.75,2339.5,394.0,17.0,584.88
max,569.0,4866.0,3804.0,345.0,1216.5


In [None]:
from spacy.lang.en import English

nlp = English()
nlp.add_pipe("sentencizer")

doc = nlp("This is a sentence. This is another sentence.")
assert len(list(doc.sents)) == 2
list(doc.sents)

[This is a sentence., This is another sentence.]

In [None]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/570 [00:00<?, ?it/s]

In [None]:
random.sample(pages_and_texts, k=1)

[{'page_number': 401,
  'page_char_count': 2301,
  'page_word_count': 372,
  'page_sentence_count_raw': 17,
  'page_token_count': 575.25,
  'text': '15 This name is quite misleading since this layer does not perform a deconvolution, which is a well-defined mathematical operation (the inverse of a convolution). TensorFlow Convolution Operations TensorFlow also offers a few other kinds of convolutional layers: • tf.layers.conv1d() creates a convolutional layer for 1D inputs. This is useful, for example, in natural language processing, where a sentence may be repre‐ sented as a 1D array of words, and the receptive field covers a few neighboring words. • tf.layers.conv3d() creates a convolutional layer for 3D inputs, such as 3D PET scan. • tf.nn.atrous_conv2d() creates an atrous convolutional layer (“à trous” is French for “with holes”). This is equivalent to using a regular convolutional layer with a filter dilated by inserting rows and columns of zeros (i.e., holes). For example, a 1 × 3

In [None]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,570.0,570.0,570.0,570.0,570.0,570.0
mean,284.5,1875.84,355.31,14.86,468.96,13.96
std,164.69,717.54,386.15,19.09,179.38,7.25
min,0.0,0.0,1.0,1.0,0.0,0.0
25%,142.25,1468.5,244.25,9.0,367.12,10.0
50%,284.5,1935.5,320.0,13.0,483.88,13.0
75%,426.75,2339.5,394.0,17.0,584.88,17.0
max,569.0,4866.0,3804.0,345.0,1216.5,60.0


In [None]:
num_sent_chunk_size = 13
def split_list(input_list: list,
               slice_size: int) -> list[list]:
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

for item in tqdm(pages_and_texts):
    item["sentences_chunks"] = split_list(item["sentences"],slice_size= num_sent_chunk_size)
    item["num_sentences_chunks"] = len(item["sentences_chunks"])

  0%|          | 0/570 [00:00<?, ?it/s]

In [None]:
random.sample(pages_and_texts, k=1)

[{'page_number': 431,
  'page_char_count': 2263,
  'page_word_count': 419,
  'page_sentence_count_raw': 14,
  'page_token_count': 565.75,
  'text': 'Equation 14-4 summarizes how to compute the cell’s state at each time step for a sin‐ gle instance. Equation 14-4. GRU computations z t = σ Wxz T · x t + Whz T · h t −1 + bz r t = σ Wxr T · x t + Whr T · h t −1 + br g t = tanh Wxg T · x t + Whg T · r t ⊗h t −1 + bg h t = z t ⊗h t −1 + 1 −z t ⊗g t Creating a GRU cell in TensorFlow is trivial: gru_cell = tf.contrib.rnn.GRUCell(num_units=n_neurons) LSTM or GRU cells are one of the main reasons behind the success of RNNs in recent years, in particular for applications in natural language processing (NLP). Natural Language Processing Most of the state-of-the-art NLP applications, such as machine translation, automatic summarization, parsing, sentiment analysis, and more, are now based (at least in part) on RNNs. In this last section, we will take a quick look at what a machine trans‐ lation mod

In [None]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_sentences_chunks
count,570.0,570.0,570.0,570.0,570.0,570.0,570.0
mean,284.5,1875.84,355.31,14.86,468.96,13.96,1.53
std,164.69,717.54,386.15,19.09,179.38,7.25,0.67
min,0.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,142.25,1468.5,244.25,9.0,367.12,10.0,1.0
50%,284.5,1935.5,320.0,13.0,483.88,13.0,1.0
75%,426.75,2339.5,394.0,17.0,584.88,17.0,2.0
max,569.0,4866.0,3804.0,345.0,1216.5,60.0,5.0


In [None]:
import re
pages_and_chunks = []
for item in tqdm(pages_and_texts):
  for sentence_chunk in item["sentences_chunks"]:
    chunk_dict = {}
    chunk_dict["page_number"] = item["page_number"]

    joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
    joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo
    chunk_dict["sentence_chunk"] = joined_sentence_chunk

    chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
    chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
    chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters

    pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)






  0%|          | 0/570 [00:00<?, ?it/s]

870

In [None]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 25,
  'sentence_chunk': 'If you already know all the Machine Learning basics, you may want to skip directly to Chapter 2. If you are not sure, try to answer all the questions listed at the end of the chapter before moving on. What Is Machine Learning?Machine Learning is the science (and art) of programming computers so they can learn from data. Here is a slightly more general definition: [Machine Learning is the] field of study that gives computers the ability to learn without being explicitly programmed. —Arthur Samuel, 1959 And a more engineering-oriented one: A computer program is said to learn from experience E with respect to some task T and some performance measure P, if its performance on T, as measured by P, improves with experience E. —Tom Mitchell, 1997 For example, your spam filter is a Machine Learning program that can learn to flag spam given examples of spam emails (e.g., flagged by users) and examples of regular (nonspam, also called “ham”) emails. The e

In [None]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,870.0,870.0,870.0,870.0
mean,294.2,1205.72,209.86,301.43
std,163.33,685.57,191.08,171.39
min,0.0,3.0,1.0,0.75
25%,153.25,638.5,107.25,159.62
50%,304.0,1293.5,211.5,323.38
75%,433.75,1719.75,283.0,429.94
max,569.0,3079.0,2029.0,769.75


In [None]:
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(10).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')


Chunk token count: 7.75 | Text: Regularized Linear Models | 133
Chunk token count: 21.5 | Text: You can’t speed up time either; adding more computing Introduction to OpenAI Gym | 447
Chunk token count: 29.0 | Text: Most points in a high-dimensional hypercube are very close to the border.3 208 | Chapter 8: Dimensionality Reduction
Chunk token count: 9.0 | Text: 478 | Appendix A: Exercise Solutions
Chunk token count: 14.75 | Text: Parallelizing Neural Networks on a TensorFlow Cluster | 347
Chunk token count: 23.5 | Text: How can you tell that your model is overfitting or underfitting the data?Learning Curves | 125
Chunk token count: 6.25 | Text: Performance Measures | 87
Chunk token count: 29.75 | Text: Note that all the convolutional layers use the ReLU activation function.374 | Chapter 13: Convolutional Neural Networks
Chunk token count: 12.75 | Text: [ 3. 4. 0. 0.] [ 5. 6. 0. 0.]] [[ 1. 0. 0. 0.] [ 0.
Chunk token count: 15.25 | Text: 4. Can you run two graphs in the same session?Exerc

In [None]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]


[{'page_number': 0,
  'sentence_chunk': 'Aurélien Géron Hands-On  Machine Learning  with Scikit-Learn  & TensorFlow  CONCEPTS, TOOLS, AND TECHNIQUES  TO BUILD INTELLIGENT SYSTEMS powered by',
  'chunk_char_count': 148,
  'chunk_word_count': 24,
  'chunk_token_count': 37.0},
 {'page_number': 2,
  'sentence_chunk': 'Aurélien Géron Hands-On Machine Learning with Scikit-Learn and TensorFlow Concepts, Tools, and Techniques to Build Intelligent Systems Boston Farnham Sebastopol Tokyo Beijing Boston Farnham Sebastopol Tokyo Beijing',
  'chunk_char_count': 214,
  'chunk_word_count': 27,
  'chunk_token_count': 53.5}]

In [None]:
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,870.0,870.0,870.0,870.0
mean,294.2,1205.72,209.86,301.43
std,163.33,685.57,191.08,171.39
min,0.0,3.0,1.0,0.75
25%,153.25,638.5,107.25,159.62
50%,304.0,1293.5,211.5,323.38
75%,433.75,1719.75,283.0,429.94
max,569.0,3079.0,2029.0,769.75


### Embedding Part a head   

In [None]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path ="all-mpnet-base-v2",
                                      device="cpu")
sentences = [
    "The Sentences Transformers library provides an easy and open-source way to create embeddings.",
    "Sentences can be embedded one by one or as a list of strings.",
    "Embeddings are one of the most powerful concepts in machine learning!",
    "Learn to use embeddings well and you'll be well on your way to being an AI engineer."
]

embeddings = embedding_model.encode(sentences)
embeddings_dict =  dict(zip(sentences, embeddings))

for sentence, embedding in embeddings_dict.items():
    print(f'Sentence: {sentence} | Embedding: {embedding}')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Sentence: The Sentences Transformers library provides an easy and open-source way to create embeddings. | Embedding: [-2.07982697e-02  3.03164832e-02 -2.01217849e-02  6.86484650e-02
 -2.55256221e-02 -8.47686455e-03 -2.07225574e-04 -6.32377118e-02
  2.81606894e-02 -3.33353989e-02  3.02633960e-02  5.30721806e-02
 -5.03527038e-02  2.62288321e-02  3.33313718e-02 -4.51577231e-02
  3.63044813e-02 -1.37122418e-03 -1.20171458e-02  1.14947259e-02
  5.04510924e-02  4.70856987e-02  2.11913940e-02  5.14606535e-02
 -2.03746483e-02 -3.58889401e-02 -6.67763175e-04 -2.94393823e-02
  4.95859198e-02 -1.05639677e-02 -1.52014112e-02 -1.31758570e-03
  4.48197424e-02  1.56023465e-02  8.60379430e-07 -1.21392624e-03
 -2.37978697e-02 -9.09368275e-04  7.34484056e-03 -2.53933994e-03
  5.23370504e-02 -4.68043424e-02  1.66214760e-02  4.71579395e-02
 -4.15599644e-02  9.01976076e-04  3.60277519e-02  3.42214219e-02
  9.68227163e-02  5.94829023e-02 -1.64984372e-02 -3.51249315e-02
  5.92516130e-03 -7.07903586e-04 -2.41

In [None]:
single_sentence = "Yo! How cool are embeddings?"
single_embedding = embedding_model.encode(single_sentence)
print(f"Sentence: {single_sentence}")
print(f"Embedding:\n{single_embedding}")
print(f"Embedding size: {single_embedding.shape}")

Sentence: Yo! How cool are embeddings?
Embedding:
[-1.97448116e-02 -4.51077102e-03 -4.98487009e-03  6.55444860e-02
 -9.87674482e-03  2.72836145e-02  3.66426148e-02 -3.30219790e-03
  8.50078743e-03  8.24952591e-03 -2.28497721e-02  4.02430184e-02
 -5.75200766e-02  6.33691922e-02  4.43207175e-02 -4.49506752e-02
  1.25284623e-02 -2.52011865e-02 -3.55293043e-02  1.29559012e-02
  8.67022015e-03 -1.92917809e-02  3.55635886e-03  1.89505499e-02
 -1.47128170e-02 -9.39848926e-03  7.64176017e-03  9.62184835e-03
 -5.98920928e-03 -3.90168764e-02 -5.47824688e-02 -5.67456381e-03
  1.11644436e-02  4.08067554e-02  1.76319099e-06  9.15305689e-03
 -8.77257995e-03  2.39382889e-02 -2.32784264e-02  8.04999918e-02
  3.19176912e-02  5.12598502e-03 -1.47708468e-02 -1.62525177e-02
 -6.03213198e-02 -4.35689725e-02  4.51211631e-02 -1.79053713e-02
  2.63366811e-02 -3.47866565e-02 -8.89172778e-03 -5.47675416e-02
 -1.24372449e-02 -2.38606725e-02  8.33496973e-02  5.71241751e-02
  1.13328276e-02 -1.49595076e-02  9.2037

In [None]:
%%time

# Uncomment to see how long it takes to create embeddings on CPU
# # Make sure the model is on the CPU
embedding_model.to("cpu")

# Embed each chunk one by one
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])


  0%|          | 0/814 [00:00<?, ?it/s]

CPU times: user 17min 5s, sys: 7.43 s, total: 17min 12s
Wall time: 17min 35s


In [None]:
# Turn text chunks into a single list
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]


In [None]:
%%time

# Embed all texts in batches
text_chunk_embeddings = embedding_model.encode(
    text_chunks,
    batch_size=32,               # adjust based on your GPU/CPU memory
    convert_to_tensor=True,      # returns torch.Tensor
    show_progress_bar=True       # ✅ adds tqdm progress bar
)

text_chunk_embeddings


Batches:   0%|          | 0/26 [00:00<?, ?it/s]

CPU times: user 17min 14s, sys: 4min 10s, total: 21min 24s
Wall time: 21min 58s


tensor([[ 0.0022,  0.0450, -0.0299,  ...,  0.0374, -0.0108,  0.0050],
        [ 0.0087,  0.0394, -0.0373,  ...,  0.0565,  0.0095, -0.0053],
        [ 0.0148,  0.0318, -0.0414,  ...,  0.0177, -0.0037, -0.0014],
        ...,
        [-0.0179, -0.0009, -0.0142,  ...,  0.0008, -0.0242, -0.0305],
        [ 0.0185,  0.0438, -0.0404,  ...,  0.0475,  0.0362, -0.0479],
        [ 0.0258,  0.0464,  0.0062,  ...,  0.0555,  0.0306, -0.0321]])

In [46]:
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [49]:
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.sample(5)

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
534,387,"Figure 13-7. Padding options—input width: 13, ...",1554,239,388.5,[ 3.21674012e-02 -2.06930693e-02 -5.41840738e-...
730,513,The ELU activation function is a good default....,1814,311,453.5,[ 5.25547145e-03 -2.23357547e-02 9.90909897e-...
145,109,If you are confused about the confusion matrix...,1273,216,318.25,[ 1.83111392e-02 -8.34181458e-02 1.21128671e-...
7,8,Kernel PCA ...,2855,1719,713.75,[-1.34945568e-02 3.52669060e-02 -4.19800319e-...
598,428,Note that Tensor‐ Flow initializes bf to a vec...,157,28,39.25,[-4.71787341e-02 -5.45895398e-02 3.85217392e-...


In [52]:
import random
import torch
import numpy as np
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load your CSV file
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# Convert the string embeddings "[0.1 0.2 ...]" into numpy arrays
text_chunks_and_embedding_df['embedding'] = text_chunks_and_embedding_df['embedding'].apply(
    lambda x: np.fromstring(x.strip("[]"), sep=" ")
)

# Convert to list of dicts for convenience
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

# Convert embeddings to torch tensor
embeddings = torch.tensor(
    np.array(text_chunks_and_embedding_df["embedding"].tolist()),
    dtype=torch.float32
).to(device)

print(embeddings.shape)




Using device: cpu
torch.Size([814, 768])


In [53]:
text_chunks_and_embedding_df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,0,Aurélien Géron Hands-On Machine Learning wit...,148,24,37.0,"[0.00221082987, 0.0449508652, -0.0299461465, 0..."
1,2,Aurélien Géron Hands-On Machine Learning with ...,214,27,53.5,"[0.00865716301, 0.0394063592, -0.0372873917, 0..."
2,3,978-1-491-96229-9 [M] Hands-On Machine Learnin...,1933,265,483.25,"[0.0147812506, 0.0318412296, -0.0414493605, 0...."
3,4,Table of Contents Preface. . . . . . . . . . ....,1918,1130,479.5,"[-0.0144850714, 0.0365120843, -0.0375104211, 0..."
4,5,Check the Assumptions ...,3079,2008,769.75,"[0.0374273397, 0.0144489156, -0.0458006114, -0..."


In [56]:
embeddings[0]

tensor([ 2.2108e-03,  4.4951e-02, -2.9946e-02,  3.6254e-02, -1.0012e-03,
         1.5632e-02,  3.9714e-02, -9.5655e-03,  2.6288e-02, -3.3305e-03,
         2.1505e-02,  3.4660e-02, -8.7466e-03,  8.6895e-02,  2.9053e-02,
        -2.7002e-02,  4.7789e-02,  1.6869e-02,  6.8070e-03, -2.1586e-02,
        -1.5434e-02, -1.4221e-02, -5.9148e-03,  3.0551e-02, -5.4952e-02,
         3.8958e-02, -1.0932e-02,  2.0858e-02,  3.5252e-02,  5.3712e-03,
         2.2708e-03, -1.9837e-03, -4.7705e-03,  1.5541e-01,  1.8933e-06,
        -1.0342e-02,  2.0546e-02,  5.6112e-03, -2.2005e-02, -9.9224e-03,
         3.8736e-02,  2.0816e-02,  2.1808e-02, -6.5073e-03, -5.5299e-02,
        -1.7632e-02,  2.6972e-02, -4.9609e-03,  1.5509e-02,  3.0593e-02,
         3.1994e-03,  3.0453e-02,  7.8200e-02, -2.7555e-02, -2.8228e-02,
        -1.4865e-03, -2.4752e-02, -2.9802e-02,  3.8059e-04, -5.2415e-02,
         1.5955e-02,  5.1663e-02,  4.7411e-02, -2.7640e-02,  9.7311e-02,
         3.9614e-05, -5.1501e-02, -5.2240e-02, -2.4

In [57]:
from sentence_transformers import util, SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device=device)

In [62]:
# 1. Define the query
query = "What do you mean by Unsupervised Learning"
print(f"Query: {query}")

# 2. Embed the query to the same numerical space as the text examples
# Note: It's important to embed your query with the same model you embedded your examples with.
query_embedding = embedding_model.encode(query, convert_to_tensor=True)

# 3. Get similarity scores with the dot product (we'll time this for fun)
from time import perf_counter as timer

start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_time = timer()

print(f"Time take to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

# 4. Get the top-k results (we'll keep this to 5)
top_results_dot_product = torch.topk(dot_scores, k=3)
top_results_dot_product



Query: What do you mean by Unsupervised Learning
Time take to get scores on 814 embeddings: 0.00076 seconds.


torch.return_types.topk(
values=tensor([0.7618, 0.6378, 0.6170]),
indices=tensor([35, 36, 37]))

In [63]:
larger_embeddings = torch.randn(100*embeddings.shape[0], 768).to(device)
print(f"Embeddings shape: {larger_embeddings.shape}")

# Perform dot product across 168,000 embeddings
start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=larger_embeddings)[0]
end_time = timer()

print(f"Time take to get scores on {len(larger_embeddings)} embeddings: {end_time-start_time:.5f} seconds.")



Embeddings shape: torch.Size([81400, 768])
Time take to get scores on 81400 embeddings: 0.02600 seconds.


In [64]:
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)


In [65]:
print(f"Query: '{query}'\n")
print("Results:")
# Loop through zipped together scores and indicies from torch.topk
for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
    print(f"Score: {score:.4f}")
    # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
    print("Text:")
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    # Print the page number too so we can reference the textbook further (and check the results)
    print(f"Page number: {pages_and_chunks[idx]['page_number']}")
    print("\n")

Query: 'What do you mean by Unsupervised Learning'

Results:
Score: 0.7618
Text:
Unsupervised learning In unsupervised learning, as you might guess, the training
data is unlabeled (Figure 1-7). The system tries to learn without a teacher.
Figure 1-7. An unlabeled training set for unsupervised learning Here are some of
the most important unsupervised learning algorithms (we will cover
dimensionality reduction in Chapter 8): • Clustering — k-Means — Hierarchical
Cluster Analysis (HCA) — Expectation Maximization • Visualization and
dimensionality reduction — Principal Component Analysis (PCA) — Kernel PCA —
Locally-Linear Embedding (LLE) — t-distributed Stochastic Neighbor Embedding
(t-SNE) • Association rule learning — Apriori — Eclat For example, say you have
a lot of data about your blog’s visitors. You may want to run a clustering
algorithm to try to detect groups of similar visitors (Figure 1-8). At no point
do you tell the algorithm which group a visitor belongs to: it finds those
c

In [69]:
import torch

def dot_product(vector1, vector2):
    return torch.dot(vector1, vector2)

def cosine_similarity(vector1, vector2):
    dot_product = torch.dot(vector1, vector2)

    # Get Euclidean/L2 norm of each vector (removes the magnitude, keeps direction)
    norm_vector1 = torch.sqrt(torch.sum(vector1**2))
    norm_vector2 = torch.sqrt(torch.sum(vector2**2))

    return dot_product / (norm_vector1 * norm_vector2)

# Example tensors
vector1 = torch.tensor([1, 2, 3], dtype=torch.float32)
vector2 = torch.tensor([1, 2, 3], dtype=torch.float32)
vector3 = torch.tensor([4, 5, 6], dtype=torch.float32)
vector4 = torch.tensor([-1, -2, -3], dtype=torch.float32)

# Calculate dot product
print("Dot product between vector1 and vector2:", dot_product(vector1, vector2))
print("Dot product between vector1 and vector3:", dot_product(vector1, vector3))
print("Dot product between vector1 and vector4:", dot_product(vector1, vector4))

# Calculate cosine similarity
print("Cosine similarity between vector1 and vector2:", cosine_similarity(vector1, vector2))
print("Cosine similarity between vector1 and vector3:", cosine_similarity(vector1, vector3))
print("Cosine similarity between vector1 and vector4:", cosine_similarity(vector1, vector4))



Dot product between vector1 and vector2: tensor(14.)
Dot product between vector1 and vector3: tensor(32.)
Dot product between vector1 and vector4: tensor(-14.)
Cosine similarity between vector1 and vector2: tensor(1.0000)
Cosine similarity between vector1 and vector3: tensor(0.9746)
Cosine similarity between vector1 and vector4: tensor(-1.0000)


In [74]:
def retrieve_relevant_resources(query: str,
                                embeddings: torch.tensor,
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=5,
                                print_time: bool=True):
    """
    Embeds a query with model and returns top k scores and indices from embeddings.
    """

    # Embed the query
    query_embedding = model.encode(query,
                                   convert_to_tensor=True)

    # Get dot product scores on embeddings
    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()

    if print_time:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

    scores, indices = torch.topk(input=dot_scores,
                                 k=n_resources_to_return)

    return scores, indices

def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[dict]=pages_and_chunks,
                                 n_resources_to_return: int=3):
    """
    Takes a query, retrieves most relevant resources and prints them out in descending order.

    Note: Requires pages_and_chunks to be formatted in a specific way (see above for reference).
    """

    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources_to_return)

    print(f"Query: {query}\n")
    print("Results:")
    # Loop through zipped together scores and indicies
    for score, index in zip(scores, indices):
        print(f"Score: {score:.4f}")
        # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
        print_wrapped(pages_and_chunks[index]["sentence_chunk"])
        # Print the page number too so we can reference the textbook further and check the results
        print(f"Page number: {pages_and_chunks[index]['page_number']}")
        print("\n")


In [75]:

query = "how to implement a CNN"

# Get just the scores and indices of top related results
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
scores, indices

[INFO] Time taken to get scores on 814 embeddings: 0.00048 seconds.


(tensor([0.6355, 0.5768, 0.5753, 0.5582, 0.5415]),
 tensor([539, 536, 534, 541, 522]))

In [76]:
print_top_results_and_scores(query=query,
                             embeddings=embeddings)



[INFO] Time taken to get scores on 814 embeddings: 0.00046 seconds.
Query: how to implement a CNN

Results:
Score: 0.6355
and width) and the depth dimension, so either ksize[1] and ksize[2] must both be
equal to 1, or ksize[3] must be equal to 1. To create an average pooling layer,
just use the avg_pool() function instead of max_pool(). Now you know all the
building blocks to create a convolutional neural network. Let’s see how to
assemble them. CNN Architectures Typical CNN architectures stack a few
convolutional layers (each one generally fol‐ lowed by a ReLU layer), then a
pooling layer, then another few convolutional layers (+ReLU), then another
pooling layer, and so on. The image gets smaller and smaller as it progresses
through the network, but it also typically gets deeper and deeper (i.e., with
more feature maps) thanks to the convolutional layers (see Figure 13-9). At the
top of the stack, a regular feedforward neural network is added, composed of a
few fully connected layers 