# Document/Text Processing and Embedding Creation

In [1]:
import os
import requests

pdf_path = "human-nutrition-text.pdf"


# Download PDF if it doesn't already exist
if not os.path.exists(pdf_path):
  print("File doesn't exist, downloading...")
  url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
  filename = pdf_path

  response = requests.get(url)

  if response.status_code == 200:
      # Open a file in binary write mode and save the content to it
      with open(filename, "wb") as file:
          file.write(response.content)
      print(f"The file has been downloaded and saved as {filename}")
  else:
      print(f"Failed to download the file. Status code: {response.status_code}")
else:
  print(f"File {pdf_path} exists.")

File human-nutrition-text.pdf exists.


In [2]:
!pip install --quiet PyMuPdf
import fitz #PyMuPdf

!pip install --quiet ipywidgets
from tqdm.auto import tqdm


def text_formatter(text: str) -> str:
    """
    Replaces newline character (\n) with a white space (" ")
    """
    cleaned_text = text.replace("\n", " ").strip()
    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF, reads its text content page by page and gathers statistics.
    
    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.
        
    Returns:
         list[dict]: A list of dictionaries, each containing the page number
        (adjusted), character count, word count, sentence count, token count, and the extracted text
        for each page.
    """
    
    doc = fitz.open(pdf_path)
    pages_and_text = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text)
        pages_and_text.append({"page_number": page_number - 41,  # adjust page numbers since our PDF starts on page 42
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # 1 token = ~4 chars
                                "text": text})
    
    return pages_and_text
    
pages_and_text = open_and_read_pdf(pdf_path=pdf_path)

0it [00:00, ?it/s]

In [3]:
import random
print(random.sample(pages_and_text, k = 3))

[{'page_number': 771, 'page_char_count': 1429, 'page_word_count': 227, 'page_sentence_count_raw': 9, 'page_token_count': 357.25, 'text': 'beverages within an appropriate calorie level. A healthy eating  pattern includes2:  • A variety of vegetables from all of the subgroups—dark green,  red and orange, legumes (beans and peas), starchy, and other  • Fruits, especially whole fruits  • Grains, at least half of which are whole grains  • Fat-free or low-fat dairy, including milk, yogurt, cheese, and/ or fortified soy beverages  • A variety of protein foods, including seafood, lean meats and  poultry, eggs, legumes (beans and peas), and nuts, seeds, and  soy products  Oils  A healthy eating pattern limits:  • Saturated fats and trans fats, added sugars, and sodium  • Cholesterol, in order to limit saturated fats.  Previously, the recommendation for cholesterol was less than 300  mg/day of cholesterol for the general public, and less than 200  mg/day for those with cardiovascular disease ris

# EDA

In [4]:
import pandas as pd

df = pd.DataFrame(pages_and_text)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,145,2,199.25,Contents Preface University of Hawai‘i at Mā...


In [5]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0
std,348.86,560.38,95.76,6.19,140.1
min,-41.0,0.0,1.0,1.0,0.0
25%,260.75,762.0,134.0,4.0,190.5
50%,562.5,1231.5,214.5,10.0,307.88
75%,864.25,1603.5,271.0,14.0,400.88
max,1166.0,2308.0,429.0,32.0,577.0


# Further Text Processing

In [6]:
!pip install --quiet spacy

In [7]:
from spacy.lang.en import English # see https://spacy.io/usage for install instructions

nlp = English()

# Add a sentencizer pipeline, see https://spacy.io/api/sentencizer/ 
nlp.add_pipe("sentencizer")

# Create a document instance as an example
doc = nlp("This is a sentence. This another sentence.")
assert len(list(doc.sents)) == 2

# Access the sentences of the document
list(doc.sents)

[This is a sentence., This another sentence.]

In [8]:
for item in tqdm(pages_and_text):
    item["sentences"] = list(nlp(item["text"]).sents)
    
    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    
    # Count the sentences 
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [9]:
random.sample(pages_and_text, k=1)

[{'page_number': 365,
  'page_char_count': 1054,
  'page_word_count': 190,
  'page_sentence_count_raw': 11,
  'page_token_count': 263.5,
  'text': 'Amino acids  are classified  into four  groups.  These are  nonpolar,  polar, acidic,  and basic.  Essential and Nonessential Amino Acids  Amino acids are further classified based on nutritional aspects.  Recall that there are twenty different amino acids, and we require all  of them to make the many different proteins found throughout the  body. Eleven of these are called nonessential amino acids because  the body can synthesize them. However, nine of the amino acids  are called essential amino acids because we cannot synthesize them  either at all or in sufficient amounts. These must be obtained from  the diet. Sometimes during infancy, growth, and in diseased states  the body cannot synthesize enough of some of the nonessential  amino acids and more of them are required in the diet. These types  of amino acids are called conditionally es

In [10]:
df = pd.DataFrame(pages_and_text)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0,10.32
std,348.86,560.38,95.76,6.19,140.1,6.3
min,-41.0,0.0,1.0,1.0,0.0,0.0
25%,260.75,762.0,134.0,4.0,190.5,5.0
50%,562.5,1231.5,214.5,10.0,307.88,10.0
75%,864.25,1603.5,271.0,14.0,400.88,15.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0


# Chunking Sentences Together

In [11]:
num_sentence_chunk_size = 10 

def split_list(input_list: list, slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into sublists of size slice_size (or as close as possible).
    For example, a list of 17 sentences would be split into two lists of [[10], [7]]
    """
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_text):
    item["sentence_chunks"] = split_list(input_list = item["sentences"], slice_size = num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [12]:
random.sample(pages_and_text, k=1)

[{'page_number': 674,
  'page_char_count': 295,
  'page_word_count': 51,
  'page_sentence_count_raw': 3,
  'page_token_count': 73.75,
  'text': 'recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.  \xa0 An interactive or media element has been  excluded from this version of the text. You can  view it online here:  http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=386  \xa0 674  |  Zinc',
  'sentences': ['recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.',
   ' \xa0 An interactive or media element has been  excluded from this version of the text.',
   'You can  view it online here:  http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=386  \xa0 674  |  Zinc'],
  'page_sentence_count_spacy': 3,
  'sentence_chunks': [['recommended that users complete these activities using a  desktop or laptop computer and in Google Chrome.',
    ' \xa0 An interactive or media element has bee

In [13]:
# stats
df = pd.DataFrame(pages_and_text)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0,1208.0
mean,562.5,1148.0,198.3,9.97,287.0,10.32,1.53
std,348.86,560.38,95.76,6.19,140.1,6.3,0.64
min,-41.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,260.75,762.0,134.0,4.0,190.5,5.0,1.0
50%,562.5,1231.5,214.5,10.0,307.88,10.0,1.0
75%,864.25,1603.5,271.0,14.0,400.88,15.0,2.0
max,1166.0,2308.0,429.0,32.0,577.0,28.0,3.0


# Splitting each chunk into its own item

In [14]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_text):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo 
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
        
        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

  0%|          | 0/1208 [00:00<?, ?it/s]

1843

In [15]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 894,
  'sentence_chunk': '894 | Late Adolescence',
  'chunk_char_count': 22,
  'chunk_word_count': 4,
  'chunk_token_count': 5.5}]

In [16]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.38,734.44,112.33,183.61
std,347.79,447.54,71.22,111.89
min,-41.0,12.0,3.0,3.0
25%,280.5,315.0,44.0,78.75
50%,586.0,746.0,114.0,186.5
75%,890.0,1118.5,173.0,279.62
max,1166.0,1831.0,297.0,457.75


# Filtering Useless Text like Headers and Footers

In [17]:
# Show random chunks with under 30 tokens in length
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 24.75 | Text: U. S. Food and Drug Administration. https://www.fda.gov/food/ 1030 | The Effect of New Technologies
Chunk token count: 3.5 | Text: 190 | Chloride
Chunk token count: 3.25 | Text: 622 | Calcium
Chunk token count: 15.75 | Text: PART IV CHAPTER 4. CARBOHYDRATES Chapter 4. Carbohydrates | 227
Chunk token count: 9.75 | Text: 1002 | The Causes of Food Contamination


In [18]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE',
  'chunk_char_count': 308,
  'chunk_word_count': 42,
  'chunk_token_count': 77.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and Human Nutrition Program is licensed under a Creative Commons Attribution 4.0 International License, except where otherwise noted.',
  'chunk_char_count': 210,
  'chunk_word_count': 30,
  'chunk_token_count': 52.5}]

# Embedding Our Text chunks

In [19]:
!pip install --quiet sentence-transformers

In [20]:
!pip install --upgrade torch



In [69]:
pip install --upgrade transformers

Note: you may need to restart the kernel to use updated packages.


In [26]:
from sentence_transformers import SentenceTransformer

In [27]:
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device="cpu")
# https://huggingface.co/sentence-transformers/all-mpnet-base-v2#usage-huggingface-transformers
sentences = [
    "Hi! I'm Akshit.",
    "This is my first time using sentence embeddings."
]


embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))


for sentence, embedding in embeddings_dict.items():
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("\n")

Sentence: Hi! I'm Akshit.
Embedding: [ 3.35864499e-02 -4.84011434e-02 -2.75562648e-02  2.21025618e-03
  4.48272750e-02 -1.78489573e-02  3.15079316e-02  3.55328713e-03
  6.24577962e-02 -1.94140673e-02 -5.91808278e-03 -4.63166460e-02
  7.75601203e-03  5.24999425e-02  4.39382419e-02 -3.19524445e-02
  4.89878207e-02 -3.92394252e-02  2.39046905e-02  1.26382597e-02
  4.91684489e-02  3.92709672e-03 -9.79111716e-03  5.95356449e-02
  1.72576006e-03 -1.11961861e-04  2.43880283e-02 -2.54431479e-02
  4.94966470e-03  4.86356318e-02  7.81201273e-02 -1.59947425e-02
  1.76511873e-02  1.35999555e-02  2.05053720e-06  6.60407217e-03
 -2.29905564e-02 -7.81758595e-03 -8.48241895e-03 -1.83347869e-03
  2.93705729e-03  2.69134082e-02 -4.67941910e-02  2.78875846e-02
  3.92217226e-02  5.41816466e-02  3.67822573e-02  2.90999264e-02
  6.76722005e-02  5.64308614e-02 -1.19386511e-02 -3.35658826e-02
  1.20938234e-02 -3.61050218e-02 -1.48695717e-02  1.93679929e-02
  1.81785505e-02 -6.71384484e-02 -7.46351629e-02  2.7

In [28]:
single_sentence = "Nigga please we both dead inside."
single_embedding = embedding_model.encode(single_sentence)
print(f"Sentence: {single_sentence}")
print(f"Embedding:\n{single_embedding}")
print(f"Embedding size: {single_embedding.shape}")

Sentence: Nigga please we both dead inside.
Embedding:
[ 2.86354236e-02  1.04440086e-01  1.31157609e-02  1.34248957e-02
  2.05052793e-02  1.72482617e-02 -5.04559763e-02 -3.72468261e-03
 -2.07202323e-02  6.47825375e-03 -7.26150274e-02  1.10343033e-02
  2.09318921e-02  5.04623838e-02  9.36104078e-03  1.87774319e-02
 -1.66521408e-02  4.04065214e-02 -1.00222062e-02 -3.11453156e-02
  3.61606441e-02  4.99652699e-02  3.75323594e-02  2.79572215e-02
  1.19474120e-02  3.58291715e-02  1.04875481e-02  3.76846194e-02
 -7.98078924e-02  1.02194250e-02 -9.77478456e-03  2.58596800e-02
 -5.48083447e-02 -3.82697880e-02  2.32818115e-06  4.23821509e-02
  8.74935358e-04  1.15526654e-02 -3.31509998e-03 -4.07830849e-02
 -7.82831013e-02 -3.30301225e-02  4.28884439e-02 -2.15419158e-02
 -6.60691485e-02 -4.31448258e-02 -5.59423119e-02  1.27691915e-02
  3.95471789e-02  6.18458353e-02 -1.32513950e-02 -2.69188099e-02
 -3.60742584e-03 -3.03036179e-02 -9.87125654e-03  5.94752543e-02
  1.04352096e-02  2.62363106e-02  6

In [29]:
%%time

# Embed each chunk one by one
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/1680 [00:00<?, ?it/s]

CPU times: user 7min 59s, sys: 43 s, total: 8min 42s
Wall time: 2min 11s


# Save Embeddings In .CSV

In [30]:
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [32]:
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-39,Human Nutrition: 2020 Edition UNIVERSITY OF HA...,308,42,77.0,[ 6.74242601e-02 9.02280509e-02 -5.09550143e-...
1,-38,Human Nutrition: 2020 Edition by University of...,210,30,52.5,[ 5.52156381e-02 5.92137948e-02 -1.66167673e-...
2,-37,Contents Preface University of Hawai‘i at Māno...,766,114,191.5,[ 2.79801637e-02 3.39813903e-02 -2.06426941e-...
3,-36,Lifestyles and Nutrition University of Hawai‘i...,941,142,235.25,[ 6.82566762e-02 3.81274410e-02 -8.46856367e-...
4,-35,The Cardiovascular System University of Hawai‘...,998,152,249.5,[ 3.30263861e-02 -8.49776343e-03 9.57154110e-...
