In [35]:
import requests
import os 

# Project root is the parent of the current working directory.
root_path = os.path.dirname(os.path.abspath(os.getcwd()))
# Directory that holds the PDF files used in this notebook.
pdf_dir_path = os.path.join(root_path, 'pdfs')

In [36]:
# Local path of the sample PDF; it is both the download target and the file read later on.
sample_pdf_path = os.path.join(pdf_dir_path, 'monopoly.pdf')

def pdf_downloader(pdf_path: str, url, timeout: float = 30):
    """Download ``url`` to ``pdf_path`` unless a file already exists there.

    Progress and outcome are reported with ``print``; nothing is returned.

    Args:
        pdf_path: Destination path of the PDF on disk.
        url: Address the PDF is fetched from with an HTTP GET.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Raises:
        requests.RequestException: Connection problems and timeouts raised by
            ``requests.get`` are not caught here and propagate to the caller.
    """
    # Guard clause: never overwrite or re-download an existing file.
    if os.path.exists(pdf_path):
        print("File already exists.")
        return

    print("File doesnot exists, downloading...")
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print("Error downloading pdf.")
        return

    # Create the destination folder on first use; a bare file name has no
    # directory part, in which case there is nothing to create.
    parent_dir = os.path.dirname(pdf_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(pdf_path, 'wb') as f:
        f.write(response.content)
        print("File saved successfully!")

In [37]:
# Fetch the sample rulebook; when the file is already on disk the download is skipped.
pdf_downloader(pdf_path=sample_pdf_path, url='https://www.hasbro.com/common/instruct/00009.pdf')

File already exists.


In [38]:
## Extract text with pypdf

## Example: read one page and display its raw extracted text

from pypdf import PdfReader

# Open the sample PDF from its local path.
reader = PdfReader(sample_pdf_path)
# Index 3 is used as the sample page (zero-based, i.e. the fourth page).
page = reader.pages[3]

# Last expression of the cell, so the extracted string is shown as the cell output.
page.extract_text()

'"GO": Each time a player\'s token lands on or passes over \nGO, whether by throwing the dice or drawing a card, \nthe Banker pays himther a $200 sala,ry. \nThe $200 is paid only once each time kound \nthe board. However, if a player passing GO on \nthe throw of the dice lands 2 spaces beyond it \non Community Chest, or 7 spaces beyond \nit on Chance, and draws the "Advance to GO" card, helshe collects \n$200 for passing GO the first time and another $200 for reaching it the \nsecond time by instructions on the card. \nBUYING PROPERTY: Whenever you land on an unowned property you \nmay buy that property from the Bank at its printed price. You receive the \nTitle Deed card showing ownership; place it faceup in front of you. \nIf you do not wish to buy the property, the Banker sells it at auction \nto the highest bidder. The buyer pays the Bank the amount of the bid \nin cash and receives the Title Deed card for that property. Any player, \nincluding the one who declined the option to bu

In [39]:
## Let's extract the text of every page

# Build the full document text with a single join instead of repeated `+=`.
# A newline is placed between pages so the last word of one page cannot run
# into the first word of the next.
all_texts = "\n".join(page.extract_text() for page in reader.pages)
all_texts

'MONOPOLY \nProperty Trading Game from Parker Brothers" \nAGES 8+ \n2 to 8 Players \nContents: Gameboard, 3 dice, tokens, 32 houses, I2 hotels, Chance \nand Community Chest cards, Title Deed cards, play money and a Banker\'s tray. \nNow there\'s a faster way to play MONOPOLY. Choose to play by \nthe classic rules for buying, renting and selling properties or use the \nSpeed Die to get into the action faster. If you\'ve never played the classic \nMONOPOLY game, refer to the Classic Rules beginning on the next page. \nIf you already know how to play and want to use the Speed Die, just \nread the section below for the additional Speed Die rules. \nSPEED DIE RULES \nLearnins how to Play with the S~eed Die IS as \n/ \nfast as playing with i\'t. \n1. When starting the game, hand out an extra $1,000 to each player \n(two $5005 should work). The game moves fast and you\'ll need \nthe extra cash to buy and build. \n2. Do not use the Speed Die until you\'ve landed on or passed over \nGO for the 

In [40]:
def cleaner(text: str):
    """Keep only the whitespace-separated tokens that are purely alphanumeric.

    The text is split on whitespace and a token survives only when
    ``str.isalnum()`` is true for it. A word with any punctuation attached
    (for example ``"tokens,"``) is therefore removed entirely, not trimmed.
    The surviving tokens are joined back together with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token.isalnum():
            kept_tokens.append(token)
    return " ".join(kept_tokens)

In [41]:
# Quick check on a short string: the symbol-only tokens should be dropped.
cleaner("hi my name is anishka 37 and 23 &&& this $^$%")

'hi my name is anishka 37 and 23 this'

In [42]:
# Apply the same token filter to the text of the whole document.
cleaner(all_texts)

'MONOPOLY Property Trading Game from Parker AGES 2 to 8 Players 3 32 I2 Chance and Community Chest Title Deed play money and a Now a faster way to play Choose to play by the classic rules for renting and selling properties or use the Speed Die to get into the action If never played the classic MONOPOLY refer to the Classic Rules beginning on the next If you already know how to play and want to use the Speed just read the section below for the additional Speed Die SPEED DIE RULES Learnins how to Play with the Die IS as fast as playing with When starting the hand out an extra to each player should The game moves fast and need the extra cash to buy and Do not use the Speed Die until landed on or passed over GO for the first Once you collect that first use the Speed Die for the rest of the This means that some players will start using the die before Once you start using the Speed roll it along with the two white dice on your Then do the following depending on what you or Add this number to

In [43]:
## trying PyMuPDF
## !pip install PyMuPDF tqdm

import fitz
from tqdm.auto import tqdm ## progress bars

def text_formatter(text: str) -> str:
    """Flatten text onto a single line.

    Each newline character is replaced by one space, then leading and
    trailing whitespace is removed. Runs of spaces inside the text are kept
    as they are.
    """
    single_line = " ".join(text.split("\n"))
    return single_line.strip()

def open_read_pdfs(pdf_path: str) -> list[dict]:
    """Read a PDF with PyMuPDF and collect each page's text with simple size stats.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        One dict per page, in document order, with the keys ``page_number``
        (zero-based position in the document), ``char_count``,
        ``page_word_count``, ``page_sentence_count``, ``page_token_count``
        and ``text`` (the page text after ``text_formatter``).
    """
    pages_and_texts = []
    # The context manager closes the document handle even if extraction fails.
    with fitz.open(pdf_path) as doc:
        # enumerate() has no length, so tqdm is given the page count explicitly;
        # otherwise the bar cannot show a total or a percentage.
        for page_number, page in tqdm(enumerate(doc), total=len(doc)):
            text = text_formatter(text=page.get_text())

            pages_and_texts.append({"page_number": page_number,
                                    "char_count": len(text),
                                    # Split on single spaces: consecutive spaces
                                    # yield empty entries that are counted too.
                                    "page_word_count": len(text.split(" ")),
                                    # Naive sentence count based on ". " only.
                                    "page_sentence_count": len(text.split(". ")),
                                    # Presumably a rough estimate that assumes
                                    # about 4 characters per token.
                                    "page_token_count": len(text) / 4,
                                    "text": text
                                    })
    return pages_and_texts

In [44]:
# Extract every page of the sample PDF into a list of per-page dicts.
pages_and_texts = open_read_pdfs(pdf_path=sample_pdf_path)
# Displays the complete list, including the full text of every page.
pages_and_texts

0it [00:00, ?it/s]

[{'page_number': 0,
  'char_count': 1394,
  'page_word_count': 291,
  'page_sentence_count': 18,
  'page_token_count': 348.5,
  'text': 'MONOPOLY  Property Trading Game from Parker Brothers"  AGES 8+  2 to 8 Players  Contents: Gameboard, 3 dice, tokens, 32 houses, I2 hotels, Chance  and Community Chest cards, Title Deed cards, play money and a Banker\'s tray.  Now there\'s a faster way to play MONOPOLY. Choose to play by  the classic rules for buying, renting and selling properties or use the  Speed Die to get into the action faster. If you\'ve never played the classic  MONOPOLY game, refer to the Classic Rules beginning on the next page.  If you already know how to play and want to use the Speed Die, just  read the section below for the additional Speed Die rules.  SPEED DIE RULES  Learnins how to Play with the S ~ e e d   Die IS as  /  fast as playing with i\'t.  1. When starting the game, hand out an extra $1,000 to each player  (two $5005 should work). The game moves fast and you\'

In [45]:
import random

# Spot-check two randomly chosen pages. No seed is set in the visible cells,
# so the pages shown can differ from run to run.
random.sample(pages_and_texts, k=2)

[{'page_number': 3,
  'char_count': 2089,
  'page_word_count': 408,
  'page_sentence_count': 17,
  'page_token_count': 522.25,
  'text': '"GO": Each time a player\'s token lands on or passes over  GO, whether by throwing the dice or drawing a card,  the Banker pays himther a $200 sala,ry.  The $200 is paid only once each time kound  the board. However, if a player passing GO on  the throw of the dice lands 2 spaces beyond it  on Community Chest, or 7 spaces beyond  it on Chance, and draws the "Advance to GO" card, helshe collects  $200 for passing GO the first time and another $200 for reaching it the  second time by instructions on the card.  BUYING PROPERTY: Whenever you land on an unowned property you  may buy that property from the Bank at its printed price. You receive the  Title Deed card showing ownership; place it faceup in front of you.  If you do not wish to buy the property, the Banker sells it at auction  to the highest bidder. The buyer pays the Bank the amount of the bid 

In [46]:
import pandas as pd 

# One row per page; the columns come from the keys of each page dict.
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,char_count,page_word_count,page_sentence_count,page_token_count,text
0,0,1394,291,18,348.5,MONOPOLY Property Trading Game from Parker Br...
1,1,1847,389,19,461.75,"Bus: This lets you ""get off the bus early."" Lo..."
2,2,2202,444,21,550.5,"Each player is given $1,500 divided as follows..."
3,3,2089,408,17,522.25,"""GO"": Each time a player's token lands on or p..."
4,4,2047,429,16,511.75,instructions and return the card facedown to t...


In [47]:
# Summary statistics for the numeric per-page columns.
df.describe()

Unnamed: 0,page_number,char_count,page_word_count,page_sentence_count,page_token_count
count,8.0,8.0,8.0,8.0,8.0
mean,3.5,1992.5,401.375,18.75,498.125
std,2.44949,266.420184,47.5603,3.370036,66.605046
min,0.0,1394.0,291.0,16.0,348.5
25%,1.75,1997.0,401.75,16.75,499.25
50%,3.5,2073.0,414.5,17.5,518.25
75%,5.25,2122.5,424.5,19.5,530.625
max,7.0,2208.0,444.0,26.0,552.0


In [48]:
## Split the pages into sentences with spaCy

from spacy.lang.en import English

# English pipeline object that the rest of the notebook uses as `nlp`.
nlp = English()

# Add the "sentencizer" component so processed docs expose sentences via `.sents`.
nlp.add_pipe("sentencizer")

# Small example to confirm the sentence splitting works.
doc = nlp("This is a sentence. I love pizza. I am very lucky")


# Materialize the sentence spans so they are shown as the cell output.
list(doc.sents)

[This is a sentence., I love pizza., I am very lucky]

In [49]:
for item in tqdm(pages_and_texts):
    # Run the pipeline on the page text and keep every sentence as a plain
    # string rather than a spaCy span.
    item["sentences"] = [str(sentence) for sentence in nlp(item["text"]).sents]

    # Number of sentences found by spaCy for this page.
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/8 [00:00<?, ?it/s]

In [50]:
# Look at two random pages again, now that the sentence fields have been added.
random.sample(pages_and_texts, k=2)

[{'page_number': 7,
  'char_count': 2208,
  'page_word_count': 421,
  'page_sentence_count': 26,
  'page_token_count': 552.0,
  'text': 'BANKRUPTCY.. You are declared bankrupt if you owe more than you  can pay either to another player or to the Bank. If your  ,  debt is to another player, you must tum over to that  player all that you have of value and retire from the  game. In making this settlement, if you own houses or  hotels, you must retum these to the Bank in exchange  for money to the extent of one-half the amount paid  for them; this cash is given to the creditor. If you have  mortgaged property you also turn this property over  to your creditor but the new owner must at once pay .  the Bank the amount of interest on the loan, which is 10% of the value of  the property. The new owner who does this may then, at hislher option,  pay the principal or hold the property until some later turn, then lift the  mortgage. If helshe holds property in this way until a later turn, helshe  

In [51]:
# Rebuild the frame (overwriting `df`) so it includes the spaCy sentence count column.
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,char_count,page_word_count,page_sentence_count,page_token_count,page_sentence_count_spacy
count,8.0,8.0,8.0,8.0,8.0,8.0
mean,3.5,1992.5,401.38,18.75,498.12,18.38
std,2.45,266.42,47.56,3.37,66.61,2.92
min,0.0,1394.0,291.0,16.0,348.5,15.0
25%,1.75,1997.0,401.75,16.75,499.25,16.75
50%,3.5,2073.0,414.5,17.5,518.25,17.5
75%,5.25,2122.5,424.5,19.5,530.62,19.5
max,7.0,2208.0,444.0,26.0,552.0,24.0


In [52]:
# Default number of items (sentences) placed in one chunk.
num_sentences_chunk_size = 10

def split_text_to_chunks(input_list: list,
                         slice_size: int=num_sentences_chunk_size) -> list[list[str]]:
    """Split a list into consecutive sub-lists of at most ``slice_size`` items.

    Order is preserved. Every chunk has exactly ``slice_size`` items except
    possibly the last one, which holds whatever remains. An empty input gives
    an empty list.
    """
    chunks = []
    for start in range(0, len(input_list), slice_size):
        chunks.append(input_list[start:start + slice_size])
    return chunks

# Demo with 25 items and the default chunk size.
split_text_to_chunks(list(range(25)))

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [53]:
# Print the keys of every page dict to check that all pages carry the same fields.
for item in pages_and_texts:
    print(item.keys())

dict_keys(['page_number', 'char_count', 'page_word_count', 'page_sentence_count', 'page_token_count', 'text', 'sentences', 'page_sentence_count_spacy'])
dict_keys(['page_number', 'char_count', 'page_word_count', 'page_sentence_count', 'page_token_count', 'text', 'sentences', 'page_sentence_count_spacy'])
dict_keys(['page_number', 'char_count', 'page_word_count', 'page_sentence_count', 'page_token_count', 'text', 'sentences', 'page_sentence_count_spacy'])
dict_keys(['page_number', 'char_count', 'page_word_count', 'page_sentence_count', 'page_token_count', 'text', 'sentences', 'page_sentence_count_spacy'])
dict_keys(['page_number', 'char_count', 'page_word_count', 'page_sentence_count', 'page_token_count', 'text', 'sentences', 'page_sentence_count_spacy'])
dict_keys(['page_number', 'char_count', 'page_word_count', 'page_sentence_count', 'page_token_count', 'text', 'sentences', 'page_sentence_count_spacy'])
dict_keys(['page_number', 'char_count', 'page_word_count', 'page_sentence_count', 

In [55]:
# Group each page's sentences into consecutive chunks of
# `num_sentences_chunk_size` sentences and store them under "sentence_chunks".

for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_text_to_chunks(item["sentences"], num_sentences_chunk_size)
    # Print the whole page dict so the new field can be inspected.
    print(item)


  0%|          | 0/8 [00:00<?, ?it/s]

{'page_number': 0, 'char_count': 1394, 'page_word_count': 291, 'page_sentence_count': 18, 'page_token_count': 348.5, 'text': 'MONOPOLY  Property Trading Game from Parker Brothers"  AGES 8+  2 to 8 Players  Contents: Gameboard, 3 dice, tokens, 32 houses, I2 hotels, Chance  and Community Chest cards, Title Deed cards, play money and a Banker\'s tray.  Now there\'s a faster way to play MONOPOLY. Choose to play by  the classic rules for buying, renting and selling properties or use the  Speed Die to get into the action faster. If you\'ve never played the classic  MONOPOLY game, refer to the Classic Rules beginning on the next page.  If you already know how to play and want to use the Speed Die, just  read the section below for the additional Speed Die rules.  SPEED DIE RULES  Learnins how to Play with the S ~ e e d   Die IS as  /  fast as playing with i\'t.  1. When starting the game, hand out an extra $1,000 to each player  (two $5005 should work). The game moves fast and you\'ll need  th

In [56]:
# Rebuild the frame (overwriting `df`) so it includes the sentence chunk column.
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,char_count,page_word_count,page_sentence_count,page_token_count,text,sentences,page_sentence_count_spacy,sentence_chunks
0,0,1394,291,18,348.5,MONOPOLY Property Trading Game from Parker Br...,[MONOPOLY Property Trading Game from Parker B...,18,[[MONOPOLY Property Trading Game from Parker ...
1,1,1847,389,19,461.75,"Bus: This lets you ""get off the bus early."" Lo...","[Bus: This lets you ""get off the bus early."", ...",19,"[[Bus: This lets you ""get off the bus early."",..."
2,2,2202,444,21,550.5,"Each player is given $1,500 divided as follows...","[Each player is given $1,500 divided as follow...",21,"[[Each player is given $1,500 divided as follo..."
3,3,2089,408,17,522.25,"""GO"": Each time a player's token lands on or p...","[""GO"": Each time a player's token lands on or ...",17,"[[""GO"": Each time a player's token lands on or..."
4,4,2047,429,16,511.75,instructions and return the card facedown to t...,[instructions and return the card facedown to ...,15,[[instructions and return the card facedown to...


In [58]:
# Inspect three random pages, which now also carry their sentence chunks.
random.sample(pages_and_texts, k=3)

[{'page_number': 7,
  'char_count': 2208,
  'page_word_count': 421,
  'page_sentence_count': 26,
  'page_token_count': 552.0,
  'text': 'BANKRUPTCY.. You are declared bankrupt if you owe more than you  can pay either to another player or to the Bank. If your  ,  debt is to another player, you must tum over to that  player all that you have of value and retire from the  game. In making this settlement, if you own houses or  hotels, you must retum these to the Bank in exchange  for money to the extent of one-half the amount paid  for them; this cash is given to the creditor. If you have  mortgaged property you also turn this property over  to your creditor but the new owner must at once pay .  the Bank the amount of interest on the loan, which is 10% of the value of  the property. The new owner who does this may then, at hislher option,  pay the principal or hold the property until some later turn, then lift the  mortgage. If helshe holds property in this way until a later turn, helshe  