In [45]:
import os
import pypdf
from gensim.utils import simple_preprocess
from nltk.tokenize import sent_tokenize


<pre>

1) My strategy is to read the PDF files and save their contents into a list; this list will serve as our corpus.

2) For that, I have to set up the paths from which the files will be read.

- Each file that is read has to be stored as a text string first,
- then this text string has to be split into a list of sentences using sent_tokenize,
- then each sentence has to be word-tokenized (if needed) and stored in a list.


>> Let's first design a helper function that reads a PDF document and returns its contents as text.



In [46]:
# Input PDFs and the processed output live in sibling folders under ../data.
raw_data_path, processed_data_path = (
    os.path.join("..", "data", subfolder) for subfolder in ("raw", "processed")
)

In [47]:
# Defining a helper function:


# Note: this will not work for scanned PDFs, where each page is an image that merely looks like text!


def get_text_from_pdf(pdf_path):
    """Return the extractable text of a PDF as a single string.

    Every page that yields text contributes that text followed by a newline;
    pages for which nothing is extracted are skipped.

    Reading is best-effort: if any exception is raised while opening or
    reading the file, the error is printed and whatever text was collected
    up to that point is returned (an empty string if nothing was read).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts, one trailing newline per page.
    """
    chunks = []

    try:
        for pdf_page in pypdf.PdfReader(pdf_path).pages:
            extracted = pdf_page.extract_text()
            if not extracted:
                # Nothing extractable on this page (None or empty string).
                continue
            chunks.append(extracted + "\n")
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")

    return "".join(chunks)
        

<pre>

1) We have four PDF files here; we will read through each of them.
2) We will extract the raw text.
3) Then we will split the text into sentences.
4) Finally, we will split each sentence into words (using simple_preprocess).

Note: simple_preprocess automatically lowercases the text and removes punctuation.



In [48]:
# Build the corpus: a list of sentences, each sentence being a list of word tokens.
corpus = []

# os.listdir() returns entries in arbitrary order; sorting makes the corpus
# (and the file saved from it) come out in the same order on every run/platform.
for filename in sorted(os.listdir(raw_data_path)):
    if filename.endswith(".pdf"):
        file_path = os.path.join(raw_data_path, filename)
        print(f"Processing: {filename}...")

        # Extract the raw text (empty string if the PDF could not be read).
        raw_text = get_text_from_pdf(file_path)

        # Line breaks in the extracted text would otherwise end up inside
        # sentences, so flatten them to spaces before sentence splitting.
        clean_text = raw_text.replace("\n", " ")

        # Split the cleaned text into sentences.
        sentences = sent_tokenize(clean_text)

        # Use simple_preprocess from gensim.utils to lowercase, strip
        # punctuation and numbers, and tokenize each sentence into words.
        for sentence in sentences:
            tokens = simple_preprocess(sentence)

            # Skip sentences that end up with no tokens after preprocessing.
            if tokens:
                corpus.append(tokens)

print(f"Total Sentences processed: {len(corpus)}")

Processing: god_father_1.pdf...
Processing: god_father_2.pdf...
Processing: god_father_returns.pdf...
Processing: the sicilian.pdf...
Total Sentences processed: 29575


In [49]:
# Peek at the first few tokenized sentences. Slicing (rather than indexing
# 0..4) avoids an IndexError when the corpus holds fewer than five sentences.
for tokens in corpus[:5]:
    print(tokens)

['pearson', 'education', 'limited', 'edinburgh', 'gate', 'harlow', 'essex', 'cm', 'je', 'england', 'nd', 'associated', 'companies', 'throughout', 'the', 'world']
['isbn', 'isbn', 'first', 'published', 'in', 'great', 'britain', 'by', 'random', 'house', 'uk', 'ltd', 'this', 'adaptation', 'published', 'by', 'penguin', 'books', 'published', 'by', 'addison', 'wesley', 'longman', 'limited', 'and', 'penguin', 'books', 'ltd', 'new', 'edition', 'first', 'published', 'original', 'copyright', 'mario', 'puzo', 'adaptation', 'copyright', 'chris', 'rice', 'photographs', 'copyright', 'paramount', 'reproduced', 'by', 'courtesy', 'of', 'the', 'ronald', 'grant', 'archive', 'all', 'rights', 'reserved', 'typeset', 'by', 'digital', 'type', 'london', 'set', 'in', 'll', 'pt', 'bembo', 'printed', 'in', 'china', 'swtc', 'all', 'rights', 'reserved', 'no', 'part', 'of', 'this', 'publication', 'may', 'be', 'reproduced', 'stored', 'in', 'retrieval', 'system', 'or', 'transmitted', 'in', 'any', 'form', 'or', 'by', '

In [50]:
# Save the processed corpus: one sentence per line, tokens separated by spaces.

output_file = os.path.join(processed_data_path, "god_father_corpus.txt")

# open() does not create missing directories, so make sure the target folder
# exists first (no-op if it is already there).
os.makedirs(processed_data_path, exist_ok=True)

# Plain "w" is sufficient: the file is only written here, never read back
# through this handle.
with open(output_file, "w", encoding="utf-8") as f:
    for sentence in corpus:
        f.write(" ".join(sentence) + "\n")

print(f"Processed file saved to: {output_file}")

Processed file saved to: ..\data\processed\god_father_corpus.txt


<pre style = "text-align: center; color: red; font-size: 24px">

☠️This notebook ends here, as its target has been achieved☠️