In [1]:
# !pip install nltk
# !pip install PyPDF2

In [2]:
# One-time setup: uncomment and run to download the NLTK resources used below.
# import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')

In [3]:
import PyPDF2
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
import string
import re

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF and return it as one string.

    Pages are joined with a newline so that the last word of one page does
    not fuse with the first word of the next page.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts; an empty string when the document has
        no pages or no extractable text.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extraction must happen while the file is still open.
        # `or ""` keeps the join safe if a page yields no text at all.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    return "\n".join(page_texts)

In [4]:
def data_cleaning(text):
    """Replace every newline and tab with a single space and trim both ends.

    Args:
        text: Raw text to flatten.

    Returns:
        The text with '\\n' and '\\t' turned into spaces and leading/trailing
        whitespace removed. Runs of spaces are not collapsed.
    """
    # One translation table handles both characters in a single pass.
    layout_to_space = str.maketrans({'\n': ' ', '\t': ' '})
    return text.translate(layout_to_space).strip()

In [5]:
def text_normalization(text):
    """Lower-case text, expand common contractions, and strip punctuation.

    Contractions are expanded *before* punctuation is removed; once the
    apostrophes and periods are gone, suffixes such as "n't" or "'s" can no
    longer be recognised.

    Args:
        text: Text to normalise.

    Returns:
        The lower-cased text with contractions expanded, ASCII punctuation
        removed, and whitespace collapsed to single spaces.

    Note:
        "'s" is always expanded to " is", so possessives ("john's") become
        "john is" as well.
    """
    text = text.lower()
    # Map typographic apostrophes to the ASCII one so the patterns below match.
    text = text.replace("\u2019", "'").replace("\u2018", "'")

    # Forms whose stem changes and therefore cannot be handled by suffix rules.
    irregular = {
        "can't": "cannot",
        "won't": "will not",
        "shan't": "shall not",
    }
    contractions = {
        "n't": " not",
        "'s": " is",
        "'re": " are",
        "'ll": " will",
        "'d": " would",
        "'ve": " have",
    }

    # "i.e" / "i.e." as a standalone abbreviation.
    text = re.sub(r"\bi\.e\b\.?", "that is", text)
    text = re.sub(
        r"\b(can't|won't|shan't)\b",
        lambda match: irregular[match.group(1)],
        text,
    )
    # Suffix contractions: must follow a word character and end the word.
    text = re.sub(
        r"(?<=\w)(n't|'s|'re|'ll|'d|'ve)\b",
        lambda match: contractions[match.group(1)],
        text,
    )

    text = text.translate(str.maketrans('', '', string.punctuation))
    # split/join collapses any run of whitespace into a single space.
    return " ".join(text.split())

In [6]:
def tokenization(text):
    """Split text into tokens by delegating to NLTK's ``word_tokenize``.

    Args:
        text: The text to tokenize.

    Returns:
        Whatever ``word_tokenize`` returns for ``text``.
    """
    return word_tokenize(text)

In [7]:
def stop_words_removal(words):
    """Drop English stop words from a sequence of tokens.

    The comparison is case-insensitive; tokens that are kept retain their
    original casing and order.

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list containing only the tokens that are not stop words.
    """
    english_stops = frozenset(stopwords.words('english'))
    kept = []
    for token in words:
        if token.lower() in english_stops:
            continue
        kept.append(token)
    return kept

In [8]:
def lemmatization(words):
    """Lemmatize each token with NLTK's ``WordNetLemmatizer``.

    Every token is lemmatized with ``pos='v'``, i.e. treated as a verb
    regardless of its actual part of speech.

    Args:
        words: Iterable of string tokens.

    Returns:
        A list with the lemmatized form of each token, in input order.
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in words:
        lemmas.append(wordnet.lemmatize(token, pos='v'))
    return lemmas

In [9]:
def part_of_speech_tagging(words):
    """Tag tokens with their part of speech via NLTK's ``pos_tag``.

    Args:
        words: Sequence of string tokens.

    Returns:
        Whatever ``pos_tag`` returns for ``words``.
    """
    return pos_tag(words)

In [10]:
def save_to_text_file(processed_text, output_path):
    """Write text to a file as UTF-8, replacing any existing content.

    Args:
        processed_text: The string to write.
        output_path: Destination file path.
    """
    with open(output_path, mode='w', encoding='utf-8') as sink:
        sink.write(processed_text)

In [11]:
def preprocess_and_save_to_txt(input_pdf_path, output_txt_path):
    """Run the full preprocessing pipeline on a PDF and save a text report.

    Stages, in order: text extraction, cleaning, normalization, tokenization,
    stop-word removal, lemmatization, and part-of-speech tagging. The output
    of each stage after extraction is written as a labelled section, with
    sections separated by a blank line.

    Args:
        input_pdf_path: Path of the PDF to process.
        output_txt_path: Path of the report file to write.
    """
    raw = extract_text_from_pdf(input_pdf_path)
    cleaned = data_cleaning(raw)
    normalized = text_normalization(cleaned)
    tokens = tokenization(normalized)
    content_tokens = stop_words_removal(tokens)
    lemmas = lemmatization(content_tokens)
    tagged = part_of_speech_tagging(lemmas)

    # One entry per stage; joined below with a blank line between sections.
    sections = (
        f"Cleaned Text: {cleaned}",
        f"Normalized Text: {normalized}",
        f"Tokenized Words: {' '.join(tokens)}",
        f"Filtered Words: {' '.join(content_tokens)}",
        f"Lemmatized Words: {' '.join(lemmas)}",
        f"Part-of-Speech Tags: {str(tagged)}",
    )

    save_to_text_file("\n\n".join(sections), output_txt_path)

In [12]:
# Both arguments are placeholders: replace them with a real PDF path and the
# desired report path before running this cell.
preprocess_and_save_to_txt(
    input_pdf_path='input pdf path',
    output_txt_path='input for processed file path',
)