In [None]:
pip install PyMuPDF

In [None]:
import os
import fitz
import nltk
import re
import string
import gensim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.svm import SVC


# For tokenization
from nltk.tokenize import word_tokenize
nltk.download("punkt")
nltk.download('wordnet')

# For removing stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')

# For lemmatization
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')

# For TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# For N-grams
from collections import defaultdict
from nltk.util import ngrams as nltk_ngrams
from nltk.tokenize import word_tokenize

# For building LDA model
from gensim import corpora
from gensim.models import LdaModel

# For visualizing the LDA model
import pyLDAvis.gensim_models as gensimvis

# For evaluating the LDA model
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvisualize

### Get text from PDFs

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file; handed directly to ``fitz.open``.

    Returns
    -------
    str
        The ``get_text()`` output of each page joined without a separator
        (an empty string when the document has no pages).
    """
    # The context manager closes the document once all pages have been read
    with fitz.open(pdf_path) as document:
        page_texts = [
            document.load_page(page_index).get_text()
            for page_index in range(len(document))
        ]
    return "".join(page_texts)

### Preprocess and vectorize text

*Preprocess pipeline*

In [None]:
# Translating Treebank POS tags into WordNet POS constants for lemmatization
def wordnet_pos_tags(treebank_tag):
    """Map a Treebank-style POS tag to the matching WordNet POS constant.

    Tags starting with 'J', 'V' and 'R' map to adjective, verb and adverb
    respectively; every other tag (including those starting with 'N') maps
    to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    # Noun tags and any unrecognised tag share the same fallback
    return wordnet.NOUN

In [None]:
# Corpus-specific stopwords that are merged into the NLTK English list during preprocessing
custom_stopwords = {
    'g', 'figure', 'thank', 'you', 'mbove', 'print', 'info', 'escalate', 'isbn',
    'volume', 'dr', 'vol', 'phd', 'upon', 'al', 'e', 'fig', 'cl',
    'hap', 'et', 'http', 'ew', 'climate', 'jun', 'simon', 'yang', 'go',
    'copyright', 'ltd', 'license', 'creative', 'imply', 'dot', 'dashed', 'line', 'may',
    'either', 'bi', 'de', 'la', 'paix', 'tel', 'box', 'fax', 'front',
    'oli', 'scarff', 'getty', 'image', 'back', 'atlas', 'ness', 'net', 'san',
    'felipe', 'also', 'informa', 'tion', 'per', 'cent', 'due', 'distri', 'bution',
    'com', 'bin', 'thus', 'km', 'grid', 'would', 'christopher', 'u', 'aa',
    'aap', 'aas', 'ab', 'abares', 'abatzoglou', 'abbs', 'abc', 'aberystwyth', 'doi',
}

def txt_preprocess_pipeline(text):
    """Turn raw document text into a list of lower-cased, lemmatized word tokens.

    Steps, in order: collapse whitespace, tokenize, keep only alphabetic
    tokens that are not stopwords (lower-casing them), POS-tag the survivors,
    and lemmatize each one with the WordNet POS mapped from its tag.

    Parameters
    ----------
    text : str
        Raw text of one document.

    Returns
    -------
    list of str
        The lemmatized tokens in their original order.
    """
    # Line breaks become spaces, then every whitespace run shrinks to one space
    normalized = re.sub(r'\s+', ' ', re.sub(r'\n', ' ', text)).strip()

    raw_tokens = nltk.word_tokenize(normalized)

    # NLTK's English stopwords combined with the notebook's custom additions
    excluded = set(stopwords.words('english')).union(custom_stopwords)

    # Keep alphabetic tokens only, lower-cased, and drop the stopwords
    kept = []
    for raw_token in raw_tokens:
        if not raw_token.isalpha():
            continue
        lowered = raw_token.lower()
        if lowered not in excluded:
            kept.append(lowered)

    # Tag the cleaned tokens, then lemmatize each with the POS derived from its tag
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(token, wordnet_pos_tags(tag))
        for token, tag in nltk.pos_tag(kept)
    ]

*Iteration function*

In [None]:
def iterate_pdf_files(pdf_folder, max_texts=None, n=2):
    """Extract and preprocess the PDFs found directly inside a folder.

    Parameters
    ----------
    pdf_folder : str
        Directory scanned (non-recursively) for PDF files.
    max_texts : int or None, optional
        Upper bound on the number of PDFs to process. ``None`` means no
        limit; ``0`` or a negative value yields an empty result.
    n : int, optional
        Unused; kept so existing calls that pass it keep working.

    Returns
    -------
    dict
        Maps ``'doc1'``, ``'doc2'``, ... to a one-element list holding the
        token list returned by ``txt_preprocess_pipeline`` for that PDF.
    """
    # Creating a dictionary to hold all documents and their tokens together
    docs = {}

    # os.listdir returns entries in arbitrary order; sorting keeps the docN
    # numbering stable between runs and machines
    for filename in sorted(os.listdir(pdf_folder)):

        # Checking the limit before doing any work, so max_texts=0 processes nothing
        if max_texts is not None and len(docs) >= max_texts:
            break

        # Checking whether the entry is a PDF, accepting the extension in any letter case
        if not filename.lower().endswith('.pdf'):
            continue

        # Creating a path to the PDF file and skipping anything that is not a regular file
        pdf_path = os.path.join(pdf_folder, filename)
        if not os.path.isfile(pdf_path):
            continue

        # Extracting the content of the file
        text = extract_text_from_pdf(pdf_path)

        # Applying the preprocessing pipeline to the extracted text
        preprocessed_text = txt_preprocess_pipeline(text)

        # Generating a readable document name from the number of documents stored so far
        doc_name = f'doc{len(docs) + 1}'

        # Storing the token list wrapped in a one-element list; the later cells
        # unwrap exactly one level when they iterate over docs.values()
        docs[doc_name] = [preprocessed_text]

    return docs

# Folder that holds the PDFs. Set the NLP_DOCS_DIR environment variable to point the
# notebook at another location; the original folder stays as the default.
pdf_folder = os.environ.get("NLP_DOCS_DIR", "/Users/aybikesahinoglu/Desktop/NLP-Docs")

# Setting the maximum number of texts to iterate over (None processes every PDF)
max_texts_to_process = None
docs = iterate_pdf_files(pdf_folder, max_texts=max_texts_to_process)

# Printing each document's entry; every value is a one-element list wrapping the token list
for doc_name, tokens in docs.items():
    print(f"{doc_name}: {tokens}")

*Token dictionary*

In [None]:
# Every value in docs is a one-element list wrapping one document's token list;
# chaining the values unwraps that level and leaves one token list per document
all_preprocessed_text = list(itertools.chain.from_iterable(docs.values()))

# Building the gensim dictionary that assigns an integer ID to each token
dictionary = corpora.Dictionary(all_preprocessed_text)
print(dictionary)

*BoW representation*

In [None]:
# Converting each document's token list into its Bag-of-Words (BoW) form
# with the dictionary built in the previous step
bow_corpus = list(map(dictionary.doc2bow, all_preprocessed_text))
pprint.pprint(bow_corpus)

### Document term matrix

In [None]:
# Convert BoW corpus to matrix
def bow_to_matrix(bow_corpus, num_terms=None):
    """Build a dense document-term count matrix from a BoW corpus.

    Parameters
    ----------
    bow_corpus : sequence
        One entry per document, each an iterable of ``(token_id, count)`` pairs.
    num_terms : int or None, optional
        Number of matrix columns. When omitted, ``len(dictionary)`` of the
        notebook-level ``dictionary`` is used, which is what the function
        relied on before this parameter existed.

    Returns
    -------
    numpy.ndarray
        ``int32`` array of shape ``(len(bow_corpus), num_terms)`` whose cell
        ``[i, token_id]`` holds the count of that token in document ``i``.
    """
    if num_terms is None:
        num_terms = len(dictionary)

    num_docs = len(bow_corpus)
    dtm_matrix = np.zeros((num_docs, num_terms), dtype=np.int32)

    for i, doc in enumerate(bow_corpus):
        for id_, count in doc:
            dtm_matrix[i, id_] = count

    return dtm_matrix

# Building the dense document-term matrix for the whole BoW corpus
dtm_matrix = bow_to_matrix(bow_corpus=bow_corpus)

# Showing the matrix (one row per document, one column per dictionary token ID)
print(dtm_matrix)

### N-grams

In [None]:
# Producing the n-grams of a token sequence
def generate_ngrams(tokens, n=2):
    """Return the n-grams of ``tokens`` as a list.

    Parameters
    ----------
    tokens : sequence
        Tokens of one document.
    n : int, optional
        Size of each n-gram (default 2).

    Returns
    -------
    list
        Everything yielded by ``nltk.util.ngrams(tokens, n)``, in order.
    """
    return [*nltk_ngrams(tokens, n)]

# Creating n-gram model from a corpus
def create_ngram_model(corpus, n=2):
    """Count how often each n-gram occurs across a corpus.

    Parameters
    ----------
    corpus : iterable
        The documents. A ``str`` document is tokenized with ``word_tokenize``;
        any other iterable is treated as an already tokenized document and
        used as-is, so preprocessed token lists do not have to be joined into
        a string and tokenized a second time.
    n : int, optional
        Size of each n-gram (default 2).

    Returns
    -------
    collections.defaultdict
        Maps each n-gram to its total count over all documents; n-grams that
        were never seen read as 0.
    """
    ngram_model = defaultdict(int)
    for document in corpus:
        # Only raw strings need tokenizing; token sequences pass through unchanged
        if isinstance(document, str):
            tokens = word_tokenize(document)
        else:
            tokens = list(document)
        for ngram in generate_ngrams(tokens, n):
            ngram_model[ngram] += 1
    return ngram_model

# Reusing the docs built by the iteration cell: calling iterate_pdf_files again here
# would re-read and re-preprocess every PDF and rebind docs, which could then differ
# from the documents that dictionary and bow_corpus were built from

# Unwrapping the one-element list stored per document to get one token list per document
corpus_tokens = [tokens for sublist in docs.values() for tokens in sublist]

# Joining the tokens of each document into a single string
corpus_strings = [' '.join(tokens) for tokens in corpus_tokens]

# Creating an n-gram model (n=3, i.e. trigrams)
ngram_model = create_ngram_model(corpus_strings, n=3)

# Printing all n-grams
print("All N-grams:")
for ngram, frequency in ngram_model.items():
    print(ngram, ":", frequency)

### LDA

In [None]:
# Setting training parameters
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None
random_state = 42  # fixed seed so that repeated runs produce the same topics

# Making an index to word dictionary.
# The lookup below makes gensim populate dictionary.id2token before it is read
temp = dictionary[0]
id2word = dictionary.id2token

model = LdaModel(
    corpus=bow_corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto', # With alpha and eta set to auto, the model learns the sparsity of the document-topic (alpha) and topic-word (eta) distributions
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state,
)

# Using the pprint module imported at the top of the notebook. Importing the pprint
# function under the same name here would shadow that module and make the earlier
# pprint.pprint(bow_corpus) cell fail when it is re-run.
top_topics = model.top_topics(bow_corpus)
pprint.pprint(top_topics)

### Visualizing the LDA model

In [None]:
# Preparing the pyLDAvis data for the trained model, its BoW corpus and its dictionary
vis = gensimvis.prepare(topic_model=model, corpus=bow_corpus, dictionary=dictionary)

# Rendering the prepared visualization as the cell output
pyLDAvis.display(vis)