In [None]:
pip install PyMuPDF

In [None]:
pip install BERTopic

In [None]:
import os
import fitz
import nltk
import re
import string
import gensim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
import pprint


# For tokenization
from nltk.tokenize import word_tokenize
nltk.download("punkt")
nltk.download('wordnet')

# For removing stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')

# For lemmatization
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')

# For TF-IDC
from gensim import corpora
from gensim.models import TfidfModel

# For N-grams
from collections import defaultdict
from nltk.util import ngrams as nltk_ngrams
from nltk.tokenize import word_tokenize

# For BERT topic
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# For coherence score
from gensim.models.coherencemodel import CoherenceModel

### Get text from PDFs

In [5]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as pdf:
        for page_num in range(len(pdf)):
            page = pdf.load_page(page_num)
            text += page.get_text()
    return text

### Preprocess and vectorize text

*Preprocess pipeline*

In [6]:
# Loading WordNet POS tags for lemmatization
def wordnet_pos_tags(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [8]:
def txt_preprocess_pipeline(text):

    # Removing multiple white spaces and line breaks
    clean_txt = re.sub(r'\n', ' ', text)
    clean_txt = re.sub(r'\s+', ' ', clean_txt)
    clean_txt = clean_txt.strip()

    # Tokenizing text
    tokens = nltk.word_tokenize(clean_txt)

    # Loading NLTK stopword list and adding original stopwords
    stop_words = set(stopwords.words('english')+['ehst event','ehst','th','lll','reasonable','good','good good','na','na na','city','oecd','urban','red','paper','forum','figgs','enfield','literature','estimate','model','effect','impact','extreme','study','tantly','treme','tsuji','tural','yt','en','ings','factsheet','edney','budin','apps','app','enbel','alth','aunan','basu','user','search','photo','tures','mistry','letter','cdds','ly','national','plan','hhaps','use','initiate','various','public','health','heat','temperature','change','risk', 'increase','health','india','plan', 'brutally','management','part','highlight','world','united','nation','joint', 'cooperation','brief','issue','professor','tsdps','ons','qlw','lfb','qh','qc','humber','osdma','hse','hm','maha','ucs','lok','international','doya','gsdma','cdri','eswd','dluhc','esd','dfe','niosh','naic','desnz','odisha','defra','easac','bouhi','awhp','ndma','rachel','bureau','scientist','adequate','annual','address','amplify','billion','calculation,','activate','adaptation','additionally','acknowledge','afternoon','accordance','act','accord','data','high','ukhsa','osha','mrt','donthu','ensemble','kristina','webinar','qr','percent','interviewee','licker','eea','ccc','conv','january','iiphb','ferguson','ulbrich','trenberth','schär','rcms','nao','germi','veisz','spell','sres','srex','phenomenon','probability','ρ','qsw','moj','nhs','imd','october','academicians','academic','absorb','aapda','nrdc','report','online','across','action','expand','page','u','https','j','g','figure','thank','you','mbove','print','info','escalate','isbn','volume','dr','vol','phd','upon','al', 'e', 'fig', 'cl', 'hap', 'et', 'http', 'ew', 'climate','jun','simon', 'yang','go','copyright','ltd','license','creative','imply','dot', 'dashed','line', 'may', 'either','bi','de', 'la', 'paix', 'tel', 'box', 'fax','front', 'oli', 'scarff', 'getty', 'image', 'back', 'atlas','ness','net', 'san', 'felipe','also','informa', 'tion', 'per', 'cent','due','distri', 'bution','com','bin','thus','km', 'grid','would','christopher','u','aa','aap','aas','ab','abares','abatzoglou','abbs','abc','aberystwyth','doi'])
    # Adding custom stopwords directly to the set
    custom_stopwords = set(['ehst event','ehst','th','lll','reasonable','good','good good','na','na na','city','oecd','urban','red','paper','forum','figgs','enfield','literature','estimate','model','effect','impact','extreme','study','tantly','treme','tsuji','tural','yt','en','ings','factsheet','edney','budin','apps','app','enbel','alth','aunan','basu','user','search','photo','tures','mistry','letter','cdds','ly','national','plan','hhaps','use','initiate','various','public','health', 'heat', 'temperature', 'change', 'risk', 'increase','health','india','plan', 'brutally','management','part','highlight','world','united','nation','joint', 'cooperation','brief','issue','professor','tsdps','ons','qlw','lfb','qh','qc','humber','osdma','hse','hm','maha','ucs','lok','international','doya','gsdma','cdri','eswd','dluhc','esd','dfe','niosh','naic','desnz','odisha','defra','easac','bouhi','awhp','ndma','rachel','bureau','scientist','adequate','annual','address','amplify','billion','calculation,','activate','adaptation','additionally','acknowledge','afternoon','accordance','act','accord','data','high','ukhsa','osha','mrt','donthu','ensemble','kristina','webinar','qr','percent','interviewee','licker','eea','ccc','conv','january','iiphb','ferguson','ulbrich','trenberth','schär','rcms','nao','germi','veisz','spell','sres','srex','phenomenon','probability','ρ','qsw','moj','nhs','imd','october','academicians','academic','absorb','aapda','nrdc','report','online','across','action','expand','page','u','https','j','g','figure','thank','you','mbove','print','info','escalate','isbn','volume','dr','vol','phd','upon','al', 'e', 'fig', 'cl', 'hap', 'et', 'http', 'ew', 'climate','jun','simon', 'yang','go','copyright','ltd','license','creative','imply','dot', 'dashed','line', 'may', 'either','bi','de', 'la', 'paix', 'tel', 'box', 'fax','front', 'oli', 'scarff', 'getty', 'image', 'back', 'atlas','ness','net', 'san', 'felipe','also','informa', 'tion', 'per', 'cent','due','distri', 'bution','com','bin','thus','km', 'grid','would','christopher','u','aa','aap','aas','ab','abares','abatzoglou','abbs','abc','aberystwyth','doi'])
    stop_words.update(custom_stopwords)

    # Standardize the text and removing non-alphabetic or stop words
    words = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]

    # Defining lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Conducting POS tagging
    pos_tags = nltk.pos_tag(words)

    # Lemmatizing word-tokens via assigned POS tags
    lemma_tokens = [lemmatizer.lemmatize(token, wordnet_pos_tags(pos_tag)) for token, pos_tag in pos_tags]

    return lemma_tokens

*Iteration function*

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def iterate_pdf_files(pdf_folder):
    # Creating a dictionary to display all documents and tokens together
    docs = {}

    # Counting to track the number of processed texts
    text_count = 0

    # Iterating over each file in the folder
    for filename in os.listdir(pdf_folder):

        # Checking whether the files are PDFs
        if filename.endswith('.pdf'):

            # Creating a path to the PDF files
            pdf_path = os.path.join(pdf_folder, filename)

            # Extracting the content of files
            text = extract_text_from_pdf(pdf_path)

            # Applying the preprocessing pipeline to the extracted tex
            preprocessed_text = txt_preprocess_pipeline(text)

            # Generating a unique name for the document based on the length of the docs dictionary for ease in readibility
            doc_name = f'doc{len(docs) + 1}'

            # Assigning the preprocessed text content of the current document in the docs dictionary with the document name as the key
            docs[doc_name] = [preprocessed_text]

            # Increasing the text count when finished assigning
            text_count += 1

    return docs

pdf_folder = '/content/drive/MyDrive/Documents/socio'

# Calling the iterate_pdf_files function to get the dictionary of documents
docs = iterate_pdf_files(pdf_folder)

# Printing each document's tokens as a list
for doc_name, tokens in docs.items():
    print(f"{doc_name}: {tokens}")

*Token dictionary*

In [None]:
# Iterating over the values in the docs dictionary created in the previous step. Each value in the sublist represents the tokens in a certain document
all_preprocessed_text = [token for sublist in docs.values() for token in sublist]

# Creating a gensim dictionary to assign an integer ID to each token
dictionary = corpora.Dictionary(all_preprocessed_text)
print(dictionary)

*BoW representation*

In [None]:
# Creating a Bag-of-Words (BoW) representation of the preprocessed text data using the dictionary created in the previous step
bow_corpus = [dictionary.doc2bow(text) for text in all_preprocessed_text]
pprint.pprint(bow_corpus)

In [None]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(bow_corpus))

### TF-IDF

In [None]:
# Creating a TF-IDF model from the Bag-of-Words corpus
tfidf_model = TfidfModel(bow_corpus)

# Applying the TF-IDF model to the Bag-of-Words corpus to get the TF-IDF representation
tfidf_corpus = tfidf_model[bow_corpus]

# Printing the TF-IDF representation for the first few documents
for doc in tfidf_corpus[:5]:
    print(doc)

In [None]:
# Looking up the word corresponding to word_id = 0
word_id = 5
word = dictionary[word_id]
print("Word corresponding to word_id =", word_id, ":", word)

In [16]:
# Defining the TF-IDF threshold
threshold = 0.05

# Iterating through the TF-IDF representation of the documents
for doc in tfidf_corpus[:5]:  # Iterating over the first few documents

    # Filtering words with TF-IDF scores above the threshold
    words_above_threshold = [(dictionary[word_id], tfidf_score) for word_id, tfidf_score in doc if tfidf_score > threshold]

    # Printing the words with TF-IDF scores above the threshold
    print("Words with TF-IDF score > 0.05:", words_above_threshold)

Words with TF-IDF score > 0.05: [('bangko', 0.2731654882420059), ('enso', 0.06123114574364336), ('headline', 0.07355058525705206), ('impulse', 0.2890794382468332), ('inflation', 0.14824382653724183), ('inflationary', 0.14657660344693), ('luzon', 0.05330058307161091), ('macroeconomic', 0.15213533650405922), ('ng', 0.16778098016487464), ('output', 0.07358179674052537), ('parentheses', 0.05608650809697571), ('philippine', 0.17213086377326955), ('pilipinas', 0.2731654882420059), ('ppt', 0.35977893573337366), ('price', 0.05810813571762852), ('sentral', 0.2731654882420059), ('shock', 0.19126917416635486), ('transitory', 0.05608650809697571), ('xx', 0.14731988502281676), ('ℎ', 0.05330058307161091), ('𝐏𝐡𝐢𝐥𝐢𝐩𝐩𝐢𝐧𝐞𝐬', 0.073288301723465), ('𝐵𝐴𝑅𝑀𝑀', 0.073288301723465), ('𝐵𝑖𝑐𝑜𝑙', 0.073288301723465), ('𝐶𝐴𝐿𝐴𝐵𝐴𝑅𝑍𝑂𝑁', 0.073288301723465), ('𝐶𝐴𝑅', 0.073288301723465), ('𝐶𝐴𝑅𝐴𝐺𝐴', 0.073288301723465), ('𝐶𝑎𝑔𝑎𝑦𝑎𝑛', 0.073288301723465), ('𝐶𝑒𝑛𝑡𝑟𝑎𝑙', 0.14657660344693), ('𝐷𝑎𝑣𝑎𝑜', 0.073288301723465), ('𝐸𝑎𝑠𝑡', 0.07328

### N-grams

In [None]:
# Taking a list of tokens as input and generating n-grams
def generate_ngrams(tokens, n=3):
    return list(nltk_ngrams(tokens, n))

# Creating n-gram model from a corpus
def create_ngram_model(corpus, n=3):
    ngram_model = defaultdict(int)
    for text in corpus:
        tokens = word_tokenize(text)
        ngrams = generate_ngrams(tokens, n)
        for ngram in ngrams:
            ngram_model[ngram] += 1
    return ngram_model

max_texts_to_process = None
docs = iterate_pdf_files(pdf_folder)

# Concatenating preprocessed tokens for each document into a single list of tokens
corpus_tokens = [tokens for sublist in docs.values() for tokens in sublist]

# Concatenating tokens into a single string for each document
corpus_strings = [' '.join(tokens) for tokens in corpus_tokens]

# Creating an n-gram model
ngram_model = create_ngram_model(corpus_strings, n=3)

# Printing all n-grams
print("All N-grams:")
for ngram, frequency in ngram_model.items():
    print(ngram, ":", frequency)

In [18]:
# Filtering n-grams with frequency greater than 1
filtered_ngram_model = {ngram: frequency for ngram, frequency in ngram_model.items() if frequency > 1}

# Printing filtered n-grams
print("Filtered N-grams:")
for ngram, frequency in filtered_ngram_model.items():
    print(ngram, ":", frequency)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
('together', 'beat', 'drought') : 3
('beat', 'drought', 'trap') : 3
('drought', 'trap', 'munich') : 3
('trap', 'munich', 'foundation') : 3
('munich', 'foundation', 'munich') : 3
('globalagrisk', 'skees', 'jerry') : 3
('skees', 'jerry', 'challenge') : 3
('jerry', 'challenge', 'rural') : 3
('challenge', 'rural', 'financial') : 3
('rural', 'financial', 'market') : 9
('financial', 'market', 'blend') : 3
('market', 'blend', 'innovation') : 3
('blend', 'innovation', 'rural') : 3
('innovation', 'rural', 'finance') : 3
('rural', 'finance', 'washington') : 3
('country', 'time', 'period') : 18
('development', 'bank', 'aria') : 6
('bank', 'aria', 'diego') : 6
('aria', 'diego', 'covarrubias') : 6
('diego', 'covarrubias', 'katia') : 6
('covarrubias', 'katia', 'agricultural') : 6
('katia', 'agricultural', 'insurance') : 6
('agricultural', 'insurance', 'mesoamerica') : 6
('insurance', 'mesoamerica', 'opportunity') : 6
('mesoamerica', 'o

### BERT topic

In [27]:
def preprocess_texts(docs):
    preprocessed_docs = [' '.join(tokens) for tokens in (doc[0] for doc in docs.values())]
    return preprocessed_docs
preprocessed_texts = preprocess_texts(docs)

In [None]:
# Initializing the SentenceTransformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encoding the preprocessed texts into embeddings
embeddings = model.encode(preprocessed_texts, show_progress_bar=True)

In [29]:
# Initializing BERTopic with custom parameters
topic_model = BERTopic(min_topic_size=2,
                       top_n_words=6,
                       n_gram_range=(1, 2),
                       calculate_probabilities=True)

# Fitting BERTopic on the preprocessed texts and embeddings
topics, probs = topic_model.fit_transform(preprocessed_texts,embeddings)

In [30]:
# Checking the topics identified
topic_info = topic_model.get_topic_info()
print(topic_info)

    Topic  Count                                      Name  \
0      -1     13  -1_exposure_population_vulnerability_air   
1       0     31       0_disaster_country_development_loss   
2       1     15         1_worker_work_stress_occupational   
3       2      6             2_ination_shock_region_future   
4       3      5        3_emission_energy_trade_technology   
5       4      5               4_unit_insurance_tdcj_texas   
6       5      5         5_drought_vegetation_ozone_global   
7       6      5  6_migration_tourism_australia_australian   
8       7      4            7_stratum_day_mean_electricity   
9       8      4         8_event_return_firm_excess return   
10      9      3    9_location_threshold_fan_neighbourhood   
11     10      3               10_damage_loss_drought_cost   
12     11      3             11_cool_roof_cool roof_access   
13     12      2               12_mortality_city_tract_uhi   

                                       Representation  \
0   [exposur

In [67]:
topic_model.get_topic(12)

[('mortality', 0.03391142049449049),
 ('city', 0.026461964488021333),
 ('tract', 0.02524872411513278),
 ('uhi', 0.02442543094815526),
 ('lst', 0.018210789397039358),
 ('age', 0.017465661306799912)]

In [31]:
# Getting the topic words
topic_words = topic_model.get_topics()

# Preprocessing the texts
preprocessed_texts = preprocess_texts(docs)

# Converting preprocessed texts to list of words for coherence model
texts = [doc.split() for doc in preprocessed_texts]

# Creating a dictionary and corpus for the coherence model
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

coherence_scores = []
for topic_id, topic in topic_words.items():
    # Preparing the data for coherence model
    topics = [[word for word, _ in topic]]

    # Computing Coherence Score using Gensim's CoherenceModel for each topic
    coherence_model = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()

    coherence_scores.append((topic_id, coherence_score))

# Printing coherence scores for each topic
for topic_id, coherence_score in coherence_scores:
    print(f'Topic {topic_id}: Coherence Score: {coherence_score}')

Topic -1: Coherence Score: 0.5865906945517748
Topic 0: Coherence Score: 0.49169192606711837
Topic 1: Coherence Score: 0.8309315789563628
Topic 2: Coherence Score: 0.35907251148303115
Topic 3: Coherence Score: 0.6224394017065169
Topic 4: Coherence Score: 0.268453167273575
Topic 5: Coherence Score: 0.40617713979120246
Topic 6: Coherence Score: 0.5758808812855526
Topic 7: Coherence Score: 0.6061261761116337
Topic 8: Coherence Score: 0.6528843252746699
Topic 9: Coherence Score: 0.5942455799010785
Topic 10: Coherence Score: 0.6150541191028925
Topic 11: Coherence Score: 0.6687805530825827
Topic 12: Coherence Score: 0.697434974884389
