In [29]:
# !pip install langchain-community langchain-core
# !pip install pymupdf
# %pip install -qU pypdf
# %pip install PyMuPDF pdfplumber pandas spacy regex
# %pip install PyPDF2 pandas os
# !pip install pdfplumber
# !!python -m spacy download en_core_web_lg 

## Tools and Libraries Neccessary

In [30]:
# import relevant libaries
import os
import spacy
import gensim
from string import punctuation
from pypdf import PdfReader
from gensim import corpora
from nltk.tokenize import word_tokenize
from pdfminer.high_level import extract_text
from langchain_community.document_loaders import PyMuPDFLoader

* **PyPDF2:** This library helps to read the text content from PDF files.
* **Pandas:** Used to create a DataFrame to store your extracted data.
* **os:** Used to interact with the file system and list files.


### Data

In [2]:
# Specify the path to your folder containing PDF files
pdf_folder_path = '../Data/Attribute_Papers/'

# List all the PDF files in the folder
pdf_files = [f for f in os.listdir(pdf_folder_path) if f.endswith('.pdf')]
f"Total journas are {len(pdf_files)}"

'Total journas are 78'

In [3]:
# List the journals
pdf_files[:5]

['International Organizations and Implementation. Enforcers_Managers_ Authorities.pdf',
 'Critical Choices_The United Nations_Networks and the Future of Global Governance.pdf',
 'Organizational Progeny. Why Governments are Losing Control over the Proliferating Structures of Global Governance.pdf',
 'The Politics of International Environmental Management.pdf',
 'New Alliances in Global Environmental Governance_ Hw intergovernmental treaty secretariat interact.pdf']

In [5]:
# Extract text from PDF (using pdfminer)
pdf_text = extract_text('../Data/Attribute_Papers/“Privatisation_’ in the United Nations system_.pdf')

# Tokenize text
tokens = word_tokenize(pdf_text.lower())

# Create a dictionary and corpus for LDA model
dictionary = corpora.Dictionary([tokens])
corpus = [dictionary.doc2bow(text) for text in [tokens]]

# Apply LDA model to find topics
lda_model = gensim.models.LdaMulticore(corpus, num_topics=3, id2word=dictionary, passes=2)

# Print the themes/topics
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)


(0, '0.043*"the" + 0.041*"," + 0.025*"of" + 0.017*"." + 0.014*"and"')
(1, '0.036*"," + 0.028*"the" + 0.018*"of" + 0.015*"." + 0.011*"and"')
(2, '0.059*"," + 0.059*"the" + 0.030*"of" + 0.029*"." + 0.024*"and"')


In [16]:
import os
import pandas as pd
from pdfminer.high_level import extract_text
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import download
from gensim import corpora
import gensim
import string

In [21]:
# Download necessary NLTK resources
download('stopwords')

# Define the path to your directory containing the PDF files
pdf_dir = '../Data/Attribute_Papers/'

# Define stopwords list
stop_words = set(stopwords.words('english'))

# Function for preprocessing the text
def preprocess_text(text):
    # Tokenize the text and convert it to lowercase
    tokens = word_tokenize(text.lower())
    
    # Remove stopwords and punctuation
    tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]
    
    # Optional: Lemmatization can be added here (e.g., using nltk.stem or spacy)
    
    return tokens

# Initialize an empty list to store the results
results = []

# Step 1: Process each PDF file individually
for filename in os.listdir(pdf_dir):
    if filename.endswith(".pdf"):  # Process only PDF files
        pdf_path = os.path.join(pdf_dir, filename)
        
        # Extract text from the PDF file
        pdf_text = extract_text(pdf_path)
        
        # Step 2: Preprocess the text (tokenization, stopword removal, punctuation removal)
        tokens = preprocess_text(pdf_text)
        
        # Step 3: Create a dictionary and corpus for the LDA model
        dictionary = corpora.Dictionary([tokens])  # Create a dictionary for the tokens in this paper
        corpus = [dictionary.doc2bow(tokens)]  # Create the bag-of-words for this paper
        
        # Step 4: Apply LDA model to extract themes (topics)
        lda_model = gensim.models.LdaMulticore(corpus, num_topics=10, id2word=dictionary, passes=2)
        
        # Step 5: Extract and format the topics for the current paper
        topics = lda_model.print_topics(num_words=5)
        themes = [f"Topic {i+1}: {topic[1]}" for i, topic in enumerate(topics)]
        
        # Step 6: Store the filename and its associated topics in the results list
        results.append({"File": filename, "Themes": "; ".join(themes)})

# Step 7: Create a DataFrame from the results list
df = pd.DataFrame(results)


[nltk_data] Downloading package stopwords to /home/milo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
The PDF <_io.BufferedReader name='../Data/Attribute_Papers/The complexification of the United Nations system.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


In [55]:
# Step 8: Display the DataFrame (or save it to a file if needed)
# df['Themes'][0]


# Create a DataFrame from the results list
# df = pd.DataFrame(results)

# Save the DataFrame to an Excel file
df.to_excel('themes_analysis.xlsx', index=False, engine='openpyxl')

# Confirmation message
print("DataFrame has been saved to 'themes_analysis1.xlsx'.")

DataFrame has been saved to 'themes_analysis1.xlsx'.


## Key Phrases

In [41]:
# !python -m spacy download en_core_web_lg

In [26]:
import os
import re
import pandas as pd
from pdfminer.high_level import extract_text
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import download
from gensim import corpora
import gensim
import string
from gensim.models import Phrases

In [43]:
# Download necessary NLTK resources
download('stopwords')

# Load the spaCy model for Named Entity Recognition (NER)
nlp = spacy.load("en_core_web_lg")

# Increase the text length limit (example: set it to 2 million characters)
nlp.max_length = 2000000 

# Define stopwords list
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/milo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
# Function to preprocess the text
def preprocess_text(text):
    # Tokenize the text and convert it to lowercase
    tokens = word_tokenize(text.lower())
    # Remove stopwords and punctuation
    tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]
    return tokens

# Function to extract named entities from text
def extract_entities(text):
    doc = nlp(text)
    entities = [ent.text for ent in doc.ents]
    return entities

# Function to create bigrams and trigrams
def create_bigrams_trigrams(tokens):
    bigram_model = Phrases([tokens], min_count=5, threshold=100)
    trigram_model = Phrases(bigram_model[[tokens]], threshold=100)
    bigrams = bigram_model[tokens]
    trigrams = trigram_model[bigrams]
    return bigrams, trigrams

# Initialize a list to store results
results = []

# Path to your PDF directory
pdf_dir = '../Data/Attribute_Papers/'

# Process each PDF file
for filename in os.listdir(pdf_dir):
    if filename.endswith(".pdf"):  # Process only PDF files
        pdf_path = os.path.join(pdf_dir, filename)
        
        # Extract text from the PDF file
        pdf_text = extract_text(pdf_path)
        
        # Extract named entities from the text
        entities = extract_entities(pdf_text)
        
        # Remove named entities from the text
        cleaned_text = ' '.join([word for word in pdf_text.split() if word not in entities])
        
        # Preprocess the cleaned text
        tokens = preprocess_text(cleaned_text)
        
        # Create bigrams and trigrams
        bigrams, trigrams = create_bigrams_trigrams(tokens)
        
        # Create a dictionary and corpus for the LDA model
        dictionary = corpora.Dictionary([bigrams])  # Use bigrams
        corpus = [dictionary.doc2bow(bigrams)]  # Create bag-of-words for this paper
        
        # Apply LDA model to extract themes
        lda_model = gensim.models.LdaMulticore(corpus, num_topics=10, id2word=dictionary, passes=5)
        
        # Extract and format the topics for the paper
        topics = lda_model.print_topics(num_words=5)
        themes = [f"Topic {i+1}: {topic[1]}" for i, topic in enumerate(topics)]
        
        # Store the filename, themes, and named entities in the results
        results.append({"File": filename, "Themes": "; ".join(themes), "Entities": ", ".join(set(entities))})

# Create a DataFrame from the results
phrases_df = pd.DataFrame(results)

The PDF <_io.BufferedReader name='../Data/Attribute_Papers/The complexification of the United Nations system.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


In [54]:
phrases_df

0     Topic 1: 0.005*"ios" + 0.003*"implementation" ...
1     Topic 1: 0.016*"networks" + 0.010*"global" + 0...
2     Topic 1: 0.002*"international" + 0.002*"e" + 0...
3     Topic 1: 0.003*"international" + 0.003*"enviro...
4     Topic 1: 0.012*"secretariat" + 0.011*"actors" ...
                            ...                        
73    Topic 1: 0.006*"environmental" + 0.005*"govern...
74    Topic 1: 0.015*"international" + 0.012*"organi...
75    Topic 1: 0.017*"environmental" + 0.008*"``" + ...
76    Topic 1: 0.009*"governance" + 0.009*"ios" + 0....
77    Topic 1: 0.004*"e" + 0.003*"secretariats" + 0....
Name: Themes, Length: 78, dtype: object

In [57]:
# Save phrases_df to a different CSV file
phrases_df.to_csv('phrases_analysis.csv', index=False)


In [49]:
import gensim
from gensim import corpora
from gensim.models import TfidfModel
from gensim.models.phrases import Phrases
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import spacy
import string

# Download necessary NLTK resources
nltk.download('stopwords')

# Load the spaCy model for Named Entity Recognition (NER)
nlp = spacy.load("en_core_web_lg")

# Define stopwords list
stop_words = set(stopwords.words('english'))

# Function for text preprocessing (tokenization, removing stopwords, punctuation)
def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase
    tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]
    return tokens

# Function to create bigrams/trigrams
def create_bigrams_trigrams(tokens):
    bigram_model = Phrases([tokens], min_count=5, threshold=100)  # Create bigram model
    trigram_model = Phrases(bigram_model[[tokens]], min_count=5, threshold=100)  # Create trigram model
    bigrams = bigram_model[tokens]  # Apply bigrams to the text
    trigrams = trigram_model[bigrams]  # Apply trigrams to the text
    return trigrams

# Function to process the PDF documents
def process_pdfs(pdf_dir):
    results = []
    
    for filename in os.listdir(pdf_dir):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_dir, filename)
            
            # Step 1: Extract text from PDF
            pdf_text = extract_text(pdf_path)
            
            # Step 2: Preprocess the text (tokenization, stopword removal)
            tokens = preprocess_text(pdf_text)
            
            # Step 3: Create bigrams and trigrams from the tokens
            trigrams = create_bigrams_trigrams(tokens)
            
            # Step 4: Create a dictionary and corpus for Lfrom nltk import ngramsDA
            dictionary = corpora.Dictionary([trigrams])
            corpus = [dictionary.doc2bow(trigrams)]
            
            # Step 5: Apply TF-IDF model to adjust word weights
            tfidf_model = TfidfModel(corpus)
            corpus_tfidf = tfidf_model[corpus]
            
            # Step 6: Apply LDA model to extract topics
            lda_model = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2)
            
            # Step 7: Extract themes (topics) from the model
            topics = lda_model.print_topics(num_words=5)
            themes = [f"Topic {i+1}: {topic[1]}" for i, topic in enumerate(topics)]
            
            # Step 8: Store the results (filename and extracted themes)
            results.append({"File": filename, "Themes": "; ".join(themes)})
    
    # Step 9: Convert the results into a DataFrame
    df = pd.DataFrame(results)
    return df

# Define the directory containing the PDFs
pdf_dir = '../Data/Attribute_Papers/'

# Process the PDFs and get the results as a DataFrame
df2 = process_pdfs(pdf_dir)

[nltk_data] Downloading package stopwords to /home/milo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)
The PDF <_io.BufferedReader name='../Data/Attribute_Papers/The complexification of the United Nations system.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


In [52]:
df2["Themes"][0]

'Topic 1: 0.001*"penalize" + 0.001*"per-" + 0.001*"pean" + 0.001*"penal-" + 0.001*"pattern"; Topic 2: 0.001*"penalize" + 0.001*"per-" + 0.001*"pean" + 0.001*"penal-" + 0.001*"pattern"; Topic 3: 0.001*"penalize" + 0.001*"per-" + 0.001*"pean" + 0.001*"penal-" + 0.001*"pattern"; Topic 4: 0.001*"penalize" + 0.001*"per-" + 0.001*"pean" + 0.001*"penal-" + 0.001*"pattern"; Topic 5: 0.001*"penalize" + 0.001*"per-" + 0.001*"pean" + 0.001*"penal-" + 0.001*"pattern"; Topic 6: 0.001*"penalize" + 0.001*"per-" + 0.001*"pean" + 0.001*"penal-" + 0.001*"pattern"; Topic 7: 0.001*"penalize" + 0.001*"per-" + 0.001*"pean" + 0.001*"penal-" + 0.001*"pattern"; Topic 8: 0.001*"penalize" + 0.001*"per-" + 0.001*"pean" + 0.001*"penal-" + 0.001*"pattern"; Topic 9: 0.001*"penalize" + 0.001*"per-" + 0.001*"pean" + 0.001*"penal-" + 0.001*"pattern"; Topic 10: 0.001*"penalize" + 0.001*"per-" + 0.001*"pean" + 0.001*"penal-" + 0.001*"pattern"'