In [1]:
# IMF MCMCO Test Question 4
# Natural Language Processing in Python: Unsupervised Machine Learning

In [2]:
import os
import zipfile
import PyPDF2  
import glob
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import torch
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import pipeline

2024-10-27 19:14:42.072104: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Unpack the Communication.zip archive into a local working directory
zip_file_path = "Communication.zip"
extracted_folder = "Communication_extracted"

# The context manager closes the archive handle once extraction finishes
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=extracted_folder)


In [4]:
folder_path = "Communication_extracted/Communication"

# Raw text of every communication: all TXT files first, then all PDF files.
documents = []

# Read TXT files.
# glob returns paths in arbitrary (filesystem-dependent) order, so sort them to keep
# document indices reproducible across runs and machines.
for txt_file in sorted(glob.glob(os.path.join(folder_path, "*.txt"))):
    with open(txt_file, 'r', encoding='utf-8') as file:
        documents.append(file.read())

# Read PDF files
for pdf_file in sorted(glob.glob(os.path.join(folder_path, "*.pdf"))):
    with open(pdf_file, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Join pages with a newline so the last word of one page is not fused with the
        # first word of the next (which would create spurious vocabulary entries).
        # `or ''` is defensive: presumably extract_text() always returns a str in
        # current PyPDF2 releases, but a page without a text layer must not break
        # the join -- TODO confirm against the installed version.
        pdf_text = '\n'.join(page.extract_text() or '' for page in reader.pages)
    documents.append(pdf_text)

# Fail early with a clear message instead of an opaque error in a later cell
if not documents:
    raise FileNotFoundError(f"No .txt or .pdf files found in {folder_path!r}")

# Preview the text from all PDF and TXT files
for i, doc in enumerate(documents):
    print(f"Document {i+1}:")
    print(doc[:1000])  # Print the first 1000 characters of each document
    print("\n" + "="*80 + "\n")

Document 1:
The Swedish economy has developed strongly so far this year. World trade has increased rapidly, which has contributed to an acceleration in exports and investment. There has also been a substantial improvement in the labour market. Continuing modest activity in the eurozone and a more protracted recovery in the United States are expected to dampen developments in Sweden somewhat in the long run.   Deputy Governor Lars E.O. Svensson entered a reservation against the decision to raise the repo rate by 0.25 percentage points to 0.75 per cent and against the repo-rate path in the Monetary Policy Update. Deputy Governor Karolina Ekholm entered a reservation against the repo-rate path.    on 1 September30/09/2011 


Document 2:
21 January 2021 The Governing Council decided to reconfirm its very accommodative monetary policy stance. First, the interest rate on the main refinancing operations and the interest rates on the marginal lending facility and the deposit facility will rema

In [5]:
# Bag-of-words representation: learn the vocabulary (English stop words removed)
# and build the document-term count matrix in a single pass
vectorizer = CountVectorizer(stop_words='english')
doc_term_matrix = vectorizer.fit_transform(raw_documents=documents)

In [6]:
# Train the LDA topic model on the document-term counts.
# random_state is fixed so the fitted topics are reproducible.
# NOTE(review): the recorded topic listing contains several identical topics,
# which may mean 15 components is too many for this corpus -- confirm.
lda_settings = {"n_components": 15, "random_state": 42}
lda = LatentDirichletAllocation(**lda_settings)
lda.fit(X=doc_term_matrix)

In [7]:
# Get the topics
TOP_N_WORDS = 5  # number of keywords reported per topic

# Build the vocabulary array once; rebuilding it for every topic is wasted work
feature_names = vectorizer.get_feature_names_out()

# Map topic index -> its TOP_N_WORDS highest-weighted terms.
# argsort() is ascending, so each list runs from the least to the most heavily
# weighted of the selected terms (the strongest keyword is the LAST entry).
topic_keywords = {
    idx: [feature_names[i] for i in topic.argsort()[-TOP_N_WORDS:]]
    for idx, topic in enumerate(lda.components_)
}

# Display the topics (1-based numbering for readability)
for topic_num, keywords in topic_keywords.items():
    print(f"Topic {topic_num + 1}: {keywords}")

Topic 1: ['cent', 'rate', 'monetary', 'inflation', 'policy']
Topic 2: ['bey', 'legislative', 'thoug', 'expiration', 'siness']
Topic 3: ['deputy', 'entered', 'path', 'rate', 'repo']
Topic 4: ['2020', 'governing', 'policy', 'inflation', 'percent']
Topic 5: ['bey', 'legislative', 'thoug', 'expiration', 'siness']
Topic 6: ['economic', 'japan', 'policy', 'mr', 'bank']
Topic 7: ['fourth', 'end', '10', 'february30', 'cutting']
Topic 8: ['support', 'economy', 'development', 'policy', 'financial']
Topic 9: ['bank', 'figure', 'percent', 'zealand', 'new']
Topic 10: ['trillion', 'month', 'operation', 'yen', 'approximately']
Topic 11: ['forecast', 'rate', '2023', 'growth', 'inflation']
Topic 12: ['policy', 'committee', 'bank', 'inflation', 'growth']
Topic 13: ['2018', 'inflation', 'committee', 'bank', 'growth']
Topic 14: ['march', 'inflation', 'preliminary', 'weather', 'q1']
Topic 15: ['bey', 'legislative', 'thoug', 'expiration', 'siness']


In [8]:
# FinBERT for sentiment computation

In [9]:
# Load the FinBERT tone checkpoint: the tokenizer and the TensorFlow
# sequence-classification model are both fetched under the same model id
model_name = 'yiyanghkust/finbert-tone'
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model = TFBertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_name
)


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at yiyanghkust/finbert-tone.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [10]:
# Wrap the loaded model and tokenizer in a ready-to-call sentiment-analysis pipeline
finbert_sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
)


In [None]:
# Compute sentiment for each document.
#
# BERT accepts at most 512 tokens per input, so long documents are split into
# consecutive token windows, each window is scored separately, and the window
# results are aggregated into a single label and score per document.
MAX_MODEL_TOKENS = 512                 # model input limit, including [CLS] and [SEP]
CHUNK_TOKENS = MAX_MODEL_TOKENS - 2    # content tokens that fit alongside [CLS]/[SEP]

combined_sentiments = []

for doc in documents:
    # Tokenize the WHOLE document without truncation and without special tokens.
    # (Truncating here and then iterating over the batch axis would score only the
    # first 512 tokens of every document.)
    token_ids = tokenizer(doc, add_special_tokens=False, truncation=False)['input_ids']

    # Split into windows of at most CHUNK_TOKENS tokens and turn each back into text
    chunk_texts = [
        tokenizer.decode(token_ids[start:start + CHUNK_TOKENS], skip_special_tokens=True)
        for start in range(0, len(token_ids), CHUNK_TOKENS)
    ]

    # Empty document (e.g. a PDF without extractable text): nothing to score
    if not chunk_texts:
        combined_sentiments.append({
            'document': doc,
            'label': None,
            'score': float('nan')
        })
        continue

    # Score every window. truncation/max_length are a safety net: re-tokenizing decoded
    # text can yield slightly more tokens than the window it was decoded from.
    chunk_results = finbert_sentiment_pipeline(
        chunk_texts, truncation=True, max_length=MAX_MODEL_TOKENS
    )

    # Group the window confidences by predicted label
    scores_by_label = {}
    for result in chunk_results:
        scores_by_label.setdefault(result['label'], []).append(result['score'])

    # Determine the overall sentiment label (majority voting across windows);
    # ties are broken in favour of the label with the larger total confidence
    overall_label = max(
        scores_by_label,
        key=lambda label: (len(scores_by_label[label]), sum(scores_by_label[label]))
    )

    # Average only the confidences of windows that voted for the overall label, so the
    # reported score actually refers to the reported label
    winning_scores = scores_by_label[overall_label]
    average_score = sum(winning_scores) / len(winning_scores)

    combined_sentiments.append({
        'document': doc,
        'label': overall_label,
        'score': average_score
    })



In [None]:
# Results display: one row per document with its text, overall label and score
sentiment_df = pd.DataFrame(data=combined_sentiments)
print(sentiment_df)