In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import os
import re
import pickle
import time
from tqdm.notebook import tqdm
from textblob import TextBlob

from sklearn.preprocessing import MinMaxScaler

# Load local preprocessing modules
from module_text_blocks import split_text_into_blocks, clean_text_blocks
from module_process_pdf import process_pdf

# Load local model modules
from module_sentiment_textblob import calculate_textblob_polarity
from module_sentiment_google import calculate_google_polarity
from module_sentiment_amazon import calculate_amazon_polarity
from module_sentiment_openai import calculate_openai_polarity

# TODO: add two more model modules later (BERT and RoBERTa)


In [3]:
# Load pdf text and headings from the pickle file
# pdf_texts = pickle.load(open("Src/pdf_texts.pkl", "rb"))                        # Texts are extracted from the pdf
# pdf_headings = pickle.load(open("Src/pdf_headings.pkl", "rb"))                  # Headings are extracted from the pdf text
# pdf_headings_context = pickle.load(open("Src/pdf_headings_context.pkl", "rb"))  # Surrounding text of headings helps to identify headings correctly and avoid duplicates

In [2]:
# PDF files to process (one path per list entry)
file_paths = [
    "ShareholderLetters/FINAL-Q2-23-Shareholder-Letter.pdf",
]

In [3]:
# Run the PDF pipeline over the selected files. The three results are the
# extracted texts, the headings, and the heading contexts; later cells look
# each of them up per pdf file.
(
    pdf_texts,
    pdf_headings,
    pdf_headings_context,
) = process_pdf(file_paths)

In [4]:
# Normalise model scores
def normalise_score(original_score, old_min, old_max, new_min, new_max):
    """Linearly rescale a score from one interval onto another.

    The relative position of ``original_score`` within ``[old_min, old_max]``
    is kept and mapped onto ``[new_min, new_max]``. Scores outside the old
    interval are extrapolated, not clipped.

    Args:
        original_score: Score to rescale.
        old_min: Lower bound of the interval the score is currently expressed in.
        old_max: Upper bound of the interval the score is currently expressed in.
        new_min: Lower bound of the target interval.
        new_max: Upper bound of the target interval.

    Returns:
        The score expressed on the target interval.

    Raises:
        ZeroDivisionError: If ``old_min`` equals ``old_max``; a zero-width
            source interval has no defined relative position.
    """
    old_range = old_max - old_min
    # Fail loudly on a degenerate source interval. Without this check numpy
    # scalars would silently produce inf/nan instead of raising.
    if old_range == 0:
        raise ZeroDivisionError(
            f"Cannot normalise: old_min and old_max must differ (both are {old_min})"
        )
    new_range = new_max - new_min
    relative_position = (original_score - old_min) / old_range
    return new_min + (relative_position * new_range)

In [5]:
# Models with their corresponding local functions to calculate the sentiment score.
# Every function is called as model_function(text_blocks, headings, text).
models = {
    "TextBlob": calculate_textblob_polarity,
    "Google": calculate_google_polarity,
    "Amazon": calculate_amazon_polarity,
    "OpenAI": calculate_openai_polarity
}

# Min and Max values for each model (needed for normalisation).
# A model without an entry here (currently only OpenAI) is reported un-normalised.
models_min_max = {
    "TextBlob": [0.07, 0.186],
    "Google": [0.0, 0.30],
    "Amazon": [0.689, 0.9058]
}

# Manual min and max values as benchmark for normalisation
manual_min_value = -0.15
manual_max_value = 0.66

# Iterate over pdf files
for pdf_file in pdf_texts:

    # Preprocessing is shared by all models, so a failure here skips the whole file
    try:
        text = pdf_texts[pdf_file]
        headings = pdf_headings[pdf_file]
        headings_context = pdf_headings_context[pdf_file]

        # Split text into blocks
        text_blocks = split_text_into_blocks(text, headings, headings_context)

        # Clean text blocks
        text_blocks = clean_text_blocks(text_blocks)

    except Exception as e:
        print(f"Error while preparing {pdf_file}: {e}")
        continue

    # Iterate over models
    for model_name, model_function in models.items():

        # Each model is guarded on its own so that one failing model
        # does not prevent the remaining models from being scored
        try:
            # perf_counter is monotonic, so the measured duration cannot be
            # distorted by system clock adjustments
            start_time = time.perf_counter()

            # Calculate sentiment score
            sentiment_score = model_function(text_blocks, headings, text)

            # Normalise sentiment score only for models with known min/max values
            if model_name in models_min_max:
                old_min, old_max = models_min_max[model_name]
                sentiment_score = normalise_score(
                    sentiment_score, old_min, old_max, manual_min_value, manual_max_value
                )

            end_time = time.perf_counter()

        except Exception as e:
            # Name the file and the model so the failure can be traced
            print(f"Error: {pdf_file} {model_name}: {e}")
            continue

        # print sentiment score
        print(f"{pdf_file} {model_name} sentiment score: {sentiment_score}")
        # print processing time
        print(f"Time taken for {model_name} to process text: ", end_time - start_time)

FINAL-Q2-23-Shareholder-Letter TextBlob sentiment score: 0.6603495902734645
Time taken for TextBlob to process text:  0.11017346382141113
FINAL-Q2-23-Shareholder-Letter Google sentiment score: 0.39000000804662704
Time taken for Google to process text:  3.7825024127960205
FINAL-Q2-23-Shareholder-Letter Amazon sentiment score: 0.5530620518205789
Time taken for Amazon to process text:  20.243367910385132
FINAL-Q2-23-Shareholder-Letter OpenAI sentiment score: 0.25
Time taken for OpenAI to process text:  15.575986385345459
