### Load all necessary libraries and local modules

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import os
import re
import sys
import pickle
import time

# Load local preprocesing modules
sys.path.append("Pipeline/")
from module_text_blocks import split_text_into_blocks, clean_text_blocks
from module_process_pdf import process_pdf
from module_scrape_pdf import scrape_pdf
from module_functions import normalise_score

# Load local model modules
sys.path.append("Models/")
from module_sentiment_textblob import calculate_textblob_polarity
from module_sentiment_google import calculate_google_polarity
from module_sentiment_amazon import calculate_amazon_polarity
from module_sentiment_openai import calculate_openai_polarity
from module_sentiment_bert import calculate_bert_polarity
from module_sentiment_roberta import calculate_roberta_polarity

### Execute Full Pipeline (TextBlob Model Only) - Approximately 15-20 Seconds Running Time

In [2]:
main_start_time = time.time()

# Load historic models min and max values for normalisation
historic_df = pd.read_csv("Src/historic_min_max.csv")

# Models with corresponding local functions to calculate sentiment score
models = {
    "TextBlob": calculate_textblob_polarity,
    "Amazon": calculate_amazon_polarity,
    "Google": calculate_google_polarity,
    "OpenAI": calculate_openai_polarity,
    "BERT": calculate_bert_polarity,
    "RoBERTa": calculate_roberta_polarity,
}

# Models with True or False boolean to indicate whether to run the model or not
models_bool = {
    "TextBlob": True,
    "Amazon": False,
    "Google": False,
    "OpenAI": False,
    "BERT": False,
    "RoBERTa": False,
}

# scrape PDF from Netflix website
Netflix_scraping_page = "https://ir.netflix.net/financials/quarterly-earnings/default.aspx"
# change to scrape_all=False to only scrape the latest pdf, otherwise all pdfs will be scraped
download_folder = "Pipeline" # specify folder to download pdfs to, default is Pipeline
scrape_pdf(Netflix_scraping_page, scrape_all=False, download_folder=download_folder)

# get all downloaded files from the dedicated folder
pipeline_files = [download_folder + '/' + file for file in os.listdir(download_folder) if file.endswith('.pdf')]

# Transform pdf files into texts and headings and store them as dictionaries
pdf_texts, pdf_headings, pdf_headings_context = process_pdf(pipeline_files)

# Iterate over pdf texts
for pdf_file in pdf_texts:

    try:

        # Get text, headings and headings context
        text = pdf_texts[pdf_file]
        headings = pdf_headings[pdf_file]
        headings_context = pdf_headings_context[pdf_file]

        # Split text into blocks
        text_blocks = split_text_into_blocks(text, headings, headings_context)

        # Clean text blocks
        text_blocks = clean_text_blocks(text_blocks)

        # Iterate over models
        for model_name, model_function in models.items():
            
            # skip model if bool is False
            if models_bool[model_name] == False:
                continue
            
            # set start time
            start_time = time.time()

            # Calculate sentiment score
            sentiment_score = model_function(text_blocks, headings, text)
            
            # Normalise sentiment score (except for OpenAI) 
            if model_name != "OpenAI": 
                sentiment_score = normalise_score(sentiment_score, historic_df[historic_df["Name"] == model_name]["Min"].values[0], historic_df[historic_df["Name"] == model_name]["Max"].values[0], historic_df.iloc[0, 1], historic_df.iloc[0, 2])
        
            # set end time
            end_time = time.time()
            # print sentiment score
            print(f"{pdf_file} {model_name} sentiment score: {sentiment_score}")
            # print processing time
            print(f"Time taken for {model_name} model to calculate sentiment: ", end_time - start_time)#

    except Exception as e:

        print(f"Error: {e}")
        continue

main_end_time = time.time()
print(" ")
print(" ")
print(f"Total time taken for full pipeline to run: ", main_end_time - main_start_time)

FINAL-Q2-23-Shareholder-Letter TextBlob sentiment score: 0.6603495902734645
Time taken for TextBlob model to calculate sentiment:  0.03366374969482422
 
 
Total time taken for full pipeline to run:  18.89583158493042


### Execute Full Pipeline (Google and OpenAI Models Only) - Approximately 25-35 Seconds Running Time

In [2]:
main_start_time = time.time()

# Load historic models min and max values for normalisation
historic_df = pd.read_csv("Src/historic_min_max.csv")

# Models with corresponding local functions to calculate sentiment score
models = {
    "TextBlob": calculate_textblob_polarity,
    "Amazon": calculate_amazon_polarity,
    "Google": calculate_google_polarity,
    "OpenAI": calculate_openai_polarity,
    "BERT": calculate_bert_polarity,
    "RoBERTa": calculate_roberta_polarity,
}

# Models with True or False boolean to indicate whether to run the model or not
models_bool = {
    "TextBlob": False,
    "Amazon": False,
    "Google": True,
    "OpenAI": True,
    "BERT": False,
    "RoBERTa": False,
}


# scrape PDF from Netflix website
Netflix_scraping_page = "https://ir.netflix.net/financials/quarterly-earnings/default.aspx"
# change to scrape_all=False to only scrape the latest pdf, otherwise all pdfs will be scraped
download_folder = "Pipeline" # specify folder to download pdfs to, default is Pipeline
scrape_pdf(Netflix_scraping_page, scrape_all=False, download_folder=download_folder)

# get all downloaded files from the dedicated folder
pipeline_files = [download_folder + '/' + file for file in os.listdir(download_folder) if file.endswith('.pdf')]

# Transform pdf files into texts and headings and store them as dictionaries
pdf_texts, pdf_headings, pdf_headings_context = process_pdf(pipeline_files)

# Iterate over pdf texts
for pdf_file in pdf_texts:

    try:

        # Get text, headings and headings context
        text = pdf_texts[pdf_file]
        headings = pdf_headings[pdf_file]
        headings_context = pdf_headings_context[pdf_file]

        # Split text into blocks
        text_blocks = split_text_into_blocks(text, headings, headings_context)

        # Clean text blocks
        text_blocks = clean_text_blocks(text_blocks)

        # Iterate over models
        for model_name, model_function in models.items():
            
            # skip model if bool is False
            if models_bool[model_name] == False:
                continue
            
            # set start time
            start_time = time.time()

            # Calculate sentiment score
            sentiment_score = model_function(text_blocks, headings, text)
            
            # Normalise sentiment score (except for OpenAI) 
            if model_name != "OpenAI": 
                sentiment_score = normalise_score(sentiment_score, historic_df[historic_df["Name"] == model_name]["Min"].values[0], historic_df[historic_df["Name"] == model_name]["Max"].values[0], historic_df.iloc[0, 1], historic_df.iloc[0, 2])
        
            # set end time
            end_time = time.time()
            # print sentiment score
            print(f"{pdf_file} {model_name} sentiment score: {sentiment_score}")
            # print processing time
            print(f"Time taken for {model_name} model to calculate sentiment: ", end_time - start_time)#

    except Exception as e:

        print(f"Error: {e}")
        continue

main_end_time = time.time()
print(" ")
print(" ")
print(f"Total time taken for full pipeline to run: ", main_end_time - main_start_time)

FINAL-Q2-23-Shareholder-Letter Google sentiment score: 0.39000000804662704
Time taken for Google model to calculate sentiment:  4.226781129837036
FINAL-Q2-23-Shareholder-Letter OpenAI sentiment score: 0.25
Time taken for OpenAI model to calculate sentiment:  11.225489854812622
 
 
Total time taken for full pipeline to run:  32.99209499359131


### Semi-Full Pipeline (No Scraping & PDF Preprocessing)

In [5]:
# Load the latest shareholder text and headings from the pickle file
pdf_texts = pickle.load(open("Src/pdf_texts_last_report.pickle", "rb"))                        # Texts are extracted from the pdf
pdf_headings = pickle.load(open("Src/pdf_headings_last_report.pickle", "rb"))                  # Headings are extracted from the pdf text
pdf_headings_context = pickle.load(open("Src/pdf_headings_context_last_report.pickle", "rb"))  # Surrounding text of headings helps to identify headings correctly and avoid duplicates

In [6]:
# Load historic models min and max values for normalisation
historic_df = pd.read_csv("Src/historic_min_max.csv")

In [7]:
# Models with corresponding local functions to calculate sentiment score
models = {
    "TextBlob": calculate_textblob_polarity,
    "Amazon": calculate_amazon_polarity,
    "Google": calculate_google_polarity,
    "OpenAI": calculate_openai_polarity,
    "BERT": calculate_bert_polarity,
    "RoBERTa": calculate_roberta_polarity,
}

# Models with True or False boolean to indicate whether to run the model or not
models_bool = {
    "TextBlob": True,
    "Amazon": False,
    "Google": False,
    "OpenAI": False,
    "BERT": False,
    "RoBERTa": False,
}

main_start_time = time.time()

# Iterate over pdf files
for pdf_file in pdf_texts:

    try:

        text = pdf_texts[pdf_file]
        headings = pdf_headings[pdf_file]
        headings_context = pdf_headings_context[pdf_file]

        # Split text into blocks
        text_blocks = split_text_into_blocks(text, headings, headings_context)

        # Clean text blocks
        text_blocks = clean_text_blocks(text_blocks)

        # Iterate over models
        for model_name, model_function in models.items():

            # skip model if bool is False
            if models_bool[model_name] == False:
                continue
            
            # set start time
            start_time = time.time()

            # Calculate sentiment score
            sentiment_score = model_function(text_blocks, headings, text)
            
            # Normalise sentiment score (except for OpenAI) 
            if model_name != "OpenAI": 
                sentiment_score = normalise_score(sentiment_score, historic_df[historic_df["Name"] == model_name]["Min"].values[0], historic_df[historic_df["Name"] == model_name]["Max"].values[0], historic_df.iloc[0, 1], historic_df.iloc[0, 2])

            # set end time
            end_time = time.time()
            # print sentiment score
            print(f"{pdf_file} {model_name} sentiment score: {sentiment_score}")
            # print processing time
            print(f"Time taken for {model_name} to process text: ", end_time - start_time)
        
        # Calculate the average sentiment score
        average_sentiment_score = np.average(sentiment_scores, weights=[1.0, 1.0])

        # Print the average sentiment score
        print(f"{pdf_file} weighted average sentiment score: {average_sentiment_score}")

    except Exception as e:

        print(f"Error: {e}")

main_end_time = time.time()
print(" ")
print(" ")
print(f"Time taken for semi-full pipeline to run: ", main_end_time - main_start_time)

FINAL-Q2-23-Shareholder-Letter TextBlob sentiment score: 0.6603495902734645
Time taken for TextBlob to process text:  0.03193259239196777
Error: name 'sentiment_scores' is not defined
 
 
Time taken for semi-full pipeline to run:  0.033925771713256836
