In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import os
import re
import pickle
from tqdm.notebook import tqdm
from textblob import TextBlob

In [2]:
def split_text_into_blocks(text, headings):

    # Split the text into blocks based on the headings
    text_blocks = {}

    # Iterate over the headings
    for heading in range(len(headings)):

        if heading == 0:
            document_intro = text.split(headings[heading])[0]
            text_blocks['Document_intro'] = document_intro
        
        text_after_heading = text.split(headings[heading])[1]
        if heading == len(headings) - 1:
            text_blocks[headings[heading]] = text_after_heading
            break
        else:
            text_of_heading = text_after_heading.split(headings[heading+1])[0]
            text_blocks[headings[heading]] = text_of_heading
    
    return text_blocks

In [3]:
def clean_text_blocks(text_blocks):

    # iterate over text blocks, removing '\u200b' and extra spaces
    for key in text_blocks:
        text_blocks[key] = re.sub('\u200b', '', text_blocks[key])
        text_blocks[key] = re.sub(' +', ' ', text_blocks[key])

    return text_blocks

In [4]:
# Load pdf text and headings from the pickle file
pdf_texts = pickle.load(open("pdf_texts.pkl", "rb"))
pdf_headings = pickle.load(open("pdf_headings.pkl", "rb"))

In [9]:
polarity_scores = {}
subjectivity_scores = {}

for pdf_name in tqdm(pdf_texts):

    try:

        if pdf_name == "FINAL-Q2-23-Shareholder-Letter" or pdf_name == "Final-Q1-23-Shareholder-Letter":
            continue

        else:

            text = pdf_texts[pdf_name]
            headings = pdf_headings[pdf_name]

            # split the text into blocks based on the headings
            text_blocks = split_text_into_blocks(text, headings)
            # clean the text blocks
            text_blocks = clean_text_blocks(text_blocks)

            final_prompt = ''
            for heading, text in text_blocks.items():
                final_prompt += heading + ': ' + text + " "

            polarity_score = TextBlob(final_prompt).sentiment.polarity
            subjectivity_score = TextBlob(final_prompt).sentiment.subjectivity

            polarity_scores[pdf_name] = polarity_score
            subjectivity_scores[pdf_name] = subjectivity_score

    except Exception as e:
        print(f"Exception occurred in file {pdf_name}")
        print(f"Exception message: {str(e)}")

  0%|          | 0/50 [00:00<?, ?it/s]

In [13]:
polarity_textblob = pd.DataFrame(list(polarity_scores.items()), columns=['pdf_name', 'polarity'])
subjectivity_textblob = pd.DataFrame(list(subjectivity_scores.items()), columns=['pdf_name', 'subjectivity'])

In [14]:
polarity_textblob.head()

Unnamed: 0,pdf_name,polarity
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.142613
1,FINAL-Q1-18-Shareholder-Letter,0.160147
2,FINAL-Q1-19-Shareholder-Letter,0.183131
3,FINAL-Q1-20-Shareholder-Letter,0.122139
4,FINAL-Q1-21-Shareholder-Letter,0.146732


In [15]:
subjectivity_textblob.head()

Unnamed: 0,pdf_name,subjectivity
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.399306
1,FINAL-Q1-18-Shareholder-Letter,0.383214
2,FINAL-Q1-19-Shareholder-Letter,0.425397
3,FINAL-Q1-20-Shareholder-Letter,0.36724
4,FINAL-Q1-21-Shareholder-Letter,0.394787
