In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import os
import re
import pickle
from tqdm.notebook import tqdm
from textblob import TextBlob

# Load local modules
from module_text_blocks import split_text_into_blocks, clean_text_blocks

In [3]:
# Load pdf text and headings from the pickle files.
# Context managers make sure both file handles are closed after reading.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load these files if they come from a trusted source.
with open("pdf_texts2.pkl", "rb") as texts_file:
    pdf_texts = pickle.load(texts_file)
with open("pdf_headings2.pkl", "rb") as headings_file:
    pdf_headings = pickle.load(headings_file)

In [22]:
# Score every PDF with TextBlob and collect one polarity value and one
# subjectivity value per document, keyed by the PDF name.
polarity_scores = {}
subjectivity_scores = {}

for pdf_name in tqdm(pdf_texts):

    try:

        text = pdf_texts[pdf_name]
        headings = pdf_headings[pdf_name]

        # split the text into blocks based on the headings
        text_blocks = split_text_into_blocks(text, headings)
        # clean the text blocks
        text_blocks = clean_text_blocks(text_blocks)

        if len(headings) > 0:
            # Testing condition: keep only the intro block and any block whose
            # heading mentions "results" (case-insensitive), each rendered as
            # "<heading>: <block text> ".
            # NOTE(review): when no heading matches, the prompt is the empty
            # string and a score is still recorded for it -- confirm that this
            # is intended rather than skipping the document.
            final_prompt = ''.join(
                heading + ': ' + block_text + " "
                for heading, block_text in text_blocks.items()
                if heading == 'Document_intro' or 'results' in heading.lower()
            )
        else:
            # No headings available: score the full document text.
            final_prompt = text

        # Run the sentiment analysis once and read both scores from the result.
        sentiment = TextBlob(final_prompt).sentiment

        polarity_scores[pdf_name] = sentiment.polarity
        subjectivity_scores[pdf_name] = sentiment.subjectivity

    except Exception as e:

        # Best effort: report the failing file and continue with the rest.
        print(f"Exception occurred in file {pdf_name}")
        print(f"Exception message: {str(e)}")

  0%|          | 0/50 [00:00<?, ?it/s]

In [24]:
# Turn each {pdf_name: score} mapping into a two-column table.
polarity_textblob = pd.DataFrame(
    {
        'pdf_name': list(polarity_scores.keys()),
        'polarity': list(polarity_scores.values()),
    }
)
subjectivity_textblob = pd.DataFrame(
    {
        'pdf_name': list(subjectivity_scores.keys()),
        'subjectivity': list(subjectivity_scores.values()),
    }
)

In [25]:
# Preview the first rows of the polarity table
polarity_textblob.head()

Unnamed: 0,pdf_name,polarity
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.095483
1,FINAL-Q1-18-Shareholder-Letter,0.136311
2,FINAL-Q1-19-Shareholder-Letter,0.071943
3,FINAL-Q1-20-Shareholder-Letter,0.106488
4,FINAL-Q1-21-Shareholder-Letter,0.025847


In [26]:
# Preview the first rows of the subjectivity table
subjectivity_textblob.head()

Unnamed: 0,pdf_name,subjectivity
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.300157
1,FINAL-Q1-18-Shareholder-Letter,0.305329
2,FINAL-Q1-19-Shareholder-Letter,0.332486
3,FINAL-Q1-20-Shareholder-Letter,0.381387
4,FINAL-Q1-21-Shareholder-Letter,0.363652


In [20]:
# export polarity and subjectivity scores
# Create the output folder first so the export does not fail on a fresh checkout.
os.makedirs('Scores', exist_ok=True)
# The row index is written as an unnamed first column (to_csv default), which
# keeps the file layout unchanged for anything that already reads these files.
polarity_textblob.to_csv('Scores/textblob_polarity_2.csv')
subjectivity_textblob.to_csv('Scores/textblob_subjectivity_2.csv')