In [21]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import os
import re
import pickle
from tqdm.notebook import tqdm

# Google Cloud language_v1 client library plus credential configuration.
# No connection is opened here: this only sets GOOGLE_APPLICATION_CREDENTIALS to
# a JSON file path relative to the current working directory.
# NOTE(review): presumably the client library reads this variable when a client
# is created; confirm the file exists wherever the notebook is run.
from google.cloud import language_v1
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "application_default_credentials.json"

In [22]:
def split_text_into_blocks(text, headings):
    """Split a document into sections keyed by their headings.

    Each heading is located at its FIRST occurrence in ``text``; later
    occurrences of the same string (e.g. a section body that mentions its own
    heading) stay inside the section instead of cutting it short.

    Args:
        text: Full document text.
        headings: Heading strings, in the order the sections should be cut.

    Returns:
        dict whose first key, ``'Document_intro'``, holds everything before the
        first heading, followed by one entry per heading holding the text
        between that heading and the first occurrence of the next heading (or
        the end of the text for the last heading). The heading strings
        themselves are not part of the values. When ``headings`` is empty the
        whole text is returned under ``'Document_intro'``.

    Raises:
        IndexError: if a heading does not occur in ``text``.
    """
    text_blocks = {}

    # No headings at all: keep the document rather than silently dropping it.
    if not headings:
        text_blocks['Document_intro'] = text
        return text_blocks

    # Everything before the first heading is the introduction.
    text_blocks['Document_intro'] = text.split(headings[0], 1)[0]

    last_index = len(headings) - 1
    for index, heading in enumerate(headings):
        # maxsplit=1 keeps the full remainder of the document; without it a
        # repeated heading would truncate the section at its second occurrence.
        text_after_heading = text.split(heading, 1)[1]

        if index == last_index:
            # The last section runs to the end of the document.
            text_blocks[heading] = text_after_heading
        else:
            # Cut the section where the next heading first appears.
            text_blocks[heading] = text_after_heading.split(headings[index + 1], 1)[0]

    return text_blocks

In [23]:
def clean_text_blocks(text_blocks):
    """Strip zero-width spaces and collapse runs of spaces in every block.

    The dictionary is modified in place and the same object is returned.

    Args:
        text_blocks: dict mapping a block name to its text.

    Returns:
        The same ``text_blocks`` dict, with each value cleaned.
    """
    # Snapshot the items so the dict can be reassigned while looping.
    for block_name, raw_text in list(text_blocks.items()):
        # Drop zero-width space characters first ...
        without_zero_width = re.sub('\u200b', '', raw_text)
        # ... then squeeze any run of spaces down to a single space.
        text_blocks[block_name] = re.sub(' +', ' ', without_zero_width)

    return text_blocks

In [29]:
def google_sentiment_analysis(final_prompt):
    """Run Google document-level sentiment analysis on a piece of text.

    A new ``LanguageServiceClient`` is created on every call.

    Args:
        final_prompt: Text to analyse; ``bytes`` input is decoded as UTF-8.

    Returns:
        A ``(polarity, subjectivity)`` tuple, where polarity is the document
        sentiment's ``score`` and subjectivity is its ``magnitude``.
        NOTE(review): the API field is called ``magnitude``; treating it as
        "subjectivity" is this notebook's naming -- confirm that is intended.
    """
    service = language_v1.LanguageServiceClient()

    # Accept raw bytes as well as str.
    content = final_prompt.decode("utf-8") if isinstance(final_prompt, bytes) else final_prompt

    # The document is sent as plain text (not HTML).
    request = {
        "document": {
            "type_": language_v1.Document.Type.PLAIN_TEXT,
            "content": content,
        }
    }

    document_sentiment = service.analyze_sentiment(request=request).document_sentiment

    return document_sentiment.score, document_sentiment.magnitude

In [25]:
# Load pdf text and headings from the pickle files.
# Context managers make sure both file handles are closed after loading.
# NOTE: pickle.load can execute arbitrary code -- only load files you produced
# yourself or otherwise trust.
with open("pdf_texts.pkl", "rb") as texts_file:
    pdf_texts = pickle.load(texts_file)
with open("pdf_headings.pkl", "rb") as headings_file:
    pdf_headings = pickle.load(headings_file)

In [32]:
# Score every document with the Google sentiment API; results are keyed by pdf name.
polarity_scores = {}
subjectivity_scores = {}

# Documents that are deliberately left out of the scoring run.
# NOTE(review): the reason for excluding them is not recorded in this notebook.
SKIPPED_PDFS = {"FINAL-Q2-23-Shareholder-Letter", "Final-Q1-23-Shareholder-Letter"}

for pdf_name in tqdm(pdf_texts):

    if pdf_name in SKIPPED_PDFS:
        continue

    # Best effort: a failure on one document is reported and the loop moves on,
    # so that document simply has no entry in the score dictionaries.
    try:
        text = pdf_texts[pdf_name]
        headings = pdf_headings[pdf_name]

        # split the text into blocks based on the headings
        text_blocks = split_text_into_blocks(text, headings)
        # clean the text blocks
        text_blocks = clean_text_blocks(text_blocks)

        # Rebuild one prompt of the form "<heading>: <block text> " per block.
        # A distinct loop name keeps `text` bound to the full document text.
        final_prompt = ''.join(
            heading + ': ' + block_text + " "
            for heading, block_text in text_blocks.items()
        )

        polarity_score, subjectivity_score = google_sentiment_analysis(final_prompt)

        polarity_scores[pdf_name] = polarity_score
        subjectivity_scores[pdf_name] = subjectivity_score

    except Exception as e:
        print(f"Exception occurred in file {pdf_name}")
        print(f"Exception message: {str(e)}")

  0%|          | 0/50 [00:00<?, ?it/s]

In [33]:
# Turn each score dictionary into a two-column table: one row per scored pdf.
polarity_rows = [(name, score) for name, score in polarity_scores.items()]
polarity_google = pd.DataFrame(polarity_rows, columns=['pdf_name', 'polarity'])

subjectivity_rows = [(name, score) for name, score in subjectivity_scores.items()]
subjectivity_google = pd.DataFrame(subjectivity_rows, columns=['pdf_name', 'subjectivity'])

In [35]:
# Preview the first rows of the per-document polarity table.
polarity_google.head()

Unnamed: 0,pdf_name,polarity
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.2
1,FINAL-Q1-18-Shareholder-Letter,0.2
2,FINAL-Q1-19-Shareholder-Letter,0.2
3,FINAL-Q1-20-Shareholder-Letter,0.0
4,FINAL-Q1-21-Shareholder-Letter,0.1


In [36]:
# Preview the first rows of the per-document "subjectivity" table; these values
# are the `magnitude` field returned by google_sentiment_analysis.
subjectivity_google.head()

Unnamed: 0,pdf_name,subjectivity
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,37.099998
1,FINAL-Q1-18-Shareholder-Letter,27.0
2,FINAL-Q1-19-Shareholder-Letter,23.4
3,FINAL-Q1-20-Shareholder-Letter,59.700001
4,FINAL-Q1-21-Shareholder-Letter,23.799999
