In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import os
import re
import pickle
from tqdm.notebook import tqdm

# Load local modules
from module_text_blocks import split_text_into_blocks, clean_text_blocks

# Connect to Google API
from google.cloud import language_v1
import os  # NOTE(review): `os` is already imported in the library block above; this repeat is redundant
# Tell the Google client library which credentials JSON file to use.
# The path is relative, so it is resolved against the current working directory.
# NOTE(review): presumably a service-account key; confirm it is kept out of version control.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "credentials_google.json"

In [2]:
def google_sentiment_analysis(final_prompt, client=None):
    """Score the sentiment of a text with the Google Cloud Natural Language API.

    Parameters
    ----------
    final_prompt : str or bytes
        Text to analyse. ``bytes`` input is decoded as UTF-8 first.
    client : language_v1.LanguageServiceClient, optional
        Client used to send the request. When omitted, a single client is
        created on first use and reused by later calls, instead of opening
        a new client for every document.

    Returns
    -------
    tuple
        ``(polarity_score, subjectivity_score)``: the document-level
        sentiment ``score`` and ``magnitude`` returned by the API.

    Notes
    -----
    The second value is the API's *magnitude*: the overall strength of
    emotion in the text. It is non-negative, not normalised, and grows with
    document length, so it is not a bounded subjectivity measure. The name
    ``subjectivity_score`` is kept only for compatibility with the callers.

    Errors raised by the client (authentication, quota, invalid input) are
    not caught here and propagate to the caller.
    """
    if client is None:
        # Lazily create one shared client and cache it on the function object.
        client = getattr(google_sentiment_analysis, "_client", None)
        if client is None:
            client = language_v1.LanguageServiceClient()
            google_sentiment_analysis._client = client

    if isinstance(final_prompt, bytes):
        final_prompt = final_prompt.decode("utf-8")

    type_ = language_v1.Document.Type.PLAIN_TEXT
    document = {"type_": type_, "content": final_prompt}

    response = client.analyze_sentiment(request={"document": document})
    sentiment = response.document_sentiment

    polarity_score = sentiment.score
    # API "magnitude"; see the Notes section of the docstring.
    subjectivity_score = sentiment.magnitude

    return polarity_score, subjectivity_score

In [3]:
# Load pdf text and headings from the pickle file
# Each file is opened in a `with` block so its handle is closed right after loading.
# NOTE: pickle.load can execute arbitrary code; only load pickles produced by this project.
with open("pdf_texts.pkl", "rb") as pkl_file:
    pdf_texts = pickle.load(pkl_file)
with open("pdf_headings.pkl", "rb") as pkl_file:
    pdf_headings = pickle.load(pkl_file)
with open("pdf_headings_context.pkl", "rb") as pkl_file:
    pdf_headings_context = pickle.load(pkl_file)

In [4]:
# Per-file results, keyed by pdf name.
polarity_scores = {}
subjectivity_scores = {}
# Files whose processing raised, mapped to the error text, so that gaps in
# the score tables can be traced after the run instead of only being printed.
failed_pdfs = {}

for pdf_name in tqdm(pdf_texts):

    try:

        pdf_text = pdf_texts[pdf_name]
        headings = pdf_headings[pdf_name]
        headings_context = pdf_headings_context[pdf_name]

        # split the text into blocks based on the headings
        text_blocks = split_text_into_blocks(pdf_text, headings, headings_context)
        # clean the text blocks
        text_blocks = clean_text_blocks(text_blocks)

        if len(headings) > 0:
            # Build "heading: text " for each block in one pass. The block
            # text has its own name so it cannot shadow the full document text.
            final_prompt = "".join(
                heading + ': ' + block_text + " "
                for heading, block_text in text_blocks.items()
            )
        else:
            # No headings: score the raw document text as a whole.
            final_prompt = pdf_text

        polarity_score, subjectivity_score = google_sentiment_analysis(final_prompt)

        polarity_scores[pdf_name] = polarity_score
        subjectivity_scores[pdf_name] = subjectivity_score

    except Exception as e:
        # Best effort: one bad file must not stop the batch, but keep a record of it.
        failed_pdfs[pdf_name] = str(e)
        print(f"Exception occurred in file {pdf_name}")
        print(f"Exception message: {str(e)}")

  0%|          | 0/50 [00:00<?, ?it/s]

In [5]:
# Turn each {pdf_name: score} dict into a two-column table.
polarity_google = pd.DataFrame({
    'pdf_name': list(polarity_scores.keys()),
    'polarity': list(polarity_scores.values()),
})
subjectivity_google = pd.DataFrame({
    'pdf_name': list(subjectivity_scores.keys()),
    'subjectivity': list(subjectivity_scores.values()),
})

In [6]:
# Preview the first rows of the polarity table
polarity_google.head()

Unnamed: 0,pdf_name,polarity
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.2
1,FINAL-Q1-18-Shareholder-Letter,0.2
2,FINAL-Q1-19-Shareholder-Letter,0.1
3,FINAL-Q1-20-Shareholder-Letter,0.0
4,FINAL-Q1-21-Shareholder-Letter,0.1


In [7]:
# Preview the first rows of the subjectivity table
subjectivity_google.head()

Unnamed: 0,pdf_name,subjectivity
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,37.099998
1,FINAL-Q1-18-Shareholder-Letter,29.200001
2,FINAL-Q1-19-Shareholder-Letter,26.1
3,FINAL-Q1-20-Shareholder-Letter,43.200001
4,FINAL-Q1-21-Shareholder-Letter,29.5


In [8]:
# export polarity and subjectivity scores
# Create the output folder first: to_csv does not create missing directories.
os.makedirs('Scores', exist_ok=True)
polarity_google.to_csv('Scores/google_polarity.csv', index=False)
subjectivity_google.to_csv('Scores/google_subjectivity.csv', index=False)

In [3]:
# Reload both score tables from their CSV exports (polarity first, then subjectivity)
polarity_google, subjectivity_google = [
    pd.read_csv(csv_path)
    for csv_path in ('Scores/google_polarity.csv', 'Scores/google_subjectivity.csv')
]

In [9]:
# Preview the first rows of the polarity table (here read back from the CSV file)
polarity_google.head()

Unnamed: 0,pdf_name,polarity
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.2
1,FINAL-Q1-18-Shareholder-Letter,0.2
2,FINAL-Q1-19-Shareholder-Letter,0.1
3,FINAL-Q1-20-Shareholder-Letter,0.0
4,FINAL-Q1-21-Shareholder-Letter,0.1


In [10]:
# Preview the first rows of the subjectivity table (here read back from the CSV file)
subjectivity_google.head()

Unnamed: 0,pdf_name,subjectivity
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,37.099998
1,FINAL-Q1-18-Shareholder-Letter,29.200001
2,FINAL-Q1-19-Shareholder-Letter,26.1
3,FINAL-Q1-20-Shareholder-Letter,43.200001
4,FINAL-Q1-21-Shareholder-Letter,29.5
