In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
import re
import os
import pickle
from tqdm.notebook import tqdm

# Load local modules
from module_text_blocks import split_text_into_blocks, clean_text_blocks

# Import credentials
from credentials_amazon import *

# Connect to Amazon API
import boto3
# Publish the credentials imported from credentials_amazon as environment
# variables so that boto3's default credential chain can pick them up.
os.environ["AWS_ACCESS_KEY_ID"] = AWS_ACCESS_KEY_ID
os.environ["AWS_SECRET_ACCESS_KEY"] = AWS_SECRET_ACCESS_KEY
os.environ["AWS_REGION"] = AWS_REGION
# boto3 reads its default region from AWS_DEFAULT_REGION (AWS_REGION is not
# consulted), so export the same value under that name as well.
os.environ["AWS_DEFAULT_REGION"] = AWS_REGION

In [2]:
# Comprehend clients keyed by region, so repeated calls reuse one client
# instead of building a new one for every text chunk.
_COMPREHEND_CLIENTS = {}


def _get_comprehend_client(region_name):
    """Return the cached Comprehend client for region_name, creating it on first use."""
    client = _COMPREHEND_CLIENTS.get(region_name)
    if client is None:
        client = boto3.client(service_name='comprehend', region_name=region_name)
        _COMPREHEND_CLIENTS[region_name] = client
    return client


def amazon_analyze_sentiment(text, language_code='en', region_name="us-west-2"):
    """Run Amazon Comprehend sentiment detection on a single piece of text.

    Parameters
    ----------
    text : str
        Text to analyse. The caller is responsible for keeping it within the
        service's size limit (see the DetectSentiment API reference).
    language_code : str, optional
        Language code passed to Comprehend as ``LanguageCode``. Defaults to
        ``'en'``, the value that was previously hard-coded.
    region_name : str, optional
        AWS region of the Comprehend endpoint. Defaults to ``"us-west-2"``,
        the value that was previously hard-coded.

    Returns
    -------
    dict
        The ``"SentimentScore"`` entry of the DetectSentiment response.

    Notes
    -----
    Errors raised by boto3 (credentials, validation, throttling, ...) are not
    caught here and propagate to the caller. The client is cached per region;
    re-run this cell to discard cached clients after changing credentials.
    """
    comprehend = _get_comprehend_client(region_name)
    sentiment_response = comprehend.detect_sentiment(Text=text, LanguageCode=language_code)
    return sentiment_response["SentimentScore"]

In [3]:
def split_text_by_chars(text, num_chars):
    """Split the input text every num_chars characters.

    Parameters
    ----------
    text : str
        Text to split. An empty string yields an empty list.
    num_chars : int
        Maximum number of characters per piece; must be positive.

    Returns
    -------
    list of str
        Consecutive slices of ``text``; every piece has ``num_chars``
        characters except possibly the last, which holds the remainder.

    Raises
    ------
    ValueError
        If ``num_chars`` is zero or negative. A negative step would otherwise
        make ``range`` empty and silently drop the whole text.
    """
    if num_chars <= 0:
        raise ValueError(f"num_chars must be a positive integer, got {num_chars}")
    return [text[i:i+num_chars] for i in range(0, len(text), num_chars)]

In [4]:
# Load pdf text and headings from the pickle files.
# NOTE: pickle can execute arbitrary code while loading; only load files that
# were produced by a trusted step of this project.
# The `with` blocks close the file handles as soon as loading finishes.
with open("pdf_texts.pkl", "rb") as pdf_texts_file:
    pdf_texts = pickle.load(pdf_texts_file)
with open("pdf_headings.pkl", "rb") as pdf_headings_file:
    pdf_headings = pickle.load(pdf_headings_file)

In [20]:
# The sentiment request must stay under 5000 bytes of UTF-8 text, so sizes are
# measured on the encoded text; a character count under-estimates the size of
# text that contains multi-byte characters (curly quotes, dashes, ...).
SINGLE_REQUEST_MAX_BYTES = 4750  # a block smaller than this is sent as one request
CHUNK_MAX_BYTES = 4500           # piece size used when a text has to be split


def _split_text_by_utf8_bytes(text, max_bytes):
    """Split text into pieces of at most max_bytes UTF-8 bytes, never cutting inside a character."""
    if max_bytes < 4:
        # a single UTF-8 character can take up to 4 bytes
        raise ValueError("max_bytes must be at least 4")
    encoded = text.encode("utf-8")
    pieces = []
    start = 0
    while start < len(encoded):
        end = min(start + max_bytes, len(encoded))
        # UTF-8 continuation bytes have the form 0b10xxxxxx; step back until the
        # cut falls on the first byte of a character
        while end < len(encoded) and (encoded[end] & 0xC0) == 0x80:
            end -= 1
        pieces.append(encoded[start:end].decode("utf-8"))
        start = end
    return pieces


def _score_pieces(pieces):
    """Return, for each piece, the highest value in its sentiment score dict (one request per piece)."""
    # NOTE(review): this keeps the score of whichever sentiment label scored
    # highest, regardless of which label that is, so the resulting "polarity"
    # does not tell positive from negative text -- confirm this is intended.
    return [max(amazon_analyze_sentiment(piece).values()) for piece in pieces]


polarity_scores = {}
# Only this letter is processed; an earlier assignment of
# ["FINAL-Q1-19-Shareholder-Letter"] was immediately overwritten and had no effect.
pdf_lists = ["FINAL-Q2-20-Shareholder-Letter-V3-with-Tables"]

for pdf_name in tqdm(pdf_lists):

    try:

        text_blocks_scores = []

        document_text = pdf_texts[pdf_name]
        headings = pdf_headings[pdf_name]

        # split the text into blocks based on the headings
        text_blocks = split_text_into_blocks(document_text, headings)
        # clean the text blocks
        text_blocks = clean_text_blocks(text_blocks)

        # score the text blocks individually, otherwise a single request with all text will fail
        if len(headings) > 0:
            for heading, block_text in text_blocks.items():

                # everything from the "Reference" block onwards is ignored
                if heading == "Reference":
                    break

                if len(block_text) == 0:
                    continue

                if len(block_text.encode("utf-8")) < SINGLE_REQUEST_MAX_BYTES:
                    pieces = [block_text]
                else:
                    # split into several requests if the block is too long
                    pieces = _split_text_by_utf8_bytes(block_text, CHUNK_MAX_BYTES)

                text_blocks_scores.extend(_score_pieces(pieces))

        else:
            # no headings: score the whole document in fixed-size pieces
            pieces = _split_text_by_utf8_bytes(document_text, CHUNK_MAX_BYTES)
            text_blocks_scores.extend(_score_pieces(pieces))

        if text_blocks_scores:
            polarity_scores[pdf_name] = np.mean(text_blocks_scores)
        else:
            # nothing was scored (empty document, or only empty blocks before
            # "Reference"); record NaN explicitly instead of relying on
            # np.mean([]) and its RuntimeWarning
            print(f"No text was scored for file {pdf_name}")
            polarity_scores[pdf_name] = np.nan

    except Exception as e:

        # best effort: report the failure and carry on with the next document
        print(f"Exception occurred in file {pdf_name}")
        print(f"Exception message: {str(e)}")

  0%|          | 0/1 [00:00<?, ?it/s]

693
788
2658
2892
1502
607
3456


In [21]:
# One row per scored document: (pdf_name, polarity).
polarity_rows = [(scored_name, score) for scored_name, score in polarity_scores.items()]
amazon_polarity = pd.DataFrame(polarity_rows, columns=['pdf_name', 'polarity'])

In [22]:
# Preview the first rows of the polarity table.
amazon_polarity.head(n=5)

Unnamed: 0,pdf_name,polarity
0,FINAL-Q2-20-Shareholder-Letter-V3-with-Tables,0.846756


In [26]:
# export to csv
# amazon_polarity.to_csv("Scores/amazon_polarity.csv", index=False)