In [4]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
import re
import os
import pickle
from tqdm.notebook import tqdm

# Load local modules
from module_text_blocks import split_text_into_blocks, clean_text_blocks

# Import credentials
from credentials_amazon import *

# Connect to Amazon API
import boto3
os.environ["AWS_ACCESS_KEY_ID"] = AWS_ACCESS_KEY_ID
os.environ["AWS_SECRET_ACCESS_KEY"] = AWS_SECRET_ACCESS_KEY
os.environ["AWS_REGION"] = AWS_REGION

In [5]:
def amazon_analyze_sentiment(text):
    comprehend = boto3.client(service_name='comprehend', region_name="us-west-2")
    sentiment_response = comprehend.detect_sentiment(Text=text, LanguageCode='en')
    return sentiment_response["SentimentScore"]

In [18]:
def split_text_by_chars(text, num_chars):
    """Split the input text every num_chars characters."""
    return [text[i:i+num_chars] for i in range(0, len(text), num_chars)]

In [6]:
# Load pdf text and headings from the pickle file
pdf_texts = pickle.load(open("pdf_texts.pkl", "rb"))
pdf_headings = pickle.load(open("pdf_headings.pkl", "rb"))

In [19]:
polarity_scores = {}

for pdf_name in tqdm(pdf_texts):

    try:


        text_blocks_scores = []

        text = pdf_texts[pdf_name]
        headings = pdf_headings[pdf_name]

        # split the text into blocks based on the headings
        text_blocks = split_text_into_blocks(text, headings)
        # clean the text blocks
        text_blocks = clean_text_blocks(text_blocks)

        # iterate over the text blocks individually, otherwise single request with all text will fail
        if len(headings) > 0:
            for heading, text in text_blocks.items():
                
                # the prompt for amazon sentiment analysis should be less than 5000 bytes
                if len(text) < 5000:
                    polarity_score = amazon_analyze_sentiment(text)
                    key = max(polarity_score, key=polarity_score.get)
                    text_blocks_scores.append(polarity_score[key])
        
            polarity_scores[pdf_name] = np.mean(text_blocks_scores)
        else:
            text_blocks = split_text_by_chars(text, 4500)
            for text in text_blocks:
                polarity_score = amazon_analyze_sentiment(text)
                key = max(polarity_score, key=polarity_score.get)
                text_blocks_scores.append(polarity_score[key])
                
            polarity_scores[pdf_name] = np.mean(text_blocks_scores)
        
        # break

    except Exception as e:

        print(f"Exception occurred in file {pdf_name}")
        print(f"Exception message: {str(e)}")

        # break

  0%|          | 0/50 [00:00<?, ?it/s]

Exception occurred in file FINAL-Q1-19-Shareholder-Letter
Exception message: Parameter validation failed:
Invalid length for parameter Text, value: 0, valid min length: 1
Exception occurred in file Final-Q1-23-Shareholder-Letter
Exception message: An error occurred (TextSizeLimitExceededException) when calling the DetectSentiment operation: Input text size exceeds limit. Max length of request text allowed is 5000 bytes while in this request the text size is 5011 bytes
Exception occurred in file FINAL-Q2-20-Shareholder-Letter-V3-with-Tables
Exception message: Parameter validation failed:
Invalid length for parameter Text, value: 0, valid min length: 1
Exception occurred in file FINAL-Q3-20-Shareholder-Letter
Exception message: Parameter validation failed:
Invalid length for parameter Text, value: 0, valid min length: 1
Exception occurred in file FINAL-Q420-Shareholder-Letter
Exception message: Parameter validation failed:
Invalid length for parameter Text, value: 0, valid min length: 1


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Exception occurred in file Q2-19-Shareholder-Letter-FINAL
Exception message: Parameter validation failed:
Invalid length for parameter Text, value: 0, valid min length: 1


In [20]:
amazon_polarity = pd.DataFrame(list(polarity_scores.items()), columns=['pdf_name', 'polarity'])

In [23]:
amazon_polarity.head()

Unnamed: 0,pdf_name,polarity
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.883941
1,FINAL-Q1-18-Shareholder-Letter,0.870285
2,FINAL-Q1-20-Shareholder-Letter,0.89268
3,FINAL-Q1-21-Shareholder-Letter,0.879103
4,FINAL-Q1-22-Shareholder-Letter,0.848568


In [26]:
# export to csv
# amazon_polarity.to_csv("Scores/amazon_polarity.csv", index=False)