In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
import re
import os
import pickle
from tqdm.notebook import tqdm

# Load local modules
from module_text_blocks import split_text_into_blocks, clean_text_blocks

# Import credentials
from credentials_amazon import *

# Connect to Amazon API
import boto3
# Publish the imported credentials through the process environment in one step.
# NOTE(review): boto3 documents AWS_DEFAULT_REGION (not AWS_REGION) as its region
# variable -- confirm whether anything actually consumes AWS_REGION.
os.environ.update({
    "AWS_ACCESS_KEY_ID": AWS_ACCESS_KEY_ID,
    "AWS_SECRET_ACCESS_KEY": AWS_SECRET_ACCESS_KEY,
    "AWS_REGION": AWS_REGION,
})

In [2]:
# Comprehend clients created by amazon_analyze_sentiment, keyed by region name,
# so repeated calls reuse one client instead of building a new one every time.
_COMPREHEND_CLIENTS = {}


def amazon_analyze_sentiment(text, comprehend=None, region_name="us-west-2", language_code='en'):
    """Return Amazon Comprehend's sentiment confidence scores for ``text``.

    Parameters
    ----------
    text : str
        Document to analyse; passed unchanged as the ``Text`` argument of
        ``detect_sentiment``.
    comprehend : optional
        Object exposing ``detect_sentiment(Text=..., LanguageCode=...)``.
        When omitted, a boto3 Comprehend client for ``region_name`` is
        created on first use and reused on later calls.
    region_name : str, optional
        Region used when the client has to be created (default "us-west-2").
    language_code : str, optional
        Value passed as ``LanguageCode`` (default 'en').

    Returns
    -------
    dict
        The ``"SentimentScore"`` entry of the service response.

    Raises
    ------
    Whatever the client raises for invalid input or failed requests; a
    ``KeyError`` if the response has no ``"SentimentScore"`` entry.
    """
    if comprehend is None:
        comprehend = _COMPREHEND_CLIENTS.get(region_name)
        if comprehend is None:
            # Building a client is comparatively expensive; do it once per region.
            comprehend = boto3.client(service_name='comprehend', region_name=region_name)
            _COMPREHEND_CLIENTS[region_name] = comprehend

    sentiment_response = comprehend.detect_sentiment(Text=text, LanguageCode=language_code)
    return sentiment_response["SentimentScore"]

In [3]:
# Load pdf text and headings from the pickle file
# NOTE: pickle.load can execute arbitrary code -- only load files produced by this project.
# The context managers make sure both file handles are closed after loading.
with open("pdf_texts.pkl", "rb") as pdf_texts_file:
    pdf_texts = pickle.load(pdf_texts_file)
with open("pdf_headings.pkl", "rb") as pdf_headings_file:
    pdf_headings = pickle.load(pdf_headings_file)

In [7]:
# Size guard for a single sentiment request (the service limit is expressed in bytes, not characters)
MAX_SENTIMENT_BYTES = 5000


def _split_text_by_headings(text, headings):
    """Cut ``text`` into blocks keyed by heading.

    The text before the first heading is stored under 'Document_intro'.
    Every heading maps to the text that follows its first occurrence, up to
    the first occurrence of the next heading (or to the end of the text for
    the last heading). A heading that is not found maps to an empty string.
    """
    blocks = {}
    last_index = len(headings) - 1

    for index, heading in enumerate(headings):
        # partition() splits on the FIRST occurrence only, so a heading that is
        # repeated later in the body no longer truncates its own block.
        before_heading, _, after_heading = text.partition(heading)

        # in the beginning of document, there is no heading, so let's define it as document intro
        if index == 0:
            blocks['Document_intro'] = before_heading

        if index == last_index:
            # the last heading owns everything up to the end of the document
            blocks[heading] = after_heading
        else:
            # middle headings stop where the next heading starts
            blocks[heading] = after_heading.partition(headings[index + 1])[0]

    return blocks


polarity_scores = {}

for pdf_name in tqdm(pdf_texts):

    try:

        # scores of the blocks of the current pdf (rebuilt on every iteration)
        text_blocks_scores = []

        text = pdf_texts[pdf_name]
        headings = pdf_headings[pdf_name]

        # split the text into blocks based on the headings
        text_blocks = _split_text_by_headings(text, headings)

        # clean the text blocks
        text_blocks = clean_text_blocks(text_blocks)

        # iterate over the text blocks individually, otherwise single request with all text will fail
        for block_heading, block_text in text_blocks.items():

            # an empty block would be rejected by the API and the resulting
            # exception would discard the whole pdf, so skip it instead
            if not block_text.strip():
                continue

            # the prompt for amazon sentiment analysis should be less than 5000 bytes;
            # measure the UTF-8 encoding because non-ASCII characters take more than one byte
            if len(block_text.encode("utf-8")) >= MAX_SENTIMENT_BYTES:
                continue

            polarity_score = amazon_analyze_sentiment(block_text)

            # NOTE(review): this keeps the confidence of the dominant class whatever
            # that class is, so a strongly negative block scores as high as a strongly
            # positive one -- confirm this is the intended "polarity".
            key = max(polarity_score, key=polarity_score.get)
            text_blocks_scores.append(polarity_score[key])

        # no scored block -> NaN, without the "mean of empty slice" warning
        polarity_scores[pdf_name] = np.mean(text_blocks_scores) if text_blocks_scores else np.nan

    except Exception as e:

        # best effort: report the failing pdf and carry on with the next one
        print(f"Exception occurred in file {pdf_name}")
        print(f"Exception message: {str(e)}")

  0%|          | 0/50 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [8]:
# comprehend = boto3.client(service_name='comprehend', region_name="us-west-2")
# sentiment_response = comprehend.detect_sentiment(Text="Wow, so amazing", LanguageCode='en')

In [9]:
# sentiment_response

{'Sentiment': 'POSITIVE',
 'SentimentScore': {'Positive': 0.9996782541275024,
  'Negative': 7.027111132629216e-05,
  'Neutral': 0.00021913267846684903,
  'Mixed': 3.231415030313656e-05},
 'ResponseMetadata': {'RequestId': '39e4554a-7300-4ebd-b513-c9f90813a35e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '39e4554a-7300-4ebd-b513-c9f90813a35e',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '166',
   'date': 'Tue, 08 Aug 2023 21:14:07 GMT'},
  'RetryAttempts': 0}}

In [50]:
# Inspect the per-block scores; the list is re-created for every pdf inside the loop,
# so this shows only the document that was processed most recently.
text_blocks_scores

[0.7302021980285645,
 0.9798567891120911,
 0.9805167317390442,
 0.8968285918235779,
 0.8543012738227844,
 0.9246026277542114,
 0.9749899506568909,
 0.9163967370986938,
 0.9347952008247375]

In [12]:
# Turn the {pdf_name: score} mapping into a two-column table, one row per pdf
polarity_rows = [(name, score) for name, score in polarity_scores.items()]
polarity_amazon = pd.DataFrame(polarity_rows, columns=['pdf_name', 'polarity'])
# subjectivity_amazon = pd.DataFrame(list(subjectivity_scores.items()), columns=['pdf_name', 'subjectivity'])

In [15]:
# Preview the first five rows of the polarity table
polarity_amazon.head(n=5)

Unnamed: 0,pdf_name,polarity
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.883941
1,FINAL-Q1-18-Shareholder-Letter,0.870285
2,FINAL-Q1-20-Shareholder-Letter,0.89268
3,FINAL-Q1-21-Shareholder-Letter,0.879103
4,FINAL-Q1-22-Shareholder-Letter,0.848568


In [16]:
# export to csv
# to_csv does not create missing parent folders, so make sure "Src" exists
# before writing (otherwise the export fails after all the API calls were made)
os.makedirs("Src", exist_ok=True)
polarity_amazon.to_csv("Src/polarity_amazon.csv", index=False)