In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
import re
import os
import pickle
import openai
from tqdm.notebook import tqdm

# Load local modules
from module_text_blocks import split_text_into_blocks, clean_text_blocks

# Pull the API credentials from the local credential modules. The wildcard imports are
# expected to supply openai_api_key, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_REGION.
from credentials_amazon import *
from credentials_openai import *
openai.api_key = openai_api_key

# Publish the AWS credentials through the process environment for boto3
# NOTE(review): boto3 documents AWS_DEFAULT_REGION as its region variable - confirm that
# AWS_REGION is actually honored by whatever reads it.
import boto3
os.environ.update({
    "AWS_ACCESS_KEY_ID": AWS_ACCESS_KEY_ID,
    "AWS_SECRET_ACCESS_KEY": AWS_SECRET_ACCESS_KEY,
    "AWS_REGION": AWS_REGION,
})

In [2]:
# Comprehend clients keyed by region, so repeated calls reuse one client instead of
# building a new one per request. Re-running this cell resets the cache.
_COMPREHEND_CLIENTS = {}


def amazon_analyze_sentiment(text, language_code='en', region_name="us-west-2"):
    """Run Amazon Comprehend sentiment detection on one piece of text.

    Parameters
    ----------
    text : str
        Text to analyze. The caller is responsible for keeping it within the
        size limit of the Comprehend `detect_sentiment` request.
    language_code : str, optional
        Language code passed to Comprehend (default 'en').
    region_name : str, optional
        AWS region of the Comprehend endpoint (default "us-west-2").

    Returns
    -------
    dict
        The "SentimentScore" entry of the Comprehend response: a mapping from
        sentiment class to its confidence score.

    Raises
    ------
    Whatever boto3 raises for invalid input, missing credentials or service errors.
    """
    comprehend = _COMPREHEND_CLIENTS.get(region_name)
    if comprehend is None:
        # Client construction is comparatively expensive; do it once per region
        comprehend = boto3.client(service_name='comprehend', region_name=region_name)
        _COMPREHEND_CLIENTS[region_name] = comprehend

    sentiment_response = comprehend.detect_sentiment(Text=text, LanguageCode=language_code)
    return sentiment_response["SentimentScore"]

In [3]:
def split_text_by_chars(text, num_chars):
    """Cut text into consecutive pieces of num_chars characters each.

    The final piece holds whatever is left over and may be shorter. An empty
    text yields an empty list.
    """
    pieces = []
    for start in range(0, len(text), num_chars):
        stop = start + num_chars
        pieces.append(text[start:stop])
    return pieces

In [4]:
import tiktoken

# Token counting helper used to size prompts
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens that the named tiktoken encoding produces for a string."""
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

In [5]:
def summarize_long_text_blocks(text, word_threshold=750, token_limit=3500, chunk_tokens=2500,
                               model="gpt-3.5-turbo", temperature=0.3, max_tokens=500):
    """Summarize a text block with the OpenAI chat API when it is long.

    Text with at most `word_threshold` words is returned unchanged. Longer text
    is summarized; if it also exceeds `token_limit` tokens it is first cut into
    character chunks of roughly `chunk_tokens` tokens each, every chunk is
    summarized separately, and the summaries are joined with single spaces.

    Parameters
    ----------
    text : str
        The text block to (possibly) summarize.
    word_threshold : int, optional
        Blocks with more words than this are summarized (default 750).
    token_limit : int, optional
        Token count above which the block is split before summarizing (default 3500).
    chunk_tokens : int, optional
        Approximate token size of each chunk when splitting (default 2500).
    model, temperature, max_tokens : optional
        Passed through to `openai.ChatCompletion.create`.

    Returns
    -------
    str
        The original text, or the joined summaries.

    Raises
    ------
    Whatever the OpenAI client raises for request or service errors.
    """
    # split() with no argument separates on any whitespace run, so text whose words are
    # divided by newlines, tabs or repeated spaces is counted correctly (split(' ') is not)
    word_count = len(text.split())
    if word_count <= word_threshold:
        return text

    tokens_number = num_tokens_from_string(text, "cl100k_base")
    chars_number = len(text)

    summarization_blocks = [text]
    if tokens_number > token_limit:
        # Translate the token budget into a character budget using this text's
        # average characters-per-token ratio
        text_split_threshold = max(1, int(chars_number / (tokens_number / chunk_tokens)))
        summarization_blocks = split_text_by_chars(text, text_split_threshold)

    responses = []
    for summarization_block in summarization_blocks:
        completion = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "user", "content": "Please effectively summarize the following text: " + summarization_block}
            ],
            temperature=temperature,
            max_tokens=max_tokens
        )
        responses.append(completion.choices[0].message.content)

    # Merge the per-chunk summaries back into a single text block
    return ' '.join(responses)

In [6]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`, closing the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load files produced by this project.
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


# Load pdf text and headings from the pickle files
pdf_texts = _load_pickle("Src/pdf_texts.pkl")                        # Texts are extracted from the pdf
pdf_headings = _load_pickle("Src/pdf_headings.pkl")                  # Headings are extracted from the pdf text
pdf_headings_context = _load_pickle("Src/pdf_headings_context.pkl")  # Surrounding text of headings helps to identify headings correctly and avoid duplicates

In [None]:
# Amazon Comprehend limits detect_sentiment text to 5000 UTF-8 bytes; stay slightly below it
MAX_SENTIMENT_BYTES = 4990
# Blocks with at least this many characters are summarized before they are scored
MAX_DIRECT_CHARS = 4750
# Chunk size (in characters) used for documents without headings
FALLBACK_CHUNK_CHARS = 4000


def _utf8_size(text):
    """Return the number of bytes `text` occupies when encoded as UTF-8."""
    # surrogatepass keeps the size check from raising on lone surrogates
    return len(text.encode("utf-8", errors="surrogatepass"))


def _split_to_byte_budget(text, max_bytes=MAX_SENTIMENT_BYTES):
    """Split `text` into consecutive pieces of at most `max_bytes` UTF-8 bytes each.

    Text that already fits is returned as a single piece; empty text yields no pieces.
    """
    if _utf8_size(text) <= max_bytes:
        return [text] if text else []

    pieces = []
    current_chars = []
    current_bytes = 0
    for char in text:
        char_bytes = _utf8_size(char)
        # Close the current piece before it would overflow the byte budget
        if current_chars and current_bytes + char_bytes > max_bytes:
            pieces.append("".join(current_chars))
            current_chars = []
            current_bytes = 0
        current_chars.append(char)
        current_bytes += char_bytes
    if current_chars:
        pieces.append("".join(current_chars))
    return pieces


def _dominant_sentiment_scores(text):
    """Return the highest sentiment-class score for each request-sized piece of `text`.

    The size limit is counted in bytes, not characters, so the text is split on
    its UTF-8 size before every piece is sent to `amazon_analyze_sentiment`.
    """
    scores = []
    for piece in _split_to_byte_budget(text):
        sentiment_scores = amazon_analyze_sentiment(piece)
        scores.append(max(sentiment_scores.values()))
    return scores


# pdf_name -> mean of the per-block dominant sentiment scores
# NOTE(review): the recorded value is the score of the strongest sentiment class, whichever
# class that is, so a confidently negative block scores as high as a confidently positive
# one. Confirm this is the intended meaning of "polarity".
polarity_scores = {}
# pdf_name -> error message for every document that could not be processed
failed_pdfs = {}

for pdf_name in tqdm(pdf_texts):

    try:

        text_blocks_scores = []

        text = pdf_texts[pdf_name]
        headings = pdf_headings[pdf_name]
        headings_context = pdf_headings_context[pdf_name]

        # split the text into blocks based on the headings
        text_blocks = split_text_into_blocks(text, headings, headings_context)
        # clean the text blocks
        text_blocks = clean_text_blocks(text_blocks)

        # iterate over the text blocks individually, otherwise single request with all text will fail
        if len(headings) > 0:

            for heading, text_block in text_blocks.items():

                # everything from the references section onwards is ignored
                if heading == "Reference":
                    break

                if len(text_block) == 0:
                    continue

                # long blocks are condensed first; the summary can still be too large for a
                # single request, which _dominant_sentiment_scores handles by splitting it
                if len(text_block) >= MAX_DIRECT_CHARS:
                    text_block = summarize_long_text_blocks(text_block)

                text_blocks_scores.extend(_dominant_sentiment_scores(text_block))

        # if there are no headings, just split the text into block of 4000 characters
        else:
            print("--- No headings found, splitting text into blocks of 4000 characters")
            for text_chunk in split_text_by_chars(text, FALLBACK_CHUNK_CHARS):
                text_blocks_scores.extend(_dominant_sentiment_scores(text_chunk))

        if text_blocks_scores:
            polarity_scores[pdf_name] = np.mean(text_blocks_scores)
        else:
            # np.mean([]) would give NaN with a RuntimeWarning; record the NaN explicitly
            print(f"--- No scorable text found in file {pdf_name}")
            polarity_scores[pdf_name] = np.nan

    except Exception as e:

        # best effort: report the failure, remember it, and continue with the next document
        failed_pdfs[pdf_name] = str(e)
        print(f"Exception occurred in file {pdf_name}")
        print(f"Exception message: {str(e)}")

In [8]:
# Turn the {pdf_name: score} mapping into a two-column table
amazon_polarity = pd.DataFrame({
    'pdf_name': list(polarity_scores.keys()),
    'polarity': list(polarity_scores.values()),
})

In [9]:
# Preview the first rows of the score table
amazon_polarity.head()

Unnamed: 0,pdf_name,polarity
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.743377
1,FINAL-Q1-18-Shareholder-Letter,0.849195
2,FINAL-Q1-19-Shareholder-Letter,0.822834
3,FINAL-Q1-20-Shareholder-Letter,0.889153
4,FINAL-Q1-21-Shareholder-Letter,0.939736


In [10]:
# export to csv; create the output folder first because to_csv does not create missing directories
os.makedirs("Scores", exist_ok=True)
amazon_polarity.to_csv("Scores/amazon_polarity.csv", index=False)