In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import os
import re
import pickle
from tqdm.notebook import tqdm
from difflib import SequenceMatcher
import openai

# Load local modules
from module_text_blocks import split_text_into_blocks, clean_text_blocks

# load credentials for OpenAI API
# NOTE(review): the wildcard import hides which names credentials_openai provides; the only
# one referenced in the visible cells is `openai_api_key` — consider importing it explicitly
# once it is confirmed that no other cell relies on further names from that module.
from credentials_openai import *
# Hand the key to the openai client (openai_api_key is presumably supplied by the wildcard import above)
openai.api_key = openai_api_key

In [22]:
import tiktoken

# define the number of tokens in the prompt
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens that `string` occupies under a tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Encoding name passed to `tiktoken.get_encoding`
            (the notebook calls this with "cl100k_base").

    Returns:
        Length of the token sequence produced by encoding `string`.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    return len(tokenizer.encode(string))

In [23]:
def split_text_by_chars(text, num_chars):
    """Split the input text into consecutive chunks of at most num_chars characters.

    The cut is purely positional, so a chunk boundary may fall in the middle of a word.

    Args:
        text: String to split.
        num_chars: Maximum length of each chunk; must be at least 1.

    Returns:
        List of chunks in original order. Joining them with "" reproduces `text`.
        An empty `text` gives an empty list.

    Raises:
        ValueError: If `num_chars` is smaller than 1. Without this check a negative
            value would silently return [] (dropping the whole text) and zero would
            surface as an unrelated `range()` error.
    """
    if num_chars < 1:
        raise ValueError(f"num_chars must be a positive integer, got {num_chars!r}")

    return [text[start:start + num_chars] for start in range(0, len(text), num_chars)]

In [53]:
def summarize_long_text_blocks(text_blocks, word_limit=750, token_limit=3500, chunk_tokens=2500):
    """Replace over-long text blocks with ChatGPT-generated summaries.

    A block is summarized only when it contains more than `word_limit`
    space-separated words. If such a block also exceeds `token_limit` tokens
    (cl100k_base encoding), it is first cut into character chunks sized to hold
    roughly `chunk_tokens` tokens each; every chunk is summarized separately and
    the summaries are joined with a single space.

    Args:
        text_blocks: Dict mapping heading -> block text. It is modified in place.
        word_limit: Blocks with at most this many words are left untouched.
        token_limit: Token count above which a long block is split before summarizing.
        chunk_tokens: Approximate token size of each chunk when splitting.

    Returns:
        The same `text_blocks` dict, with long blocks replaced by their summaries.
    """
    # Iterate over a snapshot so that reassigning values never interacts with the live dict view.
    for heading, text in list(text_blocks.items()):

        # Cheap word count first: short blocks need neither the tokenizer nor the API.
        if len(text.split(' ')) <= word_limit:
            continue

        tokens_number = num_tokens_from_string(text, "cl100k_base")

        if tokens_number > token_limit:
            # Scale the character length by chunk_tokens / tokens_number to get the
            # characters per chunk; max(1, ...) keeps the chunk size valid.
            text_split_threshold = max(1, int(len(text) / (tokens_number / chunk_tokens)))
            summarization_blocks = split_text_by_chars(text, text_split_threshold)
        else:
            summarization_blocks = [text]

        responses = []
        for summarization_block in summarization_blocks:

            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "user", "content": "Please effectively summarize the following text: " + summarization_block}
                ],
                temperature=0.3,
                max_tokens=500
            )
            # keep the text of the first returned choice
            responses.append(completion.choices[0].message.content)

        # join the chunk summaries into a single text block
        text_blocks[heading] = ' '.join(responses)

    return text_blocks

In [54]:
def openai_sentiment_analysis(final_prompt, model="gpt-3.5-turbo", temperature=0.0, max_tokens=50):
    """Ask ChatGPT for polarity and subjectivity scores of a text.

    Sends a system message describing a financial-sentiment assistant followed by
    one user message made of a fixed instruction plus `final_prompt`.

    Args:
        final_prompt: Text to be scored; appended to the fixed instruction.
        model: Chat model name passed to the API.
        temperature: Sampling temperature passed to the API.
        max_tokens: Upper bound on the reply length. With the default of 50 the
            replies recorded in this notebook end mid-sentence; pass a larger
            value to receive the full answer.

    Returns:
        The message content of the first choice, as free text (not parsed numbers).
    """
    # System message setting the assistant persona. The backslash continuations are
    # inside the string literal, so the leading spaces of each continuation line are
    # part of the prompt; the literal is kept as-is so the prompt does not change.
    system_message = {"role": "system", "content":
                    "You are a helpful financial assistant who is expert in evaluating sentiment scores for financial statements \
                You give precise answers to questions \
                the quality of your answers is highly important, you never hallucinate answers - only \
                answering based on your knowledge. Where the answer requires creative thought you engage \
                in reflective internal dialogue to ascertain the best answer"
    }

    user_content = "The overall polarity and subjectivity scores on the strict range -1 to 1 (-1 means perfectly negative; 1 means perfectly positive) for the text: "

    # run the request for ChatGPT
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[
            system_message,
            {"role": "user", "content": user_content + final_prompt}
        ],
        temperature=temperature,
        max_tokens=max_tokens
    )

    return completion.choices[0].message.content

In [55]:
# Load pdf text and headings from the pickle files.
# NOTE: unpickling can execute arbitrary code — only load files produced by this project.
# The `with` blocks close the file handles as soon as loading finishes.
with open("pdf_texts.pkl", "rb") as texts_file:
    pdf_texts = pickle.load(texts_file)
with open("pdf_headings.pkl", "rb") as headings_file:
    pdf_headings = pickle.load(headings_file)

In [57]:
# Raw ChatGPT sentiment replies, keyed by PDF name
openai_responses = dict()
# fine_tuned_responses = dict()

# Alternative selections for pdf_lists (commented out):
#
# pdf_lists = ["FINAL-Q2-20-Shareholder-Letter-V3-with-Tables"]
#
# pdf_lists = [
#     "FINAL-Q1-19-Shareholder-Letter",
#     "FINAL-Q420-Shareholder-Letter",
#     "Q2-19-Shareholder-Letter-FINAL",
# ]
#
# pdf_lists = [
#     "FINAL-Q2-22-Shareholder-Letter",
#     "FINAL-Q2-23-Shareholder-Letter",
#     "FINAL-Q3-22-Shareholder-Letter",
#     "FINAL-Q4-18-Shareholder-Letter",
#     "FINAL-Q4-19-Shareholder-Letter",
#     "FINAL-Q4-22-Shareholder-Letter",
#     "Investor_Letter_Q12013",
#     "Investor-Letter-Q1-2012",
#     "Investor-Letter-Q2-2012-07",
#     "Investor-Letter-Q4-2011",
#     "Investor-Letter-Q42012-01",
#     "July-Investor-Letter-1130am",
#     "Q1-11-Letter-to-shareholders",
#     "Q3_17_Shareholder_Letter_COMBINED",
# ]

# PDFs processed by the loop below
pdf_lists = [
    "FINAL-Q3-22-Shareholder-Letter",
    "FINAL-Q4-22-Shareholder-Letter",
    "Investor-Letter-Q4-2011",
    "July-Investor-Letter-1130am",
]

# For every selected PDF: split its text into heading blocks, summarize the long blocks,
# assemble a prompt that fits the token budget and store ChatGPT's sentiment reply.
for pdf_name in tqdm(pdf_lists):

    try:

        text = pdf_texts[pdf_name]
        headings = pdf_headings[pdf_name]

        # split the text into blocks based on the headings
        text_blocks = split_text_into_blocks(text, headings)

        # clean the text blocks
        text_blocks = clean_text_blocks(text_blocks)

        if len(headings) > 0:

            # print the original length (in words) of the text blocks
            print("Original length of blocks for " + pdf_name + ":")
            for block_text in text_blocks.values():
                print(len(block_text.split(" ")), end=" ")
            print(" ")

            # summarize the text blocks
            text_blocks = summarize_long_text_blocks(text_blocks)

            # print the length (in words) of the text blocks after summarization
            print("Updated length of blocks for " + pdf_name + ":")
            for block_text in text_blocks.values():
                print(len(block_text.split(" ")), end=" ")
            print(" ")

            # Create a final prompt: append blocks in order until the budget would be exceeded.
            # The budget is checked on exactly what would be sent (heading and separator included).
            final_prompt = ''
            for heading, block_text in text_blocks.items():
                candidate_prompt = final_prompt + heading + ': ' + block_text + " "
                if num_tokens_from_string(candidate_prompt, "cl100k_base") >= 3750:
                    break
                final_prompt = candidate_prompt

        else:

            tokens_number = num_tokens_from_string(text, "cl100k_base")  # number of tokens in the full text
            chars_number = len(text)                                     # number of characters in the full text

            # if the text is exceeding the token limit, cut it to roughly 2500 tokens' worth of characters
            if tokens_number > 3500:

                text_split_threshold = int(chars_number / (tokens_number / 2500))
                text = text[:text_split_threshold]

            final_prompt = text

        # Perform openAI sentiment analysis
        response = openai_sentiment_analysis(final_prompt)
        openai_responses[pdf_name] = response

    except Exception as e:

        # Best effort: report the failure and move on to the next PDF
        print(f"Exception occurred in file {pdf_name}")
        print(f"Exception message: {str(e)}")

  0%|          | 0/4 [00:00<?, ?it/s]

Original length of blocks for FINAL-Q3-22-Shareholder-Letter:
240 302 612 1284 311 354 265 29 102 559  
Updated length of blocks for FINAL-Q3-22-Shareholder-Letter:
240 302 612 231 311 354 265 29 102 559  
Original length of blocks for FINAL-Q4-22-Shareholder-Letter:
260 319 457 440 712 964 368 304 152 205 29 565  
Updated length of blocks for FINAL-Q4-22-Shareholder-Letter:
260 319 457 440 712 158 368 304 152 205 29 565  
Original length of blocks for Investor-Letter-Q4-2011:
99 640 706 1181 217 175 194 190 139 200 34 93 110 171  
Updated length of blocks for Investor-Letter-Q4-2011:
99 640 706 222 217 175 194 190 139 200 34 93 110 171  
Original length of blocks for July-Investor-Letter-1130am:
70 328 6 488 306 709 100 241 144 11 123 121 80 324 442 1 311  
Updated length of blocks for July-Investor-Letter-1130am:
70 328 6 488 306 709 100 241 144 11 123 121 80 324 442 1 311  


In [58]:
openai_responses

{'FINAL-Q3-22-Shareholder-Letter': 'The overall polarity score for the given text is 0.25, which indicates a slightly positive sentiment. The subjectivity score is 0.35, suggesting that the text contains a moderate amount of subjective information.\n\nThe text highlights positive aspects such as',
 'FINAL-Q4-22-Shareholder-Letter': 'The overall polarity score for the given text is 0.25, which indicates a slightly positive sentiment. The subjectivity score is 0.42, suggesting that the text contains a moderate amount of subjective information.\n\nThe text primarily discusses the financial performance',
 'Investor-Letter-Q4-2011': 'The overall polarity score for the text is 0.14, which indicates a slightly positive sentiment. The subjectivity score is 0.34, suggesting that the text is moderately subjective.',
 'July-Investor-Letter-1130am': 'The overall polarity score for the text is 0.25, which indicates a slightly positive sentiment. The subjectivity score is 0.42, indicating that the t