In [3]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import os
import re
import pickle
from tqdm.notebook import tqdm
from difflib import SequenceMatcher
import openai

# Load local modules
from module_text_blocks import split_text_into_blocks, clean_text_blocks

# load credentials for OpenAI API
from credentials_openai import *
openai.api_key = openai_api_key

In [4]:
def summarize_long_text_blocks(text_blocks):

    for heading, text in text_blocks.items():
        # if the block is too long (contains over 750 words), summarize it
        if len(text.split(' ')) > 750:

            # use the OpenAI API to summarize the text
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                {"role": "user", "content": "Please effectively summarize the following text: " + text}
                ])
            # replace the long text block with the summarized version
            text_blocks[heading] = completion.choices[0].message.content
    
    return text_blocks

In [5]:
def openai_sentiment_analysis(final_prompt):

    # run the request for ChatGPT
    fine_tune_messages = {"role": "system", "content":
                    "You are a helpful financial assistant who is expert in evaluating sentiment scores for financial statements \
                You give precise answers to questions \
                the quality of your answers is highly important, you never hallucinate answers - only \
                answering based on your knowledge. Where the answer requires creative thought you engage \
                in reflective internal dialogue to ascertain the best answer"
    }

    user_content = "Calculate the total polarity and subjectivity scores on the strict range -1 to 1 (-1 means perfectly negative; 1 means perfectly positive): "


    completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        fine_tune_messages,
        {"role": "user", "content": user_content + final_prompt}
    ]
    )

    return completion.choices[0].message.content

In [6]:
def openai_sentiment_analysis_finetune(final_prompt, response):

    # run the request for ChatGPT
    fine_tune_messages = {"role": "system", "content":
                    "You are a helpful financial assistant who is expert in evaluating sentiment scores for financial statements \
                You give precise answers to questions \
                the quality of your answers is highly important, you never hallucinate answers - only \
                answering based on your knowledge. Where the answer requires creative thought you engage \
                in reflective internal dialogue to ascertain the best answer"
    }

    user_content = "Calculate the total polarity and subjectivity scores on the strict range -1 to 1 (-1 means perfectly negative; 1 means perfectly positive): "


    completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        fine_tune_messages,
        {"role": "user", "content": user_content + final_prompt},
        {"role": "assistant", "content": response},
        {"role": "user", "content": "No, you don't understand, tell me only two values, the total polarity score of report on range from -1 to 1 and the total subjectivity score of report on range from -1 to 1. the text is the following:" + final_prompt}
    ]
    )

    return completion.choices[0].message.content

In [7]:
def split_text_by_chars(text, num_chars):
    """Split the input text every num_chars characters."""
    return [text[i:i+num_chars] for i in range(0, len(text), num_chars)]

In [8]:
# Load pdf text and headings from the pickle file
pdf_texts = pickle.load(open("pdf_texts.pkl", "rb"))
pdf_headings = pickle.load(open("pdf_headings.pkl", "rb"))

In [14]:
# store openai responses in a dictionary
openai_responses = {}
fine_tuned_responses = {}

for pdf_name in tqdm(pdf_texts):

    try:

        # pdf_name = "Investor_Letter_Q12013"

        text = pdf_texts[pdf_name]
        headings = pdf_headings[pdf_name]

        # split the text into blocks based on the headings
        text_blocks = split_text_into_blocks(text, headings)
        
        # clean the text blocks
        text_blocks = clean_text_blocks(text_blocks)

        if len(headings) > 0:
            
            # print the original length of the text blocks
            # print("Original length of blocks for " + pdf_name + ":")
            # for heading, text in text_blocks.items():
            #     print(len(text.split(" ")), end=" ")
            # print(" ")

            # summarize the text blocks
            text_blocks = summarize_long_text_blocks(text_blocks)

            # print the length of the text blocks after summarization
            # print("Updated length of blocks for " + pdf_name + ":")
            # for heading, text in text_blocks.items():
            #     print(len(text.split(" ")), end=" ")
            # print(" ")

            # Create a final prompt
            final_prompt = ''
            for heading, text in text_blocks.items():
                final_prompt += heading + ': ' + text + " "
        else:
            final_prompt = text[:25000]

            

        # Perform openAI sentiment analysis
        response = openai_sentiment_analysis(final_prompt)
        openai_responses[pdf_name] = response

        # response = openai_sentiment_analysis_finetune(final_prompt, response)
        # fine_tuned_responses[pdf_name] = response

        break

    except Exception as e:
        print(f"Exception occurred in file {pdf_name}")
        print(f"Exception message: {str(e)}")

        break

  0%|          | 0/50 [00:00<?, ?it/s]

In [14]:
import tiktoken

# define the number of tokens in the prompt
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [15]:
# show number of tokens in prompt (max 4096 tokens for both input & output)
num_tokens_from_string(final_prompt, "cl100k_base")

2660

In [16]:
# Save openai responses to pickle file
with open("Src/openai_responses_3.pkl", "wb") as f:
    pickle.dump(openai_responses, f)

In [17]:
# Load openai responses from the pickle file
openai_responses = pickle.load(open("Src/openai_responses_3.pkl", "rb"))

In [18]:
openai_responses_df_2 = pd.DataFrame(list(openai_responses.items()), columns=['pdf_name', 'response'])

In [13]:
# export OpenAI responses to excel
openai_responses_df_2.to_excel('Src/openai_responses_df_3.xlsx')

In [72]:
# clean the text of the responses
for key in openai_responses:
        openai_responses[key] = re.sub('\n', ' ', openai_responses[key])
        openai_responses[key] = re.sub(' +', ' ', openai_responses[key])

In [98]:
# Extract polarity and subjectivity scores from OpenAI responses
polarity_scores = {}
subjectivity_scores = {}


for document, text in openai_responses.items():

    words = text.split(" ")
    for word in words:
        if word.lower() == "polarity":
            polarity_scores[document] = words[words.index(word):words.index(word) + 4]
        elif word.lower() == "subjectivity":
            subjectivity_scores[document] = words[words.index(word):words.index(word) + 4]

In [102]:
# Extract digits from polarity_scores dictionary
for document, words in polarity_scores.items():

    for word in words:
        # if word is number using regex
        if re.search("^[0-9]", word):
            word = word.replace(",", "")
            polarity_scores[document] = float(word)

In [104]:
# Extract digits from subjectivity scores dictionary
for document, words in subjectivity_scores.items():

    for word in words:
        # if word is number using regex
        if re.search("^[0-9]", word):
            word = word.replace(",", "")
            subjectivity_scores[document] = float(word)

In [107]:
# filter out keys values of which are lists
polarity_scores = {k: v for k, v in polarity_scores.items() if type(v) != list}
subjectivity_scores = {k: v for k, v in subjectivity_scores.items() if type(v) != list}

In [124]:
df = pd.DataFrame(list(pdf_texts.items()), columns=['pdf_name', 'text'])
polarity_df = pd.DataFrame(list(polarity_scores.items()), columns=['pdf_name', 'polarity'])
subjectivity_df = pd.DataFrame(list(subjectivity_scores.items()), columns=['pdf_name', 'subjectivity'])

# join df and polarity_df on pdf_name column
df = df.join(polarity_df.set_index("pdf_name"), on="pdf_name")
df = df.join(subjectivity_df.set_index("pdf_name"), on="pdf_name")

In [127]:
# export df
df.to_excel('Score/OpenAI_scores.xlsx', index=False)