In [3]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import os
import re
import pickle
from tqdm.notebook import tqdm
from difflib import SequenceMatcher
import openai

# Load local modules
from module_text_blocks import split_text_into_blocks, clean_text_blocks

# load credentials for OpenAI API
# NOTE(review): the wildcard import is presumably what provides `openai_api_key` used on the
# next line; which other names it brings in is not visible here — confirm before narrowing it.
from credentials_openai import *
openai.api_key = openai_api_key  # set the key on the openai module used by the ChatCompletion calls below

In [4]:
import tiktoken

# count the tokens a string occupies under a given tiktoken encoding
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count how many tokens `string` encodes to under the tiktoken encoding `encoding_name`."""
    return len(tiktoken.get_encoding(encoding_name).encode(string))

In [5]:
def split_text_by_chars(text, num_chars):
    """Cut `text` into consecutive pieces of `num_chars` characters (the last piece may be shorter)."""
    pieces = []
    for start in range(0, len(text), num_chars):
        pieces.append(text[start:start + num_chars])
    return pieces

In [6]:
def summarize_long_text_blocks(text_blocks, word_limit=750, token_limit=3500,
                               chunk_tokens=2500, model="gpt-3.5-turbo",
                               temperature=0.3, max_tokens=500):
    """Replace every long text block with a summary produced by the OpenAI chat API.

    A block is summarized when it has more than `word_limit` space-separated
    words. If such a block also exceeds `token_limit` tokens (cl100k_base), it
    is first cut into character chunks sized to hold roughly `chunk_tokens`
    tokens each; every chunk is summarized separately and the summaries are
    joined with single spaces.

    Args:
        text_blocks: dict mapping heading -> text. It is modified in place.
        word_limit: blocks with at most this many words are left untouched.
        token_limit: token count above which a long block is split into chunks.
        chunk_tokens: approximate number of tokens per chunk when splitting.
        model: chat model name passed to the API.
        temperature: sampling temperature passed to the API.
        max_tokens: maximum number of tokens requested for each summary.

    Returns:
        The same `text_blocks` dict, with long blocks replaced by their summaries.
    """
    # iterate over a snapshot so writing back into the dict cannot disturb the loop
    for heading, text in list(text_blocks.items()):

        # short blocks are kept as they are and never need to be tokenized
        if len(text.split(' ')) <= word_limit:
            continue

        summarization_blocks = [text]
        tokens_number = num_tokens_from_string(text, "cl100k_base")

        # if the block is exceeding the token limit, split it into multiple blocks
        if tokens_number > token_limit:
            # average characters per token scaled to the chunk budget; never below 1
            # so the splitter always receives a valid step
            text_split_threshold = max(1, int(len(text) / (tokens_number / chunk_tokens)))
            summarization_blocks = split_text_by_chars(text, text_split_threshold)

        responses = []
        for summarization_block in summarization_blocks:
            completion = openai.ChatCompletion.create(
                model=model,
                messages=[
                    {"role": "user", "content": "Please effectively summarize the following text: " + summarization_block}
                ],
                temperature=temperature,
                max_tokens=max_tokens
            )
            # keep the text of the first choice as this chunk's summary
            responses.append(completion.choices[0].message.content)

        # join the responses into a single text block
        text_blocks[heading] = ' '.join(responses)

    return text_blocks

In [7]:
def openai_sentiment_analysis(final_prompt):
    """Ask gpt-3.5-turbo for polarity and subjectivity scores of `final_prompt`.

    Sends a system message plus one user message (a fixed instruction asking
    for scores on the -1 to 1 range, followed by `final_prompt`) without
    passing temperature or max_tokens, and returns the text content of the
    first completion choice.

    NOTE: a function with this same name is defined again in a later cell;
    when the notebook is run top to bottom that later definition replaces
    this one.
    """
    system_message = {
        "role": "system",
        "content":
                    "You are a helpful financial assistant who is expert in evaluating sentiment scores for financial statements \
                You give precise answers to questions \
                the quality of your answers is highly important, you never hallucinate answers - only \
                answering based on your knowledge. Where the answer requires creative thought you engage \
                in reflective internal dialogue to ascertain the best answer",
    }

    instruction = "Calculate the total polarity and subjectivity scores on the strict range -1 to 1 (-1 means perfectly negative; 1 means perfectly positive): "
    user_message = {"role": "user", "content": instruction + final_prompt}

    # run the request for ChatGPT
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[system_message, user_message],
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [8]:
def openai_sentiment_analysis(final_prompt, model="gpt-3.5-turbo", temperature=0.0, max_tokens=50):
    """Ask the OpenAI chat API to rate the polarity and subjectivity of `final_prompt`.

    The user message asks for ratings on a seven-step verbal scale (very
    negative ... very positive), so the reply is expected to contain labels
    rather than numbers.

    Args:
        final_prompt: text to evaluate; appended to the fixed instruction.
        model: chat model name passed to the API.
        temperature: sampling temperature passed to the API.
        max_tokens: cap on the reply length; replies longer than this are cut
            off by the API, so raise it if full answers are needed.

    Returns:
        The text content of the first completion choice.
    """
    # Adjacent string literals are used instead of backslash continuations so the
    # source indentation does not end up inside the prompt text.
    system_message = {
        "role": "system",
        "content": (
            "You are a helpful financial assistant who is expert in evaluating sentiment scores for financial statements "
            "You give precise answers to questions "
            "the quality of your answers is highly important, you never hallucinate answers - only "
            "answering based on your knowledge. Where the answer requires creative thought you engage "
            "in reflective internal dialogue to ascertain the best answer"
        ),
    }

    user_content = "The overall polarity and subjectivity scores on the strict range (very negative, moderately negative, slightly negative, neutral, slightly positive, moderately positive, very positive) for the text: "

    # run the request for ChatGPT
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[
            system_message,
            {"role": "user", "content": user_content + final_prompt}
        ],
        temperature=temperature,
        max_tokens=max_tokens
    )

    return completion.choices[0].message.content

In [9]:
# Load pdf text and headings from the pickle file
# NOTE: unpickling can execute arbitrary code; only load files created by this project.
# Context managers close the file handles as soon as the data has been read.
with open("pdf_texts.pkl", "rb") as pdf_texts_file:
    pdf_texts = pickle.load(pdf_texts_file)
with open("pdf_headings.pkl", "rb") as pdf_headings_file:
    pdf_headings = pickle.load(pdf_headings_file)

In [None]:
# store openai responses in a dictionary (pdf_name -> raw reply text)
openai_responses = {}
# NOTE(review): pdf_lists is not read by this cell; kept in case another cell relies on it
pdf_lists = ["FINAL-Q1-19-Shareholder-Letter",
            "FINAL-Q420-Shareholder-Letter",
            "Q2-19-Shareholder-Letter-FINAL"]

for pdf_name in tqdm(pdf_texts):

    try:

        text = pdf_texts[pdf_name]
        headings = pdf_headings[pdf_name]

        # split the text into blocks based on the headings
        text_blocks = split_text_into_blocks(text, headings)

        # clean the text blocks
        text_blocks = clean_text_blocks(text_blocks)

        final_prompt = ''

        # if there are headings, add headings to the text blocks and summarize if needed
        if len(headings) > 0:

            # print the original length of the text blocks
            print("Original length of blocks for " + pdf_name + ":")
            for block_text in text_blocks.values():
                print(len(block_text.split(" ")), end=" ")
            print(" ")

            # summarize the text blocks
            text_blocks = summarize_long_text_blocks(text_blocks)

            # print the length of the text blocks after summarization
            print("Updated length of blocks for " + pdf_name + ":")
            for block_text in text_blocks.values():
                print(len(block_text.split(" ")), end=" ")
            print(" ")

            # Create a final prompt: add blocks in order until the token budget is reached.
            # The loop variable is not called `text` so the document text stays available below.
            for heading, block_text in text_blocks.items():
                if num_tokens_from_string(final_prompt + block_text, "cl100k_base") >= 3750:
                    break
                final_prompt += heading + ': ' + block_text + " "

        # no headings, or not even the first block fits the budget:
        # fall back to the document text, cut to the token limit if necessary
        if not final_prompt:

            tokens_number = num_tokens_from_string(text, "cl100k_base")  # number of tokens in the document text
            chars_number = len(text)                                     # number of characters in the document text

            final_prompt = text

            # if the text is exceeding the token limit, cut it
            if tokens_number > 3500:

                text_split_threshold = int(chars_number / (tokens_number / 2500))
                final_prompt = text[:text_split_threshold]

        # nothing to analyse: do not spend an API call on an empty prompt
        if not final_prompt.strip():
            print(f"No text to analyse in file {pdf_name}, skipping")
            continue

        # Perform openAI sentiment analysis
        response = openai_sentiment_analysis(final_prompt)
        openai_responses[pdf_name] = response

    except Exception as e:

        # best effort: report the failing document and carry on with the next one
        print(f"Exception occurred in file {pdf_name}")
        print(f"Exception message: {str(e)}")

In [15]:
openai_responses

{'COMBINED-Q4-17-Shareholder-Letter-FINAL': 'The overall polarity and subjectivity scores for the given text are as follows:\n\nPolarity:\n- Very negative: 0\n- Moderately negative: 0\n- Slightly negative: 0\n- Neutral: 0\n- Slightly',
 'FINAL-Q1-18-Shareholder-Letter': 'The overall polarity and subjectivity scores for the given text are as follows:\n\nPolarity: Moderately positive\nSubjectivity: Objective',
 'FINAL-Q1-19-Shareholder-Letter': 'The overall polarity and subjectivity scores for the given text are as follows:\n\nPolarity Score: Moderately Positive\nSubjectivity Score: Neutral',
 'FINAL-Q1-20-Shareholder-Letter': 'The overall polarity score for the text is slightly positive. The subjectivity score is moderate.\n\nThe text mentions that Netflix is acknowledging the uncertainty and impact of the coronavirus pandemic, recognizing the importance of their service during this time, and experiencing higher viewership and',
 'FINAL-Q1-21-Shareholder-Letter': "The overall polarity a

In [16]:
# Persist the collected OpenAI responses as a pickle so a later cell can reload them
with open("Src/openai_responses_test.pkl", mode="wb") as responses_file:
    pickle.dump(openai_responses, file=responses_file)

In [17]:
# Load openai responses from the pickle file
# NOTE: unpickling can execute arbitrary code; only load files created by this project.
# The context manager closes the file handle once the data has been read.
with open("Src/openai_responses_test.pkl", "rb") as responses_file:
    openai_responses = pickle.load(responses_file)

In [17]:
# one row per document: the pdf name next to the raw reply text
openai_responses_test = pd.DataFrame(
    data=list(openai_responses.items()),
    columns=['pdf_name', 'response'],
)

In [18]:
# write the responses table to an Excel workbook
openai_responses_test.to_excel(excel_writer='Src/openai_responses_test.xlsx')

In [72]:
# normalise whitespace in every response: newlines become spaces, runs of spaces collapse to one
for pdf_name, raw_response in openai_responses.items():
    single_line = raw_response.replace('\n', ' ')
    openai_responses[pdf_name] = re.sub(' +', ' ', single_line)

In [98]:
# Extract polarity and subjectivity scores from OpenAI responses
polarity_scores = {}
subjectivity_scores = {}

# keyword -> dictionary that collects the words found right after that keyword
score_windows = {"polarity": polarity_scores, "subjectivity": subjectivity_scores}

for document, text in openai_responses.items():

    words = text.split(" ")
    # normalised copy used only for keyword matching, so "Polarity:" counts as "polarity"
    labels = [word.lower().strip(":;,.!?()[]*\"'") for word in words]

    for position, label in enumerate(labels):
        if label not in score_windows:
            continue

        # keyword plus up to three following words, taken from where the keyword
        # actually occurs (list.index would always point at the first identical word)
        window = words[position:position + 4]

        # stop before the other score's keyword so its value cannot be picked up here
        for offset in range(1, len(window)):
            if labels[position + offset] in score_windows:
                window = window[:offset]
                break

        # as before, a later occurrence of the keyword replaces an earlier one
        score_windows[label][document] = window

In [102]:
# Extract the numeric score from each entry of the polarity_scores dictionary
# optional sign, then an integer or decimal number at the start of the token
polarity_number_pattern = re.compile(r"[-+]?(?:\d+\.?\d*|\.\d+)")

for document, words in polarity_scores.items():

    # already converted by an earlier run of this cell: leave the number alone
    if not isinstance(words, list):
        continue

    for word in words:
        # commas are dropped as before; anything after the number (e.g. a final ".") is ignored
        number_match = polarity_number_pattern.match(word.replace(",", ""))
        if number_match:
            polarity_scores[document] = float(number_match.group())
            # the first number after the keyword is the score; later tokens belong to other text
            break

In [104]:
# Extract the numeric score from each entry of the subjectivity_scores dictionary
# optional sign, then an integer or decimal number at the start of the token
subjectivity_number_pattern = re.compile(r"[-+]?(?:\d+\.?\d*|\.\d+)")

for document, words in subjectivity_scores.items():

    # already converted by an earlier run of this cell: leave the number alone
    if not isinstance(words, list):
        continue

    for word in words:
        # commas are dropped as before; anything after the number (e.g. a final ".") is ignored
        number_match = subjectivity_number_pattern.match(word.replace(",", ""))
        if number_match:
            subjectivity_scores[document] = float(number_match.group())
            # the first number after the keyword is the score; later tokens belong to other text
            break

In [107]:
# keep only the documents whose entry is no longer a word list
polarity_scores = {
    document: score
    for document, score in polarity_scores.items()
    if type(score) is not list
}
subjectivity_scores = {
    document: score
    for document, score in subjectivity_scores.items()
    if type(score) is not list
}

In [124]:
# one small frame per score, each keyed by the pdf name
polarity_df = pd.DataFrame(list(polarity_scores.items()), columns=['pdf_name', 'polarity'])
subjectivity_df = pd.DataFrame(list(subjectivity_scores.items()), columns=['pdf_name', 'subjectivity'])

# start from the document texts and attach both score columns by pdf_name
df = (
    pd.DataFrame(list(pdf_texts.items()), columns=['pdf_name', 'text'])
    .join(polarity_df.set_index("pdf_name"), on="pdf_name")
    .join(subjectivity_df.set_index("pdf_name"), on="pdf_name")
)

In [127]:
# write the combined text/score table to Excel, leaving out the index column
df.to_excel(excel_writer='Score/OpenAI_scores.xlsx', index=False)