In [2]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import os
import re
import pickle
from tqdm.notebook import tqdm
from difflib import SequenceMatcher

# load credentials for OpenAI API
import openai
import credentials
openai.api_key = credentials.openai_api_key

In [9]:
def split_text_into_blocks(text, headings):

    # Split the text into blocks based on the headings
    text_blocks = {}

    # Iterate over the headings
    for heading in range(len(headings)):

        if heading == 0:
            document_intro = text.split(headings[heading])[0]
            text_blocks['Document_intro'] = document_intro
        
        text_after_heading = text.split(headings[heading])[1]
        if heading == len(headings) - 1:
            text_blocks[headings[heading]] = text_after_heading
            break
        else:
            text_of_heading = text_after_heading.split(headings[heading+1])[0]
            text_blocks[headings[heading]] = text_of_heading
    
    return text_blocks

In [12]:
def clean_text_blocks(text_blocks):

    # iterate over text blocks, removing '\u200b' and extra spaces
    for key in text_blocks:
        text_blocks[key] = re.sub('\u200b', '', text_blocks[key])
        text_blocks[key] = re.sub(' +', ' ', text_blocks[key])

    return text_blocks

In [17]:
def summarize_long_text_blocks(text_blocks):

    for heading, text in text_blocks.items():
        # if the block is too long (contains over 750 words), summarize it
        if len(text.split(' ')) > 750:

            # use the OpenAI API to summarize the text
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                {"role": "user", "content": "Please effectively summarize the following text: " + text}
                ])
            # replace the long text block with the summarized version
            text_blocks[heading] = completion.choices[0].message.content
    
    return text_blocks

In [29]:
def openai_sentiment_analysis(final_prompt):

    # run the request for ChatGPT
    fine_tune_messages = {"role": "system", "content":
                    "You are a helpful financial assistant who is expert in evaluating sentiment scores for financial statements \
                You give precise answers to questions \
                the quality of your answers is highly important, you never hallucinate answers - only \
                answering based on your knowledge. Where the answer requires creative thought you engage \
                in reflective internal dialogue to ascertain the best answer"
    }

    user_content = "Please tell me about the sentiment (positive, negative, neutral) of this information (Netflix) for the investors. \
                    Please, be consice and lucid. \
                    Calculate the total polarity and subjectivity scores on the range -1 to 1 (show scores in the beginning of your output): "


    completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        fine_tune_messages,
        {"role": "user", "content": user_content + final_prompt}
    ]
    )

    return completion.choices[0].message.content

In [14]:
# Load pdf text and headings from the pickle file
pdf_texts = pickle.load(open("pdf_texts.pkl", "rb"))
pdf_headings = pickle.load(open("pdf_headings.pkl", "rb"))

In [None]:
# store openai responses in a dictionary
openai_responses = {}

for pdf_name in tqdm(pdf_texts):

    try:

        if pdf_name == "FINAL-Q2-23-Shareholder-Letter" or pdf_name == "Final-Q1-23-Shareholder-Letter":
            continue

        else:

            text = pdf_texts[pdf_name]
            headings = pdf_headings[pdf_name]

            # split the text into blocks based on the headings
            text_blocks = split_text_into_blocks(text, headings)
            
            # clean the text blocks
            text_blocks = clean_text_blocks(text_blocks)

            # print the original length of the text blocks
            print("Original length of blocks for " + pdf_name + ":")
            for heading, text in text_blocks.items():
                print(len(text.split(" ")), end=" ")
            print(" ")

            # summarize the text blocks
            text_blocks = summarize_long_text_blocks(text_blocks)

            # print the length of the text blocks after summarization
            print("Updated length of blocks for " + pdf_name + ":")
            for heading, text in text_blocks.items():
                print(len(text.split(" ")), end=" ")
            print(" ")

            # Create a final prompt
            final_prompt = ''
            for heading, text in text_blocks.items():
                final_prompt += heading + ': ' + text + " "

            # Perform openAI sentiment analysis
            response = openai_sentiment_analysis(final_prompt)
            openai_responses[pdf_name] = response

    except Exception as e:
        print(f"Exception occurred in file {pdf_name}")
        print(f"Exception message: {str(e)}")

In [53]:
import tiktoken

# define the number of tokens in the prompt
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

num_tokens_from_string

In [54]:
# show number of tokens in prompt (max 4096 tokens for both input & output)
num_tokens_from_string(final_prompt, "cl100k_base")

4893

In [61]:
pdf_texts["FINAL-Q2-23-Shareholder-Letter"]

" InMay,wesuccessfullylaunchedpaidsharingin100+countries,representingmorethan80%of ourrevenuebase.  Revenueineachregionisnowhigherthanpre-launch,withsign-upsalreadyexceeding cancellations.  Paidnetadditionswere5.9MinQ2,andtodaywe’rerollingoutpaidsharingtoalmost alloftheremainingcountries.  Q2‘23revenueof$8.2Bandoperatingprofitof$1.8Bweregenerallyin-linewithour forecast—andweexpectrevenuegrowthtoaccelerateinthesecondhalfof‘23aswestartto seethefullbenefitsofpaidsharingpluscontinuedsteadygrowthinourad-supportedplan. We’restilltargetingafullyear2023operatingmarginof18%to20%.  We’realeaderintermsofstreamingengagementand,perNielsen,wehadthetoporiginal streamingseriesintheUSfor24ofthefirst25weeksof2023,andthetopmoviefor21weeks. Whilewe’vemadesteadyprogressthisyear,wehavemoreworktodotoreaccelerateourgrowth.We remainfocusedon:creatingasteadydrumbeatofmustwatchshowsandmovies;improving monetization;growingtheenjoymentofourgames;andinvestingtoimproveourserviceformembers. Q3'23 (inmillionsexceptper

In [58]:
text_blocks

{'Document_intro': " InMay,wesuccessfullylaunchedpaidsharingin100+countries,representingmorethan80%of ourrevenuebase. Revenueineachregionisnowhigherthanpre-launch,withsign-upsalreadyexceeding cancellations. Paidnetadditionswere5.9MinQ2,andtodaywe’rerollingoutpaidsharingtoalmost alloftheremainingcountries. Q2‘23revenueof$8.2Bandoperatingprofitof$1.8Bweregenerallyin-linewithour forecast—andweexpectrevenuegrowthtoaccelerateinthesecondhalfof‘23aswestartto seethefullbenefitsofpaidsharingpluscontinuedsteadygrowthinourad-supportedplan. We’restilltargetingafullyear2023operatingmarginof18%to20%. We’realeaderintermsofstreamingengagementand,perNielsen,wehadthetoporiginal streamingseriesintheUSfor24ofthefirst25weeksof2023,andthetopmoviefor21weeks. Whilewe’vemadesteadyprogressthisyear,wehavemoreworktodotoreaccelerateourgrowth.We remainfocusedon:creatingasteadydrumbeatofmustwatchshowsandmovies;improving monetization;growingtheenjoymentofourgames;andinvestingtoimproveourserviceformembers. Q3'23 (inmi

In [47]:
final_prompt

"Document_intro: Summary: Q1‘23revenueandoperatingprofitwerein-linewithourforecast. WedeliveredastrongcontentslateinQ1with: SuccessfulreturningseasonslikeOuterBanks,You,Ginny&Georgiaandabigsequel filmMurderMystery2. NewhitsacrossnearlyeverygenreofTVlikeTheNightAgent(nowour6thmostpopular EnglishlanguageTVshowever),TheGlory(our5thmostpopularnon-EnglishTVshow ever),FullSwing,That90sShowandfilmsYouPeopleandLuther:TheFallenSun. WithMoody’srecentupgrade,weachievedinvestmentgradestatus.Netflixistheleading streamingservicebasedonengagement,revenueandprofitandweareworkingtobuildonthat in‘23,byseekingtoexpandoperatingmarginto18%-20%andtogenerateatleast+$3.5Bof freecashflow(upfromourpriorexpectationofatleast$3.0BofFCF). InQ1,welaunchedpaidsharinginfourcountriesandarepleasedwiththeresults.Weare planningonabroadrollout,includingintheUS,inQ2. Givencurrenthealthyperformanceandtrajectoryofourper-memberadvertisingeconomics, particularlyintheUS,we’reupgradingouradsexperiencewithmorestreamsandimproved vi

In [46]:
# show number of tokens in prompt (max 4096 tokens for both input & output)
num_tokens_from_string(final_prompt, "cl100k_base")

4021

In [37]:
openai_responses

{'COMBINED-Q4-17-Shareholder-Letter-FINAL': "Sentiment Analysis:\n\n- Total polarity score: 0.14\n- Total subjectivity score: 0.41\n\nOverall, the sentiment in this information appears to be positive for investors. \n\nThe document highlights various achievements and positive financial results for Netflix. The expansion of internet TV globally, the growth in streaming revenue and memberships, the positive international contribution profit, and the increase in global operating income all indicate a successful year for the company. Additionally, the strong quarterly results and higher-than-expected net adds demonstrate the positive performance in Q4.\n\nThe content section emphasizes the success of Netflix's original content, with shows like Stranger Things and Bright gaining significant popularity. The plan to invest in more original content and expand the international originals initiative further highlights the company's confidence in its content strategy.\n\nThe partnerships section 

In [66]:
# Save openai responses to pickle file
with open("openai_responses.pkl", "wb") as f:
    pickle.dump(openai_responses, f)

In [3]:
import pickle

In [4]:
# Load openai responses from the pickle file
openai_responses = pickle.load(open("openai_responses.pkl", "rb"))

In [5]:
openai_responses

{'COMBINED-Q4-17-Shareholder-Letter-FINAL': 'Polarity scores: \n- Q4 Results:  Positive \n- Content:  Positive \n- Product and Partnerships:  Positive \n- Competition:  Neutral \n- Free Cash Flow and Capital Structure:  Negative \n- Board of Directors:  Positive \n- Summary:  Positive \n\nSubjectivity scores: \n- Q4 Results:  Neutral \n- Content:  Positive \n- Product and Partnerships:  Neutral \n- Competition:  Neutral \n- Free Cash Flow and Capital Structure:  Positive \n- Board of Directors:  Positive \n- Summary:  Positive',
 'FINAL-Q1-18-Shareholder-Letter': "Sentiment analysis of the information provided for investors in Netflix:\n\nPolarity score: 0.366\nSubjectivity score: 0.511\n\nThe sentiment of the information is mostly positive. The document emphasizes the company's efforts to entertain and bring joy to people across the world through amazing stories, and highlights their strong financial performance in Q1, with revenue growth and increased memberships. The information als

In [7]:
import pandas as pd

In [8]:
openai_responses_df = pd.DataFrame(list(openai_responses.items()), columns=['pdf_name', 'response'])

In [None]:
openai_responses_df.to_excel('openai_responses_df.xlsx')

In [72]:
for key in openai_responses:
        openai_responses[key] = re.sub('\n', ' ', openai_responses[key])
        openai_responses[key] = re.sub(' +', ' ', openai_responses[key])


In [98]:
polarity_scores = {}
subjectivity_scores = {}


for document, text in openai_responses.items():

    words = text.split(" ")
    for word in words:
        if word.lower() == "polarity":
            polarity_scores[document] = words[words.index(word):words.index(word) + 4]
        elif word.lower() == "subjectivity":
            subjectivity_scores[document] = words[words.index(word):words.index(word) + 4]

In [102]:
for document, words in polarity_scores.items():

    for word in words:
        # if word is number using regex
        if re.search("^[0-9]", word):
            word = word.replace(",", "")
            polarity_scores[document] = float(word)

In [104]:
for document, words in subjectivity_scores.items():

    for word in words:
        # if word is number using regex
        if re.search("^[0-9]", word):
            word = word.replace(",", "")
            subjectivity_scores[document] = float(word)

In [107]:
# filter out keys values of which are lists
polarity_scores = {k: v for k, v in polarity_scores.items() if type(v) != list}
subjectivity_scores = {k: v for k, v in subjectivity_scores.items() if type(v) != list}

In [124]:
df = pd.DataFrame(list(pdf_texts.items()), columns=['pdf_name', 'text'])
polarity_df = pd.DataFrame(list(polarity_scores.items()), columns=['pdf_name', 'polarity'])
subjectivity_df = pd.DataFrame(list(subjectivity_scores.items()), columns=['pdf_name', 'subjectivity'])

# join df and polarity_df on pdf_name column
df = df.join(polarity_df.set_index("pdf_name"), on="pdf_name")
df = df.join(subjectivity_df.set_index("pdf_name"), on="pdf_name")

In [127]:
# export df
df.to_excel('OpenAI Automation.xlsx', index=False)

In [128]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   pdf_name      50 non-null     object 
 1   text          50 non-null     object 
 2   polarity      31 non-null     float64
 3   subjectivity  32 non-null     float64
dtypes: float64(2), object(2)
memory usage: 1.7+ KB


In [130]:
import spacy

In [132]:
# load model
nlp = spacy.load('en_core_web_sm')

In [135]:
# Function to preprocess text
def preprocess_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

# Apply the preprocessing to each text in the 'text' column of the DataFrame
tqdm.pandas()
df['processed_text'] = df['text'].progress_apply(preprocess_text)

  0%|          | 0/50 [00:00<?, ?it/s]

In [141]:
df.head()

Unnamed: 0,pdf_name,text,polarity,subjectivity,processed_text
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,"We had a beautiful Q4, completing a great year...",,,beautiful q4 complete great year internet tv e...
1,FINAL-Q1-18-Shareholder-Letter,We strive to entertain and to bring joy to peo...,0.366,0.511,strive entertain bring joy people world amazin...
2,FINAL-Q1-19-Shareholder-Letter,Revenue surpassed $4.5 billion in Q1 and we re...,,,revenue surpass $ 4.5 billion Q1 record highes...
3,FINAL-Q1-20-Shareholder-Letter,"In our 20+ year history, we have never seen a ...",,0.3447,20 + year history see future uncertain unsettl...
4,FINAL-Q1-21-Shareholder-Letter,Revenue grew 24% year over year and was in lin...,0.08,0.29,revenue grow 24 year year line withour beginni...


In [173]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor

# Drop NAs from polarity
df_copy = df.dropna(subset=['polarity'])

# Split the dataset into the label and text
X = df_copy['processed_text']
y =  df_copy['polarity']

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [174]:
# Create a CountVectorizer object and fit it on the training data
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)

# Transform the test data using the fitted vectorizer
X_test_bow = vectorizer.transform(X_test)

In [175]:
# Train a machine learning model, such as Naive Bayes, on the bag-of-words features
model = RandomForestRegressor().fit(X_train_bow, y_train)
# model = LinearRegression().fit(X_train_bow, y_train)

# Predict the labels of the test set: y_pred
y_pred = model.predict(X_test_bow)

# Compute and print metrics
print("R^2: {}".format(model.score(X_test_bow, y_test)))

R^2: -0.1241864594611024
