In [4]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import os
import re
import pickle
from tqdm.notebook import tqdm
from difflib import SequenceMatcher
import pdfplumber

# Import local .py file
from scrapepdf import scrape_pdf

In [5]:
# Define similarity function
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [6]:
# Get file paths for the pdf files
folder_path = "ShareholderLetters/" # put '/' at the end to get all files in the folder
file_paths = []
for root, directories, files in os.walk(folder_path):
    for filename in files:
        filepath = os.path.join(root, filename)
        file_paths.append(filepath)

In [70]:
# Scrape pdf files and store the text and headings in a dictionary
pdf_texts, pdf_headings = scrape_pdf(file_paths) # total run time: 2 min 20 s

In [76]:
# Save pdf texts and headings to pickle file
with open("pdf_texts.pkl", "wb") as f:
    pickle.dump(pdf_texts, f)

with open("pdf_headings.pkl", "wb") as f:
    pickle.dump(pdf_headings, f)

In [39]:
# Load pdf text and headings from the pickle file
pdf_texts = pickle.load(open("pdf_texts.pkl", "rb"))
pdf_headings = pickle.load(open("pdf_headings.pkl", "rb"))

In [40]:
list(pdf_texts.items())[0]

('COMBINED-Q4-17-Shareholder-Letter-FINAL',
 "We had a beautiful Q4, completing a great year as internet TV expands globally. In 2017, we grew streaming revenue 36% to over $11 billion, added 24 million new memberships (compared to 19 million in 2016), achieved for the first time a full-year positive international contribution profit, and more than doubled global operating income. . Q4 Results Average paid streaming memberships rose 25% year over year in Q4. Combined with a 9% increase in ASP, global streaming revenue growth amounted to 35%. Operating income of $245 million (7.5% margin) vs. $154 million prior year (6.2% margin) was slightly above our $238 million forecast. Operating margin for FY17 was 7.2%, on target with our goal at the beginning of this year. EPS was $0.41 vs. $0.15 last year and met our forecast of $0.41. There were several below the line items that affected net income, including a pre-tax $26 million non-cash unrealized loss from F/X remeasurement on our Eurobond

In [41]:
# Get text from the first pdf file
text = list(pdf_texts.items())[0][1]
# Get headings from the first pdf file
headings = list(pdf_headings.items())[0][1]

In [42]:
text

"We had a beautiful Q4, completing a great year as internet TV expands globally. In 2017, we grew streaming revenue 36% to over $11 billion, added 24 million new memberships (compared to 19 million in 2016), achieved for the first time a full-year positive international contribution profit, and more than doubled global operating income. . Q4 Results Average paid streaming memberships rose 25% year over year in Q4. Combined with a 9% increase in ASP, global streaming revenue growth amounted to 35%. Operating income of $245 million (7.5% margin) vs. $154 million prior year (6.2% margin) was slightly above our $238 million forecast. Operating margin for FY17 was 7.2%, on target with our goal at the beginning of this year. EPS was $0.41 vs. $0.15 last year and met our forecast of $0.41. There were several below the line items that affected net income, including a pre-tax $26 million non-cash unrealized loss from F/X remeasurement on our Eurobond. Our tax rate was helped by a $66 million fo

In [43]:
headings

['Q4 Results',
 'Forecast',
 'Content',
 'Product and Partnerships',
 'Competition',
 'Free Cash Flow and Capital Structure',
 'Board of Directors',
 'Summary',
 'January 22, 2018 Earnings Interview, 3pm PST']

In [44]:
# Split the text into blocks based on the headings
text_blocks = {}

# Iterate over the headings
for heading in range(len(headings)):

    if heading == 0:
        document_intro = text.split(headings[heading])[0]
        text_blocks['Document_intro'] = document_intro
    
    text_after_heading = text.split(headings[heading])[1]
    if heading == len(headings) - 1:
        text_blocks[headings[heading]] = text_after_heading
        break
    else:
        text_of_heading = text_after_heading.split(headings[heading+1])[0]
        text_blocks[headings[heading]] = text_of_heading

In [45]:
# iterate over text blocks, removing '\u200b' and extra spaces
for key in text_blocks:
    text_blocks[key] = re.sub('\u200b', '', text_blocks[key])
    text_blocks[key] = re.sub(' +', ' ', text_blocks[key])

In [53]:
text_blocks

{'Document_intro': 'We had a beautiful Q4, completing a great year as internet TV expands globally. In 2017, we grew streaming revenue 36% to over $11 billion, added 24 million new memberships (compared to 19 million in 2016), achieved for the first time a full-year positive international contribution profit, and more than doubled global operating income. . ',
 'Q4 Results': ' Average paid streaming memberships rose 25% year over year in Q4. Combined with a 9% increase in ASP, global streaming revenue growth amounted to 35%. Operating income of $245 million (7.5% margin) vs. $154 million prior year (6.2% margin) was slightly above our $238 million forecast. Operating margin for FY17 was 7.2%, on target with our goal at the beginning of this year. EPS was $0.41 vs. $0.15 last year and met our forecast of $0.41. There were several below the line items that affected net income, including a pre-tax $26 million non-cash unrealized loss from F/X remeasurement on our Eurobond. Our tax rate wa

In [47]:
# split the text into blocks
# these blocks would be further used as separate prompts for the OpenAI model to summarize the whole document
# text_blocks = []
# block = []
# for element in range(len(text)):
#     if text[element] in headings:
#         if text[element] == "Reference":
#             break
#         text_blocks.append(block)
#         block = []
#     block.append(text[element])
# text_blocks.append(block)

In [48]:
# find the number of words in each block
# the number of words should be within the limit of the OpenAI prompt length (2048 tokens)
block_word_count = []
for block in text_blocks.items():
    word_count = 0
    for line in block:
        word_count += len(line.split(" "))
    block_word_count.append(word_count)
block_word_count

[57, 404, 205, 353, 121, 193, 274, 60, 47, 1708]

In [52]:
for heading, text in text_blocks.items():
    print(len(text.split(" ")))

56
402
204
352
118
192
268
57
46
84


### OpenAI Summarization

In [50]:
# load credentials for OpenAI API
import openai
import credentials
openai.api_key = credentials.openai_api_key

In [33]:
# for block in range(len(text_blocks)):
#     # if the block is too long (contains over 750 words), summarize it
#     if block_word_count[block] > 750:
#         block_text = " ".join(text_blocks[block])
#         # use the OpenAI API to summarize the text
#         completion = openai.ChatCompletion.create(
#             model="gpt-3.5-turbo",
#             messages=[
#             {"role": "user", "content": "Please effectively summarize the following text: " + block_text}
#             ])
#         # replace the long text block with the summarized version
#         text_blocks[block] = [completion.choices[0].message.content]

In [51]:
for heading, text in text_blocks.items():
    # if the block is too long (contains over 750 words), summarize it
    if len(text.split(' ')) > 750:

        # use the OpenAI API to summarize the text
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
            {"role": "user", "content": "Please effectively summarize the following text: " + text}
            ])
        # replace the long text block with the summarized version
        text_blocks[heading] = completion.choices[0].message.content

In [60]:
for heading, text in text_blocks.items():
    print(heading + ': ' + text)

Document_intro: We had a beautiful Q4, completing a great year as internet TV expands globally. In 2017, we grew streaming revenue 36% to over $11 billion, added 24 million new memberships (compared to 19 million in 2016), achieved for the first time a full-year positive international contribution profit, and more than doubled global operating income. . 
Q4 Results:  Average paid streaming memberships rose 25% year over year in Q4. Combined with a 9% increase in ASP, global streaming revenue growth amounted to 35%. Operating income of $245 million (7.5% margin) vs. $154 million prior year (6.2% margin) was slightly above our $238 million forecast. Operating margin for FY17 was 7.2%, on target with our goal at the beginning of this year. EPS was $0.41 vs. $0.15 last year and met our forecast of $0.41. There were several below the line items that affected net income, including a pre-tax $26 million non-cash unrealized loss from F/X remeasurement on our Eurobond. Our tax rate was helped b

In [61]:
final_prompt = ''
for heading, text in text_blocks.items():
    final_prompt += heading + ': ' + text + " "

In [63]:
import tiktoken

# define the number of tokens in the prompt
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    # num_tokens = len(encoding.encode(string))
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [64]:
# show number of tokens in prompt (max 2048 tokens for input)
num_tokens_from_string(final_prompt, "cl100k_base")

2365

In [None]:
# use the OpenAI API to generate the sentiment analysis
completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "user",
          "content": "Please tell me about the sentiment (positive, negative, neutral) of this information for the market (Netflix company) Please, be consice and lucid. Calculate the total polarity and subjectivity scores on the range -1 to 1. Don't forget to give me these scores" + final_prompt}
    ]
)

# show the sentiment analysis
print(completion.choices[0].message.content)

Overall, the sentiment of the information for the market (Netflix company) is positive. The document highlights the company's strong performance in Q4 and for the full year, including growth in streaming revenue and memberships, increased operating income and contribution profit, and the success of original content. The document also discusses future forecasts and the company's plans for content investment. The total polarity score is 0.533 and the subjectivity score is 0.371, both falling within the positive range.


### OpenAI Fine Tuning

In [72]:
# run the request for ChatGPT

fine_tune_messages = {"role": "system", "content":
                "You are a helpful financial assistant who is expert in evaluating sentiment scores for financial statements \
               You give precise answers to questions \
               the quality of your answers is highly important, you never hallucinate answers - only \
               answering based on your knowledge. Where the answer requires creative thought you engage \
               in reflective internal dialogue to ascertain the best answer"
}

user_content = "Please tell me about the sentiment (positive, negative, neutral) of this information (Netflix) for the investors. \
                Please, be consice and lucid. \
                Calculate the total polarity and subjectivity scores on the range -1 to 1 (show scores in the beginning of your output): "


completion = openai.ChatCompletion.create(
  model="gpt-3.5-turbo",
  messages=[
      fine_tune_messages,
    {"role": "user", "content": user_content + final_prompt}
  ]
)

print(completion.choices[0].message.content)