In [2]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import os
import re
from tqdm.notebook import tqdm
from difflib import SequenceMatcher
import pdfquery
import pdfplumber

In [3]:
# Define similarity function
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [4]:
# Define headings in the document using word size function
def word_ratio_func(word):
    try:
        word_length = len(word["text"])
        word_bottom = float(word['bottom'])
        word_top = float(word['top'])
        return (word_bottom - word_top), word_length, word["text"]
        
    except:
        return 0, 0, 0

In [5]:
def preprocess_text(texts):

    # preprocess the text
    text = "".join(texts.values()).strip("●").strip("*")
    text = text.split("\n")
    text = [x for x in text if x != '' and x.startswith("Source") == False]
    text = [x[0].replace("●", "") + x[1:] if x[0] == "●" else x for x in text]
    text = [x[0].replace("*", "") + x[1:] if x[0] == "*" else x for x in text]
    text = [x[0].replace("○", "") + x[1:] if x[0] == "○" else x for x in text]
    text = [x[0].replace("1", "") + x[1:] if x[1:3] in ["Q1", "Q2", "Q3", "Q4"] else x for x in text]
    text = text[2:]

    return text

### Scrape the PDFs

In [23]:
# get file paths for all pdfs
pdf_paths = []
for root, dirs, files in os.walk("ShareholderLetters/"):
    for file in files:
        if file.endswith(".pdf"):
             pdf_paths.append(os.path.join(root, file))

In [24]:
# creating a pdf reader object
# pdf_paths = ["ShareholderLetters/FINAL-Q2-21-Shareholder-Letter.pdf", 
#              "ShareholderLetters/FINAL-Q1-21-Shareholder-Letter.pdf"]

pdf_texts = {}
pdf_headings = {}

for file_path in tqdm(pdf_paths):

    try:
        
        reader = pdfplumber.open(file_path)

        texts = {}
        headings = []

        for page_number in range(0, len(reader.pages)):

            # get the specific page from the pdf file
            page = reader.pages[page_number]
            # extract text from page
            text = page.extract_text()
            # add text to dictionary
            texts[page_number] = text

            # extract headings from page
            words = page.extract_words()
            word_count = 0
            while word_count < len(words):
                # find if the words are large enough to be headings
                word_size, word_length, word_text = word_ratio_func(words[word_count])
                heading = []
                if word_size > 15 and word_length > 1:
                    while True:
                        heading.append(word_text)
                        word_count += 1
                        if word_count >= len(words):
                            break
                        word_size, word_length, word_text = word_ratio_func(words[word_count])
                        if not word_size > 15 and word_length > 1:
                            headings.append(" ".join(heading))
                            word_count += 10
                            break
                word_count += 1
            # break if the page covers the reference section
            if re.search("\nReference\n", text):
                break
        
        # preprocess the text
        text = preprocess_text(texts)
        final_text = " ".join(text)

        # export the text to a txt file
        with open("Txt/" + file_path.split("/")[-1].split(".")[0] + ".txt", "w", encoding='utf-8') as f:
            f.write(final_text)

        # add the text to the dictionary
        pdf_texts[file_path.split("/")[-1].split(".")[0]] = final_text

    except Exception as e:
        print(e)
        continue

  0%|          | 0/21 [00:00<?, ?it/s]

In [25]:
# view your pdf texts with pdf names as keys
list(pdf_texts.items())

[('COMBINED-Q4-17-Shareholder-Letter-FINAL',
  "We had a beautiful Q4, completing a great year as internet TV expands globally. In 2017, we grew streaming revenue 36% to over $11 billion, added 24 million new memberships (compared to 19 million in 2016), achieved for the first time a full-year positive international contribution profit, and more than doubled global operating income. . Q4 Results Average paid streaming memberships rose 25% year over year in Q4. Combined with a 9% increase in ASP, global streaming revenue growth amounted to 35%. Operating income of $245 million (7.5% margin) vs. $154 million prior year (6.2% margin) was slightly above our $238 million forecast. Operating margin for FY17 was 7.2%, on target with our goal at the beginning of this year. EPS was $0.41 vs. $0.15 last year and met our forecast of $0.41. There were several below the line items that affected net income, including a pre-tax $26 million non-cash unrealized loss from F/X remeasurement on our Eurobo

In [26]:
# get text from the first pdf file
list(pdf_texts.items())[0][1]

"We had a beautiful Q4, completing a great year as internet TV expands globally. In 2017, we grew streaming revenue 36% to over $11 billion, added 24 million new memberships (compared to 19 million in 2016), achieved for the first time a full-year positive international contribution profit, and more than doubled global operating income. . Q4 Results Average paid streaming memberships rose 25% year over year in Q4. Combined with a 9% increase in ASP, global streaming revenue growth amounted to 35%. Operating income of $245 million (7.5% margin) vs. $154 million prior year (6.2% margin) was slightly above our $238 million forecast. Operating margin for FY17 was 7.2%, on target with our goal at the beginning of this year. EPS was $0.41 vs. $0.15 last year and met our forecast of $0.41. There were several below the line items that affected net income, including a pre-tax $26 million non-cash unrealized loss from F/X remeasurement on our Eurobond. Our tax rate was helped by a $66 million fo

In [7]:
# split the text into blocks
# these blocks would be further used as separate prompts for the OpenAI model to summarize the whole document
text_blocks = []
block = []
for element in range(len(text)):
    if text[element] in headings:
        if text[element] == "Reference":
            break
        text_blocks.append(block)
        block = []
    block.append(text[element])
text_blocks.append(block)

In [8]:
# show the first text block
text_blocks[0]

['Q2 was better-than-expected on membership growth, and foreign exchange was worse-than-expected',
 '(stronger US dollar), resulting in 9% revenue growth (13% constant currency). Our challenge and',
 'opportunity is to accelerate our revenue and membership growth by continuing to improve our product,',
 'content, and marketing as we’ve done for the last 25 years, and to better monetize our big audience.',
 'We’re in a position of strength given our $30 billion-plus in revenue, $6 billion in operating profit last',
 'year, growing free cash flow and a strong balance sheet. Our summary results and forecast are below.']

In [34]:
# find the number of words in each block
# the number of words should be within the limit of the OpenAI prompt length (2048 tokens)
block_word_count = []
for block in text_blocks:
    word_count = 0
    for line in block:
        word_count += len(line.split(" "))
    block_word_count.append(word_count)
block_word_count

[93, 495, 280, 120, 495, 538, 147, 72]

In [31]:
# this is to show the text blocks
for block in text_blocks:
    print(" ".join(block))

Q2 was better-than-expected on membership growth, and foreign exchange was worse-than-expected (stronger US dollar), resulting in 9% revenue growth (13% constant currency). Our challenge and opportunity is to accelerate our revenue and membership growth by continuing to improve our product, content, and marketing as we’ve done for the last 25 years, and to better monetize our big audience. We’re in a position of strength given our $30 billion-plus in revenue, $6 billion in operating profit last year, growing free cash flow and a strong balance sheet. Our summary results and forecast are below.
Q2 Results Revenue in Q2 grew 9% year over year (or 13% excluding a -$339 million foreign currency impact), driven by a 6% and 2% increase in average paid memberships and ARM1, respectively. Excluding the impact of foreign exchange (F/X), ARM rose 7% year over year. The appreciation of the US dollar (USD) vs. most other currencies since our April earnings report was the primary reason for the var

### OpenAI Summarization

In [32]:
# load credentials for OpenAI API
import openai
import credentials
openai.api_key = credentials.openai_api_key

In [33]:
for block in range(len(text_blocks)):
    # if the block is too long (contains over 750 words), summarize it
    if block_word_count[block] > 750:
        block_text = " ".join(text_blocks[block])
        # use the OpenAI API to summarize the text
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
            {"role": "user", "content": "Please effectively summarize the following text: " + block_text}
            ])
        # replace the long text block with the summarized version
        text_blocks[block] = [completion.choices[0].message.content]

In [35]:
# flatten list of text blocks and join them into a single final prompt
final_prompt = [item for sublist in text_blocks for item in sublist]
final_prompt = "".join(final_prompt)

In [36]:
import tiktoken

# define the number of tokens in the prompt
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    # num_tokens = len(encoding.encode(string))
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [39]:
# show number of tokens in prompt (max 2048 tokens for input)
num_tokens_from_string(final_prompt, "cl100k_base")

2951

In [45]:
# use the OpenAI API to generate the sentiment analysis
completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "user",
          "content": "Please tell me about the sentiment (positive, negative, neutral) of this information for the market (Netflix company) Please, be consice and lucid. Calculate the total polarity and subjectivity scores on the range -1 to 1. Don't forget to give me these scores" + final_prompt}
    ]
)

# show the sentiment analysis
print(completion.choices[0].message.content)

Overall, the sentiment of the information for the market (Netflix company) is positive. The company acknowledges the challenges it faces in terms of slowing revenue growth but expresses confidence and optimism about its future. The information highlights the company's strengths, such as its strong revenue and profit figures, growing cash flow, and a strong balance sheet. The company emphasizes the importance of continuously improving its product, content, and marketing to drive revenue and membership growth. It also discusses its plans to monetize its audience through advertising and explore new revenue streams, such as paid sharing. The sentiment is positive, reflecting the company's confidence in its ability to navigate challenges and continue its growth trajectory.

Total polarity score: 0.81
Total subjectivity score: 0.34


### OpenAI Fine Tuning

In [23]:
# run the request for ChatGPT

fine_tune_messages = {"role": "system", "content":
                "You are a helpful financial assistant who is expert in evaluating sentiment scores for financial statements \
               You give precise answers to questions \
               the quality of your answers is highly important, you never hallucinate answers - only \
               answering based on your knowledge. Where the answer requires creative thought you engage \
               in reflective internal dialogue to ascertain the best answer"
}

user_content = "Please tell me about the sentiment (positive, negative, neutral) of this information (Netflix) for the investors. \
                Please, be consice and lucid. \
                Calculate the total polarity and subjectivity scores on the range -1 to 1 (show scores in the beginning of your output): "


completion = openai.ChatCompletion.create(
  model="gpt-3.5-turbo",
  messages=[
      fine_tune_messages,
    {"role": "user", "content": user_content + final_prompt}
  ]
)

print(completion.choices[0].message.content)

Sentiment: Neutral

The information provided in this statement is mainly regarding the financial performance and future forecast of Netflix. The statement discusses the Q2 results and the challenges and opportunities for the company to increase its revenue and membership. The company discusses its focus on improving product, content, and marketing. The statement also mentions the introduction of a new lower-priced ad-supported tier that will be launched in the early part of 2023. The company also discusses its goal to monetize the 100m+ households that are currently enjoying but not directly paying for Netflix. The company's focus on accessibility for people with disabilities is also mentioned. 

Polarity Score: 0.041

Subjectivity Score: 0.284
