In [6]:
# Clean the data
import spacy
import langdetect
from tqdm.notebook import tqdm
import pdfplumber
import fitz
import os
import re
import sys
import traceback
import pickle

In [7]:
# Define headings in the document using word size function (Fitz-style word tuples, indexed by position)
def word_ratio_func(word):
    """Measure one word given as a positional sequence.

    The word is read by position: index 1 and index 3 are the two vertical
    coordinates and index 4 is the word text.

    NOTE: a later cell defines another ``word_ratio_func`` (reading dict
    keys); when both cells are run in order, that later definition shadows
    this one.

    Args:
        word: indexable record with numeric (or numeric-string) values at
            positions 1 and 3 and the text at position 4.

    Returns:
        ``(height, length, text)`` where ``height`` is the absolute difference
        of the two vertical coordinates and ``length`` is ``len(text)``.
        ``(0, 0, 0)`` when the record cannot be read.
    """
    try:
        # calculate word size parameters
        word_text = word[4]
        word_length = len(word_text)
        word_bottom = float(word[1])
        word_top = float(word[3])
    except (TypeError, ValueError, IndexError, KeyError):
        # malformed record: keep the zero sentinel callers rely on, but do not
        # swallow KeyboardInterrupt / SystemExit like a bare ``except`` would
        return 0, 0, 0

    return abs(word_bottom - word_top), word_length, word_text

In [8]:
# Define headings in the document using word size function (PDFPlumber-style word dicts with 'text', 'top' and 'bottom' keys)
def word_ratio_func(word):
    """Measure one word given as a mapping.

    Args:
        word: mapping with a ``"text"`` entry and numeric (or numeric-string)
            ``"top"`` and ``"bottom"`` entries.

    Returns:
        ``(height, length, text)`` where ``height`` is ``bottom - top`` and
        ``length`` is ``len(text)``. ``(0, 0, 0)`` when the record cannot be
        read.
    """
    try:
        # calculate word size parameters
        word_text = word["text"]
        word_length = len(word_text)
        word_bottom = float(word['bottom'])
        word_top = float(word['top'])
    except (TypeError, ValueError, IndexError, KeyError):
        # malformed record: keep the zero sentinel callers rely on, but do not
        # swallow KeyboardInterrupt / SystemExit like a bare ``except`` would
        return 0, 0, 0

    return (word_bottom - word_top), word_length, word_text

In [9]:
# Preprocess the text
def preprocess_text(texts):
    """Merge per-page texts into a cleaned list of lines.

    Steps, in order:
      1. join the pages (one page per line break) and strip leading/trailing
         "●" and "*" characters from the whole document;
      2. split into lines, dropping empty lines and lines starting with
         "Source";
      3. remove a leading "●" from each line;
      4. remove a leading "1" when it is directly followed by "Q1".."Q4";
      5. drop the first two remaining lines.

    Args:
        texts: mapping of page number -> page text. Empty or ``None`` pages
            are skipped.

    Returns:
        List of cleaned lines (may contain empty strings left behind by
        bullet-only lines).
    """
    quarter_tags = ["Q1", "Q2", "Q3", "Q4"]

    # Pages are joined with a newline: joining with "" glued the last line of a
    # page to the first line of the next one whenever the page text did not
    # end with a line break.
    pages = [page for page in texts.values() if page]
    text = "\n".join(pages).strip("●").strip("*")

    lines = [x for x in text.split("\n") if x != '' and not x.startswith("Source")]
    # every line is non-empty here, so testing the first character is safe
    lines = [x[1:] if x.startswith("●") else x for x in lines]
    # a line may be empty now (bullet-only line), hence the slice comparison
    lines = [x[1:] if x[1:3] in quarter_tags and x[:1] == "1" else x for x in lines]

    return lines[2:]

In [10]:
# Define if table is correctly identified by calculating digit to character ratio
def digit_character_ratio(s):
    """Return the share of digits among the letters and digits of ``s``.

    Characters that are neither letters nor digits (spaces, punctuation,
    currency signs, ...) are ignored. Returns 0 when ``s`` contains no
    letters or digits at all.
    """
    digits = sum(1 for ch in s if ch.isdigit())
    letters_and_digits = sum(1 for ch in s if ch.isalpha() or ch.isdigit())

    # guard against division by zero
    return digits / letters_and_digits if letters_and_digits else 0

In [71]:
from difflib import SequenceMatcher
# Define similarity function
def similar(a, b):
    """Return the ``difflib.SequenceMatcher`` similarity ratio of ``a`` and ``b``."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

In [76]:
# Transform pdf into the text
def _strip_numeric_table(text, tables, min_digit_ratio=0.2):
    """Cut a numeric table out of a page's text; return ``text`` unchanged otherwise."""
    # first line of the first cell of the first table marks where the table begins
    first_rows = tables[0]
    first_cell = first_rows[0][0] if first_rows and first_rows[0] else None
    start_table = (first_cell or "").split("\n")[0]

    # last non-None cell of the last row of the last table marks where it ends
    last_rows = tables[-1]
    last_cells = [cell for cell in last_rows[-1] if cell is not None] if last_rows else []
    end_table = last_cells[-1] if last_cells else ""

    # both markers are needed to cut anything
    if start_table == '' or end_table == '':
        return text

    # only tables that are mostly digits are removed
    cells = [cell for table in tables for row in table for cell in row if cell is not None]
    if digit_character_ratio(" ".join(cells)) <= min_digit_ratio:
        return text

    # If a marker is missing from the page text (e.g. the text came from the
    # other extractor), splitting on it would duplicate the page instead of
    # shortening it, so leave the text alone.
    if start_table not in text or end_table not in text:
        return text

    return text.split(start_table)[0] + text.split(end_table)[-1]


def _collect_page_headings(words, heading_min_size=13.5):
    """Return the headings of one page, each as the list of its words."""
    found = []
    word_count = 0
    while word_count < len(words):
        word_size, word_length, word_text = word_ratio_func(words[word_count])

        # a heading starts at a word that is tall enough and longer than one character
        if word_size > heading_min_size and word_length > 1:
            heading = []
            while True:
                # skip the non-text sentinel returned for unreadable words
                if isinstance(word_text, str):
                    heading.append(word_text)
                word_count += 1
                if word_count >= len(words):
                    # the heading runs to the end of the page: keep it
                    found.append(heading)
                    break
                word_size, word_length, word_text = word_ratio_func(words[word_count])
                # The heading ends at the first small word longer than one
                # character; one-character words never end a heading.
                if word_size <= heading_min_size and word_length > 1:
                    found.append(heading)
                    # add the indent of 10 words to avoid issues
                    word_count += 10
                    break
        word_count += 1
    return found


def process_pdf(pdf_paths, heading_min_size=13.5, max_page_index=10):
    """Extract cleaned text and headings from a list of PDF files.

    For every file, pages are read with pdfplumber. As soon as one page looks
    badly spaced (too few space-separated tokens for its length), that page and
    all following pages of the file take their text from fitz instead.
    Numeric tables are cut out of the page text, headings are detected from
    word size, and the page texts are cleaned with ``preprocess_text``.

    Reading of a file stops after the page at ``max_page_index`` or after the
    first page on which a heading contains the word "Reference".

    Files that raise an exception are reported with ``print`` and skipped.

    Args:
        pdf_paths: iterable of PDF file paths (strings).
        heading_min_size: a word taller than this counts as heading text.
        max_page_index: zero-based index of the last page that is read.

    Returns:
        ``(pdf_texts, pdf_headings)``: two dicts keyed by the file name up to
        its first ".", holding the cleaned text (one string) and the list of
        headings of each file.
    """
    # below this tokens-per-character ratio the pdfplumber text is considered badly spaced
    min_token_ratio = 0.15
    # tables whose digit share exceeds this are considered numeric and removed
    numeric_table_ratio = 0.2

    # store the pdf text and headings in dictionaries
    pdf_texts = {}
    pdf_headings = {}

    # iterate over the pdf files
    for file_path in tqdm(pdf_paths):

        # opened lazily, at most once per file, and closed in ``finally``
        fitz_reader = None

        try:
            # store the text and headings within one pdf file
            texts = {}
            headings = []
            fitz_flag = False

            # NOTE(review): the key stops at the first "." of the file name, so
            # names with inner dots are truncated; kept as-is because saved
            # results may rely on these keys.
            doc_key = file_path.split("/")[-1].split(".")[0]

            with pdfplumber.open(file_path) as plumber_reader:
                for page_number, plumber_page in enumerate(plumber_reader.pages):

                    # a page without extractable text yields "" instead of failing below
                    text = plumber_page.extract_text() or ""

                    # once a page is badly spaced, the rest of the file uses fitz
                    if text and len(text.split(" ")) / len(text) < min_token_ratio:
                        fitz_flag = True
                    if fitz_flag:
                        if fitz_reader is None:
                            fitz_reader = fitz.open(file_path)
                        text = fitz_reader.load_page(page_number).get_text()

                    # remove numeric tables from the page text
                    tables = plumber_page.extract_tables()
                    if tables:
                        text = _strip_numeric_table(text, tables, numeric_table_ratio)

                    # add text to dictionary of texts
                    texts[page_number] = text

                    # get headings from page
                    page_headings = _collect_page_headings(plumber_page.extract_words(), heading_min_size)
                    headings.extend(" ".join(heading_words) for heading_words in page_headings)

                    # break if the page covers the reference section
                    reached_reference = any("Reference" in heading_words for heading_words in page_headings)
                    if reached_reference or page_number == max_page_index:
                        break

            # preprocess the text
            final_text = " ".join(preprocess_text(texts)).strip()

            # add the text to the dictionary
            pdf_texts[doc_key] = final_text
            # drop headings that contain a zero-width space, then store them
            pdf_headings[doc_key] = [x for x in headings if "\u200b" not in x]

        except Exception as e:

            # in case of error, print the specifics of issue
            exc_type, exc_value, exc_traceback = sys.exc_info()
            traceback_details = traceback.extract_tb(exc_traceback)
            filename = traceback_details[-1][0]
            line_no = traceback_details[-1][1]
            func = traceback_details[-1][2]
            print(f"Exception occurred in file {filename} at line {line_no} in function {func}")
            print(f"Exception type: {exc_type.__name__}, Exception message: {str(e)}")

        finally:
            if fitz_reader is not None:
                fitz_reader.close()

    return pdf_texts, pdf_headings

In [77]:
# Get file paths for the pdf files
folder_path = "ShareholderLetters/" # put '/' sign at the end of the folder
file_paths = []
for root, directories, files in os.walk(folder_path):
    for filename in files:
        # skip anything that is not a PDF so it is not handed to the PDF readers
        if not filename.lower().endswith(".pdf"):
            continue
        filepath = os.path.join(root, filename)
        file_paths.append(filepath)

# os.walk order depends on the file system; sort so that every run processes
# the files in the same order
file_paths.sort()

# Transform pdf files into texts and headings and store them as dictionaries
pdf_texts, pdf_headings = process_pdf(file_paths) # total run time: 2 min 20 s 20 files

  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
pdf_texts

In [75]:
pdf_headings

{'Investor-Letter-Q1-2012': ['Domestic Streaming',
  'International Streaming',
  'DVD',
  'Product Improvement & Partnering',
  'Original Programming',
  'Global Profitability: Q1 Results & Outlook',
  'Free Cash Flow',
  'Business Outlook Q2 2012 Guidance Domestic Streaming:',
  'International Streaming',
  'Domestic DVD:',
  'Consolidated Global:',
  'Summary']}

In [79]:
# Save pdf texts and headings to pickle files
for pickle_name, payload in (("pdf_texts2.pkl", pdf_texts), ("pdf_headings2.pkl", pdf_headings)):
    with open(pickle_name, "wb") as f:
        pickle.dump(payload, f)

In [27]:
# Scratch cell: try the table-stripping logic on a single page of a single file.
# Uncomment exactly one path to pick the file to inspect.
# file_path = "ShareholderLetters/Investor_Letter_Q12013.pdf"
# file_path = "ShareholderLetters/FINAL-Q3-22-Shareholder-Letter.pdf"
# file_path = "ShareholderLetters/Investor-Letter-Q3-2011.pdf"
# file_path = "ShareholderLetters/July-Investor-Letter-1130am.pdf"
# file_path = "ShareholderLetters/July2014EarningsLetter_7.21.14_final.pdf"
# file_path = "ShareholderLetters/Q3_14_Letter_to_shareholders.pdf"
# file_path = "ShareholderLetters/Q4_14_Letter_to_shareholders.pdf"
# file_path = "ShareholderLetters/FINAL-Q2-23-Shareholder-Letter.pdf"
# file_path = "ShareholderLetters/FINAL-Q1-23-Shareholder-Letter.pdf"
file_path = "ShareholderLetters/Investor-Letter-Q3-2011.pdf"
# file_path = "ShareholderLetters/Investor-Letter-Q42012-01.pdf"

page_number = 0

# Both documents are left open on purpose: later cells keep using
# `pdf_plumber` and `plumber_page`.
pdf_plumber = pdfplumber.open(file_path)
plumber_page = pdf_plumber.pages[page_number]

fitz_reader = fitz.open(file_path)
fitz_page = fitz_reader.load_page(page_number)
text = fitz_page.get_text()

plumber_text = plumber_page.extract_text()
# extract the tables once and reuse the result below
table = plumber_page.extract_tables()

# default so that `text_2` is defined even when nothing is removed
text_2 = text

if len(table):
    print("Table found")
    # first line of the first cell marks the start of the table
    start_table = (table[0][0][0] or "").split("\n")[0]
    # plain substring test: the cell text is not a regular expression
    if start_table not in text:
        start_table = start_table.replace(" ", "")
    # last non-None cell of the last row marks the end of the table
    end_table = [x for x in table[-1][-1] if x is not None]
    end_table = end_table[-1] if end_table else ''
    if start_table != '' and end_table != '':
        table_list = [item for sublist in table for subsublist in sublist for item in subsublist]
        table_list = [x for x in table_list if x is not None]
        table_list = " ".join(table_list)
        ratio = digit_character_ratio(table_list)
        print(ratio)
        # a digit share above 0.2 means the table holds numeric data
        if ratio > 0.2:
            text_2 = text.split(start_table)[0] + text.split(end_table)[-1]

Table found
0.5042918454935622


In [28]:
text

"April 18, 2023\nFellow shareholders,\nSummary:\n●\nQ1‘23 revenue and operating profit were in-line with our forecast.\n●\nWe delivered a strong content slate in Q1 with:\n○\nSuccessful returning seasons like Outer Banks, You, Ginny & Georgia and a big sequel\nfilm Murder Mystery 2.\n○\nNew hits across nearly every genre of TV like The Night Agent (now our 6th most popular\nEnglish language TV show ever), The Glory (our 5th most popular non-English TV show\never), Full Swing, That 90s Show and films You People and Luther: The Fallen Sun.\n●\nWith Moody’s recent upgrade, we achieved investment grade status. Netflix is the leading\nstreaming service based on engagement, revenue and profit and we are working to build on that\nin ‘23, by seeking to expand operating margin to 18%-20% and to generate at least +$3.5B of\nfree cash flow (up from our prior expectation of at least $3.0B of FCF).\n●\nIn Q1, we launched paid sharing in four countries and are pleased with the results. We are\nplann

In [29]:
plumber_text

"April18,2023\nFellowshareholders,\nSummary:\n● Q1‘23revenueandoperatingprofitwerein-linewithourforecast.\n● WedeliveredastrongcontentslateinQ1with:\n○ SuccessfulreturningseasonslikeOuterBanks,You,Ginny&Georgiaandabigsequel\nfilmMurderMystery2.\n○ NewhitsacrossnearlyeverygenreofTVlikeTheNightAgent(nowour6thmostpopular\nEnglishlanguageTVshowever),TheGlory(our5thmostpopularnon-EnglishTVshow\never),FullSwing,That90sShowandfilmsYouPeopleandLuther:TheFallenSun.\n● WithMoody’srecentupgrade,weachievedinvestmentgradestatus.Netflixistheleading\nstreamingservicebasedonengagement,revenueandprofitandweareworkingtobuildonthat\nin‘23,byseekingtoexpandoperatingmarginto18%-20%andtogenerateatleast+$3.5Bof\nfreecashflow(upfromourpriorexpectationofatleast$3.0BofFCF).\n● InQ1,welaunchedpaidsharinginfourcountriesandarepleasedwiththeresults.Weare\nplanningonabroadrollout,includingintheUS,inQ2.\n● Givencurrenthealthyperformanceandtrajectoryofourper-memberadvertisingeconomics,\nparticularlyintheUS,we’reupgrad

In [None]:
Fellowshareholders

In [75]:
text_2

"April 18, 2023\nFellow shareholders,\nSummary:\n●\nQ1‘23 revenue and operating profit were in-line with our forecast.\n●\nWe delivered a strong content slate in Q1 with:\n○\nSuccessful returning seasons like Outer Banks, You, Ginny & Georgia and a big sequel\nfilm Murder Mystery 2.\n○\nNew hits across nearly every genre of TV like The Night Agent (now our 6th most popular\nEnglish language TV show ever), The Glory (our 5th most popular non-English TV show\never), Full Swing, That 90s Show and films You People and Luther: The Fallen Sun.\n●\nWith Moody’s recent upgrade, we achieved investment grade status. Netflix is the leading\nstreaming service based on engagement, revenue and profit and we are working to build on that\nin ‘23, by seeking to expand operating margin to 18%-20% and to generate at least +$3.5B of\nfree cash flow (up from our prior expectation of at least $3.0B of FCF).\n●\nIn Q1, we launched paid sharing in four countries and are pleased with the results. We are\nplann

In [None]:
from PyPDF2 import PdfReader

file_path = "ShareholderLetters/FINAL-Q1-23-Shareholder-Letter.pdf"
reader = PdfReader(file_path)

# per-page sentiment scores, one entry per page
overall_polarity = []
overall_subjectivity = []

# NOTE(review): `TextBlob` is not imported in any cell visible here; this cell
# presumably relies on an import made elsewhere - confirm before running.

# iterate through all pages of the PDF and extract text
for page_number in tqdm(range(0, len(reader.pages))):
    page = reader.pages[page_number]
    # extracting text from page
    text = page.extract_text()
    print(text)
    # print(sentiment_pipeline(text))
    # analyse the page once and read both scores from the same result
    sentiment = TextBlob(text).sentiment
    overall_polarity.append(sentiment.polarity)
    overall_subjectivity.append(sentiment.subjectivity)

In [25]:
len(reader.pages)

15

In [22]:
text

"April 18, 2023\nFellow shareholders,\nSummary:\n●\nQ1‘23 revenue and operating profit were in-line with our forecast.\n●\nWe delivered a strong content slate in Q1 with:\n○\nSuccessful returning seasons like Outer Banks, You, Ginny & Georgia and a big sequel\nfilm Murder Mystery 2.\n○\nNew hits across nearly every genre of TV like The Night Agent (now our 6th most popular\nEnglish language TV show ever), The Glory (our 5th most popular non-English TV show\never), Full Swing, That 90s Show and films You People and Luther: The Fallen Sun.\n●\nWith Moody’s recent upgrade, we achieved investment grade status. Netflix is the leading\nstreaming service based on engagement, revenue and profit and we are working to build on that\nin ‘23, by seeking to expand operating margin to 18%-20% and to generate at least +$3.5B of\nfree cash flow (up from our prior expectation of at least $3.0B of FCF).\n●\nIn Q1, we launched paid sharing in four countries and are pleased with the results. We are\nplann

In [51]:
table

[[['Revenues',
   '$ 1,484,728',
   '',
   '$ 1,409,432',
   '',
   '$ 1,175,230',
   '',
   '$ 5,504,656',
   '',
   '$ 4,374,562']],
 [['Marketing',
   '203,671',
   '',
   '145,654',
   '',
   '128,017',
   '',
   '607,186',
   '',
   '469,942']],
 [['General and administrative',
   '75,803',
   '',
   '78,024',
   '',
   '46,120',
   '',
   '269,741',
   '',
   '180,301']],
 [['Other income (expense):', '', '', '', '', '', '', '', '', '']],
 [['Interest and other income (expense)',
   '(6,177)',
   '',
   '616',
   '',
   '(846)',
   '',
   '(3,060)',
   '',
   '(3,002)']],
 [['Income before income taxes',
   '45,516',
   '',
   '97,537',
   '',
   '74,004',
   '',
   '349,369',
   '',
   '171,074']],
 [['Net income',
   '$ 83,371',
   '',
   '$ 59,295',
   '',
   '$ 48,421',
   '',
   '$ 266,799',
   '',
   '$ 112,403']],
 [['Basic', '$ 1.38', '', '$ 0.99', '', '$ 0.81', '', '$ 4.44', '', '$ 1.93']],
 [['Weighted-average common shares outstanding:',
   '',
   '',
   '',
   '',
   

In [151]:
# Flatten `table_list` through three nested levels and display the result.
# NOTE(review): the recorded output of this cell is
# "TypeError: 'NoneType' object is not iterable" - presumably `table_list` was
# one level shallower than this comprehension expects, so a None cell was
# iterated; confirm which value `table_list` held. The cell also overwrites
# its own input, so it cannot be re-run as-is.
table_list = [item for sublist in table_list for subsublist in sublist for item in subsublist]
table_list

TypeError: 'NoneType' object is not iterable

In [150]:
text

"July 21, 2014\nFellow Shareholders,\nFifteen years after launching our subscription service, we have over fifty million members\nenjoying Netflix in over 40 countries. As we gain new members, we are investing to further\nimprove our content and member experience, and to expand the global availability of our\nservice. Our summary results and forecast are below:\n(in millions except per share data and Q2 '13 Q3 '13 Q4 '13 Q1 '14 Q2 '14 Q3 '14\nStreaming Content Obligations)\nForecast\nTotal Streaming:\nRevenue $ 837 $ 884 $ 962 $ 1,066 $ 1,146 $ 1,224\nContribution Profit $ 85 $ 92 $ 117 $ 166 $ 212 $ 203\nContribution Margin 10.2% 10.4% 12.2% 15.6% 18.5% 16.6%\nPaid Members 35.63 38.01 41.43 46.14 47.99 50.89\nTotal Members 37.56 40.28 44.35 48.35 50.05 53.74\nNet Additions 1.24 2.73 4.07 4.00 1.69 3.69\nDomestic Streaming:\nRevenue $ 671 $ 701 $ 741 $ 799 $ 838 $ 877\nContribution Profit $ 151 $ 166 $ 174 $ 201 $ 227 $ 245\nContribution Margin 22.5% 23.7% 23.4% 25.2% 27.1% 27.9%\nPaid

In [141]:
# Flatten every non-None table cell of the current page into one string and
# compute its digit-to-character ratio.
nested_list = plumber_page.extract_tables()
flattened_list = " ".join(
    cell
    for page_table in nested_list
    for table_row in page_table
    for cell in table_row
    if cell is not None
)
ratio = digit_character_ratio(flattened_list)
ratio

In [144]:
# Join `flattened_list` with spaces and display its digit-to-character ratio.
# NOTE(review): if `flattened_list` is already a single string, this join puts
# a space between every character; the ratio is unaffected because
# digit_character_ratio only counts letters and digits. Presumably this cell
# was meant to run on the list form - confirm.
flattened_list = " ".join(flattened_list)
ratio = digit_character_ratio(flattened_list)
ratio

0.3446414803392444

In [99]:
text

"July 21, 2014\nFellow Shareholders,\nFifteen years after launching our subscription service, we have over fifty million members\nenjoying Netflix in over 40 countries. As we gain new members, we are investing to further\nimprove our content and member experience, and to expand the global availability of our\nservice. Our summary results and forecast are below:\n(in millions except per share data and Q2 '13 Q3 '13 Q4 '13 Q1 '14 Q2 '14 Q3 '14\nStreaming Content Obligations)\nForecast\nTotal Streaming:\nRevenue $ 837 $ 884 $ 962 $ 1,066 $ 1,146 $ 1,224\nContribution Profit $ 85 $ 92 $ 117 $ 166 $ 212 $ 203\nContribution Margin 10.2% 10.4% 12.2% 15.6% 18.5% 16.6%\nPaid Members 35.63 38.01 41.43 46.14 47.99 50.89\nTotal Members 37.56 40.28 44.35 48.35 50.05 53.74\nNet Additions 1.24 2.73 4.07 4.00 1.69 3.69\nDomestic Streaming:\nRevenue $ 671 $ 701 $ 741 $ 799 $ 838 $ 877\nContribution Profit $ 151 $ 166 $ 174 $ 201 $ 227 $ 245\nContribution Margin 22.5% 23.7% 23.4% 25.2% 27.1% 27.9%\nPaid

'*Corresponds to our total known streaming content obligations as defined in our financial statements and related notes in our most recently filed\nSEC Form 10-K'

In [105]:
plumber_page.extract_tables()

[[['Fifteen years after launching our subscription service, we have over fifty million members'],
  ['enjoying Netflix in over 40 countries. As we gain new members, we are investing to further'],
  ['improve our content and member experience, and to expand the global availability of our'],
  ['service. Our summary results and forecast are below:']],
 [['(in millions except per share data and\nStreaming Content Obligations)',
   "Q2 '13 Q3 '13 Q4 '13 Q1 '14 Q2 '14 Q3 '14\nForecast",
   None,
   None,
   None,
   None,
   None],
  ['Total Streaming:', '', '', '', '', '', ''],
  ['Revenue', '$ 837', '$ 884', '$ 962', '$ 1,066', '$ 1,146', '$ 1,224'],
  ['Contribution Profit', '$ 85', '$ 92', '$ 117', '$ 166', '$ 212', '$ 203'],
  ['Contribution Margin',
   '10.2%',
   '10.4%',
   '12.2%',
   '15.6%',
   '18.5%',
   '16.6%'],
  ['Paid Members', '35.63', '38.01', '41.43', '46.14', '47.99', '50.89'],
  ['Total Members', '37.56', '40.28', '44.35', '48.35', '50.05', '53.74'],
  ['Net Additions

In [103]:
text.split(end_table)[-1]

"\n(in millions except per share data and Q2 '13 Q3 '13 Q4 '13 Q1 '14 Q2 '14 Q3 '14\nStreaming Content Obligations)\nForecast\nTotal Streaming:\nRevenue $ 837 $ 884 $ 962 $ 1,066 $ 1,146 $ 1,224\nContribution Profit $ 85 $ 92 $ 117 $ 166 $ 212 $ 203\nContribution Margin 10.2% 10.4% 12.2% 15.6% 18.5% 16.6%\nPaid Members 35.63 38.01 41.43 46.14 47.99 50.89\nTotal Members 37.56 40.28 44.35 48.35 50.05 53.74\nNet Additions 1.24 2.73 4.07 4.00 1.69 3.69\nDomestic Streaming:\nRevenue $ 671 $ 701 $ 741 $ 799 $ 838 $ 877\nContribution Profit $ 151 $ 166 $ 174 $ 201 $ 227 $ 245\nContribution Margin 22.5% 23.7% 23.4% 25.2% 27.1% 27.9%\nPaid Members 28.62 29.93 31.71 34.38 35.09 36.52\nTotal Members 29.81 31.09 33.42 35.67 36.24 37.58\nNet Additions 0.63 1.29 2.33 2.25 0.57 1.33\nInternational Streaming:\nRevenue $ 166 $ 183 $ 221 $ 267 $ 307 $ 347\nContribution Profit (Loss) $ (66) $ (74) $ (57) $ (35) $ (15) $ (42)\nContribution Margin -39.7% -40.6% -25.9% -13.1% -5.0% -12.1%\nPaid Members 7.01

In [102]:
text.split(start_table)[0] + text.split(end_table)[1]

"July 21, 2014\nFellow Shareholders,\n\n(in millions except per share data and Q2 '13 Q3 '13 Q4 '13 Q1 '14 Q2 '14 Q3 '14\nStreaming Content Obligations)\nForecast\nTotal Streaming:\nRevenue $ 837 $ 884 $ 962 $ 1,066 $ 1,146 $ 1,224\nContribution Profit $ 85 $ 92 $ 117 $ 166 $ 212 $ 203\nContribution Margin 10.2% 10.4% 12.2% 15.6% 18.5% 16.6%\nPaid Members 35.63 38.01 41.43 46.14 47.99 50.89\nTotal Members 37.56 40.28 44.35 48.35 50.05 53.74\nNet Additions 1.24 2.73 4.07 4.00 1.69 3.69\nDomestic Streaming:\nRevenue $ 671 $ 701 $ 741 $ 799 $ 838 $ 877\nContribution Profit $ 151 $ 166 $ 174 $ 201 $ 227 $ 245\nContribution Margin 22.5% 23.7% 23.4% 25.2% 27.1% 27.9%\nPaid Members 28.62 29.93 31.71 34.38 35.09 36.52\nTotal Members 29.81 31.09 33.42 35.67 36.24 37.58\nNet Additions 0.63 1.29 2.33 2.25 0.57 1.33\nInternational Streaming:\nRevenue $ 166 $ 183 $ 221 $ 267 $ 307 $ 347\nContribution Profit (Loss) $ (66) $ (74) $ (57) $ (35) $ (15) $ (42)\nContribution Margin -39.7% -40.6% -25.9% -

In [93]:
plumber_page.extract_tables()

[[['Fifteen years after launching our subscription service, we have over fifty million members'],
  ['enjoying Netflix in over 40 countries. As we gain new members, we are investing to further'],
  ['improve our content and member experience, and to expand the global availability of our'],
  ['service. Our summary results and forecast are below:']],
 [['(in millions except per share data and\nStreaming Content Obligations)',
   "Q2 '13 Q3 '13 Q4 '13 Q1 '14 Q2 '14 Q3 '14\nForecast",
   None,
   None,
   None,
   None,
   None],
  ['Total Streaming:', '', '', '', '', '', ''],
  ['Revenue', '$ 837', '$ 884', '$ 962', '$ 1,066', '$ 1,146', '$ 1,224'],
  ['Contribution Profit', '$ 85', '$ 92', '$ 117', '$ 166', '$ 212', '$ 203'],
  ['Contribution Margin',
   '10.2%',
   '10.4%',
   '12.2%',
   '15.6%',
   '18.5%',
   '16.6%'],
  ['Paid Members', '35.63', '38.01', '41.43', '46.14', '47.99', '50.89'],
  ['Total Members', '37.56', '40.28', '44.35', '48.35', '50.05', '53.74'],
  ['Net Additions

In [90]:
text.split(end_table)[-1]

"July 21, 2014\nFellow Shareholders,\n\n(in millions except per share data and Q2 '13 Q3 '13 Q4 '13 Q1 '14 Q2 '14 Q3 '14\nStreaming Content Obligations)\nForecast\nTotal Streaming:\nRevenue $ 837 $ 884 $ 962 $ 1,066 $ 1,146 $ 1,224\nContribution Profit $ 85 $ 92 $ 117 $ 166 $ 212 $ 203\nContribution Margin 10.2% 10.4% 12.2% 15.6% 18.5% 16.6%\nPaid Members 35.63 38.01 41.43 46.14 47.99 50.89\nTotal Members 37.56 40.28 44.35 48.35 50.05 53.74\nNet Additions 1.24 2.73 4.07 4.00 1.69 3.69\nDomestic Streaming:\nRevenue $ 671 $ 701 $ 741 $ 799 $ 838 $ 877\nContribution Profit $ 151 $ 166 $ 174 $ 201 $ 227 $ 245\nContribution Margin 22.5% 23.7% 23.4% 25.2% 27.1% 27.9%\nPaid Members 28.62 29.93 31.71 34.38 35.09 36.52\nTotal Members 29.81 31.09 33.42 35.67 36.24 37.58\nNet Additions 0.63 1.29 2.33 2.25 0.57 1.33\nInternational Streaming:\nRevenue $ 166 $ 183 $ 221 $ 267 $ 307 $ 347\nContribution Profit (Loss) $ (66) $ (74) $ (57) $ (35) $ (15) $ (42)\nContribution Margin -39.7% -40.6% -25.9% -

In [31]:
# Re-read the page text with pdfplumber (overwrites `text`).
text = plumber_page.extract_text()
# Show the part before the first `start_table` occurrence followed by the
# second piece of the split on `end_table`; raises IndexError when `end_table`
# does not occur in the text.
text.split(start_table)[0] + text.split(end_table)[1]

'July 25th, 2011\nDear Fellow Shareholders,\nWe are happy to report that Q2 was another great quarter for Netflix. Streaming is continuing to grow\nrapidly, and with the recently introduced price changes, we’ll be able to further increase the scope and\nquality of our streaming content. At the end of the quarter, Netflix had over 25 million global subscribers,\nup 70% from 15 million just one year ago. Our summary results are below:\n 54.2 54.2 '

In [21]:
start_table

'(in millions except per share data)'

In [24]:
text.split(start_table)[0]

'July 25th, 2011\nDear Fellow Shareholders,\nWe are happy to report that Q2 was another great quarter for Netflix. Streaming is continuing to grow\nrapidly, and with the recently introduced price changes, we’ll be able to further increase the scope and\nquality of our streaming content. At the end of the quarter, Netflix had over 25 million global subscribers,\nup 70% from 15 million just one year ago. Our summary results are below:\n 54.2 54.2 '

In [22]:
end_table

'53.9'

In [23]:
text

'July 25th, 2011\nDear Fellow Shareholders,\nWe are happy to report that Q2 was another great quarter for Netflix. Streaming is continuing to grow\nrapidly, and with the recently introduced price changes, we’ll be able to further increase the scope and\nquality of our streaming content. At the end of the quarter, Netflix had over 25 million global subscribers,\nup 70% from 15 million just one year ago. Our summary results are below:\n 54.2 54.2 '

In [20]:
# Select the first page of the already opened `pdf_plumber` document and
# display the text pdfplumber extracts from it.
plumber_page = pdf_plumber.pages[0]
plumber_page.extract_text()

"July 25th, 2011\nDear Fellow Shareholders,\nWe are happy to report that Q2 was another great quarter for Netflix. Streaming is continuing to grow\nrapidly, and with the recently introduced price changes, we’ll be able to further increase the scope and\nquality of our streaming content. At the end of the quarter, Netflix had over 25 million global subscribers,\nup 70% from 15 million just one year ago. Our summary results are below:\n(in millions except per share data) Q2 '09 Q3 '09 Q4 '09 Q1 '10 Q2 '10 Q3 '10 Q4 '10 Q1 '11 Q2 '11\nDomestic:\nNet Subscriber Additions 0.29 0.51 1.16 1.70 1.03 1.80 2.70 3.30 1.80\nY/Y Change 72% 95% 61% 85% 255% 253% 133% 94% 75%\nSubscribers 10.60 11.11 12.27 13.97 15.00 16.80 19.50 22.80 24.59\nY/Y Change 26% 28% 31% 35% 42% 51% 59% 63% 64%\nRevenue $ 409 $ 423 $ 445 $ 494 $ 520 $ 553 $ 592 $ 706 $ 770\nY/Y Change 21% 24% 24% 25% 27% 31% 33% 43% 48%\nContribution Profit $ 93 $ 89 $ 98 $ 111 $ 130 $ 130 $ 152 $ 187 $ 213\nY/Y Change 38% 32% 38% 53% 40% 

In [131]:
plumber_page.extract_tables()

[[['(in millions exc',
   'eept per share data)',
   '',
   "Q3 '09",
   '',
   "Q4 '09",
   "Q1 ''",
   '10',
   "Q2 '10",
   '',
   "Q3 '10",
   'Q44',
   "'10",
   "Q1 '11",
   '',
   "Q2 '11",
   'Q',
   "3 '11"],
  [None,
   None,
   None,
   '',
   '',
   None,
   '',
   None,
   '',
   '',
   None,
   '',
   None,
   '',
   '',
   None,
   '',
   ''],
  [None,
   None,
   None,
   '',
   '',
   None,
   '',
   None,
   '',
   '',
   None,
   '',
   None,
   '',
   '',
   None,
   '22',
   '1.45'],
  [None,
   None,
   None,
   '',
   '',
   None,
   '',
   None,
   '',
   '',
   None,
   '',
   None,
   '',
   '',
   None,
   '11',
   '3.93'],
  [None,
   None,
   None,
   '',
   '',
   None,
   '',
   None,
   '',
   '',
   None,
   '',
   None,
   '',
   '',
   None,
   '',
   ''],
  [None,
   None,
   None,
   '11.11',
   '12.27',
   None,
   '13..97',
   None,
   '15.00',
   '16.80',
   None,
   '199.50',
   None,
   '22.80',
   '24.59',
   None,
   '22',
   '3.79'],
  [None

In [98]:
text

'October 224th, 2011\nDear Felloow Shareholdders,\nThe Internnet is transforming video eentertainmennt, stream by stream, conssumer by consumer, nation by\nnation. Ouur opportunitty is to be onee of the leadeers of this transformation with the bestt streaming vvideo\nsubscriptiion service onn the planet. The last feww months, howwever, have bbeen difficult for shareholdders,\nemployeees, and most uunfortunatelyy, many memmbers of Netflix. While wee dramaticallyy improved our\n$7.99 unlimited streamming service bby embracing new platformms, simplifyinng our user‐innterface, and more\nthan doubbling domestiic spending on streaming ccontent over 2010, we greeatly upset many domesticc\nNetflix meembers with our significannt DVD‐related pricing changes, and to a lesser degree, with the\nproposed‐and‐now‐cancelled rebranding of our DVD service. In doing so, we’ve hurt oour hard‐earned\nreputation, and stalledd our domestiic growth. But our long‐terrm streamingg opportunityy is as compellling

In [21]:
pdf_headings

{'Investor-Letter-Q1-2012': ['23rd,',
  'Domestic Streaming',
  'International Streaming',
  'DVD',
  'Product Improvement & Partnering',
  'Original Programming',
  'Global Profitability: Q1 Results & Outlook',
  'Free Cash Flow',
  'Business Outlook Domestic Streaming:',
  'International Streaming',
  'Domestic DVD:',
  'Consolidated Global:',
  'Summary']}

In [55]:
import re
import fitz

# Path of the PDF to inspect and the index of the page to read
pdf_document = "ShareholderLetters/Investor-Letter-Q1-2012.pdf"
page_number = 0  # Replace with the page number you're interested in

# Open the PDF with fitz and pick the page at index `page_number`
# NOTE(review): `doc` is never closed in this cell — presumably left open for
# interactive use by later cells; confirm, otherwise call doc.close() when done.
doc = fitz.open(pdf_document)
page = doc[page_number]

# Extract the text of the selected page into `text`
text = page.get_text()

In [38]:
# Echo the page text extracted with fitz (`page.get_text()`)
text

" \n \n1 \nApril 23rd, 2012    \nDear Fellow Shareholders, \nNetflix added nearly 3 million streaming members in Q1, bringing our total to over 26 million global \nstreaming members, and strengthening our position as the world’s leading Internet TV network.   We \nanticipate returning to global profitability in Q2, and plan to launch our next international market in Q4.   \nWe are constantly improving our service with better personalization, better user-interfaces, better \nstreaming, and more content.  As a result, per-member viewing hours set new records in Q1 and are on \ntrack to do so again in Q2, on a year-over-year basis.  We launched our service in the UK and Ireland in \nJanuary and are very pleased that, after the first 90 days, we had substantially more members than we \nhad after the first 90 days of Canada or Latin America.       \n \n (in millions except per share data)\nQ2 '10\nQ3 '10\nQ4 '10\nQ1 '11\nQ2 '11\nQ3 '11\nQ4 '11\nQ1 '12 \nG id\n \nDomestic Streaming:\nNet Sub

In [54]:
# Echo the string currently used as the table start marker
start_table

'Q2 2012 Guidance'

In [64]:
# Print the page text with the table span cut out: everything before the first
# occurrence of `start_table` plus everything after the first occurrence of
# `end_table` found at or after that position.
# The truthiness test also rejects None markers (table cells can be None, see the
# `tables` output); `text.split(None)` would silently split on whitespace instead.
if start_table and end_table:
    table_start = text.find(start_table)
    table_end = text.find(end_table, table_start) if table_start != -1 else -1

    if table_start != -1 and table_end != -1:
        print(text[:table_start] + text[table_end + len(end_table):])
    else:
        # One of the markers does not occur in this text: nothing to remove
        # (previously a missing end marker raised IndexError).
        print(text)

 
 
1 
April 23rd, 2012    
Dear Fellow Shareholders, 
Netflix added nearly 3 million streaming members in Q1, bringing our total to over 26 million global 
streaming members, and strengthening our position as the world’s leading Internet TV network.   We 
anticipate returning to global profitability in Q2, and plan to launch our next international market in Q4.   
We are constantly improving our service with better personalization, better user-interfaces, better 
streaming, and more content.  As a result, per-member viewing hours set new records in Q1 and are on 
track to do so again in Q2, on a year-over-year basis.  We launched our service in the UK and Ireland in 
January and are very pleased that, after the first 90 days, we had substantially more members than we 
had after the first 90 days of Canada or Latin America.       
 
 



In [93]:
# Load pdf text and headings from the pickle file
# The `with` blocks close the file handles as soon as each object is loaded.
# NOTE: pickle.load can execute arbitrary code — only load pickle files that
# were produced by this project, never files from an untrusted source.
with open("pdf_texts.pkl", "rb") as texts_file:
    pdf_texts = pickle.load(texts_file)
with open("pdf_headings.pkl", "rb") as headings_file:
    pdf_headings = pickle.load(headings_file)

In [94]:
# Echo `pdf_texts`: a dict mapping a document name to its extracted text
pdf_texts

{'COMBINED-Q4-17-Shareholder-Letter-FINAL': "Fellow shareholders,    We had a beautiful Q4, completing a great year as internet TV expands globally. In 2017, we grew  streaming revenue 36% to over $11 billion, added 24 million new memberships (compared to 19 million  in 2016), achieved for the first time a full-year positive international contribution profit, and more than  doubled global operating income.  .    1  Q4 Results  Average paid streaming memberships rose 25% year over year in Q4. Combined with a 9% increase in  ASP, global streaming revenue growth amounted to 35%. Operating income of $245 million (7.5%  margin) vs. $154 million prior year (6.2% margin) was slightly above our $238 million forecast. Operating  margin for FY17 was 7.2%, on target with our goal at the beginning of this year.   EPS was $0.41 vs. $0.15 last year and met our forecast of $0.41. There were several below the line items  that affected net income, including a pre-tax $26 million non-cash unrealized los

In [63]:
# Take the first cell of the first table and the last cell of its last row as the
# markers that delimit the table inside the page text.
# Both markers fall back to '' when no table was extracted or when the cell is
# empty/None, so a downstream `!= ''` check can skip pages without usable markers
# instead of this cell raising IndexError or handing on a None marker.
first_table = tables[0] if tables else []
start_table = (first_table[0][0] or '') if first_table and first_table[0] else ''
end_table = (first_table[-1][-1] or '') if first_table and first_table[-1] else ''

In [46]:
# Echo `tables`: a list of tables, each a list of rows, each row a list of cell values
tables

[[['Q2 2012 Guidance', None],
  ['Domestic Streaming:', ''],
  ['Total Subscriptions', '23.6 m to 24.2 m'],
  ['Paid Subscriptions', '22.3 m to 22.9 m'],
  ['Revenue', '$526 m to $534 m'],
  ['Contribution Profit', '$72 m to $84 m'],
  ['', ''],
  ['International Streaming', ''],
  ['Total Subscriptions', '3.45 m to 4.0 m'],
  ['Paid Subscriptions', '2.8 m to 3.25 m'],
  ['Revenue', '$60 m to $67 m'],
  ['Contribution Profit (Loss)', '($98 m) to ($86 m)'],
  ['', ''],
  ['Domestic DVD:', ''],
  ['Total Subscriptions', '8.95 m to 9.35 m'],
  ['Paid Subscriptions', '8.9 m to 9.3 m'],
  ['Revenue', '$287 m to $294 m'],
  ['Contribution Profit', '$126 m to $138 m'],
  ['', ''],
  ['Consolidated Global:', ''],
  ['Net Income (Loss)', '($6 m) to $8 m'],
  ['EPS', '($0.10) to $0.14']]]

In [67]:
# Open the PDF with pdfplumber and look for tables on the page at index 1.
# The tables are extracted once, inside the `with` block, so the file handle is
# closed afterwards.
with pdfplumber.open(pdf_document) as pdf_plumber:
    page = pdf_plumber.pages[1]
    page_tables = page.extract_tables()

if page_tables:
    print("Table found")
    # Read the markers from the tables found on THIS page; the previous version
    # checked this page but then indexed the unrelated global `tables`.
    start_table = page_tables[0][0][0]
    end_table = page_tables[0][-1][-1]

In [56]:
import pdfplumber

# Path of the PDF to inspect and the index of the page to read
pdf_document = "ShareholderLetters/Investor-Letter-Q1-2012.pdf"
page_number = 0  # Replace with the page number you're interested in

# The context manager closes the PDF once the block is left
with pdfplumber.open(pdf_document) as pdf:
    page = pdf.pages[page_number]
    
    # Extract the page's tables; each table is a list of rows and each row is a
    # list of cell values (plain Python lists, not DataFrame objects)
    tables = page.extract_tables()
    
    # Print every row of every table, with a dashed line after each table
    for table in tables:
        for row in table:
            print(row)
        print("-" * 20)

['(in millions except per share data)', "Q2 '10", "Q3 '10", "Q4 '10", "Q1 '11", "Q2 '11", "Q3 '11", "Q4 '11", "Q1 '12"]
['Domestic Streaming:', '', '', '', '', '', '', '', 'G id']
['Net Subscription Additions', '-', '-', '-', '-', '-', '-', '0.22', '1.74']
['Total Subscriptions', '-', '-', '-', '-', '-', '21.45', '21.67', '23.41']
['Paid Subscriptions', '-', '-', '-', '-', '-', '20.51', '20.15', '22.02']
['Revenue', '-', '-', '-', '-', '-', '-', '$ 476', '$ 507']
['Contribution Profit', '-', '-', '-', '-', '-', '-', '$ 52', '$ 67']
['Contribution Margin', '-', '-', '-', '-', '-', '-', '10.9%', '13.2%']
['', '', '', '', '', '', '', '', '']
['International Streaming:', '', '', '', '', '', '', '', '']
['Net Subscription Additions', '-', '0.13', '0.38', '0.29', '0.16', '0.51', '0.38', '1.21']
['Total Subscriptions', '-', '0.13', '0.51', '0.80', '0.97', '1.48', '1.86', '3.07']
['Paid Subscriptions', '-', '-', '0.33', '0.67', '0.86', '0.99', '1.45', '2.41']
['Revenue', '-', '$ -', '$ 4', '$ 

In [36]:
# Echo the page text currently held in `text`
text

" \n \n1 \nApril 23rd, 2012    \nDear Fellow Shareholders, \nNetflix added nearly 3 million streaming members in Q1, bringing our total to over 26 million global \nstreaming members, and strengthening our position as the world’s leading Internet TV network.   We \nanticipate returning to global profitability in Q2, and plan to launch our next international market in Q4.   \nWe are constantly improving our service with better personalization, better user-interfaces, better \nstreaming, and more content.  As a result, per-member viewing hours set new records in Q1 and are on \ntrack to do so again in Q2, on a year-over-year basis.  We launched our service in the UK and Ireland in \nJanuary and are very pleased that, after the first 90 days, we had substantially more members than we \nhad after the first 90 days of Canada or Latin America.       \n \n (in millions except per share data)\nQ2 '10\nQ3 '10\nQ4 '10\nQ1 '11\nQ2 '11\nQ3 '11\nQ4 '11\nQ1 '12 \nG id\n \nDomestic Streaming:\nNet Sub

In [34]:
import re
import fitz

# Path of the PDF to inspect and the index of the page to read
pdf_document = "ShareholderLetters/Investor-Letter-Q1-2012.pdf"
page_number = 0  # Replace with the page number you're interested in

# Open the PDF with fitz and pick the page at index `page_number`
doc = fitz.open(pdf_document)
page = doc[page_number]

# Extract the text of the selected page
text = page.get_text()

# Define regular expressions for tabular data
row_pattern = re.compile(r'\n.*\n')  # Match rows based on newline
column_pattern = re.compile(r'\s+')  # Match columns based on whitespace

# findall returns non-overlapping matches; each match includes the newline before
# and the newline after the matched line
rows = row_pattern.findall(text)
table_detected = False

# Check if tabular data is detected
# NOTE(review): every element of `rows` starts and ends with "\n", so splitting it
# on \s+ always yields at least two pieces (empty strings at both ends). The inner
# `len(columns) > 1` test therefore cannot fail, and `table_detected` ends up True
# whenever there is more than one match. Confirm the intended heuristic.
if len(rows) > 1:
    columns = column_pattern.split(rows[0])
    if len(columns) > 1:
        table_detected = True

# Report the outcome
if table_detected:
    print("Table detected on the page.")
else:
    print("No table detected on the page.")

Table detected on the page.


In [2]:
# Load the language model
# (`nlp` is the pipeline that clean_data applies to every text)
nlp = spacy.load("en_core_web_sm")

In [4]:
# Substrings that mark a token as noise; a token containing any of them is dropped
_NOISE_SUBSTRINGS = (
    "---",
    "--",
    "/2",
    "/1",
    "20feb",
    "c17",
    "\x92",
    "&",
    "%",
    "i.e.",
    "b+",
    "w/",
    "02:33:05",
)

# Hand-written corrections for lemmas that keep stray punctuation
_LEMMA_FIXES = {"orangy/": "orangy", ".fruity": "fruity"}


def _is_informative_token(token):
    """Return True when a spaCy token passes every filter applied by clean_data."""
    raw = str(token)
    return (
        not token.is_stop  # remove stop-words
        and not token.is_punct  # remove punctuation
        and not token.like_num  # remove numbers
        # Keeps only tokens flagged `is_oov`, exactly as the original condition did.
        # NOTE(review): the original comment said "remove words that don't have a
        # word vector", which is the opposite of this test — confirm the intent.
        and token.is_oov
        and not token.is_space  # remove whitespaces
        and len(token) > 1  # remove single-letter words
        # Remove tokens that look weird & are not useful
        and not raw.endswith(("-", "."))
        and not raw.startswith("-")
        and not any(noise in raw for noise in _NOISE_SUBSTRINGS)
    )


# Create a function to clean the data
def clean_data(df):
    """Tokenize and filter the "text" column of `df`, keeping the matching labels.

    Each text is run through the module-level `nlp` pipeline. Tokens that pass
    `_is_informative_token` are replaced by their lower-cased lemma. A lemma
    containing "(" is split on it and every non-empty piece is kept as its own
    token.

    Rows are read by position, so `df` does not need a default 0..n-1 index.

    Args:
        df: DataFrame with a "text" column and a "label" column.

    Returns:
        A new DataFrame with columns "label" and "text", where "text" holds the
        list of cleaned tokens for that row. Rows whose processing raises an
        exception are skipped.

    Raises:
        KeyError: if `df` has no "text" or "label" column.

    Relies on `nlp`, `tqdm` and `pd` already being defined in the notebook.
    """
    texts = df["text"].to_list()
    labels = df["label"].to_list()

    # Create a dictionary to store the values
    new_df = {"label": [], "text": []}

    # Iterate over all rows in the dataset
    for review_text, review_label in tqdm(zip(texts, labels), total=len(texts)):
        tmp_tokens = []

        try:
            for token in nlp(review_text):
                if not _is_informative_token(token):
                    continue

                # Get the lemma & lowercase the token
                lemma = token.lemma_.lower()
                if "(" in lemma:
                    # Keep the pieces around "(" as separate tokens. The previous
                    # version also fell through and appended the split list
                    # itself, which nested a list among the string tokens.
                    tmp_tokens.extend(piece for piece in lemma.split("(") if piece)
                else:
                    tmp_tokens.append(_LEMMA_FIXES.get(lemma, lemma))
        except Exception:
            # Best effort: skip rows that cannot be processed. Unlike a bare
            # `except:`, this lets KeyboardInterrupt / SystemExit propagate.
            continue

        # Store the label together with the tokens of the same row
        new_df["label"].append(review_label)
        new_df["text"].append(tmp_tokens)

    # Return the new dataframe
    return pd.DataFrame(new_df)