In [1]:
# Import libraries
import spacy
import langdetect
from tqdm.notebook import tqdm
import pdfplumber
import fitz
import os
import re
import sys
import traceback
import pickle

In [2]:
# Define headings in the document using word size function (PDFPlumber)
def word_ratio_func(word):
    try:
        # calculate word size parameters
        word_length = len(word[4])
        word_bottom = float(word[1])
        word_top = float(word[3])
        return abs(word_bottom - word_top), word_length, word[4]
        
    except:
        # in case of error, return zeros
        return 0, 0, 0

In [3]:
# Define headings in the document using word size function (Fitz)
def word_ratio_func(word):
    try:
        # calculate word size parameters
        word_length = len(word["text"])
        word_bottom = float(word['bottom'])
        word_top = float(word['top'])
        return (word_bottom - word_top), word_length, word["text"]
        
    except:
        # in case of error, return zeros
        return 0, 0, 0

In [4]:
# Preprocess the text
def preprocess_text(texts):

    # join the text and perform cleansing operations
    text = "".join(texts.values()).strip("●").strip("*")
    text = text.split("\n")
    text = [x for x in text if x != '' and x.startswith("Source") == False]
    text = [x[0].replace("●", "") + x[1:] if x[0] == "●" else x for x in text]
    text = [x[0].replace("1", "") + x[1:] if x[1:3] in ["Q1", "Q2", "Q3", "Q4"] else x for x in text]
    text = text[2:]

    return text

In [5]:
# Define if table is correctly identified by calculating digit to character ratio
def digit_character_ratio(s):
    digit_count = 0
    char_count = 0

    for char in s:
        if char.isdigit():
            digit_count += 1
        if char.isalpha() or char.isdigit():
            char_count += 1

    # Avoid division by zero
    if char_count == 0:
        return 0

    return digit_count / char_count

In [6]:
from difflib import SequenceMatcher
# Define similarity function
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

### PDF Into Texts

In [146]:
# Transform pdf into the text
def process_pdf(pdf_paths):
    
    # store the pdf text and headings in dictionaries
    pdf_texts = {}
    pdf_headings = {}
    pdf_headings_context = {}

    # iterate over the pdf files
    for file_path in tqdm(pdf_paths):

        # file_path = "ShareholderLetters/Investor-Letter-Q1-2012.pdf"
        # file_path = "ShareholderLetters/Investor-Letter-Q3-2011.pdf" # check later
        # file_path = "ShareholderLetters/July-Investor-Letter-1130am.pdf"
        # file_path = "ShareholderLetters/July2014EarningsLetter_7.21.14_final.pdf"
        file_path = "ShareholderLetters/Q4_14_Letter_to_shareholders.pdf"

        ### Check later headings and preceding words for these reports
        # file_path = "ShareholderLetters/FINAL-Q1-23-Shareholder-Letter.pdf"
        file_path = "ShareholderLetters/FINAL-Q2-23-Shareholder-Letter.pdf" 

        # file_path = "ShareholderLetters/Q416ShareholderLetter.pdf"

        try:

            # store the text and headings within one pdf file
            texts = {}
            headings = []
            headings_count = 0
            fitz_flag = False

            # open the pdf file using pdfplumber library
            plumber_reader = pdfplumber.open(file_path)

            # for page_number in range(len(fitz_reader)):
            # iterate over pages in the pdf document
            for page_number in range(0, len(plumber_reader.pages)):

                # get the specific page from the pdf plumber
                plumber_page = plumber_reader.pages[page_number]
                # get text from page using pdf plumber
                text = plumber_page.extract_text()

                # if text is not correctly extracted and spaced using pdf plumber
                if len(text.split(" ")) / len(text) < 0.15 or fitz_flag:
                    # set fitz library flag as true for further pages
                    fitz_flag = True
                    # open the pdf file using fitz library
                    fitz_reader = fitz.open(file_path)
                    # get the specific page from the fitz library
                    fitz_page = fitz_reader.load_page(page_number)
                    # get text from page using fitz library
                    text = fitz_page.get_text()

                # find table from the page
                table = plumber_page.extract_tables()
                # if table exists on the page
                if len(table):
                    # define the beginning of table
                    start_table = table[0][0][0].split("\n")[0]
                    # define the end of table
                    end_table = table[-1][-1]
                    end_table = [x for x in end_table if x is not None][-1]
                    # if table has non-empty start and ending
                    if start_table != '' and end_table != '':
                        # flatten the table list of words and digits
                        table_list = [item for sublist in table for subsublist in sublist for item in subsublist]
                        # remove None values from the list
                        table_list = [x for x in table_list if x is not None]
                        # join the list as single string
                        table_list = " ".join(table_list)
                        # calculate the digit to character ratio
                        ratio = digit_character_ratio(table_list)
                        # remove table if digit to character ratio is over 0.2, meaning that table contains numeric data
                        if ratio > 0.2:
                            # update the page text by removing table
                            text = text.split(start_table)[0] + text.split(end_table)[-1]

                # add text to dictionary of texts
                texts[page_number] = text

                # words = page.get_text("words")
                ### get headings from page
                # extract words
                words = plumber_page.extract_words()
                word_count = 0
                # iterate over words
                while word_count < len(words):
                    # find if the words are large enough to be headings by calculating their size
                    word_size, word_length, word_text = word_ratio_func(words[word_count])
                    heading = []
                    context_flag = True

                    # if word size is over 13.5, this means that the word is heading
                    if word_size > 13.5 and word_length > 1:
                        
                        if word_count > 3:
                            heading_context = [words[word_count - 3]["text"], words[word_count - 2]["text"], words[word_count - 1]["text"]]
                            pdf_headings_context[headings_count] = heading_context
                            context_flag = False
                        
                        # append the following words if they satisfy this heading size condition
                        while True:
                            heading.append(word_text)
                            word_count += 1
                            if word_count >= len(words):
                                break
                            word_size, word_length, word_text = word_ratio_func(words[word_count])
                            # if word is small again, break the loop and finish the heading
                            if not word_size > 13.5 and word_length > 1:
                                
                                headings.append(" ".join(heading))
                                if context_flag:
                                    heading_context = [words[word_count]["text"], words[word_count + 1]["text"], words[word_count + 2]["text"]]
                                    pdf_headings_context[headings_count] = heading_context
                                    
                                # add the indent of 10 words to avoid issues 
                                word_count += 10
                                break
                    headings_count += 1
                    word_count += 1
        
                # break if the page covers the reference section
                if "Reference" in heading or page_number == 10:
                    break
            
            # preprocess the text
            text = preprocess_text(texts)
            final_text = " ".join(text).strip()

            

            # optionally, export the text to a txt file
            # with open("Txt/" + file_path.split("/")[-1].split(".")[0] + ".txt", "w", encoding='utf-8') as f:
            #     f.write(final_text)

            # add the text to the dictionary
            pdf_texts[file_path.split("/")[-1].split(".")[0]] = final_text
            
            # clean the headings
            # headings = [x.replace("\u200b", "remove") for x in headings]
            # headings = [x for x in headings if not re.search("remove", x)]    
            
            # add the headings to the dictionary
            pdf_headings[file_path.split("/")[-1].split(".")[0]] = headings

            break

        except Exception as e:

            # in case of error, print the specifics of issue
            exc_type, exc_value, exc_traceback = sys.exc_info()
            traceback_details = traceback.extract_tb(exc_traceback)
            filename = traceback_details[-1][0]
            line_no = traceback_details[-1][1]
            func = traceback_details[-1][2]
            print(f"Exception occurred in file {filename} at line {line_no} in function {func}")
            print(f"Exception type: {exc_type.__name__}, Exception message: {str(e)}")

            break

            # continue
    
    return pdf_texts, pdf_headings, pdf_headings_context

In [147]:
# Get file paths for the pdf files
folder_path = "ShareholderLetters/" # put '/' sign at the end of the folder
file_paths = []
for root, directories, files in os.walk(folder_path):
    for filename in files:
        filepath = os.path.join(root, filename)
        file_paths.append(filepath)

# Transform pdf files into texts and headings and store them as dictionaries
pdf_texts, pdf_headings, pdf_headings_context = process_pdf(file_paths) # total run time: 2 min 20 s 20 files

  0%|          | 0/50 [00:00<?, ?it/s]

In [148]:
pdf_texts

{'FINAL-Q2-23-Shareholder-Letter': "In May, we successfully launched paid sharing in 100+ countries, representing more than 80% of our revenue base. ○ Revenue in each region is now higher than pre-launch, with sign-ups already exceeding cancellations. ○ Paid net additions were 5.9M in Q2, and today we’re rolling out paid sharing to almost all of the remaining countries.  Q2‘23 revenue of $8.2B and operating profit of $1.8B were generally in-line with our forecast—and we expect revenue growth to accelerate in the second half of ‘23 as we start to see the full benefits of paid sharing plus continued steady growth in our ad-supported plan. We’re still targeting a full year 2023 operating margin of 18% to 20%.  We’re a leader in terms of streaming engagement and, per Nielsen, we had the top original streaming series in the US for 24 of the first 25 weeks of 2023, and the top movie for 21 weeks. While we’ve made steady progress this year, we have more work to do to reaccelerate our growth. 

In [149]:
pdf_headings

{'FINAL-Q2-23-Shareholder-Letter': ['Q2 Results',
  'Forecast',
  'Engagement',
  'Monetization and Revenue',
  'Competition',
  'Cash Flow and Capital Structure',
  'Environmental, Social, and Governance (ESG)',
  'Reference']}

In [150]:
pdf_headings_context

{109: ['RevenueinQ2grew3%yearoveryear(+6%onaforeignexchange(F/X)neutralbasis1).Revenuegrowth',
  'wasdrivenbya6%increaseinaveragepaidmembership,whileARM2declined3%yearoveryear(-1%',
  'F/Xneutral).Theyear-over-yearARMdeclinewasdrivenbyacombinationoflimitedpriceincreasesover'],
 113: ['contentspend.EPSinQ2amountedto$3.29vs.$3.20inQ2’22andincludeda$29millionnon-cash',
  'unrealizedlossfromF/XremeasurementonourEurodenominateddebt,whichisrecognizedbelow',
  'operatingincomein“interestandotherincome/expense.”'],
 131: ['WeexpectQ3operatingincomeof$1.9Bvs.$1.5BinQ3’22andoperatingmarginof22%comparedwith',
  '19%intheyearagoperiod.We’restilltargetingafullyear2023operatingmarginof18%-20%,basedon',
  'F/XratesasofJanuary1,2023,upfrom17.8%inFY22.'],
 185: ['Inadditiontodeliveringanever-improvingslateandproductexperience,we’vebeenworkingtoimprove',
  'ourmonetizationthroughinitiativeslikepaidsharingandadvertising.Thiswillallowustogeneratemore',
  'revenueoffabiggerbase,whichwecanreinvesttomakeNetf

In [151]:
text = list(pdf_texts.items())[0][1]

In [152]:
def remove_extra_spaces(text):
    text = text.replace("\u200b", "")
    return ' '.join(text.split())

In [153]:
text = remove_extra_spaces(text)

In [159]:
import pickle

pdf_texts = pickle.load(open("pdf_texts_test.pkl", "rb"))
pdf_headings = pickle.load(open("pdf_headings_test.pkl", "rb"))
pdf_headings_context = pickle.load(open("pdf_headings_context_test.pkl", "rb"))

In [180]:
def preprocess_text(text):
    text = text.replace("\u200b", "")
    text = ' '.join(text.split())
    return text

In [190]:
# Split the text into blocks based on the headings
def split_text_into_blocks(text, headings, pdf_headings_context):

    text = preprocess_text(text)

    # define dictionary for storing text blocks
    text_blocks = {}

    # Iterate over the headings
    for heading in range(len(headings)):

        heading_context = list(headings_context.items())[heading][1]
        heading_context = " ".join(heading_context)

        # in the beginning of document, there is no heading, so let's define it as document intro
        if heading == 0:

            if re.search(headings[heading] + " " + heading_context, text):
                document_intro = text.split(headings[heading] + " " + heading_context)[0]
            elif re.search(heading_context + " " + headings[heading], text):
                document_intro = text.split(heading_context + " " + headings[heading])[0]
                document_intro = heading_context + " " + document_intro
            else:
                document_intro = text.split(headings[heading])[0]
            text_blocks['Document_intro'] = document_intro

        # prevent the error of index out of range
        if len(text.split(headings[heading])) > 1:
            
            if re.search(headings[heading] + " " + heading_context, text):
                text_after_heading = text.split(headings[heading] + " " + heading_context)[1]
                text_after_heading = heading_context + text_after_heading
            elif re.search(heading_context + " " + headings[heading], text):
                text_after_heading = text.split(heading_context + " " + headings[heading])[1]
            else:
                text_after_heading = text.split(headings[heading])[1]
        else:
            text_after_heading = ""

        # identify the last heading
        if heading == len(headings) - 1:
            text_blocks[headings[heading]] = text_after_heading
            break
        else:
            # identify middle headings with surrounding text
            next_heading_context = list(headings_context.items())[heading+1][1]
            next_heading_context = " ".join(next_heading_context)

            if re.search(headings[heading+1] + " " + next_heading_context, text_after_heading):
                text_of_heading = text_after_heading.split(headings[heading+1] + " " + next_heading_context)[0]
            elif re.search(next_heading_context + " " + headings[heading+1], text_after_heading):
                text_of_heading = text_after_heading.split(next_heading_context + " " + headings[heading+1])[0]
                text_of_heading = text_of_heading + next_heading_context
            else:
                text_of_heading = text_after_heading.split(headings[heading+1])[0]
            text_blocks[headings[heading]] = text_of_heading
        
    return text_blocks

In [191]:
for pdf_name in tqdm(pdf_texts):

    text = pdf_texts[pdf_name]
    headings = pdf_headings[pdf_name]
    headings_context = pdf_headings_context[pdf_name]

    # Split the text into blocks based on the headings
    text_blocks = split_text_into_blocks(text, headings, headings_context)

  0%|          | 0/1 [00:00<?, ?it/s]

In [192]:
text_blocks

{'Document_intro': 'Fellow shareholders, We had a beautiful Q4, completing a great year as internet TV expands globally. In 2017, we grew streaming revenue 36% to over $11 billion, added 24 million new memberships (compared to 19 million in 2016), achieved for the first time a full-year positive international contribution profit, and more than doubled global operating income. . 1 ',
 'Q4 Results': 'Average paid streaming memberships rose 25% year over year in Q4. Combined with a 9% increase in ASP, global streaming revenue growth amounted to 35%. Operating income of $245 million (7.5% margin) vs. $154 million prior year (6.2% margin) was slightly above our $238 million forecast. Operating margin for FY17 was 7.2%, on target with our goal at the beginning of this year. EPS was $0.41 vs. $0.15 last year and met our forecast of $0.41. There were several below the line items that affected net income, including a pre-tax $26 million non-cash unrealized loss from F/X remeasurement on our Eur

In [157]:
# define dictionary for storing text blocks
text_blocks = {}

headings = list(pdf_headings.items())[0][1]

# Iterate over the headings
for heading in range(len(headings)):

    heading_context = list(pdf_headings_preceding_words.items())[heading][1]
    heading_context = " ".join(heading_context)

    # in the beginning of document, there is no heading, so let's define it as document intro
    if heading == 0:

        if re.search(headings[heading] + " " + heading_context, text):
            document_intro = text.split(headings[heading] + " " + heading_context)[0]
        elif re.search(heading_context + " " + headings[heading], text):
            document_intro = text.split(heading_context + " " + headings[heading])[0]
            document_intro = heading_context + " " + document_intro
        else:
            document_intro = text.split(headings[heading])[0]
        text_blocks['Document_intro'] = document_intro

    # prevent the error of index out of range
    if len(text.split(headings[heading])) > 1:
        
        if re.search(headings[heading] + " " + heading_context, text):
            text_after_heading = text.split(headings[heading] + " " + heading_context)[1]
            text_after_heading = heading_context + text_after_heading
        elif re.search(heading_context + " " + headings[heading], text):
            text_after_heading = text.split(heading_context + " " + headings[heading])[1]
        else:
            text_after_heading = text.split(headings[heading])[1]
    else:
        text_after_heading = ""

    # identify the last heading
    if heading == len(headings) - 1:
        text_blocks[headings[heading]] = text_after_heading
        break
    else:
        # identify middle headings with surrounding text
        next_heading_context = list(pdf_headings_preceding_words.items())[heading+1][1]
        next_heading_context = " ".join(next_heading_context)

        if re.search(headings[heading+1] + " " + next_heading_context, text_after_heading):
            text_of_heading = text_after_heading.split(headings[heading+1] + " " + next_heading_context)[0]
        elif re.search(next_heading_context + " " + headings[heading+1], text_after_heading):
            text_of_heading = text_after_heading.split(next_heading_context + " " + headings[heading+1])[0]
            text_of_heading = text_of_heading + next_heading_context
        else:
            text_of_heading = text_after_heading.split(headings[heading+1])[0]
        text_blocks[headings[heading]] = text_of_heading

In [156]:
text_blocks

{'Document_intro': "In May, we successfully launched paid sharing in 100+ countries, representing more than 80% of our revenue base. ○ Revenue in each region is now higher than pre-launch, with sign-ups already exceeding cancellations. ○ Paid net additions were 5.9M in Q2, and today we’re rolling out paid sharing to almost all of the remaining countries. Q2‘23 revenue of $8.2B and operating profit of $1.8B were generally in-line with our forecast—and we expect revenue growth to accelerate in the second half of ‘23 as we start to see the full benefits of paid sharing plus continued steady growth in our ad-supported plan. We’re still targeting a full year 2023 operating margin of 18% to 20%. We’re a leader in terms of streaming engagement and, per Nielsen, we had the top original streaming series in the US for 24 of the first 25 weeks of 2023, and the top movie for 21 weeks. While we’ve made steady progress this year, we have more work to do to reaccelerate our growth. We remain focused 

In [79]:
# Save pdf texts and headings to pickle files
# with open("pdf_texts2.pkl", "wb") as f:
#     pickle.dump(pdf_texts, f)
# with open("pdf_headings2.pkl", "wb") as f:
#     pickle.dump(pdf_headings, f)

In [27]:
# file_path = "ShareholderLetters/Investor_Letter_Q12013.pdf"
# file_path = "ShareholderLetters/FINAL-Q3-22-Shareholder-Letter.pdf"
# file_path = "ShareholderLetters/Investor-Letter-Q3-2011.pdf"
# file_path = "ShareholderLetters/July-Investor-Letter-1130am.pdf"
# file_path = "ShareholderLetters/July2014EarningsLetter_7.21.14_final.pdf"
# file_path = "ShareholderLetters/Q3_14_Letter_to_shareholders.pdf"
# file_path = "ShareholderLetters/Q4_14_Letter_to_shareholders.pdf"
# file_path = "ShareholderLetters/FINAL-Q2-23-Shareholder-Letter.pdf"
# file_path = "ShareholderLetters/FINAL-Q1-23-Shareholder-Letter.pdf"
file_path = "ShareholderLetters/Investor-Letter-Q3-2011.pdf"
# file_path = "ShareholderLetters/Investor-Letter-Q42012-01.pdf"

page_number = 0

pdf_plumber = pdfplumber.open(file_path)
plumber_page = pdf_plumber.pages[page_number]

fitz_reader = fitz.open(file_path)
fitz_page = fitz_reader.load_page(page_number)
text = fitz_page.get_text()

plumber_text = plumber_page.extract_text()
table = plumber_page.extract_tables()

if len(plumber_page.extract_tables()):
    print("Table found")
    start_table = plumber_page.extract_tables()[0][0][0].split("\n")[0]
    if not re.search(start_table, text):
        start_table = start_table.replace(" ", "")
    end_table = plumber_page.extract_tables()[-1][-1]
    end_table = [x for x in end_table if x is not None]
    end_table = end_table[-1]
    if start_table != '' and end_table != '':
        table_list = plumber_page.extract_tables()
        table_list = [item for sublist in table_list for subsublist in sublist for item in subsublist]
        table_list = [x for x in table_list if x is not None]
        table_list = " ".join(table_list)
        ratio = digit_character_ratio(table_list)
        print(ratio)
        if ratio > 0.2:
            text_2 = text.split(start_table)[0] + text.split(end_table)[-1]

Table found
0.5042918454935622


### Pdfreader is not good

### Extracting Tables

In [31]:
pdf_document = "ShareholderLetters/Investor-Letter-Q1-2012.pdf"
pdf_document = "ShareholderLetters/FINAL-Q3-22-Shareholder-Letter.pdf"

pdf_plumber = pdfplumber.open(pdf_document)
page = pdf_plumber.pages[11]
table = page.extract_tables()
if len(table):
    print("Table found")
    start_table = table[0][0][0]
    end_table = table[0][-1][-1]

Table found


In [32]:
table

[[['Revenues $ 7,925,589 $ 7,970,141 $ 7,483,467 $ 23,763,497 $ 21,988,526',
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None],
  ['Cost of revenues 4,788,665 4,690,755 4,206,589 13,764,125 12,093,108',
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None],
  ['Marketing 567,954 574,960 635,948 1,698,892 1,752,433',
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None],
  ['Technology and development 662,739 716,846 563,887 2,037,115 1,626,415',
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None],
  ['General and administrative 373,213 409,297 321,790 1,180,438 953,831',
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None],
  ['Operating income 1,533,018 1,578,283 1,755,253 5,082,927 5,562,739',
   None,
   None,
   None,
   None,
   None,
   None,
   None,
   None],
  ['Other income (expense):', None, None, None, None, None, None, None, None],
  ['Interest expense (172,575) (175,455) (190,

In [33]:
import pandas as pd

# Convert the list into a DataFrame
df = pd.DataFrame(table[1:], columns=table[0])

# Display the DataFrame
df.head(100)

Unnamed: 0_level_0,"Revenues $ 7,925,589 $ 7,970,141 $ 7,483,467 $ 23,763,497 $ 21,988,526",NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
Unnamed: 0_level_1,"Cost of revenues 4,788,665 4,690,755 4,206,589 13,764,125 12,093,108",NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
Unnamed: 0_level_2,"Marketing 567,954 574,960 635,948 1,698,892 1,752,433",NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
Unnamed: 0_level_3,"Technology and development 662,739 716,846 563,887 2,037,115 1,626,415",NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
Unnamed: 0_level_4,"General and administrative 373,213 409,297 321,790 1,180,438 953,831",NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
Unnamed: 0_level_5,"Operating income 1,533,018 1,578,283 1,755,253 5,082,927 5,562,739",NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
Unnamed: 0_level_6,Other income (expense):,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
Unnamed: 0_level_7,"Interest expense (172,575) (175,455) (190,429) (535,609) (576,191)",NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
Unnamed: 0_level_8,"Interest and other income 261,404 220,226 96,135 677,275 302,702",NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
Unnamed: 0_level_9,"Income before income taxes 1,621,847 1,623,054 1,660,959 5,224,593 5,289,250",NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
Unnamed: 0_level_10,"Provision for income taxes (223,605) (182,103) (211,888) (787,953) (780,451)",NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
Unnamed: 0_level_11,Net income,"$ 1,398,242",Unnamed: 3_level_11,"$ 1,440,951",Unnamed: 5_level_11,"$ 1,449,071",Unnamed: 7_level_11,"$ 4,436,640",Unnamed: 9_level_11
Unnamed: 0_level_12,Unnamed: 1_level_12,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
Unnamed: 0_level_13,Earnings per share:,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
Unnamed: 0_level_14,Basic $ 3.14 $ 3.24 $ 3.27 $ 9.98 $ 10.18,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
Unnamed: 0_level_15,Diluted $ 3.10 $ 3.20 $ 3.19 $ 9.83 $ 9.90,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
Unnamed: 0_level_16,Weighted-average shares of common stock outstanding:,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
Unnamed: 0_level_17,"Basic 444,878 444,557 442,778 444,529 443,052",NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
Unnamed: 0_level_18,"Diluted 450,344 450,169 454,925 451,168 455,230",NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN


In [56]:
import pdfplumber

pdf_document = "ShareholderLetters/Investor-Letter-Q1-2012.pdf"
page_number = 0  # Replace with the page number you're interested in

with pdfplumber.open(pdf_document) as pdf:
    page = pdf.pages[page_number]
    
    # Extract tables as DataFrame objects
    tables = page.extract_tables()
    
    # Process the tables
    for table in tables:
        for row in table:
            print(row)
        print("-" * 20)

['(in millions except per share data)', "Q2 '10", "Q3 '10", "Q4 '10", "Q1 '11", "Q2 '11", "Q3 '11", "Q4 '11", "Q1 '12"]
['Domestic Streaming:', '', '', '', '', '', '', '', 'G id']
['Net Subscription Additions', '-', '-', '-', '-', '-', '-', '0.22', '1.74']
['Total Subscriptions', '-', '-', '-', '-', '-', '21.45', '21.67', '23.41']
['Paid Subscriptions', '-', '-', '-', '-', '-', '20.51', '20.15', '22.02']
['Revenue', '-', '-', '-', '-', '-', '-', '$ 476', '$ 507']
['Contribution Profit', '-', '-', '-', '-', '-', '-', '$ 52', '$ 67']
['Contribution Margin', '-', '-', '-', '-', '-', '-', '10.9%', '13.2%']
['', '', '', '', '', '', '', '', '']
['International Streaming:', '', '', '', '', '', '', '', '']
['Net Subscription Additions', '-', '0.13', '0.38', '0.29', '0.16', '0.51', '0.38', '1.21']
['Total Subscriptions', '-', '0.13', '0.51', '0.80', '0.97', '1.48', '1.86', '3.07']
['Paid Subscriptions', '-', '-', '0.33', '0.67', '0.86', '0.99', '1.45', '2.41']
['Revenue', '-', '$ -', '$ 4', '$ 

In [36]:
text

" \n \n1 \nApril 23rd, 2012    \nDear Fellow Shareholders, \nNetflix added nearly 3 million streaming members in Q1, bringing our total to over 26 million global \nstreaming members, and strengthening our position as the world’s leading Internet TV network.   We \nanticipate returning to global profitability in Q2, and plan to launch our next international market in Q4.   \nWe are constantly improving our service with better personalization, better user-interfaces, better \nstreaming, and more content.  As a result, per-member viewing hours set new records in Q1 and are on \ntrack to do so again in Q2, on a year-over-year basis.  We launched our service in the UK and Ireland in \nJanuary and are very pleased that, after the first 90 days, we had substantially more members than we \nhad after the first 90 days of Canada or Latin America.       \n \n (in millions except per share data)\nQ2 '10\nQ3 '10\nQ4 '10\nQ1 '11\nQ2 '11\nQ3 '11\nQ4 '11\nQ1 '12 \nG id\n \nDomestic Streaming:\nNet Sub

In [34]:
import re
import fitz

pdf_document = "ShareholderLetters/Investor-Letter-Q1-2012.pdf"
page_number = 0  # Replace with the page number you're interested in

doc = fitz.open(pdf_document)
page = doc[page_number]

text = page.get_text()

# Define regular expressions for tabular data
row_pattern = re.compile(r'\n.*\n')  # Match rows based on newline
column_pattern = re.compile(r'\s+')  # Match columns based on whitespace

rows = row_pattern.findall(text)
table_detected = False

# Check if tabular data is detected
if len(rows) > 1:
    columns = column_pattern.split(rows[0])
    if len(columns) > 1:
        table_detected = True

if table_detected:
    print("Table detected on the page.")
else:
    print("No table detected on the page.")

Table detected on the page.


### Cleaning the data

In [2]:
# Load the language model
nlp = spacy.load("en_core_web_sm")

In [4]:
# Create a function to clean the data
def clean_data(df):
    # Create a dictionary to store the values
    new_df = {"label": [], "text": []}

    # Iterate over all rows in the dataset
    for row in tqdm(range(len(df.loc[:, "text"].to_list()))):
        # Initialize temporary array to store tokens
        tmp_tokens = []

        try:
            # # Check whether the review is written in English or not
            # if langdetect.detect(df.loc[row, "text"]) == "en" or True:
                for token in nlp(df.loc[row, "text"]):
                    # Set conditions to retain valuable information
                    if (
                        not token.is_stop  # remove stop-words
                        and not token.is_punct  # remove punctuation
                        and not token.like_num  # remove numbers
                        and token.is_oov  # remove words that don't have a word vector
                        and not token.is_space  # remove whitespaces
                        and len(token) > 1  # remove single-letter words
                        # Remove tokens that looks weird & not useful
                        and not str(token).endswith("-")
                        and not str(token).endswith(".")
                        and not any(
                            substr in str(token)
                            for substr in [
                                "---",
                                "--",
                                "/2",
                                "/1",
                                "20feb",
                                "c17",
                                "\x92",
                                "&",
                                "%",
                                "i.e.",
                                "b+",
                                "w/",
                                "02:33:05",
                            ]
                        )
                        and not str(token).startswith("-")
                    ):
                        # Get the lemma & lowercase the token
                        token = token.lemma_.lower()
                        if "(" in token:
                            token = token.split("(")
                            tmp_tokens.append(token[0])
                            tmp_tokens.append(token[1])
                        elif token == "orangy/":
                            token = "orangy"
                        elif token == ".fruity":
                            token = "fruity"

                        tmp_tokens.append(token)

                # Append the corresponding label to the review
                new_df["label"].append(df.loc[row, "label"])

                # Add all tokens from the review to the text
                new_df["text"].append(tmp_tokens)
                # Reset the token array
                tmp_tokens = []
        except:
            continue  # proceed to next row if an exception is raised
        
    # Return the new dataframe
    return pd.DataFrame(new_df)