The goal here is to write a function snapshot() that will obtain a company's quarterly earnings report from its investor relations (IR) website and output the relevant metrics with which we are concerned (a "snapshot" of the report). In many cases, these metrics will consist of a company's current quarter earnings per share (EPS), current quarter revenue, and estimates for what these metrics will be in the next quarter, known as the "guidance". For many companies, however, there are various other metrics that concern us in addition to these, or in some cases instead of them. Furthermore, across all the companies that report their quarterly earnings on their IR websites, there is very little uniformity in the way their reports are structured. Thus, we have our work cut out for us.

To start, we will try to parse the release of Netflix (NFLX). We are primarily concerned with identifying GAAP EPS and revenue along with guidance for these metrics for next quarter. For NFLX, we are also concerned with identifying net streaming adds.

The first task is to obtain the reports from the websites. In practice, we will want to have the program running maybe one minute before the expected earnings report time so that it is refreshing the page every tenth of a second or so and can have the report text the second it is released by the website. Reports are usually released as PDFs, although NVDA reports in a press release in HTML format, so we may have to account for this possibility.

Most companies structure their reports such that they consist of commentary discussing the metrics followed by a table of comprehensive metrics and numbers. We will probably want to separate the two so that they are individually parsable.


TODO:
    
    -Write the get pdf functions that will refresh on the quarterly results IR page and download the file
        - get_nflx
        - get_amzn
        - get_twtr
        - get_tsla
        - get_aapl
    -Write the table parsers for each company that will get the information we want for each company from the table and get paragraphs containing keywords
        - nflx_parser
        - amzn_parser
        - twtr_parser
        - tsla_parser
        - aapl_parser
        
    - Wrap up the notebook so that it is usable from command line

In [8]:
import itertools
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

import math
import requests
import time
import json
from bs4 import BeautifulSoup
import sys

import pdfminer
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.converter import PDFPageAggregator


#Get the PDFs

In [103]:
def get_nflx(link_dict):
    """
    Poll the Netflix IR results page until the Q3 2016 shareholder letter
    is listed, then download it to 'nflx.pdf' in the working directory.

    link_dict -- mapping of ticker -> IR page URL; only the 'NFLX' entry
                 is read.

    The page is re-requested once per second until an anchor whose text is
    exactly 'Q316 Letter to shareholders' is found inside the first
    <div class="accBody">. The call blocks until that happens and returns
    None.
    """
    link = None
    while link is None:
        page = requests.get(link_dict['NFLX'])
        soup = BeautifulSoup(page.text, 'html.parser')
        sections = soup.find_all('div', {'class': 'accBody'})
        # A response without the expected container is treated like
        # "not published yet" so the poll keeps going instead of dying
        # with an IndexError.
        docs = sections[0].find_all('a') if sections else []
        for doc in docs:
            if doc.text == 'Q316 Letter to shareholders':
                link = doc['href']
                break
        if link is None:
            time.sleep(1)

    # The href is appended to the site root as-is.
    # NOTE(review): this assumes the page uses site-relative hrefs -- confirm.
    link = 'https://ir.netflix.com/' + link
    pdfile = requests.get(link)
    with open('nflx.pdf', 'wb') as f:
        f.write(pdfile.content)

In [None]:
def get_amzn(link_dict):
    """
    Poll the Amazon IR page until the Q3 2016 financial results are
    listed, then download them to 'amzn.pdf' in the working directory.

    link_dict -- mapping of ticker -> IR page URL; only the 'AMZN' entry
                 is read.

    The page is re-requested once per second until an anchor whose text is
    exactly 'Q3 2016 Financial Results' is found inside the first
    <div class="a-section article-copy">. The call blocks until that
    happens and returns None. The anchor's href is requested unchanged.
    """
    link = None
    while link is None:
        page = requests.get(link_dict['AMZN'])
        soup = BeautifulSoup(page.text, 'html.parser')
        sections = soup.find_all('div', {'class': 'a-section article-copy'})
        # A response without the expected container is treated like
        # "not published yet" so the poll keeps going instead of dying
        # with an IndexError.
        docs = sections[0].find_all('a') if sections else []
        for doc in docs:
            if doc.text == 'Q3 2016 Financial Results':
                link = doc['href']
                break
        if link is None:
            time.sleep(1)

    pdfile = requests.get(link)
    with open('amzn.pdf', 'wb') as f:
        f.write(pdfile.content)


#Code to parse the PDFs, extract tables

In [43]:
def extract_layout_by_page(pdf_path):
    """
    Extracts LTPage objects from a pdf file.

    slightly modified from
    https://euske.github.io/pdfminer/programming.html

    pdf_path -- path of the PDF file to read.

    Returns a list with one layout object per page (the value of
    PDFPageAggregator.get_result() after each page is processed), in the
    order PDFPage.create_pages() yields the pages.

    Raises PDFTextExtractionNotAllowed when the document reports that it
    is not extractable.
    """
    laparams = LAParams()
    layouts = []

    # The parser is handed the open file object, so all page processing
    # happens inside the `with`; the handle is closed on every exit path,
    # including the exception below.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)

        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layouts.append(device.get_result())

    return layouts

# Layout container types that extract_characters() descends into when it
# collects individual LTChar objects.
TEXT_ELEMENTS = [
    pdfminer.layout.LTTextBox,
    pdfminer.layout.LTTextBoxHorizontal,
    pdfminer.layout.LTTextLine,
    pdfminer.layout.LTTextLineHorizontal
]

def flatten(lst):
    """Concatenate the items of every inner iterable into one new list (one level deep)."""
    flat = []
    for inner in lst:
        flat.extend(inner)
    return flat


def extract_characters(element):
    """
    Collect every LTChar reachable from `element`.

    An LTChar is returned wrapped in a one-item list. A plain list, or an
    instance of any type in TEXT_ELEMENTS, is walked recursively and the
    characters of its children are concatenated in iteration order.
    Anything else contributes no characters.
    """
    if isinstance(element, pdfminer.layout.LTChar):
        return [element]

    is_container = (
        isinstance(element, list)
        or isinstance(element, tuple(TEXT_ELEMENTS))
    )
    if not is_container:
        return []

    return flatten([extract_characters(child) for child in element])

def does_it_intersect(x, bounds):
    """
    Return True when `x` lies inside the closed interval `bounds`.

    x      -- value to test.
    bounds -- two-item sequence (xmin, xmax); both ends are inclusive.

    The pair is unpacked in the body rather than in the signature:
    tuple parameters are Python-2-only syntax, while this form accepts
    the same positional call on Python 2 and 3.
    """
    xmin, xmax = bounds
    return xmin <= x <= xmax

def _place_fragment(row_fragments, row_ys, row_xs, fragment, x, y, y_limit):
    """
    Merge `fragment` (a list of cells) into the first existing row whose
    recorded y is within `y_limit` of `y`, or start a new row.

    Within a row, fragments are ordered by the x recorded for each one:
    the new fragment goes before the first fragment whose x is greater
    than `x`, or at the end when there is none. All three lists are
    modified in place.
    """
    for i, row_y in enumerate(row_ys):
        if abs(row_y - y) <= y_limit:
            xs = row_xs[i]
            position = len(xs)
            for k, other_x in enumerate(xs):
                if x < other_x:
                    position = k
                    break
            row_fragments[i].insert(position, fragment)
            xs.insert(position, x)
            return

    row_fragments.append([fragment])
    row_ys.append(y)
    row_xs.append([x])


def convert_to_rows(characters):
    """
    Group a sequence of characters into table-like rows of text cells.

    characters -- iterable of objects exposing `bbox` (x0, y0, x1, y1) and
                  `get_text()`, in the order they should be read.

    Each character is located by the floored centre of its bbox and
    compared with the previous character:
      * a vertical move of more than y_limit ends the current fragment
        (run of cells), which is then merged into the row at its height;
      * otherwise a move to the right of more than x_limit ends the
        current cell and starts a new one in the same fragment;
      * otherwise the character extends the current cell.

    Fragments arriving at the height of an existing row are merged into
    that row, ordered left to right by the x of each fragment's last
    character. The fragments are kept separate until the end so that a
    fragment holding several cells still counts as a single slot when the
    insertion point is looked up.

    Returns a list of rows, each a list of cell strings. An empty list is
    inserted between consecutive rows whose recorded heights differ by
    more than paragraph_limit. An empty input yields [[""]].
    """
    x_limit = 10
    y_limit = 5
    paragraph_limit = 20

    row_fragments = []  # per row: list of fragments (each a list of cells)
    row_ys = []         # per row: y of the first fragment placed in it
    row_xs = []         # per row: one x per fragment, parallel to row_fragments

    fragment = []
    cell = ""
    prior_x = None
    prior_y = None

    for c in characters:
        c_x = math.floor((c.bbox[0] + c.bbox[2]) / 2)
        c_y = math.floor((c.bbox[1] + c.bbox[3]) / 2)

        if prior_x is not None:
            if abs(c_y - prior_y) > y_limit:
                # Line change: close the fragment and file it under the
                # position of its last character.
                fragment.append(cell)
                _place_fragment(row_fragments, row_ys, row_xs,
                                fragment, prior_x, prior_y, y_limit)
                fragment = []
                cell = ""
            elif c_x - prior_x > x_limit:
                # Horizontal gap on the same line: next cell.
                fragment.append(cell)
                cell = ""

        cell += c.get_text()
        prior_x = c_x
        prior_y = c_y

    # handle the last row
    fragment.append(cell)
    _place_fragment(row_fragments, row_ys, row_xs,
                    fragment, prior_x, prior_y, y_limit)

    rows = [
        [text for piece in pieces for text in piece]
        for pieces in row_fragments
    ]

    # insert blank rows between particularly separated lines; walking
    # backwards keeps the not-yet-visited indices valid after an insert
    for i in range(len(row_ys) - 2, -1, -1):
        if abs(row_ys[i] - row_ys[i + 1]) > paragraph_limit:
            rows.insert(i + 1, [])

    return rows

#Page Parsers 

In [100]:
def parse_pages(url, parser):
    """
    Turn a PDF into per-page rows of text cells and pass them to `parser`.

    url    -- handed straight to extract_layout_by_page(), which opens it
              as a local file path.
    parser -- callable invoked once with the list of pages; each page is
              the convert_to_rows() result for that page.

    Only top-level LTTextBoxHorizontal elements of a page contribute
    characters. The parser's return value is discarded.
    """
    pages = []
    for layout in extract_layout_by_page(url):
        # keep the horizontal text boxes, drop every other layout element
        text_boxes = [
            item for item in layout
            if isinstance(item, pdfminer.layout.LTTextBoxHorizontal)
        ]
        pages.append(convert_to_rows(extract_characters(text_boxes)))

    parser(pages)
    
def nflx_parser(pages):
    """
    Print revenue and basic EPS from the first page whose second row
    starts with the cell "Consolidated Statements of Operations ".

    pages -- list of pages, each a list of rows, each a list of cell
             strings.

    On that page, every row labelled "Revenues" seen before the first row
    labelled "Basic" is printed, then the "Basic" row is printed and the
    scan stops. The value is taken from the third cell of the row. Later
    pages are never examined. Prints nothing when no page matches.
    """
    statement_title = "Consolidated Statements of Operations "

    for page in pages:
        if len(page) <= 1 or not page[1] or page[1][0] != statement_title:
            continue

        for row in page:
            if not row:
                continue
            label = row[0]
            if label == "Revenues":
                print("Revenue: " + row[2])
            elif label == "Basic":
                # we want the first Basic in the table
                print("Basic EPS: " + row[2])
                break

        # only the first matching page is used
        break


In [None]:
def amzn_parser(pages):
    """
    Print revenue and basic EPS from the first page that mentions
    "Consolidated Statements of Operations".

    pages -- list of pages, each a list of rows, each a list of cell
             strings. May be empty.

    The cells of a page are flattened into one list. Revenue is the cell
    right after 'Total net sales ' and basic EPS is the cell two positions
    after 'Basic earnings per share '. Prints "No Data" when no page
    mentions the statement, including when `pages` is empty.

    Raises ValueError if the matching page lacks either label, and
    IndexError if the expected cell after a label is missing.
    """
    # Pages are walked with a loop rather than by recursing on slices, so
    # an empty list no longer fails on pages[0].
    for page in pages:
        flattened = list(itertools.chain.from_iterable(page))
        if any("Consolidated Statements of Operations" in s for s in flattened):
            revenue_idx = flattened.index('Total net sales ') + 1
            print("Revenue: " + flattened[revenue_idx])
            eps_idx = flattened.index('Basic earnings per share ') + 2
            print("Basic EPS: " + flattened[eps_idx])
            return

    print("No Data")

In [102]:
# Tickers and their investor-relations landing pages, paired by position.
tix = ['TWTR', 'TSLA', 'NFLX', 'AMZN']
links = ['https://investor.twitterinc.com/index.cfm', 'http://ir.tesla.com/', 'https://ir.netflix.com/results.cfm' ,'http://phx.corporate-ir.net/phoenix.zhtml?c=97664&p=irol-reportsOther']

# ticker -> IR page URL; the get_* downloaders look up their own ticker here.
ir_dict = dict(zip(tix, links))
        
# End-to-end run for Netflix: get_nflx() polls the IR page and writes
# 'nflx.pdf', then the PDF is parsed and nflx_parser prints the metrics.
get_nflx(ir_dict)
parse_pages('nflx.pdf', nflx_parser)

Revenue: 2,290,188
Basic EPS: 0.12
