In [1]:
import pandas as pd
pd.options.display.float_format = '{:,.4f}'.format
import numpy as np
import datetime as dt
from bs4 import BeautifulSoup as bs
import requests
import time
from sqlalchemy import *
from sqlalchemy import create_engine
import sqlite3 as sql
import operator
import re
import os
# Directory holding the SQLite databases used by this notebook. The location can
# be overridden with the SEC_DB_DIR environment variable so the notebook is not
# tied to one machine; the original hard-coded path remains the default.
DB_DIR = os.environ.get('SEC_DB_DIR', 'C:\\Users\\Fang\\Desktop\\Python Trading\\Trading\\Data\\DBs')
os.chdir(DB_DIR)

In [2]:
def latest_sec_db_links(formtype, year, qtr, engine):
    """Load the stored filing links for one form type, year and quarter.

    Parameters
    ----------
    formtype : str
        One of 'F4', '10Q' or '10K'; selects the table that is queried.
    year, qtr : int or int-like
        Values matched against the table's ``Year`` and ``Quarter`` columns.
        They are coerced with ``int()`` and bound as query parameters rather
        than being formatted into the SQL text.
    engine
        Connection object handed to ``pandas.read_sql_query`` as ``con``.

    Returns
    -------
    pandas.DataFrame or None
        The matching rows with ``DateFiled`` converted to datetimes and a
        fresh 0..n-1 index, or ``None`` (after printing a hint) when
        ``formtype`` is not recognised.

    Raises
    ------
    ValueError / TypeError
        If ``year`` or ``qtr`` cannot be converted to an integer.
    """
    # NOTE(review): another cell in this notebook reads from a table named
    # 'Links10Qs' -- confirm whether 'Links10Q' below is the intended name.
    table_names = {
        'F4': 'F4Links',
        '10Q': 'Links10Q',
        '10K': 'Links10K',
    }
    table_name = table_names.get(formtype)
    if table_name is None:
        print('formtypes are F4 10Q and 10K')
        return

    # The table name comes from the fixed mapping above, so formatting it in is
    # safe; the caller-supplied values go through '?' placeholders (SQLite's
    # qmark parameter style). int() also turns numpy integers into plain ints,
    # which the sqlite3 driver can bind.
    query = 'SELECT * FROM {0} WHERE Year = ? AND Quarter = ?'.format(table_name)

    db_table = pd.read_sql_query(
        query,
        con=engine,
        index_col='idx',
        params=(int(year), int(qtr)),
    )

    db_table['DateFiled'] = pd.to_datetime(db_table['DateFiled'])

    return db_table.reset_index(drop=True)


def fin_statement_search(text_blob):
    """Return True when the text mentions any financial-statement heading.

    The comparison is case-insensitive and looks for these phrases:
    'statements of operations', 'balance sheets', 'statements of cash flows'
    and 'statements of stockholders'.
    """
    lowered = text_blob.lower()
    headings = (
        'statements of operations',
        'balance sheets',
        'statements of cash flows',
        'statements of stockholders',
    )
    return any(heading in lowered for heading in headings)


In [3]:
def retrieve_statements(sec_url, timeout=30):
    """Download a filing and pick out the pages holding its financial statements.

    The response text is cut at every ``<hr`` occurrence and each slice between
    two consecutive occurrences is parsed as one page. A page is assigned to a
    statement type by the first ``<b>`` title on it that matches a known
    heading; when several pages match the same type, the last one wins.

    Parameters
    ----------
    sec_url : str
        URL of the filing document.
    timeout : float, optional
        Seconds passed to ``requests.get`` so a stalled connection cannot
        hang the notebook (default 30).

    Returns
    -------
    tuple
        ``(balance_sheet, income_statement, cashflow_statement,
        stockholders_equity)`` when a stockholders' equity page was found,
        otherwise ``(balance_sheet, income_statement, cashflow_statement)``.

    Raises
    ------
    ValueError
        If the balance sheet, income statement or cash-flow statement page
        could not be located.
    """
    # Imported locally under its full name so the function does not rely on the
    # module-level alias `bs`, which the notebook rebinds when it unpacks this
    # function's result (`bs, ins, cf, se = ...`).
    from bs4 import BeautifulSoup

    start_time = time.time()

    # Checked in this order; the first group with a matching phrase decides.
    statement_markers = (
        ('balancesheet',
         ('balancesheets', 'balance sheets', 'statements of financial conditions')),
        ('incomestatement',
         ('statements of operations', 'statements ofoperations',
          'statements of income', 'statements ofincome')),
        ('stockholdersequity',
         ('statements of stockholders', 'statements ofstockholders')),
        ('cashflowstatement',
         ('statements of cash flows', 'statements of cashflows',
          'statements ofcash flows', 'statements ofcashflows')),
    )

    def check_statement(b):
        """Map a lower-cased title to its statement key, or '' if unrecognised."""
        for statement_key, phrases in statement_markers:
            if any(phrase in b for phrase in phrases):
                return statement_key
        return ''

    sec_req = requests.get(sec_url, timeout=timeout).text

    # The match is case-sensitive, and text before the first or after the last
    # '<hr' is not examined.
    all_hrs = [m.start() for m in re.finditer('<hr', sec_req)]
    sec_pages = [
        BeautifulSoup(sec_req[page_start:page_end], 'lxml')
        for page_start, page_end in zip(all_hrs, all_hrs[1:])
    ]

    found = {}
    for page in sec_pages:
        titles = [x.text.lower().strip().replace('\n', '') for x in page.find_all('b')]
        for title in titles:
            statement_key = check_statement(title)
            if statement_key:
                # Later pages overwrite earlier ones of the same type.
                found[statement_key] = page
                break

    print('Completed in {} seconds'.format(time.time() - start_time))

    required = ('balancesheet', 'incomestatement', 'cashflowstatement')
    missing = [key for key in required if key not in found]
    if missing:
        raise ValueError(
            'Could not locate statement page(s) {} in {}'.format(', '.join(missing), sec_url)
        )

    statements = tuple(found[key] for key in required)
    if 'stockholdersequity' in found:
        statements += (found['stockholdersequity'],)
    return statements

# Unit keywords looked for in a statement's bold text, in priority order.
_UNIT_MULTIPLIERS = (
    ('hundreds', 10**2),
    ('thousands', 10**3),
    ('millions', 10**6),
)


def _statement_multiplier(headings):
    """Return the unit multiplier named by the first heading that mentions one, else 1."""
    for heading in headings:
        for keyword, factor in _UNIT_MULTIPLIERS:
            if keyword in heading:
                return factor
    return 1


def _statement_number(cell_text):
    """Convert one cleaned table cell to a float.

    A cell made only of dashes is a zero placeholder; a leading '(' or '-'
    marks a negative amount; thousands separators are dropped.
    """
    cleaned = cell_text.replace(',', '').strip()
    if cleaned and not cleaned.strip('-'):
        return 0.0
    negative = cleaned.startswith(('(', '-'))
    value = float(cleaned.lstrip('(-').rstrip(')'))
    return -value if negative else value


def _clean_cell(cell):
    """Normalise a <td>: lower-case, no '$', single spaces, cp1252 dash/quote fixed."""
    text = cell.text.strip().lower().replace('$', '').replace('\n', ' ')
    return re.sub(' +', ' ', text).replace('\x97', '-').replace('\x92', '')


def parse_statement_table(curr_statement, cik, filedate):
    """Extract the most recent period's figures from a statement page.

    Parameters
    ----------
    curr_statement
        Parsed page exposing ``find_all('b')`` and ``find('table')`` (a
        BeautifulSoup object in this notebook).
    cik
        Company identifier stored in the ``CIK`` column.
    filedate
        Filing date stored in the ``DateFiled`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed by the line-item label (first cell of each row) with columns
        ``values`` (amount times the unit multiplier), ``endingQuarter`` (the
        latest date among the chosen period's header dates), ``CIK`` and
        ``DateFiled``.

    Raises
    ------
    ValueError
        If the page has no table, no usable rows, no header row of
        ``'%B %d, %Y'`` dates, or a full-width row holds a non-numeric value.

    Notes
    -----
    Only rows as wide as the widest row are kept as data rows. When a header
    row mentions 'three month', the three-month columns are chosen; this
    assumes every period in that header spans two date columns.
    """
    start_time = time.time()

    multiplier = _statement_multiplier(
        tag.text.strip().lower() for tag in curr_statement.find_all('b')
    )

    table = curr_statement.find('table')
    if table is None:
        raise ValueError('statement page contains no table')

    row_widths = []      # width of every kept row, date headers included
    text_rows = []       # rows that are not a pure date header
    period_header = []   # last row that mentions 'three month', if any
    dates = None         # last row whose cells all parse as dates

    for row in table.find_all('tr'):
        columns = [_clean_cell(cell) for cell in row.find_all('td')]
        columns = [col for col in columns if col not in ('', ')', '(')]
        if len(columns) <= 1:
            continue

        if any('three month' in col for col in columns):
            period_header = columns

        row_widths.append(len(columns))
        try:
            dates = [dt.datetime.strptime(col, '%B %d, %Y').date() for col in columns]
        except ValueError:
            text_rows.append(columns)

    if not row_widths:
        raise ValueError('statement table has no rows with more than one populated cell')
    if dates is None:
        raise ValueError("statement table has no header row of dates like 'June 30, 2003'")

    frame_width = max(row_widths)

    # Collect all rows first and build the frame once; a repeated label keeps
    # its first position and takes the last values seen.
    values_by_label = {}
    for row in text_rows:
        if len(row) == frame_width:
            values_by_label[row[0]] = [_statement_number(x) * multiplier for x in row[1:]]
    curr_statement_df = pd.DataFrame.from_dict(
        values_by_label, orient='index', columns=list(range(frame_width - 1))
    )

    # Restrict the candidate dates to the three-month period when present.
    offset = 0
    candidates = dates
    if period_header:
        if "three months ended" in period_header:
            period_idx = period_header.index("three months ended")
        else:
            period_idx = next(
                i for i, col in enumerate(period_header) if 'three month' in col
            )
        offset = 2 * period_idx
        candidates = dates[offset:offset + 2]
    if not candidates:
        raise ValueError('no date columns found for the three-month period')

    # The winning position is relative to `candidates`; add the offset back so
    # it addresses the matching value column of the full frame.
    local_idx, value = max(enumerate(candidates), key=operator.itemgetter(1))
    df_column = offset + local_idx

    curr_statement_df = curr_statement_df[[df_column]].copy()
    curr_statement_df.columns = ['values']
    curr_statement_df['endingQuarter'] = value
    curr_statement_df['CIK'] = cik
    curr_statement_df['DateFiled'] = filedate

    print('Completed in {} seconds'.format(time.time() - start_time))
    return curr_statement_df

In [4]:
# Read the whole 10-Q links table from the local SQLite database, renumber the
# rows from zero and convert the filing dates to datetimes.
sec_engine = create_engine('sqlite:///SEC_htm.db', echo=False)
table_name = 'Links10Qs'
query = 'SELECT * FROM {0}'.format(table_name)
latest_db_10q = (
    pd.read_sql_query(query, con=sec_engine, index_col='idx')
    .reset_index(drop=True)
    .assign(DateFiled=lambda links: pd.to_datetime(links['DateFiled']))
)

In [8]:
# Row of the links table to process.
idx = 3873
# Example filing URL kept for reference:
# 'https://www.sec.gov/Archives/edgar/data/1001171/000110465918068475/a18-19054_110q.htm'
curr_filing = latest_db_10q.loc[idx, :]
sec_url = curr_filing['Filename']
curr_cik = curr_filing['CIK']
file_date = curr_filing['DateFiled'].date()

# retrieve_statements returns three pages, or four when a stockholders' equity
# statement was found. Download once and unpack by length instead of calling it
# a second time from a bare `except`; `se` is reset to None so a value left
# over from a previously processed filing can never be reused.
# NOTE: `bs` rebinds the BeautifulSoup alias imported at the top of the
# notebook; the names are kept because the parsing cell below reads
# bs / ins / cf / se.
statement_pages = retrieve_statements(sec_url)
bs, ins, cf = statement_pages[:3]
se = statement_pages[3] if len(statement_pages) > 3 else None

Completed in 0.9449155330657959 seconds


In [10]:
def _parse_or_none(get_page, label):
    """Parse one statement page; report the failure and return None on error.

    `get_page` is a zero-argument callable so that even a missing page variable
    (NameError) is handled here rather than aborting the cell.
    """
    try:
        return parse_statement_table(get_page(), curr_cik, file_date)
    except Exception as err:
        # Best-effort by design: one unparseable statement must not prevent the
        # others from being parsed, but the reason is no longer hidden.
        print('Could not parse {}: {!r}'.format(label, err))
        return None


# Every result is always (re)assigned, so a failed parse yields None instead of
# leaving a value from an earlier run in place.
curr_balance_sheet = _parse_or_none(lambda: bs, 'balance sheet')
curr_income_statement = _parse_or_none(lambda: ins, 'income statement')
curr_cashflows = _parse_or_none(lambda: cf, 'cash flow statement')
curr_stockequity = _parse_or_none(lambda: se, 'stockholders equity statement')

Completed in 0.04188823699951172 seconds
Completed in 0.04434037208557129 seconds
Completed in 0.04716038703918457 seconds
Completed in 0.02859973907470703 seconds


In [14]:
# Show the parsed stockholders' equity statement as this cell's output.
curr_stockequity

Unnamed: 0,2003-06-30
"balance, beginning of period",-4285000.0
common stock issued,105034000.0
"balance, end of period",4599000.0
compensation expense on option grants,891000.0
tax benefit on stock options,30292000.0
other,0.0
net income (loss),50828000.0
net unrealized gains (losses) on securities,2593000.0
foreign currency translation adjustments,6291000.0
total stockholders equity,2556833000.0
