In [1]:
# Importing necessary modules
import warnings
warnings.filterwarnings('ignore')

import smtplib
import pandas as pd
import numpy as np
import datetime as dt
import pandas.stats.moments as st
import time
%matplotlib inline
from bs4 import BeautifulSoup as bs
import requests
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from sqlalchemy import *
from sqlalchemy import create_engine
import calendar
import sqlite3 as sql
pd.options.display.float_format = '{:,.4f}'.format

init_notebook_mode(connected=True)

import os
# Remember the directory the notebook was started from so it can be restored
# once the project modules have been imported.
main_dir = os.getcwd()

dbs_dir = 'C:\\Users\\Fang\\Desktop\\Python Trading\\Trading\\Data\\DBs'

# Project modules are imported by changing into each module folder first.
# NOTE(review): presumably this relies on the current working directory being
# on the import path -- confirm, or add the folders to sys.path instead.
os.chdir('C:\\Users\\Fang\\Desktop\\Python Trading\\Trading\\Trading\\Modules\\DataCollection')

from alphavantage import *
from yahoo_query import *
from option_slam_earnings import *
from reuters_query import reuters_query, reuters_insiders
from alphaquery import alphaquery

os.chdir('C:\\Users\\Fang\\Desktop\\Python Trading\\Trading\\Trading\\Modules\\Options')
from optionsFunctions import *

os.chdir('C:\\Users\\Fang\\Desktop\\Python Trading\\Trading\\Trading\\Modules\\Earnings')
from yahoo_earnings import *
from lookup_earnings import lookup_earnings

os.chdir('C:\\Users\\Fang\\Desktop\\Python Trading\\Trading\\Trading\\Modules\\Processing')
from hv_calc import *

# Back to the launch directory now that the imports are done.
os.chdir(main_dir)

# Historical price database. The SQLite URL is relative.
# NOTE(review): presumably it is resolved against the working directory,
# hence the chdir immediately before create_engine -- confirm.
prices_dir = 'D:\\Price Data'
os.chdir(prices_dir)
prices_engine = create_engine('sqlite:///histprices.db', echo=False)


# Same value as the dbs_dir assignment earlier in this cell.
dbs_dir = 'C:\\Users\\Fang\\Desktop\\Python Trading\\Trading\\Data\\DBs'
spx_const_dir = dbs_dir + '\\spx_constituents'
# NOTE(review): quanto_dir is not referenced in the visible part of the notebook.
quanto_dir = dbs_dir + '\\quantopian'
os.chdir(spx_const_dir)
# Constituent snapshot; the first CSV column becomes the index, which later
# cells use as the list of tickers.
spx_const = pd.read_csv('spx_constituents-2019-01-15.csv', index_col = 0)

# The working directory is left at dbs_dir: the engines below use relative
# SQLite URLs and later cells read/write CSV files by relative name.
os.chdir(dbs_dir)

earn_engine = create_engine('sqlite:///earningsHistory.db', echo=False)
yahoo_engine = create_engine('sqlite:///yahoo.db', echo = False)
reuters_engine = create_engine('sqlite:///reuters.db', echo=False)
# sec_engine = create_engine('sqlite:///SEC_txt.db', echo=False)



## Initial Tables

In [2]:
# Build `profiles` (industry and sector per constituent) from every Yahoo
# table whose name contains 'profiles'. If several tables match, the one
# visited last is the one left in `profiles`.
for yahoo_table in yahoo_engine.table_names():
    if 'profiles' not in yahoo_table:
        continue

    query = 'SELECT * FROM {0} WHERE Underlying IN {1}'
    curr_table = pd.read_sql_query(query.format(yahoo_table, str(tuple(spx_const.index.tolist()))),
                                   con = yahoo_engine,
                                   index_col = 'Underlying')[['industry', 'sector']]
    # A ticker can appear more than once; keep only its last row.
    curr_table = curr_table[~curr_table.index.duplicated(keep='last')]
    profiles = curr_table

## Financial Ratios
The current ratio measures a company’s ability to pay off short-term liabilities with current assets:
    - Current ratio = 'totalCurrentAssets'/'totalCurrentLiabilities'
The acid-test ratio measures a company’s ability to pay off short-term liabilities with quick assets:
    - Acid-test ratio = ('totalCurrentAssets' – 'inventory')/'totalCurrentLiabilities'
The cash ratio measures a company’s ability to pay off short-term liabilities with cash and cash equivalents:
    - Cash ratio = 'cash'/'totalCurrentLiabilities'
The operating cash flow ratio is a measure of the number of times a company can pay off current liabilities with the cash generated in a given period:
    - Operating cash flow ratio = 'totalCashFromOperatingActivities'/'totalCurrentLiabilities'
The debt ratio measures the amount of a company’s assets that are provided from debt:
    - Debt ratio = 'totalLiab'/'totalAssets'
The debt to equity ratio calculates the weight of total debt and financial liabilities against shareholders equity:
    - Debt to equity ratio = 'totalLiab'/'totalStockholderEquity'
The interest coverage ratio determines how easily a company can pay its interest expenses:
    - Interest coverage ratio = 'operatingIncome'/'interestExpense'
The asset turnover ratio measures a company’s ability to generate sales from assets:
    - Asset turnover ratio = 'totalRevenue'/'totalAssets'
The gross margin ratio compares the gross profit of a company to its net sales to show how much profit a company makes after paying off its cost of goods sold:
    - Gross margin ratio = 'grossProfit'/'totalRevenue'
The operating margin ratio compares the operating income of a company to its net sales to determine operating efficiency:
    - Operating margin ratio = 'operatingIncome'/'totalRevenue'
The profit margin ratio compares the net income of a company to its net sales:
    - Profit Margin ratio = 'netIncome'/'totalRevenue'
The return on assets ratio measures how efficiently a company is using its assets to generate profit:
    - Return on assets ratio = 'operatingIncome'/'totalAssets'
The return on equity ratio measures how efficiently a company is using its equity to generate profit:
    - Return on equity ratio = 'operatingIncome'/'totalStockholderEquity'
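The price-earnings ratio compares a company’s share price to the earnings per share (listed for reference only; `fin_factors` below does not compute it).
The following cash-flow measures are also derived:
    - Free Cash Flow = 'totalCashFromOperatingActivities' - 'capitalExpenditures' (implemented in `fin_factors` as a sum, which presumes 'capitalExpenditures' is stored as a negative number)
    - Net Issuance/Repurchase of Stock: 'netStockSales' = -('issuanceOfStock' + 'repurchaseOfStock'), with missing values treated as 0
    - Repurchases to Revenue = 'netStockSales'/'totalRevenue'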

In [3]:
# Every raw statement field that feeds a ratio.
# 'totalCashFromOperatingActivities' is listed twice, as in the original list.
raw_statement_fields = [
    'totalCurrentAssets',
    'totalCurrentLiabilities',
    'inventory',
    'cash',
    'totalCashFromOperatingActivities',
    'totalLiab',
    'totalAssets',
    'totalStockholderEquity',
    'operatingIncome',
    'interestExpense',
    'totalRevenue',
    'grossProfit',
    'totalCashFromOperatingActivities',
    'capitalExpenditures',
    'netIncome',
]

# Columns requested from the balance-sheet tables.
balanceSheet_fields = [
    'cash',
    'inventory',
    'totalAssets',
    'totalCurrentAssets',
    'totalCurrentLiabilities',
    'totalLiab',
    'totalStockholderEquity',
]

# Columns requested from the income-statement tables.
incomeStatement_fields = [
    'grossProfit',
    'interestExpense',
    'netIncome',
    'operatingIncome',
    'totalRevenue',
]

# Columns requested from the cash-flow tables.
cashFlow_fields = [
    'capitalExpenditures',
    'totalCashFromOperatingActivities',
    'repurchaseOfStock',
    'issuanceOfStock',
    'netBorrowings',
    'dividendsPaid',
]


In [4]:
def pull_statement(ticker, statement_name, fields, period):
    """Load the most recent rows of one financial statement for a ticker.

    Parameters
    ----------
    ticker : str
        Value matched against the table's ``Underlying`` column.
    statement_name : str
        Name of the table to read through ``yahoo_engine``.
    fields : list of str
        Columns to select in addition to ``period`` and ``Underlying``.
    period : str
        Name of the date column used for parsing and ordering.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Underlying``, de-duplicated, with ``period`` parsed to
        datetimes and every other column except ``maxAge`` converted with
        ``pd.to_numeric``; sorted by ``period`` and limited to the last
        four rows.

    Raises
    ------
    KeyError
        If the query returns no rows for ``ticker``.
    """
    columns = ', '.join([period, 'Underlying'] + fields)
    # Identifiers (column and table names) cannot be bound, so they are still
    # formatted into the statement. The ticker is bound as a parameter: a
    # double-quoted value is treated by SQLite as an identifier first, so a
    # ticker that happens to equal a column name would be compared as a column.
    query = 'SELECT {0} FROM {1} WHERE Underlying = ?'.format(columns, statement_name)

    statement = pd.read_sql_query(query, con = yahoo_engine,
                                  index_col = 'Underlying',
                                  params = (ticker,)).drop_duplicates()

    if statement.empty:
        # Previously surfaced as an incidental KeyError from a first-row lookup.
        raise KeyError('no rows in {0} for {1}'.format(statement_name, ticker))

    statement[period] = pd.to_datetime(statement[period])

    for col in statement.columns:
        # Leave date columns and maxAge untouched; everything else is numeric.
        if col != 'maxAge' and not pd.api.types.is_datetime64_any_dtype(statement[col]):
            statement[col] = pd.to_numeric(statement[col])

    return statement.sort_values(period).tail(4)

def fin_factors(ticker, period, statement_period):
    """Combine the three statements for a ticker and append derived ratios.

    The balance sheet, income statement and cash flow are each read with
    ``pull_statement`` from the table named ``statement_period`` plus the
    statement suffix, merged on ``period`` and indexed by it.

    Parameters
    ----------
    ticker : str
        Passed through to ``pull_statement``.
    period : str
        Date column the statements are merged on and indexed by.
    statement_period : str
        Table-name prefix (the notebook uses 'annual' and 'quarterly').

    Returns
    -------
    pandas.DataFrame
        The raw statement fields followed by ``freeCashFlow``,
        ``netStockSales`` and the ``*_ratio`` columns.
    """
    balance = pull_statement(ticker, '{}BalanceSheet'.format(statement_period), balanceSheet_fields, period)
    incomes = pull_statement(ticker, '{}IncomeStatement'.format(statement_period), incomeStatement_fields, period)
    cflows = pull_statement(ticker, '{}CashFlow'.format(statement_period), cashFlow_fields, period)

    # Missing stock issuance / repurchase figures count as zero.
    for stock_flow in ('issuanceOfStock', 'repurchaseOfStock'):
        cflows[stock_flow] = cflows[stock_flow].fillna(0)

    fins = balance.merge(incomes, on = period)
    fins = fins.merge(cflows, on = period)
    fins = fins.set_index(period)

    fins['freeCashFlow'] = fins['totalCashFromOperatingActivities'] + fins['capitalExpenditures']
    fins['netStockSales'] = (fins['issuanceOfStock'] + fins['repurchaseOfStock'])*-1

    fins['current_ratio'] = fins['totalCurrentAssets']/fins['totalCurrentLiabilities']
    fins['acidTest_ratio'] = (fins['totalCurrentAssets'] - fins['inventory'])/fins['totalCurrentLiabilities']

    # (new column, numerator, denominator), in output column order.
    simple_ratios = (
        ('cash_ratio', 'cash', 'totalCurrentLiabilities'),
        ('operatingCashFlow_ratio', 'totalCashFromOperatingActivities', 'totalCurrentLiabilities'),
        ('debt_ratio', 'totalLiab', 'totalAssets'),
        ('debtToEquity_ratio', 'totalLiab', 'totalStockholderEquity'),
        ('interestCoverage_ratio', 'operatingIncome', 'interestExpense'),
        ('assetTurnover_ratio', 'totalRevenue', 'totalAssets'),
        ('grossMargin_ratio', 'grossProfit', 'totalRevenue'),
        ('operatingMargin_ratio', 'operatingIncome', 'totalRevenue'),
        ('profitMargin_ratio', 'netIncome', 'totalRevenue'),
        ('returnOnAssets_ratio', 'operatingIncome', 'totalAssets'),
        ('returnOnEquity_ratio', 'operatingIncome', 'totalStockholderEquity'),
        ('repurchasesToRevenue_ratio', 'netStockSales', 'totalRevenue'),
    )
    for ratio_name, numerator, denominator in simple_ratios:
        fins[ratio_name] = fins[numerator]/fins[denominator]

    return fins

def annual_factors(ticker):
    """Return annual ratio levels together with their percentage changes.

    The result starts with the ``*_ratio`` columns of
    ``fin_factors(ticker, 'year', 'annual')``, then joins the year-over-year
    percentage change of every column, then the compounded change since the
    first available year.

    Column naming follows ``DataFrame.join``: a suffix is only added to names
    that collide. Ratio changes therefore end in ``change1yr`` /
    ``changeallyrs``, whereas for a non-ratio field the bare name holds its
    one-year change and only the compounded change carries ``changeallyrs``.
    """
    fins = fin_factors(ticker, 'year', 'annual')

    yearly_change = fins.pct_change()
    compounded_change = (yearly_change + 1).cumprod() - 1

    ratio_columns = [name for name in fins.columns.tolist() if 'ratio' in name]

    with_yearly = fins[ratio_columns].join(yearly_change, rsuffix='change1yr')
    return with_yearly.join(compounded_change, rsuffix='changeallyrs')


def quarter_factors(ticker):
    """Return quarterly factors and the matched earnings history for a ticker.

    Quarterly fundamentals come from ``fin_factors``. Rows of
    ``quarterlyEarnings`` (via ``yahoo_engine``) are matched to the nearest
    ``postEarningsReturns`` row (via ``earn_engine``) within 90 days, and the
    matched ``epsActual`` / ``surprisePercent`` are attached to the
    fundamentals the same way.

    Parameters
    ----------
    ticker : str
        Value matched against the ``Underlying`` column of both tables.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        * The ``*_ratio`` columns joined with the quarter-over-quarter
          percentage change of every column and the compounded change of every
          column except ``surprisePercent``. Only colliding names receive the
          ``change1qtr`` / ``changeallqtrs`` suffix.
        * The earnings frame indexed by ``quarter`` with the earnings date,
          close-to-open return, beta and 52-week return columns. Quarters
          without an earnings date within the tolerance keep missing values.
    """
    period = 'quarter'
    tol = pd.Timedelta('90 day')

    fins = fin_factors(ticker, period, 'quarterly').sort_index()

    # The ticker is bound as a parameter: inside double quotes SQLite would
    # first try to resolve it as a column name.
    earnings_hist = pd.read_sql_query('SELECT * FROM postEarningsReturns WHERE Underlying = ?',
                                      con = earn_engine, index_col = 'Underlying',
                                      params = (ticker,)).drop_duplicates().fillna(np.nan)
    earnings_hist['earningsDate'] = pd.to_datetime(earnings_hist['earningsDate'])
    earnings_hist = earnings_hist.sort_values('earningsDate')

    quarterly_earnings = pd.read_sql_query('SELECT * FROM quarterlyEarnings WHERE Underlying = ?',
                                           con = yahoo_engine, index_col = 'Underlying',
                                           params = (ticker,)).drop_duplicates()
    quarterly_earnings['quarter'] = pd.to_datetime(quarterly_earnings['quarter'])
    quarterly_earnings = quarterly_earnings.sort_values('quarter')

    earnings_info = pd.merge_asof(left=quarterly_earnings,right=earnings_hist,
                                  left_on = ['quarter'], right_on = ['earningsDate'],
                                  direction='nearest',tolerance=tol).set_index('quarter')

    # Quarters that found no earnings date within the tolerance carry a missing
    # earningsDate. merge_asof does not accept null keys, so they are left out
    # of the right-hand side of the second match.
    matched_earnings = earnings_info[['earningsDate','epsActual','surprisePercent']].dropna(subset=['earningsDate'])

    fins = pd.merge_asof(left=fins.reset_index(),right=matched_earnings,
                         left_on = ['quarter'], right_on = ['earningsDate'],
                         direction='nearest',tolerance=tol).set_index('quarter')

    del fins['earningsDate']

    one_quarter_changes = fins.pct_change()
    cumulative_columns = [name for name in fins.columns.tolist() if name != 'surprisePercent']
    cumulative_changes = (fins[cumulative_columns].pct_change() + 1).cumprod() - 1

    ratio_columns = [name for name in fins.columns.tolist() if 'ratio' in name]
    factors = fins[ratio_columns].join(one_quarter_changes,
                                       rsuffix='change1qtr').join(cumulative_changes,
                                                                  rsuffix='changeallqtrs')

    return factors, earnings_info[['earningsDate','closeToOpenReturn','industryBeta','marketBeta','stock52WeekReturn','market52WeekReturn','industry52WeekReturn']]

def create_data_row(ticker):
    """Build the single-row feature frame for a ticker's latest quarter.

    Each quarter in the earnings frame from ``quarter_factors`` is paired with
    the latest annual-factor date that lies more than 90 days before it, the
    annual factors for that date are merged in, and the quarterly factors of
    the previous quarter (``shift(1)``) are joined on. Only the last resulting
    row is kept; it is re-indexed by ``ticker`` and joined with ``profiles``.

    Quarters that have no annual statement old enough are skipped rather than
    aborting the whole ticker, since only the most recent row is returned.

    Parameters
    ----------
    ticker : str
        Passed to ``annual_factors`` and ``quarter_factors``; also the index
        label of the returned row.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``ticker``.

    Raises
    ------
    ValueError
        If no quarter can be paired with annual factors, so there is no last
        row to label (raised by the index assignment).
    """
    annualFactors = annual_factors(ticker)

    quarterFactors, earnings_info = quarter_factors(ticker)

    # Work on a copy so the frame handed back by quarter_factors is not modified.
    earnings_info = earnings_info.copy()

    lag = dt.timedelta(days = 90)
    annual_dates = annualFactors.index.tolist()
    earnings_info['year'] = [
        max((annual_date for annual_date in annual_dates if annual_date < quarter_end - lag),
            default = pd.NaT)
        for quarter_end in earnings_info.index
    ]
    # No sufficiently old annual statement: nothing to merge for that quarter.
    earnings_info = earnings_info.dropna(subset = ['year'])

    factor_df = earnings_info.reset_index().merge(annualFactors.reset_index(), on = 'year').set_index('quarter')

    # shift(1): the quarterly factors used are those of the preceding quarter.
    factor_df = factor_df.join(quarterFactors.shift(1),lsuffix = '_y', rsuffix = '_q')

    del factor_df['year']

    test_row = factor_df.tail(1).copy()
    test_row.index = [ticker]
    test_row = test_row.join(profiles)

    return test_row


## Yahoo Database Pull

In [5]:
start_time = time.time()

df = []
# ticker -> repr of the exception that prevented its row from being built
failed_rows = {}

for ticker in profiles.index:
    try:
        curr_point = create_data_row(ticker)
        df.append(curr_point)
    except Exception as err:
        # Best effort: skip tickers whose row cannot be built, but keep the
        # reason. `except Exception` leaves KeyboardInterrupt free to stop the loop.
        failed_rows[ticker] = repr(err)

print("--- %s seconds ---" % (time.time() - start_time))
# Report how many rows were built so silent, wholesale failure is visible.
print("--- %s rows built, %s tickers failed ---" % (len(df), len(failed_rows)))

--- 0.33911943435668945 seconds ---


In [6]:
# df = pd.concat(df, axis = 0)
# df.to_csv('earnings_oos.csv')
# df

In [7]:
latest_spx_annuals = []
latest_spx_quarters = []
failed_cons = []
# constituent -> repr of the exception that put it into failed_cons
failed_cons_reasons = {}

# Collect annual and quarterly fundamentals (plus industry/sector) for every
# constituent. A constituent whose annual pull succeeds but whose quarterly
# pull fails still contributes its annual rows and is also listed as failed.
for constituent in spx_const.index:

    try:
        curr_annual = fin_factors(constituent, 'year', 'annual')
        curr_annual['Ticker'] = constituent
        curr_annual = curr_annual.reset_index().set_index('Ticker').join(profiles)
        latest_spx_annuals.append(curr_annual)

        curr_quarterly = fin_factors(constituent, 'quarter', 'quarterly')
        curr_quarterly = curr_quarterly.sort_index()
        curr_quarterly['Ticker'] = constituent
        curr_quarterly = curr_quarterly.reset_index().set_index('Ticker').join(profiles)
        latest_spx_quarters.append(curr_quarterly)
    except Exception as err:
        # `except Exception` rather than a bare except, so the loop can still
        # be interrupted from the keyboard; the reason is kept for inspection.
        failed_cons.append(constituent)
        failed_cons_reasons[constituent] = repr(err)
    

In [167]:
# Latest annual row per ticker: sort by fiscal year, then keep the last row
# for each index label.
# The axis is named 'index' explicitly before reset_index() so that the
# de-duplication key exists whatever name the concatenated index carries
# (an index named e.g. 'Ticker' would otherwise yield a 'Ticker' column and
# drop_duplicates('index') would raise a KeyError).
annual_latest = (pd.concat(latest_spx_annuals, axis = 0)
                 .sort_values('year')
                 .rename_axis('index')
                 .reset_index()
                 .drop_duplicates('index', keep = 'last')
                 .set_index('index'))
annual_latest.to_csv('annual_spx_yahoo.csv')

# All quarterly rows, ordered by quarter.
pd.concat(latest_spx_quarters, axis = 0).sort_values('quarter').to_csv('quarterly_spx_yahoo.csv')

In [252]:
# Annual fundamentals export; the first CSV column becomes the index and is
# parsed to datetimes, as is the EPS report date column.
quanto_af = pd.read_csv('quanto_af_2005-2017.csv', index_col = 0)
quanto_af.index = pd.to_datetime(quanto_af.index)
quanto_af['eps_rpt_date_af'] = pd.to_datetime(quanto_af['eps_rpt_date_af'])

In [253]:
starting_year = 2005

# Per calendar year of the index:
#   annual_dict[year]     -> first row seen for each Equity
#   annual_dup_dict[year] -> every later row for an Equity already seen
annual_dict = {}
annual_dup_dict = {}

for year in range(starting_year, 2019):
    curr_annuals = quanto_af[quanto_af.index.year == year]
    annual_dup_dict[year] = curr_annuals[curr_annuals.duplicated('Equity', keep = 'first')]
    annual_dict[year] = curr_annuals.drop_duplicates('Equity', keep = 'first')


In [265]:
annuals = []

# For each year after the first, stack that year's first-seen rows on top of
# the previous year's repeat rows and keep the last row per Equity, so a
# repeat row carried over from the previous year takes precedence. The result
# is tagged with the fiscal year.
for year in range(2006,2019):
    curr_df = pd.concat([annual_dict[year],annual_dup_dict[year - 1]],axis = 0).drop_duplicates('Equity', keep = 'last')
    curr_df['FY'] = year
    annuals.append(curr_df)

# The first year has no previous year to draw from. Work on a copy so that
# tagging FY does not modify the frame stored in annual_dict.
curr_df = annual_dict[2005].copy()
curr_df['FY'] = 2005
annuals.append(curr_df)

In [267]:
# Stack the per-fiscal-year frames row-wise and write them out as one CSV.
pd.concat(annuals).to_csv('cleaned_af.csv')