In [3]:
from datetime import datetime as dt
from datetime import timedelta
from edgar import Company
import lxml.html as lh
import pandas as pd
import pandas_datareader as pdr
import time

ModuleNotFoundError: No module named 'pandas_datareader'

In [None]:
def get_ticker_to_cik():
    """Download the SEC ticker-to-CIK table.

    The remote file is read as tab-separated text without a header row and
    its two columns are labelled ``ticker`` and ``cik``.

    Returns:
        DataFrame with ``ticker`` upper-cased and ``cik`` converted to ``str``.
    """
    # local copy: data/ticker_to_cik.txt
    raw_table = pd.read_csv(
        'https://www.sec.gov/include/ticker.txt',
        sep='\t',
        header=None,
        names=['ticker', 'cik'],
    )
    return raw_table.assign(
        ticker=lambda frame: frame['ticker'].str.upper(),
        cik=lambda frame: frame['cik'].astype(str),
    )

def get_cik_to_name():
    """Download the SEC company-tickers JSON as a one-row-per-company table.

    The JSON is transposed after loading so that each entry becomes a row.

    Returns:
        DataFrame with the loaded columns, ``ticker`` upper-cased, plus a
        ``cik`` column holding ``cik_str`` converted to ``str``.
    """
    # local copy: data/cik_to_name.json
    companies = pd.read_json('https://www.sec.gov/files/company_tickers.json').T
    return companies.assign(
        ticker=lambda frame: frame['ticker'].str.upper(),
        cik=lambda frame: frame['cik_str'].astype(str),
    )

def process_spac_lists(file_path_current, file_path_past, write=False):
    """Load, de-duplicate and enrich the current and past SPAC lists.

    Args:
        file_path_current: CSV with a ``Ticker`` column (current SPACs).
        file_path_past: CSV with ``Old Ticker`` and ``New Ticker`` columns
            (SPACs that completed a business combination).
        write: when truthy, the de-duplicated lists are written to
            ``spac_list_current.csv`` and ``spac_list_past.csv`` in the
            working directory before the merge step.

    Returns:
        ``(spac_list_current, spac_list_past)``, each left-merged with the
        ticker-to-CIK table and then with the CIK-to-name table, which adds
        ``ticker``, ``cik`` and ``title`` columns. Rows without a match keep
        NaN in those columns.

    Note:
        Rows of the past list that repeat an (Old Ticker, New Ticker) pair
        are removed, so row positions in the returned frame reflect the
        de-duplicated list.
    """
    # current spac list: one row per distinct ticker
    spac_list_current = pd.read_csv(file_path_current)
    spac_list_current = pd.DataFrame(spac_list_current.Ticker.unique(), columns=['Ticker'])

    # past spac list (completed business combination)
    # Missing cells become the literal 'missing' so they can take part in the
    # duplicate check and in the ticker merge below.
    spac_list_past = pd.read_csv(file_path_past).fillna('missing')
    # Keep the first row for each (Old Ticker, New Ticker) pair. Comparing the
    # two columns directly avoids collisions that string concatenation allows
    # (e.g. 'AB' + 'C' vs 'A' + 'BC').
    spac_list_past = (
        spac_list_past
        .drop_duplicates(subset=['Old Ticker', 'New Ticker'], keep='first')
        .reset_index(drop=True)
    )

    # write to file
    if write:
        spac_list_current.to_csv('spac_list_current.csv', index=False)
        spac_list_past.to_csv('spac_list_past.csv', index=False)

    # get ticker to cik and cik to company name file, then merge
    ticker_to_cik = get_ticker_to_cik()
    cik_to_name = get_cik_to_name()[['cik', 'ticker', 'title']]
    spac_list_past = spac_list_past.merge(ticker_to_cik, how='left', left_on='New Ticker', right_on='ticker')
    spac_list_past = spac_list_past.merge(cik_to_name, how='left', on=['cik', 'ticker'])
    spac_list_current = spac_list_current.merge(ticker_to_cik, how='left', left_on='Ticker', right_on='ticker')
    spac_list_current = spac_list_current.merge(cik_to_name, how='left', on=['cik', 'ticker'])

    return spac_list_current, spac_list_past

def form_html_to_text(forms_html):
    """Flatten parsed form documents into normalised plain-text strings.

    For each element, ``text_content()`` is taken, newlines and non-breaking
    spaces are replaced by regular spaces, and the result is lower-cased.

    Args:
        forms_html: iterable of objects exposing ``text_content()``.

    Returns:
        List of cleaned strings, in input order.
    """
    return [
        document.text_content().replace('\n', ' ').replace('\xa0', ' ').lower()
        for document in forms_html
    ]

def create_date_text_df(forms_text, form_type):
    """Build a (date, form, text) table from lower-cased filing texts.

    The report date is taken from the text that follows the phrase
    ``'date of report (date of earliest event reported): '`` and is expected
    to look like ``'<month name> <day>, <year>'``; it is re-formatted as
    ``YYYY-MM-DD``.

    Args:
        forms_text: iterable of filing texts (already lower-cased).
        form_type: form label stored in the ``form`` column of every row.

    Returns:
        DataFrame with columns ``date``, ``form`` and ``text``, one row per
        input text. An empty input yields an empty frame with those columns.

    Raises:
        Exception: ``'Could not find date'`` when the date phrase is absent
            or the date cannot be parsed. The offending text is printed first.
    """
    marker = 'date of report (date of earliest event reported): '
    records = []
    for form_text in forms_text:
        try:
            split_text = form_text.split(marker)[1].split(', ')
            # Strip inner spaces so 'july 1' / '2020 ...' become 'july1, 2020'.
            date_string = split_text[0].replace(' ', '') + ', ' + split_text[1].replace(' ', '')[0:4]
            date_dt = dt.strptime(date_string, '%B%d, %Y').strftime('%Y-%m-%d')
        except (IndexError, ValueError) as err:
            # IndexError: marker or ', ' missing; ValueError: unparseable date.
            print('Logic to find date broke. See text:\n', form_text)
            raise Exception('Could not find date') from err
        records.append((date_dt, form_type, form_text))
    # Build the frame once: row-wise DataFrame.append is gone in pandas 2.x,
    # and passing the columns explicitly also handles an empty input.
    return pd.DataFrame(records, columns=['date', 'form', 'text'])

def get_forms_text(company_name, cik_id, form_type):
    """Fetch a company's filings of one type and tabulate them by report date.

    Prints the company name and the filings URL, requests up to 100 filings
    (ownership filings included), converts the documents to text and passes
    them to ``create_date_text_df``.

    Args:
        company_name: company name handed to ``Company``.
        cik_id: CIK identifier handed to ``Company``.
        form_type: filing type to request, e.g. ``'8-K'``.

    Returns:
        The frame produced by ``create_date_text_df``, or ``None`` when no
        document text was retrieved.
    """
    print(company_name)
    filer = Company(company_name, cik_id)
    listing_query = dict(filing_type=form_type, ownership='include', no_of_entries=100)
    print('url to forms:', filer.get_filings_url(**listing_query))

    listing_page = filer.get_all_filings(**listing_query)
    documents = filer.get_documents(listing_page, no_of_documents=100, debug=False)
    texts = form_html_to_text(documents)
    if len(texts) == 0:
        return None
    return create_date_text_df(texts, form_type)

def simple_text_match(df_form, substring):
    """Add a 0/1 column flagging rows whose ``text`` contains ``substring``.

    The new column is named after the substring, with spaces replaced by
    underscores and ``'_found'`` appended. ``df_form`` is modified in place
    and also returned.
    """
    flag_column = substring.replace(' ', '_') + '_found'
    df_form[flag_column] = df_form['text'].apply(lambda body: int(substring in body))
    return df_form

def bulk_save_alphavantage_data(symbols, start_date='2018-01-01', end_date='2020-07-10'):
    """Fetch, process and save Alpha Vantage prices for each symbol.

    For every symbol the raw prices are fetched with
    ``get_historical_prices(..., source='alphavantage')``, run through
    ``process_historical_prices`` and written to
    ``data/prices/<symbol>_prices.csv`` without the index.

    Args:
        symbols: iterable of ticker strings.
        start_date: first date to request, ``YYYY-MM-DD``.
        end_date: last date to request, ``YYYY-MM-DD``.
    """
    for ticker in symbols:
        print(ticker)
        raw_prices = get_historical_prices(
            symbol=ticker,
            start_date=start_date,
            end_date=end_date,
            source='alphavantage',
        )
        processed_prices = process_historical_prices(raw_prices)
        output_path = 'data/prices/' + ticker + '_prices.csv'
        processed_prices.to_csv(output_path, index=False)
        # Pause 12 s after each symbol (at most 5 requests per minute).
        time.sleep(12)
        
def load_saved_prices_data(symbol):
    """Read the saved price CSV for ``symbol`` and report its date range.

    Reads ``data/prices/<symbol>_prices.csv``, prints the smallest and
    largest value of its ``date`` column, and returns the frame.
    """
    saved_prices = pd.read_csv('data/prices/' + symbol + '_prices.csv')
    date_column = saved_prices['date']
    print('price data min date:', date_column.min())
    print('price data max date:', date_column.max())
    return saved_prices

def get_historical_prices(symbol, start_date, end_date, source):
    """Download daily prices for ``symbol`` through pandas-datareader.

    Args:
        symbol: ticker to request.
        start_date: first date, ``YYYY-MM-DD`` string.
        end_date: last date, ``YYYY-MM-DD`` string.
        source: one of ``'yahoo'``, ``'alphavantage'`` or ``'iex'``.

    Returns:
        Whatever ``pdr.data.DataReader`` returns for the request.

    Raises:
        ValueError: if ``source`` is not one of the supported names, if a
            date string is not ``YYYY-MM-DD``, or if the API key for the
            chosen source is not set in the environment.

    API keys are read from the ``ALPHAVANTAGE_API_KEY`` / ``IEX_API_KEY``
    environment variables rather than being stored in the notebook.
    NOTE(review): keys that were previously committed in this notebook should
    be treated as leaked and rotated.
    """
    import os  # function-scope import: only this block needs it

    # source name -> (pandas-datareader data_source, env var holding the key)
    providers = {
        # be careful with missing/limited data in yahoo data
        'yahoo': ('yahoo', None),
        # alphavantage seems to be most reliable. 5 calls per minute and 500 calls per day
        'alphavantage': ('av-daily', 'ALPHAVANTAGE_API_KEY'),
        # iex has extremely limited api calls
        'iex': ('iex', 'IEX_API_KEY'),
    }
    if source not in providers:
        # Fail with a clear message instead of an UnboundLocalError at return.
        raise ValueError('unknown price source %r; expected one of %s'
                         % (source, sorted(providers)))

    print('input start_date:', start_date)
    print('input end_date:', end_date)
    start = dt.strptime(start_date, '%Y-%m-%d')
    end = dt.strptime(end_date, '%Y-%m-%d')

    data_source, key_env_var = providers[source]
    reader_kwargs = dict(name=symbol, data_source=data_source, start=start, end=end)
    if key_env_var is not None:
        api_key = os.environ.get(key_env_var)
        if not api_key:
            raise ValueError('set the %s environment variable to use source %r'
                             % (key_env_var, source))
        reader_kwargs['api_key'] = api_key
    return pdr.data.DataReader(**reader_kwargs)

def process_historical_prices(df_prices):
    """Normalise a price frame and add forward-close return columns.

    The input frame is modified in place: its index is moved into a column
    (renamed to ``date`` when it comes out as ``index``), column names are
    lower-cased, ``date`` is cast to ``str``, and for each horizon of 1, 3, 5
    and 7 rows ahead a ``close_t+N`` column and an ``open_close_t+N_%chg``
    column, computed as ``(close_t+N - open) / open``, are added.

    Returns:
        A new frame holding the same data rounded to 2 decimals. The rounding
        is NOT applied to the frame that was passed in.
    """
    df_prices.reset_index(inplace=True)
    df_prices.rename(columns={'index': 'date'}, inplace=True)
    df_prices.columns = df_prices.columns.str.lower()
    df_prices['date'] = df_prices['date'].astype(str)

    horizons = (1, 3, 5, 7)
    # Forward closes first, then the change columns, to keep the column order.
    for rows_ahead in horizons:
        df_prices['close_t+' + str(rows_ahead)] = df_prices['close'].shift(-rows_ahead)
    for rows_ahead in horizons:
        future_close = df_prices['close_t+' + str(rows_ahead)]
        change_column = 'open_close_t+' + str(rows_ahead) + '_%chg'
        df_prices[change_column] = (future_close - df_prices['open']) / df_prices['open']

    rounded_prices = df_prices.round(2)
    print('output min date:', rounded_prices['date'].min())
    print('output max date:', rounded_prices['date'].max())
    return rounded_prices

In [4]:
# Load the current and past SPAC lists (already merged with CIK and company
# name); write=False leaves the CSV copies on disk untouched.
spac_list_current, spac_list_past = process_spac_lists(
    file_path_current='data/spac_list_current.csv',
    file_path_past='data/spac_list_past.csv',
    write=False,
)

# SPAC tickers for which price data is missing
missing_past_spacs = ['missing', 'LCAH', 'FMCI1', 'CFCO']
missing_current_spacs = [
    'ACNDU', 'ARYB', 'BRLI', 'DFHT', 'DMYD', 'FUSE',
    'GOAC', 'IWAC', 'LCAH', 'LGVW', 'MCAC', 'MLAC',
    'PANA', 'PSAC', 'PSTH', 'SSMC', 'TREB',
]

# Recipe for bulk-saving price data for the symbols in the SPAC lists.
# Kept commented out because of API limits; uncomment to refresh data/prices/.
# symbols_past_new_ticker = [x for x in spac_list_past.ticker.unique().tolist() if str(x)!='nan']
# symbols_past_old_ticker = spac_list_past['Old Ticker'].unique().tolist()
# symbols_past_old_ticker = [x for x in symbols_past_old_ticker if x not in missing_past_spacs]
# symbols_current = [x for x in spac_list_current['Ticker'] if x not in missing_current_spacs]
# bulk_save_alphavantage_data(symbols=symbols_past_new_ticker, start_date='2018-01-01', end_date='2020-07-10')
# bulk_save_alphavantage_data(symbols=symbols_past_old_ticker, start_date='2018-01-01', end_date='2020-07-10')
# bulk_save_alphavantage_data(symbols=symbols_current, start_date='2018-01-01', end_date='2020-07-10')

# Known bad price data: ACEL

In [302]:
# Row positions in spac_list_past that are skipped because they are known to
# break this loop.
broken_inds_past = [10, 14, 16, 17, 22]  # 22 is SPCE
# Equivalent skip list when iterating spac_list_current instead:
#     broken_inds_current = [2,5,10,16,22,33,38,39]

# For each remaining past SPAC: pull its 8-K filings, flag key phrases, and
# join the forward returns that follow each filing date.
for ind in range(22, len(spac_list_past)):
    if ind in broken_inds_past:
        continue

    row = spac_list_past.iloc[ind]
    print('index:', ind)
    print(row.ticker)

    # get form 8Ks
    df_form_8K = get_forms_text(company_name=row.title, cik_id=row.cik, form_type='8-K')
    if df_form_8K is None:
        print('no 8Ks found\n')
        continue
    df_form_8K = simple_text_match(df_form_8K, 'letter of intent')
    df_form_8K = simple_text_match(df_form_8K, 'business combination agreement')

    # get stock prices: prefer the saved CSV, fall back to the API only when
    # no saved file exists for this ticker
    try:
        df_prices = load_saved_prices_data(row.ticker)
    except FileNotFoundError:
        # Request prices from the first 8-K date to 5 days past the last one.
        last_form_date = dt.strptime(df_form_8K.date.max(), '%Y-%m-%d')
        df_prices = get_historical_prices(symbol=row.ticker,
                                          start_date=df_form_8K.date.min(),
                                          end_date=(last_form_date + timedelta(days=5)).strftime('%Y-%m-%d'),
                                          source='alphavantage')
        # Keep the processed (rounded) frame under the name used below, so
        # fetched prices match the saved, already-processed CSVs.
        df_prices = process_historical_prices(df_prices)

    # output returns per form
    df_returns = df_form_8K.merge(df_prices[['date', 'open_close_t+1_%chg', 'open_close_t+3_%chg']],
                                  how='left', on='date')
    display(df_returns)

In [26]:
# Scratch check: load the SEC ticker -> CIK table (tab-separated, no header),
# upper-case the tickers and show the table's shape.
ticker_to_cik = pd.read_csv(
    'https://www.sec.gov/include/ticker.txt',
    sep='\t',
    header=None,
    names=['ticker', 'cik'],
)
ticker_to_cik['ticker'] = ticker_to_cik['ticker'].str.upper()
ticker_to_cik.shape

(8426, 2)

In [37]:
# First CIK listed for ticker 'A' in the ticker -> CIK table
ticker_to_cik.loc[ticker_to_cik['ticker'] == 'A', 'cik'].to_list()[0]

1090872

In [18]:
# Scratch check: load the SEC company-tickers JSON (one row per company after
# transposing), upper-case the tickers and add a string `cik` column.
cik_to_name = pd.read_json('https://www.sec.gov/files/company_tickers.json').T.assign(
    ticker=lambda frame: frame['ticker'].str.upper(),
    cik=lambda frame: frame['cik_str'].astype(str),
)

In [42]:
# First CIK (as a string) listed for ticker 'A' in the company-tickers table
cik_to_name.loc[cik_to_name['ticker'] == 'A', 'cik'].to_list()[0]


'1090872'

In [44]:
# Is this CIK string present anywhere in the company-tickers table?
'1090872' in set(cik_to_name['cik'])

True

In [45]:
# Reload the raw company-tickers table (no ticker/cik post-processing) and
# display it. This rebinds `cik_to_name`, dropping the derived `cik` column.
cik_to_name = pd.read_json(
    'https://www.sec.gov/files/company_tickers.json'
).T
cik_to_name

Unnamed: 0,cik_str,ticker,title
0,1750,AIR,AAR CORP
1,1800,ABT,ABBOTT LABORATORIES
2,1961,WDDD,WORLDS INC
3,2034,ACET,ACETO CORP
4,2098,ACU,ACME UNITED CORP
5,2178,AE,"ADAMS RESOURCES & ENERGY, INC."
6,2186,BKTI,BK Technologies Corp
7,2488,AMD,ADVANCED MICRO DEVICES INC
8,2809,AEM,AGNICO EAGLE MINES LTD
9,2969,APD,AIR PRODUCTS & CHEMICALS INC /DE/


In [46]:
# NOTE(review): this cell re-declares get_forms_text, which is already defined
# in the helper-function cell near the top of the notebook; the later
# definition is the one bound after this cell runs. Consider keeping only one.
def get_forms_text(company_name, cik_id, form_type):
    """Fetch a company's filings of one type and tabulate them by report date.

    Prints the company name and the filings URL, requests up to 100 filings
    (ownership filings included), converts the documents to text and passes
    them to ``create_date_text_df``.

    Returns:
        The frame produced by ``create_date_text_df``, or ``None`` when no
        document text was retrieved.
    """
    print(company_name)
    filer = Company(company_name, cik_id)
    listing_query = dict(filing_type=form_type, ownership='include', no_of_entries=100)
    print('url to forms:', filer.get_filings_url(**listing_query))

    listing_page = filer.get_all_filings(**listing_query)
    documents = filer.get_documents(listing_page, no_of_documents=100, debug=False)
    texts = form_html_to_text(documents)
    if len(texts) == 0:
        return None
    return create_date_text_df(texts, form_type)

get_forms_text

<function __main__.get_forms_text(company_name, cik_id, form_type)>

In [47]:
# Scratch cell: experiments with the edgar package directly.
# NOTE(review): this import sits outside the notebook's import cell; consider
# moving it there if this cell is kept.
import edgar


# Build a Company handle from a literal name / CIK pair.
company = Company('Forum Merger II Corp', '1741231')
# NOTE(review): elsewhere in this notebook get_all_filings is called on a
# Company instance with filing_type / ownership / no_of_entries keywords.
# Whether the edgar module exposes a module-level get_all_filings that takes a
# ticker is not shown here; confirm, or switch to company.get_all_filings(...).
edgar.get_all_filings("FMCI", )