In [28]:
import os
from datetime import datetime as dt
from datetime import timedelta

from edgar import Company
import lxml.html as lh
import pandas as pd
import pandas_datareader as pdr

In [53]:
def get_ticker_to_cik():
    """Download the SEC ticker -> CIK table.

    Reads the tab-separated, header-less file published at
    https://www.sec.gov/include/ticker.txt.

    Returns:
        pandas.DataFrame with columns ``ticker`` (upper-cased) and ``cik``
        (converted to ``str``).
    """
    # local copy: data/ticker_to_cik.txt
    source_url = 'https://www.sec.gov/include/ticker.txt'
    mapping = pd.read_csv(source_url, sep='\t', header=None, names=['ticker','cik'])

    # Normalise both key columns so later merges compare like with like.
    mapping['ticker'] = mapping['ticker'].str.upper()
    mapping['cik'] = mapping['cik'].astype(str)
    return mapping

def get_cik_to_name():
    """Download the SEC company-tickers table.

    Reads https://www.sec.gov/files/company_tickers.json and transposes it so
    that each company is one row.

    Returns:
        pandas.DataFrame holding the columns from the JSON feed, with ``ticker``
        upper-cased and an added ``cik`` column (``cik_str`` converted to ``str``).
    """
    # local copy: data/cik_to_name.json
    source_url = 'https://www.sec.gov/files/company_tickers.json'
    companies = pd.read_json(source_url).transpose()

    # Normalise the merge keys; the original cik_str column is kept as-is.
    companies['ticker'] = companies['ticker'].str.upper()
    companies['cik'] = companies['cik_str'].astype(str)
    return companies

def process_spac_lists(file_path_current, file_path_past, write=False):
    """Load the current and past SPAC lists and attach CIK / company title.

    Args:
        file_path_current: CSV of current SPACs; must contain a ``Ticker`` column.
        file_path_past: CSV of SPACs that completed a business combination;
            must contain ``Old Ticker`` and ``New Ticker`` columns.
        write: when truthy, save the de-duplicated lists to
            ``spac_list_current.csv`` and ``spac_list_past.csv`` in the working
            directory (before the CIK / title columns are merged in).

    Returns:
        ``(spac_list_current, spac_list_past)`` as DataFrames, each left-joined
        with ``ticker``, ``cik`` and ``title`` columns. Tickers with no match in
        the SEC tables keep NaN in those columns.
    """
    # current spac list: one row per distinct ticker
    spac_list_current = pd.read_csv(file_path_current)
    spac_list_current = pd.DataFrame(spac_list_current.Ticker.unique(), columns=['Ticker'])

    # past spac list (completed business combination)
    spac_list_past = pd.read_csv(file_path_past)
    # Missing cells become the literal 'missing' so that rows lacking a ticker
    # still compare equal to each other when de-duplicating.
    spac_list_past = spac_list_past.fillna('missing')
    # The previous isin(unique()) filter kept every row; drop_duplicates actually
    # removes repeated (Old Ticker, New Ticker) pairs, keeping the first occurrence.
    spac_list_past = spac_list_past.drop_duplicates(subset=['Old Ticker', 'New Ticker'])

    # write to file
    if write:
        spac_list_current.to_csv('spac_list_current.csv', index=False)
        spac_list_past.to_csv('spac_list_past.csv', index=False)

    # get ticker to cik and cik to company name file, then merge
    ticker_to_cik = get_ticker_to_cik()
    cik_to_name = get_cik_to_name()
    name_columns = cik_to_name[['cik','ticker','title']]

    # Past SPACs are looked up by their post-combination ticker.
    spac_list_past = spac_list_past.merge(ticker_to_cik, how='left', left_on='New Ticker', right_on='ticker')
    spac_list_past = spac_list_past.merge(name_columns, how='left', on=['cik','ticker'])
    spac_list_current = spac_list_current.merge(ticker_to_cik, how='left', left_on='Ticker', right_on='ticker')
    spac_list_current = spac_list_current.merge(name_columns, how='left', on=['cik','ticker'])

    return spac_list_current, spac_list_past

def form_html_to_text(forms_html):
    """Flatten parsed form documents into lower-cased plain text.

    Args:
        forms_html: iterable of objects exposing ``text_content()``.

    Returns:
        list of str, one per input document, with newlines and non-breaking
        spaces replaced by ordinary spaces and everything lower-cased.
    """
    return [
        document.text_content().replace('\n',' ').replace('\xa0',' ').lower()
        for document in forms_html
    ]

def create_date_text_df(forms_text, form_type):
    """Build a (date, form, text) table from lower-cased form texts.

    The report date is taken from the text that follows the phrase
    ``'date of report (date of earliest event reported): '`` and is expected to
    look like ``'june 22, 2020'`` (spaces inside the month/day part are ignored).

    Args:
        forms_text: iterable of lower-cased form texts.
        form_type: label stored in the ``form`` column of every row.

    Returns:
        pandas.DataFrame with columns ``date`` (``'YYYY-MM-DD'`` string),
        ``form`` and ``text``. Forms whose date cannot be located or parsed are
        reported on stdout and skipped; if none can be parsed (or the input is
        empty) an empty frame with the same three columns is returned.
    """
    marker = 'date of report (date of earliest event reported): '
    records = []
    for form_text in forms_text:
        try:
            split_text = form_text.split(marker)[1].split(', ')
            date_string = split_text[0].replace(' ','') + ', ' + split_text[1].replace(' ','')[0:4]
            date_dt = dt.strptime(date_string, '%B%d, %Y').strftime('%Y-%m-%d')
        except (IndexError, ValueError):
            # IndexError: marker or the ', ' separator is absent;
            # ValueError: the extracted string is not a '%B%d, %Y' date.
            print('Logic to find date broke. See text:\n', form_text)
            continue
        records.append((date_dt, form_type, form_text))
        print(date_dt, 'form added')
    # Build the frame once: DataFrame.append no longer exists in pandas >= 2.0,
    # and naming the columns here keeps the empty case well-formed.
    return pd.DataFrame(records, columns=['date','form','text'])

def get_forms_text(company_name, cik_id, form_type):
    """Fetch a company's filings of one type and tabulate them by report date.

    Prints the company name and the filings URL, requests up to 100 filings of
    ``form_type`` through ``edgar.Company``, converts the fetched documents to
    text with ``form_html_to_text`` and hands them to ``create_date_text_df``.

    Args:
        company_name: company name passed to ``Company``.
        cik_id: CIK identifier passed to ``Company``.
        form_type: filing type to request (e.g. ``'8-K'``).

    Returns:
        The DataFrame produced by ``create_date_text_df``.
    """
    print(company_name)
    edgar_company = Company(company_name, cik_id)

    # Same query options are used for the printed URL and for the actual request.
    listing_options = dict(filing_type=form_type, ownership='include', no_of_entries=100)
    print('url to forms:', edgar_company.get_filings_url(**listing_options))
    filings_page = edgar_company.get_all_filings(**listing_options)

    documents = edgar_company.get_documents(filings_page, no_of_documents=100, debug=False)
    return create_date_text_df(form_html_to_text(documents), form_type)

def simple_text_match(df_form, substring):
    """Flag rows whose ``text`` contains ``substring``.

    Adds an integer column named ``<substring with spaces as underscores>_found``
    holding 1 where the substring occurs in the row's text and 0 otherwise.
    The match is a plain, case-sensitive ``in`` test.

    Args:
        df_form: DataFrame with a ``text`` column; modified in place.
        substring: text to look for.

    Returns:
        The same ``df_form`` object, with the new column added.
    """
    flag_column = substring.replace(' ','_') + '_found'

    def contains_substring(text):
        return int(substring in text)

    df_form[flag_column] = df_form.text.apply(contains_substring)
    return df_form

def get_historical_prices(symbol, start_date, end_date, source):
    """Download daily price history for one symbol via pandas_datareader.

    Args:
        symbol: ticker symbol to request.
        start_date: first day, as a ``'YYYY-MM-DD'`` string.
        end_date: last day, as a ``'YYYY-MM-DD'`` string.
        source: one of ``'yahoo'``, ``'alphavantage'`` or ``'iex'``.

    Returns:
        The DataFrame returned by ``pandas_datareader.data.DataReader``.

    Raises:
        ValueError: if ``source`` is not a supported name, if a date string is
            not in ``YYYY-MM-DD`` form, or if the API key required by the chosen
            source is not set in the environment.

    API keys are read from the environment (``ALPHAVANTAGE_API_KEY`` for
    ``'alphavantage'``, ``IEX_API_KEY`` for ``'iex'``) rather than being stored
    in the notebook.
    """
    # source name -> (pandas_datareader data_source, env var holding the API key)
    backends = {
        # be careful with missing/limited data
        'yahoo': ('yahoo', None),
        # seems to be most reliable
        'alphavantage': ('av-daily', 'ALPHAVANTAGE_API_KEY'),
        # very limited api calls
        'iex': ('iex', 'IEX_API_KEY'),
    }
    # Fail early and clearly instead of hitting an unbound variable at return.
    if source not in backends:
        raise ValueError('Unknown source %r; expected one of %s' % (source, sorted(backends)))

    print('start_date:', start_date)
    print('end_date:', end_date)
    start = dt.strptime(start_date, '%Y-%m-%d')
    end = dt.strptime(end_date, '%Y-%m-%d')

    data_source, key_variable = backends[source]
    reader_options = {}
    if key_variable is not None:
        api_key = os.environ.get(key_variable)
        if not api_key:
            raise ValueError('Set the %s environment variable to use source %r'
                             % (key_variable, source))
        reader_options['api_key'] = api_key

    return pdr.data.DataReader(name=symbol, data_source=data_source, start=start, end=end,
                               **reader_options)

In [56]:
# Load both SPAC lists from the local CSVs and left-join the SEC ticker/CIK and
# company-title tables onto them (the SEC tables are downloaded on every run).
# write=False: do not save the processed lists back to the working directory.
spac_list_current, spac_list_past = process_spac_lists(file_path_current='data/spac_list_current.csv',
                                                       file_path_past='data/spac_list_past.csv',
                                                       write=False)

In [57]:
# Pick one SPAC by its index label in the merged list; 27 is the row that
# produced the output shown below this cell.
ind = 27
row = spac_list_current.loc[ind]
# Alternative: look at a SPAC that already completed its business combination.
# row = spac_list_past.loc[ind]

# Pull the company's 8-K filings (needs the title and cik columns added by the
# merge), then add one 0/1 flag column per phrase. The form text is lower-cased,
# so the phrases must be lower-case too.
df_form_8K = get_forms_text(company_name=row.title, cik_id=row.cik, form_type='8-K')
df_form_8K = simple_text_match(df_form_8K, 'letter of intent')
df_form_8K = simple_text_match(df_form_8K, 'business combination agreement')
df_form_8K

Forum Merger II Corp
url to forms: https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=1741231&type=8-K&dateb=&owner=include&count=100
2020-06-22 form added
2020-06-11 form added
2020-06-12 form added
2020-06-08 form added
2020-06-03 form added
2020-05-13 form added
2020-01-07 form added
2020-01-02 form added
2018-09-11 form added
2018-08-13 form added
2018-08-08 form added


Unnamed: 0,date,form,text,letter_of_intent_found,business_combination_agreement_found
0,2020-06-22,8-K,8-k 1 ea123288-8k_forummerger2.htm current re...,0,0
1,2020-06-11,8-K,8-k 1 ea122974-8k_forummerger2.htm current re...,0,0
2,2020-06-12,8-K,8-k 1 ea122985-8k_forummerger2.htm current re...,0,0
3,2020-06-08,8-K,8-k 1 ea122807-8k_forummerger2.htm current re...,0,0
4,2020-06-03,8-K,8-k 1 ea122609-8k_forummerger2.htm current re...,1,0
5,2020-05-13,8-K,8-k 1 ea121761-8k_forummergii.htm current rep...,1,0
6,2020-01-07,8-K,8-k 1 f8k010720_forummerger2.htm current repo...,0,0
7,2020-01-02,8-K,8-k 1 f8k010220_forummerger2.htm current repo...,0,0
8,2018-09-11,8-K,8-k 1 f8k091118_forummerger2.htm current repo...,0,0
9,2018-08-13,8-K,8-k 1 f8k080718_forummerger2.htm current repo...,0,0


In [58]:
# Price window: from the earliest 8-K report date to five calendar days after
# the latest one, so the forward-looking columns below have rows to look at.
df_prices = get_historical_prices(symbol=row.ticker,
                                  start_date=df_form_8K.date.min(),
                                  end_date=(dt.strptime(df_form_8K.date.max(),'%Y-%m-%d') + 
                                            timedelta(days=5)).strftime('%Y-%m-%d'),
                                  source='alphavantage')
# Move the index into a regular column and call it 'date' so it can be merged on.
# NOTE(review): this relies on the returned index being unnamed (reset_index then
# names the column 'index') -- confirm for sources other than 'alphavantage'.
df_prices.reset_index(inplace=True)
df_prices.rename(columns={'index':'date'}, inplace=True)
df_prices.columns = df_prices.columns.str.lower()
# shift(-k) pulls the close from k rows further down, i.e. k rows later in the
# table (assumes rows are in ascending date order -- TODO confirm). The last k
# rows have no such row and become NaN.
df_prices['close_t+1'] = df_prices.close.shift(-1)
df_prices['close_t+3'] = df_prices.close.shift(-3)
# Fractional change from this row's open to the close 1 / 3 rows later.
df_prices['open_close_t+1_%chg'] = (df_prices['close_t+1'] - df_prices['open']) / df_prices['open']
df_prices['open_close_t+3_%chg'] = (df_prices['close_t+3'] - df_prices['open']) / df_prices['open']
# Rounds every numeric column, including the change ratios, to 2 decimals.
df_prices = df_prices.round(2)
df_prices

start_date: 2018-08-08
end_date: 2020-06-27


Unnamed: 0,date,open,high,low,close,volume,close_t+1,close_t+3,open_close_t+1_%chg,open_close_t+3_%chg
0,2018-09-12,9.58,9.62,9.55,9.57,2597025,9.58,9.57,0.00,-0.00
1,2018-09-13,9.58,9.58,9.57,9.58,166800,9.58,9.57,0.00,-0.00
2,2018-09-14,9.58,9.58,9.58,9.58,0,9.57,9.57,-0.00,-0.00
3,2018-09-17,9.57,9.57,9.52,9.57,1500,9.57,9.57,0.00,0.00
4,2018-09-18,9.57,9.57,9.55,9.57,6002,9.57,9.57,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...
437,2020-06-22,17.49,18.14,16.20,17.81,2952689,16.48,17.95,-0.06,0.03
438,2020-06-23,17.56,18.10,16.10,16.48,3150018,15.20,17.12,-0.13,-0.03
439,2020-06-24,16.50,16.61,14.60,15.20,3677590,17.95,,0.09,
440,2020-06-25,14.82,17.95,14.70,17.95,6106318,17.12,,0.16,


In [59]:
# Attach the forward price changes to each 8-K by report date. Left join: filings
# whose date has no row in df_prices keep NaN in the two change columns.
df_form_8K.merge(df_prices[['date','open_close_t+1_%chg','open_close_t+3_%chg']], how='left', on='date')

Unnamed: 0,date,form,text,letter_of_intent_found,business_combination_agreement_found,open_close_t+1_%chg,open_close_t+3_%chg
0,2020-06-22,8-K,8-k 1 ea123288-8k_forummerger2.htm current re...,0,0,-0.06,0.03
1,2020-06-11,8-K,8-k 1 ea122974-8k_forummerger2.htm current re...,0,0,-0.06,0.23
2,2020-06-12,8-K,8-k 1 ea122985-8k_forummerger2.htm current re...,0,0,0.05,0.31
3,2020-06-08,8-K,8-k 1 ea122807-8k_forummerger2.htm current re...,0,0,0.15,0.06
4,2020-06-03,8-K,8-k 1 ea122609-8k_forummerger2.htm current re...,1,0,0.06,0.22
5,2020-05-13,8-K,8-k 1 ea121761-8k_forummergii.htm current rep...,1,0,0.04,0.03
6,2020-01-07,8-K,8-k 1 f8k010720_forummerger2.htm current repo...,0,0,0.0,0.0
7,2020-01-02,8-K,8-k 1 f8k010220_forummerger2.htm current repo...,0,0,0.0,0.0
8,2018-09-11,8-K,8-k 1 f8k091118_forummerger2.htm current repo...,0,0,,
9,2018-08-13,8-K,8-k 1 f8k080718_forummerger2.htm current repo...,0,0,,
