In [73]:
from datetime import datetime as dt
from edgar import Company
import lxml.html as lh
import pandas as pd
import pandas_datareader.data as pdr

In [111]:
def form_html_to_text(forms_html):
    """Flatten parsed filing documents into lowercase, single-line strings.

    Args:
        forms_html: iterable of objects exposing ``text_content()``
            (presumably lxml HTML elements - verify against the caller).

    Returns:
        list[str]: one entry per input document, in input order, with every
        newline and non-breaking space replaced by a plain space and the
        result lowercased.
    """
    def _normalise(document):
        raw = document.text_content()
        # Newlines and non-breaking spaces both become ordinary spaces.
        for unwanted in ('\n', '\xa0'):
            raw = raw.replace(unwanted, ' ')
        return raw.lower()

    return [_normalise(document) for document in forms_html]

def create_date_text_df(forms_text, form_type):
    """Build a (date, form, text) DataFrame from lowercase filing texts.

    The report date is taken from the text that follows the phrase
    'date of report (date of earliest event reported): ', which is expected
    to look like '<month> <day>, <year>...'. Texts where that phrase is
    missing or the date cannot be parsed are reported on stdout and skipped.

    Args:
        forms_text: iterable of filing texts (already lowercased).
        form_type: label stored in the 'form' column of every row.

    Returns:
        pandas.DataFrame with columns ['date', 'form', 'text'], one row per
        successfully parsed filing. The frame is empty (but still has those
        three columns) when nothing could be parsed.
    """
    marker = 'date of report (date of earliest event reported): '
    rows = []
    for form_text in forms_text:
        try:
            split_text = form_text.split(marker)[1].split(', ')
            # Strip inner spaces so e.g. 'june 22' + '2020 ...' -> 'june22, 2020'.
            date_string = split_text[0].replace(' ', '') + ', ' + split_text[1].replace(' ', '')[0:4]
            date_dt = dt.strptime(date_string, '%B%d, %Y')
        except (IndexError, ValueError):
            # IndexError: marker or ', ' separator missing; ValueError: bad date.
            print('Logic to find date broke. See text:\n', form_text)
            continue
        rows.append((date_dt, form_type, form_text))
        print(date_dt, 'form added')
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # naming the columns here keeps the schema stable even with zero rows.
    return pd.DataFrame(rows, columns=['date', 'form', 'text'])

def get_forms_text(company_name, cik_id, form_type):
    """Fetch a company's filings of one type and return them as a DataFrame.

    Prints the company name and the filings URL, requests the filing index
    and its documents (both capped at 100), converts the documents to text
    with ``form_html_to_text`` and hands the result to
    ``create_date_text_df``.

    Args:
        company_name: company name passed to ``Company``.
        cik_id: CIK identifier passed to ``Company``.
        form_type: filing type to request; also used as the 'form' label.

    Returns:
        The DataFrame produced by ``create_date_text_df``.
    """
    print(company_name)
    filer = Company(company_name, cik_id)
    # The same query arguments are used for the URL and for the download.
    filing_query = {'filing_type': form_type, 'ownership': 'include', 'no_of_entries': 100}
    print('url to forms:', filer.get_filings_url(**filing_query))
    filings_index = filer.get_all_filings(**filing_query)
    # NOTE(review): assumes get_documents returns an iterable of documents - confirm
    # its behavior when only a single filing exists.
    documents = filer.get_documents(filings_index, no_of_documents=100, debug=False)
    return create_date_text_df(form_html_to_text(documents), form_type)

def simple_text_match(df_form, substring):
    """Add a 0/1 column flagging rows whose text contains ``substring``.

    The new column is named after the substring with spaces replaced by
    underscores plus the suffix '_found'. ``df_form`` is modified in place
    and also returned.

    Args:
        df_form: DataFrame with a 'text' column of strings.
        substring: exact (case-sensitive) text to look for.

    Returns:
        The same DataFrame object, with the flag column added.
    """
    flag_column = '_'.join(substring.split(' ')) + '_found'

    def _contains(text):
        return int(substring in text)

    df_form[flag_column] = df_form.text.apply(_contains)
    return df_form

def process_current_spacs(file_path, write=False):
    """Load the current SPAC list and reduce it to its unique tickers.

    Args:
        file_path: path of a CSV file that has a 'Ticker' column.
        write: when true, overwrite ``file_path`` with the de-duplicated
            single-column list.

    Returns:
        pandas.DataFrame with one 'Ticker' column holding each distinct
        ticker once, in order of first appearance.
    """
    # Read from the path the caller supplied instead of a hard-coded name.
    tickers = pd.read_csv(file_path)['Ticker'].unique()
    spac_list_current = pd.DataFrame(tickers, columns=['Ticker'])
    if write:
        spac_list_current.to_csv(file_path, index=False)
    return spac_list_current

def process_past_spacs(file_path, write=False):
    """Load the past SPAC list, fill gaps and drop duplicate ticker pairs.

    Missing values in every column are replaced with the string 'missing'.
    Rows are then de-duplicated on the ('Old Ticker', 'New Ticker') pair,
    keeping the first occurrence and its original index label.

    Args:
        file_path: path of a CSV file with 'Old Ticker' and 'New Ticker'
            columns.
        write: when true, overwrite ``file_path`` with the cleaned list.

    Returns:
        pandas.DataFrame with the cleaned rows and the original columns.
    """
    # Read from the path the caller supplied instead of a hard-coded name.
    spac_list_past = pd.read_csv(file_path).fillna('missing')
    # Filtering with isin(unique()) keeps every row, so duplicates survived;
    # drop_duplicates on the ticker pair actually removes them.
    spac_list_past = spac_list_past.drop_duplicates(subset=['Old Ticker', 'New Ticker'])
    if write:
        # Write back to the past-list file, never over the current SPAC list.
        spac_list_past.to_csv(file_path, index=False)
    return spac_list_past

def get_historical_prices(symbol, start_date, end_date):
    """Download historical prices for one symbol from the 'yahoo' source.

    Args:
        symbol: ticker symbol passed to ``DataReader`` as ``name``.
        start_date: first day of the range as a 'YYYY-MM-DD' string.
        end_date: last day of the range as a 'YYYY-MM-DD' string.

    Returns:
        Whatever ``pandas_datareader.data.DataReader`` returns for the
        request (a price DataFrame in the notebook's saved output).

    Raises:
        IndexError: if a date string has fewer than three '-' separated parts.
        ValueError: if a part is not an integer or the date is invalid.
    """
    def _to_datetime(date_string):
        parts = date_string.split('-')
        return dt(int(parts[0]), int(parts[1]), int(parts[2]))

    start = _to_datetime(start_date)
    end = _to_datetime(end_date)
    # The notebook imports pandas_datareader.data as `pdr`; the old `web`
    # alias is never defined, so it raised NameError on a fresh kernel.
    df_prices = pdr.DataReader(name=symbol, data_source='yahoo', start=start, end=end)
    return df_prices

# TODO:
# - look up the company name and CIK automatically from the ticker

In [109]:
# Pull the 8-K filings, then add one 0/1 flag column per phrase of interest.
df_form_8K = get_forms_text(
    company_name='Forum Merger II Corp',
    cik_id='0001741231',
    form_type='8-K',
)
# df_form_8K = get_forms_text(company_name='Nikola Corp', cik_id='0001731289', form_type='8-K')
for phrase in ('letter of intent', 'business combination agreement'):
    df_form_8K = simple_text_match(df_form_8K, phrase)
df_form_8K

Forum Merger II Corp
url to forms: https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0001741231&type=8-K&dateb=&owner=include&count=100
2020-06-22 00:00:00 form added
2020-06-11 00:00:00 form added
2020-06-12 00:00:00 form added
2020-06-08 00:00:00 form added
2020-06-03 00:00:00 form added
2020-05-13 00:00:00 form added
2020-01-07 00:00:00 form added
2020-01-02 00:00:00 form added
2018-09-11 00:00:00 form added
2018-08-13 00:00:00 form added
2018-08-08 00:00:00 form added


Unnamed: 0,date,form,text,letter_of_intent_found,business_combination_agreement_found
0,2020-06-22,8-K,8-k 1 ea123288-8k_forummerger2.htm current re...,0,0
1,2020-06-11,8-K,8-k 1 ea122974-8k_forummerger2.htm current re...,0,0
2,2020-06-12,8-K,8-k 1 ea122985-8k_forummerger2.htm current re...,0,0
3,2020-06-08,8-K,8-k 1 ea122807-8k_forummerger2.htm current re...,0,0
4,2020-06-03,8-K,8-k 1 ea122609-8k_forummerger2.htm current re...,1,0
5,2020-05-13,8-K,8-k 1 ea121761-8k_forummergii.htm current rep...,1,0
6,2020-01-07,8-K,8-k 1 f8k010720_forummerger2.htm current repo...,0,0
7,2020-01-02,8-K,8-k 1 f8k010220_forummerger2.htm current repo...,0,0
8,2018-09-11,8-K,8-k 1 f8k091118_forummerger2.htm current repo...,0,0
9,2018-08-13,8-K,8-k 1 f8k080718_forummerger2.htm current repo...,0,0


In [106]:
# Load both SPAC ticker lists; write=False means neither call writes a CSV.
spac_list_current = process_current_spacs(
    'spac_list_current.csv',
    write=False,
)
spac_list_past = process_past_spacs(
    'spac_list_past.csv',
    write=False,
)

In [110]:
# Prices for FMCI between 2020-01-01 and 2020-06-10; show the first rows only.
df_prices = get_historical_prices(
    symbol='FMCI',
    start_date='2020-01-01',
    end_date='2020-06-10',
)
df_prices.head()

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-02,10.21,10.21,10.21,10.21,0,10.21
2020-01-03,10.21,10.21,10.21,10.21,0,10.21
2020-01-06,10.25,10.24,10.24,10.25,127000,10.25
2020-01-07,10.24,10.24,10.24,10.24,100000,10.24
2020-01-08,10.24,10.24,10.24,10.24,0,10.24
