In [1]:
from datetime import datetime as dt
from edgar import Company
import lxml.html as lh
import pandas as pd

In [44]:
def form_html_to_text(forms_html):
    """Flatten parsed filing documents into lowercase, single-line strings.

    Args:
        forms_html: iterable of objects exposing a ``text_content()`` method
            that returns a string (the caller passes the documents returned
            by ``Company.get_documents``).

    Returns:
        A list with one string per input document, in the same order, where
        newlines and non-breaking spaces have been replaced by plain spaces
        and the text has been lowercased.
    """
    def _normalise(document):
        # Replace layout characters first, then lowercase, so downstream
        # substring searches can use a single lowercase, space-separated form.
        raw = document.text_content()
        for unwanted in ('\n', '\xa0'):
            raw = raw.replace(unwanted, ' ')
        return raw.lower()

    return [_normalise(document) for document in forms_html]

def create_date_text_df(forms_text, form_type):
    """Build a DataFrame of (report date, form type, text) from filing texts.

    Each text is searched for the phrase
    ``'date of report (date of earliest event reported): '`` and the
    ``"<month> <day>, <year>"`` date that follows it is parsed. Texts where
    the phrase or a parseable date cannot be found are reported on stdout and
    skipped.

    Args:
        forms_text: iterable of lowercase, whitespace-normalised filing texts
            (as produced by ``form_html_to_text``).
        form_type: label stored in the ``form`` column of every row.

    Returns:
        A DataFrame with columns ``['date', 'form', 'text']``, one row per
        successfully parsed text, in input order. If nothing could be parsed
        the frame is empty but still has those three columns.
    """
    marker = 'date of report (date of earliest event reported): '
    columns = ['date', 'form', 'text']

    # Collect rows in a list and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    rows = []
    for form_text in forms_text:
        try:
            # After the marker the text looks like "june 22, 2020 ...";
            # splitting on ', ' yields "<month> <day>" and a tail whose first
            # four non-space characters are the year.
            split_text = form_text.split(marker)[1].split(', ')
            date_string = split_text[0].replace(' ', '') + ', ' + split_text[1].replace(' ', '')[0:4]
            date_dt = dt.strptime(date_string, '%B%d, %Y')
        except (IndexError, ValueError):
            # IndexError: marker or ", " separator missing.
            # ValueError: the extracted string is not a "%B%d, %Y" date.
            print('Logic to find date broke. See text:\n', form_text)
            continue
        rows.append((date_dt, form_type, form_text))
        print(date_dt, 'form added')

    # Passing columns explicitly keeps the schema stable even with zero rows.
    return pd.DataFrame(rows, columns=columns)

def get_forms_text(company_name, cik_id, form_type, no_of_entries=100):
    """Download a company's filings of one type and return them as a DataFrame.

    Prints the company name and the filings URL, fetches the filing index and
    its documents through ``edgar.Company``, normalises each document to text
    with ``form_html_to_text`` and parses the report dates with
    ``create_date_text_df``.

    Args:
        company_name: company name passed to ``Company``.
        cik_id: SEC CIK identifier passed to ``Company`` (the notebook passes
            it as a zero-padded string).
        form_type: filing type to request, e.g. ``'8-K'``; also used as the
            value of the ``form`` column in the result.
        no_of_entries: limit forwarded to the filing listing and to the
            document download. Defaults to 100, the value that was previously
            hard-coded.

    Returns:
        The DataFrame produced by ``create_date_text_df`` for the downloaded
        documents.
    """
    print(company_name)
    company = Company(company_name, cik_id)

    filings_url = company.get_filings_url(
        filing_type=form_type, ownership='include', no_of_entries=no_of_entries
    )
    print('url to forms:', filings_url)

    # Index page listing the filings, then the individual filing documents.
    forms_site_html = company.get_all_filings(
        filing_type=form_type, ownership='include', no_of_entries=no_of_entries
    )
    forms_html = company.get_documents(
        forms_site_html, no_of_documents=no_of_entries, debug=False
    )

    forms_text = form_html_to_text(forms_html)
    return create_date_text_df(forms_text, form_type)

In [45]:
# Fetch the 8-K filings for Forum Merger II Corp; get_forms_text prints one
# line per filing as it is parsed.
df_form_8K = get_forms_text(
    company_name='Forum Merger II Corp',
    cik_id='0001741231',
    form_type='8-K',
)
df_form_8K

Forum Merger II Corp
url to forms: https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0001741231&type=8-K&dateb=&owner=include&count=100
2020-06-22 00:00:00 form added
2020-06-11 00:00:00 form added
2020-06-12 00:00:00 form added
2020-06-08 00:00:00 form added
2020-06-03 00:00:00 form added
2020-05-13 00:00:00 form added
2020-01-07 00:00:00 form added
2020-01-02 00:00:00 form added
2018-09-11 00:00:00 form added
2018-08-13 00:00:00 form added
2018-08-08 00:00:00 form added


Unnamed: 0,date,form,text
0,2020-06-22,8-K,8-k 1 ea123288-8k_forummerger2.htm current re...
1,2020-06-11,8-K,8-k 1 ea122974-8k_forummerger2.htm current re...
2,2020-06-12,8-K,8-k 1 ea122985-8k_forummerger2.htm current re...
3,2020-06-08,8-K,8-k 1 ea122807-8k_forummerger2.htm current re...
4,2020-06-03,8-K,8-k 1 ea122609-8k_forummerger2.htm current re...
5,2020-05-13,8-K,8-k 1 ea121761-8k_forummergii.htm current rep...
6,2020-01-07,8-K,8-k 1 f8k010720_forummerger2.htm current repo...
7,2020-01-02,8-K,8-k 1 f8k010220_forummerger2.htm current repo...
8,2018-09-11,8-K,8-k 1 f8k091118_forummerger2.htm current repo...
9,2018-08-13,8-K,8-k 1 f8k080718_forummerger2.htm current repo...


In [46]:
# Fetch the 8-K filings for Nikola Corp.
# NOTE(review): this rebinds df_form_8K, so any earlier value held under that
# name is no longer reachable after this cell runs.
df_form_8K = get_forms_text(
    company_name='Nikola Corp',
    cik_id='0001731289',
    form_type='8-K',
)
df_form_8K

Nikola Corp
url to forms: https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0001731289&type=8-K&dateb=&owner=include&count=100
2020-06-03 00:00:00 form added
2020-06-03 00:00:00 form added
2020-06-03 00:00:00 form added
2020-06-02 00:00:00 form added
2020-05-26 00:00:00 form added
2020-05-12 00:00:00 form added
2020-04-03 00:00:00 form added
2020-03-13 00:00:00 form added
2020-03-02 00:00:00 form added
2020-01-08 00:00:00 form added
2018-06-11 00:00:00 form added
2018-05-30 00:00:00 form added
2018-05-24 00:00:00 form added
2018-05-21 00:00:00 form added


Unnamed: 0,date,form,text
0,2020-06-03,8-K,8-k/a 1 tm2021982-3_8ka.htm form 8-k/a ...
1,2020-06-03,8-K,8-k 1 tm2021982d1_8k.htm form 8-k ...
2,2020-06-03,8-K,8-k 1 tm2020911-2_8k.htm form 8-k ...
3,2020-06-02,8-K,8-k 1 tm2020911-3_8k.htm form 8-k ...
4,2020-05-26,8-K,8-k 1 tm2020911d1_8k.htm form 8-k ...
5,2020-05-12,8-K,8-k 1 tm2019424-1_8k.htm form 8-k ...
6,2020-04-03,8-K,8-k 1 tm2014949-1_8k.htm form 8-k ...
7,2020-03-13,8-K,8-k 1 tm2012695-1_8k.htm form 8-k ...
8,2020-03-02,8-K,8-k 1 tm2011332d1_8k.htm form 8-k ...
9,2020-01-08,8-K,8-k 1 tm201589d1_8k.htm form 8-k ...
