In [5]:
import os
import pandas as pd
import numpy as np
from edgar import Company
import re
from matplotlib import pyplot as pp

import pickle

# Load data and get unique ciks
These come from https://www.kaggle.com/finnhub/sec-filings?select=2021.QTR1.csv which has data back to '94!

In [6]:
os.chdir('/home/lawrence/Personal/Masters/COMP0087_ Natural_Language_Processing/Project/Data/AllEdgarFilings')

In [7]:
read_files = [
    '2020.QTR1.csv',
    '2020.QTR2.csv',
    '2020.QTR3.csv',
    '2020.QTR4.csv',
]

In [8]:
# pull data from finhub sec filings
ciks = []
for fn in read_files:
    frame = pd.read_csv('2020.QTR1.csv')
    ciks.append(frame[frame.form=='10-K'].loc[:, ['symbol', 'cik']])
ciks = pd.concat(ciks)

In [9]:
# remove null data
ciks = ciks[~pd.isnull(ciks['symbol'])]
ciks = ciks[~pd.isnull(ciks['cik'])]

# multiple dupicate filings so drop these
ciks = ciks.drop_duplicates()

We now have ciks for all unique 10-K filings in 2020! There are about 4000

In [10]:
ciks.sort_values('symbol')

Unnamed: 0,symbol,cik
173370,AA,1675149
185417,AAAU,1708646
252146,AAL,6201
131120,AAMC,1555074
275582,AAME,8177
...,...,...
69221,ZVO,1305323
105268,ZYJT,1442101
94581,ZYME,1403752
152428,ZYNE,1621443


In [11]:
len(np.unique(ciks['cik'].values))

4065

In [12]:
# clearly some tickers have multiple ciks
len(np.unique(ciks['symbol'].values))

3942

# Get business description data for ciks
We download data from EDGAR via package edgar using the ciks determined above. Then we look through the raw text to find the business description and pull it out.

In [13]:
def find_sequence(seq, all_tokens):
    check_num = 0
    seq_loc = 0
    for ll, ss in enumerate(all_tokens):
        if ss.lower() in seq:
            check_num += 1
        else:
            check_num = 0
        # highly hacky as first location is in table of contents; this return second
        if check_num == len(seq):
            seq_loc = ll + 1
    return seq_loc

In [14]:
i = 0
bds = {}
try_starts = [
    {'item', '1.', 'business'},
    {'item', '1.', 'our', 'business'},
    {'item', '1:', 'business'},
    {'item', '1.', 'business.'},
    {'item', '1', 'business'},
    {'item', '1.business'},
    
        ]
try_ends = [
    {'item', '1a.', 'risk', 'factors'},
    {'item', '1a:', 'risk', 'factors'},
    {'item', '1a.', 'risk', 'factors.'},
    {'item', '1a', 'risk', 'factors'},
        ]
max_tries = 5

In [None]:
loop_over = [(x, y) for x, y in zip(ciks['symbol'].values, ciks['cik'].values)]
for ticker, cik in loop_over:
    print(f'Extracting {ticker}:{cik}')
    tt = 0
    while tt < max_tries:
        try:
            company = Company(ticker, str(cik))
            tree = company.get_all_filings(filing_type = "10-K")
            docs = Company.get_documents(tree, no_of_documents=5)
            break
        except:
            tt+=1
    if tt == max_tries:
        print(f'Edgar timeout for {ticker}')
        continue
    for doc in docs:
        fs_text = doc.text_content()
        # could tokenise better here
        # re.split(r'[ \t\n]+', fs_text) would split on any number of tabs, whitespaces, newlines
        fs_text = fs_text.replace('\t', ' ')
        fs_text = fs_text.replace('\n', ' ')
        tokenised = fs_text.split()
        
        for ts in try_starts:
            start_seq = find_sequence(ts, tokenised)
            if start_seq != 0:
                break
        for te in try_ends:
            end_seq = find_sequence(te, tokenised)
            if end_seq != 0:
                break
        checks = (start_seq != 0) & (end_seq != 0) & ((end_seq - start_seq) > 100)
        if checks:
            print(f'from {start_seq} to {end_seq} out of {len(tokenised)}')
            break
    if checks:
        bds[f'{ticker}:{cik}'] = tokenised[start_seq:end_seq]
    else:
        bds[f'{ticker}:{cik}'] = []


In [None]:
pickle.dump(bds, open("bds_1.p", "wb"))

In [19]:
os.chdir('/home/lawrence/Personal/Masters/COMP0087_ Natural_Language_Processing/Project/Data/')
file1 = open("bds_1.txt","a")
for kk, vv in bds.items():
    file1.write(kk + '\n')
    file1.write(' '.join(vv)+ '\n')

file1.close()

In [21]:
i = 0
for vv in bds.values():
    if len(vv) > 0:
        i += 1
i

2034

In [None]:
def extract_text(cik, doc_num):
    company = Company(str(cik), str(cik))
    tree = company.get_all_filings(filing_type = "10-K")
    docs = Company.get_documents(tree, no_of_documents=doc_num+1)
    fs_text = doc.text_content()
    fs_text = fs_text.replace('\t', ' ')
    fs_text = fs_text.replace('\n', ' ')
    tokenised = fs_text.split()
    print(' '.join(tokenised)[:100000])

In [None]:
extract_text(1000753, 0)

Clearly some extractions have failed, about 40% currently which is too high. These will require a case by case handling. Below is a problem for novavax. Also sometimes edgar failes as per error message abover (we could just try catch this case).

# Finding start sequence

In [None]:
def get_tokens(cik, doc_num):
    company = Company(str(cik), str(cik))
    tree = company.get_all_filings(filing_type = "10-K")
    docs = Company.get_documents(tree, no_of_documents=doc_num+1)
    fs_text = doc.text_content()
    fs_text = fs_text.replace('\t', ' ')
    fs_text = fs_text.replace('\n', ' ')
    return fs_text.split()

In [None]:
all_tokens = get_tokens(1000697, 0)

In [None]:
def find_smoothed(tokens, string, smoothing):
    contains = np.array([x.lower().find(string) != -1 for x in tokens])
    return np.minimum(np.convolve(contains, np.ones(smoothing)), 1)

In [None]:
win = 3
loc_business = find_smoothed(all_tokens, 'business', win)
loc_item = find_smoothed(all_tokens, 'item', win)
loc_1 = find_smoothed(all_tokens, '1', win)

In [None]:
pp.plot(loc_business+loc_item+loc_1)

In [None]:
win=3
loc_item = find_smoothed(all_tokens, 'item', win)
loc_1a = find_smoothed(all_tokens, '1a', win)
loc_risk = find_smoothed(all_tokens, 'risk', win)
loc_factors = find_smoothed(all_tokens, 'factors', win)

In [None]:
pp.plot(loc_item+ loc_1a+loc_risk+loc_factors)