In [1]:
# mode = ['full','cpu','gpu','wv'][0]

# import libraries
import os
from os.path import isfile, isdir, join
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from datetime import datetime, date
from dateutil.relativedelta import relativedelta
import time
from bs4 import BeautifulSoup
import re
from IPython.display import display
from zipfile import ZipFile
import pickle
import unicodedata
import pytz
from joblib import Parallel, delayed
import shutil
import random
import requests
import gc
import math

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score

!pip install lxml
import lxml

!pip install python-edgar
import edgar

import nltk
from nltk import tokenize
nltk.download('punkt')

Collecting python-edgar
  Downloading python_edgar-3.1.3-py3-none-any.whl (8.6 kB)
Installing collected packages: python-edgar
Successfully installed python-edgar-3.1.3
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# log
def log(msg):
    """Print `msg` prefixed with the current time in the 'Hongkong' timezone.

    The timestamp is rendered as ``[YYYY-MM-DD HH:MM:SS]``.
    """
    hk_now = datetime.now(tz=pytz.timezone('Hongkong'))
    stamp = hk_now.strftime('%Y-%m-%d %H:%M:%S')
    print(f'[{stamp}] {msg}')
    
# pickle
def save_pkl(obj, filename):
    """Serialize `obj` to `filename` with the highest available pickle protocol.

    Parameters
    ----------
    obj : any picklable object
    filename : str or path-like
        Destination file; it is created or overwritten.

    Returns
    -------
    None
    """
    # The context manager guarantees the handle is flushed and closed even if
    # pickling fails part-way (the previous version leaked the open file object).
    with open(filename, 'wb') as fh:
        pickle.dump(obj, fh, protocol=pickle.HIGHEST_PROTOCOL)
    return
def load_pkl(filename):
    """Load and return the object pickled in `filename`.

    Parameters
    ----------
    filename : str or path-like
        Path of a file previously written with `pickle.dump`.

    Returns
    -------
    object
        The unpickled object.

    Notes
    -----
    Unpickling can execute arbitrary code; only use this on files produced by
    this notebook or another trusted source.
    """
    # Close the handle deterministically instead of leaving it to the garbage collector.
    with open(filename, 'rb') as fh:
        return pickle.load(fh)

def get_size(path='.'):
    """Return the combined size in bytes of every file found under `path`.

    The directory tree is walked recursively; entries that are symbolic links
    are not counted.
    """
    byte_count = 0
    for root, _subdirs, names in os.walk(path):
        candidates = (os.path.join(root, name) for name in names)
        # symbolic links are skipped
        byte_count += sum(os.path.getsize(c) for c in candidates if not os.path.islink(c))
    return byte_count

def dl_txt(txt, filename='text.txt'):
    """Write `txt` to a text file (default ``text.txt`` in the working directory).

    Parameters
    ----------
    txt : str
        Text to write.
    filename : str, default 'text.txt'
        Destination path; the file is created or overwritten.

    Returns
    -------
    None
    """
    # Explicit UTF-8 so non-ASCII text does not fail on platforms whose default
    # encoding is narrower; the `with` block closes the file on exit.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(txt)
    return

import signal


class TimeoutException(Exception):
    """Raised by `timeout_handler` when the SIGALRM signal is delivered."""


def timeout_handler(signum, frame):
    """Signal handler: abort whatever is running by raising `TimeoutException`."""
    raise TimeoutException


# Register the handler for SIGALRM at definition time.
signal.signal(signal.SIGALRM, timeout_handler)

<Handlers.SIG_DFL: 0>

In [3]:
# params
# Filing date window, stored as ISO 'YYYY-MM-DD' strings.
params = {
    'filing_start_date': '2008-01-01',
    'filing_end_date': '2018-03-31',
}

# Create Master Index Table

In [4]:
'''
Get CIK mapping from 1) Wikipedia and 2) Edgar official mapping
'''

# Current S&P 500 constituents scraped from Wikipedia. Only the first table on
# the page is used, and it must expose `Symbol` and `CIK` columns.
wiki_tbl_list = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
# Keep ticker and CIK only; the CIK is rendered as a zero-padded 10-character string.
curr_cons = wiki_tbl_list[0] \
            .assign(stock = lambda x: x.Symbol,
                    cik = lambda x: x.CIK.astype(str).str.zfill(10)) \
            .loc[:,['stock','cik']]

# Official EDGAR symbol-to-CIK mapping: a header-less, tab-separated file read as
# (stock, cik). Tickers are upper-cased and CIKs zero-padded to 10 characters so
# both sources share one format.
cik_map = pd.read_csv('https://www.sec.gov/include/ticker.txt', sep='\t', names=['stock','cik']) \
                .assign(stock = lambda x: x.stock.str.upper(),
                        cik = lambda x: x.cik.astype(str).str.zfill(10))

# Combine the two sources: the EDGAR mapping takes precedence, and Wikipedia only
# contributes tickers that EDGAR does not list.
cik_map = pd.concat([cik_map, curr_cons.loc[lambda x: ~x.stock.isin(cik_map.stock)]], axis=0).drop_duplicates()
# Sanity check (displayed, not asserted): tickers that map to more than one CIK.
display(cik_map.groupby('stock').count().loc[lambda x: x.cik>1])
display(cik_map.head())

Unnamed: 0_level_0,cik
stock,Unnamed: 1_level_1


Unnamed: 0,stock,cik
0,AAPL,320193
1,MSFT,789019
2,GOOGL,1652044
3,AMZN,1018724
4,TSLA,1318605


In [5]:
'''
Perform CIK mapping on returns table
'''

# Load the returns table; after indexing by `date`, every remaining column is
# treated as one stock.
ret = pd.read_csv('../input/hkml-download-returns/ret.csv')
ret = ret.set_index('date')
ret.index = pd.to_datetime(ret.index)

# Derive the EDGAR filing window per stock from its first and last non-null return date.
# NOTE(review): this comment previously said "ret start date - 400 days", but the
# code below shifts the start back by 2*365 = 730 days - confirm which is intended.
df = []
for stock in ret:
    # dates on which this stock has a non-null return
    s = ret[stock].loc[lambda x: x.notnull()].index
    df.append((stock, s.min(), s.max()))
# `df` is rebound from a list of tuples to a DataFrame here.
df = pd.DataFrame(df, columns=['stock','start_date','end_date'])
df['start_date'] = df['start_date'] + np.timedelta64(-365*2,'D')

# Map to CIK; the left join keeps stocks that have no CIK (cik is NaN for those).
stock_map = df.merge(cik_map, how='left', on='stock')
# Guard against the merge fanning out: each stock must appear exactly once.
assert stock_map.stock.nunique()==stock_map.shape[0]

In [6]:
'''
Populate missing CIK
'''

# Export the stocks that still have no CIK so they can be mapped by hand.
stock_map[stock_map.cik.isnull()].to_csv('missing_stock_map.csv', index=False)

# Import the manually maintained mapping; its `cik` column is zero-padded to 10
# characters and renamed to `missing_cik` so it does not collide with
# `stock_map.cik` in the merge below.
manual_cik_map = pd.read_csv('../input/nlp10k-manual-stock-cik-mapping/missing_stock_map.csv') \
    .assign(cik = lambda x: x.cik.astype(str).str.zfill(10)) \
    .rename(columns={'cik':'missing_cik'}) \
    .loc[:,['stock','missing_cik']]

# Fill in missing CIK: use the manual value only where the automatic CIK is null
# and a manual one exists; every other row keeps its existing `cik`.
stock_map = stock_map.merge(manual_cik_map, how='left', on='stock') \
    .assign(cik = lambda x: np.select([(x.cik.isnull()) & (x.missing_cik.notnull()), True],[x.missing_cik, x.cik])) \
    .drop('missing_cik', axis=1)

# Check if any CIK is still missing (displayed only, nothing is asserted).
display(stock_map.loc[lambda x: x.cik.isnull()])
display(stock_map.head())

Unnamed: 0,stock,start_date,end_date,cik
14,ADS,2011-12-25,2018-03-26,
452,PBCT,2007-07-07,2018-03-26,


Unnamed: 0,stock,start_date,end_date,cik
0,A,2007-07-07,2018-03-26,1090872
1,AA,2007-07-07,2016-10-31,1675149
2,AAL,2013-03-24,2018-03-26,6201
3,AAP,2013-07-09,2018-03-26,1158449
4,AAPL,2007-07-07,2018-03-26,320193


In [7]:
%%time
'''
Download master index from EDGAR
'''
# Download the EDGAR index files from 2006 onwards into the working directory.
# NOTE(review): the contact e-mail used as user agent is hard-coded here.
edgar.download_index(dest='./', since_year=2006, user_agent='phyung0107@gmail.com', skip_all_present_except_last=False)

# Combine the index files: every file in the working directory whose name contains
# '.tsv' (substring match, not a suffix check) is read as a '|'-separated,
# header-less table and deleted afterwards.
# NOTE: `df` is reused here and overwrites the DataFrame of the same name built earlier.
master_idx = []
for f in os.listdir('./'):
    if '.tsv' in f:
        df = pd.read_csv(f'./{f}', sep='|', names=['cik','entity','filing_type','filing_date','full_submission_filename','index_url'])
        master_idx.append(df)
        os.remove(f'./{f}')
master_idx = pd.concat(master_idx)

# Cleaning: normalise the CIK to a zero-padded 10-character string and parse dates,
# then keep only filings of stocks present in `stock_map` (inner join, so stocks
# without a CIK drop out) that fall inside each stock's [start_date, end_date] window.
master_idx = master_idx \
    .assign(cik = lambda x: x.cik.astype(str).str.zfill(10),
            filing_date = lambda x: pd.to_datetime(x.filing_date)) \
    .merge(stock_map, how='inner', on='cik') \
    .loc[lambda x: (x.filing_date >= x.start_date) & (x.filing_date <= x.end_date)] \
    .reset_index(drop=True)

# If several rows share (filing_type, cik, filing_date), keep the one that sorts
# last by `full_submission_filename`.
master_idx = master_idx \
    .sort_values(['filing_type','cik','filing_date','full_submission_filename']) \
    .groupby(['filing_type','cik','filing_date']) \
    .last() \
    .reset_index()

# Remove stocks with exactly one 10-K or exactly one 10-Q filing; such a CIK is
# dropped from every filing type, not just the one where it is a singleton.
ciks_10k = master_idx.loc[lambda x: x.filing_type=='10-K'].groupby('cik')['full_submission_filename'].nunique().loc[lambda x: x==1].index
ciks_10q = master_idx.loc[lambda x: x.filing_type=='10-Q'].groupby('cik')['full_submission_filename'].nunique().loc[lambda x: x==1].index
master_idx = master_idx.loc[lambda x: (~x.cik.isin(ciks_10k)) & (~x.cik.isin(ciks_10q))]

# Save the CIK-stock mapping of the surviving filings.
# NOTE: this rebinds `cik_map`, replacing the ticker-to-CIK table built earlier.
cik_map = master_idx[['cik','stock']].drop_duplicates()

# Final clean: drop the columns that came from `stock_map`, de-duplicate and sort.
master_idx = master_idx \
    .drop(['stock','start_date','end_date'], axis=1) \
    .drop_duplicates() \
    .sort_values(['filing_type','cik','filing_date']) \
    .reset_index(drop=True)

# Separate 10-K, 10-Q, 8-K. The 10-K subset keeps the name `master_idx`, so this
# line must run last: the two subsets above it are taken from the combined table.
master_idx_10q = master_idx.loc[lambda x: x.filing_type=='10-Q'].reset_index(drop=True)
master_idx_8k = master_idx.loc[lambda x: x.filing_type=='8-K'].reset_index(drop=True)
master_idx = master_idx.loc[lambda x: x.filing_type=='10-K'].reset_index(drop=True)

# Summary statistics: table shapes and the average number of filings per CIK.
log(f'Shape of 10-K master_idx: {master_idx.shape}')
log(f'Shape of 10-Q master_idx: {master_idx_10q.shape}')
log(f'Shape of 8-K master_idx: {master_idx_8k.shape}')
log(f'Avg number of 10-K filing per stock: {master_idx.shape[0] / master_idx.cik.nunique()}')
log(f'Avg number of 10-Q filing per stock: {master_idx_10q.shape[0] / master_idx_10q.cik.nunique()}')
log(f'Avg number of 8-K filing per stock: {master_idx_8k.shape[0] / master_idx_8k.cik.nunique()}')
display(master_idx.sample(5))
# Distribution of the number of 10-K filings per CIK.
display(master_idx.groupby('cik')['full_submission_filename'].nunique().value_counts())

[2022-05-13 19:46:13] Shape of 10-K master_idx: (5272, 6)
[2022-05-13 19:46:13] Shape of 10-Q master_idx: (15470, 6)
[2022-05-13 19:46:13] Shape of 8-K master_idx: (74272, 6)
[2022-05-13 19:46:13] Avg number of 10-K filing per stock: 8.742951907131012
[2022-05-13 19:46:13] Avg number of 10-Q filing per stock: 25.612582781456954
[2022-05-13 19:46:13] Avg number of 8-K filing per stock: 122.56105610561056


Unnamed: 0,filing_type,cik,filing_date,entity,full_submission_filename,index_url
1436,10-K,91440,2008-02-19,SNAP ON INC,edgar/data/91440/0001104659-08-011401.txt,edgar/data/91440/0001104659-08-011401-index.html
862,10-K,50863,2008-02-20,INTEL CORP,edgar/data/50863/0000891618-08-000106.txt,edgar/data/50863/0000891618-08-000106-index.html
3346,10-K,916076,2016-02-23,MARTIN MARIETTA MATERIALS INC,edgar/data/916076/0001193125-16-473754.txt,edgar/data/916076/0001193125-16-473754-index.html
169,10-K,8818,2010-03-01,AVERY DENNISON CORPORATION,edgar/data/8818/0000950123-10-018494.txt,edgar/data/8818/0000950123-10-018494-index.html
473,10-K,29534,2017-03-24,DOLLAR GENERAL CORP,edgar/data/29534/0001558370-17-002116.txt,edgar/data/29534/0001558370-17-002116-index.html


11    310
10     48
3      43
4      38
5      36
6      32
9      30
7      29
8      27
2       8
12      2
Name: full_submission_filename, dtype: int64

CPU times: user 2min 7s, sys: 19.7 s, total: 2min 26s
Wall time: 3min 28s


In [8]:
# # for testing
# master_idx = master_idx.loc[lambda x: x.cik.isin(master_idx.cik.sample(3).tolist())].reset_index(drop=True)
# master_idx = master_idx.loc[lambda x: x.cik=='0000020286'].reset_index(drop=True)

In [9]:
%%time
'''
construct full 10-K HTML URLs
'''
def get_html_link(i, full_submission_filename, index_url, filing_type='10-K'):
    """Resolve the URL of the main filing document from an EDGAR filing index page.

    The index page is downloaded, its first HTML table is parsed, and among the
    rows whose `Type` equals `filing_type` the one with the largest `Size` is
    taken as the main document.

    Parameters
    ----------
    i : int
        Row position of the filing. It is returned unchanged so results can be
        re-aligned with the source table after parallel execution.
    full_submission_filename : str
        Path of the full-submission text file, e.g.
        'edgar/data/<cik>/<accession>.txt'.
    index_url : str
        Path of the filing index page, relative to https://www.sec.gov/Archives/.
    filing_type : str, default '10-K'
        Value of the index table's `Type` column that identifies the main document.

    Returns
    -------
    tuple
        `(i, full_url)`; `full_url` is None when the page could not be fetched
        or parsed, or when no row of the requested type exists.
    """
    # Local import: keeps the function self-contained when shipped to joblib workers.
    from io import BytesIO

    time.sleep(0.1)  # crude per-call throttle
    try:
        # get the main document name from the filing index page
        url = f'https://www.sec.gov/Archives/{index_url}'
        # NOTE(review): a randomised contact address is sent as the user agent on
        # every request - confirm this is acceptable under SEC fair-access rules.
        user_agent = f"chan_tai_man_{int(np.random.rand() * 1e7)}@gmail.com"
        # A timeout prevents one stalled connection from hanging a worker forever.
        html = requests.get(url, headers={"user-agent": user_agent}, timeout=30).content
        # Wrap the bytes in a file-like object: passing literal HTML to
        # pd.read_html is deprecated in recent pandas releases.
        doc_name = pd.read_html(BytesIO(html))[0] \
            .loc[lambda x: x.Type==filing_type] \
            .sort_values('Size', ascending=False) \
            .Document \
            .tolist()[0]

        # construct full URL: the accession folder is the filename without
        # '.txt' and without dashes
        filing_id = full_submission_filename.replace('.txt','').replace('-','')
        full_url = f'https://www.sec.gov/Archives/{filing_id}/{doc_name}'
    except Exception:
        # Best effort: any failure marks this filing as unresolved (None) instead
        # of aborting the batch; KeyboardInterrupt/SystemExit still propagate.
        full_url = None

    # progress log every 200 filings
    if i % 200 == 0:
        log(f'[{i}] {full_url}')
    return i, full_url

# Resolve the main-document URL of every 10-K in parallel; each task returns
# (row position, url) so the results can be joined back on the positional index.
results = Parallel(n_jobs=-1)(delayed(get_html_link)(i, master_idx.iloc[i]['full_submission_filename'], master_idx.iloc[i]['index_url']) for i in range(len(master_idx)))
results = pd.DataFrame(results, columns=['i','url_10k']).set_index('i')
# NOTE: rebinding `master_idx` makes this cell non-idempotent - re-running it
# without rebuilding `master_idx` merges a second `url_10k` column.
master_idx = master_idx.merge(results, how='left', left_index=True, right_index=True)

# Remove nulls and PDFs. The two logged values are fractions of rows (0-1), not
# percentages. Only URLs whose last three characters are 'htm' or 'tml'
# (.htm/.html) are kept, which also drops unresolved (None) URLs.
log(f'Percentage of null: {master_idx["url_10k"].isnull().sum() / master_idx.shape[0]}')
log(f'Percentage of PDF: {(master_idx["url_10k"].fillna("").str.lower().str[-3:]=="pdf").sum() / master_idx.shape[0]}')
master_idx = master_idx.loc[lambda x: (x.url_10k.fillna('').str.lower().str[-3:].isin(['htm','tml']))].reset_index(drop=True)

# Check again for CIKs left with fewer than two documents after the URL filter
# and drop them.
ciks = master_idx.groupby('cik')['filing_date'].count().loc[lambda x: x<2].index.tolist()
log(f'Number of CIK with single doc: {len(ciks)}')
master_idx = master_idx.loc[lambda x: ~x.cik.isin(ciks)].reset_index(drop=True)

# Assign doc_id = '<cik>_<YYYYMMDD of filing_date>' and sort by it.
master_idx = master_idx \
    .assign(doc_id = lambda x: x.cik + '_' + x.filing_date.apply(lambda y: str(y)[:10].replace('-',''))) \
    .sort_values('doc_id') \
    .reset_index(drop=True)

# doc_id must be unique, i.e. at most one document per CIK and filing date.
assert master_idx.doc_id.nunique()==master_idx.shape[0]
log(f'Shape of master_idx: {master_idx.shape}')
display(master_idx.sample(5))

[2022-05-13 19:55:51] Percentage of null: 0.0
[2022-05-13 19:55:51] Percentage of PDF: 0.011760242792109257
[2022-05-13 19:55:51] Number of CIK with single doc: 1
[2022-05-13 19:55:51] Shape of master_idx: (5186, 8)


Unnamed: 0,filing_type,cik,filing_date,entity,full_submission_filename,index_url,url_10k,doc_id
3193,10-K,909954,2012-11-28,GREEN MOUNTAIN COFFEE ROASTERS INC,edgar/data/909954/0001104659-12-080228.txt,edgar/data/909954/0001104659-12-080228-index.html,https://www.sec.gov/Archives/edgar/data/909954...,0000909954_20121128
188,10-K,9389,2011-02-28,BALL CORP,edgar/data/9389/0000009389-11-000014.txt,edgar/data/9389/0000009389-11-000014-index.html,https://www.sec.gov/Archives/edgar/data/9389/0...,0000009389_20110228
1509,10-K,96021,2016-08-30,SYSCO CORP,edgar/data/96021/0000096021-16-000275.txt,edgar/data/96021/0000096021-16-000275-index.html,https://www.sec.gov/Archives/edgar/data/96021/...,0000096021_20160830
339,10-K,20520,2016-02-25,FRONTIER COMMUNICATIONS CORP,edgar/data/20520/0000020520-16-000076.txt,edgar/data/20520/0000020520-16-000076-index.html,https://www.sec.gov/Archives/edgar/data/20520/...,0000020520_20160225
3148,10-K,899689,2013-02-26,VORNADO REALTY TRUST,edgar/data/899689/0000899689-13-000004.txt,edgar/data/899689/0000899689-13-000004-index.html,https://www.sec.gov/Archives/edgar/data/899689...,0000899689_20130226


CPU times: user 39.3 s, sys: 1.52 s, total: 40.9 s
Wall time: 9min 36s
Parser   : 818 ms


In [10]:
%%time
'''
construct full 10-Q HTML URLs
'''
def get_html_link(i, full_submission_filename, index_url, filing_type='10-Q'):
    """Resolve the URL of the main filing document from an EDGAR filing index page.

    This definition shadows the earlier `get_html_link` in this notebook; it
    differs only in the default `filing_type` ('10-Q' here).

    The index page is downloaded, its first HTML table is parsed, and among the
    rows whose `Type` equals `filing_type` the one with the largest `Size` is
    taken as the main document.

    Parameters
    ----------
    i : int
        Row position of the filing. It is returned unchanged so results can be
        re-aligned with the source table after parallel execution.
    full_submission_filename : str
        Path of the full-submission text file, e.g.
        'edgar/data/<cik>/<accession>.txt'.
    index_url : str
        Path of the filing index page, relative to https://www.sec.gov/Archives/.
    filing_type : str, default '10-Q'
        Value of the index table's `Type` column that identifies the main document.

    Returns
    -------
    tuple
        `(i, full_url)`; `full_url` is None when the page could not be fetched
        or parsed, or when no row of the requested type exists.
    """
    # Local import: keeps the function self-contained when shipped to joblib workers.
    from io import BytesIO

    time.sleep(0.1)  # crude per-call throttle
    try:
        # get the main document name from the filing index page
        url = f'https://www.sec.gov/Archives/{index_url}'
        # NOTE(review): a randomised contact address is sent as the user agent on
        # every request - confirm this is acceptable under SEC fair-access rules.
        user_agent = f"chan_tai_man_{int(np.random.rand() * 1e7)}@gmail.com"
        # A timeout prevents one stalled connection from hanging a worker forever.
        html = requests.get(url, headers={"user-agent": user_agent}, timeout=30).content
        # Wrap the bytes in a file-like object: passing literal HTML to
        # pd.read_html is deprecated in recent pandas releases.
        doc_name = pd.read_html(BytesIO(html))[0] \
            .loc[lambda x: x.Type==filing_type] \
            .sort_values('Size', ascending=False) \
            .Document \
            .tolist()[0]

        # construct full URL: the accession folder is the filename without
        # '.txt' and without dashes
        filing_id = full_submission_filename.replace('.txt','').replace('-','')
        full_url = f'https://www.sec.gov/Archives/{filing_id}/{doc_name}'
    except Exception:
        # Best effort: any failure marks this filing as unresolved (None) instead
        # of aborting the batch; KeyboardInterrupt/SystemExit still propagate.
        full_url = None

    # progress log every 200 filings
    if i % 200 == 0:
        log(f'[{i}] {full_url}')
    return i, full_url

# Resolve the main-document URL of every 10-Q in parallel; each task returns
# (row position, url) so the results can be joined back on the positional index.
results = Parallel(n_jobs=-1)(delayed(get_html_link)(i, master_idx_10q.iloc[i]['full_submission_filename'], master_idx_10q.iloc[i]['index_url']) for i in range(len(master_idx_10q)))
results = pd.DataFrame(results, columns=['i','url_10q']).set_index('i')
# NOTE: rebinding `master_idx_10q` makes this cell non-idempotent - re-running it
# without rebuilding `master_idx_10q` merges a second `url_10q` column.
master_idx_10q = master_idx_10q.merge(results, how='left', left_index=True, right_index=True)

# Remove nulls and PDFs. The two logged values are fractions of rows (0-1), not
# percentages. Only URLs whose last three characters are 'htm' or 'tml'
# (.htm/.html) are kept, which also drops unresolved (None) URLs.
log(f'Percentage of null: {master_idx_10q["url_10q"].isnull().sum() / master_idx_10q.shape[0]}')
log(f'Percentage of PDF: {(master_idx_10q["url_10q"].fillna("").str.lower().str[-3:]=="pdf").sum() / master_idx_10q.shape[0]}')
master_idx_10q = master_idx_10q.loc[lambda x: (x.url_10q.fillna('').str.lower().str[-3:].isin(['htm','tml']))].reset_index(drop=True)

# Check again for CIKs left with fewer than two documents after the URL filter
# and drop them.
ciks = master_idx_10q.groupby('cik')['filing_date'].count().loc[lambda x: x<2].index.tolist()
log(f'Number of CIK with single doc: {len(ciks)}')
master_idx_10q = master_idx_10q.loc[lambda x: ~x.cik.isin(ciks)].reset_index(drop=True)

# Assign doc_id = '<cik>_<YYYYMMDD of filing_date>' and sort by it.
master_idx_10q = master_idx_10q \
    .assign(doc_id = lambda x: x.cik + '_' + x.filing_date.apply(lambda y: str(y)[:10].replace('-',''))) \
    .sort_values('doc_id') \
    .reset_index(drop=True)

# doc_id must be unique, i.e. at most one document per CIK and filing date.
assert master_idx_10q.doc_id.nunique()==master_idx_10q.shape[0]
log(f'Shape of master_idx_10q: {master_idx_10q.shape}')
display(master_idx_10q.sample(5))

[2022-05-13 20:23:54] Percentage of null: 0.0
[2022-05-13 20:23:54] Percentage of PDF: 0.009372979961215255
[2022-05-13 20:23:54] Number of CIK with single doc: 0
[2022-05-13 20:23:55] Shape of master_idx_10q: (15238, 8)


Unnamed: 0,filing_type,cik,filing_date,entity,full_submission_filename,index_url,url_10q,doc_id
9685,10-Q,916365,2014-08-04,TRACTOR SUPPLY CO /DE/,edgar/data/916365/0000916365-14-000121.txt,edgar/data/916365/0000916365-14-000121-index.html,https://www.sec.gov/Archives/edgar/data/916365...,0000916365_20140804
10104,10-Q,936468,2017-07-20,LOCKHEED MARTIN CORP,edgar/data/936468/0001193125-17-232077.txt,edgar/data/936468/0001193125-17-232077-index.html,https://www.sec.gov/Archives/edgar/data/936468...,0000936468_20170720
10763,10-Q,1018963,2012-05-04,ALLEGHENY TECHNOLOGIES INC,edgar/data/1018963/0001193125-12-212043.txt,edgar/data/1018963/0001193125-12-212043-index....,https://www.sec.gov/Archives/edgar/data/101896...,0001018963_20120504
14705,10-Q,1466258,2014-07-22,Ingersoll-Rand plc,edgar/data/1466258/0001466258-14-000044.txt,edgar/data/1466258/0001466258-14-000044-index....,https://www.sec.gov/Archives/edgar/data/146625...,0001466258_20140722
11220,10-Q,1039684,2009-11-05,ONEOK INC /NEW/,edgar/data/1039684/0001039684-09-000083.txt,edgar/data/1039684/0001039684-09-000083-index....,https://www.sec.gov/Archives/edgar/data/103968...,0001039684_20091105


CPU times: user 1min 55s, sys: 3.58 s, total: 1min 58s
Wall time: 28min 3s


# Document cleaning and Item extraction

In [11]:
# Replacement for a high-code character, either on its own or preceded by '\xc2'
# (the shape a two-byte UTF-8 sequence takes when each byte is decoded as one character).
_UNICODE1_SINGLE = {
    '\x82': ',',      # High code comma
    '\x84': ',,',     # High code double comma
    '\x85': '...',    # Triple dot
    '\x88': '^',      # High carat
    '\x91': "'",      # Forward single quote
    '\x92': "'",      # Reverse single quote
    '\x93': '"',      # Forward double quote
    '\x94': '"',      # Reverse double quote
    '\x95': ' ',
    '\x96': '-',      # High hyphen
    '\x97': '--',     # Double hyphen
    '\x99': ' ',
    '\xa0': ' ',
    '\xa6': '|',      # Split vertical bar
    '\xab': '<<',     # Double less than
    '\xbb': '>>',     # Double greater than
    '\xbc': '1/4',    # one quarter
    '\xbd': '1/2',    # one half
    '\xbe': '3/4',    # three quarters
}
# Two-character sequences that are only replaced as a pair.
_UNICODE1_PAIRS = {
    '\xca\xbf': "'",  # c-single quote
    '\xcc\xa8': '',   # modifier - under curve
    '\xcc\xb1': '',   # modifier - under line
}
_UNICODE1_RE = re.compile(
    '\xc2?[' + ''.join(map(re.escape, _UNICODE1_SINGLE)) + ']'
    + '|' + '|'.join(map(re.escape, _UNICODE1_PAIRS))
)


def _unicode1_replacement(match):
    """Return the ASCII replacement for one match of `_UNICODE1_RE`."""
    token = match.group(0)
    if token in _UNICODE1_PAIRS:
        return _UNICODE1_PAIRS[token]
    # The last character selects the replacement; an optional leading '\xc2'
    # is consumed together with it.
    return _UNICODE1_SINGLE[token[-1]]


def remove_unicode1(txt):
    r"""Replace high-code punctuation characters in `txt` with ASCII equivalents.

    Each mapped character (e.g. '\x92', '\xa0', '\xbd') is replaced whether it
    stands alone or follows a '\xc2' prefix; the '\xca\xbf', '\xcc\xa8' and
    '\xcc\xb1' sequences are replaced only as complete pairs.

    The previous implementation wrote every entry as a regex character class
    such as ``[\xc2\x82]``, which matches *either* character. As a result any
    lone '\xc2' became ',', a real '\xc2\x92' pair became ",'" and unrelated
    characters such as '\xb1' or '\xbf' were deleted or turned into quotes.
    The old no-op entry mapping "'" to "'" has been dropped.

    Parameters
    ----------
    txt : str

    Returns
    -------
    str
        `txt` with the mapped characters replaced; everything else is unchanged.
    """
    return _UNICODE1_RE.sub(_unicode1_replacement, txt)

def remove_unicode2(txt):
    r"""Replace textual byte escapes with ASCII equivalents.

    The sequences handled here are literal backslash escapes spelled out in the
    text (for example the twelve characters ``\xe2\x80\x99``), not the decoded
    characters themselves. Replacements are applied one after another in table
    order.
    """
    escaped_to_ascii = (
        ('\\xe2\\x80\\x99', "'"),
        ('\\xc3\\xa9', 'e'),
        ('\\xe2\\x80\\x90', '-'),
        ('\\xe2\\x80\\x91', '-'),
        ('\\xe2\\x80\\x92', '-'),
        ('\\xe2\\x80\\x93', '-'),
        ('\\xe2\\x80\\x94', '-'),
        ('\\xe2\\x80\\x98', "'"),
        ('\\xe2\\x80\\x9b', "'"),
        ('\\xe2\\x80\\x9c', '"'),
        ('\\xe2\\x80\\x9d', '"'),
        ('\\xe2\\x80\\x9e', '"'),
        ('\\xe2\\x80\\x9f', '"'),
        ('\\xe2\\x80\\xa6', '...'),
        ('\\xe2\\x80\\xb2', "'"),
        ('\\xe2\\x80\\xb3', "'"),
        ('\\xe2\\x80\\xb4', "'"),
        ('\\xe2\\x80\\xb5', "'"),
        ('\\xe2\\x80\\xb6', "'"),
        ('\\xe2\\x80\\xb7', "'"),
        ('\\xe2\\x81\\xba', "+"),
        ('\\xe2\\x81\\xbb', "-"),
        ('\\xe2\\x81\\xbc', "="),
        ('\\xe2\\x81\\xbd', "("),
        ('\\xe2\\x81\\xbe', ")"),
    )
    for escaped, plain in escaped_to_ascii:
        txt = txt.replace(escaped, plain)
    return txt

In [12]:
def clean_doc1(txt):
    """First-pass cleaning of raw document text.

    Steps, in order: strip XBRL-style ``prefix:Name`` tokens, NFKD-normalise and
    map leftover special characters to ASCII, turn newlines into spaces and tabs
    into the '|' separator, and finally collapse runs of spaces and separators.
    """
    # remove all special fields e.g. us-gaap:AccumulatedOtherComprehensiveIncomeMember
    tagged_field_patterns = (
        r'\b' + re.escape('us-gaap:') + r'\w+\b',
        r'\b\w+[:]\w+\b',
    )
    for ptrn in tagged_field_patterns:
        txt = re.sub(ptrn, '', txt)

    # remove unicode characters
    txt = remove_unicode2(remove_unicode1(unicodedata.normalize("NFKD", txt)))

    # standardize spaces: both real and textually escaped newlines/tabs are handled
    for old, new in (('\\n', ' '), ('\n', ' '), ('\\t', '|'), ('\t', '|')):
        txt = txt.replace(old, new)
    for ptrn, repl in ((r'\| +', '|'), (r' +\|', '|'), (r'\|+', '|'), (r' +', ' ')):
        txt = re.sub(ptrn, repl, txt)
    return txt

In [13]:
'''
Function to clean txt; applied only after Item extraction
'''
def clean_doc2(txt):
    """Second-pass cleaning that reduces text to lower-case words and sentence breaks.

    Lower-cases the text, replaces the '|' separator with spaces, strips markup
    tags, removes numbers, amounts, alphanumeric codes, dotted abbreviations and
    common titles, then standardises sentence separators, spaces and commas.

    Parameters
    ----------
    txt : str

    Returns
    -------
    str
        The cleaned text.
    """
    # lowercase all strings
    txt = txt.lower()
    # replace sep with space
    txt = txt.replace('|',' ')
    # remove tags, one tag at a time. The former greedy pattern '<.+>' removed
    # everything from the first '<' to the last '>', including the text between
    # separate tags.
    txt = re.sub(r'<[^<>]+>', '', txt)
    # remove unwanted characters, numbers, dots
    txt = re.sub(r'([a-z]+\d+)+([a-z]+)?(\.+)?', '', txt) # aa12bb33. y3y
    txt = re.sub(r'(\d+[a-z]+)+(\d+)?(\.+)?', '', txt) # 1a2b. 1a1a1
    txt = re.sub(r'\b\$?\d+\.(\d+)?', '', txt) # $2.14 999.8 123.
    txt = re.sub(r'\$\d+', '', txt) # $88
    txt = re.sub(r'(\w+\.){2,}(\w+)?', '', txt) # W.C. ASD.ASD.c
    txt = re.sub(r"\bmr\.|\bjr\.|\bms\.|\bdr\.|\besq\.|\bhon\.|\bmrs\.|\bprof\.|\brev\.|\bsr\.|\bst\.|\bno\.", '', txt) # titles and common abbreviations
    txt = re.sub(r'\b[a-z]\.', '', txt) #  L.
    txt = re.sub(r'(\w+)?\.\w+', '', txt) # .net .123 www.123
    txt = re.sub(r'[\$\%\d]+', '', txt) # remove all $/%/numbers
    # final clean format
    txt = re.sub(r'[\.\:\;]', '.', txt) # standardize all sentence separators
    txt = re.sub(r'( ?\. ?)+', '. ', txt) # replace consecutive sentence separators
    txt = re.sub(r' +', ' ', txt) # replace consecutive spaces
    txt = re.sub(r'( ?, ?)+', ', ', txt) # replace consecutive ","
    return txt

In [14]:
'''
Item extraction Regex patterns
'''

# function to convert txt to re pattern allowing any | between characters
def w(txt):
    r"""Build a regex fragment in which every character of `txt` may be followed by one '|'.

    Each character is emitted verbatim (no regex escaping) followed by ``\|?``.
    """
    optional_pipe = r'\|?'
    pieces = []
    for ch in txt:
        pieces.append(ch + optional_pipe)
    return ''.join(pieces)

def wu(txt):
    r"""Like `w`, but the result matches `txt` either as given or fully upper-cased.

    Every character of `txt` is followed by an optional '|' (``\|?``); the two
    spellings are combined in a non-capturing alternation.
    """
    optional_pipe = r'\|?'
    as_given = ''.join([ch + optional_pipe for ch in txt])
    shouted = as_given.upper()
    return ''.join((r'(?:', as_given, r'|', shouted, r')'))

def s(x='.'):
    """Return regex fragment `x` quantified to repeat between 0 and 5 times.

    With the default `x` the result matches up to five arbitrary characters.
    """
    quantifier = r'{0,5}'
    return ''.join((x, quantifier))

# defining search patterns
# item_ptrn1: strictest heading patterns, keyed by 10-K item ('item_1' ... 'item_15').
# Every pattern:
#   * is delimited by a literal '|' on both sides;
#   * requires the "Item <number>" prefix (wu() accepts 'Item'/'ITEM' and e.g. '1a'/'1A');
#   * spells the title with w(), so a stray '|' may follow any character;
#   * joins the title words with s(), i.e. up to five arbitrary characters.
# Short words appear without their first letter ('nd', 'f', 'or', 'o', 'n', 'ith',
# 'bout'); the preceding s() absorbs that letter - presumably so that both
# capitalised and lower-case spellings match.
item_ptrn1 = dict()
item_ptrn1['item_1'] = rf"\|(?:{wu('Item')}{s()}1{s()}){w('Business')}{s('[^a-z]')}\|"
item_ptrn1['item_1a'] = rf"\|(?:{wu('Item')}{s()}{wu('1a')}{s()}){w('Risk')}{s()}{w('Factors')}{s()}\|"
item_ptrn1['item_1b'] = rf"\|(?:{wu('Item')}{s()}{wu('1b')}{s()}){w('Unresolved')}{s()}(?:{w('Staff')}|{w('SEC')}){s()}{w('Comment')}{s()}\|"
item_ptrn1['item_2'] = rf"\|(?:{wu('Item')}{s()}2{s()}){w('Properties')}{s()}\|"
item_ptrn1['item_3'] = rf"\|(?:{wu('Item')}{s()}3{s()}){w('Legal')}{s()}{w('Proceeding')}{s()}\|"
# Item 4 has three alternative titles: mine safety disclosures, submission of
# matters to a vote of security holders, and "(Removed and) Reserved".
item_ptrn1['item_4'] = r'|'.join([rf"(?:\|(?:{wu('Item')}{s()}4{s()}){w('Mine')}{s()}{w('Safety')}{s()}{w('Disclosure')}{s()}\|)", 
                                 rf"(?:\|(?:{wu('Item')}{s()}4{s()}){w('Submission')}{s()}{w('f')}{s()}{w('Matter')}{s()}{w('o')}{s()}{wu('a')}{s()}{w('Vote')}{s()}{w('f')}{s()}{w('Security')}{s()}{w('Holder')}{s()}\|)",
                                 rf"(?:\|(?:{wu('Item')}{s()}4{s()})(?:{w('Removed')}{s()}{w('nd')}{s()})?{w('Reserved')}{s()}\|)"])
item_ptrn1['item_5'] = rf"\|(?:{wu('Item')}{s()}5{s()}){w('Market')}{s()}{w('or')}{s()}{w('Registrant')}{s()}{w('Common')}{s()}{w('Equit')}(?:{w('y')}|{w('ies')}){s()}{w('Related')}{s()}{w('Stockholder')}{s()}{w('Matter')}{s()}{w('nd')}{s()}{w('Issuer')}{s()}{w('Purchase')}{s()}{w('f')}{s()}{w('Equit')}(?:{w('y')}|{w('ies')}){s()}{w('Securities')}{s()}\|"
item_ptrn1['item_6'] = rf"\|(?:{wu('Item')}{s()}6{s()}){w('Selected')}{s()}(?:{w('Consolidated')}{s()})?{w('Financial')}{s()}{w('Data')}{s()}\|"
# Item 7 accepts both word orders: "Financial Condition and Results of Operations"
# and "Results of Operations and Financial Condition".
item_ptrn1['item_7'] = r'|'.join([rf"\|(?:{wu('Item')}{s()}7{s()}){w('Management')}{s()}{w('Discussion')}{s()}{w('nd')}{s()}{w('Analy')}(?:{w('sis')}|{w('ses')}){s()}{w('f')}{s()}{w('Financial')}{s()}{w('Condition')}{s()}{w('nd')}{s()}{w('Result')}{s()}{w('f')}{s()}{w('Operation')}{s()}\|",
                                 rf"\|(?:{wu('Item')}{s()}7{s()}){w('Management')}{s()}{w('Discussion')}{s()}{w('nd')}{s()}{w('Analy')}(?:{w('sis')}|{w('ses')}){s()}{w('f')}{s()}{w('Result')}{s()}{w('f')}{s()}{w('Operation')}{s()}{w('nd')}{s()}{w('Financial')}{s()}{w('Condition')}{s()}\|"])
# Item 7A accepts "Quantitative and Qualitative" in either order.
item_ptrn1['item_7a'] = r'|'.join([rf"\|(?:{wu('Item')}{s()}{wu('7a')}{s()}){w('Quantitative')}{s()}{w('nd')}{s()}{w('Qualitative')}{s()}{w('Disclosure')}{s()}{w('bout')}{s()}{w('Market')}{s()}{w('Risk')}{s()}\|",
                                  rf"\|(?:{wu('Item')}{s()}{wu('7a')}{s()}){w('Qualitative')}{s()}{w('nd')}{s()}{w('Quantitative')}{s()}{w('Disclosure')}{s()}{w('bout')}{s()}{w('Market')}{s()}{w('Risk')}{s()}\|"])
item_ptrn1['item_8'] = rf"\|(?:{wu('Item')}{s()}8{s()}){w('Financial')}{s()}{w('Statement')}{s()}{w('nd')}{s()}{w('Supplementary')}{s()}{w('Data')}{s()}\|"
item_ptrn1['item_9'] = rf"\|(?:{wu('Item')}{s()}9{s()}){w('Change')}{s()}{w('n')}{s()}{w('nd')}{s()}{w('Disagreement')}{s()}{w('ith')}{s()}{w('Accountant')}{s()}{w('n')}{s()}{w('Accounting')}{s()}{w('nd')}{s()}{w('Financial')}{s()}{w('Disclosure')}{s()}\|"
item_ptrn1['item_9a'] = rf"\|(?:{wu('Item')}{s()}{wu('9a')}{s()}){w('Control')}{s()}{w('nd')}{s()}{w('Procedure')}{s()}\|"
item_ptrn1['item_9b'] = rf"\|(?:{wu('Item')}{s()}{wu('9b')}{s()}){w('Other')}{s()}{w('Information')}{s()}\|"
item_ptrn1['item_10'] = rf"\|(?:{wu('Item')}{s()}10{s()}){w('Director')}{s()}{w('Executive')}{s()}{w('Officer')}{s()}{w('nd')}{s()}{w('Corporate')}{s()}{w('Governance')}{s()}\|"
item_ptrn1['item_11'] = rf"\|(?:{wu('Item')}{s()}11{s()}){w('Executive')}{s()}{w('Compensation')}{s()}\|"
item_ptrn1['item_12'] = rf"\|(?:{wu('Item')}{s()}12{s()}){w('Security')}{s()}{w('Ownership')}{s()}{w('f')}{s()}{w('Certain')}{s()}{w('Beneficial')}{s()}{w('Owner')}{s()}{w('nd')}{s()}{w('Management')}{s()}{w('nd')}{s()}{w('Related')}{s()}{w('Stockholder')}{s()}{w('Matter')}s?{s()}\|"
item_ptrn1['item_13'] = rf"\|(?:{wu('Item')}{s()}13{s()}){w('Certain')}{s()}{w('Relationship')}{s()}{w('nd')}{s()}{w('Related')}{s()}{w('Transaction')}{s()}{w('nd')}{s()}{w('Director')}{s()}{w('Independence')}{s()}\|"
item_ptrn1['item_14'] = rf"\|(?:{wu('Item')}{s()}14{s()}){w('Principal')}{s()}{w('Account')}(?:{w('ant')}|{w('ing')}){s()}{w('Fee')}{s()}{w('nd')}{s()}{w('Service')}{s()}\|"
item_ptrn1['item_15'] = rf"\|(?:{wu('Item')}{s()}15{s()}){w('Exhibits')}{s()}{w('Financial')}{s()}{w('Statement')}{s()}{w('Schedule')}{s()}\|"


# item_ptrn2: same heading patterns as item_ptrn1, but with the "Item <number>"
# prefix optional. Every pattern:
#   * is delimited by a literal '|' on both sides;
#   * spells the title with w(), so a stray '|' may follow any character;
#   * joins the title words with s(), i.e. up to five arbitrary characters.
# Short words appear without their first letter ('nd', 'f', 'or', 'o', 'n', 'ith',
# 'bout'); the preceding s() absorbs that letter, so both capitalised and
# lower-case spellings match.
item_ptrn2 = dict()
item_ptrn2['item_1'] = rf"\|(?:{wu('Item')}{s()}1{s()})?{w('Business')}{s('[^a-z]')}\|"
item_ptrn2['item_1a'] = rf"\|(?:{wu('Item')}{s()}{wu('1a')}{s()})?{w('Risk')}{s()}{w('Factors')}{s()}\|"
item_ptrn2['item_1b'] = rf"\|(?:{wu('Item')}{s()}{wu('1b')}{s()})?{w('Unresolved')}{s()}(?:{w('Staff')}|{w('SEC')}){s()}{w('Comment')}{s()}\|"
item_ptrn2['item_2'] = rf"\|(?:{wu('Item')}{s()}2{s()})?{w('Properties')}{s()}\|"
item_ptrn2['item_3'] = rf"\|(?:{wu('Item')}{s()}3{s()})?{w('Legal')}{s()}{w('Proceeding')}{s()}\|"
# Item 4 has three alternative titles. The "(Removed and) Reserved" alternative
# keeps the "Item 4" prefix mandatory, unlike the other two.
item_ptrn2['item_4'] = r'|'.join([rf"(?:\|(?:{wu('Item')}{s()}4{s()})?{w('Mine')}{s()}{w('Safety')}{s()}{w('Disclosure')}{s()}\|)",
                                 rf"(?:\|(?:{wu('Item')}{s()}4{s()})?{w('Submission')}{s()}{w('f')}{s()}{w('Matter')}{s()}{w('o')}{s()}{wu('a')}{s()}{w('Vote')}{s()}{w('f')}{s()}{w('Security')}{s()}{w('Holder')}{s()}\|)",
                                 rf"(?:\|(?:{wu('Item')}{s()}4{s()})(?:{w('Removed')}{s()}{w('nd')}{s()})?{w('Reserved')}{s()}\|)"])
item_ptrn2['item_5'] = rf"\|(?:{wu('Item')}{s()}5{s()})?{w('Market')}{s()}{w('or')}{s()}{w('Registrant')}{s()}{w('Common')}{s()}{w('Equit')}(?:{w('y')}|{w('ies')}){s()}{w('Related')}{s()}{w('Stockholder')}{s()}{w('Matter')}{s()}{w('nd')}{s()}{w('Issuer')}{s()}{w('Purchase')}{s()}{w('f')}{s()}{w('Equit')}(?:{w('y')}|{w('ies')}){s()}{w('Securities')}{s()}\|"
item_ptrn2['item_6'] = rf"\|(?:{wu('Item')}{s()}6{s()})?{w('Selected')}{s()}(?:{w('Consolidated')}{s()})?{w('Financial')}{s()}{w('Data')}{s()}\|"
# Item 7 accepts both word orders: "Financial Condition and Results of Operations"
# and "Results of Operations and Financial Condition".
item_ptrn2['item_7'] = r'|'.join([rf"\|(?:{wu('Item')}{s()}7{s()})?{w('Management')}{s()}{w('Discussion')}{s()}{w('nd')}{s()}{w('Analy')}(?:{w('sis')}|{w('ses')}){s()}{w('f')}{s()}{w('Financial')}{s()}{w('Condition')}{s()}{w('nd')}{s()}{w('Result')}{s()}{w('f')}{s()}{w('Operation')}{s()}\|",
                                 rf"\|(?:{wu('Item')}{s()}7{s()})?{w('Management')}{s()}{w('Discussion')}{s()}{w('nd')}{s()}{w('Analy')}(?:{w('sis')}|{w('ses')}){s()}{w('f')}{s()}{w('Result')}{s()}{w('f')}{s()}{w('Operation')}{s()}{w('nd')}{s()}{w('Financial')}{s()}{w('Condition')}{s()}\|"])
# Item 7A accepts "Quantitative and Qualitative" in either order.
item_ptrn2['item_7a'] = r'|'.join([rf"\|(?:{wu('Item')}{s()}{wu('7a')}{s()})?{w('Quantitative')}{s()}{w('nd')}{s()}{w('Qualitative')}{s()}{w('Disclosure')}{s()}{w('bout')}{s()}{w('Market')}{s()}{w('Risk')}{s()}\|",
                                  rf"\|(?:{wu('Item')}{s()}{wu('7a')}{s()})?{w('Qualitative')}{s()}{w('nd')}{s()}{w('Quantitative')}{s()}{w('Disclosure')}{s()}{w('bout')}{s()}{w('Market')}{s()}{w('Risk')}{s()}\|"])
item_ptrn2['item_8'] = rf"\|(?:{wu('Item')}{s()}8{s()})?{w('Financial')}{s()}{w('Statement')}{s()}{w('nd')}{s()}{w('Supplementary')}{s()}{w('Data')}{s()}\|"
# Fixed: this used w('in'), which only matched a lower-case "in" and so missed
# headings such as "Changes In and Disagreements ...". w('n') follows the
# first-letter-dropped convention used for every other short word here and in
# item_ptrn1['item_9'].
item_ptrn2['item_9'] = rf"\|(?:{wu('Item')}{s()}9{s()})?{w('Change')}{s()}{w('n')}{s()}{w('nd')}{s()}{w('Disagreement')}{s()}{w('ith')}{s()}{w('Accountant')}{s()}{w('n')}{s()}{w('Accounting')}{s()}{w('nd')}{s()}{w('Financial')}{s()}{w('Disclosure')}{s()}\|"
item_ptrn2['item_9a'] = rf"\|(?:{wu('Item')}{s()}{wu('9a')}{s()})?{w('Control')}{s()}{w('nd')}{s()}{w('Procedure')}{s()}\|"
item_ptrn2['item_9b'] = rf"\|(?:{wu('Item')}{s()}{wu('9b')}{s()})?{w('Other')}{s()}{w('Information')}{s()}\|"
item_ptrn2['item_10'] = rf"\|(?:{wu('Item')}{s()}10{s()})?{w('Director')}{s()}{w('Executive')}{s()}{w('Officer')}{s()}{w('nd')}{s()}{w('Corporate')}{s()}{w('Governance')}{s()}\|"
item_ptrn2['item_11'] = rf"\|(?:{wu('Item')}{s()}11{s()})?{w('Executive')}{s()}{w('Compensation')}{s()}\|"
item_ptrn2['item_12'] = rf"\|(?:{wu('Item')}{s()}12{s()})?{w('Security')}{s()}{w('Ownership')}{s()}{w('f')}{s()}{w('Certain')}{s()}{w('Beneficial')}{s()}{w('Owner')}{s()}{w('nd')}{s()}{w('Management')}{s()}{w('nd')}{s()}{w('Related')}{s()}{w('Stockholder')}{s()}{w('Matter')}s?{s()}\|"
item_ptrn2['item_13'] = rf"\|(?:{wu('Item')}{s()}13{s()})?{w('Certain')}{s()}{w('Relationship')}{s()}{w('nd')}{s()}{w('Related')}{s()}{w('Transaction')}{s()}{w('nd')}{s()}{w('Director')}{s()}{w('Independence')}{s()}\|"
item_ptrn2['item_14'] = rf"\|(?:{wu('Item')}{s()}14{s()})?{w('Principal')}{s()}{w('Account')}(?:{w('ant')}|{w('ing')}){s()}{w('Fee')}{s()}{w('nd')}{s()}{w('Service')}{s()}\|"
item_ptrn2['item_15'] = rf"\|(?:{wu('Item')}{s()}15{s()})?{w('Exhibits')}{s()}{w('Financial')}{s()}{w('Statement')}{s()}{w('Schedule')}{s()}\|"



item_ptrn3 = dict()
item_ptrn3['item_1'] = r'|'.join([rf"\W{w('Business')}\W", 
                                  rf"\W{w('BUSINESS')}\W"])
item_ptrn3['item_1a'] = r'|'.join([rf"\W{w('Risk')}{s()}{w('Factors')}\W", 
                                   rf"\W{w('RISK')}{s()}{w('FACTORS')}\W"])
item_ptrn3['item_1b'] = r'|'.join([rf"\W{w('Unresolved')}{s()}(?:{w('Staff')}|{w('SEC')}|{w('Sec')}){s()}{w('Comment')}s?\W", 
                                   rf"\W{w('UNRESOLVED')}{s()}(?:{w('STAFF')}|{w('SEC')}){s()}{w('COMMENT')}S?\W"])
item_ptrn3['item_2'] = r'|'.join([rf"\W{w('Properties')}\W", 
                                  rf"\W{w('PROPERTIES')}\W"])
item_ptrn3['item_3'] = r'|'.join([rf"\W{w('Legal')}{s()}{w('Proceeding')}s?", 
                                  rf"\W{w('LEGAL')}{s()}{w('PROCEEDING')}S?"])
item_ptrn3['item_4'] = r'|'.join([rf"\W{w('Mine')}{s()}{w('Safety')}{s()}{w('Disclosure')}s?\W",
                                  rf"\W{w('MINE')}{s()}{w('SAFETY')}{s()}{w('DISCLOSURE')}S?\W",
                                  rf"\W(?:{w('Removed')}{s()}(?:{w('A')}|{w('a')}){w('nd')}{s()})?{w('Reserved')}\W",
                                  rf"\W(?:{w('REMOVED')}{s()}{w('AND')}{s()})?{w('RESERVED')}\W",
                                  rf"\W{w('Submission')}{s()}(?:{w('O')}|{w('o')}){w('f')}{s()}{w('Matter')}{s()}(?:{w('T')}|{w('t')}){w('o')}{s()}(?:{w('A')}|{w('a')}){s()}{w('Vote')}{s()}(?:{w('O')}|{w('o')}){w('f')}{s()}{w('Security')}{s()}{w('Holder')}s?\W",
                                  rf"\W{w('SUBMISSION')}{s()}{w('OF')}{s()}{w('MATTER')}{s()}{w('TO')}{s()}{w('A')}{s()}{w('VOTE')}{s()}{w('OF')}{s()}{w('SECURITY')}{s()}{w('HOLDER')}S?\W"])
# Title-only heading patterns for Items 5 to 8. In the title-case alternatives the small
# connecting words ("for", "and", "of", "about") are accepted with either a capital or a
# lower-case first letter via (?:w('X')|w('x')) groups.
item_ptrn3['item_5'] = r'|'.join([rf"\W{w('Market')}{s()}(?:{w('F')}|{w('f')}){w('or')}{s()}{w('Registrant')}{s()}{w('Common')}{s()}{w('Equit')}(?:{w('y')}|{w('ies')}){s()}{w('Related')}{s()}{w('Stockholder')}{s()}{w('Matter')}{s()}(?:{w('A')}|{w('a')}){w('nd')}{s()}{w('Issuer')}{s()}{w('Purchase')}{s()}(?:{w('O')}|{w('o')}){w('f')}{s()}{w('Equit')}(?:{w('y')}|{w('ies')}){s()}{w('Securities')}\W", 
                                  rf"\W{w('MARKET')}{s()}{w('FOR')}{s()}{w('REGISTRANT')}{s()}{w('COMMON')}{s()}{w('EQUIT')}(?:{w('Y')}|{w('IES')}){s()}{w('RELATED')}{s()}{w('STOCKHOLDER')}{s()}{w('MATTER')}{s()}{w('AND')}{s()}{w('ISSUER')}{s()}{w('PURCHASE')}{s()}{w('OF')}{s()}{w('EQUIT')}(?:{w('Y')}|{w('IES')}){s()}{w('SECURITIES')}\W"])
# "Selected (Consolidated) Financial Data" -- "Consolidated" is optional
item_ptrn3['item_6'] = r'|'.join([rf"\W{w('Selected')}{s()}(?:{w('Consolidated')}{s()})?{w('Financial')}{s()}{w('Data')}\W", 
                                  rf"\W{w('SELECTED')}{s()}(?:{w('CONSOLIDATED')}{s()})?{w('FINANCIAL')}{s()}{w('DATA')}\W"])
# MD&A: both word orders are accepted ("Financial Condition and Results of Operations"
# and "Results of Operations and Financial Condition")
item_ptrn3['item_7'] = r'|'.join([rf"\W{w('Management')}{s()}{w('Discussion')}{s()}(?:{w('A')}|{w('a')}){w('nd')}{s()}{w('Analy')}(?:{w('sis')}|{w('ses')}){s()}(?:{w('O')}|{w('o')}){w('f')}{s()}{w('Financial')}{s()}{w('Condition')}{s()}(?:{w('A')}|{w('a')}){w('nd')}{s()}{w('Result')}{s()}(?:{w('O')}|{w('o')}){w('f')}{s()}{w('Operation')}s?\W", 
                                  rf"\W{w('MANAGEMENT')}{s()}{w('DISCUSSION')}{s()}{w('AND')}{s()}{w('ANALY')}(?:{w('SIS')}|{w('SES')}){s()}{w('OF')}{s()}{w('FINANCIAL')}{s()}{w('CONDITION')}{s()}{w('AND')}{s()}{w('RESULT')}{s()}{w('OF')}{s()}{w('OPERATION')}S?\W",
                                  rf"\W{w('Management')}{s()}{w('Discussion')}{s()}(?:{w('A')}|{w('a')}){w('nd')}{s()}{w('Analy')}(?:{w('sis')}|{w('ses')}){s()}(?:{w('O')}|{w('o')}){w('f')}{s()}{w('Result')}{s()}(?:{w('O')}|{w('o')}){w('f')}{s()}{w('Operation')}{s()}(?:{w('A')}|{w('a')}){w('nd')}{s()}{w('Financial')}{s()}{w('Condition')}s?\W", 
                                  rf"\W{w('MANAGEMENT')}{s()}{w('DISCUSSION')}{s()}{w('AND')}{s()}{w('ANALY')}(?:{w('SIS')}|{w('SES')}){s()}{w('OF')}{s()}{w('RESULT')}{s()}{w('OF')}{s()}{w('OPERATION')}{s()}{w('AND')}{s()}{w('FINANCIAL')}{s()}{w('CONDITION')}S?\W"])
# Market-risk disclosures: "Quantitative and Qualitative" and the swapped order are both accepted
item_ptrn3['item_7a'] = '|'.join([rf"\W{w('Quantitative')}{s()}(?:{w('A')}|{w('a')}){w('nd')}{s()}{w('Qualitative')}{s()}{w('Disclosure')}{s()}(?:{w('A')}|{w('a')}){w('bout')}{s()}{w('Market')}{s()}{w('Risk')}s?\W",
                                  rf"\W{w('QUANTITATIVE')}{s()}{w('AND')}{s()}{w('QUALITATIVE')}{s()}{w('DISCLOSURE')}{s()}{w('ABOUT')}{s()}{w('MARKET')}{s()}{w('RISK')}S?\W",
                                  rf"\W{w('Qualitative')}{s()}(?:{w('A')}|{w('a')}){w('nd')}{s()}{w('Quantitative')}{s()}{w('Disclosure')}{s()}(?:{w('A')}|{w('a')}){w('bout')}{s()}{w('Market')}{s()}{w('Risk')}s?\W",
                                  rf"\W{w('QUALITATIVE')}{s()}{w('AND')}{s()}{w('QUANTITATIVE')}{s()}{w('DISCLOSURE')}{s()}{w('ABOUT')}{s()}{w('MARKET')}{s()}{w('RISK')}S?\W"])
item_ptrn3['item_8'] = r'|'.join([rf"\W{w('Financial')}{s()}{w('Statement')}s?{s()}(?:{w('A')}|{w('a')}){w('nd')}{s()}{w('Supplementary')}{s()}{w('Data')}\W",
                                  rf"\W{w('FINANCIAL')}{s()}{w('STATEMENT')}S?{s()}{w('AND')}{s()}{w('SUPPLEMENTARY')}{s()}{w('DATA')}\W"])
# Item 9 heading: "Changes in and Disagreements with Accountants on Accounting and
# Financial Disclosure" (title-case and upper-case alternatives).
# Every pair of adjacent words in these patterns is joined by the s() separator helper; the
# separator between "On" and "Accounting" was missing, so the two words had to be glued
# together for the heading to match. It is restored in both alternatives.
item_ptrn3['item_9'] = r'|'.join([rf"\W{w('Change')}{s()}(?:{w('I')}|{w('i')}){w('n')}{s()}(?:{w('A')}|{w('a')}){w('nd')}{s()}{w('Disagreement')}{s()}(?:{w('W')}|{w('w')}){w('ith')}{s()}{w('Accountant')}{s()}(?:{w('O')}|{w('o')}){w('n')}{s()}{w('Accounting')}{s()}(?:{w('A')}|{w('a')}){w('nd')}{s()}{w('Financial')}{s()}{w('Disclosure')}s?\W",
                                  rf"\W{w('CHANGE')}{s()}{w('IN')}{s()}{w('AND')}{s()}{w('DISAGREEMENT')}{s()}{w('WITH')}{s()}{w('ACCOUNTANT')}{s()}{w('ON')}{s()}{w('ACCOUNTING')}{s()}{w('AND')}{s()}{w('FINANCIAL')}{s()}{w('DISCLOSURE')}S?\W"])
# Title-only heading patterns for Items 9A to 13; each entry is a title-case alternative
# and an upper-case alternative joined with '|'.
item_ptrn3['item_9a'] = r'|'.join([rf"\W{w('Control')}s?{s()}(?:{w('A')}|{w('a')}){w('nd')}{s()}{w('Procedure')}s?\W",
                                   rf"\W{w('CONTROL')}S?{s()}{w('AND')}{s()}{w('PROCEDURE')}S?\W"])
item_ptrn3['item_9b'] = r'|'.join([rf"\W{w('Other')}{s()}{w('Information')}\W",
                                   rf"\W{w('OTHER')}{s()}{w('INFORMATION')}\W"])
item_ptrn3['item_10'] = r'|'.join([rf"\W{w('Director')}{s()}{w('Executive')}{s()}{w('Officer')}{s()}(?:{w('A')}|{w('a')}){w('nd')}{s()}{w('Corporate')}{s()}{w('Governance')}s?\W",
                                   rf"\W{w('DIRECTOR')}{s()}{w('EXECUTIVE')}{s()}{w('OFFICER')}{s()}{w('AND')}{s()}{w('CORPORATE')}{s()}{w('GOVERNANCE')}S?\W"])
item_ptrn3['item_11'] = r'|'.join([rf"\W{w('Executive')}{s()}{w('Compensation')}s?\W",
                                   rf"\W{w('EXECUTIVE')}{s()}{w('COMPENSATION')}S?\W"])
item_ptrn3['item_12'] = r'|'.join([rf"\W{w('Security')}{s()}{w('Ownership')}{s()}(?:{w('O')}|{w('o')}){w('f')}{s()}{w('Certain')}{s()}{w('Beneficial')}{s()}{w('Owner')}{s()}(?:{w('A')}|{w('a')}){w('nd')}{s()}{w('Management')}{s()}(?:{w('A')}|{w('a')}){w('nd')}{s()}{w('Related')}{s()}{w('Stockholder')}{s()}{w('Matter')}s?\W",
                                   rf"\W{w('SECURITY')}{s()}{w('OWNERSHIP')}{s()}{w('OF')}{s()}{w('CERTAIN')}{s()}{w('BENEFICIAL')}{s()}{w('OWNER')}{s()}{w('AND')}{s()}{w('MANAGEMENT')}{s()}{w('AND')}{s()}{w('RELATED')}{s()}{w('STOCKHOLDER')}{s()}{w('MATTER')}S?\W"])
item_ptrn3['item_13'] = r'|'.join([rf"\W{w('Certain')}{s()}{w('Relationship')}{s()}(?:{w('A')}|{w('a')}){w('nd')}{s()}{w('Related')}{s()}{w('Transaction')}{s()}(?:{w('A')}|{w('a')}){w('nd')}{s()}{w('Director')}{s()}{w('Independence')}\W",
                                   rf"\W{w('CERTAIN')}{s()}{w('RELATIONSHIP')}{s()}{w('AND')}{s()}{w('RELATED')}{s()}{w('TRANSACTION')}{s()}{w('AND')}{s()}{w('DIRECTOR')}{s()}{w('INDEPENDENCE')}\W"])
# Item 14 heading: "Principal Accountant/Accounting Fees and Services".
# The upper-case alternative previously spelled the second suffix 'IND', so
# "PRINCIPAL ACCOUNTING FEES AND SERVICES" could never match; it now mirrors the
# title-case alternative ('ant' | 'ing').
item_ptrn3['item_14'] = r'|'.join([rf"\W{w('Principal')}{s()}{w('Account')}(?:{w('ant')}|{w('ing')}){s()}{w('Fee')}{s()}(?:{w('A')}|{w('a')}){w('nd')}{s()}{w('Service')}s?\W",
                                   rf"\W{w('PRINCIPAL')}{s()}{w('ACCOUNT')}(?:{w('ANT')}|{w('ING')}){s()}{w('FEE')}{s()}{w('AND')}{s()}{w('SERVICE')}S?\W"])
# Item 15 heading: "Exhibits, Financial Statement Schedules" (title-case | upper-case)
item_ptrn3['item_15'] = r'|'.join([rf"\W{w('Exhibits')}{s()}{w('Financial')}{s()}{w('Statement')}{s()}{w('Schedule')}s?\W",
                                   rf"\W{w('EXHIBITS')}{s()}{w('FINANCIAL')}{s()}{w('STATEMENT')}{s()}{w('SCHEDULE')}S?\W"])

In [15]:
%%time
"""
Given a document, extract start and end position of each Item
"""

def dedup_pos(pos):
    """
    Drop (start, end) pairs whose start offset has already been seen.

    The first pair for each start offset is kept and the input order is preserved.
    The result is a list of numpy records, each indexable as rec[0] / rec[1].
    """
    starts = []
    ends = []
    for span in pos:
        starts.append(span[0])
        ends.append(span[1])
    frame = pd.DataFrame({0: starts, 1: ends})
    unique_frame = frame.drop_duplicates(subset=[0])
    return list(unique_frame.to_records(index=False))

def find_item_pos(doc, log_mode=False):
    """
    Locate the start and end character offsets of every Item heading in ``doc``.

    For each key of ``item_ptrn1`` the heading is searched with up to three pattern
    families (``item_ptrn1``, then ``item_ptrn2``, then ``item_ptrn3``). A later family is
    only tried when nothing has been found so far, or when the single hit sits within the
    first 7000 characters of a document that mentions a table of contents. Hits that look
    like a table-of-contents entry or an in-text reference are discarded, and for every
    item the surviving hit that is followed by the longest stretch of text is kept.

    Parameters
    ----------
    doc : str
        Document text to scan.
    log_mode : bool, default False
        When True, every step is reported through ``log``.

    Returns
    -------
    pandas.DataFrame
        Exactly one row holding ``{item}_pos_start`` and ``{item}_pos_end`` for every item
        (both 0 when the item was not found) plus ``full_doc_len``.
    """
    verbose = log_mode == True

    # Table-of-contents detection does not depend on the item, so it is done once instead
    # of re-scanning a lower-cased copy of the whole document several times per item.
    toc_ptrn = rf"{w('table')}{s()}{w('of')}{s()}{w('content')}"
    has_toc = len(re.findall(toc_ptrn, doc.lower())) > 0

    def needs_fallback(found):
        # nothing found yet, or only one early hit that is probably the table of contents
        return len(found) == 0 or (len(found) == 1 and has_toc and found[0][0] < 7000)

    # context that marks a hit as a reference to the item rather than its heading
    pre_ptrn = r"""(\W"$|\W“$|('s\W)$|\Wsee\W$|\Win\W$|\Wthe\W$|\Wour\W$|\Wthis\W$|\Wwithin\W$|\Wherein\W$|\Wrefer to\W$|\Wreferring\W$)"""
    suf_ptrn = r"""(^\Wshould\W|^\Wshall\W|^\Wmust\W|^\Wwas\W|^\Wwere\W|^\Whas\W|^\Whad\W|^\Wis\W|^\Ware\W)"""

    item_pos = []

    # loop through all items
    for item in item_ptrn1:

        # pattern 1 (normal + upper)
        # NOTE(review): str.upper() also upper-cases regex escapes (e.g. \s becomes \S);
        # confirm item_ptrn1 / item_ptrn2 contain no lower-case escape classes.
        pos = [(m.start(), m.end()) for m in re.finditer(item_ptrn1[item], doc)] \
            + [(m.start(), m.end()) for m in re.finditer(item_ptrn1[item].upper(), doc)]
        pos = dedup_pos(pos)
        if verbose:
            log(f'[{item}] After attempt 1 yielded {len(pos)} matches')

        # pattern 2 ("Item" as optional, normal + upper)
        if needs_fallback(pos):
            pos = pos \
                + [(m.start(), m.end()) for m in re.finditer(item_ptrn2[item], doc)] \
                + [(m.start(), m.end()) for m in re.finditer(item_ptrn2[item].upper(), doc)]
            pos = dedup_pos(pos)
            if verbose:
                log(f'[{item}] After attempt 2 yielded {len(pos)} matches')

        # pattern 3
        if needs_fallback(pos):
            pos = pos + [(m.start(), m.end()) for m in re.finditer(item_ptrn3[item], doc)]
            pos = dedup_pos(pos)
            if verbose:
                log(f'[{item}] After attempt 3 yielded {len(pos)} matches')

        # remove first entry due to table of contents
        if len(pos) >= 2 and has_toc and pos[0][0] < 6000 and item != 'item_1':
            pos = pos[1:]
            if verbose:
                log(f'[{item}] Removed first result due to Table of Contents')

        # remove occurrences due to references
        pos_filtered = []
        for p in pos:
            match = doc[p[0]:p[1]]
            # clamp at 0: a negative slice start would wrap around to the end of the
            # document and hide the real preceding text for hits in the first 20 characters
            pre = doc[max(p[0]-20, 0):p[0]].lower()
            suf = doc[p[1]:p[1]+20].lower()
            if verbose:
                log(f'[{item}] pos {p} : <<{pre}....{match}....{suf}>>')
            if re.search(pre_ptrn, pre) or re.search(suf_ptrn, suf):
                if verbose:
                    log(f'[{item}] removed the above match')
            else:
                pos_filtered.append(p)
        pos = pos_filtered

        # save position as dataframe
        pos = pd.DataFrame({'item':[item]*len(pos), 'pos_start':[x[0] for x in pos]})
        item_pos.append(pos)

    # combine positions for all items
    item_pos = pd.concat(item_pos).sort_values('pos_start').reset_index(drop=True)
    # each hit ends where the next hit (of any item) starts; the last one ends with the document
    item_pos['pos_end'] = item_pos.pos_start.shift(-1).fillna(len(doc))
    # define length
    item_pos['len'] = item_pos.pos_end - item_pos.pos_start
    # for each item, select the match with longest length (ties: the later position wins)
    item_pos = item_pos.sort_values(['item','len','pos_start'], ascending=[True,False,False]).drop_duplicates(subset=['item']).sort_values('pos_start')
    # pivot to a single wide row: one start/end column pair per item
    item_pos = pd.concat([item_pos[item_pos.item==item][['pos_start','pos_end']].reset_index(drop=True).rename(columns={'pos_start':f'{item}_pos_start','pos_end':f'{item}_pos_end'}) for item in item_ptrn1], axis=1)
    # fillna with zero
    item_pos = item_pos.fillna(0).astype(int)
    # if item_pos is empty due to no item found, put all zeros as a row
    if item_pos.shape[0] == 0:
        item_pos.loc[0,:] = [0] * 2 * len(item_ptrn1)
    # record the full document length
    item_pos['full_doc_len'] = len(doc)
    # check if non empty df is returned
    assert item_pos.shape[0]==1
    return item_pos


# function to sample check item extraction quality
def show_item(doc_dict):
    """Print the first and last 100 characters of every extracted item for a quick visual check."""
    n = 100
    for item in item_ptrn1:
        text = doc_dict[item]
        head, tail = text[:n], text[-n:]
        print(f'{item}: {head}........{tail}')
    return

CPU times: user 7 µs, sys: 1e+03 ns, total: 8 µs
Wall time: 11.9 µs


In [16]:
# urls = ['https://www.sec.gov/Archives/edgar/data/1166691/000119312508034239/d10k.htm',
#        'https://www.sec.gov/Archives/edgar/data/922224/000092222411000029/form10k.htm',
#        'https://www.sec.gov/Archives/edgar/data/1283699/000119312511051403/d10k.htm']
# docs = {}
# for i in range(len(urls)):

#     url = urls[i]
#     doc_id = i
#     txt = requests.get(url, headers={"user-agent": f"chan_tai_man_{int(float(np.random.rand(1)) * 1e7)}@gmail.com"}).text

#     # clean doc, extract items
#     txt = soup = BeautifulSoup(txt, 'lxml').get_text('|', strip=True)
#     txt = clean_doc1(txt)
#     item_pos = find_item_pos(txt)
#     doc_dict = {}
#     doc_dict['full'] = txt[item_pos.iloc[0]['item_1_pos_start'] :]
#     for item in item_ptrn1:
#         doc_dict[item] = txt[item_pos.iloc[0][f'{item}_pos_start'] : item_pos.iloc[0][f'{item}_pos_end']]
#     for x in doc_dict:
#         doc_dict[x] = clean_doc2(doc_dict[x])
#     docs[doc_id] = doc_dict

# Signal Extraction

In [17]:
'''
Download NLP pretrained models
'''

# if mode in ['full','gpu']:
#     !pip install sentence-transformers
#     from sentence_transformers import SentenceTransformer
#     st_model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')

#     from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
#     import torch
#     fb_tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
#     fb_model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
#     fb_model = fb_model.to("cuda:0")

# if mode in ['full','wv']:
# import gensim.downloader as api
# for i in range(20):
#     try:
#         wv = api.load('word2vec-google-news-300')
#         break
#     except:
#         continue

'\nDownload NLP pretrained models\n'

In [18]:
# '''
# Loughran and McDonald’s Master Dictionary
# '''
# # load Loughran and McDonald’s Master Dictionary (2020)
# master_dict = pd.read_csv('../input/loughranmcdonald-masterdictionary-2020/LoughranMcDonald_MasterDictionary_2020.csv')
# master_dict.columns = ['_'.join([y.lower() for y in x.split()]) for x in master_dict.columns]
# master_dict.word = master_dict.word.str.lower()

# # extract specific word lists
# negative_word_list = master_dict.loc[lambda x: x.negative!=0].word.tolist()
# positive_word_list = master_dict.loc[lambda x: x.positive!=0].word.tolist()
# uncertainty_word_list = master_dict.loc[lambda x: x.uncertainty!=0].word.tolist()
# litigious_word_list = master_dict.loc[lambda x: x.litigious!=0].word.tolist()
# strong_modal_word_list = master_dict.loc[lambda x: x.strong_modal!=0].word.tolist()
# weak_modal_word_list = master_dict.loc[lambda x: x.weak_modal!=0].word.tolist()
# constraining_word_list = master_dict.loc[lambda x: x.constraining!=0].word.tolist()
# complexity_word_list = master_dict.loc[lambda x: x.complexity!=0].word.tolist()

In [19]:
'''
Change in length
'''
def _log_len_change(docs, key, name):
    """
    Negated change in log character length of ``doc_dict[key]`` between consecutive documents.

    Parameters
    ----------
    docs : dict
        Maps a document id to a dict of text sections; iteration order defines "consecutive".
    key : str
        Section to measure (e.g. 'full', 'item_1a').
    name : str
        Name given to the returned Series.

    Returns
    -------
    pandas.Series
        ``-(log(len_t) - log(len_{t-1}))``; the first element is NaN. An empty section has
        length 0, so its log is -inf and the resulting change is +/-inf or NaN.
    """
    # float dtype keeps the Series numeric even when ``docs`` is empty
    lengths = pd.Series([len(doc_dict[key]) for doc_dict in docs.values()], dtype=float)
    # sign is flipped so that a document getting longer produces a negative value
    return (np.log(lengths).diff() * -1).rename(name)

# full doc
def gen_feat_ch_full_len(docs):
    """Negated log-length change of the full document text."""
    return _log_len_change(docs, 'full', 'feat_ch_full_len')

# Item 1A - Risk Factors
def gen_feat_ch_item_1a_len(docs):
    """Negated log-length change of Item 1A."""
    return _log_len_change(docs, 'item_1a', 'feat_ch_item_1a_len')

# Item 1B - Unresolved Staff Comments
def gen_feat_ch_item_1b_len(docs):
    """Negated log-length change of Item 1B."""
    return _log_len_change(docs, 'item_1b', 'feat_ch_item_1b_len')

# Item 3 - Legal Proceedings
def gen_feat_ch_item_3_len(docs):
    """Negated log-length change of Item 3."""
    return _log_len_change(docs, 'item_3', 'feat_ch_item_3_len')

In [20]:
'''
Document Similarity
'''
# full doc, cosine similarity, 1 gram
def gen_feat_full_cos_1gram(docs):
    """
    Cosine similarity between each document's full text and the previous one.

    Texts are vectorised with the module-level ``global_tfidf_1g`` model. The first
    element is NaN because it has no predecessor.
    """
    doc_list = [doc_dict['full'] for doc_dict in docs.values()]
    tf_vectors = global_tfidf_1g.transform(doc_list)
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0
    feat = pd.Series([cosine_similarity(tf_vectors[i-1:i+1,:])[0][1] if i > 0 else np.nan for i in range(len(doc_list))])
    return feat.rename('feat_full_cos_1gram')

# full doc, cosine similarity, 2 gram
def gen_feat_full_cos_2gram(docs):
    """
    Cosine similarity between each document's full text and the previous one.

    Texts are vectorised with the module-level ``global_tfidf_2g`` model. The first
    element is NaN because it has no predecessor.
    """
    doc_list = [doc_dict['full'] for doc_dict in docs.values()]
    tf_vectors = global_tfidf_2g.transform(doc_list)
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0
    feat = pd.Series([cosine_similarity(tf_vectors[i-1:i+1,:])[0][1] if i > 0 else np.nan for i in range(len(doc_list))])
    return feat.rename('feat_full_cos_2gram')

# full doc, jaccard similarity, 1 gram
def gen_feat_full_jac_1gram(docs):
    """
    Jaccard similarity between each document's full text and the previous one.

    A binary (presence/absence) unigram vocabulary is fitted on the documents passed in;
    only lower-case alphabetic tokens of at least three letters are counted. The first
    element is NaN because it has no predecessor.
    """
    doc_list = [doc_dict['full'] for doc_dict in docs.values()]
    vectorizer = CountVectorizer(ngram_range=(1,1), binary=True, token_pattern=r"(?u)\b[a-z]{3,}\b")
    tf_vectors = vectorizer.fit_transform(doc_list)
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0
    feat = pd.Series([jaccard_score(tf_vectors[i-1,:].toarray().flatten(), tf_vectors[i,:].toarray().flatten()) if i > 0 else np.nan for i in range(len(doc_list))])
    return feat.rename('feat_full_jac_1gram')

# full doc, jaccard similarity, 2 gram
def gen_feat_full_jac_2gram(docs):
    """
    Jaccard similarity between each document's full text and the previous one.

    A binary (presence/absence) vocabulary of unigrams and bigrams (``ngram_range=(1,2)``)
    is fitted on the documents passed in; only lower-case alphabetic tokens of at least
    three letters are used. The first element is NaN because it has no predecessor.
    """
    doc_list = [doc_dict['full'] for doc_dict in docs.values()]
    vectorizer = CountVectorizer(ngram_range=(1,2), binary=True, token_pattern=r"(?u)\b[a-z]{3,}\b")
    tf_vectors = vectorizer.fit_transform(doc_list)
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0
    feat = pd.Series([jaccard_score(tf_vectors[i-1,:].toarray().flatten(), tf_vectors[i,:].toarray().flatten()) if i > 0 else np.nan for i in range(len(doc_list))])
    return feat.rename('feat_full_jac_2gram')

In [21]:
'''
Dictionary based sentiment (Loughran and McDonald)
'''
# net positive words change in proportion
def gen_feat_lm_postive(docs):
    """
    Change in net positive tone of the full text between consecutive documents.

    Tone is (count of positive words - count of negative words) divided by the document's
    character length; the feature is the first difference of that ratio, so the first
    element is NaN.

    Relies on the module-level ``positive_word_list`` and ``negative_word_list``.
    NOTE(review): the cell that builds these lists is commented out in this notebook, so
    they must be defined elsewhere before this function is called.
    """
    doc_list = [doc_dict['full'] for doc_dict in docs.values()]
    doc_len = pd.Series([len(x) for x in doc_list])
    vectorizer = CountVectorizer(ngram_range=(1,1), binary=False, token_pattern=r"(?u)\b[a-z]{3,}\b").fit(doc_list)
    tf_vectors = vectorizer.transform(doc_list)
    # test membership on the vocabulary dict itself; rebuilding list(keys()) for every
    # dictionary word made this lookup quadratic
    vocab = vectorizer.vocabulary_
    pos_target_cols = [vocab[x] for x in positive_word_list if x in vocab]
    neg_target_cols = [vocab[x] for x in negative_word_list if x in vocab]
    feat = pd.Series([(tf_vectors[i,pos_target_cols].sum() - tf_vectors[i,neg_target_cols].sum()) / doc_len[i] for i in range(len(doc_list))]).diff()
    return feat.rename('feat_lm_postive')

# uncertainty words change in proportion
def gen_feat_lm_uncertainty(docs):
    """
    Negated change in the share of uncertainty words between consecutive documents.

    The share is the count of words from the module-level ``uncertainty_word_list``
    divided by the document's character length; the first difference is multiplied by -1,
    so an increase in uncertainty words gives a negative value. The first element is NaN.
    NOTE(review): the cell that builds the word list is commented out in this notebook.
    """
    doc_list = [doc_dict['full'] for doc_dict in docs.values()]
    doc_len = pd.Series([len(x) for x in doc_list])
    vectorizer = CountVectorizer(ngram_range=(1,1), binary=False, token_pattern=r"(?u)\b[a-z]{3,}\b").fit(doc_list)
    tf_vectors = vectorizer.transform(doc_list)
    # dict membership instead of rebuilding list(keys()) for every dictionary word
    vocab = vectorizer.vocabulary_
    target_cols = [vocab[x] for x in uncertainty_word_list if x in vocab]
    feat = pd.Series([tf_vectors[i,target_cols].sum() / doc_len[i] for i in range(len(doc_list))]).diff()
    feat = feat * -1
    return feat.rename('feat_lm_uncertainty')

# litigious words change in proportion
def gen_feat_lm_litigious(docs):
    """
    Negated change in the share of litigious words between consecutive documents.

    The share is the count of words from the module-level ``litigious_word_list`` divided
    by the document's character length; the first difference is multiplied by -1, so an
    increase in litigious words gives a negative value. The first element is NaN.
    NOTE(review): the cell that builds the word list is commented out in this notebook.
    """
    doc_list = [doc_dict['full'] for doc_dict in docs.values()]
    doc_len = pd.Series([len(x) for x in doc_list])
    vectorizer = CountVectorizer(ngram_range=(1,1), binary=False, token_pattern=r"(?u)\b[a-z]{3,}\b").fit(doc_list)
    tf_vectors = vectorizer.transform(doc_list)
    # dict membership instead of rebuilding list(keys()) for every dictionary word
    vocab = vectorizer.vocabulary_
    target_cols = [vocab[x] for x in litigious_word_list if x in vocab]
    feat = pd.Series([tf_vectors[i,target_cols].sum() / doc_len[i] for i in range(len(doc_list))]).diff()
    feat = feat * -1
    return feat.rename('feat_lm_litigious')

In [22]:
'''
Sentence encoding
'''
def gen_feat_sen_enc(docs):
    """
    Cosine similarity between mean sentence embeddings of consecutive documents.

    Each full text is split into sentences; sentences of 30 to 1000 characters that start
    with a lower-case letter are encoded with the module-level ``st_model`` and averaged
    into one vector per document. The first element of the result is NaN.
    """
    doc_list = [doc_dict['full'] for doc_dict in docs.values()]
    vecs = []
    for doc in doc_list:
        # re.match anchors at the start, so only sentences beginning with a-z are kept
        sen_list = [x for x in tokenize.sent_tokenize(doc) if len(x)>=30 and len(x)<=1000 and re.match(r'[a-z]', x)]
        vecs.append(st_model.encode(sentences=sen_list, show_progress_bar=False).mean(axis=0).flatten())
    # stack the per-document vectors into a (n_docs, dim) array
    vecs = np.concatenate(vecs).reshape(-1, vecs[0].shape[0])
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0
    feat = pd.Series([cosine_similarity(vecs[i-1:i+1,:])[0][1] if i > 0 else np.nan for i in range(len(doc_list))])
    return feat.rename('feat_sen_enc')


'''
Finbert Sentiment on Item 1A & 7
'''
def gen_feat_item_sentiment(docs):
    """
    Change in the average class-0 probability of Item 1A + Item 7 sentences.

    Sentences of 30 to 1000 characters that start with a lower-case letter are scored in
    batches of 8 with the module-level ``fb_tokenizer`` / ``fb_model`` on ``cuda:0``. The
    per-document score is the mean softmax probability of class index 0 (presumably the
    "positive" label -- confirm against the model's label mapping). Documents without any
    usable sentence inherit the previous score (forward fill) before differencing.
    """
    doc_list = [doc_dict['item_1a'] + '.' + doc_dict['item_7'] for doc_dict in docs.values()]
    sentiment = []
    for doc in doc_list:
        sen_list = [x for x in tokenize.sent_tokenize(doc) if len(x)>=30 and len(x)<=1000 and re.match(r'[a-z]', x)]
        if len(sen_list)==0:
            # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0
            sentiment.append(np.nan)
            continue
        batch_size = 8
        n_batch = math.ceil(len(sen_list)/batch_size)
        sentiment_sum = 0
        for i in range(n_batch):
            inputs = fb_tokenizer(sen_list[batch_size*i:batch_size*(i+1)], padding=True, truncation=True, return_tensors='pt').to("cuda:0")
            with torch.no_grad():
                outputs = fb_model(**inputs)
            sentiment_sum += float(torch.nn.functional.softmax(outputs.logits, dim=-1)[:,0].sum())
            torch.cuda.empty_cache()
        sentiment.append(sentiment_sum / len(sen_list))
    feat = pd.Series(sentiment).ffill().diff()
    return feat.rename('feat_item_sentiment')


'''
Finbert Sentiment on Forward-Looking Statements
'''
def gen_feat_fls_sentiment(docs):
    """
    Change in the average class-0 probability of forward-looking sentences in Item 1A + Item 7.

    Works like the plain item sentiment feature but only keeps sentences that contain at
    least one forward-looking keyword from ``fls_ptrn``. Sentences are scored in batches of
    8 with the module-level ``fb_tokenizer`` / ``fb_model`` on ``cuda:0``; the per-document
    score is the mean softmax probability of class index 0 (presumably the "positive"
    label -- confirm against the model's label mapping). Documents without any usable
    sentence inherit the previous score (forward fill) before differencing.
    """
    doc_list = [doc_dict['item_1a'] + '.' + doc_dict['item_7'] for doc_dict in docs.values()]
    fls_ptrn = r"(\baim\b|\banticipate\b|\bbelieve\b|\bcould\b|\bcontinue\b|\bestimate\b|\bexpansion\b|\bexpect\b|\bexpectation\b|\bexpected to be\b|\bfocus\b|\bforecast\b|\bgoal\b|\bgrow\b|\bguidance\b|\bintend\b|\binvest\b|\bis expected\b|\bmay\b|\bobjective\b|\bplan\b|\bpriority\b|\bproject\b|\bstrategy\b|\bto be\b|\bwe'll\b|\bwill\b|\bwould\b)"
    sentiment = []
    for doc in doc_list:
        sen_list = [x for x in tokenize.sent_tokenize(doc) if len(x)>=30 and len(x)<=1000 and re.match(r'[a-z]', x) and re.search(fls_ptrn, x)]
        if len(sen_list)==0:
            # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0
            sentiment.append(np.nan)
            continue
        batch_size = 8
        n_batch = math.ceil(len(sen_list)/batch_size)
        sentiment_sum = 0
        for i in range(n_batch):
            inputs = fb_tokenizer(sen_list[batch_size*i:batch_size*(i+1)], padding=True, truncation=True, return_tensors='pt').to("cuda:0")
            with torch.no_grad():
                outputs = fb_model(**inputs)
            sentiment_sum += float(torch.nn.functional.softmax(outputs.logits, dim=-1)[:,0].sum())
            torch.cuda.empty_cache()
        sentiment.append(sentiment_sum / len(sen_list))
    feat = pd.Series(sentiment).ffill().diff()
    return feat.rename('feat_fls_sentiment')


'''
Word2Vec
'''
def gen_feat_word2vec(docs):
    """
    Cosine similarity between TF-IDF-weighted average word vectors of consecutive documents.

    Each full text is vectorised with the module-level ``global_tfidf_1g`` model, restricted
    to the columns in ``tfidf_1g_wv_idx``. The weights of every document are normalised to
    sum to 1 and used to average the 300-dimensional vectors ``wv[word]`` of the matching
    words in ``tfidf_1g_wv_word``. The first element of the result is NaN.

    NOTE(review): the preprocessing cell deletes ``wv`` after extracting ``wv_subset``;
    make sure a ``wv`` lookup is in scope before calling this function.
    """
    doc_list = [doc_dict['full'] for doc_dict in docs.values()]
    # densify explicitly: dividing a sparse matrix by its row sums returns different
    # container types depending on the SciPy version
    weights = global_tfidf_1g.transform(doc_list)[:,tfidf_1g_wv_idx].toarray()
    weights = weights / weights.sum(axis=1, keepdims=True)
    avg_vecs = np.zeros((len(doc_list), 300))
    for i in range(len(doc_list)):
        # zero weights contribute nothing, so only the non-zero columns are visited
        for j in np.flatnonzero(weights[i]):
            avg_vecs[i] += weights[i,j] * wv[tfidf_1g_wv_word[j]]
    # compare the averaged embeddings; the cosine was previously taken on the TF-IDF
    # weights themselves, which left avg_vecs unused
    feat = pd.Series([cosine_similarity(avg_vecs[i-1:i+1,:])[0][1] if i > 0 else np.nan for i in range(len(doc_list))])
    return feat.rename('feat_word2vec')

# Run All per CIK

In [23]:
def gen_signal(cik):
    """
    Download every 10-K of one company and compute its text features.

    The filings of ``cik`` are taken from the module-level ``master_idx`` in filing-date
    order, fetched from ``url_10k``, cleaned with ``clean_doc1`` / ``clean_doc2`` and split
    into items with ``find_item_pos``. Which feature groups are computed depends on the
    module-level ``mode`` ('full', 'cpu' or 'gpu').

    Parameters
    ----------
    cik : value compared against ``master_idx.cik``

    Returns
    -------
    pandas.DataFrame
        One row per filing: ``doc_id`` followed by the generated ``feat_*`` columns.
    """
    log(f'Started signal generation for CIK {cik}')
    df = master_idx.loc[lambda x: x.cik==cik].sort_values('filing_date').reset_index(drop=True)
    docs = {}
    for i in range(len(df)):

        # load 10-K text from EDGAR html url
        url = df.iloc[i]['url_10k']
        doc_id = df.iloc[i]['doc_id']
        # np.random.rand() returns a plain float; float() on a 1-element array is deprecated
        txt = requests.get(url, headers={"user-agent": f"chan_tai_man_{int(np.random.rand() * 1e7)}@gmail.com"}).text

        # clean doc, extract items
        txt = BeautifulSoup(txt, 'lxml').get_text('|', strip=True)
        txt = clean_doc1(txt)
        item_row = find_item_pos(txt).iloc[0]
        doc_dict = {}
        # "full" is everything from the start of Item 1 onwards
        doc_dict['full'] = txt[item_row['item_1_pos_start'] :]
        for item in item_ptrn1:
            doc_dict[item] = txt[item_row[f'{item}_pos_start'] : item_row[f'{item}_pos_end']]
        for x in doc_dict:
            doc_dict[x] = clean_doc2(doc_dict[x])
        docs[doc_id] = doc_dict

    # generate signal
    feat_vecs = [pd.Series(list(docs.keys())).rename('doc_id')]
    if mode in ['full','cpu']:
        feat_vecs += [gen_feat_ch_full_len(docs),
                        gen_feat_ch_item_1a_len(docs),
                        gen_feat_ch_item_1b_len(docs),
                        gen_feat_ch_item_3_len(docs),
                        gen_feat_full_cos_1gram(docs),
                        gen_feat_full_cos_2gram(docs),
                        gen_feat_full_jac_1gram(docs),
                        gen_feat_full_jac_2gram(docs),
                        gen_feat_lm_postive(docs),
                        gen_feat_lm_uncertainty(docs),
                        gen_feat_lm_litigious(docs),
                        gen_feat_word2vec(docs)]
    if mode in ['full','gpu']:
        feat_vecs += [gen_feat_sen_enc(docs),
                        gen_feat_item_sentiment(docs),
                        gen_feat_fls_sentiment(docs)]
    # every feature Series shares the same 0..n-1 index, so they line up column-wise
    feats = pd.concat(feat_vecs, axis=1)
    log(f'Completed signal generation for CIK {cik}')
    return feats

In [24]:
# %%time

# # generate signal per CIK
# feats = Parallel(n_jobs=-1)(delayed(gen_signal)(cik) for cik in master_idx.cik.unique())
# feats = pd.concat(feats).sort_values('doc_id').reset_index(drop=True)

# # map back to stock
# df = master_idx[['doc_id','cik','entity','filing_date']].drop_duplicates()
# feats = feats.merge(df, how='inner', on='doc_id')
# feats = feats.merge(cik_map, how='inner', on='cik')
# cols = [c for c in feats if c[:5]=='feat_']
# feats = feats[[c for c in feats if c not in cols] + cols]
# display(feats.loc[lambda x: x.isnull().sum(axis=1) > 0].groupby('cik')['doc_id'].count().loc[lambda x: x>1])
# display(feats.head())

# # export
# feats.to_csv('feats.csv', index=False)

In [25]:
# # show sample item extraction
# df = master_idx.sample(10).sort_values('filing_date').reset_index(drop=True)
# # df = master_idx.sort_values('filing_date').reset_index(drop=True)

# for i in range(len(df)):

#     print(df.iloc[i]['cik'])
#     print(df.iloc[i]['doc_id'])
#     print(df.iloc[i]['url_10k'])
    
#     # load 10-K text from EDGAR html url
#     url = df.iloc[i]['url_10k']
#     doc_id = df.iloc[i]['doc_id']
#     txt = requests.get(url, headers={"user-agent": f"chan_tai_man_{int(float(np.random.rand(1)) * 1e7)}@gmail.com"}).text

#     # clean doc, extract items
#     txt = soup = BeautifulSoup(txt, 'lxml').get_text('|', strip=True)
#     txt = clean_doc1(txt)
#     item_pos = find_item_pos(txt, log_mode=False)
#     doc_dict = {}
#     doc_dict['full'] = txt[item_pos.iloc[0]['item_1_pos_start'] :]
#     for item in item_ptrn1:
#         doc_dict[item] = txt[item_pos.iloc[0][f'{item}_pos_start'] : item_pos.iloc[0][f'{item}_pos_end']]
#     for x in doc_dict:
#         doc_dict[x] = clean_doc2(doc_dict[x])
#     show_item(doc_dict)

# Preprocessing Tasks

In [26]:
'''
Generate a global TFIDF model
'''
# sample and clean doc
# One row per CIK: sort by filing date within each CIK and take .last() of every group.
# (A first assignment that was immediately overwritten, and a dangling trailing
# line-continuation backslash, have been removed.)
master_idx_sampled = master_idx \
    .sort_values(['cik','filing_date']).reset_index(drop=True) \
    .groupby('cik').last().reset_index()

def download_doc(i):
    """
    Download and clean the filing in row ``i`` of ``master_idx_sampled``.

    Parameters
    ----------
    i : int
        Positional row index into the module-level ``master_idx_sampled``.

    Returns
    -------
    dict
        ``{'full': text from the start of Item 1 onwards, 'item_1': text of Item 1}``,
        both passed through ``clean_doc2``.
    """
    url = master_idx_sampled.iloc[i]['url_10k']
    # np.random.rand() returns a plain float; float() on a 1-element array is deprecated
    txt = requests.get(url, headers={"user-agent": f"chan_tai_man_{int(np.random.rand() * 1e7)}@gmail.com"}).text
    txt = BeautifulSoup(txt, 'lxml').get_text('|', strip=True)
    txt = clean_doc1(txt)
    item_row = find_item_pos(txt).iloc[0]
    item_1 = clean_doc2(txt[item_row['item_1_pos_start'] : item_row['item_1_pos_end']])
    full = clean_doc2(txt[item_row['item_1_pos_start'] :])
    log(f'Completed downloading doc {i}')
    return {'full':full, 'item_1':item_1}
# download one cleaned document per sampled CIK, keyed by CIK (sequential, one request at a time)
docs = {}
for i in range(len(master_idx_sampled)):
    cik = master_idx_sampled.iloc[i]['cik']
    docs[cik] = download_doc(i)
# corpus used to fit the global TF-IDF models: the "full" text of every sampled document
doc_list = [docs[cik]['full'] for cik in docs]

# build tfidf for 1 and 2 gram
# - tokens are lower-case alphabetic words of at least three letters
# - max_df=0.7 ignores terms that occur in more than 70% of the sampled documents
# - the "2g" model uses ngram_range=(1,2), i.e. unigrams and bigrams together
global_tfidf_1g = TfidfVectorizer(ngram_range=(1,1), norm='l2', min_df=0.0, max_df=0.7, use_idf=True, binary=False, token_pattern=r"(?u)\b[a-z]{3,}\b").fit(doc_list)
global_tfidf_2g = TfidfVectorizer(ngram_range=(1,2), norm='l2', min_df=0.0, max_df=0.7, use_idf=True, binary=False, token_pattern=r"(?u)\b[a-z]{3,}\b").fit(doc_list)
log(f'Vocab size of TFIDF (1-gram): {len(global_tfidf_1g.vocabulary_)}')
log(f'Vocab size of TFIDF (2-gram): {len(global_tfidf_2g.vocabulary_)}')

# persist the downloaded documents to the file 'docs', then release memory
save_pkl(docs, 'docs')
del doc_list, docs
gc.collect()

# load the pre-trained word2vec vectors from a local pickle
# (the commented block below is the alternative download through gensim's downloader)
import gensim.downloader as api
wv = load_pkl('../input/word2vecgooglenews300/wv')
# for i in range(20):
#     try:
#         wv = api.load('word2vec-google-news-300')
#         break
#     except:
#         continue

# get the column index for vocab overlapping with Word2Vec
wv_vocab_list = list(wv.key_to_index)
tfidf_vocab = global_tfidf_1g.vocabulary_
tfidf_vocab_swap = {v: k for k, v in tfidf_vocab.items()}
# membership is tested on wv.key_to_index itself rather than on the list copy above:
# scanning a list of the whole word2vec vocabulary once per TF-IDF term is a linear
# search repeated for every term
tfidf_1g_wv_idx = sorted(idx for word, idx in tfidf_vocab.items() if word in wv.key_to_index)
# words in the same order as their column indices
tfidf_1g_wv_word = [tfidf_vocab_swap[x] for x in tfidf_1g_wv_idx]
log(f'Vocab size of TFIDF overlapped with Word2Vec: {len(tfidf_1g_wv_idx)}')

# extract smaller word2vec dict (overlapping words only), then drop the full model
wv_subset = {w : wv[w] for w in tfidf_1g_wv_word}
del wv
gc.collect()

[2022-05-13 20:24:00] Completed downloading doc 0
[2022-05-13 20:24:04] Completed downloading doc 1
[2022-05-13 20:24:08] Completed downloading doc 2
[2022-05-13 20:24:14] Completed downloading doc 3
[2022-05-13 20:24:16] Completed downloading doc 4
[2022-05-13 20:24:21] Completed downloading doc 5
[2022-05-13 20:24:24] Completed downloading doc 6
[2022-05-13 20:24:31] Completed downloading doc 7
[2022-05-13 20:24:37] Completed downloading doc 8
[2022-05-13 20:24:52] Completed downloading doc 9
[2022-05-13 20:24:58] Completed downloading doc 10
[2022-05-13 20:25:03] Completed downloading doc 11
[2022-05-13 20:25:07] Completed downloading doc 12
[2022-05-13 20:25:10] Completed downloading doc 13
[2022-05-13 20:25:14] Completed downloading doc 14
[2022-05-13 20:25:19] Completed downloading doc 15
[2022-05-13 20:25:22] Completed downloading doc 16
[2022-05-13 20:25:23] Completed downloading doc 17
[2022-05-13 20:25:27] Completed downloading doc 18
[2022-05-13 20:25:33] Completed downloadi

0

In [27]:
'''
Count of 8-K filings
'''
# calendar of consecutive daily dates: 365*11 rows starting at 2008-01-01, one date per row
dates = pd.DataFrame({'filing_date': pd.date_range(start='2008-01-01', periods=365*11, freq='D')})

# count the rolling 1-year number of 8-K filings
# For every CIK the calendar is left-joined with that CIK's 8-K rows; `filed` is 1 on rows that
# matched a filing and 0 otherwise, and feat_cnt_8k is the sum of `filed` over the last 365 rows.
# The first 364 rows have an incomplete window (NaN) and are removed by dropna().
# NOTE(review): rolling(365) counts rows, not days. If master_idx_8k can hold several rows for
# the same (cik, filing_date), the merge repeats that date and the window spans fewer than 365
# calendar days -- verify the uniqueness of master_idx_8k.
feats_8k = []
for cik in master_idx_8k.cik.unique():
    df = pd.merge(dates, master_idx_8k.loc[lambda x: x.cik==cik], how='left', on='filing_date') \
        .assign(filed=lambda x: x.cik.notnull().astype(int)) \
        .assign(cik=cik) \
        .loc[:,['cik','filing_date','filed']] \
        .assign(feat_cnt_8k = lambda x: x.rolling(365).filed.sum()) \
        .dropna()
    feats_8k.append(df)
feats_8k = pd.concat(feats_8k).reset_index(drop=True)

# calculate Year-on-Year change
# Steps of the chain below: map cik -> stock via cik_map, compare each count with the value 365
# rows earlier for the same stock (same row-vs-day caveat as above), flip the sign of both the
# level and the difference, keep only dates present in ret.index, and drop rows that have no
# previous-year value.
feats_8k = feats_8k \
    .merge(cik_map, how='inner', on='cik') \
    .rename(columns={'filing_date':'date'}) \
    .loc[:,['stock','date','feat_cnt_8k']] \
    .sort_values(['stock','date']) \
    .assign(feat_cnt_8k_prev = lambda x: x.groupby('stock').feat_cnt_8k.shift(365)) \
    .assign(feat_cnt_8k_diff = lambda x: x.feat_cnt_8k - x.feat_cnt_8k_prev) \
    .assign(feat_cnt_8k = lambda x: x.feat_cnt_8k * -1,
            feat_cnt_8k_diff = lambda x: x.feat_cnt_8k_diff * -1) \
    .loc[lambda x: x.date.isin(ret.index), ['stock','date','feat_cnt_8k','feat_cnt_8k_diff']] \
    .dropna()
log(f'Shape of 8-K feats: {feats_8k.shape}')

[2022-05-13 21:50:19] Shape of 8-K feats: (1261848, 4)


In [28]:
# export prepared files
# Each object is pickled to a file named after its variable (no extension) in the current
# working directory.
# index / mapping tables
save_pkl(cik_map, 'cik_map')
save_pkl(master_idx, 'master_idx')
save_pkl(master_idx_10q, 'master_idx_10q')
save_pkl(master_idx_8k, 'master_idx_8k')
# 8-K count features
save_pkl(feats_8k, 'feats_8k')
# sampled filings and the global TF-IDF models fitted on them
save_pkl(master_idx_sampled, 'master_idx_sampled')
save_pkl(global_tfidf_1g, 'global_tfidf_1g')
save_pkl(global_tfidf_2g, 'global_tfidf_2g')
# TF-IDF columns / words that also exist in word2vec, and their vectors
save_pkl(tfidf_1g_wv_idx, 'tfidf_1g_wv_idx')
save_pkl(tfidf_1g_wv_word, 'tfidf_1g_wv_word')
save_pkl(wv_subset, 'wv_subset')