In [1]:
import pandas as pd
import numpy as np
from gensim.parsing.preprocessing import preprocess_string, remove_stopwords, strip_numeric, strip_punctuation, strip_short, stem_text

# Preprocess 10-K filings

In [2]:
# Per-year 10-K tables, keyed by the year that appears in the CSV file name.
data_by_year = {}
# filing_date is kept because it is (allegedly) useful for matching against the predictions,
# and for deciding which item 1A / item 7 text is the newest one.
relevant_cols = ["cik", "ticker", "filing_date", "item1a_risk", "item7_mda"]
path = "../Files/"

# For every year: read only the columns above, discard rows missing cik / item 1A / item 7
# as well as exact duplicate rows, parse the filing date, and tag each row with both the
# file's year and the calendar year of its filing date.
for year in range(2009, 2021):
    filings = (
        pd.read_csv(path + str(year) + ".csv", usecols=relevant_cols)
        .dropna(subset=['cik', 'item1a_risk', 'item7_mda'])
        .drop_duplicates()
    )
    filings["filing_date"] = pd.to_datetime(filings["filing_date"])
    filings["year"] = year
    filings["filing_year"] = pd.DatetimeIndex(filings["filing_date"]).year
    data_by_year[year] = filings

In [3]:
# Dirty logic for collapsing groups. Reformat as needed - currently pretty dumb
def collapse_cik_groups(grp):
    """Collapse every filing that shares one CIK into a single row.

    The previous version compared a hard-coded positional cell against both
    text columns and then returned the last row in either branch, trusting
    that the rows were already ordered by ``filing_date``. This version makes
    the "most recent filing wins" rule explicit instead of relying on the
    incoming row order.

    Args:
        grp: DataFrame with all rows of one ``groupby("cik")`` group.

    Returns:
        For groups with more than one row, the row (as a Series) with the
        latest ``filing_date``; rows whose ``filing_date`` is missing are only
        picked when no dated row exists, and ties keep the row that appears
        last in ``grp``. Groups with at most one row are returned through
        ``grp.squeeze()``, exactly as before.
    """
    if len(grp) <= 1:
        return grp.squeeze()

    if "filing_date" in grp.columns:
        # mergesort is stable, so already-sorted input (and ties) still yield
        # the same "last row" the original code returned.
        grp = grp.sort_values("filing_date", kind="mergesort", na_position="first")

    return grp.iloc[-1, :]

# Row counts per year, recorded before and after collapsing to one row per cik.
pre_grouping_sizes = []
post_grouping_sizes = []

for year in range(2009, 2021):
    yearly = data_by_year[year]
    pre_grouping_sizes.append(len(yearly))

    # One row per cik; the group index produced by apply() is discarded.
    collapsed = yearly.groupby("cik").apply(collapse_cik_groups).reset_index(drop=True)

    data_by_year[year] = collapsed
    post_grouping_sizes.append(len(collapsed))

In [4]:
# Report, per year, how many rows the per-cik collapse removed.
for year, pre_size, post_size in zip(range(2009, 2021), pre_grouping_sizes, post_grouping_sizes):
    num_lost = pre_size - post_size
    percent_lost = num_lost / pre_size
    display("{:n} lost {:.0%}, {} rows total".format(year, percent_lost, num_lost))

# Same figures aggregated over all years.
total_before = sum(pre_grouping_sizes)
num_lost = total_before - sum(post_grouping_sizes)
percent_lost = num_lost / total_before
display("In total lost {:.0%}, {} rows total".format(percent_lost, num_lost))

'2009 lost 2%, 14 rows total'

'2010 lost 2%, 51 rows total'

'2011 lost 2%, 58 rows total'

'2012 lost 2%, 56 rows total'

'2013 lost 2%, 60 rows total'

'2014 lost 2%, 68 rows total'

'2015 lost 2%, 76 rows total'

'2016 lost 2%, 73 rows total'

'2017 lost 2%, 83 rows total'

'2018 lost 2%, 86 rows total'

'2019 lost 3%, 87 rows total'

'2020 lost 3%, 62 rows total'

'In total lost 2%, 774 rows total'

In [5]:
# Stack the per-year tables (in dict insertion order) into one frame with a fresh 0..n-1 index
text_dfs = pd.concat([data_by_year[key] for key in data_by_year], ignore_index=True)
text_dfs

Unnamed: 0,cik,ticker,filing_date,item1a_risk,item7_mda,year,filing_year
0,1750,AIR,2009-07-16,> ITEM 1A. RISK FACTORS The following is a des...,> ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS...,2009,2009
1,2034,ACET,2009-09-11,>Item 1A. Risk factors You should carefully co...,>Item 7. Management s Discussion and Analysis ...,2009,2009
2,2491,BYI,2009-08-20,> ITEM 1A. RISK FACTORS Our business and inves...,> ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS...,2009,2009
3,2969,APD,2009-11-25,>ITEM 1A. RISK FACTORS The Company operates in...,>ITEM 7. MANAGEMENT S DISCUSSION AND ANALYSIS ...,2009,2009
4,3545,ALCO,2009-12-14,>Item 1A. Risk Factors. Alico s operations inv...,>Item 7. Management s Discussion and Analysis ...,2009,2009
...,...,...,...,...,...,...,...
36346,1796129,LSACU,2020-09-23,>ITEM 1A. RISK FACTORS 13,>ITEM 7. MANAGEMENT S DISCUSSION AND ANALYSIS ...,2020,2020
36347,1796587,NO_TICKER,2020-05-21,>Item 1A.Risk Factors.Item 2.Properties.,>Item 7.Management's Discussion and Analysis o...,2020,2020
36348,1797956,CPOW,2020-08-11,>Item 1A. Risk Factors. We discuss our expecta...,>Item 7. Management s Discussion and Analysis ...,2020,2020
36349,1799186,NONE,2020-06-25,">Item 1A, Risk Factors (c) Item 2, Properties ...",">Item 7, Management s Discussion and Analysis ...",2020,2020


# Load target data

In [6]:
# Load predictions

# Spreadsheet header -> snake_case name used in the rest of the notebook.
# Renaming by name (rather than assigning a positional list to .columns) stays correct even
# if the sheet's column order differs from this list: read_excel's usecols selects columns
# but does not reorder them.
prediction_col_names = {
    "PERMID": "perm_id",
    "CIK": "cik",
    "Ticker": "ticker",
    "year": "year",
    "FilingDate": "filing_date",
    "company_name": "company_name",
    "Dividend Payer": "is_dividend_payer",
    "DPS growth": "dps_change",
    "DPS cut": "is_dps_cut",
    "zEnvironmental": "z_environmental",
    "dEnvironmental": "d_environmental",
    "sector": "sector",
}
relevant_cols = list(prediction_col_names)

# skiprows=32 skips the first 32 rows of the "data" sheet
# (presumably non-tabular preamble above the header row; TODO confirm against the workbook).
predictions = pd.read_excel(path + "predictions.xlsx", sheet_name="data", skiprows=32, usecols=relevant_cols)
predictions = predictions.rename(columns=prediction_col_names)

# Keep only the digits of perm_id. regex=True is explicit because pandas >= 2.0 treats the
# pattern as a literal string by default, which would silently leave perm_id unchanged.
predictions['perm_id'] = predictions['perm_id'].str.replace(r"[^0-9]", '', regex=True)
predictions["filing_date"] = pd.to_datetime(predictions["filing_date"])
predictions["filing_year"] = pd.DatetimeIndex(predictions["filing_date"]).year

# Note: filing_date's year and the "year" column can differ
# (e.g. ticker AA has year 2016 with a 2017 filing).
predictions

Unnamed: 0,perm_id,cik,ticker,year,filing_date,company_name,is_dividend_payer,dps_change,is_dps_cut,z_environmental,d_environmental,sector,filing_year
0,229,1090872,A,2012,2012-12-20,AGILENT TECHNOLOGIES INC,1.0,0.421053,0.0,2.449925,-1.408006,Health Care,2012
1,229,1090872,A,2013,2013-12-19,AGILENT TECHNOLOGIES INC,1.0,0.137652,0.0,1.666993,1.117957,Health Care,2013
2,229,1090872,A,2014,2014-12-22,AGILENT TECHNOLOGIES INC,1.0,-0.275862,1.0,2.154238,-0.193752,Health Care,2014
3,229,1090872,A,2015,2015-12-21,AGILENT TECHNOLOGIES INC,1.0,0.139535,0.0,2.069083,0.039619,Health Care,2015
4,229,1090872,A,2016,2016-12-20,AGILENT TECHNOLOGIES INC,1.0,0.137652,0.0,2.087030,-0.112717,Health Care,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27030,3286,1011006,YHOO,2015,2016-02-29,ALTABA INC,0.0,,,-0.180481,-1.653875,Consumer,2016
27031,3286,1011006,YHOO,2016,2017-03-01,ALTABA INC,0.0,,,-0.836055,,Consumer,2017
27032,2000,887568,ZOLL,2010,2010-12-17,ZOLL MEDICAL CORP,0.0,,,,,Health Care,2010
27033,2000,887568,ZOLL,2011,2011-11-23,ZOLL MEDICAL CORP,0.0,,,,,Health Care,2011


# Merge Data, Write to disk

In [None]:
# Row counts of both merge inputs, stored as floats for the percentage maths in later cells.
num_pred = float(predictions.shape[0])
num_text = float(text_dfs.shape[0])

In [9]:
# Inner join of filings and predictions on the exact (cik, filing_date) pair.
result = text_dfs.merge(predictions, on=["cik", "filing_date"])

# Difference between each input's row count and the merged row count.
num_result = float(result.shape[0])
num_text_lost, num_pred_lost = num_text - num_result, num_pred - num_result

# Merged rows whose two sources disagree on the ticker.
ticker_mismatch = result["ticker_x"].ne(result["ticker_y"])
n_ticker_mismatch = ticker_mismatch.sum()

display("# and % of 10-K filings lost: ({:n},{:.0%})".format(num_text_lost, num_text_lost / num_text))
display("# and % of stock events lost: ({:n},{:.0%})".format(num_pred_lost, num_pred_lost / num_pred))
display("# and % of ticker mismatches: ({:n},{:.0%})".format(n_ticker_mismatch, float(n_ticker_mismatch) / num_result))

'# and % of 10-K filings lost: (11316,31%)'

'# and % of stock events lost: (2000,7%)'

'# and % of ticker mismatches: (598,2%)'

In [10]:
# Inner join of filings and predictions on (cik, filing_year).
result = text_dfs.merge(predictions, on=["cik", "filing_year"])

# Difference between each input's row count and the merged row count.
num_result = float(result.shape[0])
num_text_lost, num_pred_lost = num_text - num_result, num_pred - num_result

# Row-wise disagreement between the two sources, plus their intersection and union.
ticker_mismatch = result["ticker_x"].ne(result["ticker_y"])
filing_date_mismatch = result["filing_date_x"].ne(result["filing_date_y"])
ticker_and_filing_mismatch = ticker_mismatch & filing_date_mismatch
ticker_or_filing_mismatch = ticker_mismatch | filing_date_mismatch

n_ticker = ticker_mismatch.sum()
n_filing = filing_date_mismatch.sum()
n_both = ticker_and_filing_mismatch.sum()
n_either = ticker_or_filing_mismatch.sum()

display("# and % of 10-K filings lost: ({:n},{:.0%})".format(num_text_lost, num_text_lost / num_text))
display("# and % of stock events lost: ({:n},{:.0%})".format(num_pred_lost, num_pred_lost / num_pred))
display("# and % of ticker mismatches: ({:n},{:.0%})".format(n_ticker, float(n_ticker) / num_result))
display("# and % of filing date mismatches: ({:n},{:.0%})".format(n_filing, float(n_filing) / num_result))
display("# and % of ticker and filing date mismatches: ({:n},{:.0%})".format(n_both, float(n_both) / num_result))
display("# and % of ticker or filing date mismatches: ({:n},{:.0%})".format(n_either, float(n_either) / num_result))

'# and % of 10-K filings lost: (11289,31%)'

'# and % of stock events lost: (1973,7%)'

'# and % of ticker mismatches: (599,2%)'

'# and % of filing date mismatches: (27,0%)'

'# and % of ticker and filing date mismatches: (1,0%)'

'# and % of ticker or filing date mismatches: (625,2%)'

In [11]:
# Inner join matching the filings' (cik, year) against the predictions' (cik, filing_year).
result = text_dfs.merge(predictions, left_on=["cik", "year"], right_on=["cik", "filing_year"])

# Difference between each input's row count and the merged row count.
num_result = float(result.shape[0])
num_text_lost, num_pred_lost = num_text - num_result, num_pred - num_result

# Row-wise disagreement between the two sources, plus their intersection and union.
ticker_mismatch = result["ticker_x"].ne(result["ticker_y"])
filing_date_mismatch = result["filing_date_x"].ne(result["filing_date_y"])
ticker_and_filing_mismatch = ticker_mismatch & filing_date_mismatch
ticker_or_filing_mismatch = ticker_mismatch | filing_date_mismatch

n_ticker = ticker_mismatch.sum()
n_filing = filing_date_mismatch.sum()
n_both = ticker_and_filing_mismatch.sum()
n_either = ticker_or_filing_mismatch.sum()

display("# and % of 10-K filings lost: ({:n},{:.0%})".format(num_text_lost, num_text_lost / num_text))
display("# and % of stock events lost: ({:n},{:.0%})".format(num_pred_lost, num_pred_lost / num_pred))
display("# and % of ticker mismatches: ({:n},{:.0%})".format(n_ticker, float(n_ticker) / num_result))
display("# and % of filing date mismatches: ({:n},{:.0%})".format(n_filing, float(n_filing) / num_result))
display("# and % of ticker and filing date mismatches: ({:n},{:.0%})".format(n_both, float(n_both) / num_result))
display("# and % of ticker or filing date mismatches: ({:n},{:.0%})".format(n_either, float(n_either) / num_result))

'# and % of 10-K filings lost: (11289,31%)'

'# and % of stock events lost: (1973,7%)'

'# and % of ticker mismatches: (599,2%)'

'# and % of filing date mismatches: (27,0%)'

'# and % of ticker and filing date mismatches: (1,0%)'

'# and % of ticker or filing date mismatches: (625,2%)'

In [12]:
# Inner join matching the filings' (cik, filing_year) against the predictions' (cik, year).
result = text_dfs.merge(predictions, left_on=["cik", "filing_year"], right_on=["cik", "year"])

# Difference between each input's row count and the merged row count.
num_result = float(result.shape[0])
num_text_lost, num_pred_lost = num_text - num_result, num_pred - num_result

# Row-wise disagreement between the two sources, plus their intersection and union.
ticker_mismatch = result["ticker_x"].ne(result["ticker_y"])
filing_date_mismatch = result["filing_date_x"].ne(result["filing_date_y"])
ticker_and_filing_mismatch = ticker_mismatch & filing_date_mismatch
ticker_or_filing_mismatch = ticker_mismatch | filing_date_mismatch

n_ticker = ticker_mismatch.sum()
n_filing = filing_date_mismatch.sum()
n_both = ticker_and_filing_mismatch.sum()
n_either = ticker_or_filing_mismatch.sum()

display("# and % of 10-K filings lost: ({:n},{:.0%})".format(num_text_lost, num_text_lost / num_text))
display("# and % of stock events lost: ({:n},{:.0%})".format(num_pred_lost, num_pred_lost / num_pred))
display("# and % of ticker mismatches: ({:n},{:.0%})".format(n_ticker, float(n_ticker) / num_result))
display("# and % of filing date mismatches: ({:n},{:.0%})".format(n_filing, float(n_filing) / num_result))
display("# and % of ticker and filing date mismatches: ({:n},{:.0%})".format(n_both, float(n_both) / num_result))
display("# and % of ticker or filing date mismatches: ({:n},{:.0%})".format(n_either, float(n_either) / num_result))

'# and % of 10-K filings lost: (10847,30%)'

'# and % of stock events lost: (1531,6%)'

'# and % of ticker mismatches: (550,2%)'

'# and % of filing date mismatches: (19766,78%)'

'# and % of ticker and filing date mismatches: (412,2%)'

'# and % of ticker or filing date mismatches: (19904,78%)'

In [13]:
""" Relevant statistics post merge """
# Inner join of filings and predictions on (cik, year); later cells reuse this `result`.
result = text_dfs.merge(predictions, on=["cik", "year"])

# Difference between each input's row count and the merged row count.
num_result = float(result.shape[0])
num_text_lost, num_pred_lost = num_text - num_result, num_pred - num_result

# Row-wise disagreement between the two sources, plus their intersection and union.
ticker_mismatch = result["ticker_x"].ne(result["ticker_y"])
filing_date_mismatch = result["filing_date_x"].ne(result["filing_date_y"])
ticker_and_filing_mismatch = ticker_mismatch & filing_date_mismatch
ticker_or_filing_mismatch = ticker_mismatch | filing_date_mismatch

n_ticker = ticker_mismatch.sum()
n_filing = filing_date_mismatch.sum()
n_both = ticker_and_filing_mismatch.sum()
n_either = ticker_or_filing_mismatch.sum()

display("# and % of 10-K filings lost: ({:n},{:.0%})".format(num_text_lost, num_text_lost / num_text))
display("# and % of stock events lost: ({:n},{:.0%})".format(num_pred_lost, num_pred_lost / num_pred))
display("# and % of ticker mismatches: ({:n},{:.0%})".format(n_ticker, float(n_ticker) / num_result))
display("# and % of filing date mismatches: ({:n},{:.0%})".format(n_filing, float(n_filing) / num_result))
display("# and % of ticker and filing date mismatches: ({:n},{:.0%})".format(n_both, float(n_both) / num_result))
display("# and % of ticker or filing date mismatches: ({:n},{:.0%})".format(n_either, float(n_either) / num_result))

'# and % of 10-K filings lost: (10847,30%)'

'# and % of stock events lost: (1531,6%)'

'# and % of ticker mismatches: (550,2%)'

'# and % of filing date mismatches: (19766,78%)'

'# and % of ticker and filing date mismatches: (412,2%)'

'# and % of ticker or filing date mismatches: (19904,78%)'

In [None]:
# We can conduct analysis by filtering by either cik (by company), or by year
# NOTE(review): "cik" itself is in the drop list below, so after this cell per-company
# filtering has to rely on another column (perm_id is not dropped here) - confirm intended.
#
# Reassigning with errors="ignore" (instead of an inplace drop) keeps this cell re-runnable:
# a second execution no longer raises KeyError for columns that are already gone.
result = result.drop(
    columns=["ticker_x", "filing_date_x", "ticker_y", "filing_date_y", "cik"],
    errors="ignore",
)
result.head(5)

In [None]:
# pandas DataFrames have no write_csv method (the original call raised AttributeError);
# to_csv is the writer. index=False keeps the meaningless 0..n-1 row index out of the file.
result.to_csv("processed_data.csv", index=False)

In [None]:
Add 2009-2012 10-