In [1]:
import re
import pandas as pd 
from bs4 import BeautifulSoup
import requests
#import io
from near_regex import NEAR_regex
import math
#from zipfile import ZipFile
import os 
import random

In [2]:
colnames = ['tic', 'url']
#create a dataframe from the csv, and drop the header row
form10K_df = pd.read_csv('inputs/sp500_10K_with_url.csv', names= colnames).drop([0])
form10K_df

Unnamed: 0,tic,url
1,MMM,https://www.sec.gov/Archives/edgar/data/66740/...
2,ABT,https://www.sec.gov/Archives/edgar/data/1800/0...
3,ABBV,https://www.sec.gov/Archives/edgar/data/155115...
4,ABMD,https://www.sec.gov/Archives/edgar/data/815094...
5,ACN,https://www.sec.gov/Archives/edgar/data/146737...
...,...,...
495,YUM,https://www.sec.gov/Archives/edgar/data/104106...
496,ZBRA,https://www.sec.gov/Archives/edgar/data/877212...
497,ZBH,https://www.sec.gov/Archives/edgar/data/113686...
498,ZION,https://www.sec.gov/Archives/edgar/data/109380...


In [3]:
#list of keywords often found in the first paragraph of the risk section, usually urges the reader to read the section fully 
risk_ids = ['below', 'important', 'material', 'immaterial', 'consider', 'considered', 'consideration',
            'affect', 'effect', 'carefully', 'following', 'uncertainties', 'you', 'forth', 'adverse', 'adversely']
#helper method to find all instances of the phrase 'item 1a' in 10Ks, the standarized section where risk is discussed 
def findRiskEvents(cleaned_str):
    key = 'item 1a'
    indicies = []
    current_index = 1
    while current_index != 0:
        #get the occurrence of 'item 1a', start at the first character
        current_index = cleaned_str.find(key, current_index)
        #add the index to the list 
        indicies.append(current_index)
        #go to the next character and search again
        current_index += 1
    #if not found, method returns -1. So if current_index is 0 (-1 + 1) then we have found each occurrence 
    #return the list without the last member (since that's 0)
    return indicies[:-1]

#using the array generated above, find the occurrence of item 1a which heads the risk section 
def findRiskSection(cleaned_str, risk_events):
    risk_begin = 0
    for i in risk_events:
        risk_str = cleaned_str[i:i+1500]
        if any(risk_id in risk_str for risk_id in risk_ids):
            if ('item 1 business' or 'incorporated') not in risk_str:
                #assume that this is the start of the risk section if keyword found, and no disqualifier words and foun
                risk_begin = i
                break
    #if the risk section is never found
    if risk_begin == 0:
        return risk_begin
    risk_end = cleaned_str.find('item 1b', risk_begin)
    return risk_begin, risk_end

In [4]:
#define a general list of words related to risk
risk_words = '(liability|liabilities|risk|risks|headwind|headwinds|concerning)'
#define 6 different risks to find occurrences of in document
supply_chain = '(supply|supply chain|logistical|supplier|suppliers)'
regulatory = '(regulation|regulatory|regulate)'
inflation = '(inflation|inflationary)'
macro = '(macroeconomic|macro economic|trade|trading|import|imports|importing|exchange)'
debt = '(debt|debts|interest payments|principal payments|repay|repay lenders|default|solvent|solvency)'
liquidity = '(liquid|liquidity)'

risk_names = ['supply_chain_risk', 'regulatory_risk', 'inflationary_risk', 'macro_risk', 'debt_risk', 'liquidity_risk', 'doc_type']
risk_names_factors_only = risk_names[:6]

In [5]:
#this cell takes a few minutes to execute, lots of text to parse 

c = 1
last_print = 0
for index, row in form10K_df.iterrows():
    path = 'text_files/' + row['tic'] + '_10K.html'
    with open(path, 'r', encoding = 'utf-8') as f:
        text = f.read() 
    
    lower = BeautifulSoup(text).get_text().lower()
    no_punc = re.sub(r'\W', ' ', lower)
    cleaned = re.sub(r'\s+',' ', no_punc).strip()
  
    risk_inds = findRiskSection(cleaned, findRiskEvents(cleaned))

    #if the risk section was found to not exist, continue to use risk section as the entire document
    if risk_inds == 0:
        risk_section = cleaned
        doctype = '10K'
        max_words = 20
    #define the risk section from the beginning and ending indicies found above if it exists
    else:
        risk_section = cleaned[risk_inds[0]:risk_inds[1]]
        doctype = 'Risk Section'
        max_words = 100
        #print an occasional risk section at random, this should theoretically print a few each time
        r = random.randint(1, 499)
        if r < 7:
            print('\n\n')
            print(row['tic'])
            print(risk_section)
            print('\n\n') 

    #determine the number of occurrences of each risk, and add each to the dataframe
    rgx_list = []
    rgx_list.append(NEAR_regex([risk_words, supply_chain],max_words_between=max_words))
    rgx_list.append(NEAR_regex([risk_words, regulatory],max_words_between=max_words))
    rgx_list.append(NEAR_regex([risk_words, inflation],max_words_between=max_words))
    rgx_list.append(NEAR_regex([risk_words, macro],max_words_between=max_words))
    rgx_list.append(NEAR_regex([risk_words, debt],max_words_between=max_words))
    rgx_list.append(NEAR_regex([risk_words, liquidity],max_words_between=max_words))

    for name, rgx in zip(risk_names_factors_only, rgx_list):
        #int may crash code, untested
        form10K_df.loc[index, name] = (int)(len(re.findall(rgx, risk_section)))

    form10K_df.loc[index, 'doctype'] = doctype

    prog = math.trunc(c / 499 * 100)
    if prog in range(10, 100, 10) and c > last_print + 10:
        print('%d percent of risk factors found.' % prog)
        last_print = c
    c += 1
print('Done!')

10 percent of risk factors found.
20 percent of risk factors found.
30 percent of risk factors found.
40 percent of risk factors found.



IFF
item 1a risk factors we routinely encounter and address risks in conducting our business some of these risks may cause our future results to be different sometimes materially different than we presently anticipate below are material risks we have identified that could adversely affect our business how we react to material future developments as well as how our competitors and customers react to those developments could also affect our future resultswe may not realize the benefits anticipated from the frutarom acquisition which could adversely affect our business part of our growth strategy has included growth through acquisitions within the fragrance flavors natural ingredient industries and adjacencies the frutarom acquisition was our most significant acquisition and brings a significantly different customer base and new adjacent product catego

In [6]:
#lets see how it looks:
form10K_df

Unnamed: 0,tic,url,supply_chain_risk,regulatory_risk,inflationary_risk,macro_risk,debt_risk,liquidity_risk,doctype
1,MMM,https://www.sec.gov/Archives/edgar/data/66740/...,3.0,6.0,1.0,4.0,1.0,0.0,Risk Section
2,ABT,https://www.sec.gov/Archives/edgar/data/1800/0...,4.0,7.0,2.0,5.0,3.0,1.0,Risk Section
3,ABBV,https://www.sec.gov/Archives/edgar/data/155115...,2.0,7.0,2.0,3.0,4.0,0.0,Risk Section
4,ABMD,https://www.sec.gov/Archives/edgar/data/815094...,3.0,11.0,0.0,7.0,1.0,2.0,Risk Section
5,ACN,https://www.sec.gov/Archives/edgar/data/146737...,4.0,10.0,1.0,6.0,2.0,0.0,Risk Section
...,...,...,...,...,...,...,...,...,...
495,YUM,https://www.sec.gov/Archives/edgar/data/104106...,4.0,6.0,2.0,6.0,3.0,2.0,Risk Section
496,ZBRA,https://www.sec.gov/Archives/edgar/data/877212...,8.0,3.0,0.0,7.0,3.0,0.0,Risk Section
497,ZBH,https://www.sec.gov/Archives/edgar/data/113686...,3.0,4.0,0.0,2.0,1.0,0.0,Risk Section
498,ZION,https://www.sec.gov/Archives/edgar/data/109380...,0.0,0.0,0.0,0.0,0.0,0.0,Risk Section


In [8]:
#load this into a csv for further use
os.makedirs('outputs', exist_ok=True)
form10K_df.to_csv('outputs/form10K_risks.csv')

In [9]:
#get the accounting data
url = 'https://github.com/LeDataSciFi/ledatascifi-2021/blob/main/data/2019%20ccm_cleaned.dta?raw=true'
data = pd.read_stata(url)
data.to_csv('outputs/acct_file.csv')
acct_df = pd.read_csv('outputs/acct_file.csv')
#since I have an index column in the csv that's unnamed, it throws it into the dataframe. delete it
if 'Unnamed: 0' in acct_df.columns:
    del acct_df['Unnamed: 0']

In [10]:
#now to merge in the data, use a left merge to ensure all the form10K firms remain in the set 
sp500_accting_plus_textrisks = pd.merge(left=form10K_df, right=acct_df, how='left', on='tic', validate='1:1', indicator=True)
sp500_accting_plus_textrisks

Unnamed: 0,tic,url,supply_chain_risk,regulatory_risk,inflationary_risk,macro_risk,debt_risk,liquidity_risk,doctype,gvkey,...,prof_a,ppe_a,cash_a,xrd_a,dltt_a,invopps_FG09,sales_g,dv_a,short_debt,_merge
0,MMM,https://www.sec.gov/Archives/edgar/data/66740/...,3.0,6.0,1.0,4.0,1.0,0.0,Risk Section,7435.0,...,0.193936,0.228196,0.065407,0.042791,0.408339,2.749554,,0.074252,0.143810,both
1,ABT,https://www.sec.gov/Archives/edgar/data/1800/0...,4.0,7.0,2.0,5.0,3.0,1.0,Risk Section,1078.0,...,0.118653,0.132161,0.060984,0.035942,0.256544,2.520681,,0.033438,0.088120,both
2,ABBV,https://www.sec.gov/Archives/edgar/data/155115...,2.0,7.0,2.0,3.0,4.0,0.0,Risk Section,16101.0,...,0.178107,0.037098,0.448005,0.076216,0.709488,2.211589,,0.071436,0.057566,both
3,ABMD,https://www.sec.gov/Archives/edgar/data/815094...,3.0,11.0,0.0,7.0,1.0,2.0,Risk Section,13619.0,...,0.225749,0.137531,0.466354,0.088683,0.000000,12.164233,,0.000000,,both
4,ACN,https://www.sec.gov/Archives/edgar/data/146737...,4.0,10.0,1.0,6.0,2.0,0.0,Risk Section,143357.0,...,0.232395,0.046699,0.205780,0.026846,0.000545,4.241083,,0.062583,0.282946,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,YUM,https://www.sec.gov/Archives/edgar/data/104106...,4.0,6.0,2.0,6.0,3.0,2.0,Risk Section,65417.0,...,0.341853,0.346396,0.142038,0.000000,1.071959,8.046718,,0.097687,0.044192,both
495,ZBRA,https://www.sec.gov/Archives/edgar/data/877212...,8.0,3.0,0.0,7.0,3.0,0.0,Risk Section,24405.0,...,0.192104,0.077691,0.006368,0.094884,0.250478,3.225952,,0.000000,0.160740,both
496,ZBH,https://www.sec.gov/Archives/edgar/data/113686...,3.0,4.0,0.0,2.0,1.0,0.0,Risk Section,144559.0,...,0.102051,0.095139,0.025078,0.021081,0.281545,1.556915,,0.007983,0.184000,both
497,ZION,https://www.sec.gov/Archives/edgar/data/109380...,0.0,0.0,0.0,0.0,0.0,0.0,Risk Section,,...,,,,,,,,,,left_only


In [11]:
#looks good. save to csv for further use 
sp500_accting_plus_textrisks.to_csv('outputs/sp500_accting_plus_textrisks.csv')