Imports

In [2]:
import pandas as pd
import re
import requests
import time
import pickle

Helper Functions

In [3]:
def get_request(CIK):
    header = {"User-agent":"nchasse@umich.edu"}
    url = 'https://data.sec.gov/api/xbrl/companyfacts/CIK' + CIK +'.json'
    r = requests.get(url,headers=header)
    data_sec = r.json()
    return data_sec

In [4]:
def get_measures(sec):
    return sec['facts']['us-gaap'].keys()

In [5]:
def get_measure_keys(measures,request):
    measure_keys = {}
    for measure in measures:
        measure_keys[measure] = list(request['facts']['us-gaap'][measure]['units'])[0]
    return measure_keys

In [63]:
def get_data(measure,measure_units,request,symbol):
    quarters = []
    for dic in request['facts']['us-gaap'][measure]['units'][measure_units[measure]]:
        if ('frame' in dic.keys()):
            new_dic = {}
            new_dic['sym'] = symbol
            new_dic['end'] = dic['end']
            new_dic['frame'] = dic['frame']
            new_dic['value'] = dic['val']
            new_dic['measure'] = measure
            if re.match(r"^CY\d{4}$",dic['frame']):
                new_dic['type'] = 'Year'
            else:
                new_dic['type'] = 'Quarter'
            quarters.append(new_dic)
    df = pd.DataFrame.from_dict(quarters)
    return df

In [65]:
def make_dataframe(measures,measure_units,request,symbol):
    df_final = pd.DataFrame()
    for measure in measures:
        df = get_data(measure,measure_units,request,symbol)
        if len(df_final) == 0:
            df_final = df
        else:
            df_final = pd.concat([df_final, df],join='outer')
    df_final = df_final.sort_values(by='end')
    return df_final

Apple Example

In [66]:
apple_cik = '0000320193'
request = get_request(apple_cik)
apple_measures = get_measures(request)
apple_measure_keys = get_measure_keys(apple_measures,request)
apple_dataframe = make_dataframe(apple_measures,apple_measure_keys,request,'AAPL')

In [67]:
apple_dataframe.to_pickle('apple_dataframe.pkl')

Getting Data from All Stocks Using CIK Identifier

In [87]:
#get all CIKS
tickers = pd.read_pickle(r'C:\Users\nchas\Downloads\top500_data_nona.pkl')
data = pd.read_csv("SP500 - Sheet1.csv")
cols = list(tickers.columns)
ticks = set()
for col in cols:
    ticks.add(col[1])
CIKs = []
for sym in data['Symbol']:
    if sym in ticks:
        CIKs.append((int(data['CIK'][data['Symbol'] == sym]),sym))

  CIKs.append((int(data['CIK'][data['Symbol'] == sym]),sym))


In [101]:
smaller_file = round(len(CIKs)/3)
range_1 = smaller_file
range_2 = smaller_file*2
range_3 = smaller_file*3

In [102]:
#get data from EDGAR and make one DataFrame part 1/3
df_merged = pd.DataFrame()
for cik in CIKs[:range_1]:
    final_cik = str(cik[0]).zfill(10)
    request = get_request(final_cik)
    measures = get_measures(request)
    measure_keys = get_measure_keys(measures,request)
    df = make_dataframe(measures,measure_keys,request,cik[1])
    if len(df_merged) == 0:
        df_merged = df
    else:
        df_merged = pd.concat([df,df_merged],join='outer')
    #ensure compliance with SEC limit of 10 requests per second
    time.sleep(0.1)
df_merged.dropna(inplace=True)
df_merged.to_feather('stock_edgar_1_of_3.feather')

In [103]:
#get data from EDGAR and make one DataFrame part 2/3
df_merged = pd.DataFrame()
for cik in CIKs[range_1:range_2]:
    final_cik = str(cik[0]).zfill(10)
    request = get_request(final_cik)
    measures = get_measures(request)
    measure_keys = get_measure_keys(measures,request)
    df = make_dataframe(measures,measure_keys,request,cik[1])
    if len(df_merged) == 0:
        df_merged = df
    else:
        df_merged = pd.concat([df,df_merged],join='outer')
    #ensure compliance with SEC limit of 10 requests per second
    time.sleep(0.1)
df_merged.dropna(inplace=True)
df_merged.to_feather('stock_edgar_2_of_3.feather')

In [104]:
#get data from EDGAR and make one DataFrame part 3/3
df_merged = pd.DataFrame()
for cik in CIKs[range_2:range_3+1]:
    final_cik = str(cik[0]).zfill(10)
    request = get_request(final_cik)
    measures = get_measures(request)
    measure_keys = get_measure_keys(measures,request)
    df = make_dataframe(measures,measure_keys,request,cik[1])
    if len(df_merged) == 0:
        df_merged = df
    else:
        df_merged = pd.concat([df,df_merged],join='outer')
    #ensure compliance with SEC limit of 10 requests per second
    time.sleep(0.1)
df_merged.dropna(inplace=True)
df_merged.to_feather('stock_edgar_3_of_3.feather')