## Get Items from SEC Data

In [1]:
import pandas as pd
import requests, zipfile, io
import os
from pathlib import Path

Get latest file from SEC (https://www.sec.gov/dera/data/financial-statement-and-notes-data-set.html):

In [2]:
def download_file(period):
    """Download one SEC "Financial Statement and Notes" archive and unpack it.

    Fetches ``<period>_notes.zip`` from the SEC DERA site and extracts only
    ``sub.tsv`` and ``num.tsv`` into ``data/sec/downloads/<period>``
    (the folder is created if necessary).

    Parameters
    ----------
    period : str
        Prefix of the archive name on the SEC site, e.g. ``'2021_01'``.

    Returns
    -------
    None
        The outcome is reported with ``print``. If the server does not answer
        with a success status, nothing is written to disk.
    """
    url = 'https://www.sec.gov/files/dera/data/financial-statement-and-notes-data-sets/'+period+'_notes.zip'

    unzip_folder_name = 'data/sec/downloads/' + period       # Where to put contents of unzipped file

    # requests has no default timeout; without one a stalled connection blocks the notebook forever.
    r = requests.get(url, timeout=60)
    if not r.ok:                                             # Any non-success status, not only 404
        print('Download failed (HTTP status', r.status_code, '):', url)
        return

    Path(unzip_folder_name).mkdir(parents=True, exist_ok=True)
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:        # Close the archive when done
        z.extractall(members=['sub.tsv','num.tsv'], path=unzip_folder_name)

    # Report success only after the files have actually been extracted.
    print('Downloaded:', url, 'to:', unzip_folder_name)
        
        
def merge_sec_files(folder):
    """Merge the filings (``sub.tsv``) and numbers (``num.tsv``) of one download.

    Reads both files from ``data/sec/downloads/<folder>``, keeps 10-Q/10-K
    filings with a non-null ``cik`` and number rows whose ``dimh`` equals
    ``'0x00000000'``, inner-joins them on ``adsh``, parses ``filed`` and
    ``ddate`` as ``YYYYMMDD`` dates, drops rows with unparseable dates and
    exact duplicates, and writes the result to ``data/sec/merged/<folder>.csv``.

    Parameters
    ----------
    folder : str
        Name of the sub-folder below ``data/sec/downloads/``, e.g. ``'2021_01'``.

    Returns
    -------
    pandas.DataFrame
        The merged table with columns
        ``cik, sic, countryinc, tag, filed, ddate, qtrs, value``.
    """
    keep_these_columns = ['cik','sic','countryinc','tag','filed','ddate','qtrs','value']

    download_dir = 'data/sec/downloads/'+folder
    merged_dir   = 'data/sec/merged/'

    filings = pd.read_table(download_dir+'/sub.tsv')

    # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0.
    # `on_bad_lines='warn'` is its replacement (skip malformed lines, emit a warning).
    # Fall back to the old keyword so the notebook still runs on pandas < 1.3.
    try:
        numbers = pd.read_table(download_dir+'/num.tsv', encoding='ISO-8859-1', on_bad_lines='warn')
    except TypeError:
        numbers = pd.read_table(download_dir+'/num.tsv', encoding='ISO-8859-1', error_bad_lines=False)

    filings = filings[filings.form.isin(['10-Q','10-K']) & filings.cik.notnull()]
    numbers = numbers[(numbers.dimh=='0x00000000')]                     # keep only non-segment data

    merged = numbers.merge(filings, on='adsh', how='inner')[keep_these_columns]

    # errors='coerce': invalid dates become NaT and are removed below.
    # `.assign` returns a new frame, avoiding chained-assignment warnings on the column subset.
    merged = merged.assign(
        filed=pd.to_datetime(merged['filed'], format='%Y%m%d', errors='coerce'),
        ddate=pd.to_datetime(merged['ddate'], format='%Y%m%d', errors='coerce'),
    )

    merged = merged[merged.filed.notnull() & merged.ddate.notnull()].drop_duplicates()

    Path(merged_dir).mkdir(parents=True, exist_ok=True)                 # to_csv does not create folders
    merged.to_csv(merged_dir+folder+'.csv', index=False)

    return merged

In [3]:
download_file('2021_01')

Downloaded: https://www.sec.gov/files/dera/data/financial-statement-and-notes-data-sets/2021_01_notes.zip to: data/sec/downloads/2021_01


In [4]:
merge_sec_files('2021_01')

Unnamed: 0,cik,sic,countryinc,tag,filed,ddate,qtrs,value
0,1517389,7371.0,US,AccountsPayableAndAccruedLiabilitiesCurrent,2021-01-06,2020-11-30,0,7094.0
1,1517389,7371.0,US,AccountsPayableAndAccruedLiabilitiesCurrent,2021-01-06,2020-02-29,0,8698.0
2,1517389,7371.0,US,AccountsReceivableNetCurrent,2021-01-06,2020-11-30,0,9384.0
3,1517389,7371.0,US,AccountsReceivableNetCurrent,2021-01-06,2020-02-29,0,9402.0
4,1517389,7371.0,US,AdditionalPaidInCapital,2021-01-06,2020-11-30,0,2449733.0
...,...,...,...,...,...,...,...,...
118256,1593812,6221.0,US,RedemptionsCostBasis,2021-01-29,2019-10-31,4,-6713.0
118257,1593812,6221.0,US,RedemptionsCostBasis,2021-01-29,2020-10-31,4,-203243901.0
118258,1593812,6221.0,US,WeightedAverageNumberOfGoldReceipts,2021-01-29,2018-10-31,4,194219.0
118259,1593812,6221.0,US,WeightedAverageNumberOfGoldReceipts,2021-01-29,2019-10-31,4,153340.0


Read example file:

In [5]:
directory = 'data/sec/merged/'
filename  = '2010q2.csv'
data      = pd.read_csv(directory+filename, parse_dates=['filed','ddate'])
data

Unnamed: 0,cik,sic,countryinc,tag,filed,ddate,qtrs,value
0,1063761,6798,US,AccountsAndNotesReceivableNet,2010-05-10,2009-12-31,0,4.027290e+08
1,1063761,6798,US,AccountsAndNotesReceivableNet,2010-05-10,2010-03-31,0,3.554690e+08
2,1063761,6798,US,AccumulatedOtherComprehensiveIncomeLossNetOfTax,2010-05-10,2009-12-31,0,-3.088000e+06
3,1063761,6798,US,AccumulatedOtherComprehensiveIncomeLossNetOfTax,2010-05-10,2010-03-31,0,-2.751700e+07
4,1063761,6798,US,AdditionalPaidInCapital,2010-05-10,2009-12-31,0,7.547959e+09
...,...,...,...,...,...,...,...,...
96107,1032208,4932,US,EntityCommonStockSharesOutstanding,2010-05-04,2010-03-31,0,2.475390e+08
96108,1135971,4931,US,EntityCommonStockSharesOutstanding,2010-05-07,2010-03-31,0,2.232103e+08
96109,934612,4011,,EntityCommonStockSharesOutstanding,2010-05-07,2010-03-31,0,0.000000e+00
96110,934612,4011,,EntityPublicFloat,2010-05-07,2009-06-30,0,2.479400e+10


Get all earnings:

In [6]:
tag  = 'NetIncomeLoss'
item = data[data.tag==tag]
item

Unnamed: 0,cik,sic,countryinc,tag,filed,ddate,qtrs,value
302,899881,6798,US,NetIncomeLoss,2010-05-05,2010-03-31,1,-8.476000e+07
303,899881,6798,US,NetIncomeLoss,2010-05-05,2009-03-31,1,1.851010e+08
543,895648,6798,US,NetIncomeLoss,2010-05-12,2010-03-31,1,5.165600e+07
544,895648,6798,US,NetIncomeLoss,2010-05-12,2009-03-31,1,-3.960820e+08
824,315189,3523,US,NetIncomeLoss,2010-05-28,2009-04-30,2,6.762000e+08
...,...,...,...,...,...,...,...,...
95426,310522,6111,,NetIncomeLoss,2010-05-10,2010-03-31,1,-1.153000e+10
95771,1032033,6141,US,NetIncomeLoss,2010-05-06,2009-03-31,1,-2.138600e+07
95772,1032033,6141,US,NetIncomeLoss,2010-05-06,2010-03-31,1,2.401400e+08
96014,1022646,1311,,NetIncomeLoss,2010-05-05,2010-03-31,1,2.023760e+08


Get SEC file with ticker symbols:

In [7]:
symbols = pd.read_json('https://www.sec.gov/files/company_tickers.json').transpose().set_index('cik_str')
symbols

Unnamed: 0_level_0,ticker,title
cik_str,Unnamed: 1_level_1,Unnamed: 2_level_1
320193,AAPL,Apple Inc.
789019,MSFT,MICROSOFT CORP
1018724,AMZN,AMAZON COM INC
1652044,GOOG,Alphabet Inc.
1293451,TCEHY,Tencent Holdings Ltd
...,...,...
1819516,ASPL-WT,Aspirational Consumer Lifestyle Corp.
1819574,STIC-UN,Northern Star Acquisition Corp.
1819574,STIC-WT,Northern Star Acquisition Corp.
1819584,SNPR-UN,Tortoise Acquisition Corp. II


Find CIKs for Apple and Amazon:

In [8]:
apple = symbols[symbols.ticker=='AAPL'].index[0]  
apple

320193

In [9]:
amazon = symbols[symbols.ticker=='AMZN'].index[0] 
amazon

1018724

Get all reported earnings for these two firms:

In [11]:
# Use the CIKs looked up from the ticker table instead of repeating the literal numbers.
t = item[item.cik.isin([apple, amazon])]
t

Unnamed: 0,cik,sic,countryinc,tag,filed,ddate,qtrs,value
36575,1018724,5961,US,NetIncomeLoss,2010-04-23,2009-03-31,4,679000000.0
36576,1018724,5961,US,NetIncomeLoss,2010-04-23,2010-03-31,4,1024000000.0
36577,1018724,5961,US,NetIncomeLoss,2010-04-23,2009-03-31,1,177000000.0
36578,1018724,5961,US,NetIncomeLoss,2010-04-23,2010-03-31,1,299000000.0
57500,320193,3571,US,NetIncomeLoss,2010-04-21,2010-03-31,2,6452000000.0
57501,320193,3571,US,NetIncomeLoss,2010-04-21,2009-03-31,2,3875000000.0
57502,320193,3571,US,NetIncomeLoss,2010-04-21,2009-03-31,1,1620000000.0
57503,320193,3571,US,NetIncomeLoss,2010-04-21,2010-03-31,1,3074000000.0


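For each company and each filing we want the value for the most recent period (`ddate`), measured over the shortest duration (smallest `qtrs`).  
Step 1: sort so that this row comes last within each filing: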

In [12]:
shortest = t.sort_values(['cik','filed','ddate','qtrs'], ascending=[True,True,True,False])
shortest

Unnamed: 0,cik,sic,countryinc,tag,filed,ddate,qtrs,value
57501,320193,3571,US,NetIncomeLoss,2010-04-21,2009-03-31,2,3875000000.0
57502,320193,3571,US,NetIncomeLoss,2010-04-21,2009-03-31,1,1620000000.0
57500,320193,3571,US,NetIncomeLoss,2010-04-21,2010-03-31,2,6452000000.0
57503,320193,3571,US,NetIncomeLoss,2010-04-21,2010-03-31,1,3074000000.0
36575,1018724,5961,US,NetIncomeLoss,2010-04-23,2009-03-31,4,679000000.0
36577,1018724,5961,US,NetIncomeLoss,2010-04-23,2009-03-31,1,177000000.0
36576,1018724,5961,US,NetIncomeLoss,2010-04-23,2010-03-31,4,1024000000.0
36578,1018724,5961,US,NetIncomeLoss,2010-04-23,2010-03-31,1,299000000.0


Step 2: group (we want 1 observation per filing):

In [13]:
shortest.groupby(['cik','filed']).last()

Unnamed: 0_level_0,Unnamed: 1_level_0,sic,countryinc,tag,ddate,qtrs,value
cik,filed,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
320193,2010-04-21,3571,US,NetIncomeLoss,2010-03-31,1,3074000000.0
1018724,2010-04-23,5961,US,NetIncomeLoss,2010-03-31,1,299000000.0


Same for longest quarters:

In [14]:
longest = t.sort_values(['cik','filed','ddate','qtrs'], ascending=[True,True,True,True])

longest.groupby(['cik','filed']).last()

Unnamed: 0_level_0,Unnamed: 1_level_0,sic,countryinc,tag,ddate,qtrs,value
cik,filed,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
320193,2010-04-21,3571,US,NetIncomeLoss,2010-03-31,2,6452000000.0
1018724,2010-04-23,5961,US,NetIncomeLoss,2010-03-31,4,1024000000.0


Now do this for all firms:

In [17]:
# Sorting by ddate ascending puts the most recent period last within each (cik, filed) group;
# the direction of qtrs then decides which duration ends up in the last row of the group.
shortest = (
    item
    .sort_values(['cik','filed','ddate','qtrs'], ascending=[True,True,True,False])
    .groupby(['cik','filed'])
    .last()                                   # Most recent period, smallest qtrs per firm and filing.
)
longest = (
    item
    .sort_values(['cik','filed','ddate','qtrs'], ascending=True)
    .groupby(['cik','filed'])
    .last()                                   # Most recent period, largest qtrs per firm and filing.
)

shortest

Unnamed: 0_level_0,Unnamed: 1_level_0,sic,countryinc,tag,ddate,qtrs,value
cik,filed,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2969,2010-04-26,2810,US,NetIncomeLoss,2010-03-31,1,252000000.0
3673,2010-05-07,4911,US,NetIncomeLoss,2010-03-31,1,88200000.0
4127,2010-05-11,3674,US,NetIncomeLoss,2010-03-31,1,27744000.0
4281,2010-04-22,3350,US,NetIncomeLoss,2010-03-31,1,-201000000.0
4447,2010-05-07,2911,US,NetIncomeLoss,2010-03-31,1,538000000.0
...,...,...,...,...,...,...,...
1451505,2010-05-05,1381,CH,NetIncomeLoss,2010-03-31,1,677000000.0
1453090,2010-05-03,1381,CH,NetIncomeLoss,2010-03-31,1,-40009000.0
1465112,2010-05-07,4899,US,NetIncomeLoss,2010-03-31,1,558000000.0
1466258,2010-05-07,3822,IE,NetIncomeLoss,2010-03-31,1,1400000.0


Repeat this for multiple quarters:

In [27]:
tag = 'EarningsPerShareBasic'

directory = 'data/sec/merged/'
filenames = ['2020q1.csv','2020q2.csv','2020q3.csv', '2020_10.csv', '2020_11.csv','2020_12.csv','2021_01.csv']

# Collect one piece per file and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copies the whole table on every iteration.
short_frames = []                      # Values measured over shortest duration (smallest qtrs).
long_frames  = []                      # Values measured over longest  duration (largest qtrs).

for filename in filenames:                                # Loop over all files.
    print(filename)
    data = pd.read_csv(directory+filename, parse_dates=['filed','ddate'])  # Read the file.

    item  = data[data.tag==tag]                           # Select all data for this tag.

    short = item.sort_values(['cik','filed','ddate','qtrs'], ascending=[True,True,True,False])
    long  = item.sort_values(['cik','filed','ddate','qtrs'], ascending=[True,True,True,True])
    short = short.groupby(['cik','filed']).last()         # Most recent, shortest for each firm and filing.
    long  = long .groupby(['cik','filed']).last()         # Most recent, longest  for each firm and filing.

    short_frames.append(short[['value','qtrs']])
    long_frames .append(long [['value','qtrs']])

values_short = pd.concat(short_frames)                    # Indexed by (cik, filed).
values_long  = pd.concat(long_frames)

2020q1.csv
2020q2.csv
2020q3.csv
2020_10.csv
2020_11.csv
2020_12.csv
2021_01.csv


Check results: get all values for one company (here Alphabet, ticker GOOG):

In [34]:
cik = symbols[symbols.ticker=='GOOG'].index[0]  

values_short.value.unstack(level = 0)[cik].dropna()

filed
2020-02-04    49.59
2020-04-29     9.96
2020-07-31    10.21
2020-10-30    16.55
Name: 1652044, dtype: float64

Corresponding quarters:

In [29]:
values_short.qtrs.unstack(level = 0)[cik].dropna()

filed
2020-01-29    1.0
2020-05-01    1.0
2020-07-31    1.0
2020-10-30    1.0
2021-01-28    1.0
Name: 320193, dtype: float64

Long values:

In [37]:
values_long.value.unstack(level = 0)[cik].dropna()

filed
2020-02-04    49.59
2020-04-29     9.96
2020-07-31    20.16
2020-10-30    36.69
Name: 1652044, dtype: float64

Quarters:

In [36]:
values_long.qtrs.unstack(level = 0)[cik].dropna()

filed
2020-02-04    4.0
2020-04-29    1.0
2020-07-31    2.0
2020-10-30    3.0
Name: 1652044, dtype: float64

We can also unstack these tables:

Get multiple tags simultaneously:

In [80]:
def _pivot_by_filing_date(frames):
    """Concatenate per-file (cik, filed)-indexed frames into one table indexed by filing date."""
    if not frames:
        return pd.DataFrame()
    combined = pd.concat(frames)
    if combined.empty:
        return combined
    # The same (cik, filed) pair can occur in two overlapping files;
    # unstack raises on duplicate index entries, so keep the first occurrence only.
    combined = combined[~combined.index.duplicated(keep='first')]
    return combined.unstack(level=0).sort_index()             # Rows: filed (sorted); columns: (field, cik).


def get_items(tags, filename=None):                     # Function input: list of tags, optional filename.
    """Collect the values of several tags from the merged SEC files.

    For every file and every tag, one row per ``(cik, filed)`` is selected:
    the most recent ``ddate`` with the smallest ``qtrs`` ("short") and the most
    recent ``ddate`` with the largest ``qtrs`` ("long").

    Parameters
    ----------
    tags : list of str
        Tag names to extract, e.g. ``['NetIncomeLoss']``.
    filename : str, optional
        A single file inside ``data/sec/merged/``. If omitted, every ``.csv``
        file in that directory is read, in sorted name order.

    Returns
    -------
    (dict, dict)
        ``values_short`` and ``values_long``, each mapping tag -> DataFrame.
        A non-empty DataFrame is indexed by ``filed`` (ascending) with
        two-level columns ``('value' | 'qtrs', cik)``. Tags without data map
        to an empty DataFrame.
    """
    directory = 'data/sec/merged/'                            # Read data from here.
    if filename:
        filenames = [filename]                                # Supplied filename ...
    else:                                                     # ... or all CSV files in "merged" directory.
        # Skip non-CSV entries (e.g. checkpoint folders) that read_csv cannot parse.
        filenames = sorted(f for f in os.listdir(directory) if f.lower().endswith('.csv'))

    # Collect pieces in lists and concatenate once:
    # DataFrame.append was removed in pandas 2.0 and re-copies the table on every call.
    short_frames = {t: [] for t in tags}                      # Shortest duration (smallest qtrs).
    long_frames  = {t: [] for t in tags}                      # Longest  duration (largest qtrs).

    for name in filenames:                                    # Loop over all files.
        print(name)
        data = pd.read_csv(directory+name, parse_dates=['filed','ddate'])  # Read the file.

        for t in tags:                                        # Loop over all tags.
            item = data[data.tag==t]                          # Select all data for this tag.
            # NOTE(review): GroupBy.last() takes the last non-null entry per column, so a missing
            # `value` in the final row would be filled from an earlier row - confirm this is intended.
            short = item.sort_values(['cik','filed','ddate','qtrs'], ascending=[True,True,True,False])
            long_ = item.sort_values(['cik','filed','ddate','qtrs'], ascending=[True,True,True,True])
            short = short.groupby(['cik','filed']).last()     # Most recent, shortest for each firm and filing.
            long_ = long_.groupby(['cik','filed']).last()     # Most recent, longest  for each firm and filing.
            short_frames[t].append(short[['value','qtrs']])
            long_frames [t].append(long_[['value','qtrs']])

    values_short = {t: _pivot_by_filing_date(short_frames[t]) for t in tags}
    values_long  = {t: _pivot_by_filing_date(long_frames [t]) for t in tags}

    return values_short, values_long

Run this function like this:

In [None]:
vs, vl = get_items(['ResearchAndDevelopmentExpense','NetIncomeLoss'])

Plot R&D of Apple: