# U.S. Stock Market I: Matching SEC Data to Trading Days and Ticker Symbols

In [None]:
from tiingo import TiingoClient                       
tiingo = TiingoClient({'api_key':'XXXX'})

In [None]:
import pandas as pd
import numpy as np
import requests, zipfile, io
import os
from pathlib import Path

In [None]:
def get_items_from_SEC_files(tags, filename=None):
    """Extract, for each tag, one shortest- and one longest-period value per filing.

    Reads the merged SEC files in 'data/sec/merged/' (or only `filename` if
    given; hidden files are skipped). For every tag and every (cik, filed)
    pair it keeps the entry with the most recent `ddate`, once for the
    smallest and once for the largest number of quarters (`qtrs`).

    Parameters
    ----------
    tags : list of str
        Values of the `tag` column to extract.
    filename : str, optional
        Single file inside the merged directory; if omitted, all files are read.

    Returns
    -------
    dict
        Maps each tag to a DataFrame indexed by (cik, filed) with columns
        'value_shortest', 'qtrs_shortest', 'value_longest', 'qtrs_longest',
        sorted by filing date. Tags without any data map to an empty DataFrame.
    """
    directory = 'data/sec/merged/'                                # Read data from here.
    filenames = [filename] if filename else os.listdir(directory) # Supplied filename or all files in "merged" directory.
    filenames = [f for f in filenames if not f.startswith(".")]   # Exclude hidden files from file list.

    # Collect the per-file tables and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the data on every call.
    pieces = {t: [] for t in tags}
    sort_columns = ['cik', 'filed', 'ddate', 'qtrs']

    for name in filenames:                                        # Loop over all files.
        print(name)
        data = pd.read_csv(os.path.join(directory, name), parse_dates=['filed', 'ddate'])

        for t in tags:                                            # Loop over all tags.
            item = data[data.tag == t]                            # Select all data for this tag.
            if item.empty:
                continue                                          # Nothing reported for this tag in this file.

            # qtrs descending -> last row per filing has the smallest qtrs.
            short = item.sort_values(sort_columns, ascending=[True, True, True, False])
            # qtrs ascending  -> last row per filing has the largest qtrs.
            long = item.sort_values(sort_columns, ascending=[True, True, True, True])

            # One value for each firm and filing (groupby.last skips missing values per column).
            short = short.groupby(['cik', 'filed']).last()[['value', 'qtrs']]
            long = long.groupby(['cik', 'filed']).last()[['value', 'qtrs']]

            # Put shortest and longest next to each other.
            pieces[t].append(short.join(long, lsuffix='_shortest', rsuffix='_longest'))

    results = {}
    for t in tags:                                                # Build one table per tag, sorted by filing date.
        if pieces[t]:
            results[t] = pd.concat(pieces[t]).sort_index(level='filed')
        else:
            results[t] = pd.DataFrame()

    return results



def calculate_quarterly_annual_values(item):
    """Convert reported values into single-quarter and trailing four-quarter values.

    Parameters
    ----------
    item : pandas.DataFrame
        Table indexed by (cik, filed) with columns 'value_shortest',
        'qtrs_shortest', 'value_longest' and 'qtrs_longest'.

    Returns
    -------
    pandas.DataFrame
        Indexed by (cik, filed) with one row per filing date listed in
        'data/sec/dates/filing_dates.csv' for each firm in `item`:
        'valueQ' is the value of the single most recent quarter and 'valueA'
        the value covering four quarters. Entries are NaN where the earlier
        quarters needed for the calculation are not available.
    """
    all_firms        = item.index.get_level_values('cik').unique()  # All CIKs.
    all_filing_dates = pd.read_csv('data/sec/dates/filing_dates.csv', index_col='cik', parse_dates=['filed'])

    frames = []                                                     # One table per firm; concatenated at the end.

    for cik in all_firms:                                           # Loop over all firms.
        filing_dates = pd.Series(all_filing_dates.filed[cik])       # All filing dates for this firm.
        firm = item.loc[cik]                                        # This firm's rows, indexed by filing date.

        # Quarterly values.
        # Cast to float so that NaN can be stored even if the reported values are integers.
        valuesQ = firm.value_shortest.reindex(filing_dates).astype(float)  # Values with shortest reported quarters.
        qtrsQ   = firm.qtrs_shortest.astype(int)                    # Number of quarters for each value.
        for date, q in qtrsQ[qtrsQ > 1].items():                    # Loop over all dates with > 1 quarters.
            q = int(q)
            previous_values = valuesQ.loc[:date].iloc[-q:-1]        # Example: for q=3 we need to subtract 2 previous quarters.
            if len(previous_values) == q - 1:                       # If all previous values available.
                valuesQ.loc[date] = valuesQ.loc[date] - previous_values.sum(skipna=False)
            else:
                valuesQ.loc[date] = np.nan

        # Annual values.
        valuesA = firm.value_longest.reindex(filing_dates).astype(float)   # Values with longest reported quarters.
        qtrsA   = firm.qtrs_longest.astype(int)                     # Number of quarters for each value.
        for date, q in qtrsA[qtrsA < 4].items():                    # Loop over all dates with < 4 quarters.
            q = int(q)
            previous_values = valuesQ.loc[:date].iloc[-4:-q]        # Example: for q=2 we need to add quarters -3 and -4.
            if len(previous_values) == 4 - q:                       # If all previous data available.
                valuesA.loc[date] = valuesA.loc[date] + previous_values.sum(skipna=False)
            else:
                valuesA.loc[date] = np.nan

        frames.append(pd.DataFrame({'cik':    cik,
                                    'filed':  filing_dates.values,
                                    'valueQ': valuesQ.values,
                                    'valueA': valuesA.values}))

    if not frames:                                                  # No firms: return an empty, correctly shaped table.
        empty_index = pd.MultiIndex.from_arrays([[], []], names=['cik', 'filed'])
        return pd.DataFrame(columns=['valueQ', 'valueA'], index=empty_index)

    # DataFrame.append was removed in pandas 2.0; build the result with a single concat.
    return pd.concat(frames).set_index(['cik', 'filed'])            # Table with columns 'valueQ' and 'valueA'.

Get R&D values:

In [None]:
tags  = ['ResearchAndDevelopmentExpense'] 
items = get_items_from_SEC_files(tags)
item  = items[tags[0]]

rnd = calculate_quarterly_annual_values(item)
rnd

Save this table:

In [None]:
rnd.to_csv('data/sec/items/RnD.csv')

And now we can read the file like this:

In [None]:
rnd = pd.read_csv('data/sec/items/RnD.csv',parse_dates=['filed'], index_col=['cik','filed'])
rnd

Unstack the quarterly table (put cik as column):

In [None]:
rndQ = 
rndQ

Get data for SPY:

In [None]:
tiingo.get_dataframe('', '2009-04-1')

Use these dates as "trading days":

In [None]:
trading_days = 
trading_days

Combine the trading days and the dates from the R&D table:

In [None]:
trading_days.union(rndQ.index)

Add these dates to the R&D table:

In [None]:
rndQ = rndQ.reindex( trading_days.union(rndQ.index) ).sort_index()
rndQ

Get all filing dates (we saved this file previously):

In [None]:
filing_dates = pd.read_csv('data/sec/dates/filing_dates.csv', index_col='cik', parse_dates=['filed']).filed
filing_dates

Last filing date for our data:

In [None]:
last_filing_date_all_firms = 
last_filing_date_all_firms

When was the last filing date for Red Hat?

In [None]:
symbols = pd.read_json('https://www.sec.gov/files/company_tickers.json').transpose().set_index('cik_str')

In [None]:
cik = symbols[symbols.ticker==''].index[0]    # Red Hat

last_filing_date =  
last_filing_date

How many days since Red Hat last filed?

In [None]:
days_since_last_filed = 
days_since_last_filed

Assumption: if a firm filed within the last 120 days, then the firm is still active:

In [None]:
last_date_this_firm = trading_days[-1] if days_since_last_filed < 120 else last_filing_date
last_date_this_firm

Forward fill R&D values until last date:

In [None]:
rndQ.loc[:last_date_this_firm, cik]

Last filing date for Microsoft:

In [None]:
cik = symbols[symbols.ticker=='MSFT'].index[0]

last_filing_date = filing_dates[cik].iloc[-1] 
last_filing_date

How many days since Microsoft last filed?

In [None]:
days_since_last_filed = (last_filing_date_all_firms - last_filing_date).days
days_since_last_filed

Last date for Microsoft:

In [None]:
last_date_this_firm = trading_days[-1] if days_since_last_filed < 120 else last_filing_date
last_date_this_firm

Forward fill R&D values:

In [None]:
rndQ.loc[:last_date_this_firm, cik].ffill()#.plot()

Put this into a function:

In [None]:
def ffill_values(item, dates):
    """Spread filing-date values over `dates` by forward filling per firm.

    Parameters
    ----------
    item : pandas.Series
        Values indexed by (cik, filed); unstacked so that each CIK becomes a column.
    dates : pandas.DatetimeIndex
        Dates (e.g. trading days) for which values are returned.

    Returns
    -------
    pandas.DataFrame
        One row per entry of `dates`, one column per CIK. Each firm's values
        are carried forward up to the last entry of `dates` if the firm filed
        within 120 days of the most recent filing of any firm, otherwise only
        up to the firm's own last filing date.
    """
    data = item.unstack('cik')
    data = data.reindex(dates.union(data.index)).sort_index()           # Add specified dates to index.
    filing_dates = pd.read_csv('data/sec/dates/filing_dates.csv', index_col='cik', parse_dates=['filed']).filed
    last_filing_date_all_firms = filing_dates.max()                     # Most recent date where at least 1 firm filed.

    for cik in data.columns:                                            # Loop over all firms.
        # pd.Series(...) handles firms with a single filing (scalar lookup result).
        last_filing_date      = pd.Series(filing_dates[cik]).max()      # Last date where this firm filed.
        days_since_last_filed = (last_filing_date_all_firms - last_filing_date).days
        last_date_this_firm   = dates[-1] if days_since_last_filed < 120 else last_filing_date
        # Assign the filled values back explicitly: an in-place ffill on the result of
        # .loc acts on a temporary object and does not reliably modify `data`.
        data.loc[:last_date_this_firm, cik] = data.loc[:last_date_this_firm, cik].ffill()

    return data.loc[dates]                                              # Return only specified dates.

Use function like this:

In [None]:
rndQ = ffill_values(  )
rndQ

Total R&D for U.S. stock market:

#### Match CIKs to ticker symbols

Ticker symbol file from SEC:

In [None]:
sec = pd.read_json('https://www.sec.gov/files/company_tickers.json').transpose()
sec = sec.rename(columns={'cik_str':'cik'})
sec

Get Google CIK:

In [None]:
sec[sec.ticker=='GOOG']

Get all rows for this CIK:

Get ticker symbol file from tiingo:

In [None]:
r = requests.get('https://apimedia.tiingo.com/docs/tiingo/daily/supported_tickers.zip')

z = zipfile.ZipFile(io.BytesIO(r.content))

z.namelist() 

Open this file:

In [None]:
tngo = pd.read_csv(z.open('supported_tickers.csv'))
tngo

Merge the SEC and the tiingo table:

In [None]:
# Outer join on the ticker symbol: rows whose ticker appears in only one of the
# two tables are kept, and the columns coming from the other table are missing for them.
all_shares = sec.merge(tngo, on='ticker', how='outer')
all_shares

Check SPY:

In [None]:
all_shares[all_shares.ticker=='']

Exclude ETFs:

In [None]:
all_shares = 

Check Alphabet again:

In [None]:
all_shares[all_shares.cik==1652044]  # CIK Alphabet

Let's select the first share for each firm:

In [None]:
# Sort by CIK and start date, then keep one row per CIK and use the CIK as index.
# NOTE: groupby(...).first() takes the first non-missing entry of each column within
# a group, so the columns of a resulting row can stem from different share classes.
symbols = all_shares.sort_values(['cik','startDate']).groupby('cik',as_index=False).first().set_index('cik')
symbols

Check Alphabet:

In [None]:
symbols.loc[[1652044]]

Save the symbols table:

In [1]:
Path('data/ticker_symbols/').mkdir(parents=True, exist_ok=True)  # Generate the folder:

symbols.to_csv('data/ticker_symbols/symbols.csv')

What are the top 10 firms with the highest most recent R&D?

Put this into a DataFrame:

In [None]:
top_10 = 
top_10

Which firms are these?