# U.S. Stock Market III: Industries

In [None]:
import pandas as pd
import numpy as np
import requests, zipfile, io
import os
from pathlib import Path

from tiingo import TiingoClient
# Read the Tiingo API key from the TIINGO_API_KEY environment variable so the secret
# is not stored in the notebook; falls back to the original placeholder when unset.
tiingo = TiingoClient({'api_key': os.environ.get('TIINGO_API_KEY', 'XXXX')})

import matplotlib.pyplot as plt                        # Basic plot library.
plt.style.use('ggplot')                                # Make plots look nice.

In [None]:
def ffill_values(item, dates, filing_dates=None, max_gap_days=120):
    """Forward-fill filed values onto the requested dates, one column per firm.

    Parameters
    ----------
    item : pd.Series
        Values whose index contains a 'cik' level; the remaining level is used
        as the row index after unstacking (the filing date in this notebook).
    dates : pd.DatetimeIndex
        Dates to return. ``dates[-1]`` is used as the fill horizon for firms
        that filed recently.
    filing_dates : pd.Series, optional
        Filing dates indexed by cik (one row per filing). When omitted it is
        read from 'data/sec/dates/filing_dates.csv', as before.
    max_gap_days : int, default 120
        A firm whose last filing lies this many days or more before the most
        recent filing of any firm is only filled up to its own last filing.

    Returns
    -------
    pd.DataFrame
        Rows are ``dates``, columns are ciks.
    """
    data = item.unstack('cik')
    data = data.reindex(dates.union(data.index)).sort_index()           # Add specified dates to index.

    if len(dates) == 0:                                                 # Nothing requested: avoid dates[-1] below.
        return data.loc[dates]

    if filing_dates is None:
        filing_dates = pd.read_csv('data/sec/dates/filing_dates.csv', index_col='cik', parse_dates=['filed']).filed

    last_filing_date_all_firms = filing_dates.max()                     # Most recent date where at least 1 firm filed.
    # Last filing per firm via max, so the result does not depend on the row order of the file.
    last_filing_by_firm = filing_dates.groupby(level=0).max()

    for cik in data.columns:                                            # Loop over all firms.
        last_filing_date      = last_filing_by_firm[cik]                # Last date where this firm filed.
        days_since_last_filed = (last_filing_date_all_firms - last_filing_date).days
        last_date_this_firm   = dates[-1] if days_since_last_filed < max_gap_days else last_filing_date
        # Assign the filled slice back explicitly: calling ffill(inplace=True) on
        # data.loc[...] modifies a temporary object and is not guaranteed to
        # change `data` (it never does under pandas Copy-on-Write).
        data.loc[:last_date_this_firm, cik] = data.loc[:last_date_this_firm, cik].ffill()

    return data.loc[dates]                                              # Return only specified dates.

In [None]:
# Load the SEC line items; every file is read with 'filed' parsed as dates
# and (cik, filed) as the index.
csv_options = dict(parse_dates=['filed'], index_col=['cik','filed'])

sales           = pd.read_csv('data/sec/items/Sales.csv',           **csv_options)
earnings        = pd.read_csv('data/sec/items/Earnings.csv',        **csv_options)
operatingIncome = pd.read_csv('data/sec/items/OperatingIncome.csv', **csv_options)

earnings.head(5)

Forward fill table for annual earnings:

In [None]:
# Use the index of the SPY data returned by Tiingo as the list of trading days
# (tz_convert(None) drops the timezone information from the index).
spy_prices   = tiingo.get_dataframe('SPY','2009-04-15')
trading_days = spy_prices.index.tz_convert(None)

# Forward-filled annual earnings on every trading day.
earnings_filled = ffill_values( earnings.valueA, trading_days )
earningsA       = earnings_filled / 10**9                            # In USD billion

We want to get the industry codes from the merged files.     
Example:

In [None]:
# Load one merged SEC file as an example and preview its first rows.
directory = 'data/sec/merged/'
filename  = '2021_01.csv'
example_path = directory + filename
data = pd.read_csv(example_path, parse_dates=['filed','ddate'])
data.head(3)

In [None]:
def get_attributes_from_SEC_files(attributes, filename=None, directory='data/sec/merged/'):
    """Collect per-filing attribute values from the merged SEC files.

    Parameters
    ----------
    attributes : list of str
        Column names to extract (e.g. ['countryinc', 'sic']).
    filename : str, optional
        Read only this file from `directory`; by default every non-hidden
        file in `directory` is read.
    directory : str, default 'data/sec/merged/'
        Folder containing the merged CSV files. Each file must provide the
        columns 'cik', 'filed', 'ddate' and the requested attributes.

    Returns
    -------
    dict of pd.DataFrame
        One single-column table per attribute, indexed by (cik, filed) and
        sorted by filing date. The value kept for each (cik, filed) pair is the
        first non-null entry found in the file. Tables are empty DataFrames
        when no file was read.
    """
    filenames = [filename] if filename else os.listdir(directory)     # Supplied filename or all files in the directory.
    filenames = [f for f in filenames if not f.startswith(".")]       # Exclude hidden files from file list.

    # Collect the pieces in lists and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the table on every call.
    pieces = {a: [] for a in attributes}

    for current_file in filenames:                                    # Loop over all files.
        print(current_file)
        data = pd.read_csv(os.path.join(directory, current_file), parse_dates=['filed','ddate'])
        by_filing = data.groupby(['cik','filed'])                     # Same grouping for every attribute.

        for a in attributes:                                          # Loop over all attributes.
            pieces[a].append(by_filing[[a]].first())                  # Attribute value for each firm, filing.

    results = {}
    for a in attributes:
        if pieces[a]:
            results[a] = pd.concat(pieces[a]).sort_index(level='filed')   # Sort each attribute table by filing date.
        else:
            results[a] = pd.DataFrame()                               # No input files: return an empty table.

    return results

In [None]:
# Collect the 'countryinc' and 'sic' columns from all merged SEC files;
# the result is a dict holding one table per attribute.
attributes = get_attributes_from_SEC_files(['countryinc','sic'])

In [None]:
# Save data: make sure the target folder exists, then write one CSV per attribute.
attributes_dir = Path('data/sec/attributes/')
attributes_dir.mkdir(parents=True, exist_ok=True)

sic_table        = attributes['sic']
countryinc_table = attributes['countryinc']
sic_table.to_csv('data/sec/attributes/sic.csv')
countryinc_table.to_csv('data/sec/attributes/countryinc.csv')

In [None]:
# Read data
# `xindex_col` is not a read_csv keyword (it raises a TypeError); the index
# columns are passed with `index_col`.
sic = pd.read_csv('data/sec/attributes/sic.csv', parse_dates=['filed'], index_col=['filed','cik'])
sic[:2]

In [None]:
# Forward fill the table:
# `attributes` is a dict of single-column tables, so the 'sic' table is looked up
# by key and its 'sic' column (a Series) is passed on to ffill_values.
sic = ffill_values(attributes['sic']['sic'], trading_days)

In [None]:
# SIC code of every firm on the last row of the forward-filled table.
latest_codes = sic.iloc[-1]
sic_current  = latest_codes.to_frame('sic')
sic_current

In [None]:
# Ticker symbol table; the first CSV column becomes the index.
symbols = pd.read_csv('data/ticker_symbols/symbols.csv', index_col=0)
symbols.head(3)

Top 10 Earnings with title and SIC:

In [None]:
# Ten largest values in the last row of the earnings table, shown as a one-column frame.
latest_earnings = earningsA.iloc[-1]
latest_earnings.nlargest(10).to_frame('Earnings')

The SIC codes are listed in the SIC manual: https://www.osha.gov/data/sic-manual     

How to select a specific SIC:

In [None]:
# Small example table of SIC codes (two columns, three rows).
t = pd.DataFrame(
    [[7372, 8000],
     [6000, 2200],
     [7385, 7372]],
    columns=['A', 'B'],
)
t

Get 7372:

In [None]:
# Select one industry: keep only the cells whose SIC code equals 7372,
# then turn the result into a True/False mask.
codes    = sic
is_match = codes == 7372
industry = codes[is_match].notnull()

# Earnings of the selected firms; all other cells become NaN.
earningsA[industry]

Top 10 earnings in this industry:

Get all SIC codes in the 7300 group (7300–7399):

In [None]:
# Show the example table again.
t

In [None]:
# Drop the last two digits of every code: floor(code / 100).
codes = np.floor(t / 100)
codes

In [None]:
# Keep only the cells equal to 73; every other cell becomes NaN.
codes[codes==73]

In [None]:
# Convert the filtered table into a boolean mask: True where the cell equals 73.
codes[codes==73].notnull()

In [None]:
# Get all SIC codes in the 5800 group: floor(sic / 100) equals 58.
codes    = np.floor(sic / 100)
is_58    = codes == 58
industry = codes[is_58].notnull()