# Cleaning File
Get the basic company information here.

In [1]:
import pickle
import pandas as pd
import matplotlib
import os
import re

import collections
import datetime
import time

import geopandas as gpd

import numpy as np

from difflib import get_close_matches

from fuzzywuzzy import process
from fuzzywuzzy import fuzz

In [2]:
defl            = pd.read_csv("../GDPDEF.csv")
defl['DATE']    = pd.to_datetime(defl['DATE'])
defl['year']    = defl['DATE'].dt.year

# for some reason the years are messed up but they take anything after 1971 to be in the 2000s

defl.loc[(defl.year > 2020), 'year'] = defl.year - 100
defl['qtr'] = defl['DATE'].dt.quarter.astype('float64')
defl.drop(columns = {'DATE','GDPDEF'},inplace = True)
defl.tail()

Unnamed: 0,inflator,year,qtr
291,1.012932,2019,4.0
292,1.009643,2020,1.0
293,1.014,2020,2.0
294,1.004838,2020,3.0
295,1.0,2020,4.0


In [13]:
file = "../../data/companyData/compustat2000s.csv"
compustat = pd.read_csv(file, encoding = 'unicode_escape').drop(columns = {'curncdq'})

# rename to clean this up a little bit
compustat.rename(columns = {'fyearq':'year',
                            'fqtr':  'qtr',
                            'chq':   'cash', 
                            'conm':  'companyName',
                            'atq':   'assets',
                            'niq':   'netIncome',
                            'revtq': 'totalRevenue',
                            'cogsq': 'costGoodsSold',
                            'invtq': 'totalInv',
                            'oibdpq': 'opInc_befDep',
                            'oiadpq': 'opInc_afDep',
                            'prccq':  'priceClose'
                           },
                 inplace = True)


# and merge in the deflators
compustat = compustat.merge(defl)
compustat.head()

Unnamed: 0,gvkey,datadate,year,qtr,indfmt,consol,popsrc,datafmt,tic,companyName,...,opInc_afDep,opInc_befDep,totalRevenue,costat,priceClose,add1,addzip,city,state,inflator
0,1004,19990228,1998,3.0,INDL,C,D,STD,AIR,AAR CORP,...,18.938,23.2,250.984,A,15.125,"One AAR Place, 1100 North Wood Dale Road",60191,Wood Dale,IL,1.51804
1,1082,19990131,1998,3.0,INDL,C,D,STD,SERV.1,SERVIDYNE INC,...,0.795,1.562,27.942,I,6.875,"1945 The Exchange, Suite 325",30339-2029,Atlanta,GA,1.51804
2,1244,19990228,1998,3.0,INDL,C,D,STD,ALCD.,ALCIDE CORP,...,-0.09,-0.074,2.28,I,15.5,8561 154th Avenue North East,98052,Redmond,WA,1.51804
3,1258,19990228,1998,3.0,INDL,C,D,STD,CGYNQ,CAPCO ENERGY INC,...,0.802,0.803,0.809,I,,"1800 West Loop South, Suite 1950",77027,Houston,TX,1.51804
4,1331,19990131,1998,3.0,INDL,C,D,STD,APNI,ALPINE GROUP INC,...,,,326.714,I,13.9375,One Meadowlands Plaza,07073,East Rutherford,NJ,1.51804


In [14]:
compustat.columns

Index(['gvkey', 'datadate', 'year', 'qtr', 'indfmt', 'consol', 'popsrc',
       'datafmt', 'tic', 'companyName', 'curcdq', 'datacqtr', 'datafqtr',
       'assets', 'cash', 'costGoodsSold', 'totalInv', 'netIncome',
       'opInc_afDep', 'opInc_befDep', 'totalRevenue', 'costat', 'priceClose',
       'add1', 'addzip', 'city', 'state', 'inflator'],
      dtype='object')

In [15]:
compustat.columns

sum(compustat.datacqtr != compustat.datafqtr)
compustat.shape

(1005795, 28)

In [16]:
toDeflate = ['assets','costGoodsSold', 'totalInv', 'netIncome', 'totalRevenue',
            'opInc_afDep', 'opInc_befDep','priceClose','cash']


for col in toDeflate:
    compustat[col] = compustat[col]*compustat.inflator

    
    
compustat.columns

Index(['gvkey', 'datadate', 'year', 'qtr', 'indfmt', 'consol', 'popsrc',
       'datafmt', 'tic', 'companyName', 'curcdq', 'datacqtr', 'datafqtr',
       'assets', 'cash', 'costGoodsSold', 'totalInv', 'netIncome',
       'opInc_afDep', 'opInc_befDep', 'totalRevenue', 'costat', 'priceClose',
       'add1', 'addzip', 'city', 'state', 'inflator'],
      dtype='object')

In [17]:
compustat.drop(columns=['indfmt','consol','popsrc','datafmt','tic','datacqtr','datafqtr','inflator'], inplace = True)


compustat.columns

Index(['gvkey', 'datadate', 'year', 'qtr', 'companyName', 'curcdq', 'assets',
       'cash', 'costGoodsSold', 'totalInv', 'netIncome', 'opInc_afDep',
       'opInc_befDep', 'totalRevenue', 'costat', 'priceClose', 'add1',
       'addzip', 'city', 'state'],
      dtype='object')

In [18]:
compustatAddresses = compustat[['year','gvkey','companyName','add1','city','state','addzip']]
compustatAddresses.to_csv("../../data/companyData/compustatAddresses.csv")

In [19]:
compustatLast = compustat.copy()

compustatLast['year'] += 1
compustatLast.rename(columns = {'assets': 'assetsLast',
                                'netIncome':'netIncomeLast',
                                'totalRevenue':'totalRevenueLast',
                                'costGoodsSold':'costGoodsSoldLast',
                                'totalInv':'totalInvLast',
                                'opInc_afDep': 'opInc_afDepLast',
                                'opInc_befDep': 'opInc_befDepLast',
                                'priceClose': 'priceCloseLast',
                                'cash': 'cashLast'
                           }, inplace = True)

# 'assets','costGoodsSold', 'totalInv', 'netIncome', 'totalRevenue',
# 'opInc_afDep', 'opInc_befDep','priceClose','cash'

print(compustatLast.columns)
compustatLast = compustatLast[['gvkey','year','qtr','assetsLast','netIncomeLast',
                               'totalRevenueLast','costGoodsSoldLast','totalInvLast',
                              'opInc_afDepLast','opInc_befDepLast','priceCloseLast','cashLast']]

Index(['gvkey', 'datadate', 'year', 'qtr', 'companyName', 'curcdq',
       'assetsLast', 'cashLast', 'costGoodsSoldLast', 'totalInvLast',
       'netIncomeLast', 'opInc_afDepLast', 'opInc_befDepLast',
       'totalRevenueLast', 'costat', 'priceCloseLast', 'add1', 'addzip',
       'city', 'state'],
      dtype='object')


In [21]:
compustatLast.head()

Unnamed: 0,gvkey,year,qtr,assetsLast,netIncomeLast,totalRevenueLast,costGoodsSoldLast,totalInvLast,opInc_afDepLast,opInc_befDepLast,priceCloseLast,cashLast
0,1004,1999,3.0,1075.103596,15.60242,381.003873,309.704548,419.419406,28.748651,35.218539,22.960362,
1,1082,1999,3.0,168.611793,-1.294889,42.417087,36.31001,17.313252,1.206842,2.371179,10.436528,
2,1244,1999,3.0,27.689058,-2.32412,3.461132,1.621267,4.513134,-0.136624,-0.112335,23.529628,
3,1258,1999,3.0,1.316141,1.217468,1.228095,0.0,0.0,1.217468,1.218987,,
4,1331,1999,3.0,3716.041664,0.593554,495.965079,,677.551564,,,21.157689,


In [22]:
compustatChanges = compustat.merge(compustatLast)
print(compustatChanges.shape)

(891345, 29)


In [23]:
compustatChanges['incomeChange']       = (compustatChanges.netIncome - compustatChanges.netIncomeLast)/compustatChanges.netIncomeLast

compustatChanges['revenueChange']      = (compustatChanges.totalRevenue - compustatChanges.totalRevenueLast)/compustatChanges.totalRevenueLast

compustatChanges['costChange']         = (compustatChanges.costGoodsSold - compustatChanges.costGoodsSoldLast)/compustatChanges.costGoodsSoldLast

compustatChanges['inventoryChange']    = (compustatChanges.totalInv - compustatChanges.totalInvLast)/compustatChanges.totalInvLast

compustatChanges['opInc_afDepChange']  = (compustatChanges.opInc_afDep - compustatChanges.opInc_afDepLast)/compustatChanges.opInc_afDepLast

compustatChanges['opInc_befDepChange'] = (compustatChanges.opInc_befDep - compustatChanges.opInc_befDepLast)/compustatChanges.opInc_befDepLast

compustatChanges['priceCloseChange']   = (compustatChanges.priceClose - compustatChanges.priceCloseLast)/compustatChanges.priceCloseLast

compustatChanges['assetsPrev']         = compustatChanges.assetsLast



print(compustatChanges.head())


'''compustatChanges = compustatChanges[['year', 'qtr', 'gvkey', 'companyName', 
                                     'tic', 'curcdq','salesChange','incomeChange','revenueChange',
                                     'costChange','inventoryChange']]
'''

   gvkey  datadate  year  qtr       companyName curcdq       assets  cash  \
0   1004  20000229  1999  3.0          AAR CORP    USD  1129.147545   NaN   
1   1082  20000131  1999  3.0     SERVIDYNE INC    USD   165.794462   NaN   
2   1244  20000229  1999  3.0       ALCIDE CORP    USD    21.983589   NaN   
3   1258  20000229  1999  3.0  CAPCO ENERGY INC    USD    41.520895   NaN   
4   1331  20000131  1999  3.0  ALPINE GROUP INC    USD          NaN   NaN   

   costGoodsSold    totalInv  ...  priceCloseLast  cashLast  incomeChange  \
0     333.569319  477.771178  ...       22.960362       NaN      0.051819   
1      39.497057   17.258803  ...       10.436528       NaN      2.519227   
2       2.067281    2.405836  ...       23.529628       NaN     -0.962616   
3       3.947309    1.427623  ...             NaN       NaN     -1.141501   
4            NaN         NaN  ...       21.157689       NaN           NaN   

   revenueChange costChange  inventoryChange opInc_afDepChange  \
0       

"compustatChanges = compustatChanges[['year', 'qtr', 'gvkey', 'companyName', \n                                     'tic', 'curcdq','salesChange','incomeChange','revenueChange',\n                                     'costChange','inventoryChange']]\n"

In [24]:
compustatChanges.to_csv("../../data/companyData/compustatChanges_all.csv")

In [None]:
compustatChanges.head()

### Company-level controls

In [None]:
file = "../../data/companyData/compustatControls.csv"
compustatControls = pd.read_csv(file, encoding = 'unicode_escape')
compustatControls.head()

In [None]:
earliestYear = compustatControls.groupby('gvkey')['fyearq'].min().reset_index().\
    rename(columns = {'fyearq': 'earliestYear'})
earliestYear.head()

---------------------------------------

In [None]:
otherControls = compustatControls[['gvkey','fyearq','fqtr','sic','naics','atq','niq']]

In [None]:
defl.rename(columns = {'year': 'fyearq', 'qtr': 'fqtr'}, inplace = True)
defl.head()


otherControls = otherControls.merge(defl)


toDeflate = ['atq','niq']


for col in toDeflate:
    otherControls[col] = otherControls[col]*otherControls.inflator

otherControls.head()

In [None]:
otherControls['roa']  = otherControls['niq']/otherControls['atq']

otherControls.rename(columns = {'niq': 'netIncome',
                               'atq':  'assets'},inplace = True)

Try some industry classifications. First SIC2 and the corresponding groups.

In [None]:
otherControls = otherControls[~(otherControls['sic'] == 'na') & ~(otherControls['sic'].isna())]

otherControls['sic'] = otherControls['sic'].astype('int').astype('str').str.rjust(4,'0')

print(otherControls.sic.max(),otherControls.sic.min())

In [None]:
otherControls['sic2'] = otherControls.sic.str.slice(0,2).astype('int')

otherControls['sic']  = otherControls.sic.astype('int')

In [None]:
otherControls.loc[(otherControls.sic2  < 10 ), 'indGroup']                              = 'agForFish'
otherControls.loc[(otherControls.sic2 >= 10 ) & (otherControls.sic2 < 15 ) , 'indGroup'] = 'mining'
otherControls.loc[(otherControls.sic2 >= 15 ) & (otherControls.sic2 < 18 ) , 'indGroup'] = 'construction'
otherControls.loc[(otherControls.sic2 >= 20 ) & (otherControls.sic2 < 40 ) , 'indGroup'] = 'manu'
otherControls.loc[(otherControls.sic2 >= 40 ) & (otherControls.sic2 < 50 ) , 'indGroup'] = 'transportUtilities'
otherControls.loc[(otherControls.sic2 >= 50 ) & (otherControls.sic2 < 52 ) , 'indGroup'] = 'wholesale'
otherControls.loc[(otherControls.sic2 >= 52 ) & (otherControls.sic2 < 60 ) , 'indGroup'] = 'retail'
otherControls.loc[(otherControls.sic2 >= 60 ) & (otherControls.sic2 < 68 ) , 'indGroup'] = 'finance'
otherControls.loc[(otherControls.sic2 >= 70 ) & (otherControls.sic2 < 90 ) , 'indGroup'] = 'services'
otherControls.loc[(otherControls.sic2 >= 91 ) & (otherControls.sic2 < 98 ) , 'indGroup'] = 'publicAdmin'

In [None]:
otherControls.indGroup.value_counts()

Now try the fama french codes.

In [None]:
otherControls.loc[(otherControls.sic >=100) & (otherControls.sic <= 799), 'famafrench'] = 1
otherControls.loc[(otherControls.sic >=910) & (otherControls.sic <= 919), 'famafrench'] = 1
otherControls.loc[(otherControls.sic == 2048), 'famafrench'] = 1

otherControls.loc[(otherControls.sic >=2000) & (otherControls.sic <= 2046), 'famafrench'] = 2
otherControls.loc[(otherControls.sic >=2050) & (otherControls.sic <= 2063), 'famafrench'] = 2
otherControls.loc[(otherControls.sic >=2070) & (otherControls.sic <= 2079), 'famafrench'] = 2
otherControls.loc[(otherControls.sic >=2090) & (otherControls.sic <= 2092), 'famafrench'] = 2
otherControls.loc[(otherControls.sic == 2095), 'famafrench'] = 2
otherControls.loc[(otherControls.sic >=2098) & (otherControls.sic <= 2099), 'famafrench'] = 2

otherControls.loc[(otherControls.sic >=2064) & (otherControls.sic <= 2068), 'famafrench'] = 3
otherControls.loc[(otherControls.sic >=2086) & (otherControls.sic <= 2087), 'famafrench'] = 3
otherControls.loc[(otherControls.sic >=2096) & (otherControls.sic <= 2097), 'famafrench'] = 3

otherControls.loc[(otherControls.sic ==2080), 'famafrench'] = 4
otherControls.loc[(otherControls.sic >=2082) & (otherControls.sic <= 2085), 'famafrench'] = 4

otherControls.loc[(otherControls.sic >=2100) & (otherControls.sic <= 2199), 'famafrench'] = 5

otherControls.loc[(otherControls.sic >=920)  & (otherControls.sic <= 999), 'famafrench']  = 6
otherControls.loc[(otherControls.sic >=3650) & (otherControls.sic<= 3652), 'famafrench']  = 6
otherControls.loc[(otherControls.sic ==3732), 'famafrench']  = 6
otherControls.loc[(otherControls.sic >=3930)  & (otherControls.sic<= 3931), 'famafrench']  = 6
otherControls.loc[(otherControls.sic >=3940)  & (otherControls.sic<= 3949), 'famafrench']  = 6

otherControls.loc[(otherControls.sic >=7800)  & (otherControls.sic<= 7829), 'famafrench']  = 7
otherControls.loc[(otherControls.sic >=7830)  & (otherControls.sic<= 7833), 'famafrench']  = 7
otherControls.loc[(otherControls.sic >=7840)  & (otherControls.sic<= 7841), 'famafrench']  = 7
otherControls.loc[(otherControls.sic ==7900), 'famafrench']  = 7
otherControls.loc[(otherControls.sic >=7910)  & (otherControls.sic<= 7911), 'famafrench']  = 7
otherControls.loc[(otherControls.sic >=7920)  & (otherControls.sic<= 7929), 'famafrench']  = 7
otherControls.loc[(otherControls.sic >=7930)  & (otherControls.sic<= 7933), 'famafrench']  = 7
otherControls.loc[(otherControls.sic >=7940)  & (otherControls.sic<= 7949), 'famafrench']  = 7
otherControls.loc[(otherControls.sic ==7980), 'famafrench']  = 7
otherControls.loc[(otherControls.sic >=7990)  & (otherControls.sic<= 7999), 'famafrench']  = 7

otherControls.loc[(otherControls.sic >=2700)  & (otherControls.sic<= 2749), 'famafrench']  = 8
otherControls.loc[(otherControls.sic >=2770)  & (otherControls.sic<= 2771), 'famafrench']  = 8
otherControls.loc[(otherControls.sic >=2780)  & (otherControls.sic<= 2799), 'famafrench']  = 8

otherControls.loc[(otherControls.sic ==2047), 'famafrench']  = 9
otherControls.loc[(otherControls.sic >=2391)  & (otherControls.sic<= 2392), 'famafrench']  = 9
otherControls.loc[(otherControls.sic >=2510)  & (otherControls.sic<= 2519), 'famafrench']  = 9
otherControls.loc[(otherControls.sic >=2590)  & (otherControls.sic<= 2599), 'famafrench']  = 9
otherControls.loc[(otherControls.sic >=2840)  & (otherControls.sic<= 2844), 'famafrench']  = 9
otherControls.loc[(otherControls.sic >=3160)  & (otherControls.sic<= 3161), 'famafrench']  = 9
otherControls.loc[(otherControls.sic >=3170)  & (otherControls.sic<= 3172), 'famafrench']  = 9
otherControls.loc[(otherControls.sic >=3190)  & (otherControls.sic<= 3199), 'famafrench']  = 9
otherControls.loc[(otherControls.sic >=3229)  & (otherControls.sic<= 3229), 'famafrench']  = 9
otherControls.loc[(otherControls.sic ==3260), 'famafrench']  = 9
otherControls.loc[(otherControls.sic >=3262)  & (otherControls.sic<= 3263), 'famafrench']  = 9
otherControls.loc[(otherControls.sic ==3269), 'famafrench']  = 9
otherControls.loc[(otherControls.sic >=3630)  & (otherControls.sic<= 3639), 'famafrench']  = 9
otherControls.loc[(otherControls.sic >=3750)  & (otherControls.sic<= 3751), 'famafrench']  = 9
otherControls.loc[(otherControls.sic ==3800), 'famafrench']  = 9
otherControls.loc[(otherControls.sic>=3860) & (otherControls.sic<=3861), 'famafrench']  = 9
otherControls.loc[(otherControls.sic>=3870) & (otherControls.sic<=3873), 'famafrench']  = 9
otherControls.loc[(otherControls.sic>=3910) & (otherControls.sic<=3911), 'famafrench']  = 9
otherControls.loc[(otherControls.sic==3914) & (otherControls.sic<=3915), 'famafrench']  = 9
otherControls.loc[(otherControls.sic>=3960) & (otherControls.sic<=3962), 'famafrench']  = 9
otherControls.loc[(otherControls.sic==3991), 'famafrench']  = 9
otherControls.loc[(otherControls.sic==3995), 'famafrench']  = 9

otherControls.loc[(otherControls.sic>=2300) & (otherControls.sic<=2390), 'famafrench']  = 10
otherControls.loc[(otherControls.sic>=3020) & (otherControls.sic<=3021), 'famafrench']  = 10
otherControls.loc[(otherControls.sic>=3100) & (otherControls.sic<=3111), 'famafrench']  = 10
otherControls.loc[(otherControls.sic>=3130) & (otherControls.sic<=3131), 'famafrench']  = 10
otherControls.loc[(otherControls.sic>=3140) & (otherControls.sic<=3149), 'famafrench']  = 10
otherControls.loc[(otherControls.sic>=3150) & (otherControls.sic<=3151), 'famafrench']  = 10
otherControls.loc[(otherControls.sic>=3963) & (otherControls.sic<=3965), 'famafrench']  = 10

otherControls.loc[(otherControls.sic>=8000) & (otherControls.sic<=8099), 'famafrench']  = 11

otherControls.loc[(otherControls.sic==3693), 'famafrench']  = 12
otherControls.loc[(otherControls.sic>=3840) & (otherControls.sic<=3851), 'famafrench']  = 12

otherControls.loc[(otherControls.sic>=2830) & (otherControls.sic<=2831), 'famafrench']  = 13
otherControls.loc[(otherControls.sic>=2833) & (otherControls.sic<=2836), 'famafrench']  = 13

otherControls.loc[(otherControls.sic>=2800) & (otherControls.sic<=2829), 'famafrench']  = 14
otherControls.loc[(otherControls.sic>=2850) & (otherControls.sic<=2879), 'famafrench']  = 14
otherControls.loc[(otherControls.sic>=2890) & (otherControls.sic<=2899), 'famafrench']  = 14

# otherControls.loc[(otherControls.sic==3000), 'famafrench']  = 15
otherControls.loc[(otherControls.sic==3031), 'famafrench']  = 15
otherControls.loc[(otherControls.sic==3041), 'famafrench']  = 15
otherControls.loc[(otherControls.sic>=3050) & (otherControls.sic<=3053), 'famafrench']  = 15
otherControls.loc[(otherControls.sic>=3060) & (otherControls.sic<=3099), 'famafrench']  = 15

otherControls.loc[(otherControls.sic>=2200) & (otherControls.sic<=2284), 'famafrench']  = 16
otherControls.loc[(otherControls.sic>=2290) & (otherControls.sic<=2295), 'famafrench']  = 16
otherControls.loc[(otherControls.sic>=2297) & (otherControls.sic<=2299), 'famafrench']  = 16
otherControls.loc[(otherControls.sic>=2393) & (otherControls.sic<=2395), 'famafrench']  = 16
otherControls.loc[(otherControls.sic>=2397) & (otherControls.sic<=2399), 'famafrench']  = 16

otherControls.loc[(otherControls.sic>=800)  & (otherControls.sic<=899), 'famafrench']  = 17
otherControls.loc[(otherControls.sic>=2400) & (otherControls.sic<=2439), 'famafrench']  = 17
otherControls.loc[(otherControls.sic>=2450) & (otherControls.sic<=2459), 'famafrench']  = 17
otherControls.loc[(otherControls.sic>=2490) & (otherControls.sic<=2499), 'famafrench']  = 17
otherControls.loc[(otherControls.sic>=2660) & (otherControls.sic<=2661), 'famafrench']  = 17
otherControls.loc[(otherControls.sic>=2950) & (otherControls.sic<=2952), 'famafrench']  = 17
otherControls.loc[(otherControls.sic==3200), 'famafrench']  = 17
otherControls.loc[(otherControls.sic>=3210) & (otherControls.sic<=3211), 'famafrench']  = 17
otherControls.loc[(otherControls.sic>=3240) & (otherControls.sic<=3241), 'famafrench']  = 17
otherControls.loc[(otherControls.sic>=3250) & (otherControls.sic<=3259), 'famafrench']  = 17
otherControls.loc[(otherControls.sic==3261), 'famafrench']  = 17
otherControls.loc[(otherControls.sic==3264), 'famafrench']  = 17
otherControls.loc[(otherControls.sic>=3270) & (otherControls.sic<=3275), 'famafrench']  = 17
otherControls.loc[(otherControls.sic>=3280) & (otherControls.sic<=3281), 'famafrench']  = 17
otherControls.loc[(otherControls.sic>=3290) & (otherControls.sic<=3293), 'famafrench']  = 17
otherControls.loc[(otherControls.sic>=3295) & (otherControls.sic<=3299), 'famafrench']  = 17
otherControls.loc[(otherControls.sic>=3420) & (otherControls.sic<=3433), 'famafrench']  = 17
otherControls.loc[(otherControls.sic>=3440) & (otherControls.sic<=3442), 'famafrench']  = 17
otherControls.loc[(otherControls.sic==3446), 'famafrench']  = 17
otherControls.loc[(otherControls.sic>=3448) & (otherControls.sic<=3452), 'famafrench']  = 17
otherControls.loc[(otherControls.sic>=3490) & (otherControls.sic<=3499), 'famafrench']  = 17
otherControls.loc[(otherControls.sic==3996), 'famafrench']  = 17

otherControls.loc[(otherControls.sic>=1500) & (otherControls.sic<=1511), 'famafrench']  = 18
otherControls.loc[(otherControls.sic>=1520) & (otherControls.sic<=1549), 'famafrench']  = 18
otherControls.loc[(otherControls.sic>=1600) & (otherControls.sic<=1699), 'famafrench']  = 18
otherControls.loc[(otherControls.sic>=1700) & (otherControls.sic<=1799), 'famafrench']  = 18

otherControls.loc[(otherControls.sic==3300), 'famafrench']  = 19
otherControls.loc[(otherControls.sic>=3310) & (otherControls.sic<=3317), 'famafrench']  = 19
otherControls.loc[(otherControls.sic>=3320) & (otherControls.sic<=3325), 'famafrench']  = 19
otherControls.loc[(otherControls.sic>=3330) & (otherControls.sic<=3339), 'famafrench']  = 19
otherControls.loc[(otherControls.sic>=3340) & (otherControls.sic<=3341), 'famafrench']  = 19
otherControls.loc[(otherControls.sic>=3350) & (otherControls.sic<=3357), 'famafrench']  = 19
otherControls.loc[(otherControls.sic>=3360) & (otherControls.sic<=3369), 'famafrench']  = 19
otherControls.loc[(otherControls.sic>=3370) & (otherControls.sic<=3379), 'famafrench']  = 19
otherControls.loc[(otherControls.sic>=3390) & (otherControls.sic<=3399), 'famafrench']  = 19

otherControls.loc[(otherControls.sic==3400), 'famafrench']  = 20
otherControls.loc[(otherControls.sic>=3443) & (otherControls.sic<=3444), 'famafrench']  = 20
otherControls.loc[(otherControls.sic>=3460) & (otherControls.sic<=3479), 'famafrench']  = 20

otherControls.loc[(otherControls.sic>=3510) & (otherControls.sic<=3536), 'famafrench']  = 21
otherControls.loc[(otherControls.sic==3538), 'famafrench']  = 21
otherControls.loc[(otherControls.sic>=3540) & (otherControls.sic<=3569), 'famafrench']  = 21
otherControls.loc[(otherControls.sic>=3580) & (otherControls.sic<=3582), 'famafrench']  = 21
otherControls.loc[(otherControls.sic>=3585) & (otherControls.sic<=3586), 'famafrench']  = 21
otherControls.loc[(otherControls.sic>=3589) & (otherControls.sic<=3599), 'famafrench']  = 21

otherControls.loc[(otherControls.sic>=3600), 'famafrench']  = 22
otherControls.loc[(otherControls.sic>=3610) & (otherControls.sic<=3613), 'famafrench']  = 22
otherControls.loc[(otherControls.sic>=3620) & (otherControls.sic<=3621), 'famafrench']  = 22
otherControls.loc[(otherControls.sic>=3623) & (otherControls.sic<=3629), 'famafrench']  = 22
otherControls.loc[(otherControls.sic>=3640) & (otherControls.sic<=3646), 'famafrench']  = 22
otherControls.loc[(otherControls.sic>=3648) & (otherControls.sic<=3649), 'famafrench']  = 22
otherControls.loc[(otherControls.sic==3660), 'famafrench']  = 22
otherControls.loc[(otherControls.sic>=3690) & (otherControls.sic<=3692), 'famafrench']  = 22
otherControls.loc[(otherControls.sic==3699), 'famafrench']  = 22

otherControls.loc[(otherControls.sic==2296), 'famafrench']  = 23
otherControls.loc[(otherControls.sic==2396), 'famafrench']  = 23
otherControls.loc[(otherControls.sic>=3010) & (otherControls.sic<=3011), 'famafrench']  = 23
otherControls.loc[(otherControls.sic==3537), 'famafrench']  = 23
otherControls.loc[(otherControls.sic==3647), 'famafrench']  = 23
otherControls.loc[(otherControls.sic==3694), 'famafrench']  = 23
otherControls.loc[(otherControls.sic==3700), 'famafrench']  = 23
otherControls.loc[(otherControls.sic==3710), 'famafrench']  = 23
otherControls.loc[(otherControls.sic==3711), 'famafrench']  = 23
otherControls.loc[(otherControls.sic>=3713) & (otherControls.sic<=3716), 'famafrench']  = 23
otherControls.loc[(otherControls.sic>=3790) & (otherControls.sic<=3792), 'famafrench']  = 23
otherControls.loc[(otherControls.sic==3799), 'famafrench']  = 23

otherControls.loc[(otherControls.sic>=3720) & (otherControls.sic<=3721), 'famafrench']  = 24
otherControls.loc[(otherControls.sic>=3723) & (otherControls.sic<=3725), 'famafrench']  = 24
otherControls.loc[(otherControls.sic>=3728) & (otherControls.sic<=3729), 'famafrench']  = 24

otherControls.loc[(otherControls.sic>=3730) & (otherControls.sic<=3731), 'famafrench']  = 25
otherControls.loc[(otherControls.sic>=3740) & (otherControls.sic<=3743), 'famafrench']  = 25

otherControls.loc[(otherControls.sic>=3760) & (otherControls.sic<=3769), 'famafrench']  = 26
otherControls.loc[(otherControls.sic==3795), 'famafrench']  = 26
otherControls.loc[(otherControls.sic>=3480) & (otherControls.sic<=3489), 'famafrench']  = 26

otherControls.loc[(otherControls.sic>=1040) & (otherControls.sic<=1049), 'famafrench']  = 27

otherControls.loc[(otherControls.sic>=1000) & (otherControls.sic<=1039), 'famafrench']  = 28
otherControls.loc[(otherControls.sic>=1050) & (otherControls.sic<=1119), 'famafrench']  = 28
otherControls.loc[(otherControls.sic>=1400) & (otherControls.sic<=1499), 'famafrench']  = 28

otherControls.loc[(otherControls.sic>=1200) & (otherControls.sic<=1299), 'famafrench']  = 29

otherControls.loc[(otherControls.sic==1300), 'famafrench']  = 30
otherControls.loc[(otherControls.sic>=1310) & (otherControls.sic<=1339), 'famafrench']  = 30
otherControls.loc[(otherControls.sic>=1370) & (otherControls.sic<=1382), 'famafrench']  = 30
otherControls.loc[(otherControls.sic==1389), 'famafrench']  = 30
otherControls.loc[(otherControls.sic>=2900) & (otherControls.sic<=2912), 'famafrench']  = 30
otherControls.loc[(otherControls.sic>=2990) & (otherControls.sic<=2999), 'famafrench']  = 30

otherControls.loc[(otherControls.sic==4900), 'famafrench']  = 31
otherControls.loc[(otherControls.sic>=4910) & (otherControls.sic<=4911), 'famafrench']  = 31
otherControls.loc[(otherControls.sic>=4920) & (otherControls.sic<=4925), 'famafrench']  = 31
otherControls.loc[(otherControls.sic>=4930) & (otherControls.sic<=4932), 'famafrench']  = 31
otherControls.loc[(otherControls.sic==4939), 'famafrench']  = 31
otherControls.loc[(otherControls.sic>=4940) & (otherControls.sic<=4942), 'famafrench']  = 31

otherControls.loc[(otherControls.sic==4800), 'famafrench']  = 32
otherControls.loc[(otherControls.sic>=4810) & (otherControls.sic<=4813), 'famafrench']  = 32
otherControls.loc[(otherControls.sic>=4820) & (otherControls.sic<=4822), 'famafrench']  = 32
otherControls.loc[(otherControls.sic>=4830) & (otherControls.sic<=4841), 'famafrench']  = 32
otherControls.loc[(otherControls.sic>=4880) & (otherControls.sic<=4892), 'famafrench']  = 32
otherControls.loc[(otherControls.sic==4899), 'famafrench']  = 32

otherControls.loc[(otherControls.sic>=7020) & (otherControls.sic<=7021), 'famafrench']  = 33
otherControls.loc[(otherControls.sic>=7030) & (otherControls.sic<=7033), 'famafrench']  = 33
otherControls.loc[(otherControls.sic==7200), 'famafrench']  = 33
otherControls.loc[(otherControls.sic>=7210) & (otherControls.sic<=7212), 'famafrench']  = 33
otherControls.loc[(otherControls.sic>=7214) & (otherControls.sic<=7217), 'famafrench']  = 33
otherControls.loc[(otherControls.sic>=7219) & (otherControls.sic<=7221), 'famafrench']  = 33
otherControls.loc[(otherControls.sic>=7230) & (otherControls.sic<=7231), 'famafrench']  = 33
otherControls.loc[(otherControls.sic>=7240) & (otherControls.sic<=7241), 'famafrench']  = 33
otherControls.loc[(otherControls.sic>=7250) & (otherControls.sic<=7251), 'famafrench']  = 33
otherControls.loc[(otherControls.sic>=7260) & (otherControls.sic<=7299), 'famafrench']  = 33
otherControls.loc[(otherControls.sic==7395), 'famafrench']  = 33
otherControls.loc[(otherControls.sic==7500), 'famafrench']  = 33
otherControls.loc[(otherControls.sic>=7520) & (otherControls.sic<=7549), 'famafrench']  = 33
otherControls.loc[(otherControls.sic==7600), 'famafrench']  = 33
otherControls.loc[(otherControls.sic==7620), 'famafrench']  = 33
otherControls.loc[(otherControls.sic==7622), 'famafrench']  = 33
otherControls.loc[(otherControls.sic==7623), 'famafrench']  = 33
otherControls.loc[(otherControls.sic>=7629) & (otherControls.sic<=7631), 'famafrench']  = 33
otherControls.loc[(otherControls.sic>=7640) & (otherControls.sic<=7641), 'famafrench']  = 33
otherControls.loc[(otherControls.sic>=7690) & (otherControls.sic<=7699), 'famafrench']  = 33
otherControls.loc[(otherControls.sic>=8100) & (otherControls.sic<=8199), 'famafrench']  = 33
otherControls.loc[(otherControls.sic>=8200) & (otherControls.sic<=8299), 'famafrench']  = 33
otherControls.loc[(otherControls.sic>=8300) & (otherControls.sic<=8399), 'famafrench']  = 33
otherControls.loc[(otherControls.sic>=8400) & (otherControls.sic<=8499), 'famafrench']  = 33
otherControls.loc[(otherControls.sic>=8600) & (otherControls.sic<=8699), 'famafrench']  = 33
otherControls.loc[(otherControls.sic>=8800) & (otherControls.sic<=8899), 'famafrench']  = 33
otherControls.loc[(otherControls.sic>=7510) & (otherControls.sic<=7515), 'famafrench']  = 33

otherControls.loc[(otherControls.sic>=2750) & (otherControls.sic<=2759), 'famafrench']  = 34
otherControls.loc[(otherControls.sic==3993), 'famafrench']  = 34
otherControls.loc[(otherControls.sic==7218), 'famafrench']  = 34
otherControls.loc[(otherControls.sic==7300), 'famafrench']  = 34
otherControls.loc[(otherControls.sic>=7310) & (otherControls.sic<=7342), 'famafrench']  = 34
otherControls.loc[(otherControls.sic>=7349) & (otherControls.sic<=7353), 'famafrench']  = 34
otherControls.loc[(otherControls.sic>=7359) & (otherControls.sic<=7372), 'famafrench']  = 34
otherControls.loc[(otherControls.sic>=7374) & (otherControls.sic<=7385), 'famafrench']  = 34
otherControls.loc[(otherControls.sic>=7389) & (otherControls.sic<=7394), 'famafrench']  = 34
otherControls.loc[(otherControls.sic>=7396) & (otherControls.sic<=7397), 'famafrench']  = 34
otherControls.loc[(otherControls.sic==7399), 'famafrench']  = 34
otherControls.loc[(otherControls.sic==7519), 'famafrench']  = 34
otherControls.loc[(otherControls.sic==8700), 'famafrench']  = 34
otherControls.loc[(otherControls.sic>=8710) & (otherControls.sic<=8713), 'famafrench']  = 34
otherControls.loc[(otherControls.sic>=8720) & (otherControls.sic<=8721), 'famafrench']  = 34
otherControls.loc[(otherControls.sic>=8730) & (otherControls.sic<=8734), 'famafrench']  = 34
otherControls.loc[(otherControls.sic>=8740) & (otherControls.sic<=8748), 'famafrench']  = 34
otherControls.loc[(otherControls.sic>=8900) & (otherControls.sic<=8910), 'famafrench']  = 34
otherControls.loc[(otherControls.sic>=8920) & (otherControls.sic<=8999), 'famafrench']  = 34
otherControls.loc[(otherControls.sic>=4220) & (otherControls.sic<=4229), 'famafrench']  = 34

otherControls.loc[(otherControls.sic>=3570) & (otherControls.sic<=3579), 'famafrench']  = 35
otherControls.loc[(otherControls.sic>=3680) & (otherControls.sic<=3689), 'famafrench']  = 35
otherControls.loc[(otherControls.sic==3695), 'famafrench']  = 35
otherControls.loc[(otherControls.sic==7373), 'famafrench']  = 35

otherControls.loc[(otherControls.sic==3622), 'famafrench']  = 36
otherControls.loc[(otherControls.sic>=3661) & (otherControls.sic<=3666), 'famafrench']  = 36
otherControls.loc[(otherControls.sic>=3669) & (otherControls.sic<=3679), 'famafrench']  = 36
otherControls.loc[(otherControls.sic==3810), 'famafrench']  = 36
otherControls.loc[(otherControls.sic==3812), 'famafrench']  = 36

otherControls.loc[(otherControls.sic==3811), 'famafrench']  = 37
otherControls.loc[(otherControls.sic>=3820) & (otherControls.sic<=3827), 'famafrench']  = 37
otherControls.loc[(otherControls.sic>=3829) & (otherControls.sic<=3839), 'famafrench']  = 37

otherControls.loc[(otherControls.sic>=2520) & (otherControls.sic<=2549), 'famafrench']  = 38
otherControls.loc[(otherControls.sic>=2600) & (otherControls.sic<=2639), 'famafrench']  = 38
otherControls.loc[(otherControls.sic>=2670) & (otherControls.sic<=2699), 'famafrench']  = 38
otherControls.loc[(otherControls.sic>=2760) & (otherControls.sic<=2761), 'famafrench']  = 38
otherControls.loc[(otherControls.sic>=3950) & (otherControls.sic<=3955), 'famafrench']  = 38

otherControls.loc[(otherControls.sic>=2440) & (otherControls.sic<=2449), 'famafrench']  = 39
otherControls.loc[(otherControls.sic>=2640) & (otherControls.sic<=2659), 'famafrench']  = 39
otherControls.loc[(otherControls.sic>=3220) & (otherControls.sic<=3221), 'famafrench']  = 39
otherControls.loc[(otherControls.sic>=3410) & (otherControls.sic<=3412), 'famafrench']  = 39

otherControls.loc[(otherControls.sic>=4000) & (otherControls.sic<=4013), 'famafrench']  = 40
otherControls.loc[(otherControls.sic>=4040) & (otherControls.sic<=4049), 'famafrench']  = 40
otherControls.loc[(otherControls.sic==4100), 'famafrench']  = 40
otherControls.loc[(otherControls.sic>=4110) & (otherControls.sic<=4121), 'famafrench']  = 40
otherControls.loc[(otherControls.sic>=4130) & (otherControls.sic<=4131), 'famafrench']  = 40
otherControls.loc[(otherControls.sic>=4140) & (otherControls.sic<=4142), 'famafrench']  = 40
otherControls.loc[(otherControls.sic>=4150) & (otherControls.sic<=4151), 'famafrench']  = 40
otherControls.loc[(otherControls.sic>=4170) & (otherControls.sic<=4173), 'famafrench']  = 40
otherControls.loc[(otherControls.sic>=4190) & (otherControls.sic<=4200), 'famafrench']  = 40
otherControls.loc[(otherControls.sic>=4210) & (otherControls.sic<=4219), 'famafrench']  = 40
otherControls.loc[(otherControls.sic>=4230) & (otherControls.sic<=4231), 'famafrench']  = 40
otherControls.loc[(otherControls.sic>=4240) & (otherControls.sic<=4249), 'famafrench']  = 40
otherControls.loc[(otherControls.sic>=4400) & (otherControls.sic<=4700), 'famafrench']  = 40
otherControls.loc[(otherControls.sic>=4710) & (otherControls.sic<=4712), 'famafrench']  = 40
otherControls.loc[(otherControls.sic>=4720) & (otherControls.sic<=4749), 'famafrench']  = 40
otherControls.loc[(otherControls.sic==4780), 'famafrench']  = 40
otherControls.loc[(otherControls.sic>=4782) & (otherControls.sic<=4785), 'famafrench']  = 40
otherControls.loc[(otherControls.sic==4789), 'famafrench']  = 40

otherControls.loc[(otherControls.sic==5000), 'famafrench']  = 41
otherControls.loc[(otherControls.sic>=5010) & (otherControls.sic<=5015), 'famafrench']  = 41
otherControls.loc[(otherControls.sic>=5020) & (otherControls.sic<=5023), 'famafrench']  = 41
otherControls.loc[(otherControls.sic>=5030) & (otherControls.sic<=5060), 'famafrench']  = 41
otherControls.loc[(otherControls.sic>=5063) & (otherControls.sic<=5065), 'famafrench']  = 41
otherControls.loc[(otherControls.sic>=5070) & (otherControls.sic<=5078), 'famafrench']  = 41
otherControls.loc[(otherControls.sic>=5080) & (otherControls.sic<=5088), 'famafrench']  = 41
otherControls.loc[(otherControls.sic>=5090) & (otherControls.sic<=5094), 'famafrench']  = 41
otherControls.loc[(otherControls.sic==5099), 'famafrench']  = 41
otherControls.loc[(otherControls.sic==5100), 'famafrench']  = 41
otherControls.loc[(otherControls.sic>=5110) & (otherControls.sic<=5113), 'famafrench']  = 41
otherControls.loc[(otherControls.sic>=5120) & (otherControls.sic<=5122), 'famafrench']  = 41
otherControls.loc[(otherControls.sic>=5130) & (otherControls.sic<=5172), 'famafrench']  = 41
otherControls.loc[(otherControls.sic>=5180) & (otherControls.sic<=5182), 'famafrench']  = 41
otherControls.loc[(otherControls.sic>=5190) & (otherControls.sic<=5199), 'famafrench']  = 41

otherControls.loc[(otherControls.sic==5200), 'famafrench']  = 42
otherControls.loc[(otherControls.sic>=5210) & (otherControls.sic<=5231), 'famafrench']  = 42
otherControls.loc[(otherControls.sic>=5250) & (otherControls.sic<=5251), 'famafrench']  = 42
otherControls.loc[(otherControls.sic>=5260) & (otherControls.sic<=5261), 'famafrench']  = 42
otherControls.loc[(otherControls.sic>=5270) & (otherControls.sic<=5271), 'famafrench']  = 42
otherControls.loc[(otherControls.sic>=5300), 'famafrench']  = 42
otherControls.loc[(otherControls.sic>=5310) & (otherControls.sic<=5311), 'famafrench']  = 42
otherControls.loc[(otherControls.sic==5320), 'famafrench']  = 42
otherControls.loc[(otherControls.sic>=5330) & (otherControls.sic<=5331), 'famafrench']  = 42
otherControls.loc[(otherControls.sic==5334), 'famafrench']  = 42
otherControls.loc[(otherControls.sic>=5340) & (otherControls.sic<=5349), 'famafrench']  = 42
otherControls.loc[(otherControls.sic>=5390) & (otherControls.sic<=5400), 'famafrench']  = 42
otherControls.loc[(otherControls.sic>=5410) & (otherControls.sic<=5412), 'famafrench']  = 42
otherControls.loc[(otherControls.sic>=5420) & (otherControls.sic<=5469), 'famafrench']  = 42
otherControls.loc[(otherControls.sic>=5490) & (otherControls.sic<=5500), 'famafrench']  = 42
otherControls.loc[(otherControls.sic>=5510) & (otherControls.sic<=5579), 'famafrench']  = 42
otherControls.loc[(otherControls.sic>=5590) & (otherControls.sic<=5700), 'famafrench']  = 42
otherControls.loc[(otherControls.sic>=5710) & (otherControls.sic<=5722), 'famafrench']  = 42
otherControls.loc[(otherControls.sic>=5730) & (otherControls.sic<=5736), 'famafrench']  = 42
otherControls.loc[(otherControls.sic>=5750) & (otherControls.sic<=5799), 'famafrench']  = 42
otherControls.loc[(otherControls.sic==5900), 'famafrench']  = 42
otherControls.loc[(otherControls.sic>=5910) & (otherControls.sic<=5912), 'famafrench']  = 42
otherControls.loc[(otherControls.sic>=5920) & (otherControls.sic<=5932), 'famafrench']  = 42
otherControls.loc[(otherControls.sic>=5940) & (otherControls.sic<=5990), 'famafrench']  = 42
otherControls.loc[(otherControls.sic>=5992) & (otherControls.sic<=5995), 'famafrench']  = 42
otherControls.loc[(otherControls.sic==5999), 'famafrench']  = 42


otherControls.loc[(otherControls.sic>=5800) & (otherControls.sic<=5819), 'famafrench']  = 43
otherControls.loc[(otherControls.sic>=5820) & (otherControls.sic<=5829), 'famafrench']  = 43
otherControls.loc[(otherControls.sic>=5890) & (otherControls.sic<=7000), 'famafrench']  = 43
otherControls.loc[(otherControls.sic>=7010) & (otherControls.sic<=7019), 'famafrench']  = 43
otherControls.loc[(otherControls.sic>=7040) & (otherControls.sic<=7049), 'famafrench']  = 43
otherControls.loc[(otherControls.sic==7213), 'famafrench']  = 43

otherControls.loc[(otherControls.sic==6000), 'famafrench']  = 44
otherControls.loc[(otherControls.sic>=6010) & (otherControls.sic<=6036), 'famafrench']  = 44
otherControls.loc[(otherControls.sic>=6040) & (otherControls.sic<=6062), 'famafrench']  = 44
otherControls.loc[(otherControls.sic>=6080) & (otherControls.sic<=6082), 'famafrench']  = 44
otherControls.loc[(otherControls.sic>=6090) & (otherControls.sic<=6100), 'famafrench']  = 44
otherControls.loc[(otherControls.sic>=6110) & (otherControls.sic<=6113), 'famafrench']  = 44
otherControls.loc[(otherControls.sic>=6120) & (otherControls.sic<=6179), 'famafrench']  = 44
otherControls.loc[(otherControls.sic>=6190) & (otherControls.sic<=6199), 'famafrench']  = 44

otherControls.loc[(otherControls.sic==6300), 'famafrench']  = 45
otherControls.loc[(otherControls.sic>=6310) & (otherControls.sic<=6331), 'famafrench']  = 45
otherControls.loc[(otherControls.sic>=6350) & (otherControls.sic<=6351), 'famafrench']  = 45
otherControls.loc[(otherControls.sic>=6360) & (otherControls.sic<=6361), 'famafrench']  = 45
otherControls.loc[(otherControls.sic>=6370) & (otherControls.sic<=6379), 'famafrench']  = 45
otherControls.loc[(otherControls.sic>=6390) & (otherControls.sic<=6399), 'famafrench']  = 45
otherControls.loc[(otherControls.sic>=6400) & (otherControls.sic<=6411), 'famafrench']  = 45

otherControls.loc[(otherControls.sic==6500), 'famafrench']  = 47
otherControls.loc[(otherControls.sic==6510), 'famafrench']  = 47
otherControls.loc[(otherControls.sic>=6512) & (otherControls.sic<=6515), 'famafrench']  = 46
otherControls.loc[(otherControls.sic>=6517) & (otherControls.sic<=6532), 'famafrench']  = 46
otherControls.loc[(otherControls.sic>=6540) & (otherControls.sic<=6541), 'famafrench']  = 46
otherControls.loc[(otherControls.sic>=6550) & (otherControls.sic<=6553), 'famafrench']  = 46
otherControls.loc[(otherControls.sic>=6590) & (otherControls.sic<=6599), 'famafrench']  = 46
otherControls.loc[(otherControls.sic>=6610) & (otherControls.sic<=6611), 'famafrench']  = 46

otherControls.loc[(otherControls.sic>=6200) & (otherControls.sic<=6299), 'famafrench']  = 47
otherControls.loc[(otherControls.sic==6700), 'famafrench']  = 47
otherControls.loc[(otherControls.sic>=6710) & (otherControls.sic<=6726), 'famafrench']  = 47
otherControls.loc[(otherControls.sic>=6730) & (otherControls.sic<=6733), 'famafrench']  = 47
otherControls.loc[(otherControls.sic>=6740) & (otherControls.sic<=6779), 'famafrench']  = 47
otherControls.loc[(otherControls.sic>=6790) & (otherControls.sic<=6795), 'famafrench']  = 47
otherControls.loc[(otherControls.sic>=6798) & (otherControls.sic<=6799), 'famafrench']  = 47

otherControls.loc[(otherControls.sic>=4950) & (otherControls.sic<=4961), 'famafrench']  = 48
otherControls.loc[(otherControls.sic>=4970) & (otherControls.sic<=4971), 'famafrench']  = 48
otherControls.loc[(otherControls.sic>=4990) & (otherControls.sic<=4991), 'famafrench']  = 48

print(otherControls.head())

In [None]:
otherControls = otherControls[['gvkey','fyearq','fqtr','assets','netIncome','roa','famafrench', 'sic2', 'indGroup']]

otherControls.rename(columns = {'assets': 'assetsLagged', 'netIncome': 'netIncomeLagged', 'roa': 'roa_lagged'}, 
                    inplace = True)

otherControls['year_toMatchOn'] = otherControls['fyearq'] + 2 

otherControls = otherControls.merge(earliestYear)

otherControls.head()

In [None]:
otherControls = otherControls[otherControls.year_toMatchOn > 1997]

otherControls['ageTercile']  = pd.qcut(otherControls['earliestYear'], 3, labels=False)
otherControls['sizeTercile'] = pd.qcut(otherControls['assetsLagged'], 3, labels=False)
otherControls['profitTercile'] = pd.qcut(otherControls['roa_lagged'], 3, labels=False)

otherControls.profitTercile.value_counts()

In [None]:
otherControls.earliestYear.max()

In [None]:
otherControls.to_csv('../../data/companyData/otherControls.csv')

# Location Data
This is not our final source of data. We use it as a complementary source to match the companies, as many of the ompanies have similar address names in IG and CStat, especially at the end of the data (the last address in IG should match with the CStat address, as the cstat addresses are not updated).

In [None]:
file = "../../data/cstatLocations.csv"
locations = pd.read_csv(file)[['fyear','gvkey','addzip','state']]


locations = locations[~locations.fyear.isna()]
locations = locations[~locations.addzip.isna()]
print(locations.shape)

locations.rename(columns = {'fyear': 'year'},inplace = True)
locations.reset_index(inplace = True,drop= True)
locations['year'] = locations['year'].astype('int64')
locations.drop_duplicates(inplace = True)

In [None]:
locations.shape

In [None]:
compustatChanges.head()

In [None]:
companyHQ = compustatChanges.merge(locations,how='left')

In [None]:
companyHQ = companyHQ[~companyHQ.addzip.isna()]

In [None]:
sum(companyHQ.addzip.isna())

Merge the zip code information with the change information.

In [None]:
zipFile = "../../data/zipLatLong.csv"
zips = pd.read_csv(zipFile)
'''[['ZIP','LAT','LNG']].rename(columns = {'ZIP': 'addzip'})
zips['addzip'] = zips.addzip.astype('object')'''

zips = zips['ZIP,LAT,LNG'].str.split(',', expand=True)

zips.columns = ['addzip','latitude','longitude']

print(zips.head())

In [None]:
companyHQZips = companyHQ.merge(zips)
print(companyHQZips.head(),companyHQZips.shape)

In [None]:
companyHQZips = companyHQZips.drop_duplicates()

In [None]:
companyHQZips.to_csv("../../data/companyHQZips.csv")

-------------------

# Segments Data

In [None]:
import datetime

In [None]:
file = "../../data/companyData/compustatSegments.csv"
segments = pd.read_csv(file)
segments.head()

In [None]:
segments['srcdate'] = pd.DatetimeIndex(pd.to_datetime(segments['srcdate'], format='%Y%m%d'))
segments['year']    = pd.DatetimeIndex(pd.to_datetime(segments['srcdate'], format='%Y%m%d')).year

In [None]:
segments.head()
relats = segments[segments.ctype == 'COMPANY']
relats['customerCleaned'] = relats.cnms.str.lower()
print(relats.shape)

relats = relats[relats.customerCleaned != 'not reported'] 
print(relats.shape)

relats = relats[~relats['customerCleaned'].str.contains('customers',na = True)] 
print(relats.shape)

relats = relats[['year','gvkey','conm','customerCleaned','salecs']].drop_duplicates()

There is a linked version of Compustat that has company names merged in.

It seems like it has maybe an incomplete representation of the companies in it, but it's not too too bad of the company-company sales here.

In [None]:
relats.head()

In [None]:
file = "../../data/companyData/compustatSCLinked.csv"
linked = pd.read_csv(file)

linked['year'] = pd.DatetimeIndex(pd.to_datetime(linked['srcdate'], format='%Y%m%d')).year

In [None]:
linked.salecs.sum()/relats.salecs.sum()

# [['cnms','srcdate','conm']].drop_duplicates().shape

In [None]:
linked.head()

In [None]:
customers = linked[['year','cgvkey']].drop_duplicatzipes().rename(columns = {'cgvkey': 'gvkey'})
suppliers = linked[['year', 'gvkey']].drop_duplicates()

In [None]:
len(linked[['gvkey']].drop_duplicates())

In [None]:
def gatherData(df): 
    df['yearPlus1']  = df['year'] + 1
    df['yearPlus2']  = df['year'] + 2
    df['yearPlus3']  = df['year'] + 3

    df['yearMinus1'] = df['year'] - 1
    df['yearMinus2'] = df['year'] - 2
    df['yearMinus3'] = df['year'] - 3
    
    dfMelted = pd.melt(df,id_vars=['year','gvkey'], var_name='variation', value_name='vyear').\
        drop(['variation','year'], axis = 1).rename(columns = {'vyear': 'year'}).drop_duplicates()

    return(dfMelted)

In [None]:
suppDF = gatherData(suppliers)

In [None]:
custDF = gatherData(customers)

In [None]:
print(suppDF.head())
print(suppDF.shape)

print(custDF.head())
print(custDF.shape)

In [None]:
allCompanies = suppDF.append(custDF).drop_duplicates()

In [None]:
allCompanies.head()

In [None]:
allCompanies.to_csv('../../data/companyData/allCompanyKeys.csv')

## Merge in some of the data

Now let's try merging in the other dataframes.

In [None]:
def mergeCompChanges(test):
    test = test.merge(compustatChanges, how = 'left')

    test.replace([np.inf, -np.inf], np.nan, inplace=True)

    test = test.dropna(subset = ['salesChange','incomeChange','revenueChange','costChange','inventoryChange'])

    return(test)

In [None]:
suppChanges = mergeCompChanges(suppDF)
custChanges = mergeCompChanges(custDF)

In [None]:
custChanges.salesChange.describe()

We also need to merge in the Infogroup data here.

# Clean the company information up

In [None]:
def cleanText(text):
    text = text.\
    replace(" CORP","").replace(" CO","").replace(" INC","").\
    replace(" LTD","").replace(" -CL A","").\
    replace(" -LP","").replace(" LP","").\
    replace("-OLD","").replace(" LLC","").\
    replace(" -CL B","").replace(" -CL i","").replace(" -CL","").\
    replace("-REDH","").replace(" CP","").\
    replace("-ADR","").replace(" PLC","").lower()
    
    
    return text

### Merge in Infogroup data
Let's use the gvkey as the unit of analysis.

In [None]:
hq = pd.read_csv('../../data/companyData/hqPublicAll.csv').drop(["Unnamed: 0"], axis = 1).\
    rename(columns = {'archive_version_year': 'year',
                     'company': 'companyName',
                     'ticker': 'tic'})

# 'ticker': 'tic',

hq['year'] = hq.year.astype('int64')

hq.head()

In [None]:
hq['companyNameClean'] = hq.companyName.apply(cleanText).apply(cleanText).apply(cleanText)

In [None]:
hq.companyNameClean.head()

In [None]:
ig_unique = hq[['tic','companyNameClean']].drop_duplicates()
ig_unique.shape

In [None]:
compIDs = suppChanges.append(custChanges)[['companyName','tic','gvkey']].drop_duplicates().reset_index().\
    drop(['index'],axis = 1)
compIDs.shape

In [None]:
compIDs['companyNameClean'] = compIDs.companyName.apply(cleanText)

Look for compustat IDs that match infogroup IDs. First off, look to see if company name or tic is in infogroup names or tickers.

In [None]:
compIDs.head()

In [None]:
compIDs['hasMatch'] = (compIDs.companyNameClean.isin(ig_unique.companyNameClean) | compIDs.tic.isin(ig_unique.tic))
compIDsNeedsMatch   = compIDs[~compIDs.hasMatch]
compIDsNeedsMatch.shape

In [None]:
compIDs.companyNameClean

In [None]:
def closestName(text):
    matchOrder = process.extract(text, ig_unique.companyNameClean, 
                               scorer=fuzz.token_sort_ratio)
    return(matchOrder)


In [None]:
start = time.time()
closestNames = compIDsNeedsMatch.companyNameClean[0:50].apply(closestName)

time.time() - start

In [None]:
compIDsNeedsMatch.companyNameClean[0:50]

In [None]:
closestNames

In [None]:
compIDsNeedsMatch.to_csv('../../data/companyData/matchesNeeded.csv')

In [None]:
closestNames.to_csv('../../data/companyData/closestNames.csv')

# Stock price data

In [None]:
file = "../../data/crsp_raw.csv"
raw = pd.read_csv(file, encoding = 'unicode_escape')

In [None]:
raw.head()

Let's follow Evan's crsp cleaning and linking process.

1. Drop firms not traded on NYSE or NASDAQ.

In [None]:
other_exch = raw[ (raw['PRIMEXCH'] != 'Q') & (raw['PRIMEXCH'] != 'N') ].index
raw.drop(other_exch, inplace=True)
raw.shape

2. Drop entries with missing return or volume information.

In [None]:
raw.dropna(subset=['VOL', 'RET','date'],inplace=True)
raw['VOL'] = raw['VOL'].astype('int')
raw.drop(raw[raw['RET']=='C'].index,inplace=True)
raw['RET'] = raw['RET'].astype('float')
raw['date'] = raw['date'].astype('int')

3. Drop tickers with average daily trading volume less than 100K shares.

In [None]:
adv = raw[['VOL','TICKER']].groupby(['TICKER']).mean()
low_vol = adv.drop(adv[adv['VOL']>=100000].index)
low_vol_index = raw[raw['TICKER'].isin(low_vol.index)].index
raw.drop(low_vol_index,inplace=True)
raw.shape

4. Drop tickers that traded for less than 20% of the period

In [None]:
ndays = len(raw.date.unique())
obs = raw['TICKER'].value_counts()
rare_obs = obs[obs<0.2*ndays]
rare_index = raw[raw['TICKER'].isin(rare_obs.index)].index
raw.drop(rare_index,inplace=True)
raw.shape

In [None]:
raw = raw.drop_duplicates()

In [None]:
raw.shape

Merge in GVKEY information for linking with compustat.

gv_cik will come in hand with another merge maybe; but to get to compustat, looks like we just need gvkey - permco.

Some of the keys seem to be used twice. Let's see if they're re-used when a company has gone out of business.

In [None]:
# gv_cik = pd.read_csv("data/cik_gvkey.csv",dtype={'cik':str})[['cik','gvkey']]
gv_permco = pd.read_csv("../../data/cik_permco.csv",dtype={'cik':str})
# [['gvkey','LPERMCO']].rename(columns = {'LPERMCO': 'PERMCO'}).drop_duplicates()

gv_permco.loc[gv_permco.LINKENDDT == 'E', 'LINKENDDT'] = '20210101'
gv_permco['existsCurrently'] = pd.DatetimeIndex(pd.to_datetime(gv_permco.LINKENDDT.astype(str), format='%Y%m%d')).year

gv_permco = gv_permco[gv_permco.existsCurrently > 2010]
print(gv_permco.shape)
print(gv_permco.head())

gv_permco = gv_permco[['gvkey','LPERMCO','conm']].rename(columns = {'LPERMCO': 'PERMCO'}).drop_duplicates()

In [None]:
for company in gv_permco[gv_permco.PERMCO.duplicated()].conm:
    print(company)

Drop the duplicated entries. We may want to revisit this later on.

In [None]:
gv_permco = gv_permco[~gv_permco.PERMCO.isin(list(gv_permco.PERMCO[gv_permco.PERMCO.duplicated()]))]

Now just merge the raw return information with this company information so we can link everything.

In [None]:
rawMergeable = pd.merge(raw,gv_permco,on='PERMCO',how = 'left')
rawMergeable.shape

In [None]:
raw.head()

In [None]:
rawMergeable = rawMergeable.drop(['RETX'],axis = 1)
rawMergeable['date'] = pd.to_datetime(rawMergeable['date'].astype(str), format='%Y%m%d')
rawMergeable['year'] = pd.DatetimeIndex(rawMergeable['date']).year
rawMergeable.head()

In [None]:
outfile =  '../../data/stockReturns.pkl'
with open(outfile, 'wb') as pickle_file:
    pickle.dump(rawMergeable, pickle_file)

----------------------------------------------

In [None]:
rawMergeable.year.value_counts()

In [None]:
crsp2010s = crsp[['date','year','PERMNO','COMNAM','PRC']]

In [None]:
sum(crsp.RET == crsp.RETX)