In [2]:
import pickle as pkl
import pandas as pd
import matplotlib
import os
import re

import collections
import datetime
import time

import geopandas as gpd

import numpy as np

import rasterio

import spacy
  
nlp = spacy.load('en_core_web_lg')
from sklearn.metrics.pairwise import cosine_similarity

import itertools

# Get Data

## Changes from year to year

In [None]:
changes = pd.read_csv("../../data/companyData/compustatChanges_2010s.csv").drop(columns = ['Unnamed: 0'])

changes.head()

In [None]:
# Load the additional control variables, discard the saved index and the
# fiscal-year column, and align the key names with the `changes` table.
otherControls = (
    pd.read_csv('../../data/companyData/otherControls.csv')
    .drop(columns=['Unnamed: 0', 'fyearq'])
    .rename(columns={'year_toMatchOn': 'year', 'fqtr': 'qtr'})
)

In [None]:
otherControls.head()

In [None]:
# Attach the control variables to the year-over-year changes. No `on=` is
# given, so merge() performs an inner join on every column name the two frames
# share; the shapes printed before and after show how many rows survive.
print(changes.shape)
changes = changes.merge(otherControls)
print(changes.shape)

# One row per distinct (gvkey, famafrench, naics) combination, with NAICS
# truncated to its first two characters.
industries = changes[['gvkey','famafrench','naics']].drop_duplicates()
industries['naics'] = industries.naics.astype('str').str.slice(0,2)

In [None]:
changes.to_csv("../../data/companyData/compustatChanges_2010s_withControls.csv")
changes.head()

In [3]:
changes = pd.read_csv("../../data/companyData/compustatChanges_2010s_withControls.csv")

In [4]:
industries = changes[['gvkey','famafrench','naics']].drop_duplicates()
industries['naics'] = industries.naics.astype('str').str.slice(0,2)

## SC Linking Table for 2010s

In [None]:
# Disabled sanity check on the reporting requirements: the snippet below (kept
# as a string so it does not run) compares the number of unique customers per
# unique supplier for links dated before 2014 against links dated after 2010.

'''c_linksTest = pd.read_csv("../../data/companyData/compustatSCLinked.csv")[['srcdate','gvkey','cgvkey']]
c_linksTest['year'] = c_linksTest.srcdate.astype('str').str.slice(0,4).astype('int64')

bs = c_linksTest[c_linksTest.year < 2014]
print("Customers per supplier, 1978-2013 Pd: ", len(bs.cgvkey.unique())/len(bs.gvkey.unique()))

bs2 = c_linksTest[c_linksTest.year > 2010]
print("Customers per supplier, Recent Pd: ", len(bs2.cgvkey.unique())/len(bs2.gvkey.unique()))'''


In [7]:
# Load the raw supplier -> customer links; the reporting year is taken from
# the first four characters of srcdate.
c_links = pd.read_csv("../../data/companyData/compustatSCLinked.csv")
c_links['year'] = c_links['srcdate'].astype('str').str[:4].astype('int64')

# Keep links from 2010 onwards and give the two gvkey columns explicit roles.
c_links = (
    c_links.loc[c_links['year'] > 2009, ['year', 'gvkey', 'cgvkey', 'salecs']]
    .rename(columns={'gvkey': 'supplier_gvkey', 'cgvkey': 'customer_gvkey'})
)

print(c_links.shape)

c_links.head()

(34473, 4)


Unnamed: 0,year,supplier_gvkey,customer_gvkey,salecs
80,2010,1013,9899,300.0
81,2010,1013,2136,146.0
281,2016,1094,31673,78.193
282,2017,1094,31673,76.598
283,2017,1094,7171,70.215


In [10]:
# Label the industry lookup as customer attributes. merge() then joins on the
# only shared column name, customer_gvkey, as an inner join, so links whose
# customer has no industry row are dropped.
industries.columns = ['customer_gvkey','customer_famafrench','customer_naics']

c_links = c_links.merge(industries)

# Relabel the same lookup as supplier attributes and repeat for the supplier
# side. Note: `industries` keeps the supplier_* column names after this cell.
industries.columns = ['supplier_gvkey','supplier_famafrench','supplier_naics']

c_links = c_links.merge(industries)
c_links.head()

Unnamed: 0,year,supplier_gvkey,customer_gvkey,salecs,customer_famafrench,customer_naics,supplier_famafrench,supplier_naics
0,2010,2497,9899,461.6,33.0,51,18.0,23
1,2011,2497,9899,692.065,33.0,51,18.0,23
2,2012,2497,9899,670.822,33.0,51,18.0,23
3,2013,2497,9899,778.462,33.0,51,18.0,23
4,2014,2497,9899,968.479,33.0,51,18.0,23


In [11]:
c_links.shape

(30953, 8)

Now check how common it is for a link to have one firm inside and the other firm outside the industries of interest.

For now, let's keep all the different industry types.

We can always filter later if we need to.

In [12]:
'''ofInterest = ['11','21','22','23','31','32','33','42','44','45','48','49']

c_linksCut = c_links[~(c_links.customer_naics.isin(ofInterest) & c_links.supplier_naics.isin(ofInterest))]

c_linksCut['relat'] = c_linksCut.customer_naics + "_" + c_linksCut.supplier_naics
c_linksCut.relat.value_counts()[0:10]

c_linksCut.supplier_naics.value_counts()[0:10]'''

'ofInterest = [\'11\',\'21\',\'22\',\'23\',\'31\',\'32\',\'33\',\'42\',\'44\',\'45\',\'48\',\'49\']\n\nc_linksCut = c_links[~(c_links.customer_naics.isin(ofInterest) & c_links.supplier_naics.isin(ofInterest))]\n\nc_linksCut[\'relat\'] = c_linksCut.customer_naics + "_" + c_linksCut.supplier_naics\nc_linksCut.relat.value_counts()[0:10]\n\nc_linksCut.supplier_naics.value_counts()[0:10]'

In [13]:
# c_links.to_csv("../../data/companyData/c_links.csv")
c_links = pd.read_csv("../../data/companyData/c_links.csv")


## Compustat and ABI Linking

In [14]:
gvKey_abiLinkingTable = pd.read_csv('../../data/companyData/linkingTable.csv').drop(columns = ['Unnamed: 0'])

base_columns = gvKey_abiLinkingTable.columns 
customer_columns = "customer_" + base_columns
supplier_columns = "supplier_" + base_columns


gvKey_abiLinkingTable.head()

Unnamed: 0,cstatCompanies,igCompanies,delete,gvkey,abi
0,american software,american software,,1562,4378204
1,apco oil and gas,apco oil gas,,1682,544813678
2,constellation energy grp,constellation energy,,1995,506384064
3,central natural res,central natural resources,,2852,312712631
4,cracker barrel old ctry stor,cracker barrel olduntry str,,3570,852053057


In [15]:
hasMatch = gvKey_abiLinkingTable.gvkey.unique()

In [16]:
sum(c_links.supplier_gvkey.isin(hasMatch) | c_links.customer_gvkey.isin(hasMatch))

28655

# Merge

In [None]:

#########################
# merge in customer information
# The linking table's columns are renamed in place with the customer_ prefix,
# so the join key becomes customer_gvkey. merge() defaults to an inner join.
gvKey_abiLinkingTable.columns = customer_columns

print(c_links.shape)
c_linksMerge1 = c_links.merge(gvKey_abiLinkingTable, on ='customer_gvkey')
print(c_links.shape,c_linksMerge1.shape)



#########################
# and merge in supplier 
# Same table, now relabelled with the supplier_ prefix; it keeps these names
# after the cell finishes.
gvKey_abiLinkingTable.columns = supplier_columns

# The shapes printed here compare the original link table with the final
# two-sided merge (not with the intermediate c_linksMerge1).
print(c_links.shape)
c_linksMerge2 = c_linksMerge1.merge(gvKey_abiLinkingTable, on ='supplier_gvkey')
print(c_links.shape,c_linksMerge2.shape)

In [None]:
c_linksMerge2.to_csv("../../data/companyData/clinks_IG_selected.csv")

In [None]:
c_linksMerge2

In [None]:
gvKey_abiLinkingTable

Some links do not survive the merge with the linking table. This is probably because (1) the companies are not in North America, or (2) the companies are not in the physical-goods industries we're interested in. We can verify this by looking at the rows of c_links where both the customer and the supplier are in the dataset of interest.

In [None]:
# Company table; cstatZipcode is read as a string column.
chq     = pd.read_csv("../../data/chq.csv",dtype={'cstatZipcode': 'object'}).drop(columns = {'Unnamed: 0'})

# Links whose customer AND supplier gvkeys both appear in chq.
c_linkTest = c_links[c_links.customer_gvkey.isin(chq.gvkey.unique()) & \
                     c_links.supplier_gvkey.isin(chq.gvkey.unique())]

# NOTE(review): the value printed is a ratio of link rows (matched rows over
# rows with both firms in chq), i.e. a fraction rather than a percentage, and
# it counts links rather than firms — confirm the label is what is intended.
print("Percent of firms with a match: ", c_linksMerge2.shape[0]/c_linkTest.shape[0])

It's entirely possible that we have too small of a sample from the 2010s alone. Let's just try it though and see how it goes.

First, build a sample that covers each company for the three years on either side of every year in which it appears in a reported customer–supplier link.

In [None]:
scTableCustomers = c_linksMerge2.copy()[['year','customer_gvkey','customer_abi']].drop_duplicates()
scTableSuppliers = c_linksMerge2.copy()[['year','supplier_gvkey','supplier_abi']].drop_duplicates()

In [None]:
scTableCustomers.head()

In [None]:
def makeThreeEitherSide(df, nYears = 3):
    """Stack copies of `df` whose `year` is shifted by +/-1 ... +/-nYears.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric `year` column. It is not modified.
    nYears : int, default 3
        How many years to extend on each side of every row's year.

    Returns
    -------
    pandas.DataFrame
        The forward-shifted copies (+1 .. +nYears) followed by the
        backward-shifted copies (-1 .. -nYears), with the same columns as
        `df`. The unshifted rows themselves are not included, and the
        original index labels are repeated across the stacked copies.
    """
    offsets = range(1, nYears + 1)

    # Each copy must use its own offset; shifting every copy by one year would
    # only ever produce year+1 and year-1, repeated.
    forward  = [df.assign(year = df['year'] + k) for k in offsets]
    backward = [df.assign(year = df['year'] - k) for k in offsets]

    shifted = forward + backward
    if not shifted:
        # nYears <= 0: nothing to stack, and pd.concat([]) would raise.
        return df.iloc[0:0].copy()

    return pd.concat(shifted)

In [None]:
allCustomerData = makeThreeEitherSide(scTableCustomers)
allCustomerData.columns = ['year','gvkey','abi']


allSupplierData = makeThreeEitherSide(scTableSuppliers)
allSupplierData.columns = ['year','gvkey','abi']

In [None]:
allSupplierData.year

In [None]:
# Every ABI id that appears on either side of a link, de-duplicated.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
allAbi = pd.concat([allCustomerData.abi, allSupplierData.abi]).drop_duplicates()

In [None]:
hqsOnly = pd.read_csv("../../data/ig2010s_uniqueHQs.csv").drop(columns = {'Unnamed: 0'})

In [None]:
hqsOnly.head()

In [None]:
# Headquarters records (multiple locations per firm), keyed by archive year.
hq = (
    pd.read_csv("../../data/ig2010s_uniqueHQs_multLocations.csv")
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'archive_version_year': 'year'})
)

hq['year'] = hq['year'].astype('int64')

# Restrict to the ABI ids that occur in the customer/supplier sample.
hqRelevant = hq.loc[hq['abi'].isin(allAbi)]

In [None]:
hqRelevant[hqRelevant.abi == 71340]

In [None]:
allSupplierData = allSupplierData.merge(hqRelevant).drop_duplicates()
allCustomerData = allCustomerData.merge(hqRelevant).drop_duplicates()

In [None]:
allSupplierData.head()

In [None]:
allCustomerData.to_csv("../../data/companyData/allCustomerData.csv")
allSupplierData.to_csv("../../data/companyData/allSupplierData.csv")

# Find Customer and Supplier pairings and merge with change data
## Can pick up here

In [17]:
allSupplierData = pd.read_csv("../../data/companyData/allSupplierData.csv").drop(columns = ['Unnamed: 0'])
allCustomerData = pd.read_csv("../../data/companyData/allCustomerData.csv").drop(columns = ['Unnamed: 0'])

In [18]:
changes = pd.read_csv("../../data/companyData/compustatChanges_2010s_withControls.csv").drop(columns = ['Unnamed: 0'])
changes.head()

Unnamed: 0,year,qtr,gvkey,companyName,tic,naics,curcdq,incomeChange,revenueChange,revenueChangeAbsolute,costChange,inventoryChange,netIncome,roa,famafrench,earliestYear
0,2010,1.0,1004,AAR CORP,AIR,423860.0,USD,0.213983,0.200565,81.107,0.178258,0.055631,15.153,0.01407,42.0,1968
1,2010,2.0,1004,AAR CORP,AIR,423860.0,USD,0.045617,0.059577,27.099,0.017636,0.086776,17.855,0.0157,42.0,1968
2,2010,3.0,1004,AAR CORP,AIR,423860.0,USD,0.153198,0.166276,76.16,0.129456,0.16927,20.095,0.01507,42.0,1968
3,2010,4.0,1004,AAR CORP,AIR,423860.0,USD,-0.398739,0.174283,85.02,0.136013,0.182304,22.041,0.016183,42.0,1968
4,2011,1.0,1004,AAR CORP,AIR,423860.0,USD,0.096386,0.133883,65.0,0.101523,0.10415,14.975,0.011086,42.0,1968


In [19]:
changes.year.value_counts()

2015    35383
2016    35257
2017    34868
2018    34802
2014    33699
2013    33325
2012    32570
2011    32498
2010    31083
Name: year, dtype: int64

In [20]:
allSupplierData.head()

Unnamed: 0,year,gvkey,abi,ticker,company,state,city,address_line_1,zipcode,latitude,longitude,parent_employee_size_code,location_employee_size_code,employeesAtLocation
0,2011,2497,482985413,MTZ,MAS TEC INC,FL,CORAL GABLES,800 S DOUGLAS RD # 12,33134,25.7639,-80.25634,250.0,250.0,1.0
1,2012,2497,482985413,MTZ,MAS TEC INC,FL,CORAL GABLES,800 S DOUGLAS RD # 12,33134,25.7639,-80.25634,250.0,250.0,1.0
2,2013,2497,482985413,MTZ,MAS TEC INC,FL,CORAL GABLES,800 S DOUGLAS RD # 1200,33134,25.76375,-80.25635,250.0,250.0,1.0
3,2014,2497,482985413,MTZ,MAS TEC INC,FL,CORAL GABLES,800 S DOUGLAS RD # 1200,33134,25.7639,-80.25634,250.0,250.0,1.0
4,2015,2497,482985413,MTZ,MAS TEC INC,FL,CORAL GABLES,800 S DOUGLAS RD # 1200,33134,25.76375,-80.25635,250.0,250.0,1.0


In [21]:
allSupplierData.zipcode.drop_duplicates().to_csv("../../data/companyData/uniqueZIPS.csv")

  """Entry point for launching an IPython kernel.


In [22]:
suppliers = changes.merge(allSupplierData[['year','gvkey','zipcode','employeesAtLocation']])
suppliers.shape

(29006, 18)

In [23]:
customers = changes.merge(allCustomerData[['year','gvkey','zipcode','employeesAtLocation']])
customers.head()

Unnamed: 0,year,qtr,gvkey,companyName,tic,naics,curcdq,incomeChange,revenueChange,revenueChangeAbsolute,costChange,inventoryChange,netIncome,roa,famafrench,earliestYear,zipcode,employeesAtLocation
0,2010,1.0,1045,AMERICAN AIRLINES GROUP INC,AAL,481111.0,USD,-0.136634,0.091752,465.0,0.091284,0.077899,81.0,0.00271,41.0,1963,76155,1.0
1,2010,2.0,1045,AMERICAN AIRLINES GROUP INC,AAL,481111.0,USD,25.0,0.077547,440.0,0.139454,0.126538,317.0,0.010424,41.0,1963,76155,1.0
2,2010,3.0,1045,AMERICAN AIRLINES GROUP INC,AAL,481111.0,USD,-2.132867,0.091407,534.0,0.165973,0.097391,175.0,0.005912,41.0,1963,76155,1.0
3,2010,4.0,1045,AMERICAN AIRLINES GROUP INC,AAL,481111.0,USD,10.173469,0.073935,413.0,0.105994,0.038721,-69.0,-0.002415,41.0,1963,76155,1.0
4,2011,1.0,1045,AMERICAN AIRLINES GROUP INC,AAL,481111.0,USD,2.807339,0.09109,504.0,0.082808,0.040336,-341.0,-0.011854,41.0,1963,76155,1.0


In [24]:
print(allCustomerData.shape,allSupplierData.shape)

(6159, 14) (9382, 14)


In [None]:
# Unique ZIP codes across customer and supplier locations, pickled for reuse.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
relevantZips = pd.concat([allCustomerData.zipcode, allSupplierData.zipcode]).unique()
outfile =  '../../data/companyData/relevantZips.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(relevantZips, pickle_file)

------------------------------------------------

# Weather Data

In [5]:
# Quarterly weather statistics by ZIP code.
averages = (
    pd.read_csv("../../data/companyData/quarterlyStatsByZip.csv")
    .drop(columns=["Unnamed: 0"])
    .rename(columns={'ZIP': 'zipcode'})
)

# The quarter number is the second character of the `quarter` label; store it
# as a float in `qtr` and discard the original label column.
averages['qtr'] = averages['quarter'].str[1:2].astype('float')
averages = averages.drop(columns=['quarter'])

averages.head()

Unnamed: 0,zipcode,quarterly_avg_precip,quarterly_median_precip,quarterly_variance_precip,quarterly_avg_temp,quarterly_median_temp,quarterly_variance_temp,qtr
0,1238.0,2.757483,0.0,36.928115,2.409069,2.219,44.018717,1.0
1,1238.0,3.459268,0.01,55.665913,19.287705,19.91,50.832629,2.0
2,1238.0,3.45253,0.0,66.193167,24.876742,25.417,18.847386,3.0
3,1238.0,3.300801,0.0,70.256476,8.784418,8.775,58.056296,4.0
4,1434.0,3.042919,0.0,52.736437,3.692083,3.428,41.687367,1.0


In [6]:
# allWeather = pd.read_csv("../../../../../../../Volumes/backup2/dissData/prism/allWeatherBins_2010.2019.csv").\
allWeather = pd.read_csv("../../data/companyData/allWeatherBins_2009to2018.csv").\
    drop(columns = {"Unnamed: 0",'zip','yearQuarter'})

In [7]:
changes.head()

Unnamed: 0.1,Unnamed: 0,year,qtr,gvkey,companyName,tic,naics,curcdq,incomeChange,revenueChange,revenueChangeAbsolute,costChange,inventoryChange,netIncome,roa,famafrench,earliestYear
0,0,2010,1.0,1004,AAR CORP,AIR,423860.0,USD,0.213983,0.200565,81.107,0.178258,0.055631,15.153,0.01407,42.0,1968
1,1,2010,2.0,1004,AAR CORP,AIR,423860.0,USD,0.045617,0.059577,27.099,0.017636,0.086776,17.855,0.0157,42.0,1968
2,2,2010,3.0,1004,AAR CORP,AIR,423860.0,USD,0.153198,0.166276,76.16,0.129456,0.16927,20.095,0.01507,42.0,1968
3,3,2010,4.0,1004,AAR CORP,AIR,423860.0,USD,-0.398739,0.174283,85.02,0.136013,0.182304,22.041,0.016183,42.0,1968
4,4,2011,1.0,1004,AAR CORP,AIR,423860.0,USD,0.096386,0.133883,65.0,0.101523,0.10415,14.975,0.011086,42.0,1968


In [31]:
allWeather.head()

Unnamed: 0,temp_annualquant_0.05,temp_annualquant_tossThisOne,temp_annualquant_0.95,temp_annualquant_1.0,precip_annualquant_tossThisOne,precip_annualquant_0.95,precip_annualquant_1.0,temp_monthlyquant_0.05,temp_monthlyquant_tossThisOne,temp_monthlyquant_0.95,...,temp_zipquant_0.05,temp_zipquant_tossThisOne,temp_zipquant_0.95,temp_zipquant_1.0,precip_zipquant_tossThisOne,precip_zipquant_0.95,precip_zipquant_1.0,year,qtr,zipcode
0,39,51,0,0,25,8,2,12,78,0,...,7,83,0,0,25,8,2,2009,1.0,1238
1,0,88,3,0,31,7,6,12,74,2,...,0,49,16,26,30,6,8,2009,2.0,1238
2,0,92,0,0,18,6,13,12,80,0,...,0,15,17,60,18,5,14,2009,3.0,1238
3,13,79,0,0,29,6,6,14,78,0,...,2,88,2,0,27,6,8,2009,4.0,1238
4,27,63,0,0,27,4,8,3,87,0,...,3,85,2,0,26,4,9,2010,1.0,1238


In [None]:
'''allWeather['yearQtr'] = allWeather.year + (allWeather.qtr - 1)/4

lag1 = allWeather.copy()[['zipcode','yearQtr','precip_quant_1.0','tmax_quant_1.0']]
lag1['yearQtr'] += 0.25
lag1.rename(columns = {'precip_quant_1.0':'lag1_precip_quant_1.0','tmax_quant_1.0':'lag1_tmax_quant_1.0'},inplace = True)
# allWeather = allWeather.merge(lag1)


lag2 = allWeather.copy()[['zipcode','yearQtr','precip_quant_1.0','tmax_quant_1.0']]
lag2['yearQtr'] += 0.5
lag2.rename(columns = {'precip_quant_1.0':'lag2_precip_quant_2.0','tmax_quant_1.0':'lag2_tmax_quant_2.0'},inplace = True)
# allWeather = allWeather.merge(lag2)


lag3 = allWeather.copy()[['zipcode','yearQtr','precip_quant_1.0','tmax_quant_1.0']]
lag3['yearQtr'] += 0.75
lag3.rename(columns = {'precip_quant_1.0':'lag3_precip_quant_3.0','tmax_quant_1.0':'lag3_tmax_quant_3.0'},inplace = True)
# allWeather = allWeather.merge(lag3)


lag4 = allWeather.copy()[['zipcode','yearQtr','precip_quant_1.0','tmax_quant_1.0']]
lag4['yearQtr'] += 1
lag4.rename(columns = {'precip_quant_1.0':'lag4_precip_quant_4.0','tmax_quant_1.0':'lag4_tmax_quant_4.0'},inplace = True)
# allWeather = allWeather.merge(lag4)

print(allWeather.shape)'''

In [32]:
allWeather.year.value_counts()

2015    5112
2014    5112
2013    5112
2012    5112
2011    5112
2010    5112
2009    5112
2018    5112
2017    5112
2016    5112
Name: year, dtype: int64

In [None]:
'''allWeather = allWeather.merge(lag1).merge(lag2).merge(lag3).merge(lag4)

print(oneMerge.shape)

allWeather_withLags.year.value_counts()'''

In [None]:
# NOTE(review): the `changes` previews in this notebook show no zipcode
# column, so this merge appears to join to allWeather on year/qtr only, which
# would pair every firm-quarter with every ZIP code. Confirm this is intended
# before running; the customer/supplier merges below do carry a zipcode.
changesWithWeather  = changes.merge(allWeather).merge(averages)
changesWithWeather.shape

In [42]:
customersWithWeather = customers.merge(allWeather).merge(averages)
customersWithWeather.shape

(18821, 45)

In [43]:
suppliersWithWeather = suppliers.merge(allWeather).merge(averages)
suppliersWithWeather.shape

(28580, 45)

In [44]:
suppliersWithWeather.to_csv("../../data/companyData/suppliersWithWeather.csv")
customersWithWeather.to_csv("../../data/companyData/customersWithWeather.csv")

In [None]:
suppliersWithWeather = pd.read_csv("../../data/companyData/suppliersWithWeather.csv").drop(columns = {'Unnamed: 0'})
customersWithWeather = pd.read_csv("../../data/companyData/customersWithWeather.csv").drop(columns = {'Unnamed: 0'})

In [45]:
customersWithWeather.columns

Index(['year', 'qtr', 'gvkey', 'companyName', 'tic', 'naics', 'curcdq',
       'incomeChange', 'revenueChange', 'revenueChangeAbsolute', 'costChange',
       'inventoryChange', 'netIncome', 'roa', 'famafrench', 'earliestYear',
       'zipcode', 'employeesAtLocation', 'temp_annualquant_0.05',
       'temp_annualquant_tossThisOne', 'temp_annualquant_0.95',
       'temp_annualquant_1.0', 'precip_annualquant_tossThisOne',
       'precip_annualquant_0.95', 'precip_annualquant_1.0',
       'temp_monthlyquant_0.05', 'temp_monthlyquant_tossThisOne',
       'temp_monthlyquant_0.95', 'temp_monthlyquant_1.0',
       'precip_monthlyquant_tossThisOne', 'precip_monthlyquant_0.95',
       'precip_monthlyquant_1.0', 'temp_zipquant_0.05',
       'temp_zipquant_tossThisOne', 'temp_zipquant_0.95', 'temp_zipquant_1.0',
       'precip_zipquant_tossThisOne', 'precip_zipquant_0.95',
       'precip_zipquant_1.0', 'quarterly_avg_precip',
       'quarterly_median_precip', 'quarterly_variance_precip',
       'qu

In [46]:
customersWithWeather.head()

Unnamed: 0,year,qtr,gvkey,companyName,tic,naics,curcdq,incomeChange,revenueChange,revenueChangeAbsolute,...,temp_zipquant_1.0,precip_zipquant_tossThisOne,precip_zipquant_0.95,precip_zipquant_1.0,quarterly_avg_precip,quarterly_median_precip,quarterly_variance_precip,quarterly_avg_temp,quarterly_median_temp,quarterly_variance_temp
0,2010,1.0,1045,AMERICAN AIRLINES GROUP INC,AAL,481111.0,USD,-0.136634,0.091752,465.0,...,6,15,7,6,2.376315,0.0,56.895214,16.170472,16.924,51.717128
1,2011,1.0,1045,AMERICAN AIRLINES GROUP INC,AAL,481111.0,USD,2.807339,0.09109,504.0,...,24,17,1,2,2.376315,0.0,56.895214,16.170472,16.924,51.717128
2,2012,1.0,1045,AMERICAN AIRLINES GROUP INC,AAL,481111.0,USD,-0.794578,0.010104,61.0,...,27,15,1,8,2.376315,0.0,56.895214,16.170472,16.924,51.717128
3,2013,1.0,1045,AMERICAN AIRLINES GROUP INC,AAL,481111.0,USD,-2.407625,0.639062,3897.0,...,13,13,1,5,2.376315,0.0,56.895214,16.170472,16.924,51.717128
4,2014,1.0,1045,AMERICAN AIRLINES GROUP INC,AAL,481111.0,USD,0.941667,-0.016808,-168.0,...,14,18,0,1,2.376315,0.0,56.895214,16.170472,16.924,51.717128


In [47]:
frames = [customersWithWeather, suppliersWithWeather]

allCompanies = pd.concat(frames).drop_duplicates()

print(allCompanies.shape)

(42352, 45)


In [48]:
allCompanies.head()

Unnamed: 0,year,qtr,gvkey,companyName,tic,naics,curcdq,incomeChange,revenueChange,revenueChangeAbsolute,...,temp_zipquant_1.0,precip_zipquant_tossThisOne,precip_zipquant_0.95,precip_zipquant_1.0,quarterly_avg_precip,quarterly_median_precip,quarterly_variance_precip,quarterly_avg_temp,quarterly_median_temp,quarterly_variance_temp
0,2010,1.0,1045,AMERICAN AIRLINES GROUP INC,AAL,481111.0,USD,-0.136634,0.091752,465.0,...,6,15,7,6,2.376315,0.0,56.895214,16.170472,16.924,51.717128
1,2011,1.0,1045,AMERICAN AIRLINES GROUP INC,AAL,481111.0,USD,2.807339,0.09109,504.0,...,24,17,1,2,2.376315,0.0,56.895214,16.170472,16.924,51.717128
2,2012,1.0,1045,AMERICAN AIRLINES GROUP INC,AAL,481111.0,USD,-0.794578,0.010104,61.0,...,27,15,1,8,2.376315,0.0,56.895214,16.170472,16.924,51.717128
3,2013,1.0,1045,AMERICAN AIRLINES GROUP INC,AAL,481111.0,USD,-2.407625,0.639062,3897.0,...,13,13,1,5,2.376315,0.0,56.895214,16.170472,16.924,51.717128
4,2014,1.0,1045,AMERICAN AIRLINES GROUP INC,AAL,481111.0,USD,0.941667,-0.016808,-168.0,...,14,18,0,1,2.376315,0.0,56.895214,16.170472,16.924,51.717128


In [49]:
allCompanies.to_csv("../../data/companyData/allCompaniesWithWeather.csv")

In [9]:
import statsmodels.formula.api as smf

  import pandas.util.testing as tm


In [10]:
goodsData = pd.read_csv('extremes/goodsData.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'extremes/goodsData.csv'

In [59]:
allCompanies.shape

(42352, 45)

In [50]:
allCompanies.head()

Unnamed: 0,year,qtr,gvkey,companyName,tic,naics,curcdq,incomeChange,revenueChange,revenueChangeAbsolute,...,temp_zipquant_1.0,precip_zipquant_tossThisOne,precip_zipquant_0.95,precip_zipquant_1.0,quarterly_avg_precip,quarterly_median_precip,quarterly_variance_precip,quarterly_avg_temp,quarterly_median_temp,quarterly_variance_temp
0,2010,1.0,1045,AMERICAN AIRLINES GROUP INC,AAL,481111.0,USD,-0.136634,0.091752,465.0,...,6,15,7,6,2.376315,0.0,56.895214,16.170472,16.924,51.717128
1,2011,1.0,1045,AMERICAN AIRLINES GROUP INC,AAL,481111.0,USD,2.807339,0.09109,504.0,...,24,17,1,2,2.376315,0.0,56.895214,16.170472,16.924,51.717128
2,2012,1.0,1045,AMERICAN AIRLINES GROUP INC,AAL,481111.0,USD,-0.794578,0.010104,61.0,...,27,15,1,8,2.376315,0.0,56.895214,16.170472,16.924,51.717128
3,2013,1.0,1045,AMERICAN AIRLINES GROUP INC,AAL,481111.0,USD,-2.407625,0.639062,3897.0,...,13,13,1,5,2.376315,0.0,56.895214,16.170472,16.924,51.717128
4,2014,1.0,1045,AMERICAN AIRLINES GROUP INC,AAL,481111.0,USD,0.941667,-0.016808,-168.0,...,14,18,0,1,2.376315,0.0,56.895214,16.170472,16.924,51.717128


## Get first-hop SC data

In [51]:
c_links = pd.read_csv("../../data/companyData/clinks_IG_selected.csv").drop(columns = {'Unnamed: 0'})
c_links.head()

Unnamed: 0,year,supplier_gvkey,customer_gvkey,salecs,customer_famafrench,customer_naics,supplier_famafrench,supplier_naics,customer_cstatCompanies,customer_igCompanies,customer_delete,customer_abi,supplier_cstatCompanies,supplier_igCompanies,supplier_delete,supplier_abi
0,2010,2497,9899,461.6,33.0,51,18.0,23,at t,at t,,460637358,mastec,mastec,,482985413
1,2011,2497,9899,692.065,33.0,51,18.0,23,at t,at t,,460637358,mastec,mastec,,482985413
2,2012,2497,9899,670.822,33.0,51,18.0,23,at t,at t,,460637358,mastec,mastec,,482985413
3,2013,2497,9899,778.462,33.0,51,18.0,23,at t,at t,,460637358,mastec,mastec,,482985413
4,2014,2497,9899,968.479,33.0,51,18.0,23,at t,at t,,460637358,mastec,mastec,,482985413


In [52]:
# Flag each link with a 1 so that summing per customer-year counts suppliers.
c_links['suppliers'] = 1

# Per (year, customer): total reported sales from suppliers and supplier count.
custExp = (
    c_links
    .groupby(['year', 'customer_gvkey'])[['salecs', 'suppliers']]
    .sum()
    .reset_index()
    .rename(columns={'salecs': 'totalExp'})
)

custExp.head()



Unnamed: 0,year,customer_gvkey,totalExp,suppliers
0,2010,1045,0.0,1
1,2010,1078,84.192,5
2,2010,1161,0.0,1
3,2010,1177,27.269,1
4,2010,1300,6.027,2


In [53]:
print("Number of firms with no exp information and multiple suppliers: ", \
          sum(custExp[custExp.totalExp == 0].suppliers > 1))
print("Number of firms with no exp information and >5 suppliers: ", \
          sum(custExp[custExp.totalExp == 0].suppliers > 5))


Number of firms with no exp information and multiple suppliers:  176
Number of firms with no exp information and >5 suppliers:  2


Most of these firms have expenditure information. We can look at:
    - Expenditure-weighted (just do equal shares if no exp information)
    - Largest supplier

In [54]:
customerDB = c_links[['year','customer_gvkey','supplier_gvkey','salecs']].\
    merge(custExp).rename(columns = {'customer_gvkey': 'gvkey'}).drop_duplicates()
print(customerDB.shape)

customerDB.head()

(19377, 6)


Unnamed: 0,year,gvkey,supplier_gvkey,salecs,totalExp,suppliers
0,2010,9899,2497,461.6,4509.106,37
1,2010,9899,3275,10.806,4509.106,37
2,2010,9899,4115,201.679,4509.106,37
3,2010,9899,4900,,4509.106,37
4,2010,9899,10420,574.805,4509.106,37


## Biggest Supplier
Focus on weather of biggest supplier.

First, for each customer-year, find the supplier with the largest reported sales. Then add back any customer-years that have only one supplier.

In [55]:
# https://stackoverflow.com/questions/15705630/get-the-rows-which-have-the-max-value-in-groups-using-groupby
idx = customerDB.groupby(['year','gvkey']).salecs.\
    transform(max) == customerDB.salecs
largestSuppliers = customerDB[idx].reset_index(drop = True)
print(largestSuppliers.shape)

# find companies who only have one other supplier
singleSuppliers = customerDB[customerDB.suppliers == 1].reset_index(drop = True)
print(singleSuppliers.shape)

# find largest suppliers of different companies
largestSuppliers = largestSuppliers.append(singleSuppliers).drop_duplicates()
print(largestSuppliers.shape)



(4417, 6)
(2811, 6)
(5329, 6)


In [56]:
largestSuppliers.head()

Unnamed: 0,year,gvkey,supplier_gvkey,salecs,totalExp,suppliers
0,2010,9899,111880,865.425,4509.106,37
1,2011,9899,111880,921.541,4097.448,37
2,2012,9899,111880,844.195,3771.511,36
3,2013,9899,111880,936.839,4290.472,34
4,2014,9899,111880,1176.0,5623.386,35


In [58]:
suppliersWithWeather.columns

Index(['year', 'qtr', 'gvkey', 'companyName', 'tic', 'naics', 'curcdq',
       'incomeChange', 'revenueChange', 'revenueChangeAbsolute', 'costChange',
       'inventoryChange', 'netIncome', 'roa', 'famafrench', 'earliestYear',
       'zipcode', 'employeesAtLocation', 'temp_annualquant_0.05',
       'temp_annualquant_tossThisOne', 'temp_annualquant_0.95',
       'temp_annualquant_1.0', 'precip_annualquant_tossThisOne',
       'precip_annualquant_0.95', 'precip_annualquant_1.0',
       'temp_monthlyquant_0.05', 'temp_monthlyquant_tossThisOne',
       'temp_monthlyquant_0.95', 'temp_monthlyquant_1.0',
       'precip_monthlyquant_tossThisOne', 'precip_monthlyquant_0.95',
       'precip_monthlyquant_1.0', 'temp_zipquant_0.05',
       'temp_zipquant_tossThisOne', 'temp_zipquant_0.95', 'temp_zipquant_1.0',
       'precip_zipquant_tossThisOne', 'precip_zipquant_0.95',
       'precip_zipquant_1.0', 'quarterly_avg_precip',
       'quarterly_median_precip', 'quarterly_variance_precip',
       'qu

In [57]:
# NOTE(review): this selection raises KeyError because suppliersWithWeather has
# no 'tmax_quant_1.0' / 'precip_quant_1.0' columns. The available weather
# columns are named like 'temp_annualquant_1.0', 'temp_monthlyquant_1.0',
# 'temp_zipquant_1.0' and the matching 'precip_*quant_1.0'. Decide which
# quantile definition is intended and update the names here and in the cells
# that consume these columns.
suppliers_toMerge = suppliersWithWeather[['year','qtr','gvkey','tmax_quant_1.0','precip_quant_1.0']].\
    rename(columns = {'gvkey': 'supplier_gvkey',
                      'tmax_quant_1.0': 'supplier_tmax_quant_1.0',
                      'precip_quant_1.0': 'supplier_precip_quant_1.0'})

KeyError: "['precip_quant_1.0', 'tmax_quant_1.0'] not in index"

In [None]:
largestSuppliersWithWeather = largestSuppliers.merge(customersWithWeather[['year', 'qtr', 'gvkey', 'companyName', 'naics',
       'revenueChange', 'costChange',
       'inventoryChange', 'zipcode', 'tmax_quant_1.0', 'precip_quant_1.0']]).merge(suppliers_toMerge)

largestSuppliersWithWeather.shape

In [None]:
largestSuppliersWithWeather.head()

In [None]:
largestSuppliersWithWeather.to_csv("../../data/companyData/largestSuppliersWithWeather.csv")

## Sales-Weighted Average
If a company doesn't have sales-specific information, then assume equal shares. This doesn't happen for too many of the companies, thankfully.

In [None]:
# One row per (year, customer, supplier) link, with the customer's total
# reported purchases (totalExp) and supplier count (suppliers) attached.
customerDB = c_links[['year','customer_gvkey','supplier_gvkey','salecs']].\
    merge(custExp).rename(columns = {'customer_gvkey': 'gvkey'}).drop_duplicates()

# Share of the customer's reported purchases that comes from this supplier.
# The result is NaN when salecs is missing or when totalExp is 0 (0/0).
customerDB['salesWeight'] = customerDB.salecs/customerDB.totalExp

# Where no sales share is available, fall back to an equal share across the
# customer's suppliers (1/suppliers). A blanket fillna(1) would instead give
# each such supplier a weight of 100% and write a made-up 1 into salecs.
# NOTE(review): if only some of a customer's suppliers lack salecs, the
# weights for that customer-year sum to more than 1 — confirm whether they
# should be renormalised.
customerDB['salesWeight'] = customerDB.salesWeight.fillna(1/customerDB.suppliers)

Now merge this with the supplier weather data, and use the sales weights to find a sales-weighted average of the weather conditions for the suppliers.

In [None]:
# NOTE(review): suppliersWithWeather, as listed earlier in this notebook, has
# no 'tmax_quant_1.0' / 'precip_quant_1.0' columns, so this selection raises
# KeyError. The available columns are named like 'temp_zipquant_1.0' and
# 'precip_zipquant_1.0' (also annual/monthly variants); pick the intended
# quantile definition and update the names here and downstream.
suppliers_toMerge = suppliersWithWeather[['year','qtr','gvkey','tmax_quant_1.0','precip_quant_1.0']].\
    rename(columns = {'gvkey': 'supplier_gvkey',
                      'tmax_quant_1.0': 'supplier_tmax_quant_1.0',
                      'precip_quant_1.0': 'supplier_precip_quant_1.0'})

In [None]:
# Join each customer-supplier link to the supplier's quarterly weather.
supplierWeather = customerDB.merge(suppliers_toMerge)

# Scale both supplier weather measures by the link's sales weight and keep
# only the keys plus the weighted measures.
supplierWeather = supplierWeather.assign(**{
    'supplier_tmax_quant_1.0':
        supplierWeather['salesWeight'] * supplierWeather['supplier_tmax_quant_1.0'],
    'supplier_precip_quant_1.0':
        supplierWeather['salesWeight'] * supplierWeather['supplier_precip_quant_1.0'],
})[['year', 'qtr', 'gvkey', 'supplier_tmax_quant_1.0', 'supplier_precip_quant_1.0']]

In [None]:
supplierWtdAvgWeather = supplierWeather.groupby(['year','qtr','gvkey']).sum().reset_index()

In [None]:
supplierWtdAvgWeather.shape

Merge the supplier weighted average weather data with the customer data that has weather as well.

In [None]:
wtdAvgSuppliers = customersWithWeather[['year', 'qtr', 'gvkey', 'companyName', 'naics',
       'revenueChange', 'costChange',
       'inventoryChange', 'zipcode', 'tmax_quant_1.0', 'precip_quant_1.0']].merge(supplierWtdAvgWeather)

wtdAvgSuppliers.head()

In [None]:
wtdAvgSuppliers.columns

In [None]:
wtdAvgSuppliers.to_csv("../../data/companyData/wtdAvgSuppliers.csv")