In [35]:
import pickle as pkl
import pandas as pd
import matplotlib
import os
import re

import collections
import datetime
import time

import geopandas as gpd

import numpy as np

import rasterio

import spacy

import gc

import geopy.distance

nlp = spacy.load('en_core_web_lg')
from sklearn.metrics.pairwise import cosine_similarity

import itertools

import pgeocode

from dateutil.relativedelta import relativedelta

# Indirect
Introduce the SC Data, focusing on transactions between US companies.

In [36]:
c_links = pd.read_csv("../../data/companyData/c_linksUS.csv") # pd.read_csv("../../data/companyData/compustatSCLinked.csv")

c_links['year'] = c_links.srcdate.astype('str').str.slice(0,4).astype('int64')

c_links = c_links[c_links.year > 1999][['year','gvkey','cgvkey','salecs']].\
    rename(columns = {'cgvkey': 'customer_gvkey','gvkey': 'supplier_gvkey'})

c_links['year'] = pd.to_datetime(c_links.year, format = '%Y')


c_links.head()

Unnamed: 0,year,supplier_gvkey,customer_gvkey,salecs
0,2002-01-01,1013,2136,111.056
1,2004-01-01,1013,2136,104.312
2,2005-01-01,1013,2136,146.0
3,2006-01-01,1013,2136,205.0
4,2007-01-01,1013,2136,236.0


In [37]:
sum(c_links.salecs.isna())

11228

In [38]:
supplierCombos = c_links[['supplier_gvkey', 'customer_gvkey']].drop_duplicates().reset_index(drop = True)

print(supplierCombos.shape)

supplierCombos.head()

(11827, 2)


Unnamed: 0,supplier_gvkey,customer_gvkey
0,1013,2136
1,1013,9899
2,1021,61494
3,1021,25880
4,1048,11552


We'll follow Barrot Sauvagnat in assuming that a supplier relationship holds for every year between the first and last year in which a customer is reported. This is going to take a little bit of work. We'll try it like this: 
- subset dataframe to a specific supplier-customer pair
- fill in data for every year that's missing

Then, apply this row-wise to all rows of the unique supplierCombos df above using: https://stackoverflow.com/questions/61942138/apply-function-row-wise-to-pandas-dataframe

In [39]:
def fillYear(supplier, customer, scData = c_links):
    c_linksTemp = scData[(scData.supplier_gvkey == supplier) & \
                      (scData.customer_gvkey == customer)].reset_index(drop = True)
    
    # if there are na values and non-na values for the same supplier-cust combination, then 
    # select for only the non-na values, by (1) replacing na with negative, (2) 1
    c_linksTemp['salecs'] = c_linksTemp['salecs'].fillna(-5)
    c_linksTemp = c_linksTemp.loc[c_linksTemp.reset_index().groupby(['year','supplier_gvkey',
                                                                     'customer_gvkey'])['salecs'].idxmax()]
    
    
    # now: find the start and end of the data series
    first = c_linksTemp.year.min() - relativedelta(years = 3)
    last  = c_linksTemp.year.max() + relativedelta(years = 3)
    
    
    c_linksTemp['firstYear'] = c_linksTemp.year.min()
    c_linksTemp['lastYear']  = c_linksTemp.year.max()
    
    
    c_linksTemp = c_linksTemp.set_index('year') 
    
    
    c_linksTemp = c_linksTemp.reindex(pd.date_range(first, last, freq = 'YS')).\
        reset_index().rename(columns = {'index': 'year'})
    
    
    # and impute all values within the series
    c_linksTemp = c_linksTemp.groupby(c_linksTemp.year.dt.time).ffill().bfill()
    
    
    return(c_linksTemp)

Show that this works for one of the supplier rows.

In [40]:
fillYear(supplierCombos.supplier_gvkey[0], supplierCombos.customer_gvkey[0])

Unnamed: 0,year,supplier_gvkey,customer_gvkey,salecs,firstYear,lastYear
0,1999-01-01,1013.0,2136.0,111.056,2002-01-01,2010-01-01
1,2000-01-01,1013.0,2136.0,111.056,2002-01-01,2010-01-01
2,2001-01-01,1013.0,2136.0,111.056,2002-01-01,2010-01-01
3,2002-01-01,1013.0,2136.0,111.056,2002-01-01,2010-01-01
4,2003-01-01,1013.0,2136.0,111.056,2002-01-01,2010-01-01
5,2004-01-01,1013.0,2136.0,104.312,2002-01-01,2010-01-01
6,2005-01-01,1013.0,2136.0,146.0,2002-01-01,2010-01-01
7,2006-01-01,1013.0,2136.0,205.0,2002-01-01,2010-01-01
8,2007-01-01,1013.0,2136.0,236.0,2002-01-01,2010-01-01
9,2008-01-01,1013.0,2136.0,240.0,2002-01-01,2010-01-01


Now do it for all rows.

In [41]:
start = time.time()
print(c_links.shape)
c_linksImpd_list = supplierCombos.apply(lambda row: fillYear(row['supplier_gvkey'], row['customer_gvkey']), axis = 1)
c_linksImpd_df   = pd.concat(list(c_linksImpd_list))
print(c_linksImpd_df.shape)
print(time.time() - start)

c_linksImpd_df['year'] = c_linksImpd_df.year.dt.year

(47427, 4)
(120797, 6)
64.36585712432861


We had converted some of the na sales values to -5 so that we could deal with duplicated values, by choosing the larger of said values. Switch back to nan so that we are not thrown off when we look for biggest supplier.

In [42]:
c_linksImpd_df.loc[c_linksImpd_df.salecs == -5, 'salecs'] = float('nan')

Partially subset to focus on suppliers in our data.

In [54]:
suppliersOnly = c_linksImpd_df[['year', 'supplier_gvkey']].drop_duplicates()
suppliersOnly['supplierYear'] = suppliersOnly.supplier_gvkey.astype('int64').astype('str') + \
    '_' + suppliersOnly.year.astype('str')


suppliersOnly.to_csv("../../data/companyData/suppliers.csv")


suppliersOnly.head()

Unnamed: 0,year,supplier_gvkey,supplierYear
0,1999,1013.0,1013_1999
1,2000,1013.0,1013_2000
2,2001,1013.0,1013_2001
3,2002,1013.0,1013_2002
4,2003,1013.0,1013_2003


In [44]:
suppliersOnly.head()

Unnamed: 0,year,supplier_gvkey
0,1999,1013.0
1,2000,1013.0
2,2001,1013.0
3,2002,1013.0
4,2003,1013.0


In [20]:
customersOnly = c_linksImpd_df[['year', 'customer_gvkey']].drop_duplicates()
customersOnly.shape

(29148, 2)

In [21]:
industries = pd.read_csv("../../data/companyData/gvkeyIndustries.csv").drop(columns = {'Unnamed: 0'})
print(industries)

       gvkey            indGroup
0       1010                manu
1       1043           wholesale
2       1045  transportUtilities
3       1062             finance
4       1075  transportUtilities
...      ...                 ...
31652  37600             finance
31653  37618             finance
31654  37620             finance
31655  37621             finance
31656  37625             finance

[31657 rows x 2 columns]


In [22]:
c_links = c_linksImpd_df.copy()

print(c_links.shape)

print(c_links.head())

industries.columns = ['customer_gvkey','customer_ind']


c_links = c_links.merge(industries)
industries.columns = ['supplier_gvkey','supplier_ind']

c_links = c_links.merge(industries)
print(c_links.head(), c_links.shape)


c_links.to_csv("../../data/companyData/c_links.csv")


(120797, 6)
   year  supplier_gvkey  customer_gvkey   salecs  firstYear   lastYear
0  1999          1013.0          2136.0  111.056 2002-01-01 2010-01-01
1  2000          1013.0          2136.0  111.056 2002-01-01 2010-01-01
2  2001          1013.0          2136.0  111.056 2002-01-01 2010-01-01
3  2002          1013.0          2136.0  111.056 2002-01-01 2010-01-01
4  2003          1013.0          2136.0  111.056 2002-01-01 2010-01-01
   year  supplier_gvkey  customer_gvkey   salecs  firstYear   lastYear  \
0  1999          1013.0          2136.0  111.056 2002-01-01 2010-01-01   
1  2000          1013.0          2136.0  111.056 2002-01-01 2010-01-01   
2  2001          1013.0          2136.0  111.056 2002-01-01 2010-01-01   
3  2002          1013.0          2136.0  111.056 2002-01-01 2010-01-01   
4  2003          1013.0          2136.0  111.056 2002-01-01 2010-01-01   

         customer_ind supplier_ind  
0  transportUtilities         manu  
1  transportUtilities         manu  
2  tra

Let's see how all this translates into different industries. Check how many times different industries show up.

The full count will be roughly 4x whatever is below, assuming we can get a match for roughly all of them (which we should be able to).

In [23]:
c_links[c_links.year.astype(int) > 1999].supplier_ind.value_counts()

manu                  50104
finance               22950
services              17259
mining                 9457
transportUtilities     6402
wholesale              2884
construction           1097
retail                  628
agForFish               194
Name: supplier_ind, dtype: int64

Now see if it's common to have one in and one out of the industries of interest. 

For now, let's keep all the different industry types.

We can always filter later if we need to.


********this is where it would be helpful to check on the abi bits for a given industry, to see if we can get additional matches as well***********

In [24]:
#########################
# get data and reset columns 
gvKey_abiLinkingTable = pd.read_csv('../../data/companyData/sc_linkingTable.csv').drop(columns = ['Unnamed: 0']).\
    drop_duplicates()



base_columns = gvKey_abiLinkingTable.columns 
customer_columns = "customer_" + base_columns
supplier_columns = "supplier_" + base_columns



# merge in customer information
gvKey_abiLinkingTable.columns = customer_columns



print(c_links.shape)
c_linksMerge1 = c_links.merge(gvKey_abiLinkingTable, on ='customer_gvkey')
print(c_links.shape,c_linksMerge1.shape, c_linksMerge1.head())



#########################
# and merge in supplier 
gvKey_abiLinkingTable.columns = supplier_columns

print(c_links.shape)
c_linksMerge2 = c_linksMerge1.merge(gvKey_abiLinkingTable, on ='supplier_gvkey').drop_duplicates()
print(c_linksMerge2.shape)

c_linksMerge2.to_csv("../../data/companyData/clinks_IG_selected.csv")


c_linksMerge2['year'] = pd.to_datetime(c_linksMerge2.year, format = '%Y')


##########################
# get all the company abi
allAbi = c_linksMerge2.customer_abi.append(c_linksMerge2.supplier_abi).drop_duplicates()
allAbi.shape

(120797, 8)
(120797, 8) (112962, 11)    year  supplier_gvkey  customer_gvkey   salecs  firstYear   lastYear  \
0  1999          1013.0          2136.0  111.056 2002-01-01 2010-01-01   
1  2000          1013.0          2136.0  111.056 2002-01-01 2010-01-01   
2  2001          1013.0          2136.0  111.056 2002-01-01 2010-01-01   
3  2002          1013.0          2136.0  111.056 2002-01-01 2010-01-01   
4  2003          1013.0          2136.0  111.056 2002-01-01 2010-01-01   

         customer_ind supplier_ind customer_cstatCompanies  \
0  transportUtilities         manu     verizonmmunications   
1  transportUtilities         manu     verizonmmunications   
2  transportUtilities         manu     verizonmmunications   
3  transportUtilities         manu     verizonmmunications   
4  transportUtilities         manu     verizonmmunications   

  customer_igCompanies  customer_abi  
0  verizonmmunications       7564776  
1  verizonmmunications       7564776  
2  verizonmmunications      

(4082,)

## Relevant Industries

In [25]:
'''c_linksMerge2 = c_linksMerge2[(c_linksMerge2.customer_ind != 'finance') & (c_linksMerge2.customer_ind != 'services') & \
               (c_linksMerge2.supplier_ind != 'finance') & (c_linksMerge2.supplier_ind != 'services')]

c_linksMerge2.shape'''

"c_linksMerge2 = c_linksMerge2[(c_linksMerge2.customer_ind != 'finance') & (c_linksMerge2.customer_ind != 'services') &                (c_linksMerge2.supplier_ind != 'finance') & (c_linksMerge2.supplier_ind != 'services')]\n\nc_linksMerge2.shape"

First, make a sample with the companies on one year of either side of when it reports another customer.

In [27]:
########
hqsOnly = pd.read_csv("../../data/ig_uniqueHQs.csv").drop(columns = {'Unnamed: 0'})

hq = pd.read_csv("../../data/ig_uniqueHQs_multLocations.csv").\
    drop(columns = {'Unnamed: 0'}).\
    rename(columns = {'archive_version_year': 'year'})

hq['year']    = pd.to_datetime(hq.year, format = '%Y')   # hq.year.astype('int64')
hq['zipcode'] = hq.zipcode.astype('str').str.zfill(5)
hqRelevant    = hq[hq.abi.isin(allAbi)].reset_index()



I think many of these don't have a lat-long but do have a zipcode. We should be able to use pgeocode to get an approximate lat-long from the zipcode.

In [28]:
us = pgeocode.Nominatim('us')

for i in range(0, hq.shape[0]):
    if pd.isna(hq.latitude[i]):
        hq.latitude[i]  = us.query_postal_code(hq.zipcode[i]).latitude
        hq.longitude[i] = us.query_postal_code(hq.zipcode[i]).longitude

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [29]:
hq[hq['longitude'].isna()].abi.unique()

array([  1864149,   5136239,   6484067,   7512304,   7536717, 107286601,
       107358178, 120553524, 306744129, 406330118, 406330126, 433385663,
       441433422, 450510896, 477538250, 902151877, 936186998, 967349333,
         7001068, 510200629, 902180348,   4552857,   6918395, 602875296,
       435965967, 657440202, 670508548, 671873610, 207813429, 252925805])

In [30]:
supplierHQ = hq[['year','abi','latitude','longitude','zipcode']].copy().rename(columns = {'abi':      'supplier_abi',
                                                                        'latitude':  'supplier_latitude',
                                                                        'longitude': 'supplier_longitude',
                                                                        'zipcode': 'supplier_zipcode'})


customerHQ = hq[['year','abi','latitude','longitude','zipcode']].copy().rename(columns = {'abi':      'customer_abi',
                                                                        'latitude':  'customer_latitude',
                                                                        'longitude': 'customer_longitude',
                                                                        'zipcode': 'customer_zipcode'})

In [31]:
print(sum(allAbi.isin(supplierHQ.supplier_abi)) ,
     sum(allAbi.isin(customerHQ.customer_abi)))



4059 4059


Merge to the closest year in either direction

In [32]:
merged_dataframe = pd.merge_asof(c_linksMerge2.sort_values('year'), customerHQ.sort_values('year'), on="year",
                                 by="customer_abi", 
                                direction = "nearest")

print(merged_dataframe.shape)
merged_dataframe.head()

(99854, 17)


Unnamed: 0,year,supplier_gvkey,customer_gvkey,salecs,firstYear,lastYear,customer_ind,supplier_ind,customer_cstatCompanies,customer_igCompanies,customer_abi,supplier_cstatCompanies,supplier_igCompanies,supplier_abi,customer_latitude,customer_longitude,customer_zipcode
0,1997-01-01,27786.0,11056.0,54.9,2000-01-01,2001-01-01,transportUtilities,mining,aquila,aquila,7693658,chesapeake energy,chesapeake energy,463222893,39.1051,-94.5916,64199
1,1997-01-01,138005.0,1164.0,24.271,2000-01-01,2000-01-01,transportUtilities,services,mci,mci,712125533,lexent,lexent,544364045,32.30014,-90.18116,39201
2,1997-01-01,63766.0,1045.0,597.201,2000-01-01,2003-01-01,transportUtilities,services,american airlines,american airlines,7501711,sabre,sabre,416062792,32.8827,-97.03714,75261
3,1997-01-01,138005.0,13440.0,75.182,2000-01-01,2002-01-01,transportUtilities,services,level 3mmunications,level 3mmunications,463236547,lexent,lexent,544364045,39.9206,-105.14427,80021
4,1997-01-01,63766.0,10946.0,311.583,2000-01-01,2001-01-01,transportUtilities,services,us airways,us airways,441439718,sabre,sabre,416062792,38.85292,-77.04895,22202


In [33]:
c_linksMerge3 = pd.merge_asof(merged_dataframe.sort_values('year'), supplierHQ.sort_values('year'), on="year",
                                 by="supplier_abi", 
                                direction = "nearest")

# c_linksMerge2.merge(supplierHQ).merge(customerHQ)


print(c_linksMerge3.shape)


c_linksMerge3 = c_linksMerge3[(~c_linksMerge3.supplier_latitude.isna()) & (~c_linksMerge3.customer_latitude.isna())]

print(c_linksMerge3.shape)

c_linksMerge3 = c_linksMerge3.reset_index(drop = True)

(99854, 20)
(98787, 20)


In [34]:
c_linksMerge3.salecs.sum()/c_linksMerge2.salecs.sum()

0.9889415923171958

Now, find the distances between suppliers.

In [35]:
c_linksMerge3['dist'] = 0

for i in range(0,c_linksMerge3.shape[0]):
    c_linksMerge3.dist[i] = geopy.distance.geodesic((c_linksMerge3.customer_latitude[i],c_linksMerge3.customer_longitude[i]), \
                                                             (c_linksMerge3.supplier_latitude[i],c_linksMerge3.supplier_longitude[i])).km

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [36]:
c_linksMerge3.dist.hist(bins = 100)

<matplotlib.axes._subplots.AxesSubplot at 0x7fbbf5bb2550>

In [37]:
c_linksMerge3.to_csv("../../data/companyData/supplierCustomerHQs.csv")

In [38]:
c_linksMerge3 = pd.read_csv("../../data/companyData/supplierCustomerHQs.csv")


Try to figure out which companies have no sales figures. Are they the same companies through and through, or are some different ones in play?

This means that most of the firms with an na transaction have another transaction. It could be either that year or another year. If a company has one transaction for a year and it's na, then turn it into a small amount so that we keep it. 

In [39]:
c_linksMerge3['suppliers'] = 1


suppliers = c_linksMerge3[['year', 'customer_gvkey', 'suppliers']].groupby(['year','customer_gvkey']).sum().\
    reset_index().rename(columns = {'suppliers': 'totalSuppliers'})

meanExp   = c_linksMerge3[['year', 'customer_gvkey', 'salecs']].groupby(['year','customer_gvkey'],dropna=False).\
    mean().reset_index().rename(columns = {'salecs': 'meanSales'})

c_linksMerge4 = c_linksMerge3.merge(suppliers).drop(columns = {'suppliers'}).merge(meanExp)



print(c_linksMerge3.shape,c_linksMerge4.shape)


c_linksMerge4.head()

(98787, 23) (98787, 24)


Unnamed: 0.1,Unnamed: 0,year,supplier_gvkey,customer_gvkey,salecs,firstYear,lastYear,customer_ind,supplier_ind,customer_cstatCompanies,...,supplier_abi,customer_latitude,customer_longitude,customer_zipcode,supplier_latitude,supplier_longitude,supplier_zipcode,dist,totalSuppliers,meanSales
0,0,1997-01-01,27786.0,11056.0,54.9,2000-01-01,2001-01-01,transportUtilities,mining,aquila,...,463222893,39.1051,-94.5916,64199,35.52366,-97.52563,73154,474,2,30.249
1,394,1997-01-01,2204.0,11056.0,5.598,2000-01-01,2002-01-01,transportUtilities,mining,aquila,...,443390901,39.1051,-94.5916,64199,32.84653,-96.77077,75206,721,2,30.249
2,1,1997-01-01,7116.0,11259.0,,2000-01-01,2019-01-01,retail,manu,walmart,...,441359064,36.36072,-94.22725,72712,33.9203,-118.39185,90245,2212,82,212.806338
3,5,1997-01-01,6431.0,11259.0,32.043,2000-01-01,2003-01-01,retail,manu,walmart,...,5035605,36.36072,-94.22725,72712,42.12987,-71.05769,2322,2092,82,212.806338
4,9,1997-01-01,5518.0,11259.0,530.21,2000-01-01,2019-01-01,retail,manu,walmart,...,435133038,36.36072,-94.22725,72712,41.88076,-71.38554,2862,2060,82,212.806338


In [40]:
withNAs = c_linksMerge3[c_linksMerge3.salecs.isna()].customer_gvkey.drop_duplicates()

print("Find the fraction of firms who have an na transaction value one year, but another non-na", 
      withNAs.isin(c_linksMerge4[~c_linksMerge4.salecs.isna()].customer_gvkey.drop_duplicates()).sum()/withNAs.shape)

Find the fraction of firms who have an na transaction value one year, but another non-na [0.75112613]


For each row: if salecs is na, replace with mean sales. If mean sales also na, replace with 1.

In [41]:
for i in range(0,c_linksMerge4.shape[0]):
    
    # if mean sales is na, replace with 1
    if pd.isna(c_linksMerge4.meanSales[i]):
        c_linksMerge4.salecs[i] = 1
    
    # if salecs is na, replace with mean sales
    elif pd.isna(c_linksMerge4.salecs[i]):
        c_linksMerge4.salecs[i] = c_linksMerge4.meanSales[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [42]:
print(c_linksMerge3[['customer_gvkey', 'year']].drop_duplicates().shape,
     c_linksMerge4[['customer_gvkey', 'year']].drop_duplicates().shape)

(23250, 2) (23250, 2)


In [43]:
# c_linksMerge4 = c_linksMerge4[(c_linksMerge4.dist >= 500)]
c_linksMerge4.drop(columns = {'Unnamed: 0'}, inplace = True)
c_linksMerge4.shape

(98787, 23)

In [44]:
c_linksMerge4.head()

Unnamed: 0,year,supplier_gvkey,customer_gvkey,salecs,firstYear,lastYear,customer_ind,supplier_ind,customer_cstatCompanies,customer_igCompanies,...,supplier_abi,customer_latitude,customer_longitude,customer_zipcode,supplier_latitude,supplier_longitude,supplier_zipcode,dist,totalSuppliers,meanSales
0,1997-01-01,27786.0,11056.0,54.9,2000-01-01,2001-01-01,transportUtilities,mining,aquila,aquila,...,463222893,39.1051,-94.5916,64199,35.52366,-97.52563,73154,474,2,30.249
1,1997-01-01,2204.0,11056.0,5.598,2000-01-01,2002-01-01,transportUtilities,mining,aquila,aquila,...,443390901,39.1051,-94.5916,64199,32.84653,-96.77077,75206,721,2,30.249
2,1997-01-01,7116.0,11259.0,212.806338,2000-01-01,2019-01-01,retail,manu,walmart,walmart,...,441359064,36.36072,-94.22725,72712,33.9203,-118.39185,90245,2212,82,212.806338
3,1997-01-01,6431.0,11259.0,32.043,2000-01-01,2003-01-01,retail,manu,walmart,walmart,...,5035605,36.36072,-94.22725,72712,42.12987,-71.05769,2322,2092,82,212.806338
4,1997-01-01,5518.0,11259.0,530.21,2000-01-01,2019-01-01,retail,manu,walmart,walmart,...,435133038,36.36072,-94.22725,72712,41.88076,-71.38554,2862,2060,82,212.806338


## Merge in supplier weather
Get the weather data.

In [45]:
g = pd.read_csv("../../data/companyData/weatherByEstablishment.csv").\
    drop(columns = {"Unnamed: 0"})


allWeather_withLags = pd.read_csv("../../data/companyData/allWeather_withLags_allZips.csv").\
    drop(columns = {"Unnamed: 0", 'yearQtr'}).astype({'year':       'category',
                           'qtr':        'category',
                           'zipcode':    'category'})


averages = pd.read_csv("../../data/companyData/quarterlyStatsByZip.csv").\
    drop(columns = {"Unnamed: 0"}).rename(columns = {'ZIP': 'zipcode'})
averages['qtr'] = averages.quarter.str.slice(1,2).astype('float')
averages.drop(columns = {'quarter'}, inplace = True) 
averages = averages.astype({'qtr':        'category',
                           'zipcode':    'category'})


allWeather_withLags2 = pd.read_csv("../../data/companyData/allWeather_withLags_new.csv").\
    drop(columns = {"Unnamed: 0", 'yearQtr'}).astype({'year':       'category',
                           'qtr':        'category',
                           'zipcode':    'category'})


'''thunderstorms_withLags = pd.read_csv("../../data/companyData/thunderstorms_withLags.csv").\
    drop(columns = {"Unnamed: 0", 'yearQtr'}).astype({'year':       'category',
                           'qtr':        'category',
                           'zipcode':    'category'})'''

allWeather = allWeather_withLags.merge(averages).merge(allWeather_withLags2) 
#allWeather['zipcode'] = allWeather.zipcode.astype('str').str.zfill(5).astype({'zipcode': 'category'})

# g will merge in when we have the gvkey

In [46]:
print(g.shape, allWeather_withLags.shape, averages.shape, allWeather_withLags2.shape)

(317420, 223) (2481856, 93) (17824, 8) (2481856, 53)


Make the weather for suppliers.

In [47]:
supplierIdentifiers = pd.read_csv("../../data/companyData/igData.csv").drop(columns = {'Unnamed: 0'})[['gvkey',
                        'zipcode','year','qtr']]

print(supplierIdentifiers.merge(allWeather).merge(g).shape)


supplierIdentifiers.merge(allWeather).merge(g).head()

(204378, 370)


Unnamed: 0,gvkey,zipcode,year,qtr,precip_annual_50,precip_annual_95,precip_annual_99,precip_zip_50,precip_zip_95,precip_zip_99,...,empWt_lag2_temp_zipQuarter95,empWt_lag3_temp_zipQuarter95,empWt_precip_zipQuarter95,empWt_lag1_precip_zipQuarter95,empWt_lag2_precip_zipQuarter95,empWt_lag3_precip_zipQuarter95,empWt_days90Plus,empWt_lag1_days90Plus,empWt_lag2_days90Plus,empWt_lag3_days90Plus
0,8515,70508,2003,1,12.0,4.0,1.0,26.0,5.0,2.0,...,0.0,0.0,0.0,0.27439,0.27439,0.0,0.0,7.981707,73.262195,51.341463
1,25874,70508,2003,1,12.0,4.0,1.0,26.0,5.0,2.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,5.0,62.0,29.0
2,28564,70508,2003,1,12.0,4.0,1.0,26.0,5.0,2.0,...,0.25,0.25,0.25,0.75,0.178571,0.0,0.0,0.714286,51.821429,17.642857
3,8515,70508,2003,2,21.0,5.0,2.0,33.0,7.0,3.0,...,0.0,0.0,0.0,0.0,0.27439,0.27439,40.182927,0.0,7.981707,73.262195
4,25874,70508,2003,2,21.0,5.0,2.0,33.0,7.0,3.0,...,0.0,0.0,0.0,0.0,1.0,1.0,38.0,0.0,5.0,62.0


In [48]:
'''allCustomerWeather = allWeather_withLags.merge(averages).merge(allWeather_withLags2).merge(g)


for colname in allCustomerWeather.columns[3:]:
    allCustomerWeather.rename(columns = {colname: 'customer_' + colname}, inplace = True)

allCustomerWeather.rename(columns = {'zipcode': 'customer_zipcode'}, inplace = True)
allCustomerWeather['customer_zipcode']     = allCustomerWeather.customer_zipcode.astype('str').str.zfill(5)

print(allCustomerWeather.shape)

allCustomerWeather.head()'''

"allCustomerWeather = allWeather_withLags.merge(averages).merge(allWeather_withLags2).merge(g)\n\n\nfor colname in allCustomerWeather.columns[3:]:\n    allCustomerWeather.rename(columns = {colname: 'customer_' + colname}, inplace = True)\n\nallCustomerWeather.rename(columns = {'zipcode': 'customer_zipcode'}, inplace = True)\nallCustomerWeather['customer_zipcode']     = allCustomerWeather.customer_zipcode.astype('str').str.zfill(5)\n\nprint(allCustomerWeather.shape)\n\nallCustomerWeather.head()"

In [49]:
allSupplierWeather = supplierIdentifiers.merge(allWeather).merge(g)

for colname in allSupplierWeather.columns[4:]:
    allSupplierWeather.rename(columns = {colname: 'supplier_' + colname}, inplace = True)

allSupplierWeather.rename(columns = {'zipcode': 'supplier_zipcode',
                                 'gvkey': 'supplier_gvkey'}, inplace = True)
# allSupplierWeather['supplier_zipcode']     = allSupplierWeather.supplier_zipcode.astype('str').str.zfill(5)

allSupplierWeather.head()

Unnamed: 0,supplier_gvkey,supplier_zipcode,year,qtr,supplier_precip_annual_50,supplier_precip_annual_95,supplier_precip_annual_99,supplier_precip_zip_50,supplier_precip_zip_95,supplier_precip_zip_99,...,supplier_empWt_lag2_temp_zipQuarter95,supplier_empWt_lag3_temp_zipQuarter95,supplier_empWt_precip_zipQuarter95,supplier_empWt_lag1_precip_zipQuarter95,supplier_empWt_lag2_precip_zipQuarter95,supplier_empWt_lag3_precip_zipQuarter95,supplier_empWt_days90Plus,supplier_empWt_lag1_days90Plus,supplier_empWt_lag2_days90Plus,supplier_empWt_lag3_days90Plus
0,8515,70508,2003,1,12.0,4.0,1.0,26.0,5.0,2.0,...,0.0,0.0,0.0,0.27439,0.27439,0.0,0.0,7.981707,73.262195,51.341463
1,25874,70508,2003,1,12.0,4.0,1.0,26.0,5.0,2.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,5.0,62.0,29.0
2,28564,70508,2003,1,12.0,4.0,1.0,26.0,5.0,2.0,...,0.25,0.25,0.25,0.75,0.178571,0.0,0.0,0.714286,51.821429,17.642857
3,8515,70508,2003,2,21.0,5.0,2.0,33.0,7.0,3.0,...,0.0,0.0,0.0,0.0,0.27439,0.27439,40.182927,0.0,7.981707,73.262195
4,25874,70508,2003,2,21.0,5.0,2.0,33.0,7.0,3.0,...,0.0,0.0,0.0,0.0,1.0,1.0,38.0,0.0,5.0,62.0


In [50]:
compustat = pd.read_csv("../../data/companyData/igData.csv").\
    drop(columns = ['Unnamed: 0', 'datadate','curcdq','costat','add1','addzip','city','state',
                   'fyearq','assetsLagged','datacqtr','datafqtr','fyr','DATE','address_line_1',
                   'delete','latitude','longitude','zipcode','abi','ticker','company',
                    'cstatCompanies','igCompanies']).\
    rename(columns = {'gvkey': 'customer_gvkey'})


print(compustat.columns,compustat.shape)

Index(['customer_gvkey', 'year', 'qtr', 'companyName', 'assets', 'cash',
       'costGoodsSold', 'totalInv', 'netIncome', 'opInc_afDep', 'opInc_befDep',
       'totalRevenue', 'priceClose', 'assetsLast', 'netIncomeLast',
       'totalRevenueLast', 'costGoodsSoldLast', 'totalInvLast',
       'opInc_afDepLast', 'opInc_befDepLast', 'priceCloseLast', 'cashLast',
       'netIncomeLagged', 'roa_lagged', 'sic2', 'indGroup', 'earliestYear',
       'ageTercile', 'sizeTercile', 'profitTercile'],
      dtype='object') (271281, 30)


In [51]:
c_linksMerge4         = c_linksMerge4[c_linksMerge4.supplier_zipcode.isin(allWeather.zipcode) & 
                             c_linksMerge4.customer_zipcode.isin(allWeather.zipcode)]

c_linksMerge4['year'] = c_linksMerge4.year.astype('str').str.slice(0,4).astype('int64')

print(c_linksMerge4.shape)

(81516, 23)


In [52]:
compustat.year.min()

1999

In [53]:
compustat_withLinks = c_linksMerge4.merge(compustat)
compustat_withLinks.shape

(179768, 51)

In [54]:
compustat_withLinks.columns

Index(['year', 'supplier_gvkey', 'customer_gvkey', 'salecs', 'firstYear',
       'lastYear', 'customer_ind', 'supplier_ind', 'customer_cstatCompanies',
       'customer_igCompanies', 'customer_abi', 'supplier_cstatCompanies',
       'supplier_igCompanies', 'supplier_abi', 'customer_latitude',
       'customer_longitude', 'customer_zipcode', 'supplier_latitude',
       'supplier_longitude', 'supplier_zipcode', 'dist', 'totalSuppliers',
       'meanSales', 'qtr', 'companyName', 'assets', 'cash', 'costGoodsSold',
       'totalInv', 'netIncome', 'opInc_afDep', 'opInc_befDep', 'totalRevenue',
       'priceClose', 'assetsLast', 'netIncomeLast', 'totalRevenueLast',
       'costGoodsSoldLast', 'totalInvLast', 'opInc_afDepLast',
       'opInc_befDepLast', 'priceCloseLast', 'cashLast', 'netIncomeLagged',
       'roa_lagged', 'sic2', 'indGroup', 'earliestYear', 'ageTercile',
       'sizeTercile', 'profitTercile'],
      dtype='object')

In [55]:
compustat_withLinks.to_csv("../../data/companyData/allSupplierCustomerData.csv")

# Aggregating SC Information
How do we go from a bunch of information on a bunch of suppliers to a more general measure. Candidates:
    - max over suppliers
    - average over suppliers
    - sales-weighted average over supplier
    - random choice 
    
Start with the max.    

In [56]:
allSupplierWeather.head()

Unnamed: 0,supplier_gvkey,supplier_zipcode,year,qtr,supplier_precip_annual_50,supplier_precip_annual_95,supplier_precip_annual_99,supplier_precip_zip_50,supplier_precip_zip_95,supplier_precip_zip_99,...,supplier_empWt_lag2_temp_zipQuarter95,supplier_empWt_lag3_temp_zipQuarter95,supplier_empWt_precip_zipQuarter95,supplier_empWt_lag1_precip_zipQuarter95,supplier_empWt_lag2_precip_zipQuarter95,supplier_empWt_lag3_precip_zipQuarter95,supplier_empWt_days90Plus,supplier_empWt_lag1_days90Plus,supplier_empWt_lag2_days90Plus,supplier_empWt_lag3_days90Plus
0,8515,70508,2003,1,12.0,4.0,1.0,26.0,5.0,2.0,...,0.0,0.0,0.0,0.27439,0.27439,0.0,0.0,7.981707,73.262195,51.341463
1,25874,70508,2003,1,12.0,4.0,1.0,26.0,5.0,2.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,5.0,62.0,29.0
2,28564,70508,2003,1,12.0,4.0,1.0,26.0,5.0,2.0,...,0.25,0.25,0.25,0.75,0.178571,0.0,0.0,0.714286,51.821429,17.642857
3,8515,70508,2003,2,21.0,5.0,2.0,33.0,7.0,3.0,...,0.0,0.0,0.0,0.0,0.27439,0.27439,40.182927,0.0,7.981707,73.262195
4,25874,70508,2003,2,21.0,5.0,2.0,33.0,7.0,3.0,...,0.0,0.0,0.0,0.0,1.0,1.0,38.0,0.0,5.0,62.0


In [57]:
customerDB = c_linksMerge4[['year', 'customer_gvkey', 'supplier_gvkey', 'salecs', 'supplier_zipcode']].drop_duplicates()

supplierWeather = customerDB.merge(allSupplierWeather)

print(supplierWeather.shape)

supplierWeather.head()

(144912, 372)


Unnamed: 0,year,customer_gvkey,supplier_gvkey,salecs,supplier_zipcode,qtr,supplier_precip_annual_50,supplier_precip_annual_95,supplier_precip_annual_99,supplier_precip_zip_50,...,supplier_empWt_lag2_temp_zipQuarter95,supplier_empWt_lag3_temp_zipQuarter95,supplier_empWt_precip_zipQuarter95,supplier_empWt_lag1_precip_zipQuarter95,supplier_empWt_lag2_precip_zipQuarter95,supplier_empWt_lag3_precip_zipQuarter95,supplier_empWt_days90Plus,supplier_empWt_lag1_days90Plus,supplier_empWt_lag2_days90Plus,supplier_empWt_lag3_days90Plus
0,2001,2184.0,14150.0,86.840529,91311,1,17.0,7.0,1.0,34.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,52.0,17.0
1,2001,2184.0,14150.0,86.840529,91311,2,3.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,0.0,52.0
2,2001,2184.0,14150.0,86.840529,91311,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,49.0,20.0,0.0,0.0
3,2001,2184.0,14150.0,86.840529,91311,4,11.0,0.0,0.0,19.0,...,0.0,0.0,0.0,0.0,0.0,0.0,7.0,49.0,20.0,0.0
4,2001,2184.0,4128.0,15.566,60707,1,8.0,0.0,0.0,24.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.678571,56.678571,5.357143


### Find the Worst Weather

In [58]:
supplierWorstWeather = supplierWeather.drop(columns = {'salecs', 'supplier_gvkey','supplier_zipcode'})
supplierWorstWeather = supplierWorstWeather.groupby(['year','qtr','customer_gvkey']).max().reset_index().drop_duplicates()

supplierWorstWeather.columns = supplierWorstWeather.columns[0:3].append('worst_' + supplierWorstWeather.columns[3:])


supplierWorstWeather.to_csv("../../data/companyData/supplierWorstWeather.csv")
supplierWorstWeather.shape

(47799, 369)

In [59]:
supplierWorstWeather.head()

Unnamed: 0,year,qtr,customer_gvkey,worst_supplier_precip_annual_50,worst_supplier_precip_annual_95,worst_supplier_precip_annual_99,worst_supplier_precip_zip_50,worst_supplier_precip_zip_95,worst_supplier_precip_zip_99,worst_supplier_precip_zipQuarter_50,...,worst_supplier_empWt_lag2_temp_zipQuarter95,worst_supplier_empWt_lag3_temp_zipQuarter95,worst_supplier_empWt_precip_zipQuarter95,worst_supplier_empWt_lag1_precip_zipQuarter95,worst_supplier_empWt_lag2_precip_zipQuarter95,worst_supplier_empWt_lag3_precip_zipQuarter95,worst_supplier_empWt_days90Plus,worst_supplier_empWt_lag1_days90Plus,worst_supplier_empWt_lag2_days90Plus,worst_supplier_empWt_lag3_days90Plus
0,2001,1,1038.0,10.0,3.0,0.0,25.0,6.0,1.0,25.0,...,0.431818,0.0,0.157343,0.0,0.0,0.0,0.0,1.879371,37.354895,14.536713
1,2001,1,1075.0,15.0,2.0,0.0,32.0,4.0,1.0,32.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,8.0
2,2001,1,1078.0,20.0,4.0,0.0,37.0,9.0,2.0,37.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,49.0,16.0
3,2001,1,1164.0,15.0,5.0,3.0,33.0,8.0,5.0,33.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.44,18.88
4,2001,1,1279.0,14.0,0.0,0.0,50.0,1.0,0.0,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Now the sales-weighted average.

In [62]:
####
# find total expenditures, using the annual data so we don't have double counting
custExp    = c_linksMerge4[['year','customer_gvkey','salecs']].groupby(['year','customer_gvkey']).\
    sum().reset_index().rename(columns = {'salecs': 'totalExp'})

customerDB = c_linksMerge4[['year','customer_gvkey','supplier_gvkey','supplier_zipcode','salecs']].\
    merge(custExp).drop_duplicates() # .rename(columns = {'customer_gvkey': 'gvkey'})

customerDB['salesWeight'] = customerDB.salecs/customerDB.totalExp

customerDB.fillna(1, inplace = True)

####
supplierWeightedWeather = customerDB[['year','customer_gvkey','supplier_gvkey','supplier_zipcode','salesWeight']].merge(allSupplierWeather)
print(supplierWeightedWeather.shape)
# print(supplierWeightedWeather.head())

for col in supplierWeightedWeather.columns[6:]:
        supplierWeightedWeather[col] = supplierWeightedWeather.salesWeight*supplierWeightedWeather[col]
        
# print(supplierWeightedWeather.head())

supplierWeightedWeather.drop(columns = {'supplier_zipcode','supplier_gvkey', 'salesWeight'}, inplace = True)

# print(supplierWeightedWeather.head())


supplierWeightedWeather.columns = supplierWeightedWeather.columns[0:3].append('wtd_' + supplierWeightedWeather.columns[3:])


supplierWeightedWeather = supplierWeightedWeather.groupby(['year','qtr','customer_gvkey']).sum().reset_index().drop_duplicates()
print(supplierWeightedWeather.shape)


supplierWeightedWeather.to_csv("../../data/companyData/supplierWeightedWeather.csv")

supplierWeightedWeather.head()

(144912, 372)
(47799, 369)


Unnamed: 0,year,qtr,customer_gvkey,wtd_supplier_precip_annual_50,wtd_supplier_precip_annual_95,wtd_supplier_precip_annual_99,wtd_supplier_precip_zip_50,wtd_supplier_precip_zip_95,wtd_supplier_precip_zip_99,wtd_supplier_precip_zipQuarter_50,...,wtd_supplier_empWt_lag2_temp_zipQuarter95,wtd_supplier_empWt_lag3_temp_zipQuarter95,wtd_supplier_empWt_precip_zipQuarter95,wtd_supplier_empWt_lag1_precip_zipQuarter95,wtd_supplier_empWt_lag2_precip_zipQuarter95,wtd_supplier_empWt_lag3_precip_zipQuarter95,wtd_supplier_empWt_days90Plus,wtd_supplier_empWt_lag1_days90Plus,wtd_supplier_empWt_lag2_days90Plus,wtd_supplier_empWt_lag3_days90Plus
0,2001,1,1038.0,6.635116,1.990535,0.0,16.587791,3.98107,0.663512,16.587791,...,0.286516,0.0,0.104399,0.0,0.0,0.0,0.0,1.246984,24.785407,9.645278
1,2001,1,1075.0,15.0,2.0,0.0,32.0,4.0,1.0,32.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,8.0
2,2001,1,1078.0,1.598431,0.40247,0.0,4.035652,0.514762,0.200422,4.035652,...,0.017532,0.017532,0.0,0.0,0.0,0.0,0.004583,0.004583,1.696414,0.425247
3,2001,1,1164.0,1.142112,0.403038,0.224843,2.681102,0.56436,0.246536,2.681102,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109336,0.106186
4,2001,1,1279.0,4.666667,0.0,0.0,16.666667,0.333333,0.0,16.666667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
supplierWeightedWeather[['year','customer_gvkey']].drop_duplicates().shape

(12027, 2)

In [64]:
for col in supplierWeightedWeather.columns:
    print(col)

year
qtr
customer_gvkey
wtd_supplier_precip_annual_50
wtd_supplier_precip_annual_95
wtd_supplier_precip_annual_99
wtd_supplier_precip_zip_50
wtd_supplier_precip_zip_95
wtd_supplier_precip_zip_99
wtd_supplier_precip_zipQuarter_50
wtd_supplier_precip_zipQuarter_95
wtd_supplier_precip_zipQuarter_99
wtd_supplier_temp_annual_50
wtd_supplier_temp_annual_95
wtd_supplier_temp_annual_99
wtd_supplier_temp_zip_50
wtd_supplier_temp_zip_95
wtd_supplier_temp_zip_99
wtd_supplier_temp_zipQuarter_50
wtd_supplier_temp_zipQuarter_95
wtd_supplier_temp_zipQuarter_99
wtd_supplier_lag1_precip_annual_50
wtd_supplier_lag1_precip_annual_95
wtd_supplier_lag1_precip_annual_99
wtd_supplier_lag1_precip_zip_50
wtd_supplier_lag1_precip_zip_95
wtd_supplier_lag1_precip_zip_99
wtd_supplier_lag1_precip_zipQuarter_50
wtd_supplier_lag1_precip_zipQuarter_95
wtd_supplier_lag1_precip_zipQuarter_99
wtd_supplier_lag1_temp_annual_50
wtd_supplier_lag1_temp_annual_95
wtd_supplier_lag1_temp_annual_99
wtd_supplier_lag1_temp_zip_50
w

Now find the largest supplier for each customer. Make sure that we will have weather for them.

There seem to be a number of ties here, maybe from the companies that had only na values. To get around this, just sample one of each observation by company-group.

In [65]:
customerDB_withWeather = customerDB.merge(allSupplierWeather)[['year','customer_gvkey','supplier_gvkey','supplier_zipcode','salecs','totalExp','salesWeight']]
customerDB_withWeather.shape

(144912, 7)

In [66]:
idx = customerDB_withWeather.groupby(['year','customer_gvkey']).salecs.\
    transform(max) == customerDB_withWeather.salecs

largestSuppliers = customerDB_withWeather[idx].reset_index(drop = True)

randomSample = largestSuppliers.groupby(['year',
                                         'customer_gvkey']).\
    apply(lambda x: x.sample(1)).reset_index(drop=True)

randomSample['year']       = randomSample.year.astype('str').str.slice(0,4).astype('int64')
randomSample['supplier_gvkey'] = randomSample['supplier_gvkey'].astype('int64')
randomSample.shape

(12027, 7)

In [67]:
'''
idx = c_linksMerge4.groupby(['year','customer_gvkey']).salecs.\
    transform(max) == c_linksMerge4.salecs
largestSuppliers = c_linksMerge4[idx].reset_index(drop = True)
print(c_linksMerge4.shape)


# find companies who only have one other supplier
singleSuppliers = c_linksMerge4[c_linksMerge4.totalSuppliers == 1].reset_index(drop = True)
print(singleSuppliers.shape)


# find largest suppliers of different companies
largestSuppliers = largestSuppliers.append(singleSuppliers).drop_duplicates()[['year', 'customer_gvkey', 'supplier_gvkey', 'supplier_zipcode', 'salecs']]
print(largestSuppliers.shape)

randomSample = largestSuppliers.groupby(['year','customer_gvkey']).\
    apply(lambda x: x.sample(1)).reset_index(drop=True)

randomSample['year']       = randomSample.year.astype('str').str.slice(0,4).astype('int64')
randomSample['supplier_gvkey'] = randomSample['supplier_gvkey'].astype('int64')
randomSample.head()
'''


"\nidx = c_linksMerge4.groupby(['year','customer_gvkey']).salecs.    transform(max) == c_linksMerge4.salecs\nlargestSuppliers = c_linksMerge4[idx].reset_index(drop = True)\nprint(c_linksMerge4.shape)\n\n\n# find companies who only have one other supplier\nsingleSuppliers = c_linksMerge4[c_linksMerge4.totalSuppliers == 1].reset_index(drop = True)\nprint(singleSuppliers.shape)\n\n\n# find largest suppliers of different companies\nlargestSuppliers = largestSuppliers.append(singleSuppliers).drop_duplicates()[['year', 'customer_gvkey', 'supplier_gvkey', 'supplier_zipcode', 'salecs']]\nprint(largestSuppliers.shape)\n\nrandomSample = largestSuppliers.groupby(['year','customer_gvkey']).    apply(lambda x: x.sample(1)).reset_index(drop=True)\n\nrandomSample['year']       = randomSample.year.astype('str').str.slice(0,4).astype('int64')\nrandomSample['supplier_gvkey'] = randomSample['supplier_gvkey'].astype('int64')\nrandomSample.head()\n"

In [68]:
allSupplierWeather['year'] = allSupplierWeather['year'].astype('int64')
allSupplierWeather['qtr']  = allSupplierWeather['qtr'].astype('int64')

In [69]:
largestSupplierWeather = randomSample.merge(allSupplierWeather)

largestSupplierWeather.drop(columns = {'supplier_zipcode', 'supplier_gvkey', 'totalExp', 'salecs', 'salesWeight'}, inplace = True)     

largestSupplierWeather.columns = largestSupplierWeather.columns[0:3].append('largest_' + largestSupplierWeather.columns[3:])

largestSupplierWeather.to_csv("../../data/companyData/largestSupplierWeather.csv")

In [70]:
largestSupplierWeather.columns

Index(['year', 'customer_gvkey', 'qtr', 'largest_supplier_precip_annual_50',
       'largest_supplier_precip_annual_95',
       'largest_supplier_precip_annual_99', 'largest_supplier_precip_zip_50',
       'largest_supplier_precip_zip_95', 'largest_supplier_precip_zip_99',
       'largest_supplier_precip_zipQuarter_50',
       ...
       'largest_supplier_empWt_lag2_temp_zipQuarter95',
       'largest_supplier_empWt_lag3_temp_zipQuarter95',
       'largest_supplier_empWt_precip_zipQuarter95',
       'largest_supplier_empWt_lag1_precip_zipQuarter95',
       'largest_supplier_empWt_lag2_precip_zipQuarter95',
       'largest_supplier_empWt_lag3_precip_zipQuarter95',
       'largest_supplier_empWt_days90Plus',
       'largest_supplier_empWt_lag1_days90Plus',
       'largest_supplier_empWt_lag2_days90Plus',
       'largest_supplier_empWt_lag3_days90Plus'],
      dtype='object', length=369)

In [71]:
compustat.columns

Index(['customer_gvkey', 'year', 'qtr', 'companyName', 'assets', 'cash',
       'costGoodsSold', 'totalInv', 'netIncome', 'opInc_afDep', 'opInc_befDep',
       'totalRevenue', 'priceClose', 'assetsLast', 'netIncomeLast',
       'totalRevenueLast', 'costGoodsSoldLast', 'totalInvLast',
       'opInc_afDepLast', 'opInc_befDepLast', 'priceCloseLast', 'cashLast',
       'netIncomeLagged', 'roa_lagged', 'sic2', 'indGroup', 'earliestYear',
       'ageTercile', 'sizeTercile', 'profitTercile'],
      dtype='object')

In [72]:
largestSupplierWeather.shape

(47748, 369)

# Merge all for regressions

*********this should pick up all observations, not sure why we're dropping some

In [74]:
compustat_withLinks[['year','customer_gvkey']].drop_duplicates().shape

(10349, 2)

In [4]:
supplierWeightedWeather = pd.read_csv("../../data/companyData/supplierWeightedWeather.csv").\
    drop(columns = ['Unnamed: 0'])
largestSupplierWeather  = pd.read_csv("../../data/companyData/largestSupplierWeather.csv").\
    drop(columns = ['Unnamed: 0'])
supplierWorstWeather    =pd.read_csv("../../data/companyData/supplierWorstWeather.csv").\
    drop(columns = ['Unnamed: 0'])

In [75]:
supplierWeightedWeather.head()

Unnamed: 0,year,qtr,customer_gvkey,wtd_supplier_precip_annual_50,wtd_supplier_precip_annual_95,wtd_supplier_precip_annual_99,wtd_supplier_precip_zip_50,wtd_supplier_precip_zip_95,wtd_supplier_precip_zip_99,wtd_supplier_precip_zipQuarter_50,...,wtd_supplier_empWt_lag2_temp_zipQuarter95,wtd_supplier_empWt_lag3_temp_zipQuarter95,wtd_supplier_empWt_precip_zipQuarter95,wtd_supplier_empWt_lag1_precip_zipQuarter95,wtd_supplier_empWt_lag2_precip_zipQuarter95,wtd_supplier_empWt_lag3_precip_zipQuarter95,wtd_supplier_empWt_days90Plus,wtd_supplier_empWt_lag1_days90Plus,wtd_supplier_empWt_lag2_days90Plus,wtd_supplier_empWt_lag3_days90Plus
0,2001,1,1038.0,6.635116,1.990535,0.0,16.587791,3.98107,0.663512,16.587791,...,0.286516,0.0,0.104399,0.0,0.0,0.0,0.0,1.246984,24.785407,9.645278
1,2001,1,1075.0,15.0,2.0,0.0,32.0,4.0,1.0,32.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,8.0
2,2001,1,1078.0,1.598431,0.40247,0.0,4.035652,0.514762,0.200422,4.035652,...,0.017532,0.017532,0.0,0.0,0.0,0.0,0.004583,0.004583,1.696414,0.425247
3,2001,1,1164.0,1.142112,0.403038,0.224843,2.681102,0.56436,0.246536,2.681102,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109336,0.106186
4,2001,1,1279.0,4.666667,0.0,0.0,16.666667,0.333333,0.0,16.666667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
supplierWeightedWeather.head()

Unnamed: 0,year,qtr,customer_gvkey,wtd_supplier_precip_annual_50,wtd_supplier_precip_annual_95,wtd_supplier_precip_annual_99,wtd_supplier_precip_zip_50,wtd_supplier_precip_zip_95,wtd_supplier_precip_zip_99,wtd_supplier_precip_zipQuarter_50,...,wtd_supplier_empWt_lag2_temp_zipQuarter95,wtd_supplier_empWt_lag3_temp_zipQuarter95,wtd_supplier_empWt_precip_zipQuarter95,wtd_supplier_empWt_lag1_precip_zipQuarter95,wtd_supplier_empWt_lag2_precip_zipQuarter95,wtd_supplier_empWt_lag3_precip_zipQuarter95,wtd_supplier_empWt_days90Plus,wtd_supplier_empWt_lag1_days90Plus,wtd_supplier_empWt_lag2_days90Plus,wtd_supplier_empWt_lag3_days90Plus
0,2001,1,1038.0,6.635116,1.990535,0.0,16.587791,3.98107,0.663512,16.587791,...,0.286516,0.0,0.104399,0.0,0.0,0.0,0.0,1.246984,24.785407,9.645278
1,2001,1,1075.0,15.0,2.0,0.0,32.0,4.0,1.0,32.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,8.0
2,2001,1,1078.0,1.598431,0.40247,0.0,4.035652,0.514762,0.200422,4.035652,...,0.017532,0.017532,0.0,0.0,0.0,0.0,0.004583,0.004583,1.696414,0.425247
3,2001,1,1164.0,1.142112,0.403038,0.224843,2.681102,0.56436,0.246536,2.681102,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109336,0.106186
4,2001,1,1279.0,4.666667,0.0,0.0,16.666667,0.333333,0.0,16.666667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
largestSupplierWeather.head()

Unnamed: 0,year,customer_gvkey,qtr,largest_supplier_precip_annual_50,largest_supplier_precip_annual_95,largest_supplier_precip_annual_99,largest_supplier_precip_zip_50,largest_supplier_precip_zip_95,largest_supplier_precip_zip_99,largest_supplier_precip_zipQuarter_50,...,largest_supplier_empWt_lag2_temp_zipQuarter95,largest_supplier_empWt_lag3_temp_zipQuarter95,largest_supplier_empWt_precip_zipQuarter95,largest_supplier_empWt_lag1_precip_zipQuarter95,largest_supplier_empWt_lag2_precip_zipQuarter95,largest_supplier_empWt_lag3_precip_zipQuarter95,largest_supplier_empWt_days90Plus,largest_supplier_empWt_lag1_days90Plus,largest_supplier_empWt_lag2_days90Plus,largest_supplier_empWt_lag3_days90Plus
0,2001,1038.0,1,10.0,3.0,0.0,25.0,6.0,1.0,25.0,...,0.431818,0.0,0.157343,0.0,0.0,0.0,0.0,1.879371,37.354895,14.536713
1,2001,1038.0,2,24.0,9.0,2.0,34.0,12.0,4.0,34.0,...,0.0,0.431818,0.0,0.157343,0.0,0.0,11.888112,0.0,1.879371,37.354895
2,2001,1038.0,3,18.0,5.0,1.0,31.0,10.0,2.0,31.0,...,0.0,0.0,0.157343,0.0,0.157343,0.0,35.788462,11.888112,0.0,1.879371
3,2001,1038.0,4,4.0,1.0,0.0,18.0,2.0,1.0,18.0,...,0.0,0.0,0.0,0.157343,0.0,0.157343,0.0,35.788462,11.888112,0.0
4,2001,1075.0,1,15.0,2.0,0.0,32.0,4.0,1.0,32.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,8.0


In [7]:
supplierWorstWeather.head()

Unnamed: 0,year,qtr,customer_gvkey,worst_supplier_precip_annual_50,worst_supplier_precip_annual_95,worst_supplier_precip_annual_99,worst_supplier_precip_zip_50,worst_supplier_precip_zip_95,worst_supplier_precip_zip_99,worst_supplier_precip_zipQuarter_50,...,worst_supplier_empWt_lag2_temp_zipQuarter95,worst_supplier_empWt_lag3_temp_zipQuarter95,worst_supplier_empWt_precip_zipQuarter95,worst_supplier_empWt_lag1_precip_zipQuarter95,worst_supplier_empWt_lag2_precip_zipQuarter95,worst_supplier_empWt_lag3_precip_zipQuarter95,worst_supplier_empWt_days90Plus,worst_supplier_empWt_lag1_days90Plus,worst_supplier_empWt_lag2_days90Plus,worst_supplier_empWt_lag3_days90Plus
0,2001,1,1038.0,10.0,3.0,0.0,25.0,6.0,1.0,25.0,...,0.431818,0.0,0.157343,0.0,0.0,0.0,0.0,1.879371,37.354895,14.536713
1,2001,1,1075.0,15.0,2.0,0.0,32.0,4.0,1.0,32.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,8.0
2,2001,1,1078.0,20.0,4.0,0.0,37.0,9.0,2.0,37.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,49.0,16.0
3,2001,1,1164.0,15.0,5.0,3.0,33.0,8.0,5.0,33.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.44,18.88
4,2001,1,1279.0,14.0,0.0,0.0,50.0,1.0,0.0,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
allSupplierWeather = supplierWorstWeather.merge(largestSupplierWeather).merge(supplierWeightedWeather)
allSupplierWeather.rename(columns = {'customer_gvkey': 'gvkey'}, inplace = True)

In [29]:
allSupplierWeather.head()

Unnamed: 0,year,qtr,gvkey,worst_supplier_precip_annual_50,worst_supplier_precip_annual_95,worst_supplier_precip_annual_99,worst_supplier_precip_zip_50,worst_supplier_precip_zip_95,worst_supplier_precip_zip_99,worst_supplier_precip_zipQuarter_50,...,wtd_supplier_empWt_lag2_temp_zipQuarter95,wtd_supplier_empWt_lag3_temp_zipQuarter95,wtd_supplier_empWt_precip_zipQuarter95,wtd_supplier_empWt_lag1_precip_zipQuarter95,wtd_supplier_empWt_lag2_precip_zipQuarter95,wtd_supplier_empWt_lag3_precip_zipQuarter95,wtd_supplier_empWt_days90Plus,wtd_supplier_empWt_lag1_days90Plus,wtd_supplier_empWt_lag2_days90Plus,wtd_supplier_empWt_lag3_days90Plus
0,2001,1,1038.0,10.0,3.0,0.0,25.0,6.0,1.0,25.0,...,0.286516,0.0,0.104399,0.0,0.0,0.0,0.0,1.246984,24.785407,9.645278
1,2001,1,1075.0,15.0,2.0,0.0,32.0,4.0,1.0,32.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,8.0
2,2001,1,1078.0,20.0,4.0,0.0,37.0,9.0,2.0,37.0,...,0.017532,0.017532,0.0,0.0,0.0,0.0,0.004583,0.004583,1.696414,0.425247
3,2001,1,1164.0,15.0,5.0,3.0,33.0,8.0,5.0,33.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109336,0.106186
4,2001,1,1279.0,14.0,0.0,0.0,50.0,1.0,0.0,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
allSupplierWeather.to_csv("../../data/companyData/allIndirectWeather.csv")

In [27]:
'''changes = compustat = pd.read_csv("../../data/companyData/igData.csv").\
    drop(columns = ['Unnamed: 0', 'datadate','curcdq','costat','add1','addzip','city','state',
                   'fyearq','assetsLagged','datacqtr','datafqtr','fyr','DATE','address_line_1',
                   'delete','latitude','longitude','zipcode','abi','ticker','company',
                    'cstatCompanies','igCompanies']).\
    rename(columns = {'gvkey': 'customer_gvkey'})

changes.head()


allIndirect = changes.merge(supplierWeightedWeather).merge(largestSupplierWeather).\
    merge(supplierWorstWeather)

allIndirect.to_csv("../../data/companyData/allIndirect.csv")'''


'changes = compustat = pd.read_csv("../../data/companyData/igData.csv").    drop(columns = [\'Unnamed: 0\', \'datadate\',\'curcdq\',\'costat\',\'add1\',\'addzip\',\'city\',\'state\',\n                   \'fyearq\',\'assetsLagged\',\'datacqtr\',\'datafqtr\',\'fyr\',\'DATE\',\'address_line_1\',\n                   \'delete\',\'latitude\',\'longitude\',\'zipcode\',\'abi\',\'ticker\',\'company\',\n                    \'cstatCompanies\',\'igCompanies\']).    rename(columns = {\'gvkey\': \'customer_gvkey\'})\n\nchanges.head()\n\n\nallIndirect = changes.merge(supplierWeightedWeather).merge(largestSupplierWeather).    merge(supplierWorstWeather)\n\nallIndirect.to_csv("../../data/companyData/allIndirect.csv")'

In [18]:
igData = pd.read_csv("../../data/companyData/igWithWeather.csv").drop(columns = {'Unnamed: 0'})
igData.head()

Unnamed: 0,gvkey,datadate,year,qtr,companyName,curcdq,assets,cash,costGoodsSold,totalInv,...,empWt_lag2_temp_zipQuarter95,empWt_lag3_temp_zipQuarter95,empWt_precip_zipQuarter95,empWt_lag1_precip_zipQuarter95,empWt_lag2_precip_zipQuarter95,empWt_lag3_precip_zipQuarter95,empWt_days90Plus,empWt_lag1_days90Plus,empWt_lag2_days90Plus,empWt_lag3_days90Plus
0,8515,20030331,2003,1,PHI INC,USD,524.003551,,67.936692,52.543325,...,0.0,0.0,0.0,0.27439,0.27439,0.0,0.0,7.981707,73.262195,51.341463
1,25874,20030331,2003,1,PETROQUEST ENERGY INC,USD,198.075923,,4.340718,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,5.0,62.0,29.0
2,28564,20030331,2003,1,STONE ENERGY CORP,USD,1840.969152,,29.874514,0.0,...,0.25,0.25,0.25,0.75,0.178571,0.0,0.0,0.714286,51.821429,17.642857
3,8515,20040331,2004,1,PHI INC,USD,520.557973,,68.98805,53.825001,...,0.0,0.0,0.0,0.0,0.251572,0.0,7.484277,17.962264,69.113208,40.415094
4,25874,20040331,2004,1,PETROQUEST ENERGY INC,USD,243.365831,,4.63376,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,45.0,38.0


In [30]:
igData.year.min()

2001

In [31]:
allSupplierWeather.head()

Unnamed: 0,year,qtr,gvkey,worst_supplier_precip_annual_50,worst_supplier_precip_annual_95,worst_supplier_precip_annual_99,worst_supplier_precip_zip_50,worst_supplier_precip_zip_95,worst_supplier_precip_zip_99,worst_supplier_precip_zipQuarter_50,...,wtd_supplier_empWt_lag2_temp_zipQuarter95,wtd_supplier_empWt_lag3_temp_zipQuarter95,wtd_supplier_empWt_precip_zipQuarter95,wtd_supplier_empWt_lag1_precip_zipQuarter95,wtd_supplier_empWt_lag2_precip_zipQuarter95,wtd_supplier_empWt_lag3_precip_zipQuarter95,wtd_supplier_empWt_days90Plus,wtd_supplier_empWt_lag1_days90Plus,wtd_supplier_empWt_lag2_days90Plus,wtd_supplier_empWt_lag3_days90Plus
0,2001,1,1038.0,10.0,3.0,0.0,25.0,6.0,1.0,25.0,...,0.286516,0.0,0.104399,0.0,0.0,0.0,0.0,1.246984,24.785407,9.645278
1,2001,1,1075.0,15.0,2.0,0.0,32.0,4.0,1.0,32.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,8.0
2,2001,1,1078.0,20.0,4.0,0.0,37.0,9.0,2.0,37.0,...,0.017532,0.017532,0.0,0.0,0.0,0.0,0.004583,0.004583,1.696414,0.425247
3,2001,1,1164.0,15.0,5.0,3.0,33.0,8.0,5.0,33.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109336,0.106186
4,2001,1,1279.0,14.0,0.0,0.0,50.0,1.0,0.0,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
allDirIndir = igData.merge(allSupplierWeather, how = 'left')


In [34]:
for col in allDirIndir.columns:
    print(col)

gvkey
datadate
year
qtr
companyName
curcdq
assets
cash
costGoodsSold
totalInv
netIncome
opInc_afDep
opInc_befDep
totalRevenue
costat
priceClose
add1
addzip
assetsLast
netIncomeLast
totalRevenueLast
costGoodsSoldLast
totalInvLast
opInc_afDepLast
opInc_befDepLast
priceCloseLast
cashLast
fyearq
assetsLagged
netIncomeLagged
roa_lagged
sic2
indGroup
earliestYear
ageTercile
sizeTercile
profitTercile
datacqtr
datafqtr
fyr
DATE
cstatCompanies
igCompanies
delete
abi
ticker
company
state
city
address_line_1
zipcode
latitude
longitude
precip_annual_50
precip_annual_95
precip_annual_99
precip_zip_50
precip_zip_95
precip_zip_99
precip_zipQuarter_50
precip_zipQuarter_95
precip_zipQuarter_99
temp_annual_50
temp_annual_95
temp_annual_99
temp_zip_50
temp_zip_95
temp_zip_99
temp_zipQuarter_50
temp_zipQuarter_95
temp_zipQuarter_99
lag1_precip_annual_50
lag1_precip_annual_95
lag1_precip_annual_99
lag1_precip_zip_50
lag1_precip_zip_95
lag1_precip_zip_99
lag1_precip_zipQuarter_50
lag1_precip_zipQuarter_95
l

Add in a marker for the suppliers, to do the analysis on the suppliers only.

In [46]:
suppliersOnly = pd.read_csv("../../data/companyData/suppliers.csv").drop(columns = {'Unnamed: 0'})
suppliersOnly.head()

Unnamed: 0,year,supplier_gvkey
0,1999,1013.0
1,2000,1013.0
2,2001,1013.0
3,2002,1013.0
4,2003,1013.0


In [50]:
allDirIndir['isSupplier'] = allDirIndir.gvkey.isin(suppliersOnly.supplier_gvkey)

sum(allDirIndir['isSupplier'])

allDirIndir.to_csv("../../data/companyData/allDirIndir.csv")

Now put in the customer hq thing, so we can filter out customer-supplier pairs that are within x miles of each other.

In [None]:
largestSuppliersWithWeather.rename(columns = {'customer_gvkey': 'gvkey'}, inplace = True)

In [None]:
largestSuppliersWithWeather.to_csv("../../data/companyData/largestSuppliersWithWeather_more500K.csv")

In [None]:
largestSuppliersWithWeather.shape

In [51]:
allDirIndir.shape

(239126, 1518)