In [2]:
import pickle as pkl
import pandas as pd
import matplotlib
import os
import re

import collections
import datetime
import time

import geopandas as gpd

import numpy as np

import rasterio

import spacy

import gc



nlp = spacy.load('en_core_web_lg')
from sklearn.metrics.pairwise import cosine_similarity

import itertools

# Get Data

## Changes from year to year

In [None]:
changes = pd.read_csv("../../data/compustatChanges_all.csv").drop(columns = ['Unnamed: 0'])

changes.head()

In [None]:
otherControls = pd.read_csv('../../data/companyData/otherControls.csv').\
    drop(columns = {'Unnamed: 0', 'fyearq'}).rename(columns = {'year_toMatchOn': 'year',
                                                              'fqtr': 'qtr'})

otherControls.head()

In [None]:
otherControls.head()

In [None]:
print(changes.shape)
changes = changes.merge(otherControls)
print(changes.shape)


industries = changes[['gvkey','famafrench']].drop_duplicates()

In [None]:
industries.to_csv("../../data/companyData/gvkeyIndustries.csv")

In [None]:
changes.to_csv("../../data/companyData/compustatChanges_withControls.csv")
changes.head()

In [None]:
# Reload the controls-augmented panel. The file is written with the default
# to_csv index, so depending on how many write/read round trips it has been
# through it may carry 'Unnamed: 0', 'Unnamed: 0.1', ... columns. Drop whichever
# of those are present: dropping a fixed list of names raises KeyError as soon
# as one of them is missing from the file.
changes = pd.read_csv("../../data/companyData/compustatChanges_withControls.csv")
changes = changes.loc[:, ~changes.columns.str.startswith('Unnamed: 0')]
changes.head()

Put in the calendar quarters and fiscal quarter data.

In [None]:
quarters = pd.read_csv("../../data/companyData/fiscalYears.csv")
quarters.head()

In [None]:
len(quarters.gvkey.unique())

In [None]:
sum((quarters.fyr == 12) | 
   (quarters.fyr == 3) | 
   (quarters.fyr == 6) | 
   (quarters.fyr == 9))/quarters.shape[0]

In [None]:
# Keep only rows whose fyr is 3, 6, 9 or 12 (presumably the fiscal-year-end
# month, so that fiscal quarters coincide with calendar quarters -- confirm),
# and only the identifier / quarter-label columns used downstream.
quarters = quarters.loc[
    quarters.fyr.isin([3, 6, 9, 12]),
    ['gvkey', 'datadate', 'datacqtr', 'datafqtr', 'fyr'],
].reset_index(drop = True)


In [None]:
quarters.head()

Merge the quarter data into the change data, and make sure that the quarters that are used line up with the calendar quarters.

In [None]:
changesCal = changes[changes.gvkey.isin(quarters.gvkey.unique())]

changesCal = changesCal.merge(quarters)

print(changesCal.shape[0]/changes.shape[0])

In [None]:
# Take year/qtr from the calendar-quarter label datacqtr (characters 0-3 are the
# year and character 5 the quarter, e.g. presumably '2010Q3') and fall back to
# the calendar position of datadate where the label is missing.
changesCal['DATE'] = pd.to_datetime(changesCal['datadate'])

# Build each column in a single pass. Writing string slices into the existing
# year/qtr columns through .loc is an incompatible-dtype assignment when those
# columns are numeric (TODO confirm their dtype after the merge); pandas 2.1+
# deprecates that and it silently turns the columns into mixed object dtype.
labelYear = changesCal.datacqtr.str.slice(0,4).astype('float')
labelQtr  = changesCal.datacqtr.str.slice(5,6).astype('float')

changesCal['year'] = labelYear.fillna(changesCal.DATE.dt.year).astype('int64')
changesCal['qtr']  = labelQtr.fillna(changesCal.DATE.dt.quarter).astype('int64')

print(changesCal.shape,changesCal.head())

In [None]:
changesCal.to_csv("../../data/companyData/compustatChanges_withControls.csv")
changesCal.head()

In [None]:
changesCal = pd.read_csv("../../data/companyData/compustatChanges_withControls.csv")
changesCal.head()

# Compustat and ABI Linking

In [None]:
gvKey_abiLinkingTable = pd.read_csv('../../data/companyData/linkingTable.csv').drop(columns = ['Unnamed: 0'])

base_columns = gvKey_abiLinkingTable.columns 
customer_columns = "customer_" + base_columns
supplier_columns = "supplier_" + base_columns



hasMatch = gvKey_abiLinkingTable.gvkey.unique()

gvKey_abiLinkingTable.head()


---------------------------------

# Get all change data together
Get the linking table and merge the abi labels into the change df. 

Then, merge the location data into the change data and get as complete a record of companies as possible given the HQ data.

In [None]:
gvKey_abiLinkingTable = pd.read_csv('../../data/companyData/linkingTable.csv').drop(columns = ['Unnamed: 0'])

changes = pd.read_csv("../../data/companyData/compustatChanges_withControls.csv").drop(columns = ['Unnamed: 0'])
print(changes.shape, changes.head())


changesABI = changes.merge(gvKey_abiLinkingTable, on ='gvkey').drop(columns = {'state','city'})
print(changesABI.shape, changesABI.head())

Now merge in the hq information.

In [None]:
# Keep rows with a known state code that is not one of the listed Canadian
# province codes, then truncate the zip to its first five characters.
canadian = ['ON', 'AB','QC', 'BC', 'NS', 'NF', 'SK', 'MB', 'NB']
inScope  = ~(changes.state.isin(canadian)) & ~changes.state.isna()

# .assign on the filtered frame returns a new object, so rewriting addzip is no
# longer an assignment into a slice of the original frame (the source of a
# SettingWithCopyWarning). Missing zips become the string 'nan' after astype('str').
changes = changes[inScope].assign(addzip = lambda d: d.addzip.astype('str').str.slice(0,5))

changes.state.unique()

In [None]:
hq = pd.read_csv("../../data/ig_uniqueHQs_multLocations.csv").\
    drop(columns = {'Unnamed: 0'}).\
    rename(columns = {'archive_version_year': 'year'})

hq['year'] = hq.year.astype('int64')

igChanges = changesABI.merge(hq)
print(igChanges.shape, igChanges.head())


hq.head()

In [None]:
igChanges.to_csv("../../data/companyData/igData.csv")

At this point, we have zip information in the following forms (from most to least examples):
    - changes: all compustat companies, from the compustat address system
    - igChanges: subset of compustat companies, from the ig merge
    - subset of compustat companies that have SC information and survived the ig merge
    
We could potentially look at the subset of compustat companies for which we have SC information, using the compustat address system as well.

For now: follow similar trajectory as before but add in weather data for all cstat companies and all ig-merged companies.

First: pull all zips that are mentioned in changes and igChanges and use this to get the weather data.



In [None]:
# Drop rows without a usable zip: real NaN, or the string 'nan' produced when
# the zip column was cast to str.
changes = changes[(~changes.addzip.isna()) & (changes.addzip != 'nan')]

# Union of the compustat zips and the ig HQ zips. Series.append was removed in
# pandas 2.0; pd.concat is the supported equivalent and yields the same values.
relevantZips = pd.concat([changes.addzip.astype('int64'), igChanges.zipcode]).unique()

# Rename/drop without inplace=True so the result is a fresh frame rather than an
# in-place edit of the filtered slice above.
changes = changes.rename(columns = {'addzip': 'zipcode'}).drop(
    columns = ['cik', 'datadate','costat', 'add1', 'add2', 'city', 'sic', 'state'])

In [None]:
len(relevantZips)

In [None]:
# relevantZips = allCustomerData.zipcode.append(allSupplierData.zipcode).unique()
outfile =  '../../data/companyData/relevantZips.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(relevantZips, pickle_file)

------------------------------------------------

# Stocks

In [None]:
igChanges = pd.read_csv("../../data/companyData/igData.csv").\
    drop(columns = {'Unnamed: 0'})
igChanges.head()

In [None]:
# Load the stock-return table and keep only the date, firm id and return columns.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load stockReturns.pkl if it comes from a trusted source.
with open('../../data/stockReturns.pkl', 'rb') as f:
    stocks = pkl.load(f)[['date','gvkey','RET']]

In [None]:
stocks.head()

In [None]:
sum(stocks.gvkey.isna())

In [None]:
# Restrict returns to dates after 2008 with a known gvkey, then derive the
# calendar quarter/year merge keys and store gvkey as an integer.
stocks = (
    stocks[(stocks.date.dt.year > 2008) & ~stocks.gvkey.isna()]
    .assign(qtr   = lambda d: d.date.dt.quarter,
            year  = lambda d: d.date.dt.year,
            gvkey = lambda d: d['gvkey'].astype(int))
)
stocks.shape

In [None]:
igChanges.columns

In [None]:
companyControls = igChanges[['gvkey','year','qtr','famafrench','ageTercile','sizeTercile','profitTercile','zipcode']]
companyControls.head()

In [None]:
print(stocks.dtypes, companyControls.dtypes)

In [None]:
stocksWithControls = stocks.merge(companyControls)
print(stocksWithControls.shape,stocks.shape,companyControls.shape)
stocksWithControls.head()

In [None]:
del stocks
del companyControls
del igChanges
gc.collect()

In [None]:
annualWeather = pd.read_csv("../../data/companyData/stockWeather_annual.csv").\
    drop(columns = {'Unnamed: 0'})

annualWeather = annualWeather[~annualWeather.temp_annualLast5.isna()].reset_index(drop = True)

annualWeather['date'] = pd.to_datetime(annualWeather['date'],
                                   format = "%Y%m%d")

annualWeather.rename(columns = {'ZIP': 'zipcode'}, inplace = True)
print(annualWeather.dtypes)
annualWeather.head()

In [None]:
allWeather = pd.read_csv("../../data/companyData/stockWeather_zipQuarterQuants.csv").\
    drop(columns = {'Unnamed: 0'})

allWeather = allWeather[~allWeather.temp_zipQuarterLast5.isna()].reset_index(drop = True)

allWeather['date'] = pd.to_datetime(allWeather['date'],
                                   format = "%Y-%m-%d")

allWeather.rename(columns = {'ZIP': 'zipcode'}, inplace = True)
print(allWeather.dtypes)
allWeather.head()

In [None]:
stocksWithControlsWeather = stocksWithControls.merge(allWeather).merge(annualWeather)
print(stocksWithControlsWeather.shape,allWeather.shape)

stocksWithControlsWeather.head()

In [None]:
# Write the merged panel into companyData/, the directory the reload of
# stocksWithControlsWeather.csv reads from. The previous path contained stray
# characters ("2+32+100+600sfg") in the directory name, so the file was never
# written where it is read back.
stocksWithControlsWeather.to_csv("../../data/companyData/stocksWithControlsWeather.csv")

In [None]:
sum(stocksWithControlsWeather.RET.isna())

In [None]:
stocksWithControlsWeather = pd.read_csv("../../data/companyData/stocksWithControlsWeather.csv").drop(columns = {'Unnamed: 0'})
stocksWithControlsWeather.head()

In [None]:
sum(stocksWithControlsWeather.gvkey.isna())

--------------------

# Weather Data
First do this on the HQ zipcodes.

In [None]:
# Build the HQ-zip weather panel together with one- to four-quarter lags of
# every weather column, and save it.
allWeather = pd.read_csv("../../data/companyData/revised_allWeatherBins_2009to2019.csv").\
    drop(columns = {"Unnamed: 0"})

# Numeric quarter index: year + (qtr - 1)/4, moved to the first column.
allWeather['yearQtr'] = allWeather.year + (allWeather.qtr - 1)/4

col = allWeather.pop("yearQtr")
allWeather.insert(0, col.name, col)

# One loop replaces four copy-pasted blocks. Shifting yearQtr forward by k
# quarters makes a row observed in quarter t join onto quarter t + k, i.e. it
# becomes that quarter's k-quarter lag. Columns from position 4 onward get the
# 'lag<k>_' prefix; the first four are left unprefixed (assumes these are
# yearQtr plus the three key columns -- TODO confirm against the CSV layout).
lagFrames = []
for nQuarters in range(1, 5):
    lagged = allWeather.copy()
    lagged['yearQtr'] += 0.25 * nQuarters
    lagged = lagged.rename(columns = {colname: 'lag%d_%s' % (nQuarters, colname)
                                      for colname in lagged.columns[4:]})
    lagFrames.append(lagged.drop(columns = ['year','qtr']))

# Keep the individual names bound, as the unrolled version did.
lag1, lag2, lag3, lag4 = lagFrames

print(allWeather.shape)

# Default (inner) merges on the shared columns: rows lacking any of the four
# lags are dropped.
allWeather_withLags = allWeather
for lagged in lagFrames:
    allWeather_withLags = allWeather_withLags.merge(lagged)

print(allWeather_withLags.year.value_counts())

allWeather_withLags.to_csv("../../data/companyData/allWeather_withLags.csv")

Do this across all zips, for the establishment records. We'll put this into a different format right after, and then change the columns and whatnot.

In [None]:
# Same lag construction as for the HQ zips, but over the all-zips weather file
# used for the establishment records.
allWeather = pd.read_csv("../../data/companyData/revised_allWeatherBins_2009to2019_allZips.csv").\
    drop(columns = {"Unnamed: 0", 'Unnamed: 0.1'})

# Numeric quarter index: year + (qtr - 1)/4, moved to the first column.
allWeather['yearQtr'] = allWeather.year + (allWeather.qtr - 1)/4

col = allWeather.pop("yearQtr")
allWeather.insert(0, col.name, col)

# One loop instead of four copy-pasted blocks: shift yearQtr forward by k
# quarters, prefix every column from position 4 onward with 'lag<k>_', and drop
# year/qtr so each lag frame joins on the remaining shared columns only.
# (Assumes the first four columns are yearQtr plus the key columns -- TODO confirm.)
lagFrames = []
for nQuarters in range(1, 5):
    lagged = allWeather.copy()
    lagged['yearQtr'] += 0.25 * nQuarters
    lagged = lagged.rename(columns = {colname: 'lag%d_%s' % (nQuarters, colname)
                                      for colname in lagged.columns[4:]})
    lagFrames.append(lagged.drop(columns = ['year','qtr']))

# Keep the individual names bound, as the unrolled version did.
lag1, lag2, lag3, lag4 = lagFrames

print(allWeather.shape)

# Default (inner) merges: rows lacking any of the four lags are dropped.
allWeather_withLags = allWeather
for lagged in lagFrames:
    allWeather_withLags = allWeather_withLags.merge(lagged)

print(allWeather_withLags.year.value_counts())

allWeather_withLags.to_csv("../../data/companyData/allWeather_withLags_allZips.csv")

Now do the same for the industry-specific weather.

In [None]:
# allWeather = pd.read_csv("../../../../../../../Volumes/backup2/dissData/prism/allWeatherBins_2010.2019.csv").\
allWeather_byInd = pd.read_csv("../../data/companyData/revised_allWeatherBins_2009to2019_byInd.csv").\
    drop(columns = {"Unnamed: 0"})
'''[['famafrench','zipcode','yearQuarter', 
                                    'temp_ffquant_0.95','temp_indQuarterquant_0.95',
                                   'temp5Days_ffquant_0.95', 'temp5Days_indQuarterquant_0.95',
                                   'precip_ffquant_0.95', 'precip_indQuarterquant_0.95',
                                   'precip5Days_ffquant_0.95', 'precip5Days_indQuarterquant_0.95']]
'''
allWeather_byInd['year'] = allWeather_byInd.yearQuarter.str.slice(0,4).astype('int64')
allWeather_byInd['qtr']  = allWeather_byInd.yearQuarter.str.slice(5,6).astype('int64')
allWeather_byInd['yearQtr'] = allWeather_byInd.year + (allWeather_byInd.qtr - 1)/4

allWeather_byInd = allWeather_byInd.astype({'year':       'category',
                         'qtr':        'category',
                         'zipcode':    'category',
                         'famafrench': 'category'})

changes['zipcode'] = changes['zipcode'].astype({'zipcode': 'int64'})

changes = changes.astype({'year':       'category',
                          'qtr':        'category',
                          'zipcode':    'category',
                          'famafrench': 'category'})

col = allWeather_byInd.pop("year")
allWeather_byInd.insert(0, col.name, col)

col = allWeather_byInd.pop("qtr")
allWeather_byInd.insert(0, col.name, col)


col = allWeather_byInd.pop("yearQtr")
allWeather_byInd.insert(0, col.name, col)

allWeather_byInd.drop(columns = {'yearQuarter'}, inplace = True)

print(allWeather_byInd.head())

In [None]:
# Build the one- to four-quarter lag frames of the industry-specific weather.
# One loop replaces four copy-pasted blocks: shift yearQtr forward by k
# quarters, prefix every column from position 5 onward with 'lag<k>_', drop
# year/qtr, and store yearQtr as a category.
# NOTE: the lag frames must be built before allWeather_byInd.yearQtr is cast to
# category below, because the shift needs a numeric column; re-running this
# cell after that cast will therefore fail.
lagFrames = []
for nQuarters in range(1, 5):
    lagged = allWeather_byInd.copy()
    lagged['yearQtr'] += 0.25 * nQuarters
    lagged = lagged.rename(columns = {colname: 'lag%d_%s' % (nQuarters, colname)
                                      for colname in lagged.columns[5:]})
    lagged = lagged.drop(columns = ['year','qtr']).astype({'yearQtr':       'category'})
    lagFrames.append(lagged)

# Later cells merge and delete lag1..lag4 by name, so keep those names bound.
lag1, lag2, lag3, lag4 = lagFrames


allWeather_byInd = allWeather_byInd.astype({'yearQtr':       'category'})


print(allWeather_byInd.shape)


# Last expression of the cell, so the preview is what gets displayed. (A
# trailing triple-quoted copy of the merge used to follow and was displayed
# instead; the merge itself lives in the next cell.)
allWeather_byInd.head()

In [None]:
allWeather_byInd_withLags = allWeather_byInd.merge(lag1).merge(lag2).merge(lag3).merge(lag4)

In [None]:
allWeather_byInd_withLags.shape

In [None]:
allWeather_byInd_withLags.to_csv("../../data/companyData/allWeather_byInd_withLags.csv")

In [None]:
del allWeather_byInd
del lag1
del lag2
del lag3
del lag4
gc.collect()

In [None]:
allWeather_byInd_withLags = pd.read_csv("../../data/companyData/allWeather_byInd_withLags.csv")
allWeather_byInd_withLags.head()

# Locations
Create a separate definition of weather based not on HQ but on employee-weighted establishment footprint.

In [None]:
fractions = pd.read_csv('../../data/companyData/fractionEmployees_byEstablishment.csv').\
    drop(columns = {"Unnamed: 0", 'latitude','longitude'}).rename(columns = {'archive_version_year': 'year',
                                                    'parent_number': 'abi'})

fractions['year']    = fractions.year.astype('int64')
fractions['zipcode'] = fractions.zipcode.astype('int64')
fractions.head()

gvKey_abiLinkingTable = pd.read_csv('../../data/companyData/linkingTable.csv').drop(columns = ['Unnamed: 0'])

print(gvKey_abiLinkingTable.abi)

gvKey_abiLinkingTable.head()

fractions = fractions[['year','abi','zipcode','locationFracOfEmployees']].merge(gvKey_abiLinkingTable[['abi','gvkey']])

fractions = fractions.astype({'year':       'category',
                           'zipcode':    'category'})

fractions.head()

In [None]:
# Attach the all-zips lagged weather to every establishment record.
# allWeather_withLags_allZips was never assigned anywhere in the notebook (the
# all-zips frame was only written to disk), so load it from the CSV saved
# earlier; the written index comes back as 'Unnamed: 0' and is dropped.
allWeather_withLags_allZips = pd.read_csv("../../data/companyData/allWeather_withLags_allZips.csv").\
    drop(columns = {"Unnamed: 0"})

# Default (inner) merge on the shared columns; abi and zipcode are not needed
# once the weather is attached.
fractionsWithWeather = fractions.merge(allWeather_withLags_allZips).drop(columns = ['abi','zipcode'])

print(fractionsWithWeather.shape)
fractionsWithWeather.head()

In [None]:
fractionsWithWeather[fractionsWithWeather.gvkey == 1004]

In [None]:
# Free large intermediates that are no longer needed.
# `fractions` is deliberately NOT deleted here: a later cell still reads
# fractions.zipcode to list the establishment zips, and deleting it first makes
# that cell fail with a NameError.
del allWeather_withLags
del gvKey_abiLinkingTable
gc.collect()

In [None]:
# Scale every weather measure by the establishment's share of the firm's
# employees, so that summing over establishments gives an employee-weighted
# value. Columns are picked by name rather than by position: a positional slice
# ([4:]) also scales whichever of qtr / yearQtr happens to sit at or after
# position 4, and qtr is a grouping key of the aggregation that follows.
keyCols     = ['gvkey', 'year', 'qtr', 'yearQtr', 'locationFracOfEmployees']
weatherCols = [c for c in fractionsWithWeather.columns if c not in keyCols]

fractionsWithWeather[weatherCols] = fractionsWithWeather[weatherCols].multiply(
    fractionsWithWeather.locationFracOfEmployees, axis = 0)

In [None]:
# Sum the (already weighted) columns within each firm-year-quarter, discard the
# summed employee fraction, and prefix every remaining non-key column with 'empWt_'.
g = (
    fractionsWithWeather
    .groupby(['gvkey','year','qtr'])
    .sum()
    .reset_index()
    .drop(columns = ['locationFracOfEmployees'])
)
g = g.rename(columns = {name: 'empWt_' + name for name in g.columns[3:]})

g.head()

In [None]:
g.to_csv("../../data/companyData/weatherByEstablishment.csv")

In [None]:
establishmentZips = fractions.zipcode.unique()
len(establishmentZips)

## create the original weather with lags dataset

In [None]:
allWeather_withLags = pd.read_csv("../../data/companyData/allWeather_withLags.csv").\
    drop(columns = {"Unnamed: 0", 'yearQtr'})

averages = pd.read_csv("../../data/companyData/quarterlyStatsByZip.csv").\
    drop(columns = {"Unnamed: 0"}).rename(columns = {'ZIP': 'zipcode'})


averages['qtr'] = averages.quarter.str.slice(1,2).astype('float')
averages.drop(columns = {'quarter'}, inplace = True) 

print(len(averages.zipcode.unique()))

averages.head()

allWeather_withLags       = allWeather_withLags.astype({'year':       'category',
                           'qtr':        'category',
                           'zipcode':    'category'})
averages                  = averages.astype({'qtr':        'category',
                           'zipcode':    'category'})

allWeather_byInd_withLags = pd.read_csv("../../data/companyData/allWeather_byInd_withLags.csv").\
    drop(columns = {"Unnamed: 0", 'yearQtr'})
allWeather_byInd_withLags = allWeather_byInd_withLags.astype({'year':       'category',
                           'qtr':        'category',
                           'zipcode':    'category'})

Create direct effects database. Merge weather to full cstat database.

In [None]:
allWeather_withLags.zipcode

In [None]:
changes.zipcode = changes.zipcode.astype('int64')

In [None]:
# changes['zipcode']  = changes['zipcode'].astype('int64')
changesWithWeather = changes.merge(allWeather_withLags).merge(allWeather_byInd_withLags).merge(averages).merge(g)
print(changes.shape,changesWithWeather.shape)

In [None]:
changesWithWeather.to_csv("../../data/companyData/cstatWithWeather.csv")

Merge weather to the ig-cstat database.

In [None]:
# igChanges was deleted (del igChanges) once the stock controls were built, so
# reload it from the CSV it was saved to before merging in the weather tables.
igChanges = pd.read_csv("../../data/companyData/igData.csv").\
    drop(columns = {'Unnamed: 0'})

# Default (inner) merges on the shared columns of each pair of frames.
igChangesWithWeather = igChanges.merge(allWeather_withLags).merge(allWeather_byInd_withLags).merge(averages).merge(g)
igChangesWithWeather.shape

In [None]:
igChangesWithWeather.to_csv("../../data/companyData/igWithWeather.csv")

In [None]:
igChangesWithWeather.head()

In [None]:
igChangesWithWeather['temp_zipquant_0.95'].describe()

In [None]:
igChangesWithWeather['temp5Days_ffquant_0.95'].describe()

# Indirect
Introduce the SC Data.

In [None]:
# this does a little bit of a test on the reporting requirements. 
# number 

'''c_linksTest = pd.read_csv("../../data/companyData/compustatSCLinked.csv")[['srcdate','gvkey','cgvkey']]
c_linksTest['year'] = c_linksTest.srcdate.astype('str').str.slice(0,4).astype('int64')

bs = c_linksTest[c_linksTest.year < 2014]
print("Customers per supplier, 1978-2013 Pd: ", len(bs.cgvkey.unique())/len(bs.gvkey.unique()))

bs2 = c_linksTest[c_linksTest.year > 2010]
print("Customers per supplier, Recent Pd: ", len(bs2.cgvkey.unique())/len(bs2.gvkey.unique()))'''


In [40]:
c_links = pd.read_csv("../../data/companyData/compustatSCLinked.csv") # pd.read_csv("../../data/companyData/compustatSCLinked.csv")

c_links['year'] = c_links.srcdate.astype('str').str.slice(0,4).astype('int64')

c_links = c_links[c_links.year > 1999][['year','gvkey','cgvkey','salecs']].\
    rename(columns = {'cgvkey': 'customer_gvkey','gvkey': 'supplier_gvkey'})

c_links['year'] = pd.to_datetime(c_links.year, format = '%Y')


c_links.head()

Unnamed: 0,year,supplier_gvkey,customer_gvkey,salecs
70,2002-01-01,1013,2136,111.056
71,2004-01-01,1013,2136,104.312
72,2005-01-01,1013,2136,146.0
73,2006-01-01,1013,2136,205.0
74,2007-01-01,1013,2136,236.0


In [41]:
supplierCombos = c_links[['supplier_gvkey', 'customer_gvkey']].drop_duplicates().reset_index(drop = True)

print(supplierCombos.shape)

supplierCombos.head()

(16812, 2)


Unnamed: 0,supplier_gvkey,customer_gvkey
0,1013,2136
1,1013,9899
2,1021,61494
3,1021,25880
4,1048,11552


We'll follow Barrot and Sauvagnat in assuming that a supplier relationship holds in every year between the first and last year in which a customer is reported. This takes a little bit of work. The approach:
- subset dataframe to a specific supplier-customer pair
- fill in data for every year that's missing

Then, apply this row-wise to all rows of the unique supplierCombos df above using: https://stackoverflow.com/questions/61942138/apply-function-row-wise-to-pandas-dataframe

In [42]:
def fillYear(supplier, customer, scData = None):
    """Return one row per year for a supplier-customer pair, gaps forward-filled.

    Parameters
    ----------
    supplier, customer
        Values matched against scData.supplier_gvkey / scData.customer_gvkey.
    scData : DataFrame, optional
        Link table with columns year (datetime64), supplier_gvkey,
        customer_gvkey and salecs. Defaults to the notebook-level `c_links`,
        looked up when the function is called (a `scData = c_links` default is
        frozen at definition time and goes stale if c_links is rebuilt).

    Returns
    -------
    DataFrame
        Columns year, supplier_gvkey, customer_gvkey, salecs with one row for
        every year start between the pair's first and last reported year.
        Years missing from the input carry the previous year's values. Missing
        salecs are returned as the sentinel -5 (the caller converts them back
        to NaN). An empty frame is returned when the pair does not occur.
    """
    if scData is None:
        scData = c_links

    pair = scData[(scData.supplier_gvkey == supplier) &
                  (scData.customer_gvkey == customer)].reset_index(drop = True)

    # Nothing to fill; without this guard min()/max() are NaT and date_range raises.
    if pair.empty:
        return pair

    # A pair can be reported several times in one year, some with missing sales.
    # Replace NaN by a negative sentinel so idxmax keeps a reported value when
    # one exists, and otherwise keeps a sentinel row.
    pair['salecs'] = pair['salecs'].fillna(-5)
    keep = pair.groupby(['year','supplier_gvkey', 'customer_gvkey'])['salecs'].idxmax()
    pair = pair.loc[keep]

    # Span of the relationship.
    first = pair.year.min()
    last  = pair.year.max()

    # Reindex onto every year start in the span (inserting all-NaN rows for the
    # missing years) and carry the last observed values forward.
    filled = pair.set_index('year').\
        reindex(pd.date_range(first, last, freq = 'YS')).\
        ffill().\
        reset_index().rename(columns = {'index': 'year'})

    return(filled)

Show that this works for one of the supplier rows.

In [43]:
fillYear(supplierCombos.supplier_gvkey[0], supplierCombos.customer_gvkey[0])

Unnamed: 0,year,supplier_gvkey,customer_gvkey,salecs
0,2002-01-01,1013.0,2136.0,111.056
1,2003-01-01,1013.0,2136.0,111.056
2,2004-01-01,1013.0,2136.0,104.312
3,2005-01-01,1013.0,2136.0,146.0
4,2006-01-01,1013.0,2136.0,205.0
5,2007-01-01,1013.0,2136.0,236.0
6,2008-01-01,1013.0,2136.0,240.0
7,2009-01-01,1013.0,2136.0,176.0
8,2010-01-01,1013.0,2136.0,146.0


Now do it for all rows.

In [44]:
start = time.time()
print(c_links.shape)
c_linksImpd_list = supplierCombos.apply(lambda row: fillYear(row['supplier_gvkey'], row['customer_gvkey']), axis = 1)
c_linksImpd_df   = pd.concat(list(c_linksImpd_list))
print(c_linksImpd_df.shape)
print(time.time() - start)

c_linksImpd_df['year'] = c_linksImpd_df.year.dt.year

(65270, 4)
(68771, 4)
74.10897207260132


We had converted some of the na sales values to -5 so that we could deal with duplicated values, by choosing the larger of said values. Switch back to nan so that we are not thrown off when we look for biggest supplier.

In [45]:
c_linksImpd_df.loc[c_linksImpd_df.salecs == -5, 'salecs'] = float('nan')

In [46]:
c_linksImpd_df.salecs

0    111.056
1    111.056
2    104.312
3    146.000
4    205.000
      ...   
1    179.284
0     34.418
0     25.334
0    283.318
1    316.116
Name: salecs, Length: 68771, dtype: float64

In [47]:
industries = pd.read_csv("../../data/companyData/gvkeyIndustries.csv").drop(columns = {'Unnamed: 0'})
print(industries)

        gvkey  famafrench
0        1004        42.0
1        1010        26.0
2        1013        37.0
3        1019        35.0
4        1021        12.0
...       ...         ...
25237  345920        21.0
25238  345980        44.0
25239  347085        35.0
25240  351491        24.0
25241  351590        24.0

[25242 rows x 2 columns]


In [48]:
# Attach the Fama-French industry of both the customer and the supplier to each
# year-filled supplier-customer link, then save the result.
c_links = c_linksImpd_df.copy()

print(c_links.shape)

print(c_links.head())

# Relabel `industries` in place so its key matches customer_gvkey. Note this
# mutates the shared frame: after this cell it carries the supplier_* names.
industries.columns = ['customer_gvkey','customer_famafrench']

# Default (inner) merge on the only shared column, customer_gvkey: links whose
# customer has no row in `industries` are dropped.
c_links = c_links.merge(industries)

# Same again for the supplier side (shared column: supplier_gvkey).
industries.columns = ['supplier_gvkey','supplier_famafrench']

c_links = c_links.merge(industries)
print(c_links.head(), c_links.shape)


# Written with the default index, so the file gains an 'Unnamed: 0' column on re-read.
c_links.to_csv("../../data/companyData/c_links.csv")


(68771, 4)
   year  supplier_gvkey  customer_gvkey   salecs
0  2002          1013.0          2136.0  111.056
1  2003          1013.0          2136.0  111.056
2  2004          1013.0          2136.0  104.312
3  2005          1013.0          2136.0  146.000
4  2006          1013.0          2136.0  205.000
   year  supplier_gvkey  customer_gvkey   salecs  customer_famafrench  \
0  2002          1013.0          2136.0  111.056                 33.0   
1  2003          1013.0          2136.0  111.056                 33.0   
2  2004          1013.0          2136.0  104.312                 33.0   
3  2005          1013.0          2136.0  146.000                 33.0   
4  2006          1013.0          2136.0  205.000                 33.0   

   supplier_famafrench  
0                 37.0  
1                 37.0  
2                 37.0  
3                 37.0  
4                 37.0   (66561, 6)


Now see if it's common to have one in and one out of the industries of interest. 

For now, let's keep all the different industry types.

We can always filter later if we need to.

In [49]:
#########################
# get data and reset columns 
gvKey_abiLinkingTable = pd.read_csv('../../data/companyData/linkingTable.csv').drop(columns = ['Unnamed: 0'])

base_columns = gvKey_abiLinkingTable.columns 
customer_columns = "customer_" + base_columns
supplier_columns = "supplier_" + base_columns


'''# merge in customer information
gvKey_abiLinkingTable.columns = customer_columns

print(c_links.shape)
c_linksMerge1 = c_links.merge(gvKey_abiLinkingTable, on ='customer_gvkey')
print(c_links.shape,c_linksMerge1.shape)'''



#########################
# and merge in supplier 
gvKey_abiLinkingTable.columns = supplier_columns

print(c_links.shape)
c_linksMerge2 = c_links.merge(gvKey_abiLinkingTable, on ='supplier_gvkey')
print(c_linksMerge2.shape)

c_linksMerge2.to_csv("../../data/companyData/clinks_IG_selected.csv")

(66561, 6)
(46937, 10)


This is probably because: (1) companies are not in North America, or (2) companies are not in the physical goods industries we're interested in. We can verify this though: look at c_links where both the customer and supplier are in the dataset of interest.

In [50]:
chq     = pd.read_csv("../../data/chq.csv",dtype={'cstatZipcode': 'object'}).drop(columns = {'Unnamed: 0'})

c_linkTest = c_links[c_links.customer_gvkey.isin(chq.gvkey.unique()) & \
                     c_links.supplier_gvkey.isin(chq.gvkey.unique())]

print("Percent of firms with a match: ", c_linksMerge2.shape[0]/c_linkTest.shape[0])

Percent of firms with a match:  0.9528614060374754


In [51]:
c_linksMerge2.head()

Unnamed: 0,year,supplier_gvkey,customer_gvkey,salecs,customer_famafrench,supplier_famafrench,supplier_cstatCompanies,supplier_igCompanies,supplier_delete,supplier_abi
0,2002,1013.0,2136.0,111.056,33.0,37.0,adc telecommunications,adc telecommunications,,7523129
1,2003,1013.0,2136.0,111.056,33.0,37.0,adc telecommunications,adc telecommunications,,7523129
2,2004,1013.0,2136.0,104.312,33.0,37.0,adc telecommunications,adc telecommunications,,7523129
3,2005,1013.0,2136.0,146.0,33.0,37.0,adc telecommunications,adc telecommunications,,7523129
4,2006,1013.0,2136.0,205.0,33.0,37.0,adc telecommunications,adc telecommunications,,7523129


It's entirely possible that we have too small of a sample from the 2010s alone. Let's just try it though and see how it goes.

First, make a sample that includes each company in the year before and the year after each year in which it reports a customer.

In [52]:
def makeOneEitherSide(df, nYears=1):
    """Return copies of ``df`` with the ``year`` column shifted forward and backward.

    For every offset ``k`` in ``1..nYears`` the frame is copied twice, once with
    ``year + k`` and once with ``year - k``. The copies are stacked in the order
    ``+1, ..., +nYears, -1, ..., -nYears``. The unshifted rows are NOT included,
    and the original row index is kept on every copy (so index labels repeat).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``year`` column. It is not modified.
    nYears : int, default 1
        How many years on either side to generate. The default reproduces the
        original one-year-either-side behaviour.

    Returns
    -------
    pandas.DataFrame
        ``2 * nYears * len(df)`` rows with the same columns as ``df``.

    Raises
    ------
    ValueError
        If ``nYears`` is smaller than 1.
    """
    if nYears < 1:
        raise ValueError("nYears must be at least 1")

    shiftedFrames = []
    # Forward shifts first, then backward shifts, matching the original concat order.
    for direction in (1, -1):
        for offset in range(1, nYears + 1):
            shifted = df.copy()
            shifted['year'] = shifted['year'] + direction * offset
            shiftedFrames.append(shifted)

    return pd.concat(shiftedFrames)

In [53]:
scTableSuppliers.head()

Unnamed: 0,year,supplier_gvkey,supplier_abi
0,2002,1013.0,7523129
1,2003,1013.0,7523129
2,2004,1013.0,7523129
3,2005,1013.0,7523129
4,2006,1013.0,7523129


In [54]:
c_linksMerge2.columns

Index(['year', 'supplier_gvkey', 'customer_gvkey', 'salecs',
       'customer_famafrench', 'supplier_famafrench', 'supplier_cstatCompanies',
       'supplier_igCompanies', 'supplier_delete', 'supplier_abi'],
      dtype='object')

In [56]:
# One row per distinct (year, supplier gvkey, supplier abi, supplier famafrench) combination.
supplierCols = ['year', 'supplier_gvkey', 'supplier_abi', 'supplier_famafrench']
scTableSuppliers = c_linksMerge2[supplierCols].drop_duplicates()
print(scTableSuppliers.shape)

# Same table with the "supplier_" prefix stripped from the column names.
allSupplierData = scTableSuppliers.rename(columns={'supplier_gvkey': 'gvkey',
                                                   'supplier_abi': 'abi',
                                                   'supplier_famafrench': 'famafrench'})

# Distinct ABI identifiers, used below to filter the location table.
allAbi = allSupplierData.abi.drop_duplicates()

# Location tables; the saved index column is dropped on load.
hqsOnly = pd.read_csv("../../data/ig_uniqueHQs.csv").drop(columns={'Unnamed: 0'})

hq = pd.read_csv("../../data/ig_uniqueHQs_multLocations.csv")
hq = hq.drop(columns={'Unnamed: 0'}).rename(columns={'archive_version_year': 'year'})
hq['year'] = hq.year.astype('int64')

# Only the locations whose abi belongs to one of the suppliers above.
hqRelevant = hq[hq.abi.isin(allAbi)]

# Inner join on the columns the two frames share, then remove exact duplicates.
allSupplierData = allSupplierData.merge(hqRelevant).drop_duplicates()
print(allSupplierData.head(), allSupplierData.shape)

allSupplierData.to_csv("../../data/companyData/allSupplierData.csv")

(19295, 4)
   year   gvkey      abi  famafrench ticker                     company state  \
0  2003  1013.0  7523129        37.0    NaN  ADC TELECOMMUNICATIONS INC    MN   
1  2004  1013.0  7523129        37.0    NaN  ADC TELECOMMUNICATIONS INC    MN   
2  2005  1013.0  7523129        37.0    NaN  ADC TELECOMMUNICATIONS INC    MN   
3  2006  1013.0  7523129        37.0    NaN  ADC TELECOMMUNICATIONS INC    MN   
4  2007  1013.0  7523129        37.0    NaN  ADC TELECOMMUNICATIONS INC    MN   

           city       address_line_1  zipcode  latitude  longitude  \
0  EDEN PRAIRIE  13625 TECHNOLOGY DR    55344  44.85645  -93.45199   
1  EDEN PRAIRIE  13625 TECHNOLOGY DR    55344  44.85645  -93.45199   
2  EDEN PRAIRIE  13625 TECHNOLOGY DR    55344  44.85645  -93.45199   
3  EDEN PRAIRIE  13625 TECHNOLOGY DR    55344  44.85645  -93.45199   
4  EDEN PRAIRIE  13625 TECHNOLOGY DR    55344  44.85645  -93.45199   

   parent_employee_size_code  location_employee_size_code  employeesAtLocation  


Previously we had done this with the customers as well, but we lose some percentage of the observations if we again try to match on the IG data, so for now just focus on the suppliers.

In [None]:
'''scTableCustomers = c_linksMerge2.copy()[['year','customer_gvkey']].drop_duplicates()
allCustomerData = makeOneEitherSide(scTableCustomers)
allCustomerData.columns = ['year','gvkey','abi']'''

# allCustomerData = allCustomerData.merge(hqRelevant).drop_duplicates()
# allCustomerData.to_csv("../../data/companyData/allCustomerData.csv")

## Find Customer and Supplier pairings and merge with change data
### Can pick up here

In [58]:
# Reload the saved supplier table, drop the saved index column and keep four columns.
supplierKeyCols = ['gvkey', 'famafrench', 'year', 'zipcode']
allSupplierData = pd.read_csv("../../data/companyData/allSupplierData.csv")
allSupplierData = allSupplierData.drop(columns=['Unnamed: 0'])[supplierKeyCols]
print(allSupplierData.shape, allSupplierData.columns)
'''changes = pd.read_csv("../../data/companyData/compustatChanges_withControls.csv").drop(columns = ['Unnamed: 0'])
print(changes.columns)
suppliers = changes.merge(allSupplierData[['year','gvkey','zipcode']])
print(suppliers.shape)'''

'''
allSupplierData[['year','gvkey','zipcode']]
customers = changes.merge(allCustomerData[['year','gvkey','zipcode','employeesAtLocation']])
print(customers.head())
'''

(15261, 4) Index(['gvkey', 'famafrench', 'year', 'zipcode'], dtype='object')


"\nallSupplierData[['year','gvkey','zipcode']]\ncustomers = changes.merge(allCustomerData[['year','gvkey','zipcode','employeesAtLocation']])\nprint(customers.head())\n"

In [59]:
allSupplierData.head()

Unnamed: 0,gvkey,famafrench,year,zipcode
0,1013.0,37.0,2003,55344
1,1013.0,37.0,2004,55344
2,1013.0,37.0,2005,55344
3,1013.0,37.0,2006,55344
4,1013.0,37.0,2007,55344


## Get first-hop SC data

In [60]:
c_links = pd.read_csv("../../data/companyData/clinks_IG_selected.csv").drop(columns = {'Unnamed: 0'})
c_links.head()

Unnamed: 0,year,supplier_gvkey,customer_gvkey,salecs,customer_famafrench,supplier_famafrench,supplier_cstatCompanies,supplier_igCompanies,supplier_delete,supplier_abi
0,2002,1013.0,2136.0,111.056,33.0,37.0,adc telecommunications,adc telecommunications,,7523129
1,2003,1013.0,2136.0,111.056,33.0,37.0,adc telecommunications,adc telecommunications,,7523129
2,2004,1013.0,2136.0,104.312,33.0,37.0,adc telecommunications,adc telecommunications,,7523129
3,2005,1013.0,2136.0,146.0,33.0,37.0,adc telecommunications,adc telecommunications,,7523129
4,2006,1013.0,2136.0,205.0,33.0,37.0,adc telecommunications,adc telecommunications,,7523129


In [61]:
# Give every link row a count of 1 so that summing it yields the number of link rows.
c_links['suppliers'] = 1

# Per (year, customer): summed salecs (renamed totalExp) and number of supplier link rows.
linkTotals = c_links[['year', 'customer_gvkey', 'salecs', 'suppliers']]
custExp = linkTotals.groupby(['year', 'customer_gvkey'], as_index=False).sum()
custExp = custExp.rename(columns={'salecs': 'totalExp'})

custExp.head()



Unnamed: 0,year,customer_gvkey,totalExp,suppliers
0,2000,1038.0,38.22,2
1,2000,1045.0,38.093,4
2,2000,1078.0,5.07,2
3,2000,1095.0,4.79,1
4,2000,1121.0,7.883,1


In [62]:
# Customer-year rows whose summed sales are exactly zero.
noExpInfo = custExp[custExp.totalExp == 0]

print("Number of firms with no exp information and multiple suppliers: ",
      (noExpInfo.suppliers > 1).sum())
print("Number of firms with no exp information and >5 suppliers: ",
      (noExpInfo.suppliers > 5).sum())


Number of firms with no exp information and multiple suppliers:  312
Number of firms with no exp information and >5 suppliers:  3


Most of these firms have expenditure information. We can look at:

- Expenditure-weighted (just do equal shares if no exp information)
- Largest supplier
    
    
Our focus is going to be on the economic data of the customers, so isolate for the customers here.

In [63]:
# Attach the per-(year, customer) totals from custExp to every individual supplier link
# (inner join on the columns the two frames share), rename the customer key to `gvkey`
# and drop exact duplicate rows.
linkCols = ['year', 'customer_gvkey', 'supplier_gvkey', 'salecs']
customerDB = (
    c_links[linkCols]
    .merge(custExp)
    .rename(columns={'customer_gvkey': 'gvkey'})
    .drop_duplicates()
)
print(customerDB.shape)

customerDB.head()

(46293, 6)


Unnamed: 0,year,gvkey,supplier_gvkey,salecs,totalExp,suppliers
0,2002,2136.0,1013.0,111.056,892.202,13
1,2002,2136.0,3275.0,8.398,892.202,13
2,2002,2136.0,10286.0,16.987,892.202,13
3,2002,2136.0,10420.0,229.158,892.202,13
4,2002,2136.0,14340.0,9.432,892.202,13


## Merge in supplier weather
Get the weather data.

In [37]:
# dtype maps: the year / qtr / zipcode columns are cast to pandas categoricals after loading.
zipQtrCategories = {'qtr': 'category', 'zipcode': 'category'}
yearQtrZipCategories = {'year': 'category', 'qtr': 'category', 'zipcode': 'category'}

# Weather by establishment.
g = pd.read_csv("../../data/companyData/weatherByEstablishment.csv")
g = g.drop(columns={"Unnamed: 0"})

# Weather with lags.
allWeather_withLags = pd.read_csv("../../data/companyData/allWeather_withLags.csv")
allWeather_withLags = allWeather_withLags.drop(columns={"Unnamed: 0", 'yearQtr'})
allWeather_withLags = allWeather_withLags.astype(yearQtrZipCategories)

# Quarterly statistics by zip code.
averages = pd.read_csv("../../data/companyData/quarterlyStatsByZip.csv")
averages = averages.drop(columns={"Unnamed: 0"}).rename(columns={'ZIP': 'zipcode'})
# The character at index 1 of `quarter` is parsed as a float and stored as `qtr`.
averages['qtr'] = averages.quarter.str.slice(1, 2).astype('float')
averages = averages.drop(columns={'quarter'})
averages = averages.astype(zipQtrCategories)

# Weather by industry with lags.
allWeather_byInd_withLags = pd.read_csv("../../data/companyData/allWeather_byInd_withLags.csv")
allWeather_byInd_withLags = allWeather_byInd_withLags.drop(columns={"Unnamed: 0", 'yearQtr'})
allWeather_byInd_withLags = allWeather_byInd_withLags.astype(yearQtrZipCategories)

In [55]:
allWeather_byInd_withLags.columns

Index(['qtr', 'year', 'famafrench', 'zipcode', 'temp_ffquant_0.95',
       'temp_ffquant_1xQtr', 'temp_ffquant_1xYr', 'temp_ffquant_1x5Qtrs',
       'temp_ffquant_1x10Qtrs', 'temp_ffquant_1x5Yrs',
       ...
       'lag4_precip5Days_ffquant_1x10Qtrs', 'lag4_precip5Days_ffquant_1x5Yrs',
       'lag4_precip5Days_ffquant_1x10Yrs',
       'lag4_precip5Days_indQuarterquant_0.95',
       'lag4_precip5Days_indQuarterquant_1xQtr',
       'lag4_precip5Days_indQuarterquant_1xYr',
       'lag4_precip5Days_indQuarterquant_1x5Qtrs',
       'lag4_precip5Days_indQuarterquant_1x10Qtrs',
       'lag4_precip5Days_indQuarterquant_1x5Yrs',
       'lag4_precip5Days_indQuarterquant_1x10Yrs'],
      dtype='object', length=284)

In [71]:
print(allSupplierData.merge(g).shape, 
      allSupplierData.merge(allWeather_withLags).shape, 
      allSupplierData.merge(averages).shape,
      allSupplierData.merge(allWeather_byInd_withLags).shape)

(34848, 95) (34908, 425) (60264, 11) (34460, 285)


In [89]:
allSupplierData.head()

Unnamed: 0,gvkey,famafrench,year,zipcode
0,1013.0,37.0,2003,55344
1,1013.0,37.0,2004,55344
2,1013.0,37.0,2005,55344
3,1013.0,37.0,2006,55344
4,1013.0,37.0,2007,55344


In [90]:
allWeather_withLags.head()

Unnamed: 0,zipcode,year,qtr,precip_annualquant_0.95,precip_annualquant_1xQtr,precip_annualquant_1xYr,precip_annualquant_1x5Qtrs,precip_annualquant_1x10Qtrs,precip_annualquant_1x5Yrs,precip_annualquant_1x10Yrs,...,lag4_temp5Days_zipquant_1x10Qtrs,lag4_temp5Days_zipquant_1x5Yrs,lag4_temp5Days_zipquant_1x10Yrs,lag4_temp5Days_zipQuarterquant_0.95,lag4_temp5Days_zipQuarterquant_1xQtr,lag4_temp5Days_zipQuarterquant_1xYr,lag4_temp5Days_zipQuarterquant_1x5Qtrs,lag4_temp5Days_zipQuarterquant_1x10Qtrs,lag4_temp5Days_zipQuarterquant_1x5Yrs,lag4_temp5Days_zipQuarterquant_1x10Yrs
0,1001,2010,1.0,10,9,5,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
1,1001,2010,2.0,8,2,1,0,0,0,0,...,0,0,0,2,1,0,0,0,0,0
2,1001,2010,3.0,6,3,1,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0
3,1001,2010,4.0,11,8,6,1,0,0,0,...,0,0,0,3,1,0,0,0,0,0
4,1001,2011,1.0,12,6,1,1,1,1,1,...,0,0,0,3,2,0,0,0,0,0


In [92]:
allSupplierData[allSupplierData.year > 2009].shape

(9064, 4)

In [86]:
# Cast the weather table's zipcode column to int64.
# NOTE(review): this overwrites allWeather_withLags in place, so every cell run
# afterwards sees the int64 column rather than the dtype set when the table was loaded.
allWeather_withLags['zipcode'] = allWeather_withLags.zipcode.astype('int64')
# Shape of the inner join (on the columns both frames share) with the supplier table.
allSupplierData.merge(allWeather_withLags).shape

(34908, 425)

In [93]:
# Keep supplier rows with year > 2009, then attach each weather table in turn.
# Every step is an inner join on whatever columns the two frames share.
recentSuppliers = allSupplierData[allSupplierData.year > 2009]
suppliersWithWeather = recentSuppliers
for weatherTable in (allWeather_withLags, averages, allWeather_byInd_withLags, g):
    suppliersWithWeather = suppliersWithWeather.merge(weatherTable)

print("supplier: ", suppliersWithWeather.shape, suppliersWithWeather.head())

suppliersWithWeather.to_csv("../../data/companyData/suppliersWithWeather.csv")
# customersWithWeather.to_csv("../../data/companyData/customersWithWeather.csv")

'''customersWithWeather = customers.merge(allWeather_withLags).merge(averages).merge(allWeather_byInd_withLags).merge(g)
print("customers: ", customersWithWeather.shape)'''

'''suppliersWithWeather = pd.read_csv("../../data/companyData/suppliersWithWeather.csv").drop(columns = {'Unnamed: 0'})
customersWithWeather = pd.read_csv("../../data/companyData/customersWithWeather.csv").drop(columns = {'Unnamed: 0'})'''

''' frames = [customersWithWeather, suppliersWithWeather]

allCompanies = pd.concat(frames).drop_duplicates()

print(allCompanies.shape)

allCompanies.to_csv("../../data/companyData/allCompaniesWithWeather.csv") '''

supplier:  (33140, 801)       gvkey  famafrench  year zipcode  qtr  precip_annualquant_0.95  \
0    1013.0        37.0  2010   55344  1.0                        2   
1  113362.0        42.0  2010   55344  1.0                        2   
2   66588.0        48.0  2010   55344  1.0                        2   
3  113362.0        42.0  2011   55344  1.0                        3   
4   66588.0        48.0  2011   55344  1.0                        3   

   precip_annualquant_1xQtr  precip_annualquant_1xYr  \
0                         0                        0   
1                         0                        0   
2                         0                        0   
3                         2                        0   
4                         2                        0   

   precip_annualquant_1x5Qtrs  precip_annualquant_1x10Qtrs  ...  \
0                           0                            0  ...   
1                           0                            0  ...   
2          

' frames = [customersWithWeather, suppliersWithWeather]\n\nallCompanies = pd.concat(frames).drop_duplicates()\n\nprint(allCompanies.shape)\n\nallCompanies.to_csv("../../data/companyData/allCompaniesWithWeather.csv") '

## Biggest Supplier
Focus on weather of biggest supplier.

First find, for each customer and year, the supplier link with the largest sales (`salecs`). Then add back in any rows with only 1 supplier.

In [95]:
customerDB.shape

(46293, 6)

In [96]:
# For every (year, customer gvkey) group keep the link row(s) whose salecs equals the
# group maximum. Ties keep every tied row; rows with missing salecs never compare equal,
# so they are not selected here.
# https://stackoverflow.com/questions/15705630/get-the-rows-which-have-the-max-value-in-groups-using-groupby
# 'max' is passed by name: handing the builtin `max` to transform is deprecated in recent pandas.
groupMaxSales = customerDB.groupby(['year','gvkey']).salecs.transform('max')
idx = groupMaxSales == customerDB.salecs
largestSuppliers = customerDB[idx].reset_index(drop = True)
print(largestSuppliers.shape)

# find companies who only have one other supplier
singleSuppliers = customerDB[customerDB.suppliers == 1].reset_index(drop = True)
print(singleSuppliers.shape)

# find largest suppliers of different companies
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0; the result is the same)
largestSuppliers = pd.concat([largestSuppliers, singleSuppliers]).drop_duplicates()
print(largestSuppliers.shape)



(12142, 6)
(8370, 6)
(14549, 6)


In [105]:
largestSuppliers.head()

Unnamed: 0,year,gvkey,supplier_gvkey,salecs,totalExp,suppliers
0,2002,2136.0,10420.0,229.158,892.202,13
1,2003,2136.0,13440.0,214.0,1100.784,14
2,2004,2136.0,10420.0,332.586,1445.879,21
3,2005,2136.0,10420.0,508.518,2251.035,32
4,2006,2136.0,10420.0,551.124,2563.791,31


Merge in the change data for that gvkey.

In [99]:
for column in suppliersWithWeather.columns:
    print(column)

gvkey
famafrench
year
zipcode
qtr
precip_annualquant_0.95
precip_annualquant_1xQtr
precip_annualquant_1xYr
precip_annualquant_1x5Qtrs
precip_annualquant_1x10Qtrs
precip_annualquant_1x5Yrs
precip_annualquant_1x10Yrs
precip_zipquant_0.95
precip_zipquant_1xQtr
precip_zipquant_1xYr
precip_zipquant_1x5Qtrs
precip_zipquant_1x10Qtrs
precip_zipquant_1x5Yrs
precip_zipquant_1x10Yrs
precip_zipQuarterquant_0.95
precip_zipQuarterquant_1xQtr
precip_zipQuarterquant_1xYr
precip_zipQuarterquant_1x5Qtrs
precip_zipQuarterquant_1x10Qtrs
precip_zipQuarterquant_1x5Yrs
precip_zipQuarterquant_1x10Yrs
temp_annualquant_0.95
temp_annualquant_1xQtr
temp_annualquant_1xYr
temp_annualquant_1x5Qtrs
temp_annualquant_1x10Qtrs
temp_annualquant_1x5Yrs
temp_annualquant_1x10Yrs
temp_zipquant_0.95
temp_zipquant_1xQtr
temp_zipquant_1xYr
temp_zipquant_1x5Qtrs
temp_zipquant_1x10Qtrs
temp_zipquant_1x5Yrs
temp_zipquant_1x10Yrs
temp_zipQuarterquant_0.95
temp_zipQuarterquant_1xQtr
temp_zipQuarterquant_1xYr
temp_zipQuarterquant_1x5

In [103]:
# Select the columns of suppliersWithWeather whose name contains one of the key tokens
# (or an underscore), except those containing 'roa_lagged' or 'yearQtr'.
# Plain boolean logic (any / and / not) replaces the bitwise |, & and ~ on Python bools:
# ~True is -2 and ~False is -1, so the old expression only gave the right answer by
# integer accident, and `~` on bool is deprecated in recent Python versions.
keyTokens = ('year', 'qtr', 'gvkey', 'famafrench', '_')
excludedTokens = ('roa_lagged', 'yearQtr')
relevantVars = [x for x in suppliersWithWeather.columns
                if any(token in x for token in keyTokens)
                and not any(token in x for token in excludedTokens)]
relevantVars

['gvkey',
 'famafrench',
 'year',
 'qtr',
 'precip_annualquant_0.95',
 'precip_annualquant_1xQtr',
 'precip_annualquant_1xYr',
 'precip_annualquant_1x5Qtrs',
 'precip_annualquant_1x10Qtrs',
 'precip_annualquant_1x5Yrs',
 'precip_annualquant_1x10Yrs',
 'precip_zipquant_0.95',
 'precip_zipquant_1xQtr',
 'precip_zipquant_1xYr',
 'precip_zipquant_1x5Qtrs',
 'precip_zipquant_1x10Qtrs',
 'precip_zipquant_1x5Yrs',
 'precip_zipquant_1x10Yrs',
 'precip_zipQuarterquant_0.95',
 'precip_zipQuarterquant_1xQtr',
 'precip_zipQuarterquant_1xYr',
 'precip_zipQuarterquant_1x5Qtrs',
 'precip_zipQuarterquant_1x10Qtrs',
 'precip_zipQuarterquant_1x5Yrs',
 'precip_zipQuarterquant_1x10Yrs',
 'temp_annualquant_0.95',
 'temp_annualquant_1xQtr',
 'temp_annualquant_1xYr',
 'temp_annualquant_1x5Qtrs',
 'temp_annualquant_1x10Qtrs',
 'temp_annualquant_1x5Yrs',
 'temp_annualquant_1x10Yrs',
 'temp_zipquant_0.95',
 'temp_zipquant_1xQtr',
 'temp_zipquant_1xYr',
 'temp_zipquant_1x5Qtrs',
 'temp_zipquant_1x10Qtrs',
 'temp

In [104]:
# Select the columns of suppliersWithWeather whose name contains one of the key tokens
# (or an underscore), except those containing 'roa_lagged' or 'yearQtr'.
# Plain boolean logic replaces the bitwise |, & and ~ on Python bools, which only worked
# by integer accident (~True == -2, ~False == -1) and is deprecated in recent Python.
keyTokens = ('year', 'qtr', 'gvkey', 'famafrench', '_')
excludedTokens = ('roa_lagged', 'yearQtr')
relevantVars = [x for x in suppliersWithWeather.columns
                if any(token in x for token in keyTokens)
                and not any(token in x for token in excludedTokens)]

# Prefix every selected column except `year` and `qtr` with "supplier_" (so `gvkey`
# becomes `supplier_gvkey` and `famafrench` becomes `supplier_famafrench`).
# One non-inplace rename keyed by column NAME replaces the former per-column
# inplace rename on a DataFrame slice, which raised SettingWithCopyWarning and
# relied on the key columns occupying the first four positions.
sharedKeys = ('year', 'qtr')
supplierNames = {col: 'supplier_' + col for col in relevantVars if col not in sharedKeys}
suppliers_toMerge = suppliersWithWeather[relevantVars].rename(columns = supplierNames)

print(suppliers_toMerge.columns)


'''suppliers_toMerge = suppliersWithWeather[['year','qtr','gvkey','tmax_quant_1.0','precip_quant_1.0']].\
    rename(columns = {'gvkey': 'supplier_gvkey',
                      'tmax_quant_1.0': 'supplier_tmax_quant_1.0',
                      'precip_quant_1.0': 'supplier_precip_quant_1.0'})'''

Index(['supplier_gvkey', 'supplier_famafrench', 'year', 'qtr',
       'supplier_precip_annualquant_0.95', 'supplier_precip_annualquant_1xQtr',
       'supplier_precip_annualquant_1xYr',
       'supplier_precip_annualquant_1x5Qtrs',
       'supplier_precip_annualquant_1x10Qtrs',
       'supplier_precip_annualquant_1x5Yrs',
       ...
       'supplier_empWt_lag4_precip_zipQuarterquant_1x5Yrs',
       'supplier_empWt_lag4_temp_annualquant_0.95',
       'supplier_empWt_lag4_temp_annualquant_1x5Qtrs',
       'supplier_empWt_lag4_temp_annualquant_1x5Yrs',
       'supplier_empWt_lag4_temp_zipquant_0.95',
       'supplier_empWt_lag4_temp_zipquant_1x5Qtrs',
       'supplier_empWt_lag4_temp_zipquant_1x5Yrs',
       'supplier_empWt_lag4_temp_zipQuarterquant_0.95',
       'supplier_empWt_lag4_temp_zipQuarterquant_1x5Qtrs',
       'supplier_empWt_lag4_temp_zipQuarterquant_1x5Yrs'],
      dtype='object', length=800)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


"suppliers_toMerge = suppliersWithWeather[['year','qtr','gvkey','tmax_quant_1.0','precip_quant_1.0']].    rename(columns = {'gvkey': 'supplier_gvkey',\n                      'tmax_quant_1.0': 'supplier_tmax_quant_1.0',\n                      'precip_quant_1.0': 'supplier_precip_quant_1.0'})"

In [110]:
# Reload the change data and drop the saved index column.
changes = pd.read_csv("../../data/companyData/compustatChanges_withControls.csv")
changes = changes.drop(columns=['Unnamed: 0'])
print(changes.columns)

# Inner join of the change data with the largest-supplier rows on the columns they share.
customers = pd.merge(changes, largestSuppliers)
print(customers.shape)

  exec(code_obj, self.user_global_ns, self.user_ns)


Index(['gvkey', 'datadate', 'year', 'qtr', 'companyName', 'curcdq', 'assets',
       'costGoodsSold', 'totalInv', 'netIncome', 'totalRevenue', 'cik',
       'costat', 'add1', 'add2', 'addzip', 'city', 'sic', 'state',
       'assetsLast', 'netIncomeLast', 'totalRevenueLast', 'costGoodsSoldLast',
       'totalInvLast', 'incomeChange', 'revenueChange', 'costChange',
       'inventoryChange', 'assetsPrev', 'assetsLagged', 'netIncomeLagged',
       'roa_lagged', 'famafrench', 'earliestYear', 'ageTercile', 'sizeTercile',
       'profitTercile', 'datacqtr', 'datafqtr', 'fyr', 'DATE'],
      dtype='object')
(40588, 45)


In [111]:
# Join the customers to the (year, gvkey, supplier_gvkey) link keys, then attach the
# supplier weather columns. Both joins are inner joins on the shared columns.
supplierLinkKeys = largestSuppliers[['year', 'gvkey', 'supplier_gvkey']]
customersWithLinks = customers.merge(supplierLinkKeys)
largestSuppliersWithWeather = customersWithLinks.merge(suppliers_toMerge)
largestSuppliersWithWeather.shape

(18599, 842)

In [112]:
for col in largestSuppliersWithWeather.columns:
    print(col)

gvkey
datadate
year
qtr
companyName
curcdq
assets
costGoodsSold
totalInv
netIncome
totalRevenue
cik
costat
add1
add2
addzip
city
sic
state
assetsLast
netIncomeLast
totalRevenueLast
costGoodsSoldLast
totalInvLast
incomeChange
revenueChange
costChange
inventoryChange
assetsPrev
assetsLagged
netIncomeLagged
roa_lagged
famafrench
earliestYear
ageTercile
sizeTercile
profitTercile
datacqtr
datafqtr
fyr
DATE
supplier_gvkey
salecs
totalExp
suppliers
supplier_famafrench
supplier_precip_annualquant_0.95
supplier_precip_annualquant_1xQtr
supplier_precip_annualquant_1xYr
supplier_precip_annualquant_1x5Qtrs
supplier_precip_annualquant_1x10Qtrs
supplier_precip_annualquant_1x5Yrs
supplier_precip_annualquant_1x10Yrs
supplier_precip_zipquant_0.95
supplier_precip_zipquant_1xQtr
supplier_precip_zipquant_1xYr
supplier_precip_zipquant_1x5Qtrs
supplier_precip_zipquant_1x10Qtrs
supplier_precip_zipquant_1x5Yrs
supplier_precip_zipquant_1x10Yrs
supplier_precip_zipQuarterquant_0.95
supplier_precip_zipQuarterquan

In [113]:
largestSuppliersWithWeather.head()

Unnamed: 0,gvkey,datadate,year,qtr,companyName,curcdq,assets,costGoodsSold,totalInv,netIncome,...,supplier_empWt_lag4_precip_zipQuarterquant_1x5Yrs,supplier_empWt_lag4_temp_annualquant_0.95,supplier_empWt_lag4_temp_annualquant_1x5Qtrs,supplier_empWt_lag4_temp_annualquant_1x5Yrs,supplier_empWt_lag4_temp_zipquant_0.95,supplier_empWt_lag4_temp_zipquant_1x5Qtrs,supplier_empWt_lag4_temp_zipquant_1x5Yrs,supplier_empWt_lag4_temp_zipQuarterquant_0.95,supplier_empWt_lag4_temp_zipQuarterquant_1x5Qtrs,supplier_empWt_lag4_temp_zipQuarterquant_1x5Yrs
0,1045,20100331,2010,1,AMERICAN AIRLINES GROUP INC,USD,25525.0,4360.0,552.0,-505.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,4.0,0.0
1,1045,20100630,2010,2,AMERICAN AIRLINES GROUP INC,USD,25885.0,4575.0,569.0,-11.0,...,0.0,0.0,0.0,0.0,4.0,2.0,0.0,4.0,2.0,0.0
2,1045,20100930,2010,3,AMERICAN AIRLINES GROUP INC,USD,25357.0,4567.0,575.0,143.0,...,0.0,0.0,0.0,0.0,13.0,3.0,0.0,1.0,0.0,0.0
3,1045,20101231,2010,4,AMERICAN AIRLINES GROUP INC,USD,25088.0,4538.0,594.0,-98.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0,0.0
4,1045,20140331,2014,1,AMERICAN AIRLINES GROUP INC,USD,43737.0,7382.0,1052.0,480.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.821918,1.780822,0.0


In [114]:
largestSuppliersWithWeather.to_csv("../../data/companyData/largestSuppliersWithWeather.csv")

In [115]:
for col in largestSuppliersWithWeather.columns:
    print(col)

gvkey
datadate
year
qtr
companyName
curcdq
assets
costGoodsSold
totalInv
netIncome
totalRevenue
cik
costat
add1
add2
addzip
city
sic
state
assetsLast
netIncomeLast
totalRevenueLast
costGoodsSoldLast
totalInvLast
incomeChange
revenueChange
costChange
inventoryChange
assetsPrev
assetsLagged
netIncomeLagged
roa_lagged
famafrench
earliestYear
ageTercile
sizeTercile
profitTercile
datacqtr
datafqtr
fyr
DATE
supplier_gvkey
salecs
totalExp
suppliers
supplier_famafrench
supplier_precip_annualquant_0.95
supplier_precip_annualquant_1xQtr
supplier_precip_annualquant_1xYr
supplier_precip_annualquant_1x5Qtrs
supplier_precip_annualquant_1x10Qtrs
supplier_precip_annualquant_1x5Yrs
supplier_precip_annualquant_1x10Yrs
supplier_precip_zipquant_0.95
supplier_precip_zipquant_1xQtr
supplier_precip_zipquant_1xYr
supplier_precip_zipquant_1x5Qtrs
supplier_precip_zipquant_1x10Qtrs
supplier_precip_zipquant_1x5Yrs
supplier_precip_zipquant_1x10Yrs
supplier_precip_zipQuarterquant_0.95
supplier_precip_zipQuarterquan

In [116]:
largestSuppliersWithWeather = pd.read_csv("../../data/companyData/largestSuppliersWithWeather.csv").\
    drop(columns = {'Unnamed: 0'})
print(largestSuppliersWithWeather.shape)
largestSuppliersWithWeather.head()

(18599, 842)


  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,gvkey,datadate,year,qtr,companyName,curcdq,assets,costGoodsSold,totalInv,netIncome,...,supplier_empWt_lag4_precip_zipQuarterquant_1x5Yrs,supplier_empWt_lag4_temp_annualquant_0.95,supplier_empWt_lag4_temp_annualquant_1x5Qtrs,supplier_empWt_lag4_temp_annualquant_1x5Yrs,supplier_empWt_lag4_temp_zipquant_0.95,supplier_empWt_lag4_temp_zipquant_1x5Qtrs,supplier_empWt_lag4_temp_zipquant_1x5Yrs,supplier_empWt_lag4_temp_zipQuarterquant_0.95,supplier_empWt_lag4_temp_zipQuarterquant_1x5Qtrs,supplier_empWt_lag4_temp_zipQuarterquant_1x5Yrs
0,1045,20100331,2010,1,AMERICAN AIRLINES GROUP INC,USD,25525.0,4360.0,552.0,-505.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,4.0,0.0
1,1045,20100630,2010,2,AMERICAN AIRLINES GROUP INC,USD,25885.0,4575.0,569.0,-11.0,...,0.0,0.0,0.0,0.0,4.0,2.0,0.0,4.0,2.0,0.0
2,1045,20100930,2010,3,AMERICAN AIRLINES GROUP INC,USD,25357.0,4567.0,575.0,143.0,...,0.0,0.0,0.0,0.0,13.0,3.0,0.0,1.0,0.0,0.0
3,1045,20101231,2010,4,AMERICAN AIRLINES GROUP INC,USD,25088.0,4538.0,594.0,-98.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0,0.0
4,1045,20140331,2014,1,AMERICAN AIRLINES GROUP INC,USD,43737.0,7382.0,1052.0,480.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.821918,1.780822,0.0


In [117]:
largestSuppliersWithWeather.supplier_famafrench.value_counts()

48.0    4684
35.0    2177
31.0    1539
13.0    1533
37.0    1401
24.0     639
42.0     616
21.0     547
36.0     476
32.0     434
17.0     383
18.0     356
19.0     335
38.0     307
12.0     304
33.0     244
22.0     228
44.0     204
41.0     203
30.0     192
25.0     192
47.0     163
14.0     144
39.0     144
40.0     140
28.0     124
11.0     115
26.0      88
6.0       86
7.0       84
27.0      80
15.0      69
2.0       60
23.0      56
46.0      52
16.0      40
45.0      32
8.0       28
29.0      23
3.0       20
5.0       20
34.0      16
1.0        8
10.0       8
20.0       5
Name: supplier_famafrench, dtype: int64

## Sales-Weighted Average
If a company doesn't have sales-specific information, then assume equal shares. This doesn't happen for too many of the companies, thankfully.

In [None]:
customerDB = c_links[['year','customer_gvkey','supplier_gvkey','salecs']].\
    merge(custExp).rename(columns = {'customer_gvkey': 'gvkey'}).drop_duplicates()

# Share of the customer's summed sales (totalExp) that this supplier link accounts for.
salesShare = customerDB.salecs/customerDB.totalExp

# Where no share can be computed (missing salecs, or 0/0 when the customer has no sales
# information at all), fall back to equal shares: 1 / number of supplier links for that
# customer-year. Previously every NaN in the whole frame was filled with 1, which gave
# each such link a full weight of 1 and also overwrote missing salecs / key values.
# NOTE(review): for a customer-year with sales figures on only some links, the links
# without a figure also get 1/suppliers, so the weights can sum to more than 1 — confirm
# this is the intended treatment.
equalShare = 1/customerDB.suppliers
customerDB = customerDB.assign(salesWeight = salesShare.fillna(equalShare))

Now merge this with the supplier weather data, and use the sales weights to find a sales-weighted average of the weather conditions for the suppliers.

In [None]:
# Select the columns of suppliersWithWeather whose name contains 'year', 'qtr', 'gvkey'
# or an underscore, except those containing 'roa_lagged' or 'yearQtr'.
# Plain boolean logic replaces the bitwise |, & and ~ on Python bools, which only worked
# by integer accident (~True == -2, ~False == -1) and is deprecated in recent Python.
keyTokens = ('year', 'qtr', 'gvkey', '_')
excludedTokens = ('roa_lagged', 'yearQtr')
relevantVars = [x for x in suppliersWithWeather.columns
                if any(token in x for token in keyTokens)
                and not any(token in x for token in excludedTokens)]

# Prefix every selected column except `year` and `qtr` with "supplier_" (so `gvkey`
# becomes `supplier_gvkey`). One non-inplace rename keyed by column NAME replaces the
# former per-column inplace rename on a DataFrame slice, which raised
# SettingWithCopyWarning and relied on the key columns occupying the first three positions.
sharedKeys = ('year', 'qtr')
supplierNames = {col: 'supplier_' + col for col in relevantVars if col not in sharedKeys}
suppliers_toMerge = suppliersWithWeather[relevantVars].rename(columns = supplierNames)


In [None]:
suppliers_toMerge.head()

For each of the supplier weather columns, multiply the variable by the fraction of sales attributable to that relationship.

In [None]:
# Attach the supplier weather columns to every customer-supplier link
# (inner join on the columns the two frames share).
supplierWeather = customerDB[['year','gvkey','supplier_gvkey','salesWeight']].merge(suppliers_toMerge)

# Multiply every supplier weather column by the link's sales weight.
# The columns are picked by NAME: the previous hard-coded `columns[7:]` skipped the first
# two weather columns, because after this merge the weather columns start at position 5
# (year, gvkey, supplier_gvkey, salesWeight, qtr, ...). Identifier columns are excluded.
idCols = {'supplier_gvkey', 'supplier_famafrench'}
weatherCols = [col for col in supplierWeather.columns
               if col.startswith('supplier_') and col not in idCols]
supplierWeather[weatherCols] = supplierWeather[weatherCols].mul(supplierWeather.salesWeight, axis = 0)

# The link-level columns are no longer needed once the weights are applied.
supplierWeather = supplierWeather.drop(columns = ['supplier_gvkey','salesWeight'])


print(supplierWeather.head())



# [['year','qtr','gvkey','supplier_tmax_quant_1.0','supplier_precip_quant_1.0']]

In [None]:
# Sum all remaining columns within each (year, qtr, gvkey) group, turn the group keys
# back into ordinary columns and drop exact duplicate rows.
# NOTE(review): presumably the summed columns are the sales-weighted supplier weather
# values, which would make each sum a weighted average — confirm against the cell that
# builds supplierWeather.
supplierWtdAvgWeather = supplierWeather.groupby(['year','qtr','gvkey']).sum().reset_index().drop_duplicates()

In [None]:
supplierWtdAvgWeather.gvkey.unique()

Merge the supplier weighted average weather data with the customer data that has weather as well.

In [None]:
customersWithWeather.head()

In [None]:
# NOTE(review): in the part of the notebook reviewed here, `customersWithWeather` is only
# assigned inside disabled (string-quoted) code — confirm it is defined before this cell
# runs on a fresh kernel.
# Inner join of the customer table with supplierWtdAvgWeather on the columns they share.
wtdAvgSuppliers = customersWithWeather.merge(supplierWtdAvgWeather)

wtdAvgSuppliers.head()

In [None]:
wtdAvgSuppliers.shape

In [None]:
wtdAvgSuppliers.to_csv("../../data/companyData/wtdAvgSuppliers.csv")

In [None]:
wtdAvgSuppliers.head()

In [None]:
wtdAvgSuppliers.columns[wtdAvgSuppliers.columns.str.contains('Tercile')]