In [93]:
import pickle as pkl
import pandas as pd
import matplotlib
import os
import re

import collections
import datetime
import time

import geopandas as gpd

import numpy as np

import rasterio

import spacy

import gc

import geopy.distance

nlp = spacy.load('en_core_web_lg')
from sklearn.metrics.pairwise import cosine_similarity

import itertools

# Get Data

## Changes from year to year

In [None]:
changes = pd.read_csv("../../data/companyData/compustatChanges_all.csv").drop(columns = ['Unnamed: 0'])

changes.head()

In [None]:
otherControls = pd.read_csv('../../data/companyData/otherControls.csv').\
    drop(columns = {'Unnamed: 0', 'fyearq'}).rename(columns = {'year_toMatchOn': 'year',
                                                              'fqtr': 'qtr'})

otherControls.head()

In [None]:
otherControls.head()

In [None]:
print(changes.shape)
changes = changes.merge(otherControls)
print(changes.shape)


industries = changes[['gvkey','indGroup']].drop_duplicates()

In [None]:
industries.to_csv("../../data/companyData/gvkeyIndustries.csv")

In [None]:
changes.to_csv("../../data/companyData/compustatChanges_withControls.csv")
changes.head()

In [None]:
changes = pd.read_csv("../../data/companyData/compustatChanges_withControls.csv").drop(columns = {'Unnamed: 0'})
changes.head()

Put in the calendar quarters and fiscal quarter data.

In [None]:
quarters = pd.read_csv("../../data/companyData/fiscalYears.csv")
quarters.head()

In [None]:
len(quarters.gvkey.unique())

In [None]:
sum((quarters.fyr == 12) | 
   (quarters.fyr == 3) | 
   (quarters.fyr == 6) | 
   (quarters.fyr == 9))/quarters.shape[0]

In [None]:
quarters = quarters[(quarters.fyr == 12) | 
   (quarters.fyr == 3) | 
   (quarters.fyr == 6) | 
   (quarters.fyr == 9)][['gvkey','datadate','datacqtr','datafqtr','fyr']].reset_index(drop = True)


In [None]:
quarters.head()

Merge the quarter data into the change data, and make sure that the quarters that are used line up with the calendar quarters.

In [None]:
changesCal = changes[changes.gvkey.isin(quarters.gvkey.unique())]

changesCal = changesCal.merge(quarters)

print(changesCal.shape[0]/changes.shape[0])

In [None]:
changesCal.loc[~(changesCal.datacqtr.isna()), 'year'] = changesCal.datacqtr.str.slice(0,4)
changesCal.loc[~(changesCal.datacqtr.isna()), 'qtr']  = changesCal.datacqtr.str.slice(5,6)

changesCal['DATE'] = pd.to_datetime(changesCal['datadate'])

changesCal.loc[(changesCal.datacqtr.isna()), 'year'] = changesCal.DATE.dt.year
changesCal.loc[(changesCal.datacqtr.isna()), 'qtr']  = changesCal.DATE.dt.quarter

changesCal['year'] = changesCal.year.astype('int64')
changesCal['qtr']  = changesCal.qtr.astype('int64')

print(changesCal.shape,changesCal.head())

In [None]:
changesCal.to_csv("../../data/companyData/compustatChanges_withControls.csv")
changesCal.head()

In [None]:
changesCal = pd.read_csv("../../data/companyData/compustatChanges_withControls.csv")
changesCal.head()

# Compustat and ABI Linking

In [None]:
gvKey_abiLinkingTable = pd.read_csv('../../data/companyData/linkingTable.csv').drop(columns = ['Unnamed: 0'])

base_columns = gvKey_abiLinkingTable.columns 
customer_columns = "customer_" + base_columns
supplier_columns = "supplier_" + base_columns



hasMatch = gvKey_abiLinkingTable.gvkey.unique()

gvKey_abiLinkingTable.head()


---------------------------------

# Get all change data together
Get the linking table and merge the abi labels into the change df. 

Then, merge the location data into the change data and get as complete a record of companies as possible given the HQ data.

In [None]:
gvKey_abiLinkingTable = pd.read_csv('../../data/companyData/linkingTable.csv').drop(columns = ['Unnamed: 0'])

changes = pd.read_csv("../../data/companyData/compustatChanges_withControls.csv").drop(columns = ['Unnamed: 0'])
print(changes.shape, changes.head())


changesABI = changes.merge(gvKey_abiLinkingTable, on ='gvkey').drop(columns = {'state','city'})
print(changesABI.shape, changesABI.head())

Now merge in the hq information.

In [None]:
canadian = ['ON', 'AB','QC', 'BC', 'NS', 'NF', 'SK', 'MB', 'NB']
changes = changes[~(changes.state.isin(canadian)) & ~changes.state.isna()]

changes['addzip'] = changes.addzip.astype('str').str.slice(0,5)

changes.state.unique()

In [None]:
hq = pd.read_csv("../../data/ig_uniqueHQs_multLocations.csv").\
    drop(columns = {'Unnamed: 0'}).\
    rename(columns = {'archive_version_year': 'year'})

hq['year'] = hq.year.astype('int64')

igChanges = changesABI.merge(hq)
print(igChanges.shape, igChanges.head())


hq.head()

In [None]:
igChanges.to_csv("../../data/companyData/igData.csv")

In [None]:
igChanges.columns

At this point, we have zip information in the following forms (from most to least examples):
    - changes: all compustat companies, from the compustat address system
    - igChanges: subset of compustat companies, from the ig merge
    - subset of compustat companies that have SC information and survived the ig merge
    
We could potentially look at the subset of compustat companies for which we have SC information, usign the compustat address system as well.

For now: follow similar trajectory as before but add in weather data for all cstat companies and all ig-merged companies.

First: pull all zips that are mentioned in changes and igChanges and use this to get the weather data.



In [None]:
changes = changes[(~changes.addzip.isna()) & (changes.addzip != 'nan')]
relevantZips = changes.addzip.astype('int64').append(igChanges.zipcode).unique()

changes.rename(columns = {'addzip': 'zipcode'}, inplace = True)
changes.drop(columns = {'cik',
     'datadate','costat', 'add1', 'add2', 'city', 'sic', 'state'}, inplace = True)

In [None]:
len(relevantZips)

In [None]:
# relevantZips = allCustomerData.zipcode.append(allSupplierData.zipcode).unique()
outfile =  '../../data/companyData/relevantZips.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(relevantZips, pickle_file)

------------------------------------------------

# Stocks

In [None]:
igChanges = pd.read_csv("../../data/companyData/igData.csv").\
    drop(columns = {'Unnamed: 0'})
igChanges.head()

In [None]:
with open('../../data/stockReturns.pkl', 'rb') as f:
    stocks = pkl.load(f)[['date','gvkey','RET']]

In [None]:
stocks.head()

In [None]:
sum(stocks.gvkey.isna())

In [None]:
stocks = stocks[stocks.date.dt.year > 2008]

stocks['qtr']  = stocks.date.dt.quarter
stocks['year'] = stocks.date.dt.year

stocks = stocks[~stocks.gvkey.isna()]
stocks['gvkey'] = stocks['gvkey'].astype(int)
stocks.shape

In [None]:
igChanges.columns

In [None]:
companyControls = igChanges[['gvkey','year','qtr','famafrench','ageTercile','sizeTercile','profitTercile','zipcode']]
companyControls.head()

In [None]:
print(stocks.dtypes, companyControls.dtypes)

In [None]:
stocksWithControls = stocks.merge(companyControls)
print(stocksWithControls.shape,stocks.shape,companyControls.shape)
stocksWithControls.head()

In [None]:
del stocks
del companyControls
del igChanges
gc.collect()

In [None]:
annualWeather = pd.read_csv("../../data/companyData/stockWeather_annual.csv").\
    drop(columns = {'Unnamed: 0'})

annualWeather = annualWeather[~annualWeather.temp_annualLast5.isna()].reset_index(drop = True)

annualWeather['date'] = pd.to_datetime(annualWeather['date'],
                                   format = "%Y%m%d")

annualWeather.rename(columns = {'ZIP': 'zipcode'}, inplace = True)
print(annualWeather.dtypes)
annualWeather.head()

In [None]:
allWeather = pd.read_csv("../../data/companyData/stockWeather_zipQuarterQuants.csv").\
    drop(columns = {'Unnamed: 0'})

allWeather = allWeather[~allWeather.temp_zipQuarterLast5.isna()].reset_index(drop = True)

allWeather['date'] = pd.to_datetime(allWeather['date'],
                                   format = "%Y-%m-%d")

allWeather.rename(columns = {'ZIP': 'zipcode'}, inplace = True)
print(allWeather.dtypes)
allWeather.head()

In [None]:
stocksWithControlsWeather = stocksWithControls.merge(allWeather).merge(annualWeather)
print(stocksWithControlsWeather.shape,allWeather.shape)

stocksWithControlsWeather.head()

In [None]:
stocksWithControlsWeather.to_csv("../../data/companyData/stocksWithControlsWeather.csv")

In [None]:
sum(stocksWithControlsWeather.RET.isna())

In [None]:
stocksWithControlsWeather = pd.read_csv("../../data/companyData/stocksWithControlsWeather.csv").drop(columns = {'Unnamed: 0'})
stocksWithControlsWeather.head()

In [None]:
sum(stocksWithControlsWeather.gvkey.isna())

--------------------

# Weather Data
First do this on the HQ zipcodes.

In [None]:
allWeather = pd.read_csv("../../data/companyData/revised_allWeatherBins_2009to2019.csv").\
    drop(columns = {"Unnamed: 0"})

allWeather['yearQtr'] = allWeather.year + (allWeather.qtr - 1)/4

col = allWeather.pop("yearQtr")
allWeather.insert(0, col.name, col)

lag1 = allWeather.copy()
lag1['yearQtr'] += 0.25
for colname in lag1.columns[4:]:
    lag1.rename(columns = {colname: 'lag1_' + colname}, inplace = True)
lag1.drop(columns = {'year','qtr'},inplace = True)

    
lag2 = allWeather.copy()
lag2['yearQtr'] += 0.5
for colname in lag2.columns[4:]:
    lag2.rename(columns = {colname: 'lag2_' + colname}, inplace = True)
lag2.drop(columns = {'year','qtr'},inplace = True)


lag3 = allWeather.copy()
lag3['yearQtr'] += 0.75
for colname in lag3.columns[4:]:
    lag3.rename(columns = {colname: 'lag3_' + colname}, inplace = True)
lag3.drop(columns = {'year','qtr'},inplace = True)


lag4 = allWeather.copy()
lag4['yearQtr'] += 1
for colname in lag4.columns[4:]:
    lag4.rename(columns = {colname: 'lag4_' + colname}, inplace = True)
lag4.drop(columns = {'year','qtr'},inplace = True)


print(allWeather.shape)

allWeather_withLags = allWeather.merge(lag1).merge(lag2).merge(lag3).merge(lag4)

print(allWeather_withLags.year.value_counts())

allWeather_withLags.to_csv("../../data/companyData/allWeather_withLags.csv")

Do this across all zips, for the establishment records. We'll put this into a different format right after, and then change the columns and whatnot.

In [None]:
allWeather = pd.read_csv("../../data/companyData/revised_allWeatherBins_2009to2019_allZips.csv").\
    drop(columns = {"Unnamed: 0", 'Unnamed: 0.1'})

allWeather['yearQtr'] = allWeather.year + (allWeather.qtr - 1)/4

col = allWeather.pop("yearQtr")
allWeather.insert(0, col.name, col)

lag1 = allWeather.copy()
lag1['yearQtr'] += 0.25
for colname in lag1.columns[4:]:
    lag1.rename(columns = {colname: 'lag1_' + colname}, inplace = True)
lag1.drop(columns = {'year','qtr'},inplace = True)

    
lag2 = allWeather.copy()
lag2['yearQtr'] += 0.5
for colname in lag2.columns[4:]:
    lag2.rename(columns = {colname: 'lag2_' + colname}, inplace = True)
lag2.drop(columns = {'year','qtr'},inplace = True)


lag3 = allWeather.copy()
lag3['yearQtr'] += 0.75
for colname in lag3.columns[4:]:
    lag3.rename(columns = {colname: 'lag3_' + colname}, inplace = True)
lag3.drop(columns = {'year','qtr'},inplace = True)


lag4 = allWeather.copy()
lag4['yearQtr'] += 1
for colname in lag4.columns[4:]:
    lag4.rename(columns = {colname: 'lag4_' + colname}, inplace = True)
lag4.drop(columns = {'year','qtr'},inplace = True)


print(allWeather.shape)

allWeather_withLags = allWeather.merge(lag1).merge(lag2).merge(lag3).merge(lag4)

print(allWeather_withLags.year.value_counts())

allWeather_withLags.to_csv("../../data/companyData/allWeather_withLags_allZips.csv")

Add in the few new definitions we've started on here: bins by week/month/quarter, and days 90+.

In [None]:
allWeather = pd.read_csv("../../data/companyData/latestExtremes.csv").\
    drop(columns = {"Unnamed: 0"})
allWeather.rename(columns = {'quarter': 'qtr'}, inplace = True)

allWeather['yearQtr'] = allWeather.year + (allWeather.qtr - 1)/4

col = allWeather.pop("yearQtr")
allWeather.insert(0, col.name, col)

lag1 = allWeather.copy()
lag1['yearQtr'] += 0.25
for colname in lag1.columns[4:]:
    lag1.rename(columns = {colname: 'lag1_' + colname}, inplace = True)
lag1.drop(columns = {'year','qtr'},inplace = True)

    
lag2 = allWeather.copy()
lag2['yearQtr'] += 0.5
for colname in lag2.columns[4:]:
    lag2.rename(columns = {colname: 'lag2_' + colname}, inplace = True)
lag2.drop(columns = {'year','qtr'},inplace = True)


lag3 = allWeather.copy()
lag3['yearQtr'] += 0.75
for colname in lag3.columns[4:]:
    lag3.rename(columns = {colname: 'lag3_' + colname}, inplace = True)
lag3.drop(columns = {'year','qtr'},inplace = True)


lag4 = allWeather.copy()
lag4['yearQtr'] += 1
for colname in lag4.columns[4:]:
    lag4.rename(columns = {colname: 'lag4_' + colname}, inplace = True)
lag4.drop(columns = {'year','qtr'},inplace = True)


print(allWeather.shape)

allWeather_withLags = allWeather.merge(lag1).merge(lag2).merge(lag3).merge(lag4)

print(allWeather_withLags.year.value_counts())

allWeather_withLags.to_csv("../../data/companyData/allWeather_withLags_new.csv")

Now do the same for the industry-specific weather.

In [None]:
# allWeather = pd.read_csv("../../../../../../../Volumes/backup2/dissData/prism/allWeatherBins_2010.2019.csv").\
allWeather_byInd = pd.read_csv("../../data/companyData/revised_allWeatherBins_2009to2019_byInd.csv").\
    drop(columns = {"Unnamed: 0"})
'''[['famafrench','zipcode','yearQuarter', 
                                    'temp_ffquant_0.95','temp_indQuarterquant_0.95',
                                   'temp5Days_ffquant_0.95', 'temp5Days_indQuarterquant_0.95',
                                   'precip_ffquant_0.95', 'precip_indQuarterquant_0.95',
                                   'precip5Days_ffquant_0.95', 'precip5Days_indQuarterquant_0.95']]
'''
allWeather_byInd['year'] = allWeather_byInd.yearQuarter.str.slice(0,4).astype('int64')
allWeather_byInd['qtr']  = allWeather_byInd.yearQuarter.str.slice(5,6).astype('int64')
allWeather_byInd['yearQtr'] = allWeather_byInd.year + (allWeather_byInd.qtr - 1)/4

allWeather_byInd = allWeather_byInd.astype({'year':       'category',
                         'qtr':        'category',
                         'zipcode':    'category',
                         'famafrench': 'category'})

changes['zipcode'] = changes['zipcode'].astype({'zipcode': 'int64'})

changes = changes.astype({'year':       'category',
                          'qtr':        'category',
                          'zipcode':    'category',
                          'famafrench': 'category'})

col = allWeather_byInd.pop("year")
allWeather_byInd.insert(0, col.name, col)

col = allWeather_byInd.pop("qtr")
allWeather_byInd.insert(0, col.name, col)


col = allWeather_byInd.pop("yearQtr")
allWeather_byInd.insert(0, col.name, col)

allWeather_byInd.drop(columns = {'yearQuarter'}, inplace = True)

print(allWeather_byInd.head())

In [None]:
lag1 = allWeather_byInd.copy()
lag1['yearQtr'] += 0.25
for colname in lag1.columns[5:]:
    lag1.rename(columns = {colname: 'lag1_' + colname}, inplace = True)
lag1.drop(columns = {'year','qtr'},inplace = True)
lag1 = lag1.astype({'yearQtr':       'category'})

    
lag2 = allWeather_byInd.copy()
lag2['yearQtr'] += 0.5
for colname in lag2.columns[5:]:
    lag2.rename(columns = {colname: 'lag2_' + colname}, inplace = True)
lag2.drop(columns = {'year','qtr'},inplace = True)
lag2 = lag2.astype({'yearQtr':       'category'})


lag3 = allWeather_byInd.copy()
lag3['yearQtr'] += 0.75
for colname in lag3.columns[5:]:
    lag3.rename(columns = {colname: 'lag3_' + colname}, inplace = True)
lag3.drop(columns = {'year','qtr'},inplace = True)
lag3 = lag3.astype({'yearQtr':       'category'})


lag4 = allWeather_byInd.copy()
lag4['yearQtr'] += 1
for colname in lag4.columns[5:]:
    lag4.rename(columns = {colname: 'lag4_' + colname}, inplace = True)
lag4.drop(columns = {'year','qtr'},inplace = True)
lag4 = lag4.astype({'yearQtr':       'category'})


allWeather_byInd = allWeather_byInd.astype({'yearQtr':       'category'})


print(allWeather_byInd.shape)


allWeather_byInd.head()


'''allWeather_byInd_withLags = allWeather_byInd.merge(lag1).merge(lag2).merge(lag3).merge(lag4)

allWeather_byInd_withLags.year.value_counts()
'''

In [None]:
allWeather_byInd_withLags = allWeather_byInd.merge(lag1).merge(lag2).merge(lag3).merge(lag4)

In [None]:
allWeather_byInd_withLags.shape

In [None]:
allWeather_byInd_withLags.to_csv("../../data/companyData/allWeather_byInd_withLags.csv")

In [None]:
del allWeather_byInd
del lag1
del lag2
del lag3
del lag4
gc.collect()

In [None]:
allWeather_byInd_withLags = pd.read_csv("../../data/companyData/allWeather_byInd_withLags.csv")
allWeather_byInd_withLags.head()

Now get the streak data. Make sure it's 0/1 for whether there was a streak.

In [None]:
allWeather = pd.read_csv("../../data/companyData/stockWeather_zipQuarterQuants.csv").\
    drop(columns = {'Unnamed: 0'})

allWeather = allWeather[~allWeather.temp_zipQuarterLast5.isna()].reset_index(drop = True)

allWeather['date'] = pd.to_datetime(allWeather['date'],
                                   format = "%Y-%m-%d")

allWeather['hotStreak']   = (allWeather['temp_zipQuarterLast5'] == 5)*1
allWeather['wetStreak'] = (allWeather['precip_zipQuarterLast5'] == 5)*1

allWeather.rename(columns = {'ZIP': 'zipcode'}, inplace = True)
print(allWeather.dtypes)

allWeather['year'] = allWeather.date.dt.year
allWeather['quarter'] = allWeather.date.dt.quarter


allWeather.drop(columns = {'date','temp_zipQuarterLast5','precip_zipQuarterLast5'}, inplace = True)

streaks = allWeather.groupby(['zipcode','year','quarter']).sum().reset_index()

streaks['hotStreak'] = (streaks['hotStreak'] > 0)*1
streaks['wetStreak'] = (streaks['wetStreak'] > 0)*1

streaks.rename(columns = {'quarter': 'qtr'}, inplace = True)
streaks.head()


In [None]:
streaks['yearQtr'] = streaks.year + (streaks.qtr - 1)/4

col = streaks.pop("yearQtr")
streaks.insert(0, col.name, col)

lag1 = streaks.copy()
lag1['yearQtr'] += 0.25
for colname in lag1.columns[4:]:
    lag1.rename(columns = {colname: 'lag1_' + colname}, inplace = True)
lag1.drop(columns = {'year','qtr'},inplace = True)

    
lag2 = streaks.copy()
lag2['yearQtr'] += 0.5
for colname in lag2.columns[4:]:
    lag2.rename(columns = {colname: 'lag2_' + colname}, inplace = True)
lag2.drop(columns = {'year','qtr'},inplace = True)


lag3 = streaks.copy()
lag3['yearQtr'] += 0.75
for colname in lag3.columns[4:]:
    lag3.rename(columns = {colname: 'lag3_' + colname}, inplace = True)
lag3.drop(columns = {'year','qtr'},inplace = True)


lag4 = streaks.copy()
lag4['yearQtr'] += 1
for colname in lag4.columns[4:]:
    lag4.rename(columns = {colname: 'lag4_' + colname}, inplace = True)
lag4.drop(columns = {'year','qtr'},inplace = True)


print(streaks.shape)

streaks_withLags = streaks.merge(lag1).merge(lag2).merge(lag3).merge(lag4)

print(streaks_withLags.year.value_counts())

streaks_withLags.to_csv("../../data/companyData/streaks_withLags.csv")

In [None]:

streaks_withLags = pd.read_csv("../../data/companyData/streaks_withLags.csv")
streaks_withLags.year.value_counts()

In [None]:
streaks_withLags.head()

And now the severe weather.

In [58]:
zipQuarters = pd.read_csv("../../data/companyData/allWeather_withLags_new.csv")[['zipcode','year','qtr']].drop_duplicates()


zipQuarters_2009 = zipQuarters.copy()
zipQuarters_2009 = zipQuarters[zipQuarters.year == 2010]
zipQuarters_2009['year'] = zipQuarters_2009['year'] - 1

zipQuarters = zipQuarters_2009.append(zipQuarters)


zipQuarters.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,zipcode,year,qtr
0,10001,2009,1
1,10001,2009,2
2,10001,2009,3
3,10001,2009,4
40,10002,2009,1


In [59]:
zipQuarters.year.min()

2009

In [62]:
thunderstorms = pd.read_csv("../../data/companyData/thunderstormWinds.csv").\
    drop(columns = {'Unnamed: 0', 'STCOUNTYFP'})

thunderstorms['year'] =  thunderstorms['yearQtr'].str.slice(0,4).astype('int64')

print(thunderstorms.year.min())

thunderstorms.head()


thunderstorms['qtr']  =  thunderstorms['yearQtr'].str.slice(5,6).astype('int64')
thunderstorms.drop(columns = {'yearQtr'}, inplace = True)
thunderstorms = thunderstorms[(thunderstorms.year > 2008) & (thunderstorms.year < 2020)]

print(thunderstorms.shape)

thunderstorms = zipQuarters.merge(thunderstorms, how = 'left')
thunderstorms = thunderstorms.fillna(0)
print(zipQuarters.shape, thunderstorms.shape)

thunderstorms.year.min()

2000
(4146734, 6)
(1436864, 3) (4290777, 6)


2009

In [63]:
thunderstorms = thunderstorms.groupby(['zipcode','year','qtr']).sum().reset_index()

thunderstorms['propAboveTenThou']     = (thunderstorms['propAboveTenThou'] > 0)*1
thunderstorms['propAboveHundredThou'] = (thunderstorms['propAboveHundredThou'] > 0)*1
thunderstorms['propAboveMilli']       = (thunderstorms['propAboveMilli'] > 0)*1


thunderstorms.year.value_counts()

2009    130624
2010    130624
2011    130624
2012    130624
2013    130624
2014    130624
2015    130624
2016    130624
2017    130624
2018    130624
2019    130624
Name: year, dtype: int64

In [64]:
thunderstorms['yearQtr'] = thunderstorms.year + (thunderstorms.qtr - 1)/4

col = thunderstorms.pop("year")
thunderstorms.insert(0, col.name, col)

col = thunderstorms.pop("qtr")
thunderstorms.insert(0, col.name, col)

col = thunderstorms.pop("yearQtr")
thunderstorms.insert(0, col.name, col)

col = thunderstorms.pop("zipcode")
thunderstorms.insert(0, col.name, col)

thunderstorms = thunderstorms.astype({'zipcode':    'category'})


lag1 = thunderstorms.copy()
lag1['yearQtr'] += 0.25
for colname in lag1.columns[4:]:
    lag1.rename(columns = {colname: 'lag1_' + colname}, inplace = True)
lag1.drop(columns = {'year','qtr'},inplace = True)
lag1 = lag1.astype({'yearQtr':    'category'})



lag2 = thunderstorms.copy()
lag2['yearQtr'] += 0.5
for colname in lag2.columns[4:]:
    lag2.rename(columns = {colname: 'lag2_' + colname}, inplace = True)
lag2.drop(columns = {'year','qtr'},inplace = True)
lag2 = lag2.astype({'yearQtr':    'category'})


lag3 = thunderstorms.copy()
lag3['yearQtr'] += 0.75
for colname in lag3.columns[4:]:
    lag3.rename(columns = {colname: 'lag3_' + colname}, inplace = True)
lag3.drop(columns = {'year','qtr'},inplace = True)
lag3 = lag3.astype({'yearQtr':    'category'})

lag4 = thunderstorms.copy()
lag4['yearQtr'] += 1
for colname in lag4.columns[4:]:
    lag4.rename(columns = {colname: 'lag4_' + colname}, inplace = True)
lag4.drop(columns = {'year','qtr'},inplace = True)
lag4 = lag4.astype({'yearQtr':    'category'})


thunderstorms = thunderstorms.astype({'yearQtr':    'category'})

thunderstorms_withLags = thunderstorms.merge(lag1).merge(lag2).merge(lag3).merge(lag4)
print(thunderstorms_withLags.year.value_counts())

thunderstorms_withLags.to_csv("../../data/companyData/thunderstorms_withLags.csv")

2010    130624
2011    130624
2012    130624
2013    130624
2014    130624
2015    130624
2016    130624
2017    130624
2018    130624
2019    130624
Name: year, dtype: int64


In [65]:
print(thunderstorms_withLags.head())

  zipcode  yearQtr  qtr  year  propAboveTenThou  propAboveHundredThou  \
0    1001   2010.0    1  2010                 0                     0   
1    1001  2010.25    2  2010                 1                     0   
2    1001   2010.5    3  2010                 1                     0   
3    1001  2010.75    4  2010                 0                     0   
4    1001   2011.0    1  2011                 0                     0   

   propAboveMilli  lag1_propAboveTenThou  lag1_propAboveHundredThou  \
0               0                      0                          0   
1               0                      0                          0   
2               0                      1                          0   
3               0                      1                          0   
4               0                      0                          0   

   lag1_propAboveMilli  lag2_propAboveTenThou  lag2_propAboveHundredThou  \
0                    0                      0             

# Locations
Create a separate definition of weather based not on HQ but on employee-weighted establishment footprint.

In [None]:
fractions = pd.read_csv('../../data/companyData/fractionEmployees_byEstablishment.csv').\
    drop(columns = {"Unnamed: 0", 'latitude','longitude'}).rename(columns = {'archive_version_year': 'year',
                                                    'parent_number': 'abi'})

fractions['year']    = fractions.year.astype('int64')
fractions['zipcode'] = fractions.zipcode.astype('int64')
fractions.head()

gvKey_abiLinkingTable = pd.read_csv('../../data/companyData/linkingTable.csv').drop(columns = ['Unnamed: 0'])

print(gvKey_abiLinkingTable.abi)

gvKey_abiLinkingTable.head()

fractions = fractions[['year','abi','zipcode','locationFracOfEmployees']].merge(gvKey_abiLinkingTable[['abi','gvkey']])

fractions = fractions.astype({'year':       'category',
                           'zipcode':    'category'})

fractions.head()

In [None]:
fractions.year

In [None]:
fractions_byZip = fractions[['gvkey','year','zipcode','locationFracOfEmployees']]
fractions_byZip['year'] =  fractions_byZip['year'].astype('int64')

fractions_byZip = fractions_byZip[(fractions_byZip.year > 2008) & (fractions_byZip.year < 2020)]

print(fractions_byZip.shape)

fractions_byZip.to_csv("../../data/companyData/fractions_byZip.csv")

In [None]:
fractionsWithWeather = fractions.merge(allWeather_withLags_allZips) 
fractionsWithWeather.drop(columns = {'abi','zipcode'}, inplace = True)

print(fractionsWithWeather.shape)
fractionsWithWeather.head()

In [None]:
fractionsWithWeather[fractionsWithWeather.gvkey == 1004]

In [None]:
del allWeather_withLags
del fractions
del gvKey_abiLinkingTable
gc.collect()

In [None]:
for col in fractionsWithWeather.columns[4:]:
    fractionsWithWeather[col] = fractionsWithWeather[col] * fractionsWithWeather.locationFracOfEmployees

In [None]:
g = fractionsWithWeather.groupby(['gvkey','year','qtr']).sum().reset_index()
g.drop(columns = {'locationFracOfEmployees'}, inplace = True)

for colname in g.columns[3:]:
    g.rename(columns = {colname: 'empWt_' + colname}, inplace = True)

g.head()

In [None]:
g.to_csv("../../data/companyData/weatherByEstablishment.csv")

In [None]:
establishmentZips = fractions.zipcode.unique()
len(establishmentZips)

## create the original weather with lags dataset

In [3]:
streaks_withLags = pd.read_csv("../../data/companyData/streaks_withLags.csv").\
    drop(columns = {'Unnamed: 0', 'yearQtr'}).astype({'year':       'category',
                           'qtr':        'category',
                           'zipcode':    'category'})


g = pd.read_csv("../../data/companyData/weatherByEstablishment.csv").\
    drop(columns = {"Unnamed: 0"})


allWeather_withLags = pd.read_csv("../../data/companyData/allWeather_withLags.csv").\
    drop(columns = {"Unnamed: 0", 'yearQtr'}).astype({'year':       'category',
                           'qtr':        'category',
                           'zipcode':    'category'})


averages = pd.read_csv("../../data/companyData/quarterlyStatsByZip.csv").\
    drop(columns = {"Unnamed: 0"}).rename(columns = {'ZIP': 'zipcode'})
averages['qtr'] = averages.quarter.str.slice(1,2).astype('float')
averages.drop(columns = {'quarter'}, inplace = True) 
averages = averages.astype({'qtr':        'category',
                           'zipcode':    'category'})


allWeather_withLags2 = pd.read_csv("../../data/companyData/allWeather_withLags_new.csv").\
    drop(columns = {"Unnamed: 0", 'yearQtr'}).astype({'year':       'category',
                           'qtr':        'category',
                           'zipcode':    'category'})


thunderstorms_withLags = pd.read_csv("../../data/companyData/thunderstorms_withLags.csv").\
    drop(columns = {"Unnamed: 0", 'yearQtr'}).astype({'year':       'category',
                           'qtr':        'category',
                           'zipcode':    'category'})


'''allWeather_byInd_withLags = pd.read_csv("../../data/companyData/allWeather_byInd_withLags.csv").\
    drop(columns = {"Unnamed: 0", 'yearQtr'})

allWeather_byInd_withLags = allWeather_byInd_withLags.astype({'year':       'category',
                           'qtr':        'category',
                           'zipcode':    'category'})'''

'allWeather_byInd_withLags = pd.read_csv("../../data/companyData/allWeather_byInd_withLags.csv").    drop(columns = {"Unnamed: 0", \'yearQtr\'})\n\nallWeather_byInd_withLags = allWeather_byInd_withLags.astype({\'year\':       \'category\',\n                           \'qtr\':        \'category\',\n                           \'zipcode\':    \'category\'})'

Create direct effects database. Merge weather to full cstat database.

Merge weather to the ig-cstat database.

In [70]:
igChanges = pd.read_csv("../../data/companyData/igData.csv").drop(columns = {'Unnamed: 0', 
        'parent_employee_size_code', 'location_employee_size_code', 'employeesAtLocation'})

fractions_byZip = pd.read_csv("../../data/companyData/fractions_byZip.csv").drop(columns = {'Unnamed: 0'})
fractions_byZip = fractions_byZip[fractions_byZip.gvkey.isin(list(igChanges.gvkey.unique())) & \
                                  fractions_byZip.zipcode.isin(list(igChanges.zipcode.unique())) ]

fractions_byZip = fractions_byZip.groupby(['year','zipcode','gvkey']).sum().reset_index()

print(igChanges.shape)

igChanges = igChanges.merge(fractions_byZip)

print(igChanges.shape)

(241355, 61)
(134423, 62)


In [68]:
igChangesWithWeather = igChanges.merge(allWeather_withLags).merge(allWeather_withLags2).\
    merge(allWeather_byInd_withLags).merge(averages).merge(g).merge(streaks_withLags).\
    merge(thunderstorms_withLags)
igChangesWithWeather.shape

(116342, 873)

In [71]:
for col in igChangesWithWeather.columns:
    print(col)

gvkey
datadate
year
qtr
companyName
curcdq
assets
cash
costGoodsSold
totalInv
netIncome
opInc_afDep
opInc_befDep
totalRevenue
costat
priceClose
add1
addzip
assetsLast
netIncomeLast
totalRevenueLast
costGoodsSoldLast
totalInvLast
opInc_afDepLast
opInc_befDepLast
priceCloseLast
cashLast
incomeChange
revenueChange
costChange
inventoryChange
opInc_afDepChange
opInc_befDepChange
priceCloseChange
assetsPrev
assetsLagged
netIncomeLagged
roa_lagged
famafrench
sic2
indGroup
earliestYear
ageTercile
sizeTercile
profitTercile
datacqtr
datafqtr
fyr
DATE
cstatCompanies
igCompanies
delete
abi
ticker
company
state
city
address_line_1
zipcode
latitude
longitude
locationFracOfEmployees
precip_annualquant_0.95
precip_annualquant_1xQtr
precip_annualquant_1xYr
precip_annualquant_1x5Qtrs
precip_annualquant_1x10Qtrs
precip_annualquant_1x5Yrs
precip_annualquant_1x10Yrs
precip_zipquant_0.95
precip_zipquant_1xQtr
precip_zipquant_1xYr
precip_zipquant_1x5Qtrs
precip_zipquant_1x10Qtrs
precip_zipquant_1x5Yrs
precip

In [73]:
igChangesWithWeather.to_csv("../../data/companyData/igWithWeather.csv")

In [72]:
igChangesWithWeather.propAboveMilli

0         0
1         0
2         0
3         0
4         0
         ..
116337    0
116338    0
116339    0
116340    0
116341    0
Name: propAboveMilli, Length: 116342, dtype: int64

In [None]:
igChangesWithWeather = pd.read_csv("../../data/companyData/igWithWeather.csv")

In [None]:
for col in igChangesWithWeather.columns:
    print(col)

In [None]:
igChangesWithWeather.shape

# Indirect
Introduce the SC Data.

In [58]:
# this does a little bit of a test on the reporting requirements. 
# number 

'''c_linksTest = pd.read_csv("../../data/companyData/compustatSCLinked.csv")[['srcdate','gvkey','cgvkey']]
c_linksTest['year'] = c_linksTest.srcdate.astype('str').str.slice(0,4).astype('int64')

bs = c_linksTest[c_linksTest.year < 2014]
print("Customers per supplier, 1978-2013 Pd: ", len(bs.cgvkey.unique())/len(bs.gvkey.unique()))

bs2 = c_linksTest[c_linksTest.year > 2010]
print("Customers per supplier, Recent Pd: ", len(bs2.cgvkey.unique())/len(bs2.gvkey.unique()))'''


'c_linksTest = pd.read_csv("../../data/companyData/compustatSCLinked.csv")[[\'srcdate\',\'gvkey\',\'cgvkey\']]\nc_linksTest[\'year\'] = c_linksTest.srcdate.astype(\'str\').str.slice(0,4).astype(\'int64\')\n\nbs = c_linksTest[c_linksTest.year < 2014]\nprint("Customers per supplier, 1978-2013 Pd: ", len(bs.cgvkey.unique())/len(bs.gvkey.unique()))\n\nbs2 = c_linksTest[c_linksTest.year > 2010]\nprint("Customers per supplier, Recent Pd: ", len(bs2.cgvkey.unique())/len(bs2.gvkey.unique()))'

In [59]:
c_links = pd.read_csv("../../data/companyData/compustatSCLinked.csv") # pd.read_csv("../../data/companyData/compustatSCLinked.csv")

c_links['year'] = c_links.srcdate.astype('str').str.slice(0,4).astype('int64')

c_links = c_links[c_links.year > 1999][['year','gvkey','cgvkey','salecs']].\
    rename(columns = {'cgvkey': 'customer_gvkey','gvkey': 'supplier_gvkey'})

c_links['year'] = pd.to_datetime(c_links.year, format = '%Y')


c_links.head()

Unnamed: 0,year,supplier_gvkey,customer_gvkey,salecs
70,2002-01-01,1013,2136,111.056
71,2004-01-01,1013,2136,104.312
72,2005-01-01,1013,2136,146.0
73,2006-01-01,1013,2136,205.0
74,2007-01-01,1013,2136,236.0


In [60]:
supplierCombos = c_links[['supplier_gvkey', 'customer_gvkey']].drop_duplicates().reset_index(drop = True)

print(supplierCombos.shape)

supplierCombos.head()

(16812, 2)


Unnamed: 0,supplier_gvkey,customer_gvkey
0,1013,2136
1,1013,9899
2,1021,61494
3,1021,25880
4,1048,11552


We'll follow Barrot Sauvagnat in assuming that a supplier relationship holds for every year between the first and last year in which a customer is reported. This is going to take a little bit of work. We'll try it like this: 
- subset dataframe to a specific supplier-customer pair
- fill in data for every year that's missing

Then, apply this row-wise to all rows of the unique supplierCombos df above using: https://stackoverflow.com/questions/61942138/apply-function-row-wise-to-pandas-dataframe

In [61]:
def fillYear(supplier, customer, scData = c_links):
    c_linksTemp = scData[(scData.supplier_gvkey == supplier) & \
                      (scData.customer_gvkey == customer)].reset_index(drop = True)
    
    # if there are na values and non-na values for the same supplier-cust combination, then 
    # select for only the non-na values, by (1) replacing na with negative, (2) 1
    c_linksTemp['salecs'] = c_linksTemp['salecs'].fillna(-5)
    c_linksTemp = c_linksTemp.loc[c_linksTemp.reset_index().groupby(['year','supplier_gvkey', 'customer_gvkey'])['salecs'].idxmax()]

    
    # now: find the start and end of the data series
    first = c_linksTemp.year.min()
    last  = c_linksTemp.year.max()

    c_linksTemp = c_linksTemp.set_index('year') 

    c_linksTemp = c_linksTemp.reindex(pd.date_range(first, last, freq = 'YS')).\
        reset_index().rename(columns = {'index': 'year'})

    # and impute all values within the series
    c_linksTemp = c_linksTemp.groupby(c_linksTemp.year.dt.time).ffill()
    
    return(c_linksTemp)

Show that this works for one of the supplier rows.

In [62]:
fillYear(supplierCombos.supplier_gvkey[0], supplierCombos.customer_gvkey[0])

Unnamed: 0,year,supplier_gvkey,customer_gvkey,salecs
0,2002-01-01,1013.0,2136.0,111.056
1,2003-01-01,1013.0,2136.0,111.056
2,2004-01-01,1013.0,2136.0,104.312
3,2005-01-01,1013.0,2136.0,146.0
4,2006-01-01,1013.0,2136.0,205.0
5,2007-01-01,1013.0,2136.0,236.0
6,2008-01-01,1013.0,2136.0,240.0
7,2009-01-01,1013.0,2136.0,176.0
8,2010-01-01,1013.0,2136.0,146.0


Now do it for all rows.

In [63]:
start = time.time()
print(c_links.shape)
c_linksImpd_list = supplierCombos.apply(lambda row: fillYear(row['supplier_gvkey'], row['customer_gvkey']), axis = 1)
c_linksImpd_df   = pd.concat(list(c_linksImpd_list))
print(c_linksImpd_df.shape)
print(time.time() - start)

c_linksImpd_df['year'] = c_linksImpd_df.year.dt.year

(65270, 4)
(68771, 4)
70.72920513153076


We had converted some of the na sales values to -5 so that we could deal with duplicated values, by choosing the larger of said values. Switch back to nan so that we are not thrown off when we look for biggest supplier.

In [64]:
c_linksImpd_df.loc[c_linksImpd_df.salecs == -5, 'salecs'] = float('nan')

In [65]:
c_linksImpd_df.salecs

0    111.056
1    111.056
2    104.312
3    146.000
4    205.000
      ...   
1    179.284
0     34.418
0     25.334
0    283.318
1    316.116
Name: salecs, Length: 68771, dtype: float64

In [66]:
industries = pd.read_csv("../../data/companyData/gvkeyIndustries.csv").drop(columns = {'Unnamed: 0'})
print(industries)

       gvkey   indGroup
0       1004  wholesale
1       1082   services
2       1244       manu
3       1258     mining
4       1331       manu
...      ...        ...
25519  34542    finance
25520  34549    finance
25521  34553    finance
25522  34556    finance
25523  34583    finance

[25524 rows x 2 columns]


In [72]:
c_links = c_linksImpd_df.copy()

print(c_links.shape)

print(c_links.head())

industries.columns = ['customer_gvkey','customer_ind']

c_links = c_links.merge(industries)

industries.columns = ['supplier_gvkey','supplier_ind']

c_links = c_links.merge(industries)
print(c_links.head(), c_links.shape)


c_links.to_csv("../../data/companyData/c_links.csv")


(68771, 4)
   year  supplier_gvkey  customer_gvkey   salecs
0  2002          1013.0          2136.0  111.056
1  2003          1013.0          2136.0  111.056
2  2004          1013.0          2136.0  104.312
3  2005          1013.0          2136.0  146.000
4  2006          1013.0          2136.0  205.000
   year  supplier_gvkey  customer_gvkey   salecs        customer_ind  \
0  2002          1013.0          2136.0  111.056  transportUtilities   
1  2003          1013.0          2136.0  111.056  transportUtilities   
2  2004          1013.0          2136.0  104.312  transportUtilities   
3  2005          1013.0          2136.0  146.000  transportUtilities   
4  2006          1013.0          2136.0  205.000  transportUtilities   

  supplier_ind  
0         manu  
1         manu  
2         manu  
3         manu  
4         manu   (67025, 6)


Let's see how all this translates into different industries. Check how many times different industries show up.

The full count will be roughly 4x whatever is below, assuming we can get a match for roughly all of them (which we should be able to).

In [73]:
c_links.head()

Unnamed: 0,year,supplier_gvkey,customer_gvkey,salecs,customer_ind,supplier_ind
0,2002,1013.0,2136.0,111.056,transportUtilities,manu
1,2003,1013.0,2136.0,111.056,transportUtilities,manu
2,2004,1013.0,2136.0,104.312,transportUtilities,manu
3,2005,1013.0,2136.0,146.0,transportUtilities,manu
4,2006,1013.0,2136.0,205.0,transportUtilities,manu


In [74]:
c_links[c_links.year.astype(int) > 2009].supplier_ind.value_counts()

manu                  15708
finance                8285
services               4309
mining                 2897
transportUtilities     1978
wholesale               820
construction            265
retail                  172
agForFish                62
Name: supplier_ind, dtype: int64

Now see if it's common to have one in and one out of the industries of interest. 

For now, let's keep all the different industry types.

We can always filter later if we need to.

In [75]:
sum(c_linksMerge2.year > 2009)

27942

In [81]:
#########################
# get data and reset columns 
gvKey_abiLinkingTable = pd.read_csv('../../data/companyData/linkingTable.csv').drop(columns = ['Unnamed: 0']).\
    drop_duplicates()

base_columns = gvKey_abiLinkingTable.columns 
customer_columns = "customer_" + base_columns
supplier_columns = "supplier_" + base_columns




# merge in customer information
gvKey_abiLinkingTable.columns = customer_columns

print(c_links.shape)
c_linksMerge1 = c_links.merge(gvKey_abiLinkingTable, on ='customer_gvkey')
print(c_links.shape,c_linksMerge1.shape, c_linksMerge1.head())




#########################
# and merge in supplier 
gvKey_abiLinkingTable.columns = supplier_columns

print(c_links.shape)
c_linksMerge2 = c_linksMerge1.merge(gvKey_abiLinkingTable, on ='supplier_gvkey').drop_duplicates()
print(c_linksMerge2.shape)

c_linksMerge2.to_csv("../../data/companyData/clinks_IG_selected.csv")

(67025, 6)
(67025, 6) (47611, 10)    year  supplier_gvkey  customer_gvkey   salecs        customer_ind  \
0  2002          1013.0          2136.0  111.056  transportUtilities   
1  2003          1013.0          2136.0  111.056  transportUtilities   
2  2004          1013.0          2136.0  104.312  transportUtilities   
3  2005          1013.0          2136.0  146.000  transportUtilities   
4  2006          1013.0          2136.0  205.000  transportUtilities   

  supplier_ind customer_cstatCompanies customer_igCompanies  customer_delete  \
0         manu     verizonmmunications  verizonmmunications              NaN   
1         manu     verizonmmunications  verizonmmunications              NaN   
2         manu     verizonmmunications  verizonmmunications              NaN   
3         manu     verizonmmunications  verizonmmunications              NaN   
4         manu     verizonmmunications  verizonmmunications              NaN   

   customer_abi  
0       7564776  
1       7564776 

In [127]:
c_linksMerge2.head()

Unnamed: 0,year,supplier_gvkey,customer_gvkey,salecs,customer_ind,supplier_ind,customer_cstatCompanies,customer_igCompanies,customer_delete,customer_abi,supplier_cstatCompanies,supplier_igCompanies,supplier_delete,supplier_abi
0,2002,1013.0,2136.0,111.056,transportUtilities,manu,verizonmmunications,verizonmmunications,,7564776,adc telecommunications,adc telecommunications,,7523129
1,2003,1013.0,2136.0,111.056,transportUtilities,manu,verizonmmunications,verizonmmunications,,7564776,adc telecommunications,adc telecommunications,,7523129
2,2004,1013.0,2136.0,104.312,transportUtilities,manu,verizonmmunications,verizonmmunications,,7564776,adc telecommunications,adc telecommunications,,7523129
3,2005,1013.0,2136.0,146.0,transportUtilities,manu,verizonmmunications,verizonmmunications,,7564776,adc telecommunications,adc telecommunications,,7523129
4,2006,1013.0,2136.0,205.0,transportUtilities,manu,verizonmmunications,verizonmmunications,,7564776,adc telecommunications,adc telecommunications,,7523129


This is probably because: (1) companies are not in North America, or (2) companies are not in the physical goods industries we're interested in. We can verify this though: look at c_links where both the customer and supplier are in the dataset of interest.

In [83]:
chq     = pd.read_csv("../../data/chq.csv",dtype={'cstatZipcode': 'object'}).drop(columns = {'Unnamed: 0'})

c_linkTest = c_links[c_links.customer_gvkey.isin(chq.gvkey.unique()) & \
                     c_links.supplier_gvkey.isin(chq.gvkey.unique())]

print("Percent of firms with a match: ", c_linksMerge2.shape[0]/c_linkTest.shape[0])

Percent of firms with a match:  0.7092245720040282


Let's check this with the IG headquarters as well.

In [84]:
c_linksMerge2.head()

Unnamed: 0,year,supplier_gvkey,customer_gvkey,salecs,customer_ind,supplier_ind,customer_cstatCompanies,customer_igCompanies,customer_delete,customer_abi,supplier_cstatCompanies,supplier_igCompanies,supplier_delete,supplier_abi
0,2002,1013.0,2136.0,111.056,transportUtilities,manu,verizonmmunications,verizonmmunications,,7564776,adc telecommunications,adc telecommunications,,7523129
1,2003,1013.0,2136.0,111.056,transportUtilities,manu,verizonmmunications,verizonmmunications,,7564776,adc telecommunications,adc telecommunications,,7523129
2,2004,1013.0,2136.0,104.312,transportUtilities,manu,verizonmmunications,verizonmmunications,,7564776,adc telecommunications,adc telecommunications,,7523129
3,2005,1013.0,2136.0,146.0,transportUtilities,manu,verizonmmunications,verizonmmunications,,7564776,adc telecommunications,adc telecommunications,,7523129
4,2006,1013.0,2136.0,205.0,transportUtilities,manu,verizonmmunications,verizonmmunications,,7564776,adc telecommunications,adc telecommunications,,7523129


It's entirely possible that we have too small of a sample from the 2010s alone. Let's just try it though and see how it goes.

First, make a sample with the companies on one year of either side of when it reports another customer.

In [85]:
def makeOneEitherSide(df): 
    yrPlus1 = df.copy(); yrPlus1['year'] += 1
    # yrPlus2 = df.copy(); yrPlus2['year'] += 1
    # yrPlus3 = df.copy(); yrPlus3['year'] += 1
    
    yrMinus1 = df.copy(); yrMinus1['year'] -= 1
    # yrMinus2 = df.copy(); yrMinus2['year'] -= 1
    # yrMinus3 = df.copy(); yrMinus3['year'] -= 1
    
    all = pd.concat([yrPlus1,yrMinus1]) # pd.concat([yrPlus1,yrPlus2,yrPlus3,yrMinus1,yrMinus2,yrMinus3])
    
    return(all)

In [86]:
c_linksMerge2.columns

Index(['year', 'supplier_gvkey', 'customer_gvkey', 'salecs', 'customer_ind',
       'supplier_ind', 'customer_cstatCompanies', 'customer_igCompanies',
       'customer_delete', 'customer_abi', 'supplier_cstatCompanies',
       'supplier_igCompanies', 'supplier_delete', 'supplier_abi'],
      dtype='object')

In [99]:
########
hqsOnly = pd.read_csv("../../data/ig_uniqueHQs.csv").drop(columns = {'Unnamed: 0'})

hq = pd.read_csv("../../data/ig_uniqueHQs_multLocations.csv").\
    drop(columns = {'Unnamed: 0'}).\
    rename(columns = {'archive_version_year': 'year'})

hq['year'] = hq.year.astype('int64')

hqRelevant = hq[hq.abi.isin(allAbi)]

In [103]:
scTableSuppliers = c_linksMerge2.copy()[['year','supplier_gvkey','supplier_abi','supplier_ind']].drop_duplicates()

print(scTableSuppliers.shape)
# allSupplierData = makeOneEitherSide(scTableSuppliers)
allSupplierData         = scTableSuppliers.copy()
allSupplierData.columns = ['year','gvkey','abi','ind']


allAbi = allSupplierData.abi.drop_duplicates() # allCustomerData.abi.append(

allSupplierData = allSupplierData.merge(hqRelevant).drop_duplicates()

print(allSupplierData.shape)

allSupplierData['supplier_coordinate'] = ''

for i in range(0, allSupplierData.shape[0]):
    allSupplierData.supplier_coordinate[i] = (allSupplierData.latitude[i], allSupplierData.longitude[i])
    
print(allSupplierData.head())
allSupplierData.to_csv("../../data/companyData/allSupplierData.csv")

(16466, 4)
(13223, 15)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


   year   gvkey      abi   ind ticker                     company state  \
0  2003  1013.0  7523129  manu    NaN  ADC TELECOMMUNICATIONS INC    MN   
1  2004  1013.0  7523129  manu    NaN  ADC TELECOMMUNICATIONS INC    MN   
2  2005  1013.0  7523129  manu    NaN  ADC TELECOMMUNICATIONS INC    MN   
3  2006  1013.0  7523129  manu    NaN  ADC TELECOMMUNICATIONS INC    MN   
4  2007  1013.0  7523129  manu    NaN  ADC TELECOMMUNICATIONS INC    MN   

           city       address_line_1  zipcode  latitude  longitude  \
0  EDEN PRAIRIE  13625 TECHNOLOGY DR    55344  44.85645  -93.45199   
1  EDEN PRAIRIE  13625 TECHNOLOGY DR    55344  44.85645  -93.45199   
2  EDEN PRAIRIE  13625 TECHNOLOGY DR    55344  44.85645  -93.45199   
3  EDEN PRAIRIE  13625 TECHNOLOGY DR    55344  44.85645  -93.45199   
4  EDEN PRAIRIE  13625 TECHNOLOGY DR    55344  44.85645  -93.45199   

   parent_employee_size_code  location_employee_size_code  \
0                      250.0                        250.0   
1     

In [104]:
scTableCustomers = c_linksMerge2.copy()[['year','customer_gvkey','customer_abi','customer_ind']].drop_duplicates()

print(scTableCustomers.shape)
# allCustomerData = makeOneEitherSide(scTableCustomers)
allCustomerData         = scTableCustomers.copy()
allCustomerData.columns = ['year','gvkey','abi','ind']


allAbi = allCustomerData.abi.drop_duplicates() # allCustomerData.abi.append(

allCustomerData = allCustomerData.merge(hqRelevant).drop_duplicates()

print(allCustomerData.shape)

allCustomerData['customer_coordinate'] = ''

for i in range(0, allCustomerData.shape[0]):
    allCustomerData.customer_coordinate[i] = (allCustomerData.latitude[i], allCustomerData.longitude[i])
    
print(allCustomerData.head())
allCustomerData.to_csv("../../data/companyData/allCustomerData.csv")

(9861, 4)
(3067, 15)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


   year     gvkey        abi                 ind ticker  \
0  2012    4093.0    7511868  transportUtilities    DUK   
1  2012  184321.0  402704137              mining    NaN   
2  2009    3036.0  402233076  transportUtilities    CBB   
3  2010    3036.0  402233076  transportUtilities    CBB   
4  2003   10984.0  800138737  transportUtilities      S   

                         company state           city         address_line_1  \
0               DUKE ENERGY CORP    NC      CHARLOTTE         550 S TRYON ST   
1  CHESAPEAKE MIDSTREAM PARTNERS    OK  OKLAHOMA CITY         900 NW 63RD ST   
2            CINCINNATI BELL INC    OH     CINCINNATI           221 E 4TH ST   
3            CINCINNATI BELL INC    OH     CINCINNATI           221 E 4TH ST   
4      NEXTEL COMMUNICATIONS INC    VA         RESTON  2001 EDMUND HALLEY DR   

   zipcode  latitude  longitude  parent_employee_size_code  \
0    28202  35.22391  -80.84810                     1000.0   
1    73118  35.51957  -97.53096         

In [122]:
points0 = (allSupplierData.latitude[0], allSupplierData.longitude[0])
points1 = (allSupplierData.latitude[1], allSupplierData.longitude[1])

geopy.distance.geodesic(points0,points1).mi

0.0

Previously we had done this with the customers as well, but we lose some percentage of the observations if we again try to match on the IG data, so for now just focus on the suppliers.

## Find Customer and Supplier pairings and merge with change data
### Can pick up here

In [23]:
allSupplierData = pd.read_csv("../../data/companyData/allSupplierData.csv").\
    drop(columns = ['Unnamed: 0'])[['gvkey', 'ind', 'year','zipcode']]
print(allSupplierData.shape, allSupplierData.columns)

(15313, 4) Index(['gvkey', 'ind', 'year', 'zipcode'], dtype='object')


In [24]:
allSupplierData.head()

Unnamed: 0,gvkey,ind,year,zipcode
0,1013.0,manu,2003,55344
1,1013.0,manu,2004,55344
2,1013.0,manu,2005,55344
3,1013.0,manu,2006,55344
4,1013.0,manu,2007,55344


## Get first-hop SC data

In [124]:
c_links = pd.read_csv("../../data/companyData/clinks_IG_selected.csv").drop(columns = {'Unnamed: 0'})

print(c_links.shape)

c_links.head()

(35213, 14)


Unnamed: 0,year,supplier_gvkey,customer_gvkey,salecs,customer_ind,supplier_ind,customer_cstatCompanies,customer_igCompanies,customer_delete,customer_abi,supplier_cstatCompanies,supplier_igCompanies,supplier_delete,supplier_abi
0,2002,1013.0,2136.0,111.056,transportUtilities,manu,verizonmmunications,verizonmmunications,,7564776,adc telecommunications,adc telecommunications,,7523129
1,2003,1013.0,2136.0,111.056,transportUtilities,manu,verizonmmunications,verizonmmunications,,7564776,adc telecommunications,adc telecommunications,,7523129
2,2004,1013.0,2136.0,104.312,transportUtilities,manu,verizonmmunications,verizonmmunications,,7564776,adc telecommunications,adc telecommunications,,7523129
3,2005,1013.0,2136.0,146.0,transportUtilities,manu,verizonmmunications,verizonmmunications,,7564776,adc telecommunications,adc telecommunications,,7523129
4,2006,1013.0,2136.0,205.0,transportUtilities,manu,verizonmmunications,verizonmmunications,,7564776,adc telecommunications,adc telecommunications,,7523129


In [126]:
c_links.columns

Index(['year', 'supplier_gvkey', 'customer_gvkey', 'salecs', 'customer_ind',
       'supplier_ind', 'customer_cstatCompanies', 'customer_igCompanies',
       'customer_delete', 'customer_abi', 'supplier_cstatCompanies',
       'supplier_igCompanies', 'supplier_delete', 'supplier_abi', 'suppliers'],
      dtype='object')

In [128]:
c_links['suppliers'] = 1
custExp = c_links[['year', 'customer_gvkey', 'salecs','suppliers']].groupby(['year','customer_gvkey']).sum().\
    reset_index().rename(columns = {'salecs': 'totalExp'})

custExp.head()



Unnamed: 0,year,customer_gvkey,totalExp,suppliers
0,2000,1038.0,38.22,2
1,2000,1045.0,38.093,4
2,2000,1078.0,5.07,2
3,2000,1121.0,7.883,1
4,2000,1177.0,284.677,1


In [107]:
print("Number of firms with no exp information and multiple suppliers: ", \
          sum(custExp[custExp.totalExp == 0].suppliers > 1))
print("Number of firms with no exp information and >5 suppliers: ", \
          sum(custExp[custExp.totalExp == 0].suppliers > 5))


Number of firms with no exp information and multiple suppliers:  210
Number of firms with no exp information and >5 suppliers:  6


Most of these firms have expenditure information. We can look at:
    - Expenditure-weighted (just do equal shares if no exp information)
    - Largest supplier
    
    
Our focus is going to be on the economic data of the customers, so isolate for the customers here.

In [120]:
c_links.head()

Unnamed: 0,year,supplier_gvkey,customer_gvkey,salecs,customer_ind,supplier_ind,customer_cstatCompanies,customer_igCompanies,customer_delete,customer_abi,supplier_cstatCompanies,supplier_igCompanies,supplier_delete,supplier_abi,suppliers
0,2002,1013.0,2136.0,111.056,transportUtilities,manu,verizonmmunications,verizonmmunications,,7564776,adc telecommunications,adc telecommunications,,7523129,1
1,2003,1013.0,2136.0,111.056,transportUtilities,manu,verizonmmunications,verizonmmunications,,7564776,adc telecommunications,adc telecommunications,,7523129,1
2,2004,1013.0,2136.0,104.312,transportUtilities,manu,verizonmmunications,verizonmmunications,,7564776,adc telecommunications,adc telecommunications,,7523129,1
3,2005,1013.0,2136.0,146.0,transportUtilities,manu,verizonmmunications,verizonmmunications,,7564776,adc telecommunications,adc telecommunications,,7523129,1
4,2006,1013.0,2136.0,205.0,transportUtilities,manu,verizonmmunications,verizonmmunications,,7564776,adc telecommunications,adc telecommunications,,7523129,1


In [129]:
customerDB = c_links[['year','customer_gvkey','supplier_gvkey','salecs']].\
    merge(custExp).rename(columns = {'customer_gvkey': 'gvkey'}).drop_duplicates()
print(customerDB.shape)

customerDB.head()

(34007, 6)


Unnamed: 0,year,gvkey,supplier_gvkey,salecs,totalExp,suppliers
0,2002,2136.0,1013.0,111.056,892.202,13
1,2002,2136.0,3275.0,8.398,892.202,13
2,2002,2136.0,10286.0,16.987,892.202,13
3,2002,2136.0,10420.0,229.158,892.202,13
4,2002,2136.0,14340.0,9.432,892.202,13


## Merge in supplier weather
Get the weather data.

In [30]:
averages.head()

Unnamed: 0,zipcode,quarterly_avg_precip,quarterly_median_precip,quarterly_variance_precip,quarterly_avg_temp,quarterly_median_temp,quarterly_variance_temp,qtr
0,1001,2.866177,0.0,48.05355,4.253855,3.946,42.033604,1.0
1,1001,3.601853,0.0,75.610613,20.817859,21.287,49.057586,2.0
2,1001,3.468813,0.0,88.529803,26.791997,27.302,20.188445,3.0
3,1001,3.435141,0.0,74.436349,10.708047,10.7795,55.548389,4.0
4,1013,2.755833,0.0,47.911696,3.964976,3.607,42.290685,1.0


In [109]:
streaks_withLags = pd.read_csv("../../data/companyData/streaks_withLags.csv").\
    drop(columns = {'Unnamed: 0', 'yearQtr'}).astype({'year':       'category',
                           'qtr':        'category',
                           'zipcode':    'category'})


g = pd.read_csv("../../data/companyData/weatherByEstablishment.csv").\
    drop(columns = {"Unnamed: 0"})


allWeather_withLags = pd.read_csv("../../data/companyData/allWeather_withLags.csv").\
    drop(columns = {"Unnamed: 0", 'yearQtr'}).astype({'year':       'category',
                           'qtr':        'category',
                           'zipcode':    'category'})


averages = pd.read_csv("../../data/companyData/quarterlyStatsByZip.csv").\
    drop(columns = {"Unnamed: 0"}).rename(columns = {'ZIP': 'zipcode'})
averages['qtr'] = averages.quarter.str.slice(1,2).astype('float')
averages.drop(columns = {'quarter'}, inplace = True) 
averages = averages.astype({'qtr':        'category',
                           'zipcode':    'category'})


allWeather_withLags2 = pd.read_csv("../../data/companyData/allWeather_withLags_new.csv").\
    drop(columns = {"Unnamed: 0", 'yearQtr'}).astype({'year':       'category',
                           'qtr':        'category',
                           'zipcode':    'category'})


thunderstorms_withLags = pd.read_csv("../../data/companyData/thunderstorms_withLags.csv").\
    drop(columns = {"Unnamed: 0", 'yearQtr'}).astype({'year':       'category',
                           'qtr':        'category',
                           'zipcode':    'category'})


In [110]:
allWeather_withLags2.head()

Unnamed: 0,zipcode,year,qtr,precip_zipWeek50_90,precip_zipWeek90_95,precip_zipWeek95_99,precip_zipWeek99_1,precip_zipMonth50_90,precip_zipMonth90_95,precip_zipMonth95_99,...,lag4_temp_zipMonth50_90,lag4_temp_zipMonth90_95,lag4_temp_zipMonth95_99,lag4_temp_zipMonth99_1,lag4_temp_zipQuarter50_90,lag4_temp_zipQuarter90_95,lag4_temp_zipQuarter95_99,lag4_temp_zipQuarter99_1,lag4_days90Plus,lag4_streak90Plus
0,10001,2010,1,7.0,3.0,3.0,3.0,2.0,2.0,2.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
1,10001,2010,2,5.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
2,10001,2010,3,5.0,1.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,0
3,10001,2010,4,7.0,1.0,1.0,1.0,2.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
4,10001,2011,1,8.0,4.0,3.0,0.0,3.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0,0


The weather data, for now at least, is only back until 2009. We can potentially change this but for now it fits the ``last ten years" theme.

In [131]:
suppliersWithWeather = allSupplierData[allSupplierData.year > 2009].\
    merge(streaks_withLags).merge(allWeather_withLags).merge(averages).\
    merge(g).merge(thunderstorms_withLags).merge(thunderstorms_withLags).merge(allWeather_withLags2)


In [135]:
suppliersWithWeather.head()

Unnamed: 0,year,gvkey,abi,ind,ticker,company,state,city,address_line_1,zipcode,...,lag4_temp_zipMonth50_90,lag4_temp_zipMonth90_95,lag4_temp_zipMonth95_99,lag4_temp_zipMonth99_1,lag4_temp_zipQuarter50_90,lag4_temp_zipQuarter90_95,lag4_temp_zipQuarter95_99,lag4_temp_zipQuarter99_1,lag4_days90Plus,lag4_streak90Plus
0,2010,1013.0,7523129,manu,,ADC TELECOMMUNICATIONS INC,MN,EDEN PRAIRIE,13625 TECHNOLOGY DR,55344,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
1,2010,66588.0,408013506,finance,SRDX,SUR MODICS INC,MN,EDEN PRAIRIE,9924 W 74TH ST,55344,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
2,2010,113362.0,981328305,wholesale,,DIGITAL RIVER INC,MN,EDEN PRAIRIE,9625 W 76TH ST # 150,55344,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
3,2011,66588.0,408013506,finance,SRDX,SUR MODICS INC,MN,EDEN PRAIRIE,9924 W 74TH ST,55344,...,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0,0
4,2011,113362.0,981328305,wholesale,,DIGITAL RIVER INC,MN,EDEN PRAIRIE,9625 W 76TH ST # 150,55344,...,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0,0


In [133]:
suppliersWithWeather.columns[0:50]

Index(['year', 'gvkey', 'abi', 'ind', 'ticker', 'company', 'state', 'city',
       'address_line_1', 'zipcode', 'latitude', 'longitude',
       'parent_employee_size_code', 'location_employee_size_code',
       'employeesAtLocation', 'supplier_coordinate', 'qtr', 'hotStreak',
       'wetStreak', 'lag1_hotStreak', 'lag1_wetStreak', 'lag2_hotStreak',
       'lag2_wetStreak', 'lag3_hotStreak', 'lag3_wetStreak', 'lag4_hotStreak',
       'lag4_wetStreak', 'precip_annualquant_0.95', 'precip_annualquant_1xQtr',
       'precip_annualquant_1xYr', 'precip_annualquant_1x5Qtrs',
       'precip_annualquant_1x10Qtrs', 'precip_annualquant_1x5Yrs',
       'precip_annualquant_1x10Yrs', 'precip_zipquant_0.95',
       'precip_zipquant_1xQtr', 'precip_zipquant_1xYr',
       'precip_zipquant_1x5Qtrs', 'precip_zipquant_1x10Qtrs',
       'precip_zipquant_1x5Yrs', 'precip_zipquant_1x10Yrs',
       'precip_zipQuarterquant_0.95', 'precip_zipQuarterquant_1xQtr',
       'precip_zipQuarterquant_1xYr', 'precip_zipQ

In [134]:
print("supplier: ", suppliersWithWeather.shape)

suppliersWithWeather.to_csv("../../data/companyData/suppliersWithWeather.csv")

suppliersWithWeather.head()

supplier:  (29772, 688)


Unnamed: 0,year,gvkey,abi,ind,ticker,company,state,city,address_line_1,zipcode,...,lag4_temp_zipMonth50_90,lag4_temp_zipMonth90_95,lag4_temp_zipMonth95_99,lag4_temp_zipMonth99_1,lag4_temp_zipQuarter50_90,lag4_temp_zipQuarter90_95,lag4_temp_zipQuarter95_99,lag4_temp_zipQuarter99_1,lag4_days90Plus,lag4_streak90Plus
0,2010,1013.0,7523129,manu,,ADC TELECOMMUNICATIONS INC,MN,EDEN PRAIRIE,13625 TECHNOLOGY DR,55344,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
1,2010,66588.0,408013506,finance,SRDX,SUR MODICS INC,MN,EDEN PRAIRIE,9924 W 74TH ST,55344,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
2,2010,113362.0,981328305,wholesale,,DIGITAL RIVER INC,MN,EDEN PRAIRIE,9625 W 76TH ST # 150,55344,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
3,2011,66588.0,408013506,finance,SRDX,SUR MODICS INC,MN,EDEN PRAIRIE,9924 W 74TH ST,55344,...,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0,0
4,2011,113362.0,981328305,wholesale,,DIGITAL RIVER INC,MN,EDEN PRAIRIE,9625 W 76TH ST # 150,55344,...,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0,0


## Any Supplier
Focus on weather of all suppliers. First rename the suppliers data.

In [140]:
relevantVars = [x for x in suppliersWithWeather.columns if (('year' in x) | 
                                                 ('qtr' in x)        |
                                                 ('gvkey' in x)      |
                                                 ('famafrench' in x) |
                                                 ('Streak' in x)     | 
                                                 ('_' in x)) & 
                                                ~('roa_lagged' in x) & 
                                                ~('yearQtr' in x)]

suppliers_toMerge = suppliersWithWeather[relevantVars]


for colname in suppliers_toMerge.columns[4:]:
    suppliers_toMerge.rename(columns = {colname: 'supplier_' + colname}, inplace = True)

    
suppliers_toMerge.rename(columns = {'gvkey': 'supplier_gvkey',
                                   'famafrench': 'supplier_famafrench',
                                   'supplier_supplier_coordinate': 'supplier_coordinate'},inplace = True)    

print(suppliers_toMerge.columns)


Index(['year', 'supplier_gvkey', 'address_line_1', 'parent_employee_size_code',
       'supplier_location_employee_size_code', 'supplier_coordinate',
       'supplier_qtr', 'supplier_hotStreak', 'supplier_wetStreak',
       'supplier_lag1_hotStreak',
       ...
       'supplier_lag4_temp_zipMonth50_90', 'supplier_lag4_temp_zipMonth90_95',
       'supplier_lag4_temp_zipMonth95_99', 'supplier_lag4_temp_zipMonth99_1',
       'supplier_lag4_temp_zipQuarter50_90',
       'supplier_lag4_temp_zipQuarter90_95',
       'supplier_lag4_temp_zipQuarter95_99',
       'supplier_lag4_temp_zipQuarter99_1', 'supplier_lag4_days90Plus',
       'supplier_lag4_streak90Plus'],
      dtype='object', length=673)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Get the change data and merge with the customers in the customer-supplier dataset.

In [141]:
customerDB.head()

Unnamed: 0,year,gvkey,supplier_gvkey,salecs,totalExp,suppliers
0,2002,2136.0,1013.0,111.056,892.202,13
1,2002,2136.0,3275.0,8.398,892.202,13
2,2002,2136.0,10286.0,16.987,892.202,13
3,2002,2136.0,10420.0,229.158,892.202,13
4,2002,2136.0,14340.0,9.432,892.202,13


In [138]:
changes = pd.read_csv("../../data/companyData/compustatChanges_withControls.csv").drop(columns = ['Unnamed: 0'])
print(changes.columns)
customers = changes.merge(customerDB)
print(customers.shape)

Index(['gvkey', 'datadate', 'year', 'qtr', 'companyName', 'curcdq', 'assets',
       'cash', 'costGoodsSold', 'totalInv', 'netIncome', 'opInc_afDep',
       'opInc_befDep', 'totalRevenue', 'costat', 'priceClose', 'add1',
       'addzip', 'city', 'state', 'assetsLast', 'netIncomeLast',
       'totalRevenueLast', 'costGoodsSoldLast', 'totalInvLast',
       'opInc_afDepLast', 'opInc_befDepLast', 'priceCloseLast', 'cashLast',
       'incomeChange', 'revenueChange', 'costChange', 'inventoryChange',
       'opInc_afDepChange', 'opInc_befDepChange', 'priceCloseChange',
       'assetsPrev', 'assetsLagged', 'netIncomeLagged', 'roa_lagged',
       'famafrench', 'sic2', 'indGroup', 'earliestYear', 'ageTercile',
       'sizeTercile', 'profitTercile', 'datacqtr', 'datafqtr', 'fyr', 'DATE'],
      dtype='object')
(89009, 55)


Now put in the customer hq thing, so we can filter out customer-supplier pairs that are within x miles of each other.

In [144]:
customerCoordinates = pd.read_csv("../../data/companyData/allCustomerData.csv").\
    drop(columns = ['Unnamed: 0'])[['year','gvkey','customer_coordinate']]

print(customerCoordinates.head())

   year     gvkey    customer_coordinate
0  2012    4093.0   (35.22391, -80.8481)
1  2012  184321.0  (35.51957, -97.53096)
2  2009    3036.0  (39.10039, -84.50817)
3  2010    3036.0  (39.10039, -84.50817)
4  2003   10984.0  (38.95098, -77.35967)


In [145]:
customers.head()

Unnamed: 0,gvkey,datadate,year,qtr,companyName,curcdq,assets,cash,costGoodsSold,totalInv,...,sizeTercile,profitTercile,datacqtr,datafqtr,fyr,DATE,supplier_gvkey,salecs,totalExp,suppliers
0,1038,20000331,2000,1,AMC ENTERTAINMENT INC -OLD,USD,1771.08189,,363.530609,0.0,...,2.0,1.0,2000Q1,1999Q4,3,1970-01-01 00:00:00.020000331,65710.0,38.22,38.22,2
1,1038,20000331,2000,1,AMC ENTERTAINMENT INC -OLD,USD,1771.08189,,363.530609,0.0,...,2.0,1.0,2000Q1,1999Q4,3,1970-01-01 00:00:00.020000331,1082.0,,38.22,2
2,1038,20000630,2000,2,AMC ENTERTAINMENT INC -OLD,USD,1655.001796,,380.263074,0.0,...,2.0,1.0,2000Q2,2000Q1,3,1970-01-01 00:00:00.020000630,65710.0,38.22,38.22,2
3,1038,20000630,2000,2,AMC ENTERTAINMENT INC -OLD,USD,1655.001796,,380.263074,0.0,...,2.0,1.0,2000Q2,2000Q1,3,1970-01-01 00:00:00.020000630,1082.0,,38.22,2
4,1038,20000930,2000,3,AMC ENTERTAINMENT INC -OLD,USD,1655.367114,,412.768458,0.0,...,2.0,2.0,2000Q3,2000Q2,3,1970-01-01 00:00:00.020000930,65710.0,38.22,38.22,2


In [149]:
customers.shape

(89009, 55)

In [146]:
allSuppliersWithWeather = customers.merge(customerCoordinates).merge(suppliers_toMerge)

In [42]:
allSuppliersWithWeather.to_csv("../../data/companyData/allSuppliersWithWeather.csv")

In [None]:
# get customer and supplier data on its own

In [148]:
allSuppliersWithWeather.shape

(72300, 727)

### stock data
merge in weather at the location of any supplier. First, get the supplier names only.

In [None]:
allSupplierData = pd.read_csv("../../data/companyData/allSupplierData.csv").\
    drop(columns = ['Unnamed: 0'])[['gvkey', 'famafrench', 'year','zipcode']]
print(allSupplierData.shape, allSupplierData.columns)

allSupplierData.head()

And merge with the daily weather data.

In [None]:
allWeather = pd.read_csv("../../data/companyData/stockWeather_zipQuarterQuants.csv").\
    drop(columns = {'Unnamed: 0'})

allWeather = allWeather[~allWeather.temp_zipQuarterLast5.isna()].reset_index(drop = True)

allWeather['date'] = pd.to_datetime(allWeather['date'],
                                   format = "%Y-%m-%d")

allWeather.rename(columns = {'ZIP': 'zipcode'}, inplace = True)
print(allWeather.dtypes)
allWeather.head()


allWeather['year'] = allWeather.date.dt.year

supplierWeather = allSupplierData.merge(allWeather)

In [None]:
supplierWeather.head()

In [None]:
supplierWeather.drop(columns = {'zipcode'},inplace = True)
supplierWeather.rename(columns = {'gvkey': 'supplier_gvkey','famafrench': 'supplier_famafrench'},inplace = True)

Now get the actual stock data.

In [None]:
with open('../../data/stockReturns.pkl', 'rb') as f:
    stocks = pkl.load(f)[['date','gvkey','RET']]

stocks = stocks[stocks.date.dt.year > 2008]

stocks['qtr']  = stocks.date.dt.quarter
stocks['year'] = stocks.date.dt.year

stocks = stocks[~stocks.gvkey.isna()]
stocks['gvkey'] = stocks['gvkey'].astype(int)
print(stocks.shape)

stocks.head()

The controls data.

In [None]:
igChanges = pd.read_csv("../../data/companyData/igData.csv").\
    drop(columns = {'Unnamed: 0'})
igChanges.head()

companyControls = igChanges[['gvkey','year','qtr','famafrench','ageTercile','sizeTercile','profitTercile','zipcode']]
companyControls.head()

And merge it all together.

In [None]:
customerDB['gvkey'] = customerDB.gvkey.astype('int64')

In [None]:
supplierStocks = customerDB.merge(companyControls).merge(stocks).merge(supplierWeather)
print(supplierStocks.shape)

supplierStocks.head()

In [None]:
supplierStocks.to_csv("../../data/companyData/supplierStocks.csv")

## Biggest Supplier
Focus on weather of biggest supplier.

First find the max by supplier. Add back in any rows with only 1 supplier.

In [43]:
customerDB.shape

(46521, 6)

In [44]:
# https://stackoverflow.com/questions/15705630/get-the-rows-which-have-the-max-value-in-groups-using-groupby
idx = customerDB.groupby(['year','gvkey']).salecs.\
    transform(max) == customerDB.salecs
largestSuppliers = customerDB[idx].reset_index(drop = True)
print(largestSuppliers.shape)

# find companies who only have one other supplier
singleSuppliers = customerDB[customerDB.suppliers == 1].reset_index(drop = True)
print(singleSuppliers.shape)

# find largest suppliers of different companies
largestSuppliers = largestSuppliers.append(singleSuppliers).drop_duplicates()
print(largestSuppliers.shape)



(12219, 6)
(8439, 6)
(14641, 6)


In [45]:
largestSuppliers.head()

Unnamed: 0,year,gvkey,supplier_gvkey,salecs,totalExp,suppliers
0,2002,2136.0,10420.0,229.158,892.202,13
1,2003,2136.0,13440.0,214.0,1100.784,14
2,2004,2136.0,10420.0,332.586,1445.879,21
3,2005,2136.0,10420.0,508.518,2251.035,32
4,2006,2136.0,10420.0,551.124,2563.791,31


Merge in the change data for that gvkey.

In [46]:
relevantVars = [x for x in suppliersWithWeather.columns if (('year' in x) | 
                                                 ('qtr' in x) |
                                                 ('gvkey' in x) |
                                                 ('famafrench' in x) |
                                                 ('Streak' in x)     |
                                                 ('_' in x)) & 
                                                ~('roa_lagged' in x) & 
                                                ~('yearQtr' in x)]

suppliers_toMerge = suppliersWithWeather[relevantVars]


for colname in suppliers_toMerge.columns[4:]:
    suppliers_toMerge.rename(columns = {colname: 'supplier_' + colname}, inplace = True)

    
suppliers_toMerge.rename(columns = {'gvkey': 'supplier_gvkey',
                                   'famafrench': 'supplier_famafrench'},inplace = True)    

print(suppliers_toMerge.columns)


'''suppliers_toMerge = suppliersWithWeather[['year','qtr','gvkey','tmax_quant_1.0','precip_quant_1.0']].\
    rename(columns = {'gvkey': 'supplier_gvkey',
                      'tmax_quant_1.0': 'supplier_tmax_quant_1.0',
                      'precip_quant_1.0': 'supplier_precip_quant_1.0'})'''

Index(['supplier_gvkey', 'year', 'qtr', 'hotStreak', 'supplier_wetStreak',
       'supplier_lag1_hotStreak', 'supplier_lag1_wetStreak',
       'supplier_lag2_hotStreak', 'supplier_lag2_wetStreak',
       'supplier_lag3_hotStreak',
       ...
       'supplier_lag4_temp_zipMonth50_90', 'supplier_lag4_temp_zipMonth90_95',
       'supplier_lag4_temp_zipMonth95_99', 'supplier_lag4_temp_zipMonth99_1',
       'supplier_lag4_temp_zipQuarter50_90',
       'supplier_lag4_temp_zipQuarter90_95',
       'supplier_lag4_temp_zipQuarter95_99',
       'supplier_lag4_temp_zipQuarter99_1', 'supplier_lag4_days90Plus',
       'supplier_lag4_streak90Plus'],
      dtype='object', length=669)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


"suppliers_toMerge = suppliersWithWeather[['year','qtr','gvkey','tmax_quant_1.0','precip_quant_1.0']].    rename(columns = {'gvkey': 'supplier_gvkey',\n                      'tmax_quant_1.0': 'supplier_tmax_quant_1.0',\n                      'precip_quant_1.0': 'supplier_precip_quant_1.0'})"

In [47]:
changes = pd.read_csv("../../data/companyData/compustatChanges_withControls.csv").drop(columns = ['Unnamed: 0'])
print(changes.columns)
customers = changes.merge(largestSuppliers) # gvkey is on the customer here
print(customers.shape)

Index(['gvkey', 'datadate', 'year', 'qtr', 'companyName', 'curcdq', 'assets',
       'cash', 'costGoodsSold', 'totalInv', 'netIncome', 'opInc_afDep',
       'opInc_befDep', 'totalRevenue', 'costat', 'priceClose', 'add1',
       'addzip', 'city', 'state', 'assetsLast', 'netIncomeLast',
       'totalRevenueLast', 'costGoodsSoldLast', 'totalInvLast',
       'opInc_afDepLast', 'opInc_befDepLast', 'priceCloseLast', 'cashLast',
       'incomeChange', 'revenueChange', 'costChange', 'inventoryChange',
       'opInc_afDepChange', 'opInc_befDepChange', 'priceCloseChange',
       'assetsPrev', 'assetsLagged', 'netIncomeLagged', 'roa_lagged',
       'famafrench', 'sic2', 'indGroup', 'earliestYear', 'ageTercile',
       'sizeTercile', 'profitTercile', 'datacqtr', 'datafqtr', 'fyr', 'DATE'],
      dtype='object')
(42076, 55)


In [48]:
largestSuppliersWithWeather = customers.merge(largestSuppliers[['year', 'gvkey', 'supplier_gvkey']]).merge(suppliers_toMerge)
largestSuppliersWithWeather.shape

(18792, 721)

In [49]:
for col in largestSuppliersWithWeather.columns:
    print(col)

gvkey
datadate
year
qtr
companyName
curcdq
assets
cash
costGoodsSold
totalInv
netIncome
opInc_afDep
opInc_befDep
totalRevenue
costat
priceClose
add1
addzip
city
state
assetsLast
netIncomeLast
totalRevenueLast
costGoodsSoldLast
totalInvLast
opInc_afDepLast
opInc_befDepLast
priceCloseLast
cashLast
incomeChange
revenueChange
costChange
inventoryChange
opInc_afDepChange
opInc_befDepChange
priceCloseChange
assetsPrev
assetsLagged
netIncomeLagged
roa_lagged
famafrench
sic2
indGroup
earliestYear
ageTercile
sizeTercile
profitTercile
datacqtr
datafqtr
fyr
DATE
supplier_gvkey
salecs
totalExp
suppliers
hotStreak
supplier_wetStreak
supplier_lag1_hotStreak
supplier_lag1_wetStreak
supplier_lag2_hotStreak
supplier_lag2_wetStreak
supplier_lag3_hotStreak
supplier_lag3_wetStreak
supplier_lag4_hotStreak
supplier_lag4_wetStreak
supplier_precip_annualquant_0.95
supplier_precip_annualquant_1xQtr
supplier_precip_annualquant_1xYr
supplier_precip_annualquant_1x5Qtrs
supplier_precip_annualquant_1x10Qtrs
supplie

In [50]:
largestSuppliersWithWeather.to_csv("../../data/companyData/largestSuppliersWithWeather.csv")

## Sales-Weighted Average
If a company doesn't have sales-specific information, then assume equal shares. This doesn't happen for too many of the companies, thankfully.

In [None]:
customerDB = c_links[['year','customer_gvkey','supplier_gvkey','salecs']].\
    merge(custExp).rename(columns = {'customer_gvkey': 'gvkey'}).drop_duplicates()

customerDB['salesWeight'] = customerDB.salecs/customerDB.totalExp

customerDB.fillna(1, inplace = True)

Now merge this with the supplier weather data, and use the sales weights to find a sales-weighted average of the weather conditions for the suppliers.

In [None]:
relevantVars = [x for x in suppliersWithWeather.columns if (('year' in x) | 
                                                 ('qtr' in x) |
                                                 ('gvkey' in x) |
                                                 ('_' in x)) & 
                                                ~('roa_lagged' in x) & 
                                                ~('yearQtr' in x)]

suppliers_toMerge = suppliersWithWeather[relevantVars]


for colname in suppliers_toMerge.columns[3:]:
    suppliers_toMerge.rename(columns = {colname: 'supplier_' + colname}, inplace = True)

    
suppliers_toMerge.rename(columns = {'gvkey': 'supplier_gvkey'},inplace = True)    


In [None]:
suppliers_toMerge.head()

For each of the supplier weather columns, multiply the variable by the fraction of sales attributable to that relationship.

In [None]:
supplierWeather = customerDB[['year','gvkey','supplier_gvkey','salesWeight']].merge(suppliers_toMerge)

for col in supplierWeather.columns[7:]:
        supplierWeather[col]   = supplierWeather.salesWeight*supplierWeather[col]
        
        

supplierWeather.drop(columns = {'supplier_gvkey','salesWeight'}, inplace = True)


print(supplierWeather.head())



# [['year','qtr','gvkey','supplier_tmax_quant_1.0','supplier_precip_quant_1.0']]

In [None]:
supplierWtdAvgWeather = supplierWeather.groupby(['year','qtr','gvkey']).sum().reset_index().drop_duplicates()

In [None]:
supplierWtdAvgWeather.gvkey.unique()

Merge the supplier weighted average weather data with the customer data that has weather as well.

In [None]:
customersWithWeather.head()

In [None]:
wtdAvgSuppliers = customersWithWeather.merge(supplierWtdAvgWeather)

wtdAvgSuppliers.head()

In [None]:
wtdAvgSuppliers.shape

In [None]:
wtdAvgSuppliers.to_csv("../../data/companyData/wtdAvgSuppliers.csv")

In [None]:
wtdAvgSuppliers.head()

In [None]:
wtdAvgSuppliers.columns[wtdAvgSuppliers.columns.str.contains('Tercile')]