In [2]:
import pickle as pkl
import pandas as pd
import matplotlib
import os
import re

import collections
import datetime
import time

import geopandas as gpd

import numpy as np

import rasterio

import spacy

import gc



nlp = spacy.load('en_core_web_lg')
from sklearn.metrics.pairwise import cosine_similarity

import itertools

# Get Data

## Changes from year to year

In [None]:
changes = pd.read_csv("../../data/compustatChanges_all.csv").drop(columns = ['Unnamed: 0'])

changes.head()

In [None]:
otherControls = pd.read_csv('../../data/companyData/otherControls.csv').\
    drop(columns = {'Unnamed: 0', 'fyearq'}).rename(columns = {'year_toMatchOn': 'year',
                                                              'fqtr': 'qtr'})

otherControls.head()

In [None]:
otherControls.head()

In [None]:
print(changes.shape)
changes = changes.merge(otherControls)
print(changes.shape)


industries = changes[['gvkey','famafrench']].drop_duplicates()

In [None]:
'''changes.to_csv("../../data/companyData/compustatChanges_withControls.csv")
changes.head()'''

In [None]:
changes = pd.read_csv("../../data/companyData/compustatChanges_withControls.csv").drop(columns = {'Unnamed: 0',
                                                                                                 'Unnamed: 0.1',
                                                                                                 'Unnamed: 0.1.1'})
changes.head()

Put in the calendar quarters and fiscal quarter data.

In [None]:
quarters = pd.read_csv("../../data/companyData/fiscalYears.csv")
quarters.head()

In [None]:
len(quarters.gvkey.unique())

In [None]:
sum((quarters.fyr == 12) | 
   (quarters.fyr == 3) | 
   (quarters.fyr == 6) | 
   (quarters.fyr == 9))/quarters.shape[0]

In [None]:
quarters = quarters[(quarters.fyr == 12) | 
   (quarters.fyr == 3) | 
   (quarters.fyr == 6) | 
   (quarters.fyr == 9)][['gvkey','datadate','datacqtr','datafqtr','fyr']].reset_index(drop = True)


In [None]:
quarters.head()

Merge the quarter data into the change data, and make sure that the quarters that are used line up with the calendar quarters.

In [None]:
changesCal = changes[changes.gvkey.isin(quarters.gvkey.unique())]

changesCal = changesCal.merge(quarters)

print(changesCal.shape[0]/changes.shape[0])

In [None]:
changesCal.loc[~(changesCal.datacqtr.isna()), 'year'] = changesCal.datacqtr.str.slice(0,4)
changesCal.loc[~(changesCal.datacqtr.isna()), 'qtr']  = changesCal.datacqtr.str.slice(5,6)

changesCal['DATE'] = pd.to_datetime(changesCal['datadate'])

changesCal.loc[(changesCal.datacqtr.isna()), 'year'] = changesCal.DATE.dt.year
changesCal.loc[(changesCal.datacqtr.isna()), 'qtr']  = changesCal.DATE.dt.quarter

changesCal['year'] = changesCal.year.astype('int64')
changesCal['qtr']  = changesCal.qtr.astype('int64')

print(changesCal.shape,changesCal.head())

In [None]:
changesCal.to_csv("../../data/companyData/compustatChanges_withControls.csv")
changesCal.head()

In [None]:
changesCal = pd.read_csv("../../data/companyData/compustatChanges_withControls.csv")
changesCal.head()

# Compustat and ABI Linking

In [3]:
gvKey_abiLinkingTable = pd.read_csv('../../data/companyData/linkingTable.csv').drop(columns = ['Unnamed: 0'])

base_columns = gvKey_abiLinkingTable.columns 
customer_columns = "customer_" + base_columns
supplier_columns = "supplier_" + base_columns



hasMatch = gvKey_abiLinkingTable.gvkey.unique()

gvKey_abiLinkingTable.head()


Unnamed: 0,cstatCompanies,igCompanies,delete,gvkey,abi
0,asa gold and precious metals,asa gold precious metals,,1062,402180222
1,adams diversified equity fd,adams diversified equity fund,,1119,397759739
2,allen organ,allen organ,,1283,400700704
3,american physicians svc gp,american physicians svc,,1539,218548014
4,american science engineering,american science engineering,,1554,441435880


---------------------------------

# Get all change data together
Get the linking table and merge the abi labels into the change df. 

Then, merge the location data into the change data and get as complete a record of companies as possible given the HQ data.

In [19]:
gvKey_abiLinkingTable = pd.read_csv('../../data/companyData/linkingTable.csv').drop(columns = ['Unnamed: 0'])

changes = pd.read_csv("../../data/companyData/compustatChanges_withControls.csv").drop(columns = ['Unnamed: 0'])
print(changes.shape, changes.head())


changesABI = changes.merge(gvKey_abiLinkingTable, on ='gvkey').drop(columns = {'state','city'})
print(changesABI.shape, changesABI.head())

  exec(code_obj, self.user_global_ns, self.user_ns)


(645951, 41)    gvkey  datadate  year  qtr         companyName curcdq  assets  \
0   1010  20010331  2001    1  ACF INDUSTRIES INC    USD  3750.1   
1   1010  20010630  2001    2  ACF INDUSTRIES INC    USD  3701.7   
2   1010  20010930  2001    3  ACF INDUSTRIES INC    USD  3930.6   
3   1010  20011231  2001    4  ACF INDUSTRIES INC    USD  3723.1   
4   1010  20020331  2002    1  ACF INDUSTRIES INC    USD  3691.9   

   costGoodsSold  totalInv  netIncome  ...  roa_lagged  famafrench  \
0           56.3      53.9       24.8  ...    0.005412        26.0   
1           30.8      56.2        1.8  ...    0.021410        26.0   
2           37.0      49.8       42.2  ...         NaN        26.0   
3           30.0      42.4       76.9  ...   -0.011506        26.0   
4           26.1      40.6        7.3  ...    0.019720        26.0   

  earliestYear ageTercile sizeTercile profitTercile datacqtr  datafqtr fyr  \
0         1962          0         2.0           1.0   2001Q1    2001Q1  12   
1

Now merge in the hq information.

In [20]:
canadian = ['ON', 'AB','QC', 'BC', 'NS', 'NF', 'SK', 'MB', 'NB']
changes = changes[~(changes.state.isin(canadian)) & ~changes.state.isna()]

changes['addzip'] = changes.addzip.astype('str').str.slice(0,5)

changes.state.unique()

array(['MO', 'MN', 'NY', 'NJ', 'PA', 'TX', 'SC', 'AZ', 'UT', 'IL', 'MA',
       'WA', 'CT', 'FL', 'CA', 'MD', 'NC', 'AL', 'HI', 'OH', 'AR', 'GA',
       'CO', 'NV', 'KS', 'ID', 'WI', 'ME', 'WY', 'VA', 'OK', 'VT', 'DE',
       'IN', 'PR', 'IA', 'MI', 'LA', 'RI', 'NE', 'MT', 'SD', 'OR', 'DC',
       'WV', 'KY', 'TN', 'MS', 'ND', 'NH', 'NM', 'VI', 'AK', 'GU'],
      dtype=object)

In [21]:
changes.drop(columns = {'Unnamed: 0.1'}, inplace = True)

KeyError: "['Unnamed: 0.1'] not found in axis"

In [22]:
hq = pd.read_csv("../../data/ig_uniqueHQs_multLocations.csv").\
    drop(columns = {'Unnamed: 0'}).\
    rename(columns = {'archive_version_year': 'year'})

hq['year'] = hq.year.astype('int64')

igChanges = changesABI.merge(hq)
print(igChanges.shape, igChanges.head())


hq.head()

(241835, 54)    gvkey  datadate  year  qtr                  companyName curcdq   assets  \
0   1013  20091231  2009    4   ADC TELECOMMUNICATIONS INC    USD   1336.1   
1   1013  20100331  2010    1   ADC TELECOMMUNICATIONS INC    USD   1345.1   
2   1013  20100630  2010    2   ADC TELECOMMUNICATIONS INC    USD   1415.0   
3   1013  20100930  2010    3   ADC TELECOMMUNICATIONS INC    USD   1474.5   
4   1045  20030331  2003    1  AMERICAN AIRLINES GROUP INC    USD  29086.0   

   costGoodsSold  totalInv  netIncome  ...                     company  state  \
0          157.9     123.9      -11.2  ...  ADC TELECOMMUNICATIONS INC     MN   
1          158.5     122.8      -13.2  ...  ADC TELECOMMUNICATIONS INC     MN   
2          176.2     116.3       75.8  ...  ADC TELECOMMUNICATIONS INC     MN   
3          184.6     106.4       10.6  ...  ADC TELECOMMUNICATIONS INC     MN   
4         3943.0     618.0    -1043.0  ...                    AMR CORP     TX   

           city         address

Unnamed: 0,abi,ticker,company,year,state,city,address_line_1,zipcode,latitude,longitude,parent_employee_size_code,location_employee_size_code,employeesAtLocation
0,7609,SODI,SOLITRON DEVICES INC,2003,FL,WEST PALM BEACH,3301 ELECTRONICS WAY # C,33407,26.7412,-80.06694,,,
1,23077,,JENNY LEE BAKERY,2003,PA,MC KEES ROCKS,620 ISLAND AVE,15136,40.47235,-80.06152,50.0,50.0,1.0
2,76547,,MASTER PROTECTION CORP,2003,FL,FORT MYERS,12800 UNIVERSITY DR # 400,33907,26.55504,-81.88423,20.0,20.0,1.0
3,77743,,NATIONAL TECHNICAL SYSTEMS INC,2003,CA,CALABASAS,24007 VENTURA BLVD # 200,91302,34.15562,-118.65163,10.0,10.0,1.0
4,89151,,HILLTOP BASIC RESOURCES,2003,OH,CINCINNATI,1 W 4TH ST # 1100,45202,39.09982,-84.51297,20.0,20.0,1.0


In [23]:
igChanges.to_csv("../../data/companyData/igData.csv")

At this point, we have zip information in the following forms (from most to least examples):
    - changes: all compustat companies, from the compustat address system
    - igChanges: subset of compustat companies, from the ig merge
    - subset of compustat companies that have SC information and survived the ig merge
    
We could potentially look at the subset of compustat companies for which we have SC information, usign the compustat address system as well.

For now: follow similar trajectory as before but add in weather data for all cstat companies and all ig-merged companies.

First: pull all zips that are mentioned in changes and igChanges and use this to get the weather data.



In [24]:
changes = changes[(~changes.addzip.isna()) & (changes.addzip != 'nan')]
relevantZips = changes.addzip.astype('int64').append(igChanges.zipcode).unique()

changes.rename(columns = {'addzip': 'zipcode'}, inplace = True)
changes.drop(columns = {'cik',
     'datadate','costat', 'add1', 'add2', 'city', 'sic', 'state'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [25]:
len(relevantZips)

4538

In [None]:
# relevantZips = allCustomerData.zipcode.append(allSupplierData.zipcode).unique()
outfile =  '../../data/companyData/relevantZips.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(relevantZips, pickle_file)

------------------------------------------------

# Stocks

In [46]:
igChanges = pd.read_csv("../../data/companyData/igData.csv").\
    drop(columns = {'Unnamed: 0'})
igChanges.head()

Unnamed: 0,gvkey,datadate,year,qtr,companyName,curcdq,assets,costGoodsSold,totalInv,netIncome,...,company,state,city,address_line_1,zipcode,latitude,longitude,parent_employee_size_code,location_employee_size_code,employeesAtLocation
0,1013,20091231,2009,4,ADC TELECOMMUNICATIONS INC,USD,1336.1,157.9,123.9,-11.2,...,ADC TELECOMMUNICATIONS INC,MN,EDEN PRAIRIE,13625 TECHNOLOGY DR,55344,44.85645,-93.45199,250.0,250.0,1.0
1,1013,20100331,2010,1,ADC TELECOMMUNICATIONS INC,USD,1345.1,158.5,122.8,-13.2,...,ADC TELECOMMUNICATIONS INC,MN,EDEN PRAIRIE,13625 TECHNOLOGY DR,55344,44.85645,-93.45199,250.0,250.0,1.0
2,1013,20100630,2010,2,ADC TELECOMMUNICATIONS INC,USD,1415.0,176.2,116.3,75.8,...,ADC TELECOMMUNICATIONS INC,MN,EDEN PRAIRIE,13625 TECHNOLOGY DR,55344,44.85645,-93.45199,250.0,250.0,1.0
3,1013,20100930,2010,3,ADC TELECOMMUNICATIONS INC,USD,1474.5,184.6,106.4,10.6,...,ADC TELECOMMUNICATIONS INC,MN,EDEN PRAIRIE,13625 TECHNOLOGY DR,55344,44.85645,-93.45199,250.0,250.0,1.0
4,1045,20030331,2003,1,AMERICAN AIRLINES GROUP INC,USD,29086.0,3943.0,618.0,-1043.0,...,AMR CORP,TX,FORT WORTH,4333 AMON CARTER BLVD,76155,32.82563,-97.05057,1000.0,1000.0,1.0


In [47]:
with open('../../data/stockReturns.pkl', 'rb') as f:
    stocks = pkl.load(f)[['date','gvkey','RET']]

In [48]:
stocks.head()

Unnamed: 0,date,gvkey,RET
0,2000-01-03,1690.0,0.088754
1,2000-01-04,1690.0,-0.08431
2,2000-01-05,1690.0,0.014634
3,2000-01-06,1690.0,-0.086538
4,2000-01-07,1690.0,0.047368


In [6]:
sum(stocks.gvkey.isna())

2194269

In [49]:
stocks = stocks[stocks.date.dt.year > 2008]

stocks['qtr']  = stocks.date.dt.quarter
stocks['year'] = stocks.date.dt.year

stocks = stocks[~stocks.gvkey.isna()]
stocks['gvkey'] = stocks['gvkey'].astype(int)
stocks.shape

(7565684, 5)

In [50]:
companyControls = igChanges[['gvkey','year','qtr','ageTercile','sizeTercile','profitTercile','zipcode']]
companyControls.head()

Unnamed: 0,gvkey,year,qtr,ageTercile,sizeTercile,profitTercile,zipcode
0,1013,2009,4,0,2.0,0.0,55344
1,1013,2010,1,0,2.0,1.0,55344
2,1013,2010,2,0,2.0,1.0,55344
3,1013,2010,3,0,2.0,0.0,55344
4,1045,2003,1,0,2.0,1.0,76155


In [56]:
print(stocks.dtypes, companyControls.dtypes)

date     datetime64[ns]
gvkey             int64
RET             float64
qtr               int64
year              int64
dtype: object gvkey              int64
year               int64
qtr                int64
ageTercile         int64
sizeTercile      float64
profitTercile    float64
zipcode            int64
dtype: object


In [57]:
stocksWithControls = stocks.merge(companyControls)
print(stocksWithControls.shape,stocks.shape,companyControls.shape)
stocksWithControls.head()

(4966717, 9) (7565684, 5) (241835, 7)


Unnamed: 0,date,gvkey,RET,qtr,year,ageTercile,sizeTercile,profitTercile,zipcode
0,2009-01-02,1690,0.063269,1,2009,0,2.0,2.0,95014
1,2009-01-05,1690,0.042204,1,2009,0,2.0,2.0,95014
2,2009-01-06,1690,-0.016494,1,2009,0,2.0,2.0,95014
3,2009-01-07,1690,-0.021608,1,2009,0,2.0,2.0,95014
4,2009-01-08,1690,0.018569,1,2009,0,2.0,2.0,95014


In [10]:
del stocks
del companyControls
del igChanges
gc.collect()

20

In [58]:
allWeather = pd.read_csv("../../data/companyData/stockWeather_zipQuarterQuants.csv").\
    drop(columns = {'Unnamed: 0'})

allWeather = allWeather[~allWeather.temp_zipQuarterLast5.isna()].reset_index(drop = True)

allWeather['date'] = pd.to_datetime(allWeather['date'],
                                   format = "%Y-%m-%d")

allWeather.rename(columns = {'ZIP': 'zipcode'}, inplace = True)
print(allWeather.dtypes)
allWeather.head()

zipcode                            int64
date                      datetime64[ns]
temp_zipQuarterLast5             float64
precip_zipQuarterLast5           float64
dtype: object


Unnamed: 0,zipcode,date,temp_zipQuarterLast5,precip_zipQuarterLast5
0,1001,2009-01-05,0.0,0.0
1,1002,2009-01-05,0.0,0.0
2,1003,2009-01-05,0.0,0.0
3,1005,2009-01-05,0.0,1.0
4,1007,2009-01-05,0.0,0.0


In [59]:
stocksWithControlsWeather = stocksWithControls.merge(allWeather)
print(stocksWithControlsWeather.shape,allWeather.shape)

stocksWithControlsWeather.head()

(4791633, 11) (130295820, 4)


Unnamed: 0,date,gvkey,RET,qtr,year,ageTercile,sizeTercile,profitTercile,zipcode,temp_zipQuarterLast5,precip_zipQuarterLast5
0,2009-01-05,1690,0.042204,1,2009,0,2.0,2.0,95014,0.0,0.0
1,2009-01-05,15855,-0.033784,1,2009,0,2.0,1.0,95014,0.0,0.0
2,2009-01-05,140044,0.015152,1,2009,1,1.0,0.0,95014,0.0,0.0
3,2009-01-06,1690,-0.016494,1,2009,0,2.0,2.0,95014,0.0,0.0
4,2009-01-06,15855,0.025175,1,2009,0,2.0,1.0,95014,0.0,0.0


In [60]:
stocksWithControlsWeather.to_csv("../../data/companyData/stocksWithControlsWeather.csv")

In [63]:
sum(stocksWithControlsWeather.RET.isna())

0

In [62]:
stocksWithControlsWeather = pd.read_csv("../../data/companyData/stocksWithControlsWeather.csv").drop(columns = {'Unnamed: 0'})
stocksWithControlsWeather.head()

Unnamed: 0,date,gvkey,RET,qtr,year,ageTercile,sizeTercile,profitTercile,zipcode,temp_zipQuarterLast5,precip_zipQuarterLast5
0,2009-01-05,1690,0.042204,1,2009,0,2.0,2.0,95014,0.0,0.0
1,2009-01-05,15855,-0.033784,1,2009,0,2.0,1.0,95014,0.0,0.0
2,2009-01-05,140044,0.015152,1,2009,1,1.0,0.0,95014,0.0,0.0
3,2009-01-06,1690,-0.016494,1,2009,0,2.0,2.0,95014,0.0,0.0
4,2009-01-06,15855,0.025175,1,2009,0,2.0,1.0,95014,0.0,0.0


In [44]:
sum(stocksWithControlsWeather.gvkey.isna())

127625293

--------------------

# Weather Data

In [27]:
# allWeather = pd.read_csv("../../../../../../../Volumes/backup2/dissData/prism/allWeatherBins_2010.2019.csv").\
allWeather = pd.read_csv("../../data/companyData/revised_allWeatherBins_2009to2019_allZips.csv").\
    drop(columns = {"Unnamed: 0"})

allWeather['yearQtr'] = allWeather.year + (allWeather.qtr - 1)/4

In [28]:
allWeather.head()

Unnamed: 0,zipcode,year,qtr,precip_annualquant_0.95,precip_annualquant_1x5Qtrs,precip_annualquant_1x5Yrs,precip_zipquant_0.95,precip_zipquant_1x5Qtrs,precip_zipquant_1x5Yrs,precip_zipQuarterquant_0.95,...,temp_annualquant_0.95,temp_annualquant_1x5Qtrs,temp_annualquant_1x5Yrs,temp_zipquant_0.95,temp_zipquant_1x5Qtrs,temp_zipquant_1x5Yrs,temp_zipQuarterquant_0.95,temp_zipQuarterquant_1x5Qtrs,temp_zipQuarterquant_1x5Yrs,yearQtr
0,1001,2009,1,5.0,0.0,0.0,0.0,0.0,0.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9.0,4.0,0.0,2009.0
1,1001,2009,2,8.0,3.0,0.0,0.0,0.0,0.0,15.0,...,0.0,0.0,0.0,6.0,4.0,0.0,6.0,3.0,0.0,2009.25
2,1001,2009,3,10.0,4.0,0.0,2.0,2.0,0.0,13.0,...,0.0,0.0,0.0,17.0,9.0,0.0,6.0,1.0,0.0,2009.5
3,1001,2009,4,10.0,2.0,0.0,0.0,0.0,0.0,14.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,2009.75
4,1001,2010,1,10.0,6.0,0.0,2.0,2.0,0.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,14.0,7.0,0.0,2010.0


In [29]:
col = allWeather.pop("yearQtr")
allWeather.insert(0, col.name, col)

In [30]:
allWeather.head()

Unnamed: 0,yearQtr,zipcode,year,qtr,precip_annualquant_0.95,precip_annualquant_1x5Qtrs,precip_annualquant_1x5Yrs,precip_zipquant_0.95,precip_zipquant_1x5Qtrs,precip_zipquant_1x5Yrs,...,precip_zipQuarterquant_1x5Yrs,temp_annualquant_0.95,temp_annualquant_1x5Qtrs,temp_annualquant_1x5Yrs,temp_zipquant_0.95,temp_zipquant_1x5Qtrs,temp_zipquant_1x5Yrs,temp_zipQuarterquant_0.95,temp_zipQuarterquant_1x5Qtrs,temp_zipQuarterquant_1x5Yrs
0,2009.0,1001,2009,1,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,4.0,0.0
1,2009.25,1001,2009,2,8.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,4.0,0.0,6.0,3.0,0.0
2,2009.5,1001,2009,3,10.0,4.0,0.0,2.0,2.0,0.0,...,1.0,0.0,0.0,0.0,17.0,9.0,0.0,6.0,1.0,0.0
3,2009.75,1001,2009,4,10.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0
4,2010.0,1001,2010,1,10.0,6.0,0.0,2.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,7.0,0.0


In [31]:
lag1 = allWeather.copy()
lag1['yearQtr'] += 0.25
for colname in lag1.columns[4:]:
    lag1.rename(columns = {colname: 'lag1_' + colname}, inplace = True)
lag1.drop(columns = {'year','qtr'},inplace = True)

    
lag2 = allWeather.copy()
lag2['yearQtr'] += 0.5
for colname in lag2.columns[4:]:
    lag2.rename(columns = {colname: 'lag2_' + colname}, inplace = True)
lag2.drop(columns = {'year','qtr'},inplace = True)


lag3 = allWeather.copy()
lag3['yearQtr'] += 0.75
for colname in lag3.columns[4:]:
    lag3.rename(columns = {colname: 'lag3_' + colname}, inplace = True)
lag3.drop(columns = {'year','qtr'},inplace = True)


lag4 = allWeather.copy()
lag4['yearQtr'] += 1
for colname in lag4.columns[4:]:
    lag4.rename(columns = {colname: 'lag4_' + colname}, inplace = True)
lag4.drop(columns = {'year','qtr'},inplace = True)


print(allWeather.shape)

(1418016, 22)


In [32]:
allWeather_withLags = allWeather.merge(lag1).merge(lag2).merge(lag3).merge(lag4)

allWeather_withLags.year.value_counts()

2015    126740
2014    126498
2016    126100
2010    125655
2013    125141
2019    124962
2012    124891
2017    124878
2011    124874
2018    124686
Name: year, dtype: int64

In [33]:
allWeather_withLags.to_csv("../../data/companyData/allWeather_withLags_allZips.csv")

Now do the same for the industry-specific weather.

In [None]:
# allWeather = pd.read_csv("../../../../../../../Volumes/backup2/dissData/prism/allWeatherBins_2010.2019.csv").\
allWeather_byInd = pd.read_csv("../../data/companyData/revised_allWeatherBins_2009to2019_byInd.csv").\
    drop(columns = {"Unnamed: 0"})
'''[['famafrench','zipcode','yearQuarter', 
                                    'temp_ffquant_0.95','temp_indQuarterquant_0.95',
                                   'temp5Days_ffquant_0.95', 'temp5Days_indQuarterquant_0.95',
                                   'precip_ffquant_0.95', 'precip_indQuarterquant_0.95',
                                   'precip5Days_ffquant_0.95', 'precip5Days_indQuarterquant_0.95']]
'''
allWeather_byInd['year'] = allWeather_byInd.yearQuarter.str.slice(0,4).astype('int64')
allWeather_byInd['qtr']  = allWeather_byInd.yearQuarter.str.slice(5,6).astype('int64')
allWeather_byInd['yearQtr'] = allWeather_byInd.year + (allWeather_byInd.qtr - 1)/4

allWeather_byInd = allWeather_byInd.astype({'year':       'category',
                         'qtr':        'category',
                         'zipcode':    'category',
                         'famafrench': 'category'})

changes['zipcode'] = changes['zipcode'].astype({'zipcode': 'int64'})

changes = changes.astype({'year':       'category',
                          'qtr':        'category',
                          'zipcode':    'category',
                          'famafrench': 'category'})

col = allWeather_byInd.pop("year")
allWeather_byInd.insert(0, col.name, col)

col = allWeather_byInd.pop("qtr")
allWeather_byInd.insert(0, col.name, col)


col = allWeather_byInd.pop("yearQtr")
allWeather_byInd.insert(0, col.name, col)

allWeather_byInd.drop(columns = {'yearQuarter'}, inplace = True)

print(allWeather_byInd.head())

In [None]:
lag1 = allWeather_byInd.copy()
lag1['yearQtr'] += 0.25
for colname in lag1.columns[5:]:
    lag1.rename(columns = {colname: 'lag1_' + colname}, inplace = True)
lag1.drop(columns = {'year','qtr'},inplace = True)
lag1 = lag1.astype({'yearQtr':       'category'})

    
lag2 = allWeather_byInd.copy()
lag2['yearQtr'] += 0.5
for colname in lag2.columns[5:]:
    lag2.rename(columns = {colname: 'lag2_' + colname}, inplace = True)
lag2.drop(columns = {'year','qtr'},inplace = True)
lag2 = lag2.astype({'yearQtr':       'category'})


lag3 = allWeather_byInd.copy()
lag3['yearQtr'] += 0.75
for colname in lag3.columns[5:]:
    lag3.rename(columns = {colname: 'lag3_' + colname}, inplace = True)
lag3.drop(columns = {'year','qtr'},inplace = True)
lag3 = lag3.astype({'yearQtr':       'category'})


lag4 = allWeather_byInd.copy()
lag4['yearQtr'] += 1
for colname in lag4.columns[5:]:
    lag4.rename(columns = {colname: 'lag4_' + colname}, inplace = True)
lag4.drop(columns = {'year','qtr'},inplace = True)
lag4 = lag4.astype({'yearQtr':       'category'})


allWeather_byInd = allWeather_byInd.astype({'yearQtr':       'category'})


print(allWeather_byInd.shape)


allWeather_byInd.head()


'''allWeather_byInd_withLags = allWeather_byInd.merge(lag1).merge(lag2).merge(lag3).merge(lag4)

allWeather_byInd_withLags.year.value_counts()
'''

In [None]:
allWeather_byInd_withLags = allWeather_byInd.merge(lag1).merge(lag2).merge(lag3).merge(lag4)

In [None]:
allWeather_byInd_withLags.shape

In [None]:
allWeather_byInd_withLags.to_csv("../../data/companyData/allWeather_byInd_withLags.csv")

In [None]:
del allWeather_byInd
del lag1
del lag2
del lag3
del lag4
gc.collect()

# Locations
Create a separate definition of weather based not on HQ but on employee-weighted establishment footprint.

In [4]:
fractions = pd.read_csv('../../data/companyData/fractionEmployees_byEstablishment.csv').\
    drop(columns = {"Unnamed: 0", 'latitude','longitude'}).rename(columns = {'archive_version_year': 'year',
                                                    'parent_number': 'abi'})

fractions['year']    = fractions.year.astype('int64')
fractions['zipcode'] = fractions.zipcode.astype('int64')
fractions.head()

gvKey_abiLinkingTable = pd.read_csv('../../data/companyData/linkingTable.csv').drop(columns = ['Unnamed: 0'])

print(gvKey_abiLinkingTable.abi)

gvKey_abiLinkingTable.head()

fractions = fractions[['year','abi','zipcode','locationFracOfEmployees']].merge(gvKey_abiLinkingTable[['abi','gvkey']])

fractions = fractions.astype({'year':       'category',
                           'zipcode':    'category'})

fractions.head()

0       402180222
1       397759739
2       400700704
3       218548014
4       441435880
          ...    
8394    432764101
8395    416920671
8396    739118540
8397    227688843
8398    488766353
Name: abi, Length: 8399, dtype: int64


Unnamed: 0,year,abi,zipcode,locationFracOfEmployees,gvkey
0,2003,100537,24541,0.047704,3937
1,2003,100537,24540,0.572451,3937
2,2003,100537,27828,0.357782,3937
3,2003,100537,27804,0.017889,3937
4,2003,100537,27834,0.001789,3937


In [5]:
fractionsWithWeather = fractions.merge(allWeather_withLags) 
fractionsWithWeather.drop(columns = {'abi','zipcode'}, inplace = True)

print(fractionsWithWeather.shape)
fractionsWithWeather.head()

(29667503, 94)


Unnamed: 0,year,locationFracOfEmployees,gvkey,qtr,precip_annualquant_0.95,precip_annualquant_1x5Qtrs,precip_annualquant_1x5Yrs,precip_zipquant_0.95,precip_zipquant_1x5Qtrs,precip_zipquant_1x5Yrs,...,lag4_precip_zipQuarterquant_1x5Yrs,lag4_temp_annualquant_0.95,lag4_temp_annualquant_1x5Qtrs,lag4_temp_annualquant_1x5Yrs,lag4_temp_zipquant_0.95,lag4_temp_zipquant_1x5Qtrs,lag4_temp_zipquant_1x5Yrs,lag4_temp_zipQuarterquant_0.95,lag4_temp_zipQuarterquant_1x5Qtrs,lag4_temp_zipQuarterquant_1x5Yrs
0,2010,0.061576,3937,1,6.0,2.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,4.0,0.0
1,2010,0.061576,3937,2,5.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,9.0,2.0,0.0,7.0,2.0,0.0
2,2010,0.061576,3937,3,6.0,3.0,0.0,0.0,0.0,0.0,...,0.0,2.0,1.0,0.0,23.0,11.0,0.0,5.0,2.0,0.0
3,2010,0.061576,3937,4,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,2.0,0.0
4,2010,9e-05,2663,1,6.0,2.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,4.0,0.0


In [14]:
fractionsWithWeather[fractionsWithWeather.gvkey == 1004]

Unnamed: 0,year,locationFracOfEmployees,gvkey,qtr,precip_annualquant_0.95,precip_annualquant_1x5Qtrs,precip_annualquant_1x5Yrs,precip_zipquant_0.95,precip_zipquant_1x5Qtrs,precip_zipquant_1x5Yrs,...,lag4_precip_zipQuarterquant_1x5Yrs,lag4_temp_annualquant_0.95,lag4_temp_annualquant_1x5Qtrs,lag4_temp_annualquant_1x5Yrs,lag4_temp_zipquant_0.95,lag4_temp_zipquant_1x5Qtrs,lag4_temp_zipquant_1x5Yrs,lag4_temp_zipQuarterquant_0.95,lag4_temp_zipQuarterquant_1x5Qtrs,lag4_temp_zipQuarterquant_1x5Yrs
64544,2013,0.003831,1004,1,0.022983,0.003831,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.068949,0.045966,0.000000
64545,2013,0.003831,1004,2,0.042136,0.019153,0.000000,0.003831,0.003831,0.000000,...,0.003831,0.007661,0.000000,0.0,0.038305,0.022983,0.000000,0.038305,0.022983,0.007661
64546,2013,0.003831,1004,3,0.042136,0.022983,0.000000,0.007661,0.007661,0.000000,...,0.000000,0.007661,0.003831,0.0,0.141729,0.068949,0.000000,0.065119,0.015322,0.003831
64547,2013,0.003831,1004,4,0.026814,0.007661,0.000000,0.003831,0.003831,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.015322,0.011492,0.000000
67968,2014,0.038161,1004,1,0.305285,0.038161,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.228964,0.038161,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28206919,2013,0.011013,1004,4,0.011013,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.022025,0.011013,0.0,0.022025,0.000000,0.000000,0.088102,0.066076,0.000000
28686737,2010,0.000968,1004,1,0.011621,0.005811,0.000000,0.001937,0.000968,0.000000,...,0.001937,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.003874,0.002905,0.000000
28686738,2010,0.000968,1004,2,0.008716,0.001937,0.000968,0.000000,0.000000,0.000000,...,0.000968,0.003874,0.001937,0.0,0.014526,0.007747,0.001937,0.016463,0.012590,0.001937
28686739,2010,0.000968,1004,3,0.012590,0.006779,0.000000,0.000968,0.000968,0.000000,...,0.000000,0.000000,0.000000,0.0,0.012590,0.004842,0.000000,0.002905,0.000000,0.000000


In [7]:
del allWeather_withLags
del fractions
del gvKey_abiLinkingTable
gc.collect()

NameError: name 'allWeather_withLags' is not defined

In [10]:
for col in fractionsWithWeather.columns[4:]:
    fractionsWithWeather[col] = fractionsWithWeather[col] * fractionsWithWeather.locationFracOfEmployees

In [15]:
g = fractionsWithWeather.groupby(['gvkey','year','qtr']).sum().reset_index()
g.drop(columns = {'locationFracOfEmployees'}, inplace = True)

for colname in g.columns[3:]:
    g.rename(columns = {colname: 'empWt_' + colname}, inplace = True)

g.head()

Unnamed: 0,gvkey,year,qtr,empWt_precip_annualquant_0.95,empWt_precip_annualquant_1x5Qtrs,empWt_precip_annualquant_1x5Yrs,empWt_precip_zipquant_0.95,empWt_precip_zipquant_1x5Qtrs,empWt_precip_zipquant_1x5Yrs,empWt_precip_zipQuarterquant_0.95,...,empWt_lag4_precip_zipQuarterquant_1x5Yrs,empWt_lag4_temp_annualquant_0.95,empWt_lag4_temp_annualquant_1x5Qtrs,empWt_lag4_temp_annualquant_1x5Yrs,empWt_lag4_temp_zipquant_0.95,empWt_lag4_temp_zipquant_1x5Qtrs,empWt_lag4_temp_zipquant_1x5Yrs,empWt_lag4_temp_zipQuarterquant_0.95,empWt_lag4_temp_zipQuarterquant_1x5Qtrs,empWt_lag4_temp_zipQuarterquant_1x5Yrs
0,1004,2010,1,4.929305,1.937633,0.001549,0.417199,0.251211,0.063529,8.740461,...,0.522758,0.0,0.0,0.0,0.0,0.0,0.0,11.253147,5.650978,0.004842
1,1004,2010,2,9.171799,2.376719,0.352121,0.619407,0.548131,0.181484,12.80612,...,0.378268,2.718768,1.803603,0.0,8.629285,4.866938,0.032733,10.768545,6.249855,0.674027
2,1004,2010,3,9.110014,2.795468,0.008135,0.976177,0.457099,0.076312,12.741623,...,0.188069,4.817935,3.178772,0.0,16.749758,6.322293,0.0,4.360256,2.752663,0.0
3,1004,2010,4,3.486539,0.935309,0.1687,0.138873,0.076893,0.001937,4.639357,...,0.394151,0.0,0.0,0.0,0.894635,0.493318,0.0,5.435987,2.593066,0.333721
4,1004,2011,1,4.287566,1.164211,0.128462,0.339815,0.014928,0.0,6.902966,...,0.12473,0.0,0.0,0.0,0.0,0.0,0.0,4.816735,2.130623,0.0


In [16]:
g.to_csv("../../data/companyData/weatherByEstablishment.csv")

In [19]:
establishmentZips = fractions.zipcode.unique()
len(establishmentZips)

37200

## create the original weather with lags dataset

In [17]:
allWeather_withLags = pd.read_csv("../../data/companyData/allWeather_withLags.csv").\
    drop(columns = {"Unnamed: 0", 'yearQtr'})

averages = pd.read_csv("../../data/companyData/quarterlyStatsByZip.csv").\
    drop(columns = {"Unnamed: 0"}).rename(columns = {'ZIP': 'zipcode'})


averages['qtr'] = averages.quarter.str.slice(1,2).astype('float')
averages.drop(columns = {'quarter'}, inplace = True) 

print(len(averages.zipcode.unique()))

averages.head()

allWeather_withLags       = allWeather_withLags.astype({'year':       'category',
                           'qtr':        'category',
                           'zipcode':    'category'})
averages                  = averages.astype({'qtr':        'category',
                           'zipcode':    'category'})

allWeather_byInd_withLags = pd.read_csv("../../data/companyData/allWeather_byInd_withLags.csv").\
    drop(columns = {"Unnamed: 0", 'yearQtr'})
allWeather_byInd_withLags = allWeather_byInd_withLags.astype({'year':       'category',
                           'qtr':        'category',
                           'zipcode':    'category'})

4456


Create direct effects database. Merge weather to full cstat database.

In [33]:
allWeather_withLags.zipcode

0          1001
1          1001
2          1001
3          1001
4          1001
          ...  
167715    99403
167716    99403
167717    99403
167718    99403
167719    99403
Name: zipcode, Length: 167720, dtype: category
Categories (4193, int64): [1001, 1013, 1085, 1089, ..., 99352, 99354, 99362, 99403]

In [34]:
changes.zipcode = changes.zipcode.astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [36]:
# changes['zipcode']  = changes['zipcode'].astype('int64')
changesWithWeather = changes.merge(allWeather_withLags).merge(allWeather_byInd_withLags).merge(averages).merge(g)
print(changes.shape,changesWithWeather.shape)

(487582, 33) (129014, 829)


In [37]:
changesWithWeather.to_csv("../../data/companyData/cstatWithWeather.csv")

Merge weather to the ig-cstat database.

In [38]:
igChangesWithWeather = igChanges.merge(allWeather_withLags).merge(allWeather_byInd_withLags).merge(averages).merge(g)
igChangesWithWeather.shape

(125540, 850)

In [39]:
igChangesWithWeather.to_csv("../../data/companyData/igWithWeather.csv")

In [40]:
igChangesWithWeather.head()

Unnamed: 0,gvkey,datadate,year,qtr,companyName,curcdq,assets,costGoodsSold,totalInv,netIncome,...,empWt_lag4_precip_zipQuarterquant_1x5Yrs,empWt_lag4_temp_annualquant_0.95,empWt_lag4_temp_annualquant_1x5Qtrs,empWt_lag4_temp_annualquant_1x5Yrs,empWt_lag4_temp_zipquant_0.95,empWt_lag4_temp_zipquant_1x5Qtrs,empWt_lag4_temp_zipquant_1x5Yrs,empWt_lag4_temp_zipQuarterquant_0.95,empWt_lag4_temp_zipQuarterquant_1x5Qtrs,empWt_lag4_temp_zipQuarterquant_1x5Yrs
0,1013,20100331,2010,1,ADC TELECOMMUNICATIONS INC,USD,1345.1,158.5,122.8,-13.2,...,0.332755,0.0,0.0,0.0,0.0,0.0,0.0,10.198698,5.252495,0.0
1,21743,20100331,2010,1,NVE CORP,USD,57.463,2.219,1.706,3.598,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,5.0,0.0
2,4051,20100331,2010,1,WIND ENERGY AMERICA INC,USD,17.312,0.5,0.0,-0.53,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,5.0,0.0
3,6900,20100331,2010,1,MTS SYSTEMS CORP,USD,377.479,52.567,45.972,6.174,...,0.002935,0.0,0.0,0.0,0.0,0.0,0.0,8.925636,4.842466,0.0
4,28775,20100331,2010,1,XRS CORP,USD,69.79,7.68,2.488,-0.402,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.491124,5.0,0.0


In [None]:
igChangesWithWeather['temp_zipquant_0.95'].describe()

In [None]:
igChangesWithWeather['temp5Days_ffquant_0.95'].describe()

# Indirect
Introduce the SC Data.

In [None]:
# this does a little bit of a test on the reporting requirements. 
# number 

'''c_linksTest = pd.read_csv("../../data/companyData/compustatSCLinked.csv")[['srcdate','gvkey','cgvkey']]
c_linksTest['year'] = c_linksTest.srcdate.astype('str').str.slice(0,4).astype('int64')

bs = c_linksTest[c_linksTest.year < 2014]
print("Customers per supplier, 1978-2013 Pd: ", len(bs.cgvkey.unique())/len(bs.gvkey.unique()))

bs2 = c_linksTest[c_linksTest.year > 2010]
print("Customers per supplier, Recent Pd: ", len(bs2.cgvkey.unique())/len(bs2.gvkey.unique()))'''


In [None]:
c_links = pd.read_csv("../../data/companyData/compustatSCLinked.csv")

c_links['year'] = c_links.srcdate.astype('str').str.slice(0,4).astype('int64')

c_links = c_links[c_links.year > 1999][['year','gvkey','cgvkey','salecs']].\
    rename(columns = {'cgvkey': 'customer_gvkey','gvkey': 'supplier_gvkey'})


print(c_links.shape)

c_links.head()

industries.columns = ['customer_gvkey','customer_famafrench']

c_links = c_links.merge(industries)

industries.columns = ['supplier_gvkey','supplier_famafrench']

c_links = c_links.merge(industries)
print(c_links.head(), c_links.shape)


c_links.to_csv("../../data/companyData/c_links.csv")


sum(c_links.supplier_gvkey.isin(hasMatch) | c_links.customer_gvkey.isin(hasMatch))


Now see if it's common to have one in and one out of the industries of interest. 

For now, let's keep all the different industry types.

We can always filter later if we need to.

In [None]:
#########################
# merge in customer information
gvKey_abiLinkingTable.columns = customer_columns

print(c_links.shape)
c_linksMerge1 = c_links.merge(gvKey_abiLinkingTable, on ='customer_gvkey')
print(c_links.shape,c_linksMerge1.shape)



#########################
# and merge in supplier 
gvKey_abiLinkingTable.columns = supplier_columns

print(c_links.shape)
c_linksMerge2 = c_linksMerge1.merge(gvKey_abiLinkingTable, on ='supplier_gvkey')
print(c_links.shape,c_linksMerge2.shape)

c_linksMerge2.to_csv("../../data/companyData/clinks_IG_selected.csv")

This is probably because: (1) companies are not in North America, or (2) companies are not in the physical goods industries we're interested in. We can verify this though: look at c_links where both the customer and supplier are in the dataset of interest.

In [None]:
chq     = pd.read_csv("../../data/chq.csv",dtype={'cstatZipcode': 'object'}).drop(columns = {'Unnamed: 0'})

c_linkTest = c_links[c_links.customer_gvkey.isin(chq.gvkey.unique()) & \
                     c_links.supplier_gvkey.isin(chq.gvkey.unique())]

print("Percent of firms with a match: ", c_linksMerge2.shape[0]/c_linkTest.shape[0])

It's entirely possible that we have too small of a sample from the 2010s alone. Let's just try it though and see how it goes.

First, make a sample with the companies on three years of either side of when it reports another customer.

In [None]:
def makeOneEitherSide(df): 
    yrPlus1 = df.copy(); yrPlus1['year'] += 1
    # yrPlus2 = df.copy(); yrPlus2['year'] += 1
    # yrPlus3 = df.copy(); yrPlus3['year'] += 1
    
    yrMinus1 = df.copy(); yrMinus1['year'] -= 1
    # yrMinus2 = df.copy(); yrMinus2['year'] -= 1
    # yrMinus3 = df.copy(); yrMinus3['year'] -= 1
    
    all = pd.concat([yrPlus1,yrMinus1]) # pd.concat([yrPlus1,yrPlus2,yrPlus3,yrMinus1,yrMinus2,yrMinus3])
    
    return(all)

In [None]:
scTableCustomers = c_linksMerge2.copy()[['year','customer_gvkey','customer_abi']].drop_duplicates()
scTableSuppliers = c_linksMerge2.copy()[['year','supplier_gvkey','supplier_abi']].drop_duplicates()

allCustomerData = makeOneEitherSide(scTableCustomers)
allCustomerData.columns = ['year','gvkey','abi']


allSupplierData = makeOneEitherSide(scTableSuppliers)
allSupplierData.columns = ['year','gvkey','abi']

allAbi = allCustomerData.abi.append(allSupplierData.abi).drop_duplicates()

hqsOnly = pd.read_csv("../../data/ig_uniqueHQs.csv").drop(columns = {'Unnamed: 0'})

hq = pd.read_csv("../../data/ig_uniqueHQs_multLocations.csv").\
    drop(columns = {'Unnamed: 0'}).\
    rename(columns = {'archive_version_year': 'year'})

hq['year'] = hq.year.astype('int64')

hqRelevant = hq[hq.abi.isin(allAbi)]


allSupplierData = allSupplierData.merge(hqRelevant).drop_duplicates()
allCustomerData = allCustomerData.merge(hqRelevant).drop_duplicates()

print(allSupplierData.head())

allCustomerData.to_csv("../../data/companyData/allCustomerData.csv")
allSupplierData.to_csv("../../data/companyData/allSupplierData.csv")

## Find Customer and Supplier pairings and merge with change data
### Can pick up here

In [None]:
allSupplierData = pd.read_csv("../../data/companyData/allSupplierData.csv").drop(columns = ['Unnamed: 0'])
allCustomerData = pd.read_csv("../../data/companyData/allCustomerData.csv").drop(columns = ['Unnamed: 0'])

changes = pd.read_csv("../../data/companyData/compustatChanges_withControls.csv").drop(columns = ['Unnamed: 0'])
changes.head()
suppliers = changes.merge(allSupplierData[['year','gvkey','zipcode','employeesAtLocation']])
print(suppliers.shape)

customers = changes.merge(allCustomerData[['year','gvkey','zipcode','employeesAtLocation']])
print(customers.head())

print(allCustomerData.shape,allSupplierData.shape)

## Get first-hop SC data

In [None]:
c_links = pd.read_csv("../../data/companyData/clinks_IG_selected.csv").drop(columns = {'Unnamed: 0'})
c_links.head()

In [None]:
c_links['suppliers'] = 1
custExp = c_links[['year', 'customer_gvkey', 'salecs','suppliers']].groupby(['year','customer_gvkey']).sum().\
    reset_index().rename(columns = {'salecs': 'totalExp'})

custExp.head()



In [None]:
print("Number of firms with no exp information and multiple suppliers: ", \
          sum(custExp[custExp.totalExp == 0].suppliers > 1))
print("Number of firms with no exp information and >5 suppliers: ", \
          sum(custExp[custExp.totalExp == 0].suppliers > 5))


Most of these firms have expenditure information. We can look at:
    - Expenditure-weighted (just do equal shares if no exp information)
    - Largest supplier

In [None]:
customerDB = c_links[['year','customer_gvkey','supplier_gvkey','salecs']].\
    merge(custExp).rename(columns = {'customer_gvkey': 'gvkey'}).drop_duplicates()
print(customerDB.shape)

customerDB.head()

In [None]:
customersWithWeather = customers.merge(allWeather_withLags).merge(averages).merge(allWeather_byInd_withLags)
customersWithWeather.shape

suppliersWithWeather = suppliers.merge(allWeather_withLags).merge(averages).merge(allWeather_byInd_withLags)
suppliersWithWeather.shape

suppliersWithWeather.to_csv("../../data/companyData/suppliersWithWeather.csv")
customersWithWeather.to_csv("../../data/companyData/customersWithWeather.csv")

'''suppliersWithWeather = pd.read_csv("../../data/companyData/suppliersWithWeather.csv").drop(columns = {'Unnamed: 0'})
customersWithWeather = pd.read_csv("../../data/companyData/customersWithWeather.csv").drop(columns = {'Unnamed: 0'})'''

frames = [customersWithWeather, suppliersWithWeather]

allCompanies = pd.concat(frames).drop_duplicates()

print(allCompanies.shape)

allCompanies.to_csv("../../data/companyData/allCompaniesWithWeather.csv")

## Biggest Supplier
Focus on weather of biggest supplier.

First find the max by supplier. Add back in any rows with only 1 supplier.

In [None]:
customerDB.shape

In [None]:
# https://stackoverflow.com/questions/15705630/get-the-rows-which-have-the-max-value-in-groups-using-groupby
idx = customerDB.groupby(['year','gvkey']).salecs.\
    transform(max) == customerDB.salecs
largestSuppliers = customerDB[idx].reset_index(drop = True)
print(largestSuppliers.shape)

# find companies who only have one other supplier
singleSuppliers = customerDB[customerDB.suppliers == 1].reset_index(drop = True)
print(singleSuppliers.shape)

# find largest suppliers of different companies
largestSuppliers = largestSuppliers.append(singleSuppliers).drop_duplicates()
print(largestSuppliers.shape)



In [None]:
customerDB[['year','gvkey']][customerDB.year > 2009].drop_duplicates().shape

In [None]:
len(largestSuppliers.gvkey.unique())

In [None]:
for column in suppliersWithWeather.columns:
    print(column)

In [None]:
relevantVars = [x for x in suppliersWithWeather.columns if (('year' in x) | 
                                                 ('qtr' in x) |
                                                 ('gvkey' in x) |
                                                 ('_' in x)) & 
                                                ~('roa_lagged' in x) & 
                                                ~('yearQtr' in x)]

suppliers_toMerge = suppliersWithWeather[relevantVars]


for colname in suppliers_toMerge.columns[3:]:
    suppliers_toMerge.rename(columns = {colname: 'supplier_' + colname}, inplace = True)

    
suppliers_toMerge.rename(columns = {'gvkey': 'supplier_gvkey'},inplace = True)    

print(suppliers_toMerge.columns)


'''suppliers_toMerge = suppliersWithWeather[['year','qtr','gvkey','tmax_quant_1.0','precip_quant_1.0']].\
    rename(columns = {'gvkey': 'supplier_gvkey',
                      'tmax_quant_1.0': 'supplier_tmax_quant_1.0',
                      'precip_quant_1.0': 'supplier_precip_quant_1.0'})'''

In [None]:
customersWithWeather.shape

In [None]:
len(set(supplierWtdAvgWeather.gvkey.unique()) - set(suppliers_toMerge.gvkey.unique()))


In [None]:
largestSuppliers.head()

In [None]:
largestSuppliersWithWeather = customersWithWeather.merge(largestSuppliers[['year', 'gvkey', 'supplier_gvkey']]).merge(suppliers_toMerge)
largestSuppliersWithWeather.shape

In [None]:
largestSuppliersWithWeather.head()

In [None]:
largestSuppliersWithWeather.to_csv("../../data/companyData/largestSuppliersWithWeather.csv")


In [None]:
largestSuppliersWithWeather.suppliers

In [None]:
largestSuppliersWithWeather = pd.read_csv("../../data/companyData/largestSuppliersWithWeather.csv")
largestSuppliersWithWeather.head()

In [None]:
largestSuppliersWithWeather.columns[0:50]

## Sales-Weighted Average
If a company doesn't have sales-specific information, then assume equal shares. This doesn't happen for too many of the companies, thankfully.

In [None]:
customerDB = c_links[['year','customer_gvkey','supplier_gvkey','salecs']].\
    merge(custExp).rename(columns = {'customer_gvkey': 'gvkey'}).drop_duplicates()

customerDB['salesWeight'] = customerDB.salecs/customerDB.totalExp

customerDB.fillna(1, inplace = True)

Now merge this with the supplier weather data, and use the sales weights to find a sales-weighted average of the weather conditions for the suppliers.

In [None]:
relevantVars = [x for x in suppliersWithWeather.columns if (('year' in x) | 
                                                 ('qtr' in x) |
                                                 ('gvkey' in x) |
                                                 ('_' in x)) & 
                                                ~('roa_lagged' in x) & 
                                                ~('yearQtr' in x)]

suppliers_toMerge = suppliersWithWeather[relevantVars]


for colname in suppliers_toMerge.columns[3:]:
    suppliers_toMerge.rename(columns = {colname: 'supplier_' + colname}, inplace = True)

    
suppliers_toMerge.rename(columns = {'gvkey': 'supplier_gvkey'},inplace = True)    


In [None]:
suppliers_toMerge.head()

For each of the supplier weather columns, multiply the variable by the fraction of sales attributable to that relationship.

In [None]:
supplierWeather = customerDB[['year','gvkey','supplier_gvkey','salesWeight']].merge(suppliers_toMerge)

for col in supplierWeather.columns[5:]:
        supplierWeather[col]   = supplierWeather.salesWeight*supplierWeather[col]
        
        

supplierWeather.drop(columns = {'supplier_gvkey','salesWeight'}, inplace = True)


print(supplierWeather.head())



# [['year','qtr','gvkey','supplier_tmax_quant_1.0','supplier_precip_quant_1.0']]

In [None]:
supplierWtdAvgWeather = supplierWeather.groupby(['year','qtr','gvkey']).sum().reset_index().drop_duplicates()

In [None]:
supplierWtdAvgWeather.gvkey.unique()

Merge the supplier weighted average weather data with the customer data that has weather as well.

In [None]:
customersWithWeather.head()

In [None]:
wtdAvgSuppliers = customersWithWeather.merge(supplierWtdAvgWeather)

wtdAvgSuppliers.head()

In [None]:
wtdAvgSuppliers.shape

In [None]:
wtdAvgSuppliers.to_csv("../../data/companyData/wtdAvgSuppliers.csv")

In [None]:
wtdAvgSuppliers.head()

In [None]:
wtdAvgSuppliers.columns[wtdAvgSuppliers.columns.str.contains('Tercile')]