In [21]:
import pickle
import pandas as pd
import matplotlib
import os
import re

import scipy

import collections
import datetime
import time

import geopandas as gpd

import numpy as np
 
from difflib import get_close_matches

from fuzzywuzzy import process
from fuzzywuzzy import fuzz
from sklearn import linear_model

import statsmodels.api as sm
import statsmodels.formula.api as smf

from linearmodels import PanelOLS, FamaMacBeth
from scipy import stats

import itertools

import matplotlib.pyplot as plt

from numpy.linalg import matrix_rank


In [8]:
def makePlots(results, industries, filePrefix, yLim, numCol = 2, padding = 1, xdim = 20, ydim = 40):
    """Draw lagged-coefficient error-bar panels, one figure per outcome/weather pair.

    For every combination of ``results.outcomeVar`` and ``results.weatherVar``
    a grid of panels is created (one panel per entry of ``industries``), saved
    to ``filePrefix + outcome + weather + '.png'`` and shown.

    Parameters
    ----------
    results : pandas.DataFrame
        Needs the columns ``outcomeVar``, ``weatherVar``, ``industry``,
        ``lag0``..``lag4`` (plotted values) and ``bse0``..``bse4`` (error bars).
        Assumes one row per (outcome, weather, industry); if several rows
        match, only the first is plotted -- TODO confirm against the results file.
    industries : sequence
        Industry labels; each gets one panel, filled row by row.
    filePrefix : str
        Prefix of the saved PNG file names.
    yLim : float
        Half-width of the shared y-axis, which spans ``[-yLim, yLim]``.
    numCol : int
        Number of panel columns.
    padding : int
        Extra rows added to ``len(industries) // numCol``.
    xdim, ydim : float
        Figure width and height passed to ``figsize``.
    """
    lagCols = ['lag0', 'lag1', 'lag2', 'lag3', 'lag4']
    bseCols = ['bse0', 'bse1', 'bse2', 'bse3', 'bse4']
    lagIndex = np.arange(len(lagCols))

    # loop over outcome variables and weather definitions
    weatherVars = results.weatherVar.unique()
    outcomeVars = results.outcomeVar.unique()

    # Use the requested grid height, but never fewer rows than are needed to
    # place every industry (and never zero rows).
    rowsNeeded = -(-len(industries) // numCol)
    rowNum = max(len(industries) // numCol + padding, rowsNeeded, 1)

    # Five evenly spaced ticks that stay inside [-yLim, yLim]; ticks outside the
    # limits would make matplotlib widen the shared y-range.
    yTicks = np.linspace(-yLim, yLim, 5)

    for outcome in outcomeVars:
        for weather in weatherVars:
            # squeeze=False keeps `ax` two-dimensional even for a single row or column
            fig, ax = plt.subplots(rowNum, numCol, sharex='all', sharey='all',
                                   figsize=(xdim, ydim),
                                   constrained_layout=True,
                                   squeeze=False)

            fig.suptitle('Direct Effects: ' + outcome + ' ~ ' + weather, fontsize=36)

            for i, ind in enumerate(industries):
                panel = ax[i // numCol, i % numCol]

                rev = results[(results.outcomeVar == outcome) & (results.weatherVar == weather) &
                              (results.industry == ind)].reset_index()

                # Nothing to draw when this industry has no row for the pair;
                # the panel is still titled and formatted below.
                if not rev.empty:
                    y      = rev.loc[0, lagCols].to_numpy(dtype=float)
                    errors = rev.loc[0, bseCols].to_numpy(dtype=float)
                    panel.errorbar(lagIndex, y, yerr = errors, fmt = '.k')

                panel.xaxis.grid(False)
                panel.yaxis.grid(False)
                panel.axhline(y=0)
                panel.set_ylim([-yLim, yLim])

                panel.yaxis.set_ticks(yTicks)
                panel.xaxis.set_ticks(np.arange(0.0, 5.0, 1.0))

                panel.tick_params(axis='both', labelsize = 16)
                panel.set_title(ind, fontsize = 24)

            fig.savefig(filePrefix + outcome + weather + '.png')
            fig.show()


                # ax[rowIndex, colIndex].





# Plots
All direct effects, as a result of the number of days of extremes.

In [None]:
# Load the per-industry regression results and plot every outcome/weather pair.
results = pd.read_csv("../../allIndustryResults.csv").drop(columns = {'Unnamed: 0'})
industries = results.industry.unique()

# Plot layout settings
yLim   = 0.01
numCol = 3
padding = 1
xdim = 20
ydim = 40
filePrefix = 'dirEffects'

# Pass the layout settings explicitly; otherwise makePlots falls back to its
# own defaults and the values chosen above are ignored.
makePlots(results, industries, filePrefix, yLim,
          numCol = numCol, padding = padding, xdim = xdim, ydim = ydim)


## Grab Data

In [2]:
# Show the working directory, so the relative data paths used in this notebook can be checked.
os.getcwd()

'/Users/brianreed/Documents/supplyChain/extremes/extremesAnalysisCode'

In [44]:
# Load the firm-quarter panel and drop the index column left over from the CSV export.
goodsData = pd.read_csv("../../data/companyData/goodsData_igData.csv").drop(columns = {'Unnamed: 0'})

# The extreme-weather columns carry a '_0.95' suffix in the file. Rename them to
# '_Extreme', the names the model formulas in this notebook refer to. The
# mapping is generated instead of typed out so every variable gets the same
# contemporaneous + lag1..lag3 treatment.
lagPrefixes  = ['', 'lag1_', 'lag2_', 'lag3_']
extremeBases = ['precip_zipQuarterquant',
                'precip_annualquant',
                'precip5Days_zipQuarterquant',
                'temp_zipQuarterquant',
                'temp5Days_zipQuarterquant',
                'temp_annualquant']

extremeRenames = {lag + base + '_0.95': lag + base + '_Extreme'
                  for base in extremeBases
                  for lag in lagPrefixes}

# Employment-weighted temperature: here 'empWt_' comes before the lag prefix.
extremeRenames.update({'empWt_' + lag + 'temp_zipQuarterquant_0.95': 'empWt_' + lag + 'temp_zipQuarterquant_Extreme'
                       for lag in lagPrefixes})

goodsData = goodsData.rename(columns = extremeRenames)
print(goodsData.shape)

firms = goodsData['gvkey']

# goodsData = goodsData[~goodsData.lnRev.isna() & ~goodsData.lnCost.isna()] # & ~goodsData.lnCostNormd.isna()]


(60806, 881)


# Direct Effects
Look at the effects on the suppliers when they're affected directly.

## Complete Dataset
### At HQs

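The cells below are meant to give us the full, clustered standard errors. **TODO:** the fits call `.fit()` with no `cov_type` argument, which is statsmodels' default (non-robust) covariance; confirm whether clustering (e.g. `cov_type='cluster'`) still needs to be added.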

First, do the basics: days of extreme precipitation and (separately) extreme temperature, with 3 lags. We include a balance of time and industry-specific controls, fewer than are in the other regressions but generally allowing for a time trend, firm-specific trends, industry-seasonal trends, and profit, size, and age characteristics. We don't have time-specific trends across firms or industries but it's not clear that these would really change over the 10 years of the sample.



There are a couple of background facts that I'm relying on here: 
- the 1x year, 1x5 years, etc variables might be too rare to really pick up an effect.
- it's possible that lower tiers, or less extreme extremes, might matter too. may want to try to pick up a lower threshold as well. 
- the normalized variables (divided by lagged assets) seem to be more sensitive / responsive than just growth and just log-levels. this is likely because normalizing helps equalize for differences in the size of the firms in a way that neither logs nor growth rates do. 



there are a couple of things to remember with these results:
- the company size/age/profitability terciles don't make a lick of difference
- precipitation seems to matter, period, for cumulative number of days
- temperature might need a longer streak for the effect to happen



a few things come out more in the heterogeneity analyses:
- it seems like the local-relative extremes matter especially at the upper ends of the distributions. this is a little counterintuitive but i think the story is something like the following: we expect that places with higher average temperatures would have higher "95th percentile events", and places with lower average temperatures might have lower "95th percentile events" that might not actually be that extreme. 
- we would expect the heatBin:extremeTemp(Precip) measure to show an opposite result if the extreme definition is an absolute one and not a relative one (larger effect in places with lower normal temps (precip) // lower effect in places with higher normal temps (precip)) because it's closer to their baseline & closer to what they might expect.
- there's not much with the industry-specific results? it could be that the data are currently too diffuse or too small to really 



questions:
- are there other moments of distributions or other ways to measure shifts in extremes?
- how should i best approach the industry-specific regressions? - separate regressions or interaction terms?
- what mechanisms should i consider? bs consider the role of "input specificity", as judged by patents or r&d. ps consider a few different ones: materiality, defined by value of physical assets/value of total assets; industry specificity; and expectation. 
    - are there any "climate mechanisms" i can examine here, other than just expectations?
    - how can we adapt or incorporate the scc here?



things to push forward on:
- targeting specific industries: either with different lag tiers, or with 
- indirect regressions!
- stock regressions
- extreme convective storms
- counts in disclosures



things that are probably very relevant that i should keep experimenting with:
- measures of concentration: establishment weights, percent of firm w/in 10% (or honestly 70%+) of hq
- extreme temp as 90+, maybe some flood-relative measure of extreme rain?


First, total days of heat and rain.



**TODO:** at some point, we can add additional columns for other variables of interest to this as well: cost & profit, and maybe also stocks (if we do a quarter-before / quarter-after comparison).

In [12]:
# Preview the formula string assembled for the quarterly extreme-precipitation
# specification. `outcome` is set here so the cell also runs on a fresh kernel;
# 'lnRevNormd' is the value shown in the recorded output of this cell.
outcome = 'lnRevNormd'
outcome + ' ~ precip_zipQuarterquant_Extreme + lag1_precip_zipQuarterquant_Extreme + lag2_precip_zipQuarterquant_Extreme + lag3_precip_zipQuarterquant_Extreme + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)'

'lnRevNormd ~ precip_zipQuarterquant_Extreme + lag1_precip_zipQuarterquant_Extreme + lag2_precip_zipQuarterquant_Extreme + lag3_precip_zipQuarterquant_Extreme + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)'

In [13]:
# Scratch check: start an empty list, add one element, and display it.
resultList = list()
resultList += ['a']
resultList

['a']

In [14]:
# Candidate weather specifications. Each entry is the right-hand-side fragment
# (contemporaneous term plus its lags) of one regression formula.
weatherVars = [
    'precip_zipQuarterquant_Extreme + lag1_precip_zipQuarterquant_Extreme + lag2_precip_zipQuarterquant_Extreme + lag3_precip_zipQuarterquant_Extreme',
    # 'temp_zipQuarterquant_Extreme + lag1_temp_zipQuarterquant_Extreme + lag2_temp_zipQuarterquant_Extreme + lag3_temp_zipQuarterquant_Extreme',
    'precip5Days_zipQuarterquant_Extreme + lag1_precip5Days_zipQuarterquant_Extreme + lag2_precip5Days_zipQuarterquant_Extreme + lag3_precip5Days_zipQuarterquant_Extreme',
    'temp5Days_zipQuarterquant_Extreme + lag1_temp5Days_zipQuarterquant_Extreme + lag2_temp5Days_zipQuarterquant_Extreme + lag3_temp5Days_zipQuarterquant_Extreme',
    'days90Plus + lag1_days90Plus + lag2_days90Plus + lag3_days90Plus',
    'streak90Plus + lag1_streak90Plus + lag2_streak90Plus + lag3_streak90Plus',
    'temp_zipWeek95_99 + lag1_temp_zipWeek95_99 + lag2_temp_zipWeek95_99 ',
    'temp_zipMonth95_99 + lag1_temp_zipMonth95_99 + lag2_temp_zipMonth95_99 ',
    'temp_zipQuarter95_99 + lag1_temp_zipQuarter95_99 + lag2_temp_zipQuarter95_99 ',
]

# Print the four-character prefix of every specification (the same slice the
# regression loop uses to pick out the weather coefficients).
for var in weatherVars:
    print(var[:4])

prec
temp
prec
temp
days
stre
temp
temp
temp


In [17]:
start = time.time()
# Outcomes to regress on every weather specification:
# lnRevNormd lnCostNormd lnStockClose lnOpIncNormd lnNetIncNormd
outcomes = ['lnRevNormd','lnCostNormd','lnStockClose','lnOpIncNormd','lnNetIncNormd']

# Right-hand-side weather fragments, one regression family per entry.
weatherVars = [
              'precip_zipQuarterquant_Extreme + lag1_precip_zipQuarterquant_Extreme + lag2_precip_zipQuarterquant_Extreme + lag3_precip_zipQuarterquant_Extreme',
              'temp_zipQuarterquant_Extreme + lag1_temp_zipQuarterquant_Extreme + lag2_temp_zipQuarterquant_Extreme + lag3_temp_zipQuarterquant_Extreme',
              # 'precip5Days_zipQuarterquant_Extreme + lag1_precip5Days_zipQuarterquant_Extreme + lag2_precip5Days_zipQuarterquant_Extreme + lag3_precip5Days_zipQuarterquant_Extreme',
              'temp5Days_zipQuarterquant_Extreme + lag1_temp5Days_zipQuarterquant_Extreme + lag2_temp5Days_zipQuarterquant_Extreme + lag3_temp5Days_zipQuarterquant_Extreme',
              'days90Plus + lag1_days90Plus + lag2_days90Plus + lag3_days90Plus',
              'streak90Plus + lag1_streak90Plus + lag2_streak90Plus + lag3_streak90Plus',
              'temp_zipWeek95_99 + lag1_temp_zipWeek95_99 + lag2_temp_zipWeek95_99 ', 
              'temp_zipMonth95_99 + lag1_temp_zipMonth95_99 + lag2_temp_zipMonth95_99 ',
              'temp_zipQuarter95_99 + lag1_temp_zipQuarter95_99 + lag2_temp_zipQuarter95_99 ']

# Controls appended to every formula.
controls = ' + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)'

# One condensed table per weather specification.
resultList = []

for weatherVar in weatherVars:
    print("************************************")
    # Exact names of the weather regressors in this specification; matching on
    # these avoids picking up any other term that merely shares a prefix.
    weatherTerms = [term.strip() for term in weatherVar.split('+')]
    results = pd.DataFrame()

    for outcome in outcomes:    
        print(outcome + "~" + weatherVar)
        mod = smf.ols(formula = outcome + ' ~ ' + weatherVar + controls, data = goodsData).fit()

        print(time.time() - start) 

        # convert this into a much more condensed version
        coeffs = pd.DataFrame(mod.params,   columns = ['coeffs'])
        pvalues = pd.DataFrame(mod.pvalues, columns = ['pvals'])

        coeffs = coeffs[coeffs.index.isin(weatherTerms)]
        pvalues = pvalues[pvalues.index.isin(weatherTerms)]

        resultsTemp = pd.concat([coeffs,pvalues],axis = 1)

        # Marker row recording which outcome this coefficient/p-value pair belongs to.
        resultsTemp.loc['upperVariable'] = ['^' + outcome, '*********'] 

        results = pd.concat([results,resultsTemp], axis = 1)
        print(resultsTemp)

    # Store the finished table once per specification, after all outcomes have
    # been added (appending inside the outcome loop stored partial tables).
    resultList.append(results)


************************************
lnRevNormd~precip_zipQuarterquant_Extreme + lag1_precip_zipQuarterquant_Extreme + lag2_precip_zipQuarterquant_Extreme + lag3_precip_zipQuarterquant_Extreme


KeyboardInterrupt: 

Try to get the variance-covariance matrix, from https://www.statsmodels.org/dev/generated/statsmodels.regression.linear_model.RegressionResults.cov_params.html . We can use this in the calculation of MEs.

In [6]:
# Variance-covariance matrix of the fitted coefficients.
# NOTE(review): `precipMod` is only assigned in a later cell of this notebook, so this
# cell fails on a fresh top-to-bottom run -- TODO move it below that fit.
precipMod.cov_params()

Unnamed: 0,Intercept,C(yearQtr)[T.2010_2],C(yearQtr)[T.2010_3],C(yearQtr)[T.2010_4],C(yearQtr)[T.2011_1],C(yearQtr)[T.2011_2],C(yearQtr)[T.2011_3],C(yearQtr)[T.2011_4],C(yearQtr)[T.2012_1],C(yearQtr)[T.2012_2],...,C(indGroup)[T.construction]:C(qtr)[4],C(indGroup)[T.manu]:C(qtr)[4],C(indGroup)[T.mining]:C(qtr)[4],C(indGroup)[T.retail]:C(qtr)[4],C(indGroup)[T.transportUtilities]:C(qtr)[4],C(indGroup)[T.wholesale]:C(qtr)[4],precip_zipQuarterquant_Extreme,lag1_precip_zipQuarterquant_Extreme,lag2_precip_zipQuarterquant_Extreme,lag3_precip_zipQuarterquant_Extreme
Intercept,0.012618,-5.066770e-04,-5.059835e-04,-5.307789e-04,-1.862569e-04,-5.173577e-04,-5.162481e-04,-5.278845e-04,-1.842230e-04,-5.263955e-04,...,4.646046e-04,1.181127e-02,2.348291e-03,1.805044e-03,-1.683989e-04,-9.493848e-05,-4.486681e-06,-4.900487e-06,-3.380797e-06,-4.496482e-06
C(yearQtr)[T.2010_2],-0.000507,2.609797e-04,5.415909e-05,5.150022e-05,1.105843e-05,8.109296e-05,5.010037e-05,5.011768e-05,1.060146e-05,7.834038e-05,...,-2.170859e-06,-8.154608e-05,-6.664412e-06,-5.832524e-06,-7.935331e-06,-2.157333e-06,1.354528e-07,-3.905316e-07,-7.656596e-07,-8.467290e-08
C(yearQtr)[T.2010_3],-0.000506,5.415909e-05,2.653527e-04,5.308136e-05,1.346909e-05,4.999884e-05,8.259774e-05,5.013469e-05,1.327897e-05,4.992683e-05,...,-8.616607e-06,-1.461802e-05,-1.291058e-05,-1.086682e-05,-1.470086e-05,-9.562807e-06,2.711889e-07,2.041605e-07,-4.917503e-07,-6.463233e-07
C(yearQtr)[T.2010_4],-0.000531,5.150022e-05,5.308136e-05,2.685097e-04,1.659786e-05,5.135005e-05,5.103400e-05,8.470368e-05,1.428346e-05,4.965515e-05,...,-5.540994e-04,-5.678127e-04,-5.617789e-04,-5.604827e-04,-5.646755e-04,-5.578673e-04,4.244607e-08,3.655686e-07,1.059626e-07,-3.957670e-07
C(yearQtr)[T.2011_1],-0.000186,1.105843e-05,1.346909e-05,1.659786e-05,3.724487e-04,1.808439e-05,1.714777e-05,1.658015e-05,1.831311e-04,1.896139e-05,...,-1.979709e-05,6.547982e-05,-1.511971e-05,-1.649240e-05,-1.893390e-05,-1.856214e-05,5.907745e-07,8.136097e-07,4.473467e-07,2.570782e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C(indGroup)[T.wholesale]:C(qtr)[4],-0.000095,-2.157333e-06,-9.562807e-06,-5.578673e-04,-1.856214e-05,-1.657862e-05,-1.232148e-05,-5.578278e-04,-1.842178e-05,-1.772429e-05,...,6.983696e-03,-1.147559e-02,7.402389e-03,7.364395e-03,7.094136e-03,7.665537e-03,7.184952e-08,-9.266947e-07,1.351730e-07,-3.613935e-07
precip_zipQuarterquant_Extreme,-0.000004,1.354528e-07,2.711889e-07,4.244607e-08,5.907745e-07,-9.293240e-08,-3.250213e-08,-6.736319e-08,1.278679e-06,7.570721e-07,...,8.813192e-08,3.269819e-07,5.576542e-07,1.393594e-07,9.194602e-08,7.184952e-08,4.085743e-07,2.196407e-08,2.362352e-08,2.073362e-08
lag1_precip_zipQuarterquant_Extreme,-0.000005,-3.905316e-07,2.041605e-07,3.655686e-07,8.136097e-07,1.688363e-07,-7.094148e-08,3.324842e-08,7.118119e-07,9.267356e-07,...,-8.132716e-07,-1.552513e-06,-9.484866e-07,-6.801475e-07,-1.000108e-06,-9.266947e-07,2.196407e-08,4.102119e-07,1.606490e-08,2.352372e-08
lag2_precip_zipQuarterquant_Extreme,-0.000003,-7.656596e-07,-4.917503e-07,1.059626e-07,4.473467e-07,-1.912693e-09,1.002161e-08,-2.025553e-07,1.470234e-07,-4.558699e-08,...,3.095971e-07,-1.344337e-06,5.563734e-07,3.211890e-07,2.553767e-07,1.351730e-07,2.362352e-08,1.606490e-08,4.085049e-07,1.134900e-08


In [45]:
start = time.time()

# OLS of lnRevNormd on the quarterly extreme-precipitation count and its three
# lags, with the industry-by-quarter, year-quarter, firm and tercile terms.
precipMod = smf.ols(
    formula = 'lnRevNormd ~ precip_zipQuarterquant_Extreme + lag1_precip_zipQuarterquant_Extreme + lag2_precip_zipQuarterquant_Extreme + lag3_precip_zipQuarterquant_Extreme + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
).fit()

# elapsed seconds for the fit
print(time.time() - start)

# Condense the fit: keep only the terms whose name contains 'precip' and put
# their coefficients and p-values side by side.
keepRows = precipMod.params.index.str.contains('precip')
coeffs   = precipMod.params[keepRows].to_frame('coeffs')
pvalues  = precipMod.pvalues[keepRows].to_frame('pvals')

results = pd.concat([coeffs, pvalues], axis = 1)

print(results)


122.09479403495789
                                       coeffs     pvals
precip_zipQuarterquant_Extreme      -0.001576  0.002861
lag1_precip_zipQuarterquant_Extreme -0.001515  0.004222
lag2_precip_zipQuarterquant_Extreme -0.001532  0.003707
lag3_precip_zipQuarterquant_Extreme -0.002028  0.000126


In [26]:
start = time.time()

# OLS of lnRevNormd on the quarterly extreme-temperature count and its three lags.
# The age/profit/size tercile controls are left out of this specification:
# + C(ageTercile) + C(profitTercile) + C(sizeTercile)
tempMod = smf.ols(
    formula = 'lnRevNormd ~ temp_zipQuarterquant_Extreme + lag1_temp_zipQuarterquant_Extreme + lag2_temp_zipQuarterquant_Extreme + lag3_temp_zipQuarterquant_Extreme + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) ',
    data = goodsData,
).fit()

# elapsed seconds for the fit
print(time.time() - start)

# Condense the fit: keep only the terms whose name contains 'temp' and put
# their coefficients and p-values side by side.
keepRows = tempMod.params.index.str.contains('temp')
coeffs   = tempMod.params[keepRows].to_frame('coeffs')
pvalues  = tempMod.pvalues[keepRows].to_frame('pvals')

results = pd.concat([coeffs, pvalues], axis = 1)

print(results)


263.60220217704773
                                     coeffs     pvals
temp_zipQuarterquant_Extreme      -0.000006  0.915190
lag1_temp_zipQuarterquant_Extreme -0.000007  0.902910
lag2_temp_zipQuarterquant_Extreme  0.000034  0.542114
lag3_temp_zipQuarterquant_Extreme  0.000136  0.014040


These results are influenced by the particular transformation. if we do 1 + the ratio, we have a particular problem with the second period here.

Look at "sustained" heat and rain. We can look at incidence of a heatwave or sustained temperatures above a given amount.

In [19]:
start = time.time()

# OLS of lnRevNormd on the 5-day extreme-precipitation measure and its three lags.
# NOTE(review): the recorded run of this cell ended in
# `LinAlgError: SVD did not converge`; presumably non-finite values reach the
# design matrix -- TODO confirm before relying on this fit.
precip5DaysMod = smf.ols(
    formula = 'lnRevNormd ~ precip5Days_zipQuarterquant_Extreme + lag1_precip5Days_zipQuarterquant_Extreme + lag2_precip5Days_zipQuarterquant_Extreme + lag3_precip5Days_zipQuarterquant_Extreme + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
).fit()

# elapsed seconds for the fit
print(time.time() - start)

# Condense the fit: keep only the terms whose name contains 'precip' and put
# their coefficients and p-values side by side.
keepRows = precip5DaysMod.params.index.str.contains('precip')
coeffs   = precip5DaysMod.params[keepRows].to_frame('coeffs')
pvalues  = precip5DaysMod.pvalues[keepRows].to_frame('pvals')

results = pd.concat([coeffs, pvalues], axis = 1)

print(results)


LinAlgError: SVD did not converge

In [11]:
start = time.time()

# OLS of lnRevNormd on the 5-day extreme-temperature measure and its three lags.
temp5DaysMod = smf.ols(
    formula = 'lnRevNormd ~ temp5Days_zipQuarterquant_Extreme + lag1_temp5Days_zipQuarterquant_Extreme + lag2_temp5Days_zipQuarterquant_Extreme + lag3_temp5Days_zipQuarterquant_Extreme + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
).fit()

# elapsed seconds for the fit
print(time.time() - start)

# Condense the fit: keep only the terms whose name contains 'temp' and put
# their coefficients and p-values side by side.
keepRows = temp5DaysMod.params.index.str.contains('temp')
coeffs   = temp5DaysMod.params[keepRows].to_frame('coeffs')
pvalues  = temp5DaysMod.pvalues[keepRows].to_frame('pvals')

results = pd.concat([coeffs, pvalues], axis = 1)

print(results)


105.95902276039124
                                          coeffs     pvals
temp5Days_zipQuarterquant_Extreme      -0.001796  0.101108
lag1_temp5Days_zipQuarterquant_Extreme -0.002152  0.050241
lag2_temp5Days_zipQuarterquant_Extreme -0.003043  0.006116
lag3_temp5Days_zipQuarterquant_Extreme -0.000803  0.468141


## Breakouts by tercile

See how the effect varies in places that are background hot // background wet.

Sort of inspired by the BS2016 tercile approach, we divide each place into terciles. I THINK (double check this) that this is based on annual average temperature and precipitation. 



In [18]:
start = time.time()

# Same quarterly extreme-precipitation specification, with every weather term
# interacted with C(precipTercile).
precipModTercile = smf.ols(
    formula = 'lnRevNormd ~ C(precipTercile)*(precip_zipQuarterquant_Extreme + lag1_precip_zipQuarterquant_Extreme + lag2_precip_zipQuarterquant_Extreme + lag3_precip_zipQuarterquant_Extreme) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
).fit()

# elapsed seconds for the fit
print(time.time() - start)

# Condense the fit: keep the terms whose name contains 'precip' (this also
# keeps the C(precipTercile) main effects) with coefficients and p-values.
keepRows = precipModTercile.params.index.str.contains('precip')
coeffs   = precipModTercile.params[keepRows].to_frame('coeffs')
pvalues  = precipModTercile.pvalues[keepRows].to_frame('pvals')

results = pd.concat([coeffs, pvalues], axis = 1)

print(results)

105.90068006515503
                                                      coeffs     pvals
C(precipTercile)[T.2]                               0.049555  0.057615
C(precipTercile)[T.3]                               0.074487  0.006760
precip_zipQuarterquant_Extreme                     -0.001436  0.136472
C(precipTercile)[T.2]:precip_zipQuarterquant_Ex...  0.000104  0.944961
C(precipTercile)[T.3]:precip_zipQuarterquant_Ex... -0.001781  0.230543
lag1_precip_zipQuarterquant_Extreme                 0.000706  0.464992
C(precipTercile)[T.2]:lag1_precip_zipQuarterqua... -0.003525  0.017027
C(precipTercile)[T.3]:lag1_precip_zipQuarterqua... -0.003745  0.012471
lag2_precip_zipQuarterquant_Extreme                -0.000563  0.562460
C(precipTercile)[T.2]:lag2_precip_zipQuarterqua...  0.000806  0.576831
C(precipTercile)[T.3]:lag2_precip_zipQuarterqua... -0.001887  0.218126
lag3_precip_zipQuarterquant_Extreme                -0.000554  0.563287
C(precipTercile)[T.2]:lag3_precip_zipQuarterqua... -0.0022

In [19]:
start = time.time()

# Quarterly extreme-temperature specification, with every weather term
# interacted with C(tempTercile).
tempModTercile = smf.ols(
    formula = 'lnRevNormd ~ C(tempTercile)*(temp_zipQuarterquant_Extreme + lag1_temp_zipQuarterquant_Extreme + lag2_temp_zipQuarterquant_Extreme + lag3_temp_zipQuarterquant_Extreme) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
).fit()

# elapsed seconds for the fit
print(time.time() - start)

# Condense the fit: keep the terms whose name contains 'temp' (this also
# keeps the C(tempTercile) main effects) with coefficients and p-values.
keepRows = tempModTercile.params.index.str.contains('temp')
coeffs   = tempModTercile.params[keepRows].to_frame('coeffs')
pvalues  = tempModTercile.pvalues[keepRows].to_frame('pvals')

results = pd.concat([coeffs, pvalues], axis = 1)

print(results)

110.59080696105957
                                                      coeffs     pvals
C(tempTercile)[T.2]                                -0.037440  0.061516
C(tempTercile)[T.3]                                -0.011779  0.599312
temp_zipQuarterquant_Extreme                       -0.000425  0.638552
C(tempTercile)[T.2]:temp_zipQuarterquant_Extreme    0.001445  0.217247
C(tempTercile)[T.3]:temp_zipQuarterquant_Extreme    0.000868  0.420447
lag1_temp_zipQuarterquant_Extreme                   0.001015  0.241426
C(tempTercile)[T.2]:lag1_temp_zipQuarterquant_E... -0.000301  0.769986
C(tempTercile)[T.3]:lag1_temp_zipQuarterquant_E... -0.001074  0.340839
lag2_temp_zipQuarterquant_Extreme                   0.000820  0.293456
C(tempTercile)[T.2]:lag2_temp_zipQuarterquant_E... -0.000932  0.350436
C(tempTercile)[T.3]:lag2_temp_zipQuarterquant_E...  0.000232  0.835740
lag3_temp_zipQuarterquant_Extreme                  -0.000423  0.628594
C(tempTercile)[T.2]:lag3_temp_zipQuarterquant_E...  0.0032

Now try the sustained effects.

In [14]:
start = time.time()

# 5-day extreme-precipitation specification, with every weather term
# interacted with C(precipTercile).
precip5DaysModTercile = smf.ols(
    formula = 'lnRevNormd ~ C(precipTercile)*(precip5Days_zipQuarterquant_Extreme + lag1_precip5Days_zipQuarterquant_Extreme + lag2_precip5Days_zipQuarterquant_Extreme + lag3_precip5Days_zipQuarterquant_Extreme) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
).fit()

# elapsed seconds for the fit
print(time.time() - start)

# Condense the fit: keep the terms whose name contains 'precip' (this also
# keeps the C(precipTercile) main effects) with coefficients and p-values.
keepRows = precip5DaysModTercile.params.index.str.contains('precip')
coeffs   = precip5DaysModTercile.params[keepRows].to_frame('coeffs')
pvalues  = precip5DaysModTercile.pvalues[keepRows].to_frame('pvals')

results = pd.concat([coeffs, pvalues], axis = 1)

print(results)

105.98802304267883
                                                      coeffs     pvals
C(precipTercile)[T.2]                               0.009927  0.549393
C(precipTercile)[T.3]                              -0.004962  0.777395
precip5Days_zipQuarterquant_Extreme                -0.000844  0.449366
C(precipTercile)[T.2]:precip5Days_zipQuarterqua... -0.001808  0.187008
C(precipTercile)[T.3]:precip5Days_zipQuarterqua...  0.000621  0.648737
lag1_precip5Days_zipQuarterquant_Extreme           -0.002311  0.046708
C(precipTercile)[T.2]:lag1_precip5Days_zipQuart...  0.001632  0.273114
C(precipTercile)[T.3]:lag1_precip5Days_zipQuart...  0.000793  0.591631
lag2_precip5Days_zipQuarterquant_Extreme           -0.001094  0.353321
C(precipTercile)[T.2]:lag2_precip5Days_zipQuart...  0.001613  0.276528
C(precipTercile)[T.3]:lag2_precip5Days_zipQuart... -0.000201  0.893416
lag3_precip5Days_zipQuarterquant_Extreme           -0.000961  0.390169
C(precipTercile)[T.2]:lag3_precip5Days_zipQuart... -0.0019

In [15]:
start = time.time()

# 5-day extreme-temperature specification, with every weather term
# interacted with C(tempTercile).
temp5DaysModTercile = smf.ols(
    formula = 'lnRevNormd ~ C(tempTercile)*(temp5Days_zipQuarterquant_Extreme + lag1_temp5Days_zipQuarterquant_Extreme + lag2_temp5Days_zipQuarterquant_Extreme + lag3_temp5Days_zipQuarterquant_Extreme) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
).fit()

# elapsed seconds for the fit
print(time.time() - start)

# Condense the fit: keep the terms whose name contains 'temp' (this also
# keeps the C(tempTercile) main effects) with coefficients and p-values.
keepRows = temp5DaysModTercile.params.index.str.contains('temp')
coeffs   = temp5DaysModTercile.params[keepRows].to_frame('coeffs')
pvalues  = temp5DaysModTercile.pvalues[keepRows].to_frame('pvals')

results = pd.concat([coeffs, pvalues], axis = 1)

print(results)


103.5402500629425
                                                      coeffs     pvals
C(tempTercile)[T.2]                                -0.037943  0.004561
C(tempTercile)[T.3]                                -0.049328  0.013328
temp5Days_zipQuarterquant_Extreme                  -0.002586  0.045335
C(tempTercile)[T.2]:temp5Days_zipQuarterquant_E...  0.001072  0.273194
C(tempTercile)[T.3]:temp5Days_zipQuarterquant_E...  0.001028  0.456556
lag1_temp5Days_zipQuarterquant_Extreme             -0.003903  0.005098
C(tempTercile)[T.2]:lag1_temp5Days_zipQuarterqu...  0.001772  0.174366
C(tempTercile)[T.3]:lag1_temp5Days_zipQuarterqu...  0.003443  0.034688
lag2_temp5Days_zipQuarterquant_Extreme             -0.001324  0.362130
C(tempTercile)[T.2]:lag2_temp5Days_zipQuarterqu... -0.002912  0.046377
C(tempTercile)[T.3]:lag2_temp5Days_zipQuarterqu... -0.002137  0.206312
lag3_temp5Days_zipQuarterquant_Extreme             -0.002867  0.031902
C(tempTercile)[T.2]:lag3_temp5Days_zipQuarterqu...  0.00356

# Temperature
It seems like we're getting a pretty strong signal on precipitation: more precipitation is bad, and it's bad even (especially?) in places where background level of precipitation is high, maybe because the most extreme tail of it is that much more extreme in these places. We have a little bit more work to do with temperature. 

From the above, we find the following:

- Temperature does NOT seem to matter on a 1-day fluctuation basis.
- Temperature DOES seem to matter on a 5-day moving-average basis.
    
We can look at the following:

- Total days above 90F (another extreme; maybe interact with quartiles of avg temperature too)
- Y/N for whether there was a 7-day streak above 90F, matching PS.
- Weeks, months, and quarters at different thresholds
    - Maybe try different bins as well.


First, try the total number of days that are at least 90F. A weird result is that more days above 90F are associated with better outcomes here. REVISIT THIS.

In [22]:
start = time.time()

# OLS of lnRevNormd on days90Plus and its three lags.
tempDaysAbove90Mod = smf.ols(
    formula = 'lnRevNormd ~ days90Plus + lag1_days90Plus + lag2_days90Plus + lag3_days90Plus + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
).fit()

# elapsed seconds for the fit
print(time.time() - start)

# Condense the fit: keep only the terms whose name contains 'days90' and put
# their coefficients and p-values side by side.
keepRows = tempDaysAbove90Mod.params.index.str.contains('days90')
coeffs   = tempDaysAbove90Mod.params[keepRows].to_frame('coeffs')
pvalues  = tempDaysAbove90Mod.pvalues[keepRows].to_frame('pvals')

results = pd.concat([coeffs, pvalues], axis = 1)

print(results)

107.82512164115906
                   coeffs     pvals
days90Plus       0.000883  0.002083
lag1_days90Plus  0.000794  0.005429
lag2_days90Plus  0.000655  0.022759
lag3_days90Plus  0.000818  0.004621


If we look at the breakdown by places that are normally below, at, or above average temperature, we see that the strongest result is in places that are normally below average. This is a drop of almost 4\%.

In [23]:
start = time.time()

# days90Plus specification with every weather term interacted with C(tempTercile).
# Note: this rebinds `tempDaysAbove90Mod`, the name the un-interacted
# days90Plus fit is also stored under.
tempDaysAbove90Mod = smf.ols(
    formula = 'lnRevNormd ~ C(tempTercile)*(days90Plus + lag1_days90Plus + lag2_days90Plus + lag3_days90Plus) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
).fit()

# elapsed seconds for the fit
print(time.time() - start)

# Condense the fit: keep only the terms whose name contains 'days90' and put
# their coefficients and p-values side by side.
keepRows = tempDaysAbove90Mod.params.index.str.contains('days90')
coeffs   = tempDaysAbove90Mod.params[keepRows].to_frame('coeffs')
pvalues  = tempDaysAbove90Mod.pvalues[keepRows].to_frame('pvals')

results = pd.concat([coeffs, pvalues], axis = 1)

print(results)

94.70704102516174
                                       coeffs     pvals
days90Plus                          -0.039787  0.013798
C(tempTercile)[T.2]:days90Plus       0.040358  0.012655
C(tempTercile)[T.3]:days90Plus       0.040573  0.012013
lag1_days90Plus                      0.000852  0.111201
C(tempTercile)[T.2]:lag1_days90Plus -0.000152  0.768731
C(tempTercile)[T.3]:lag1_days90Plus  0.000278  0.666997
lag2_days90Plus                     -0.000130  0.827730
C(tempTercile)[T.2]:lag2_days90Plus  0.000950  0.158896
C(tempTercile)[T.3]:lag2_days90Plus  0.000717  0.395120
lag3_days90Plus                      0.002621  0.058465
C(tempTercile)[T.2]:lag3_days90Plus -0.001893  0.203349
C(tempTercile)[T.3]:lag3_days90Plus -0.001692  0.227867


Now let's try the same things by streaks. The effect sizes are large, but not statistically significantly estimated.

In [24]:
start = time.time()

# Streak-above-90F regressor and its first three lags, with the usual fixed
# effects (industry-by-quarter, year-quarter, firm) and tercile controls.
tempStreakAbove90Mod = smf.ols(
    formula = 'lnRevNormd ~ streak90Plus + lag1_streak90Plus + lag2_streak90Plus + lag3_streak90Plus + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
).fit()

print(time.time() - start)

# Keep the 'streak90' terms only.
coeffs = pd.DataFrame(tempStreakAbove90Mod.params, columns = ['coeffs']).loc[
    lambda frame: frame.index.str.contains('streak90')]
pvalues = pd.DataFrame(tempStreakAbove90Mod.pvalues, columns = ['pvals']).loc[
    lambda frame: frame.index.str.contains('streak90')]

results = pd.concat([coeffs, pvalues], axis = 1)

print(results)

119.95602703094482
                     coeffs     pvals
streak90Plus      -0.021078  0.238399
lag1_streak90Plus -0.011678  0.511472
lag2_streak90Plus -0.008739  0.624497
lag3_streak90Plus -0.001942  0.913540


In [34]:
start = time.time()

# Streak-above-90F specification interacted with the temperature tercile.
tempStreakAbove90Mod_intxn = smf.ols(
    formula = 'lnRevNormd ~  C(tempTercile)*(streak90Plus + lag1_streak90Plus + lag2_streak90Plus + lag3_streak90Plus) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
).fit()

print(time.time() - start)

# Keep the 'streak90' main effects and their tercile interactions.
coeffs = pd.DataFrame(tempStreakAbove90Mod_intxn.params, columns = ['coeffs']).loc[
    lambda frame: frame.index.str.contains('streak90')]
pvalues = pd.DataFrame(tempStreakAbove90Mod_intxn.pvalues, columns = ['pvals']).loc[
    lambda frame: frame.index.str.contains('streak90')]

results = pd.concat([coeffs, pvalues], axis = 1)

print(results)

113.52526903152466
                                         coeffs     pvals
streak90Plus                          -0.028383  0.353261
C(tempTercile)[T.2]:streak90Plus      -0.000389  0.993034
C(tempTercile)[T.3]:streak90Plus       0.016328  0.702954
lag1_streak90Plus                      0.010102  0.740533
C(tempTercile)[T.2]:lag1_streak90Plus -0.035661  0.404653
C(tempTercile)[T.3]:lag1_streak90Plus -0.034964  0.428799
lag2_streak90Plus                      0.046157  0.129009
C(tempTercile)[T.2]:lag2_streak90Plus -0.047172  0.296078
C(tempTercile)[T.3]:lag2_streak90Plus -0.111516  0.008230
lag3_streak90Plus                      0.015551  0.605256
C(tempTercile)[T.2]:lag3_streak90Plus -0.039146  0.371044
C(tempTercile)[T.3]:lag3_streak90Plus -0.022306  0.604505


Let's try the things by weeks, month, quarter.

In [36]:
start = time.time()

# Weekly temperature regressor for the 95_99 band: concurrent value plus
# lags 1 and 2 (lag 3 is deliberately left out of this specification).
tempWeekMod = smf.ols(
    formula = 'lnRevNormd ~  (temp_zipWeek95_99 + lag1_temp_zipWeek95_99 + lag2_temp_zipWeek95_99 ) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
).fit()

print(time.time() - start)

# Keep every term whose name contains 'temp'.
coeffs = pd.DataFrame(tempWeekMod.params, columns = ['coeffs']).loc[
    lambda frame: frame.index.str.contains('temp')]
pvalues = pd.DataFrame(tempWeekMod.pvalues, columns = ['pvals']).loc[
    lambda frame: frame.index.str.contains('temp')]

results = pd.concat([coeffs, pvalues], axis = 1)

print(results)

116.61620664596558
                          coeffs     pvals
temp_zipWeek95_99       0.002928  0.125013
lag1_temp_zipWeek95_99  0.002965  0.127086
lag2_temp_zipWeek95_99  0.002855  0.138335


If we break this down by the background temperature of the place, though, it seems like we find a similar effect in the coldest places: a warm week in the coldest places is the most negative, in the quarter concurrent with when it's warmest.


[is this the same effect? other places, did we not see a positive effect of slightly warmer weather in cooler places?]

In [42]:
start = time.time()

# Weekly 95_99 temperature regressor (concurrent, lag 1, lag 2) interacted
# with the temperature tercile; tercile controls are retained.
tempWeekMod_intxn = smf.ols(
    formula = 'lnRevNormd ~  C(tempTercile)*(temp_zipWeek95_99 + lag1_temp_zipWeek95_99 + lag2_temp_zipWeek95_99 ) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile) ',
    data = goodsData,
).fit()

print(time.time() - start)

# The 'temp' filter also picks up the C(tempTercile) main effects.
coeffs = pd.DataFrame(tempWeekMod_intxn.params, columns = ['coeffs']).loc[
    lambda frame: frame.index.str.contains('temp')]
pvalues = pd.DataFrame(tempWeekMod_intxn.pvalues, columns = ['pvals']).loc[
    lambda frame: frame.index.str.contains('temp')]

results = pd.concat([coeffs, pvalues], axis = 1)

print(results)

110.50980377197266
                                              coeffs     pvals
C(tempTercile)[T.2]                        -0.006719  0.624232
C(tempTercile)[T.3]                        -0.003850  0.825409
temp_zipWeek95_99                          -0.006341  0.106442
C(tempTercile)[T.2]:temp_zipWeek95_99       0.012628  0.012025
C(tempTercile)[T.3]:temp_zipWeek95_99       0.011278  0.020001
lag1_temp_zipWeek95_99                      0.003608  0.342254
C(tempTercile)[T.2]:lag1_temp_zipWeek95_99 -0.002525  0.603116
C(tempTercile)[T.3]:lag1_temp_zipWeek95_99  0.000892  0.857246
lag2_temp_zipWeek95_99                      0.005767  0.102441
C(tempTercile)[T.2]:lag2_temp_zipWeek95_99 -0.000077  0.986818
C(tempTercile)[T.3]:lag2_temp_zipWeek95_99 -0.007942  0.105213


Try months now.

In [38]:
start = time.time()

# Monthly temperature regressor for the 95_99 band: concurrent value plus
# lags 1 and 2 (lag 3 is left out of this specification).
tempMonthMod = smf.ols(
    formula = 'lnRevNormd ~  (temp_zipMonth95_99 + lag1_temp_zipMonth95_99 + lag2_temp_zipMonth95_99 ) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
).fit()

print(time.time() - start)

# Keep every term whose name contains 'temp'.
coeffs = pd.DataFrame(tempMonthMod.params, columns = ['coeffs']).loc[
    lambda frame: frame.index.str.contains('temp')]
pvalues = pd.DataFrame(tempMonthMod.pvalues, columns = ['pvals']).loc[
    lambda frame: frame.index.str.contains('temp')]

results = pd.concat([coeffs, pvalues], axis = 1)

print(results)

114.2045030593872
                           coeffs     pvals
temp_zipMonth95_99       0.009830  0.023806
lag1_temp_zipMonth95_99  0.004806  0.280005
lag2_temp_zipMonth95_99  0.013350  0.002527


In [40]:
start = time.time()

# Monthly 95_99 temperature regressor (concurrent, lag 1, lag 2) interacted
# with the temperature tercile.
tempMonthMod_intxn = smf.ols(
    formula = 'lnRevNormd ~  C(tempTercile)*(temp_zipMonth95_99 + lag1_temp_zipMonth95_99 + lag2_temp_zipMonth95_99 ) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile) ',
    data = goodsData,
).fit()

print(time.time() - start)

# The 'temp' filter also picks up the C(tempTercile) main effects.
coeffs = pd.DataFrame(tempMonthMod_intxn.params, columns = ['coeffs']).loc[
    lambda frame: frame.index.str.contains('temp')]
pvalues = pd.DataFrame(tempMonthMod_intxn.pvalues, columns = ['pvals']).loc[
    lambda frame: frame.index.str.contains('temp')]

results = pd.concat([coeffs, pvalues], axis = 1)

print(results)

108.40949201583862
                                               coeffs     pvals
C(tempTercile)[T.2]                         -0.004046  0.735738
C(tempTercile)[T.3]                         -0.001589  0.923088
temp_zipMonth95_99                          -0.001667  0.832368
C(tempTercile)[T.2]:temp_zipMonth95_99       0.019297  0.054603
C(tempTercile)[T.3]:temp_zipMonth95_99       0.012500  0.238812
lag1_temp_zipMonth95_99                      0.004461  0.566858
C(tempTercile)[T.2]:lag1_temp_zipMonth95_99 -0.003163  0.761903
C(tempTercile)[T.3]:lag1_temp_zipMonth95_99  0.003602  0.734986
lag2_temp_zipMonth95_99                      0.013744  0.074669
C(tempTercile)[T.2]:lag2_temp_zipMonth95_99  0.008131  0.450774
C(tempTercile)[T.3]:lag2_temp_zipMonth95_99 -0.007822  0.458455


And quarters.

In [45]:
start = time.time()

# Quarterly temperature regressor for the 95_99 band: concurrent value plus
# lags 1 and 2, with the usual fixed effects and tercile controls.
tempQuarterMod = smf.ols(
    formula = 'lnRevNormd ~  (temp_zipQuarter95_99 + lag1_temp_zipQuarter95_99 + lag2_temp_zipQuarter95_99 ) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile) ',
    data = goodsData,
).fit()

print(time.time() - start)

# Keep every term whose name contains 'temp'.
coeffs = pd.DataFrame(tempQuarterMod.params, columns = ['coeffs']).loc[
    lambda frame: frame.index.str.contains('temp')]
pvalues = pd.DataFrame(tempQuarterMod.pvalues, columns = ['pvals']).loc[
    lambda frame: frame.index.str.contains('temp')]

results = pd.concat([coeffs, pvalues], axis = 1)

print(results)

114.30202984809875
                             coeffs     pvals
temp_zipQuarter95_99       0.013951  0.068572
lag1_temp_zipQuarter95_99  0.008178  0.292307
lag2_temp_zipQuarter95_99  0.022527  0.004156


In [47]:
start = time.time()

# Quarterly temperature regressor interacted with the temperature tercile.
# NOTE(review): this uses the 90_95 band, whereas the un-interacted quarterly
# model uses 95_99 -- confirm the different band is intentional.
tempQuarterMod_intxn = smf.ols(
    formula = 'lnRevNormd ~  C(tempTercile)*(temp_zipQuarter90_95 + lag1_temp_zipQuarter90_95 + lag2_temp_zipQuarter90_95 ) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile) ',
    data = goodsData,
).fit()

print(time.time() - start)

# The 'temp' filter also picks up the C(tempTercile) main effects.
coeffs = pd.DataFrame(tempQuarterMod_intxn.params, columns = ['coeffs']).loc[
    lambda frame: frame.index.str.contains('temp')]
pvalues = pd.DataFrame(tempQuarterMod_intxn.pvalues, columns = ['pvals']).loc[
    lambda frame: frame.index.str.contains('temp')]

results = pd.concat([coeffs, pvalues], axis = 1)

print(results)

113.19785404205322
                                                 coeffs     pvals
C(tempTercile)[T.2]                            0.005795  0.614856
C(tempTercile)[T.3]                           -0.000258  0.987331
temp_zipQuarter90_95                          -0.011533  0.359505
C(tempTercile)[T.2]:temp_zipQuarter90_95       0.007451  0.646007
C(tempTercile)[T.3]:temp_zipQuarter90_95       0.026969  0.098400
lag1_temp_zipQuarter90_95                      0.011373  0.312643
C(tempTercile)[T.2]:lag1_temp_zipQuarter90_95 -0.009584  0.546676
C(tempTercile)[T.3]:lag1_temp_zipQuarter90_95 -0.005831  0.717145
lag2_temp_zipQuarter90_95                      0.010494  0.356890
C(tempTercile)[T.2]:lag2_temp_zipQuarter90_95  0.004977  0.760370
C(tempTercile)[T.3]:lag2_temp_zipQuarter90_95 -0.006950  0.669921


## Additional Tests
Now try a few other ones here. 
- Streak of days above 95th percentile, temperature and rain.
- By categories of days: 0-5, 5-10, 10-15, 15+

In [48]:
start = time.time()

# Categorical wet-streak indicator and its first three lags, with the usual
# fixed effects and tercile controls.
precipStreakMod = smf.ols(
    formula = 'lnRevNormd ~ C(wetStreak) + C(lag1_wetStreak) + C(lag2_wetStreak) + C(lag3_wetStreak) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
).fit()

print(time.time() - start)

# Keep the terms whose name contains 'wet'.
coeffs = pd.DataFrame(precipStreakMod.params, columns = ['coeffs']).loc[
    lambda frame: frame.index.str.contains('wet')]
pvalues = pd.DataFrame(precipStreakMod.pvalues, columns = ['pvals']).loc[
    lambda frame: frame.index.str.contains('wet')]

results = pd.concat([coeffs, pvalues], axis = 1)

print(results)

107.63323974609375
                          coeffs     pvals
C(wetStreak)[T.1]      -0.027029  0.082906
C(lag1_wetStreak)[T.1] -0.026606  0.088871
C(lag2_wetStreak)[T.1]  0.000444  0.981029
C(lag3_wetStreak)[T.1] -0.033990  0.024229


In [54]:
start = time.time()

# Categorical hot-streak indicator and its first three lags, with the usual
# fixed effects and tercile controls.
tempStreakMod = smf.ols(
    formula = 'lnRevNormd ~ C(hotStreak) + C(lag1_hotStreak) + C(lag2_hotStreak) + C(lag3_hotStreak) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
).fit()

print(time.time() - start)

# Keep the terms whose name contains 'hot'.
coeffs = pd.DataFrame(tempStreakMod.params, columns = ['coeffs']).loc[
    lambda frame: frame.index.str.contains('hot')]
pvalues = pd.DataFrame(tempStreakMod.pvalues, columns = ['pvals']).loc[
    lambda frame: frame.index.str.contains('hot')]

results = pd.concat([coeffs, pvalues], axis = 1)

print(results)

111.07659530639648
                          coeffs     pvals
C(hotStreak)[T.1]       0.000941  0.884783
C(lag1_hotStreak)[T.1]  0.003434  0.533000
C(lag2_hotStreak)[T.1]  0.014003  0.019287
C(lag3_hotStreak)[T.1]  0.012590  0.041427


Try with the different breakout categories of what's coming together.

In [53]:
start = time.time()

# Wet-days category and its first three lags as categorical regressors.
# The fit uses default (non-clustered) standard errors.
precipCatMod = smf.ols(
    formula = 'lnRevNormd ~ C(wetDaysCat) + C(lag1_wetDaysCat) + C(lag2_wetDaysCat) + C(lag3_wetDaysCat) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
).fit()

print(time.time() - start)

# Keep the terms whose name contains 'wet'.
coeffs = pd.DataFrame(precipCatMod.params, columns = ['coeffs']).loc[
    lambda frame: frame.index.str.contains('wet')]
pvalues = pd.DataFrame(precipCatMod.pvalues, columns = ['pvals']).loc[
    lambda frame: frame.index.str.contains('wet')]

results = pd.concat([coeffs, pvalues], axis = 1)

print(results)

111.89800381660461
                          coeffs     pvals
C(wetStreak)[T.1]      -0.027029  0.082906
C(lag1_wetStreak)[T.1] -0.026606  0.088871
C(lag2_wetStreak)[T.1]  0.000444  0.981029
C(lag3_wetStreak)[T.1] -0.033990  0.024229


In [52]:
start = time.time()

# Hot-days category and its first three lags as categorical regressors,
# with the usual fixed effects and tercile controls.
tempCatMod = smf.ols(
    formula = 'lnRevNormd ~ C(hotDaysCat) + C(lag1_hotDaysCat) + C(lag2_hotDaysCat) + C(lag3_hotDaysCat) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
).fit()

print(time.time() - start)

# Keep the terms whose name contains 'hot'.
coeffs = pd.DataFrame(tempCatMod.params, columns = ['coeffs']).loc[
    lambda frame: frame.index.str.contains('hot')]
pvalues = pd.DataFrame(tempCatMod.pvalues, columns = ['pvals']).loc[
    lambda frame: frame.index.str.contains('hot')]

results = pd.concat([coeffs, pvalues], axis = 1)

print(results)

123.73698306083679
                              coeffs     pvals
C(hotDaysCat)[T.cat2]       0.006906  0.409917
C(hotDaysCat)[T.cat3]       0.005226  0.557844
C(hotDaysCat)[T.cat4]       0.013512  0.229831
C(lag1_hotDaysCat)[T.cat2] -0.005885  0.492355
C(lag1_hotDaysCat)[T.cat3] -0.002182  0.801170
C(lag1_hotDaysCat)[T.cat4]  0.014211  0.173030
C(lag2_hotDaysCat)[T.cat2]  0.003182  0.656881
C(lag2_hotDaysCat)[T.cat3]  0.009131  0.235569
C(lag2_hotDaysCat)[T.cat4]  0.012884  0.139918
C(lag3_hotDaysCat)[T.cat2] -0.000532  0.937934
C(lag3_hotDaysCat)[T.cat3]  0.002991  0.700001
C(lag3_hotDaysCat)[T.cat4]  0.024133  0.004881


In [56]:
start = time.time()

# Hot-days categories (concurrent and lags 1-3) interacted with the
# temperature tercile.
tempCatMod_intxn = smf.ols(
    formula = 'lnRevNormd ~ C(tempTercile)*(C(hotDaysCat) + C(lag1_hotDaysCat) + C(lag2_hotDaysCat) + C(lag3_hotDaysCat)) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
).fit()

print(time.time() - start)

# Keep the 'hot' main effects and their tercile interactions.
coeffs = pd.DataFrame(tempCatMod_intxn.params, columns = ['coeffs']).loc[
    lambda frame: frame.index.str.contains('hot')]
pvalues = pd.DataFrame(tempCatMod_intxn.pvalues, columns = ['pvals']).loc[
    lambda frame: frame.index.str.contains('hot')]

results = pd.concat([coeffs, pvalues], axis = 1)

print(results)

117.95130610466003
                                                  coeffs     pvals
C(hotDaysCat)[T.cat2]                           0.003924  0.729189
C(hotDaysCat)[T.cat3]                           0.001932  0.887118
C(hotDaysCat)[T.cat4]                           0.003083  0.845820
C(lag1_hotDaysCat)[T.cat2]                     -0.011356  0.360955
C(lag1_hotDaysCat)[T.cat3]                     -0.007938  0.557421
C(lag1_hotDaysCat)[T.cat4]                      0.019565  0.215048
C(lag2_hotDaysCat)[T.cat2]                     -0.007020  0.543209
C(lag2_hotDaysCat)[T.cat3]                      0.002584  0.845864
C(lag2_hotDaysCat)[T.cat4]                      0.014563  0.323895
C(lag3_hotDaysCat)[T.cat2]                      0.003105  0.780632
C(lag3_hotDaysCat)[T.cat3]                     -0.017960  0.185827
C(lag3_hotDaysCat)[T.cat4]                     -0.003815  0.802959
C(tempTercile)[T.2]:C(hotDaysCat)[T.cat2]      -0.001703  0.918860
C(tempTercile)[T.3]:C(hotDaysCat)[T.cat2]  

# Robustness Checks
Try playing with temperature a little bit more. Look at:
    - interaction with concentration
    - establishment-weighted vars

In [None]:
start = time.time()

# Hot-days categories (concurrent and lags 1-3) interacted with the firm
# concentration tercile. Model construction and fitting are kept separate so
# the fit can later be switched to firm-clustered errors.
tempStreakConcMod = smf.ols(
    formula = 'lnRevNormd ~ C(firmConcTercile)*(C(hotDaysCat) + C(lag1_hotDaysCat) + C(lag2_hotDaysCat) + C(lag3_hotDaysCat)) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
)
# Default (non-clustered) covariance for now.
tempStreakConcRes = tempStreakConcMod.fit()

print(time.time() - start)

tempStreakConcRes.summary()

Try the temperature as defined by super super hot days, anywhere in the country - 95th percentile anywhere. This will only happen in a few places, or at least, there will be some geographic skew. But we can control for that by looking at the effect of hot temps given different baselines.

In [None]:
start = time.time()

# Cluster groups for the covariance estimator: one label per row, keyed on
# the firm identifier. Defined here so the cell does not depend on `firms`
# having been created by a later cell (hidden kernel state).
# NOTE(review): the formula interface drops rows with missing values; if any
# rows are dropped, the groups must be restricted to the same rows -- confirm.
firms = goodsData['gvkey']

# Country-wide ("annual") extreme-temperature regressor and its first three
# lags, with fixed effects only (no age / profit / size tercile controls).
tempModAnnual_noControls = smf.ols(
    formula = 'lnRevNormd ~ temp_annualquant_Extreme + lag1_temp_annualquant_Extreme + lag2_temp_annualquant_Extreme + lag3_temp_annualquant_Extreme + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey)',
    data = goodsData,
)
# Firm-clustered standard errors with t-based inference.
tempResAnnual_noControls = tempModAnnual_noControls.fit(
    cov_type = 'cluster', cov_kwds = {'groups': firms}, use_t = True)

print(time.time() - start)

tempResAnnual_noControls.summary()


Let's try the standard interactions, controlling for the background climate in given places.

If we look at the below, we see that the places that are normally coolest are negatively impacted by extreme extremes. Specifically, using an across-the-country cutoff for temperature, we have that the biggest negative effect happens in the places that are normally the lowest-temperature.

This gives some promise that we might find an effect of temperature in some places, depending on expectation or baseline climate.

In [None]:
start = time.time()

# Country-wide extreme-temperature regressor (concurrent and lags 1-3)
# interacted with the temperature tercile, with the full set of controls.
tempEstMod_annual = smf.ols(
    formula = 'lnRevNormd ~ C(tempTercile)*(temp_annualquant_Extreme + lag1_temp_annualquant_Extreme + lag2_temp_annualquant_Extreme + lag3_temp_annualquant_Extreme) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
)

# Default (non-clustered) covariance.
tempResMod_annual = tempEstMod_annual.fit()

print(time.time() - start)

tempResMod_annual.summary()

Let's try it by precipitation tercile for comparison's sake.

In [None]:
start = time.time()

# Country-wide extreme-precipitation regressor (concurrent and lags 1-3)
# interacted with the precipitation tercile, with the full set of controls.
precipEstMod_annual = smf.ols(
    formula = 'lnRevNormd ~ C(precipTercile)*(precip_annualquant_Extreme + lag1_precip_annualquant_Extreme + lag2_precip_annualquant_Extreme + lag3_precip_annualquant_Extreme) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
)

# Default (non-clustered) covariance.
precipResMod_annual = precipEstMod_annual.fit()

print(time.time() - start)

precipResMod_annual.summary()

Now let's make sure we have the originals, the OGs, for comparison.

In [None]:
start = time.time()

# Zip-quarter extreme-temperature regressor (concurrent and lags 1-3)
# interacted with the temperature tercile -- the original definition, kept
# for comparison against the country-wide cutoff.
tempEstMod_zipQuarter = smf.ols(
    formula = 'lnRevNormd ~ C(tempTercile)*(temp_zipQuarterquant_Extreme + lag1_temp_zipQuarterquant_Extreme + lag2_temp_zipQuarterquant_Extreme + lag3_temp_zipQuarterquant_Extreme) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
)

# Default (non-clustered) covariance.
tempResMod_zipQuarter = tempEstMod_zipQuarter.fit()

print(time.time() - start)

tempResMod_zipQuarter.summary()

In [None]:
start = time.time()

# Zip-quarter extreme-precipitation regressor (concurrent and lags 1-3)
# interacted with the precipitation tercile.
precipEstMod_zipQuarter = smf.ols(
    formula = 'lnRevNormd ~ C(precipTercile)*(precip_zipQuarterquant_Extreme + lag1_precip_zipQuarterquant_Extreme + lag2_precip_zipQuarterquant_Extreme + lag3_precip_zipQuarterquant_Extreme) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
)

# Default (non-clustered) covariance.
precipResMod_zipQuarter = precipEstMod_zipQuarter.fit()

print(time.time() - start)

precipResMod_zipQuarter.summary()

# Industry-Specific

Start to do some of the heterogeneity analysis.

In [None]:
# Industry heterogeneity: zip-quarter extreme precipitation (concurrent and
# lags 1-3) interacted with the industry group. `coeff` and `pvals` are
# consumed by the table-building cell that follows.
precipMod_byInd = smf.ols(
    formula = 'lnRevNormd ~ C(indGroup)*(precip_zipQuarterquant_Extreme + lag1_precip_zipQuarterquant_Extreme + lag2_precip_zipQuarterquant_Extreme + lag3_precip_zipQuarterquant_Extreme) + C(indGroup)*C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
).fit()
coeff, pvals = precipMod_byInd.params, precipMod_byInd.pvalues


In [None]:
# Display the full regression summary for the industry-interacted precipitation fit.
precipMod_byInd.summary()

In [None]:
# Build a compact "coefficient (p-value)" table for the industry-interacted
# precipitation model: one row per interaction prefix, one column per lag.
# Reads `coeff` and `pvals` from the cell that fitted precipMod_byInd and
# writes the pivoted table to take2.csv.
phrase    = 'precip_zipQuarterquant_Extreme'

# Restrict to terms that involve the weather regressor.
condition = [s for s in coeff.index if phrase in s]
coeffs_ofInt = coeff[condition]
pvals_ofInt  = pvals[condition]

# Resets the shared `results` frame (not used further in this cell).
results = pd.DataFrame()

# Split the terms by lag. Each label list is sized from its OWN term list so
# labels stay aligned with coefficients even if the lag groups differ in size.
lag0   = [s for s in coeffs_ofInt.index if ('lag' not in s)]
coeff0 = coeffs_ofInt[lag0]
pval0  = pvals_ofInt[lag0]
lags0  = ['t']*len(lag0)

lag1   = [s for s in coeffs_ofInt.index if ('lag1' in s)]
coeff1 = coeffs_ofInt[lag1]
pval1  = pvals_ofInt[lag1]
lags1  = ['t-1']*len(lag1)

lag2   = [s for s in coeffs_ofInt.index if ('lag2' in s)]
coeff2 = coeffs_ofInt[lag2]
pval2  = pvals_ofInt[lag2]
lags2  = ['t-2']*len(lag2)

lag3   = [s for s in coeffs_ofInt.index if ('lag3' in s)]
coeff3 = coeffs_ofInt[lag3]
pval3  = pvals_ofInt[lag3]
lags3  = ['t-3']*len(lag3)

allNames = list(itertools.chain(lag0,lag1,lag2,lag3))
# Text before the first ':' is the interaction prefix; for un-interacted
# terms this is the term name itself.
intxns   = [char.split(':')[0] for char in allNames]
allCoefs = list(itertools.chain(coeff0,coeff1,coeff2,coeff3))
allPVals = list(itertools.chain(pval0,pval1,pval2,pval3))
allLagLabels = list(itertools.chain(lags0,lags1,lags2,lags3))

# Format each estimate as "coef (pval)". A comprehension is used instead of a
# loop variable named `next`, which shadowed the builtin for the whole kernel.
coefsWithPVals = ["%.4f (%.2f)" % (c, p) for c, p in zip(allCoefs, allPVals)]

take2 = pd.DataFrame([intxns,allLagLabels,coefsWithPVals]).T
take2.columns = ['indInteraction','allLagLabels','coefsWithPVals']
take2.pivot(index='indInteraction', columns='allLagLabels', values='coefsWithPVals').reset_index().to_csv('take2.csv')


Now try with the total number of industries as described in the other doc.

In [None]:
# Single extreme-precipitation regressor interacted with the industry group.
# `coeff` and `pvals` are consumed by the table-building cell that follows.
precipTotal_byInd = smf.ols(
    formula = 'lnRevNormd ~ C(indGroup)*(extremePrecip) + C(indGroup)*C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
).fit()
coeff, pvals = precipTotal_byInd.params, precipTotal_byInd.pvalues


In [None]:
# Display the full regression summary for the extremePrecip-by-industry fit.
precipTotal_byInd.summary()

In [None]:
# Build a "coefficient (p-value)" table for the extremePrecip-by-industry
# model. Reads `coeff` and `pvals` from the cell that fitted
# precipTotal_byInd and writes the table to take3.csv.
phrase    = 'extremePrecip'

# Restrict to terms that involve the weather regressor.
condition = [s for s in coeff.index if phrase in s]
coeffs_ofInt = coeff[condition]
pvals_ofInt  = pvals[condition]

# Resets the shared `results` frame (not used further in this cell).
results = pd.DataFrame()

allNames = coeffs_ofInt.index
# Text before the first ':' is the interaction prefix; for the un-interacted
# term this is the term name itself.
intxns   = [char.split(':')[0] for char in allNames]
allCoefs = list(coeffs_ofInt)
allPVals = list(pvals_ofInt)

# Format each estimate as "coef (pval)". A comprehension is used instead of a
# loop variable named `next`, which shadowed the builtin for the whole kernel.
coefsWithPVals = ["%.4f (%.2f)" % (c, p) for c, p in zip(allCoefs, allPVals)]

print(coefsWithPVals)

take3 = pd.DataFrame([intxns,coefsWithPVals]).T
take3.columns = ['indInteraction','coefsWithPVals']

print(take3)

take3.to_csv('take3.csv')

In [None]:
Now try this for each regression separately.

Do the same for temperature.

In [None]:
# Cluster groups for the covariance estimator, keyed on the firm identifier.
# Defined here so the cell does not depend on `firms` having been created by
# a later cell (hidden kernel state).
# NOTE(review): the formula interface drops rows with missing values; if any
# rows are dropped, the groups must be restricted to the same rows -- confirm.
firms = goodsData['gvkey']

# Industry heterogeneity for temperature: zip-quarter extreme temperature
# (concurrent and lags 1-3) interacted with the industry group.
tempMod_byInd = smf.ols(
    formula = 'lnRevNormd ~ C(indGroup)*(temp_zipQuarterquant_Extreme + lag1_temp_zipQuarterquant_Extreme + lag2_temp_zipQuarterquant_Extreme + lag3_temp_zipQuarterquant_Extreme) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
)
# Firm-clustered standard errors with t-based inference.
tempMod_byInd_res = tempMod_byInd.fit(
    cov_type = 'cluster', cov_kwds = {'groups': firms}, use_t = True)


tempMod_byInd_res.summary()


Try just the concurrent quarter:

In [None]:
# Concurrent-quarter-only version of the industry-interacted precipitation
# model (no lags).
# Note: this rebinds precipMod_byInd to an unfitted model; the fitted result
# lives in precipMod_byInd_res.
precipMod_byInd = smf.ols(
    formula = 'lnRevNormd ~ C(indGroup)*(precip_zipQuarterquant_Extreme) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
)
precipMod_byInd_res = precipMod_byInd.fit()


precipMod_byInd_res.summary()


Try with the categories.

In [None]:
# Hot-days categories (concurrent and lags 1-3) interacted with the industry
# group; default (non-clustered) covariance.
hotCat_byInd = smf.ols(
    formula = 'lnRevNormd ~ C(indGroup)*(C(hotDaysCat) + C(lag1_hotDaysCat) + C(lag2_hotDaysCat) + C(lag3_hotDaysCat)) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
)
hotCat_byInd_res = hotCat_byInd.fit()

hotCat_byInd_res.summary()

In [None]:
# Wet-days categories (concurrent and lags 1-3) interacted with the industry
# group; default (non-clustered) covariance.
wetCat_byInd = smf.ols(
    formula = 'lnRevNormd ~ C(indGroup)*(C(wetDaysCat) + C(lag1_wetDaysCat) + C(lag2_wetDaysCat) + C(lag3_wetDaysCat)) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)',
    data = goodsData,
)
wetCat_byInd_res = wetCat_byInd.fit()

wetCat_byInd_res.summary()




It seems like if we split hairs by dividing things up the last few quarters, everything starts to go a little haywire. The most generous description is something like, we can't separately identify the effects from different quarters, and there's a lot of fairly collinear effects. There are a few less generous descriptions as well, including that there's not necessarily much signal here. 


One of the understated pros of all of this is that the r-squared values are all very high - we're getting great identification here. We could potentially expand the data sample.

Things for Larry tomorrow:
    - emphasis on, here is the specific regression form. here's why i think it is good/bad
    - main precipitation + temperature plot
    - a sense of the heterogeneity, by types of place
    - a little discussion of what to do about temperature: focus on a higher cutoff, the effects in places that aren't quite used to it, and the effects on firms that have more of their operations concentrated in one place
           - the problem with our current definition (zip-quarter) is that for some quarters, we don't have high enough baselines to really register the types of high temperatures 
           - it seems like there might be more variability in precipitation? or at least, more zipcodes seem to trigger it than trigger the temperature threshold
    - some of the industry - intxn results
    - some of the specific industry results
    - discussion of future results: indirect effect results, stock results, by concentration of firm 
    - a discussion of the different time frames: the further back, the less insight we have into what businesses are saying about all of this. the different data sources to mention are: disclosures (8-Ks); PRISM; zipcodes; compustat

----------------------------------

In [None]:
# List the distinct industry-group labels present in goodsData.
goodsData.indGroup.unique()

In [None]:
# Direct-effects regressions built from pre-computed dummy columns rather than
# a formula. For every (outcome, weather, statistic, cutoff) combination this
# fits OLS with firm-clustered standard errors and stores the weather
# coefficients, p-values and standard errors as one row of `results`.
cutoffVarsYr = ['0.95']
weatherVars  = ['precip_']
statVarsYr   = ['zipQuarterquant_']
outcomeVars  = ['lnRevNormd']

# Drop rows with missing log revenue or log cost. Rebinding goodsData is
# idempotent: re-running the filter selects the same rows.
goodsData = goodsData[~goodsData.lnRev.isna() & ~goodsData.lnCost.isna()]


start = time.time()

results = pd.DataFrame()

i = 0
for outcomeVar in outcomeVars:
    for weatherVar in weatherVars:
        for statVar in statVarsYr:
            for cutoffVar in cutoffVarsYr:
                i = i + 1
                indVar = weatherVar + statVar + cutoffVar

                print(outcomeVar, "~", indVar)

                # Weather regressors: columns matching indVar, excluding the
                # 'empWt_' variants. Computed once and reused below.
                isWeatherCol = (goodsData.columns.str.contains(indVar) &
                                ~goodsData.columns.str.contains('empWt_'))
                # Fixed-effect dummies: industry-quarter and firm.
                # The age / size / profit tercile dummies are intentionally
                # left out of this specification (a stray, half-commented
                # 'ageTercile_' line previously made this cell unparseable).
                isFixedEffectCol = (goodsData.columns.str.contains('indQtr_') |
                                    goodsData.columns.str.contains('gvkey_'))
                weatherCols = goodsData.columns[isWeatherCol]

                X = goodsData.loc[:, isWeatherCol | isFixedEffectCol]
                X = sm.add_constant(X)

                # Cluster the covariance on the firm identifier.
                firms = goodsData['gvkey']

                y = goodsData[outcomeVar]

                model = sm.OLS(y, X).fit(cov_type='cluster',cov_kwds={'groups': firms},use_t=True)

                # Select the weather terms by name rather than by a positional
                # slice, so the result does not depend on where those columns
                # sit in goodsData. Standard errors come from `model`
                # (the previous `modelResults` name was undefined).
                coeff = model.params[weatherCols]
                pvals = model.pvalues[weatherCols]
                errs  = model.bse[weatherCols]
                # print(model.summary())
                print(coeff)
                print(pvals)

                # NOTE(review): `ind` is not assigned anywhere in this cell; it
                # must already exist in the kernel (presumably left over from
                # an industry loop elsewhere) -- confirm the intended label.
                results.loc[i,'industry'] = ind

                results.loc[i,'outcomeVar'] = outcomeVar
                results.loc[i,'weatherVar'] = weatherVar

                # Expects five weather columns in lag order (lag0..lag4).
                # .iloc makes the positional access explicit; plain [] on a
                # label-indexed Series is deprecated for integer positions.
                for lag in range(5):
                    results.loc[i, 'lag%d' % lag] = coeff.iloc[lag]

                for lag in range(5):
                    results.loc[i, 'pval%d' % lag] = pvals.iloc[lag]

                for lag in range(5):
                    results.loc[i, 'bse%d' % lag] = errs.iloc[lag]

                # results.to_csv("../../data/utilitiesResults_rightInds_noCtrls.csv")

                print( time.time() - start)

In [None]:
# Streak regressions on whatever `goodsData` is currently loaded: for every
# outcome/weather pair, regress the outcome on the streak columns plus the
# industry-quarter and firm dummies, clustering standard errors by firm (gvkey).
weatherVars  = ['hotStreak', 'wetStreak'] # , 'temp5Days_', 'precip5Days_'] # , 'precip_']#, , ] #[,]
outcomeVars  = ['lnRevNormd', 'lnCostNormd'] # , 'lnRev', 'lnCost', 'revenueChange', 'costChange']

# keep only rows where both log revenue and log cost are observed
goodsData = goodsData[~goodsData.lnRev.isna() & ~goodsData.lnCost.isna()] # & ~goodsData.lnCostNormd.isna()]


start = time.time()

results = pd.DataFrame()

# the control dummies and the cluster variable are the same for every specification
isControl = (goodsData.columns.str.contains('indQtr_') |
             goodsData.columns.str.contains('gvkey_'))
firms = goodsData['gvkey']

i = 0
for outcomeVar in outcomeVars:
    for weatherVar in weatherVars:
        i = i + 1
        indVar = weatherVar

        print(outcomeVar, "~", indVar)

        # weather regressors: columns matching indVar, excluding the employment-weighted versions
        isWeather   = goodsData.columns.str.contains(indVar) & ~goodsData.columns.str.contains('empWt_')
        weatherCols = goodsData.columns[isWeather]

        X = sm.add_constant(goodsData.loc[:, isWeather | isControl])
        print(X.columns)

        y = goodsData[outcomeVar]

        model = sm.OLS(y, X).fit(cov_type='cluster',cov_kwds={'groups': firms},use_t=True)

        # Pull the weather estimates by column name rather than by position, so the
        # result does not depend on where those columns sit in X. The standard errors
        # must come from this fit (`model`), not from a `modelResults` object left
        # behind by another cell.
        coeff = model.params.loc[weatherCols]
        pvals = model.pvalues.loc[weatherCols]
        errs  = model.bse.loc[weatherCols]
        print(coeff)
        print(pvals)

        # NOTE(review): `ind` is not assigned anywhere in this cell; it is whatever an
        # earlier cell left in the kernel - confirm this is the intended industry label.
        results.loc[i,'industry'] = ind

        results.loc[i,'outcomeVar'] = outcomeVar
        results.loc[i,'weatherVar'] = weatherVar

        # one column per lag; .iloc makes the positional lookup explicit
        for prefix, estimates in (('lag', coeff), ('pval', pvals), ('bse', errs)):
            for lag in range(5):
                results.loc[i, prefix + str(lag)] = estimates.iloc[lag]

        print( time.time() - start)


In [None]:
# Persist the current `results` table (index included, pandas' default) as a CSV.
results.to_csv("../../data/utilitiesResults_rightInds.csv")

### Employment-Wtd Weather
Run the regressions using the employment-weighted weather data.

In [None]:
# Specification for a single employment-weighted run.
outcomeVar = 'lnRevNormd'
weatherVar, statVar, cutoffVar = 'precip_', 'zipquant_', '0.95'

# name fragment shared by the weather regressors of this specification
indVar = ''.join([weatherVar, statVar, cutoffVar])

# Preview the employment-weighted columns that match the specification.
goodsData.columns[goodsData.columns.str.contains('empWt_') & goodsData.columns.str.contains(indVar)]

In [None]:
# One-off employment-weighted regression for a single industry file, printed in full.
cutoffVar   = '0.95'
weatherVar  = 'precip_'
statVar     = 'zipquant_'   # was assigned as `statVarYr`, while the code below reads `statVar`
statVarYr   = statVar       # old name kept in case another cell still refers to it
outcomeVar  = 'lnRevNormd'

ind = 2


##################
filename = '../../data/companyData/igData_ind' + str(ind) + '.csv'
goodsData = pd.read_csv(filename).drop(columns = {'Unnamed: 0'})


indVar = weatherVar + statVar + cutoffVar


print(outcomeVar, "~", indVar)


# employment-weighted weather regressors for this specification
isWeather   = goodsData.columns.str.contains(indVar) & goodsData.columns.str.contains('empWt_')
weatherCols = goodsData.columns[isWeather]

# industry-quarter / firm dummies and the tercile controls
isControl = (goodsData.columns.str.contains('indQtr_') |
             goodsData.columns.str.contains('gvkey_')  |
             goodsData.columns.str.contains('ageTercile_') |
             goodsData.columns.str.contains('sizeTercile_') |
             goodsData.columns.str.contains('profitTercile_'))

# no explicit constant is added in this cell
X = goodsData.loc[:, isWeather | isControl]


print(X.columns)

firms = goodsData['gvkey']


y = goodsData[outcomeVar]


model = sm.OLS(y, X).fit(cov_type='cluster',cov_kwds={'groups': firms},use_t=True)

# Select the weather estimates by name: the previous positional slice was sized by
# every column containing indVar, not just the employment-weighted ones that are in X.
pvals = model.pvalues.loc[weatherCols]
coeff = model.params.loc[weatherCols]

print(model.summary())

## Industry-Specific
Go through every Fama-French industry and run the regressions above. First do this using the days of extreme weather at company headquarters (HQs).

### HQs

In [None]:
# Load the pooled company file and discard the index column written by an earlier to_csv.
goodsData = pd.read_csv("../../data/companyData/goodsData_igData.csv")
goodsData = goodsData.drop(columns = {'Unnamed: 0'})

# distinct industry-group codes present in the data
industries = goodsData['indGroup'].unique()

In [None]:
# Display `results` as left by a previously executed cell (this cell does not build it).
results

In [None]:
# HQ-weather regressions, one industry file at a time. Each specification regresses
# the outcome on lags 0-3 of the weather measure (lag4 columns are excluded) plus
# industry-quarter, firm and tercile dummies, with firm-clustered standard errors.
# Every result cell is stored as "coefficient (p-value)".
cutoffVarsYr = ['0.95']
weatherVars  = ['precip_'] # , 'temp_']
statVarsYr   = ['zipQuarterquant_']
outcomeVars  = ['lnRevNormd'] # , 'lnCostNormd']


start = time.time()

results = pd.DataFrame()

i = 0

for ind in industries:
    print('##########################################################')
    print(ind)
    filename = '../../data/companyData/igData_ind' + str(ind) + '.csv'
    goodsData = pd.read_csv(filename).drop(columns = {'Unnamed: 0'})

    # nothing to estimate for an empty industry file
    if goodsData.shape[0] == 0:
        continue

    # controls and the cluster variable are shared by every specification of this industry
    isControl = (goodsData.columns.str.contains('indQtr_') |
                 goodsData.columns.str.contains('gvkey_') |
                 goodsData.columns.str.contains('ageTercile_') |
                 goodsData.columns.str.contains('sizeTercile_') |
                 goodsData.columns.str.contains('profitTercile_'))
    firms = goodsData['gvkey']

    for outcomeVar, weatherVar, statVar, cutoffVar in itertools.product(
            outcomeVars, weatherVars, statVarsYr, cutoffVarsYr):

        i = i + 1

        indVar = weatherVar + statVar + cutoffVar

        print(outcomeVar, "~", indVar)

        # HQ (not employment-weighted) weather regressors, without the lag4 column
        isWeather = (goodsData.columns.str.contains(indVar) &
                     ~goodsData.columns.str.contains('empWt_') &
                     ~goodsData.columns.str.contains('lag4'))
        weatherCols = goodsData.columns[isWeather]

        X = sm.add_constant(goodsData.loc[:, isWeather | isControl])

        y = goodsData[outcomeVar]

        model = sm.OLS(y, X).fit(cov_type='cluster',cov_kwds={'groups': firms},use_t=True)

        # Select by column name: the earlier positional slice was sized by a mask that
        # still counted the lag4 columns left out of X.
        pvals = model.pvalues.loc[weatherCols]
        coeff = model.params.loc[weatherCols]
        errs  = model.bse.loc[weatherCols]

        results.loc[i,'industry'] = ind

        results.loc[i,'outcomeVar'] = outcomeVar
        results.loc[i,'weatherVar'] = weatherVar

        # "coef (pval)" strings for lags 0-3; .iloc makes the positional lookup explicit
        for lag in range(4):
            results.loc[i, 'lag' + str(lag)] = "%.4f (%.2f)" % (coeff.iloc[lag], pvals.iloc[lag])

        # number of observations used in the fit
        results.loc[i,'n'] = X.shape[0]

        # rewrite the file after every specification so a crash keeps the finished rows
        results.to_csv("../../allIndustryResults.csv")

        print( time.time() - start)
                        



In [None]:
# Write `results` to the current working directory. Note the relative path differs
# from the "../../allIndustryResults.csv" target used inside the regression loop.
results.to_csv("allIndustryResults.csv")


In [None]:
# Plain-text dump of the results table.
print(results)

# The triple-quoted string below is disabled code (merging industry names and
# re-saving), not a docstring. It is never executed, but as the last expression
# of the cell its text is echoed as the cell's output.
'''# merge in the industry names
conversionTable = pd.read_csv("../../data/indMapping.csv")
conversionTable.dropna(inplace=True)
conversionTable.reset_index(drop = True, inplace = True)

conversionTable.head()

results = results.merge(conversionTable)

results.to_csv("../../allIndustryResults.csv")
'''

In [None]:
# Rich display of the current `results` table.
results

Try this with the streak data.

In [None]:
# Streak regressions, one industry file at a time: outcome ~ streak columns +
# industry-quarter, firm and tercile dummies, with firm-clustered standard errors.
# Coefficients, standard errors and p-values for five lags go into `results`.
weatherVars  = ['hotStreak', 'wetStreak']
outcomeVars  = ['lnRevNormd', 'lnCostNormd']


industries = range(1,44)


start = time.time()

results = pd.DataFrame()

i = 0

for ind in industries:
    print('##########################################################')
    print(ind)
    filename = '../../data/companyData/igData_ind' + str(ind) + '.csv'
    goodsData = pd.read_csv(filename).drop(columns = {'Unnamed: 0'})

    # nothing to estimate for an empty industry file
    if goodsData.shape[0] == 0:
        continue

    # controls and the cluster variable are shared by every specification of this industry
    isControl = (goodsData.columns.str.contains('indQtr_') |
                 goodsData.columns.str.contains('gvkey_')  |
                 goodsData.columns.str.contains('ageTercile_') |
                 goodsData.columns.str.contains('sizeTercile_') |
                 goodsData.columns.str.contains('profitTercile_'))
    firms = goodsData['gvkey']

    for outcomeVar, weatherVar in itertools.product(outcomeVars, weatherVars):
        i = i + 1

        indVar = weatherVar

        print(outcomeVar, "~", indVar)

        # streak regressors, excluding the employment-weighted versions
        isWeather   = goodsData.columns.str.contains(indVar) & ~goodsData.columns.str.contains('empWt_')
        weatherCols = goodsData.columns[isWeather]

        X = sm.add_constant(goodsData.loc[:, isWeather | isControl])

        y = goodsData[outcomeVar]

        model = sm.OLS(y, X).fit(cov_type='cluster',cov_kwds={'groups': firms},use_t=True)

        # select the weather estimates by column name instead of assuming they are
        # the first entries after the constant
        pvals = model.pvalues.loc[weatherCols]
        coeff = model.params.loc[weatherCols]
        errs  = model.bse.loc[weatherCols]

        results.loc[i,'industry'] = ind

        results.loc[i,'outcomeVar'] = outcomeVar
        results.loc[i,'weatherVar'] = weatherVar

        # one column per lag; .iloc makes the positional lookup explicit
        for prefix, estimates in (('lag', coeff), ('bse', errs), ('pval', pvals)):
            for lag in range(5):
                results.loc[i, prefix + str(lag)] = estimates.iloc[lag]

        # rewrite the file after every specification so a crash keeps the finished rows
        results.to_csv("../../allIndustryResults_streaks.csv")

        print( time.time() - start)


# merge in the industry names (pandas joins on the columns the two tables share)
conversionTable = pd.read_csv("../../data/indMapping.csv").dropna().reset_index(drop = True)

results = results.merge(conversionTable)


results.to_csv("../../allIndustryResults_streaks.csv")

In [None]:
# Preview the first rows of the current `results` table.
results.head()

In [None]:
# Reload the saved streak results from disk, dropping the index column that
# to_csv wrote, then preview the first rows.
results = pd.read_csv("../../allIndustryResults_streaks.csv").drop(columns = {'Unnamed: 0'})
results.head()

### Employment Weights

Now do this for the employment-weighted average of the days of extreme weather.

In [None]:
# Employment-weighted weather regressions, one industry file at a time:
# outcome ~ empWt_ weather columns + industry-quarter, firm and tercile dummies,
# with firm-clustered standard errors. Five lags are stored per specification.
cutoffVarsYr = ['0.95'] # , '1x5Qtrs', '1x5Yrs'] # '1x5Qtrs',
weatherVars  = ['precip_', 'temp_']        #, 'temp5Days_', 'precip5Days_'] # , 'precip_']#, , ] #[,]
statVarsYr   = ['zipQuarterquant_']
outcomeVars  = ['lnRevNormd', 'lnCostNormd']

industries = range(1,44)

start = time.time()

results = pd.DataFrame()

i = 0


for ind in industries:
    print('##########################################################')
    print(ind)
    filename = '../../data/companyData/igData_ind' + str(ind) + '.csv'
    goodsData = pd.read_csv(filename).drop(columns = {'Unnamed: 0'})

    # nothing to estimate for an empty industry file
    if goodsData.shape[0] == 0:
        continue

    # controls and the cluster variable are shared by every specification of this industry
    isControl = (goodsData.columns.str.contains('indQtr_') |
                 goodsData.columns.str.contains('gvkey_')  |
                 goodsData.columns.str.contains('ageTercile_') |
                 goodsData.columns.str.contains('sizeTercile_') |
                 goodsData.columns.str.contains('profitTercile_'))
    firms = goodsData['gvkey']

    for outcomeVar, weatherVar, statVar, cutoffVar in itertools.product(
            outcomeVars, weatherVars, statVarsYr, cutoffVarsYr):

        i = i + 1

        indVar = weatherVar + statVar + cutoffVar

        print(outcomeVar, "~", indVar)

        # employment-weighted weather regressors for this specification
        isWeather   = goodsData.columns.str.contains(indVar) & goodsData.columns.str.contains('empWt_')
        weatherCols = goodsData.columns[isWeather]

        X = sm.add_constant(goodsData.loc[:, isWeather | isControl])

        y = goodsData[outcomeVar]

        model = sm.OLS(y, X).fit(cov_type='cluster',cov_kwds={'groups': firms},use_t=True)

        # select the weather estimates by column name instead of assuming they are
        # the first entries after the constant
        pvals = model.pvalues.loc[weatherCols]
        coeff = model.params.loc[weatherCols]
        errs  = model.bse.loc[weatherCols]

        results.loc[i,'industry'] = ind

        results.loc[i,'outcomeVar'] = outcomeVar
        results.loc[i,'weatherVar'] = weatherVar

        # one column per lag; .iloc makes the positional lookup explicit
        for prefix, estimates in (('lag', coeff), ('bse', errs), ('pval', pvals)):
            for lag in range(5):
                results.loc[i, prefix + str(lag)] = estimates.iloc[lag]

        # rewrite the file after every specification so a crash keeps the finished rows
        results.to_csv("../../results_byInds_withControls_empWts.csv")

        print( time.time() - start)


# merge in the industry names (pandas joins on the columns the two tables share);
# the merged table is kept in memory only - the CSV above holds the unmerged rows
conversionTable = pd.read_csv("../../data/indMapping.csv").dropna().reset_index(drop = True)

results = results.merge(conversionTable)

In [None]:
# Rich display of the current `results` table.
results

In [None]:
# Plot the employment-weighted direct effects: one figure per weather/outcome pair,
# one panel per industry, showing the lag 0-4 coefficients with their standard
# errors as error bars. Each figure is saved as a PNG in the working directory.

# Take the loop values from the results table itself, so the cell does not depend
# on `weatherVars` / `outcomeVars` left behind by an earlier cell.
weatherVars = results.weatherVar.unique()
outcomeVars = results.outcomeVar.unique()

# choose the elective parts of this - number of columns and the range of the axes
numCols = 4
yLims   = 0.1

industries = results.industryName.unique()
rowNum = len(industries) // numCols + 1
colNum = numCols

for weather in weatherVars:
    for outcome in outcomeVars:

        fig, ax = plt.subplots(rowNum, colNum, sharex='all', sharey='all',
                              figsize=(20,40),
                              constrained_layout=True)

        fig.suptitle('Direct Effects: ' + outcome + ' ~ ' + weather + ' Employment Weights', fontsize=36)

        for panelNum, ind in enumerate(industries):
            # fill the grid row by row
            panel = ax[panelNum // numCols, panelNum % numCols]

            rev = results[(results.outcomeVar == outcome) & (results.weatherVar == weather) &
                         (results.industryName == ind)].reset_index()

            # leave the panel blank when this combination was never estimated
            if rev.empty:
                continue

            x   = [0,1,2,3,4]
            y   = [rev.lag0,rev.lag1,rev.lag2,rev.lag3,rev.lag4]

            errors = [rev.bse0,rev.bse1,rev.bse2,rev.bse3,rev.bse4]

            panel.errorbar(x,y,yerr = errors, fmt = '.k')
            panel.xaxis.grid(False)
            panel.yaxis.grid(False)
            panel.axhline(y=0)
            panel.set_ylim([-yLims, yLims])

            # Ticks at -yLims, 0 and +yLims. np.arange with a float step can emit an
            # extra tick beyond yLims, and set_ticks then widens the y-limits to show it.
            panel.yaxis.set_ticks(np.linspace(-yLims, yLims, 3))
            panel.xaxis.set_ticks(np.arange(0.0, 5.0, 1.0))

            panel.tick_params(axis='both', labelsize = 16)
            panel.set_title(ind, fontsize = 24)

        fig.savefig('dirEffects_' + outcome + '_' + weather + '_empWts' + '.png')

# Indirect Effects
This is almost exactly the same but with supplier information in place of the direct company information.

In [None]:
# Show the working directory that the relative paths in this notebook resolve against.
os.getcwd()

Can alter this so that we're doing it with the employment weights as well.

In [None]:
# Indirect-effect regressions on the supplier files, one industry at a time:
# outcome ~ weather columns (not employment-weighted) + industry-quarter, firm and
# tercile dummies + supplierTercile, with firm-clustered standard errors.
# Industries with 50 or fewer rows are skipped.
cutoffVarsYr = ['0.95']
weatherVars  = ['precip_', 'temp_']
statVarsYr   = ['zipQuarterquant_']
outcomeVars  = ['lnRevNormd', 'lnCostNormd']


industries = range(1,44)


start = time.time()

results = pd.DataFrame()

i = 0


for ind in industries:
    print('##########################################################')
    print(ind)

    filename = "../../data/companyData/supplier_igData_ind" + str(ind) + ".csv"
    goodsData = pd.read_csv(filename).drop(columns = {'Unnamed: 0'})

    # too few observations to estimate this industry
    if goodsData.shape[0] <= 50:
        continue

    # controls and the cluster variable are shared by every specification of this industry
    isControl = (goodsData.columns.str.contains('indQtr_') |
                 goodsData.columns.str.contains('gvkey_') |
                 goodsData.columns.str.contains('ageTercile_') |
                 goodsData.columns.str.contains('sizeTercile_') |
                 goodsData.columns.str.contains('profitTercile_') |
                 (goodsData.columns == 'supplierTercile'))
    firms = goodsData['gvkey']

    for outcomeVar, weatherVar, statVar, cutoffVar in itertools.product(
            outcomeVars, weatherVars, statVarsYr, cutoffVarsYr):

        i = i + 1

        indVar = weatherVar + statVar + cutoffVar

        print(outcomeVar, "~", indVar)

        # weather regressors, excluding the employment-weighted versions
        isWeather   = goodsData.columns.str.contains(indVar) & ~goodsData.columns.str.contains('empWt_')
        weatherCols = goodsData.columns[isWeather]

        X = sm.add_constant(goodsData.loc[:, isWeather | isControl])

        print(X.columns)

        y = goodsData[outcomeVar]

        model = sm.OLS(y, X).fit(cov_type='cluster',cov_kwds={'groups': firms},use_t=True)

        # select the weather estimates by column name instead of assuming they are
        # the first entries after the constant
        pvals = model.pvalues.loc[weatherCols]
        coeff = model.params.loc[weatherCols]
        errs  = model.bse.loc[weatherCols]

        results.loc[i,'industry'] = ind

        results.loc[i,'outcomeVar'] = outcomeVar
        results.loc[i,'weatherVar'] = weatherVar

        # one column per lag; .iloc makes the positional lookup explicit
        for prefix, estimates in (('lag', coeff), ('bse', errs), ('pval', pvals)):
            for lag in range(5):
                results.loc[i, prefix + str(lag)] = estimates.iloc[lag]

        # rewrite the file after every specification so a crash keeps the finished rows
        results.to_csv("../../indirResults_hqs.csv")

        print( time.time() - start)


# merge in the industry names (pandas joins on the columns the two tables share)
conversionTable = pd.read_csv("../../data/indMapping.csv").dropna().reset_index(drop = True)

results = results.merge(conversionTable)

results.to_csv("../../indirResults_hqs.csv")


In [None]:
# Reload the saved indirect-effect results, dropping the index column written by
# to_csv, then list the industry codes present and preview the first rows.
results = pd.read_csv("../../indirResults_hqs.csv").drop(columns = {'Unnamed: 0'})
print(results.industry.unique())
results.head()


In [None]:
# Ad-hoc check of a single selection. `outcome`, `weather` and `ind` are not set in
# this cell: they are whatever values an earlier cell's loops left in the kernel.
print(outcome, weather, ind)

# rows of `results` for that outcome / weather / industry combination
rev = results[(results.outcomeVar == outcome) & (results.weatherVar == weather) & 
                         (results.industry == ind)].reset_index()

In [None]:
# Plot the indirect effects for a hand-picked set of industries: one figure per
# outcome/weather pair, one panel per industry, showing the lag 0-4 coefficients
# with their standard errors as error bars. Each figure is saved as a PNG.
weatherVars = results.weatherVar.unique()
outcomeVars = results.outcomeVar.unique()

industries = [2,17,18,28,31,40,41,42] # results.industryName.unique()

# choose the elective parts of this - number of columns and the range of the axes
numCols = 3
yLims   = 0.03

rowNum = len(industries) // numCols + 1
colNum = numCols

for outcome in outcomeVars:
    for weather in weatherVars:

        fig, ax = plt.subplots(rowNum, colNum, sharex='all', sharey='all',
                              figsize=(20,20),
                              constrained_layout=True)

        fig.suptitle('Indirect Effects: ' + outcome + ' ~ ' + weather, fontsize=36)

        for panelNum, ind in enumerate(industries):
            # fill the grid row by row
            panel = ax[panelNum // numCols, panelNum % numCols]

            rev = results[(results.outcomeVar == outcome) & (results.weatherVar == weather) &
                         (results.industry == ind)].reset_index()

            # An industry skipped by the regressions has no rows here; leave its
            # panel blank instead of failing on the industry-name lookup.
            if rev.empty:
                continue

            indName = rev.industryName.unique()[0]
            x   = [0,1,2,3,4]
            y   = [rev.lag0,rev.lag1,rev.lag2,rev.lag3,rev.lag4]

            errors = [rev.bse0,rev.bse1,rev.bse2,rev.bse3,rev.bse4]

            panel.errorbar(x,y,yerr = errors, fmt = '.k')
            panel.xaxis.grid(False)
            panel.yaxis.grid(False)
            panel.axhline(y=0)
            panel.set_ylim([-yLims, yLims])

            # Ticks at -yLims, 0 and +yLims. The previous fixed 0.1 step put a tick
            # above yLims, and set_ticks widens the y-limits to show every tick.
            panel.yaxis.set_ticks(np.linspace(-yLims, yLims, 3))
            panel.xaxis.set_ticks(np.arange(0.0, 5.0, 1.0))

            panel.tick_params(axis='both', labelsize = 16)
            panel.set_title(indName, fontsize = 24)

        fig.savefig('indirEffects_' + outcome + '_' + weather + '.png')




Now do this by streaks - consecutive days with at least 95th percentile temp or rain.

In [None]:
# Indirect-effect streak regressions on the supplier files, one industry at a time:
# outcome ~ all streak columns + industry-quarter, firm and tercile dummies +
# supplierTercile, with firm-clustered standard errors. The supplier_ streak
# estimates are stored for five lags. Industries with 50 or fewer rows are skipped.
weatherVars  = ['hotStreak',  'wetStreak']   #[,]
outcomeVars  = ['lnRevNormd', 'lnCostNormd'] # ['revenueChange'] #[, 'costChange']#,'lnCost','lnInc','lnRev']

# To run this on all industries pooled instead, load
# "../../data/companyData/goodsData_supplierData.csv" into goodsData.

# The former `scTercile` assignment was removed: it modified whatever `goodsData` an
# earlier cell had left behind, and the loop below reloads `goodsData` before any use.

start = time.time()
results = pd.DataFrame()
i = 0

industries = range(1,44)

for ind in industries:
    filename = "../../data/companyData/supplier_igData_ind" + str(ind) + ".csv"
    goodsData = pd.read_csv(filename).drop(columns = {'Unnamed: 0'})

    # too few observations to estimate this industry
    if goodsData.shape[0] <= 50:
        continue

    # controls and the cluster variable are shared by every specification of this industry
    isControl = (goodsData.columns.str.contains('indQtr_') |
                 goodsData.columns.str.contains('gvkey_') |
                 goodsData.columns.str.contains('ageTercile_') |
                 goodsData.columns.str.contains('sizeTercile_') |
                 goodsData.columns.str.contains('profitTercile_') |
                 (goodsData.columns == 'supplierTercile'))
    firms = goodsData['gvkey']

    for outcomeVar, weatherVar in itertools.product(outcomeVars, weatherVars):

        i = i + 1

        indVar = weatherVar

        print(outcomeVar, "~", indVar)

        # every streak column enters the regression ...
        isWeather = goodsData.columns.str.contains(indVar)
        # ... but only the supplier_ ones are reported
        supplierCols = goodsData.columns[isWeather & goodsData.columns.str.contains('supplier_')]

        X = sm.add_constant(goodsData.loc[:, isWeather | isControl])

        y = goodsData[outcomeVar]

        modelResults = sm.OLS(y, X).fit(cov_type='cluster',cov_kwds={'groups': firms},use_t=True)

        # Select the supplier estimates by column name. The earlier positional slice
        # was sized by the supplier_ columns but read from the front of a design
        # matrix that holds every streak column.
        pvals = modelResults.pvalues.loc[supplierCols]
        coeff = modelResults.params.loc[supplierCols]
        errs  = modelResults.bse.loc[supplierCols]

        results.loc[i,'industry'] = ind

        results.loc[i,'outcomeVar'] = outcomeVar
        results.loc[i,'weatherVar'] = weatherVar

        # one column per lag; .iloc makes the positional lookup explicit
        for prefix, estimates in (('lag', coeff), ('bse', errs), ('pval', pvals)):
            for lag in range(5):
                results.loc[i, prefix + str(lag)] = estimates.iloc[lag]

        print( time.time() - start)

        # rewrite the file after every specification so a crash keeps the finished rows
        results.to_csv("../../data/indirResults_hqs_streaks.csv")

# merge in the industry names (pandas joins on the columns the two tables share)
conversionTable = pd.read_csv("../../data/indMapping.csv").dropna().reset_index(drop = True)

results = results.merge(conversionTable)


results.to_csv("../../data/indirResults_hqs_streaks.csv")


In [None]:
# Reload the saved indirect streak results. Unlike the other reload cells, no
# 'Unnamed: 0' column is dropped here.
results = pd.read_csv("../../data/indirResults_hqs_streaks.csv")

In [None]:
# Plot the indirect streak effects for a hand-picked set of industries: one figure
# per outcome/weather pair, one panel per industry, showing the lag 0-4 coefficients
# with their standard errors as error bars. Each figure is saved as a PNG.
weatherVars = results.weatherVar.unique()
outcomeVars = results.outcomeVar.unique()

industries = [2,17,18,28,31,40,41,42] # results.industryName.unique()

# choose the elective parts of this - number of columns and the range of the axes
numCols = 3
yLims   = 0.2

rowNum = len(industries) // numCols + 1
colNum = numCols

for outcome in outcomeVars:
    for weather in weatherVars:

        fig, ax = plt.subplots(rowNum, colNum, sharex='all', sharey='all',
                              figsize=(20,20),
                              constrained_layout=True)

        fig.suptitle('Indirect Effects: ' + outcome + ' ~ ' + weather, fontsize=36)

        for panelNum, ind in enumerate(industries):
            # fill the grid row by row
            panel = ax[panelNum // numCols, panelNum % numCols]

            rev = results[(results.outcomeVar == outcome) & (results.weatherVar == weather) &
                         (results.industry == ind)].reset_index()

            # An industry skipped by the regressions has no rows here; leave its
            # panel blank instead of failing on the industry-name lookup.
            if rev.empty:
                continue

            indName = rev.industryName.unique()[0]
            x   = [0,1,2,3,4]
            y   = [rev.lag0,rev.lag1,rev.lag2,rev.lag3,rev.lag4]

            errors = [rev.bse0,rev.bse1,rev.bse2,rev.bse3,rev.bse4]

            panel.errorbar(x,y,yerr = errors, fmt = '.k')
            panel.xaxis.grid(False)
            panel.yaxis.grid(False)
            panel.axhline(y=0)
            panel.set_ylim([-yLims, yLims])

            # Five evenly spaced ticks from -yLims to +yLims. linspace always ends
            # exactly on yLims, which a float-step np.arange does not guarantee.
            panel.yaxis.set_ticks(np.linspace(-yLims, yLims, 5))
            panel.xaxis.set_ticks(np.arange(0.0, 5.0, 1.0))

            panel.tick_params(axis='both', labelsize = 16)
            panel.set_title(indName, fontsize = 24)

        fig.savefig('indirEffects_' + outcome + '_' + weather + '.png')













----------------













### Faster and More Heuristic
The code below computes unclustered standard errors and writes the output to a CSV file.

In [None]:
def findSE(X,reg,y):
    """Return |coefficient / standard error| for the leading regressors.

    The standard errors are the classical unclustered OLS ones, taken from
    the diagonal of ``sigma^2 * (X'X)^-1`` where X has a column of ones
    prepended. This assumes ``reg`` was fit with an intercept.

    Parameters
    ----------
    X : pandas.DataFrame
        Regressors the model was fit on (no intercept column).
    reg : fitted estimator
        Must expose ``predict(X)`` and a 1-D ``coef_`` aligned with the
        columns of ``X``.
    y : pandas.Series
        Observed outcome, same length as ``X``.

    Returns
    -------
    list of float
        Absolute ratios for coefficients 0..5, in column order (fewer
        entries if ``X`` has fewer than six columns).

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X'X`` (including the intercept column) is singular.
    """
    N = len(X)
    p = len(X.columns) + 1  # plus one for the intercept column prepended below

    # Builtin float (float64); the `np.float` alias no longer exists in NumPy >= 1.24.
    X_with_intercept = np.empty(shape=(N, p), dtype=float)
    X_with_intercept[:, 0] = 1
    X_with_intercept[:, 1:p] = X.values

    y_hat = reg.predict(X)
    residuals = y.values - y_hat
    residual_sum_of_squares = residuals.T @ residuals
    sigma_squared_hat = residual_sum_of_squares / (N - p)
    var_beta_hat = np.linalg.inv(X_with_intercept.T @ X_with_intercept) * sigma_squared_hat

    # Diagonal entry 0 belongs to the intercept, so coefficient k sits at k + 1.
    std_errors = np.diag(var_beta_hat)[1:7] ** 0.5
    return [abs(coef / se) for coef, se in zip(reg.coef_[:6], std_errors)]

# Alternative specification using the 'zip*' statistics, kept for reference (inactive):
#   cutoffVarsYr = ['0.95']
#   weatherVars  = ['precip_', 'temp_', 'precip5Days_', 'temp5Days_']
#   statVarsYr   = ['zipquant_', 'zipQuarterquant_']
#   outcomeVars  = ['lnRev', 'revenueChange']
#   firmVars     = ['firmQtr_']

# Active specification: run the regressions industry by industry.
# Regressor columns are picked by the substring weatherVar + statVar + cutoffVar.
cutoffVarsYr = ['0.95']  # other labels noted in earlier drafts: '1xYr', '1x5Yrs', '1xQtr', '1x5Qtrs'
weatherVars = [
    'precip_',
    'temp_',
    'precip5Days_',
    'temp5Days_',
]
statVarsYr = ['ffquant_', 'indQuarterquant_']
outcomeVars = [
    'lnRev',
    'revenueChange',
    'lnCost',
    'costChange',
]  # 'lnRevNormd' / 'lnCostNormd' are not run here
firmVars = ['firmQtr_']

# Industry codes to loop over; matched against goodsData.famafrench.
inds = [1, 2, 6, 7, 18, 31, 41, 42]

# Keep only the rows in which all four of these columns are observed.
goodsData = goodsData.dropna(subset=['lnRev', 'lnCost', 'lnCostNormd', 'lnRevNormd'])

# Fit one OLS model per (industry, outcome, weather, stat, cutoff, firm) combination,
# record the first five coefficients and their |coef / SE| ratios, and write
# everything to a CSV file.
start = time.time()

# One dict per fitted model; the DataFrame is assembled once after the loop,
# which avoids enlarging a frame cell by cell.
resultRows = []
i = 0
for ind in inds:
    print('#######################################################################################',ind)

    # The industry subset depends only on `ind`, so build it once per industry.
    tempData = goodsData[goodsData.famafrench == ind]
    firms = tempData['gvkey']  # not used in this loop; kept in case later cells read it
    columnNames = tempData.columns.str

    # Same iteration order as nesting the five loops in this order.
    for outcomeVar, weatherVar, statVar, cutoffVar, firmVar in itertools.product(
            outcomeVars, weatherVars, statVarsYr, cutoffVarsYr, firmVars):
        i = i + 1
        indVar = weatherVar + statVar + cutoffVar

        print(outcomeVar, "~", indVar, "|", firmVar)

        # Select the weather regressors plus the 'indQtr_' and firm columns.
        # regex=False: indVar contains a '.', which must match literally
        # rather than act as a regex wildcard.
        keepColumn = (columnNames.contains(indVar, regex=False) |
                      columnNames.contains('indQtr_', regex=False) |
                      columnNames.contains(firmVar, regex=False))
        X = tempData.loc[:, keepColumn]

        # Drop columns whose sum is below 4
        # (presumably near-empty indicator columns; verify).
        X = X[X.columns[(X.sum(axis = 0) >= 4)]]

        y = tempData[outcomeVar]

        ######################################
        # fit the model on this subset
        reg = linear_model.LinearRegression()
        reg.fit(X,y)

        seratios = findSE(X,reg,y)

        # NOTE(review): lag0..lag4 assumes the five weather-lag columns are the
        # first five columns of X after the filter above; confirm.
        row = {
            'ind': ind,
            'outcomeVar': outcomeVar,
            'weatherVar': weatherVar,
            'statVar': statVar,
            'cutoffVar': cutoffVar,
            'firmVar': firmVar,
        }
        for k in range(5):
            row['lag' + str(k)] = reg.coef_[k]
        for k in range(5):
            row['ratio' + str(k)] = seratios[k]
        resultRows.append(row)

        print(time.time() - start)

        print('*******************************************************************')

# Row labels start at 1, matching the running model counter `i`.
results = pd.DataFrame(resultRows, index=pd.RangeIndex(1, len(resultRows) + 1))
results.to_csv("../../data/results_notNormd.csv")


# Attach the industry names: load the mapping, discard incomplete rows and
# renumber the remaining ones from zero.
conversionTable = (
    pd.read_csv("../../data/indMapping.csv")
      .dropna()
      .reset_index(drop=True)
)

# Quick peek at the mapping (only rendered when it is the last expression of a cell).
conversionTable.head()

# No `on=` is given, so pandas joins on the columns the two frames share.
results = results.merge(conversionTable)