In [1]:
import pickle
import pandas as pd
import matplotlib
import os
import re

import scipy

import collections
import datetime
import time

import geopandas as gpd

import numpy as np
 
from difflib import get_close_matches

from fuzzywuzzy import process
from fuzzywuzzy import fuzz
from sklearn import linear_model

import statsmodels.api as sm
import statsmodels.formula.api as smf

from linearmodels import PanelOLS, FamaMacBeth
from scipy import stats

import itertools

import matplotlib.pyplot as plt

from numpy.linalg import matrix_rank


## Grab Data

In [2]:
os.getcwd()

'/Users/brianreed/Documents/supplyChain/extremes/extremesAnalysisCode'

In [3]:
goodsData = pd.read_csv("../../data/companyData/goodsData_igData.csv").drop(columns = {'Unnamed: 0'})

goodsData.columns = goodsData.columns.str.replace("0.95", "Extreme")

print(goodsData.shape, goodsData.columns)

firms = goodsData['gvkey']


(63087, 892) Index(['X', 'gvkey', 'datadate', 'year', 'qtr', 'companyName', 'curcdq',
       'assets', 'cash', 'costGoodsSold',
       ...
       'lnNetIncNormd', 'lnOpIncNormd', 'yearQtr', 'firmQtr', 'ageQtr',
       'sizeQtr', 'profitQtr', 'indQtr', 'extremeHeat', 'extremePrecip'],
      dtype='object', length=892)


  This is separate from the ipykernel package so we can avoid doing imports until


In [4]:
goodsData.tempTercile.unique()

array([1, 2])

The following are the initial results. This is when we have 3 lags and tercile is defined It roughly seems like:
- higher temperature is beneficial in places at lower levels of temperature
- higher precipitation is still harmful in places at lower levels of precipitation
- high levels of both might be harmful

In [9]:
start = time.time()

outcome  = 'lnOpIncNormd'
weather  = 'C(tempTercile)*(temp_zipQuarterquant_Extreme + lag1_temp_zipQuarterquant_Extreme + lag2_temp_zipQuarterquant_Extreme + lag3_temp_zipQuarterquant_Extreme)'
controls = 'C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)'

equation = outcome + ' ~ ' + weather + ' + ' + controls
 
print(equation)

tempMod = smf.ols(formula = equation, data = goodsData).fit()

print(time.time() - start) 


print(tempMod.summary())


lnOpIncNormd ~ C(tempTercile)*(temp_zipQuarterquant_Extreme + lag1_temp_zipQuarterquant_Extreme + lag2_temp_zipQuarterquant_Extreme + lag3_temp_zipQuarterquant_Extreme) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)
166.84982085227966
                            OLS Regression Results                            
Dep. Variable:           lnOpIncNormd   R-squared:                       0.623
Model:                            OLS   Adj. R-squared:                  0.606
Method:                 Least Squares   F-statistic:                     36.31
Date:                Thu, 01 Sep 2022   Prob (F-statistic):               0.00
Time:                        13:32:00   Log-Likelihood:            -1.5470e+05
No. Observations:               62829   AIC:                         3.149e+05
Df Residuals:                   60089   BIC:                         3.397e+05
Df Model:                        2739                                         
Cova

# Indirect Effects
Let's look at the effects on a customer of extremes at its suppliers.

In [5]:
goodsData = pd.read_csv("../../data/companyData/goodsData_supplierData.csv").drop(columns = {'Unnamed: 0'})

goodsData.columns = goodsData.columns.str.replace("0.95", "Extreme")

print(goodsData.shape, goodsData.columns)

firms = goodsData['gvkey']

(49859, 738) Index(['gvkey', 'datadate', 'year', 'qtr', 'companyName', 'curcdq', 'assets',
       'cash', 'costGoodsSold', 'totalInv',
       ...
       'yearQtr', 'firmQtr', 'ageQtr', 'sizeQtr', 'profitQtr', 'indQtr',
       'supplier_extremeHeat', 'supplier_extremePrecip', 'supplierTempTercile',
       'supplierPrecipTercile'],
      dtype='object', length=738)


  This is separate from the ipykernel package so we can avoid doing imports until


In [26]:
start = time.time()

outcome  = 'lnOpIncNormd'
weather  = 'supplier_extremePrecip'
controls = 'C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)'

equation = outcome + ' ~ ' + weather + ' + ' + controls
 
print(equation)

precipMod = smf.ols(formula = equation, data = goodsData).fit()

print(time.time() - start) 


print(precipMod.summary())

lnOpIncNormd ~ supplier_extremePrecip + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)
16.4791898727417
                            OLS Regression Results                            
Dep. Variable:           lnOpIncNormd   R-squared:                       0.405
Model:                            OLS   Adj. R-squared:                  0.395
Method:                 Least Squares   F-statistic:                     38.50
Date:                Fri, 02 Sep 2022   Prob (F-statistic):               0.00
Time:                        15:07:21   Log-Likelihood:            -1.0365e+05
No. Observations:               49717   AIC:                         2.090e+05
Df Residuals:                   48852   BIC:                         2.167e+05
Df Model:                         864                                         
Covariance Type:            nonrobust                                         
                                                  coef    

We see the same thing here with temperature: positive correlation overall, with a negative effect on the warmer terciles and a positive effect on the coolest one.

In [20]:
start = time.time()

outcome  = 'lnOpIncNormd'
weather  = 'supplier_extremeHeat'
controls = 'C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)'

equation = outcome + ' ~ ' + weather + ' + ' + controls
 
print(equation)

tempMod = smf.ols(formula = equation, data = goodsData).fit()

print(time.time() - start) 


print(tempMod.summary())

lnOpIncNormd ~ supplier_extremeHeat + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)
18.850395917892456
                            OLS Regression Results                            
Dep. Variable:           lnOpIncNormd   R-squared:                       0.405
Model:                            OLS   Adj. R-squared:                  0.395
Method:                 Least Squares   F-statistic:                     38.56
Date:                Fri, 02 Sep 2022   Prob (F-statistic):               0.00
Time:                        14:25:10   Log-Likelihood:            -1.0363e+05
No. Observations:               49717   AIC:                         2.090e+05
Df Residuals:                   48852   BIC:                         2.166e+05
Df Model:                         864                                         
Covariance Type:            nonrobust                                         
                                                  coef    

In [19]:
start = time.time()

outcome  = 'lnOpIncNormd'
weather  = 'C(supplierTempTercile)*supplier_extremeHeat'
controls = 'C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)'

equation = outcome + ' ~ ' + weather + ' + ' + controls
 
print(equation)

tempMod = smf.ols(formula = equation, data = goodsData).fit()

print(time.time() - start) 


print(tempMod.summary())

lnOpIncNormd ~ C(supplierTempTercile)*supplier_extremeHeat + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)
16.790416955947876
                            OLS Regression Results                            
Dep. Variable:           lnOpIncNormd   R-squared:                       0.406
Model:                            OLS   Adj. R-squared:                  0.395
Method:                 Least Squares   F-statistic:                     38.42
Date:                Fri, 02 Sep 2022   Prob (F-statistic):               0.00
Time:                        14:15:15   Log-Likelihood:            -1.0362e+05
No. Observations:               49717   AIC:                         2.090e+05
Df Residuals:                   48848   BIC:                         2.166e+05
Df Model:                         868                                         
Covariance Type:            nonrobust                                         
                                   

Let's do the same as we did below, making a dataframe that summarizes a lot of the results. For now just do: 
- extreme heat
- extreme precip
- both of above, with breakdown by tercile

In [None]:
resultList = []
start = time.time()


outcomes = ['lnOpIncNormd', 'lnRevNormd', 'lnCostNormd', 'lnStockClose']

weatherVars = ['supplier_extremePrecip',
               'supplier_extremeHeat',
               'supplier_extremePrecip*C(supplierPrecipTercile)',
               'supplier_extremeHeat*C(supplierTempTercile)',
               'supplier_lag1_propAboveHundredThou + supplier_lag1_propAboveHundredThou',
               'supplier_lag1_propAboveMilli + supplier_lag1_propAboveMilli'
              ]

'''precip_zipQuarterquant_Extreme + lag1_precip_zipQuarterquant_Extreme + lag2_precip_zipQuarterquant_Extreme + lag3_precip_zipQuarterquant_Extreme',
'temp5Days_zipQuarterquant_Extreme + lag1_temp5Days_zipQuarterquant_Extreme + lag2_temp5Days_zipQuarterquant_Extreme + lag3_temp5Days_zipQuarterquant_Extreme',
'days90Plus + lag1_days90Plus + lag2_days90Plus + lag3_days90Plus',
'streak90Plus + lag1_streak90Plus + lag2_streak90Plus + lag3_streak90Plus', 
'propAboveTenThou + lag1_propAboveTenThou + lag2_propAboveTenThou + lag3_propAboveTenThou',
'propAboveHundredThou + lag1_propAboveHundredThou + lag2_propAboveHundredThou + lag3_propAboveHundredThou',
'propAboveMilli + lag1_propAboveMilli + lag2_propAboveMilli + lag3_propAboveMilli'''

# 'temp_zipWeek95_99 + lag1_temp_zipWeek95_99 + lag2_temp_zipWeek95_99 ', 
# 'temp_zipMonth95_99 + lag1_temp_zipMonth95_99 + lag2_temp_zipMonth95_99 ',
# 'temp_zipQuarter95_99 + lag1_temp_zipQuarter95_99 + lag2_temp_zipQuarter95_99 ']

controls = 'C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)'

for weatherVar in weatherVars:
    print("***************")
    print(weatherVar)
    results = pd.DataFrame()

    for outcome in outcomes:            
        equation = outcome + ' ~ ' + weatherVar + ' + ' + controls
        print(equation)
        
        try:
            mod = smf.ols(formula = equation, data = goodsData).fit()             

            print(time.time() - start) 

            # convert this into a much more condensed version
            coeffs = pd.DataFrame(mod.params,   columns = ['coeffs'])
            pvalues = pd.DataFrame(mod.pvalues, columns = ['pvals'])

            coeffs = coeffs[coeffs.index.str.contains(weatherVar[0:4])]
            pvalues = pvalues[pvalues.index.str.contains(weatherVar[0:4])]

            resultsTemp = pd.concat([coeffs,pvalues],axis = 1)

            resultsTemp.loc['upperVariable'] = ['^' + outcome, '*********'] 

            results = pd.concat([results,resultsTemp], axis = 1)

            print(resultsTemp)
        except:
            print('No Dice! ' + outcome + "~" + weatherVar)
            pass

    resultList.append(results)


allResults = pd.concat(resultList, axis=0)


***************
supplier_extremePrecip
lnOpIncNormd ~ supplier_extremePrecip + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)
16.73697590827942
                               coeffs      pvals
supplier_extremePrecip      -0.013267        0.0
upperVariable           ^lnOpIncNormd  *********
lnRevNormd ~ supplier_extremePrecip + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)
34.05679702758789
                             coeffs      pvals
supplier_extremePrecip    -0.001704        0.0
upperVariable           ^lnRevNormd  *********
lnCostNormd ~ supplier_extremePrecip + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)
No Dice! lnCostNormd~supplier_extremePrecip
lnStockClose ~ supplier_extremePrecip + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)
No Dice! lnStockClose~supplier_extremePrecip
***********

# Direct Effects
Look at the effects on the suppliers when they're affected directly.

## Complete Dataset
### At HQs

The below gives us the full, clustered standard errors.

In [54]:
resultList[0]

Unnamed: 0,coeffs,pvals,conc
precip_zipQuarterquant_Extreme,-0.013955,0.001611,_more25%
lag1_precip_zipQuarterquant_Extreme,-0.009473,0.032546,_more25%
lag2_precip_zipQuarterquant_Extreme,-0.003884,0.379477,_more25%
lag3_precip_zipQuarterquant_Extreme,-0.005953,0.178952,_more25%
upperVariable,^lnOpIncNormd,*********,_more25%


First, do the basics: days of extreme precipitation and (separately) extreme temperature, with 3 lags. We include a balance of time and industry-specific controls, fewer than are in the other regressions but generally allowing for a time trend, firm-specific trends, industry-seasonal trends, and profit, size, and age characteristics. We don't have time-specific trends across firms or industries but it's not clear that these would really change over the 10 years of the sample.



There are a couple of background facts that I'm relying on here: 
- the 1x year, 1x5 years, etc variables might be too rare to really pick up an effect.
- it's possible that lower tiers, or less extreme extremes, might matter too. may want to try to pick up a lower threshold as well. 
- the normalized variables (divided by lagged assets) seem to be more sensitive / response than just growth and just log-levels. this is likely because of something like the fact that this helps equalize for differences in the size of the firms in a way that neither log nor growth does. 



there are a couple of things to remember with these results:
- the company size/age/profitability terciles don't make a lick of difference
- precipitation seems to matter, period, for cumulative number of days
- temperature might need a longer streak for the effect to happen



a few things come out more in the heterogeneity analyses:
- it seems like the local-relative extremes matter especially at the upper ends of the distributions. this is a little counterintuitive but i think the story is something like the following: we expect that places with higher average temperatures would have higher ''95th percentile events'', and places with lower average temperatures might have lower ''95th percentile events'', that might actually not be that extreme. 
- we would expect the heatBin:extremeTemp(Precip) measure to show an opposite result if the extreme definition is an absolute one and not a relative one (larger effect in places with lower normal temps (precip) // lower effect in places with higher normal temps (precip)) because it's closer to their baseline & closer to what they might expect.
- there's not much with the industry-specific results? it could be that the data are currently too diffuse or too small to really 



questions:
- are there other moments of distributions or other ways to measure shifts in extremes?
- how should i best approach the industry-specific regressions? - separate regressions or interaction terms?
- what mechanisms should i consider? bs consider the role of "input specificity", as judged by patents or r&d. ps consider a few different ones: materiality, defined by value of physical assets/value of total assets; industry specificity; and expectation. 
    - are there any "climate mechanisms" i can examine here, other than just expectations?
    - how can we adapt or incorporate the scc here?



things to push forward on:
- targeting specific industries: either with different lag tiers, or with 
- indirect regressions!
- stock regressions
- extreme convective storms
- counts in disclosures



things that are probably very relevant that i should keep experimenting with:
- measures of concentration: establishment weights, percent of firm w/in 10% (or honestly 70%+) of hq
- extreme temp as 90+, maybe some flood-relative measure of extreme rain?


First, total days of heat and rain.



*AT SOME POINT, WE CAN ADD ADDTL COLUMNS FOR OTHER VARIABLES OF INTEREST TO THIS AS WELL: cost & profit, maybe also stocks [if we do a quarter before, quarter after] thing

In [4]:
resultList = []

keys = ['_more25%', '_more50%', '_more75%', '']

start = time.time()


for key in keys:
    '##################################################################'
    filename = '../../data/companyData/goodsData_igData' + key + '.csv'
    
    print(filename)

    goodsData = pd.read_csv(filename).drop(columns = {'Unnamed: 0'})

    goodsData.columns = goodsData.columns.str.replace("0.95", "Extreme")

    print(goodsData.shape)

    firms = goodsData['gvkey']



    outcomes = ['lnOpIncNormd', 'lnRevNormd', 'lnCostNormd', 'lnStockClose']

    weatherVars = ['precip_zipQuarterquant_Extreme + lag1_precip_zipQuarterquant_Extreme + lag2_precip_zipQuarterquant_Extreme + lag3_precip_zipQuarterquant_Extreme',
                  'temp5Days_zipQuarterquant_Extreme + lag1_temp5Days_zipQuarterquant_Extreme + lag2_temp5Days_zipQuarterquant_Extreme + lag3_temp5Days_zipQuarterquant_Extreme',
                  'days90Plus + lag1_days90Plus + lag2_days90Plus + lag3_days90Plus',
                  'streak90Plus + lag1_streak90Plus + lag2_streak90Plus + lag3_streak90Plus', 
                  'propAboveTenThou + lag1_propAboveTenThou + lag2_propAboveTenThou + lag3_propAboveTenThou',
                  'propAboveHundredThou + lag1_propAboveHundredThou + lag2_propAboveHundredThou + lag3_propAboveHundredThou',
                  'propAboveMilli + lag1_propAboveMilli + lag2_propAboveMilli + lag3_propAboveMilli']

                  # 'temp_zipWeek95_99 + lag1_temp_zipWeek95_99 + lag2_temp_zipWeek95_99 ', 
                  # 'temp_zipMonth95_99 + lag1_temp_zipMonth95_99 + lag2_temp_zipMonth95_99 ',
                  # 'temp_zipQuarter95_99 + lag1_temp_zipQuarter95_99 + lag2_temp_zipQuarter95_99 ']

    controls = ' + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)'

    for weatherVar in weatherVars:
        print("***************")
        results = pd.DataFrame()

        for outcome in outcomes:    
            print(outcome + "~" + weatherVar)
            try:
                mod = smf.ols(formula = outcome + ' ~ ' + weatherVar + controls, data = goodsData).fit()             

                print(time.time() - start) 

                # convert this into a much more condensed version
                coeffs = pd.DataFrame(mod.params,   columns = ['coeffs'])
                pvalues = pd.DataFrame(mod.pvalues, columns = ['pvals'])

                coeffs = coeffs[coeffs.index.str.contains(weatherVar[0:4])]
                pvalues = pvalues[pvalues.index.str.contains(weatherVar[0:4])]

                resultsTemp = pd.concat([coeffs,pvalues],axis = 1)

                resultsTemp.loc['upperVariable'] = ['^' + outcome, '*********'] 

                results = pd.concat([results,resultsTemp], axis = 1)
                results['conc'] = key

                print(resultsTemp)
            except:
                print('No Dice! ' + outcome + "~" + weatherVar)
                pass

        resultList.append(results)
        
        
allResults = pd.concat(resultList, axis=0)

allResults.to_csv('../../data/companyData/results.csv')


../../data/companyData/goodsData_igData_more25%.csv


  exec(code_obj, self.user_global_ns, self.user_ns)
  app.launch_new_instance()


(37942, 892)
***************
lnOpIncNormd~precip_zipQuarterquant_Extreme + lag1_precip_zipQuarterquant_Extreme + lag2_precip_zipQuarterquant_Extreme + lag3_precip_zipQuarterquant_Extreme
63.42503595352173
                                            coeffs      pvals
precip_zipQuarterquant_Extreme           -0.013955   0.001611
lag1_precip_zipQuarterquant_Extreme      -0.009473   0.032546
lag2_precip_zipQuarterquant_Extreme      -0.003884   0.379477
lag3_precip_zipQuarterquant_Extreme      -0.005953   0.178952
upperVariable                        ^lnOpIncNormd  *********
lnRevNormd~precip_zipQuarterquant_Extreme + lag1_precip_zipQuarterquant_Extreme + lag2_precip_zipQuarterquant_Extreme + lag3_precip_zipQuarterquant_Extreme
111.02078413963318
                                          coeffs      pvals
precip_zipQuarterquant_Extreme          -0.00132    0.17013
lag1_precip_zipQuarterquant_Extreme    -0.000822   0.394243
lag2_precip_zipQuarterquant_Extreme    -0.001947   0.043086
lag3_pre

1022.3124768733978
                        coeffs      pvals
streak90Plus         -0.054487   0.047039
lag1_streak90Plus     0.002503   0.927138
lag2_streak90Plus    -0.021388   0.436719
lag3_streak90Plus    -0.019173   0.487189
upperVariable      ^lnRevNormd  *********
lnCostNormd~streak90Plus + lag1_streak90Plus + lag2_streak90Plus + lag3_streak90Plus
1071.3880381584167
                         coeffs      pvals
streak90Plus          -0.017113   0.656941
lag1_streak90Plus       0.00354   0.926203
lag2_streak90Plus      0.008828   0.818466
lag3_streak90Plus      0.004479   0.907322
upperVariable      ^lnCostNormd  *********
lnStockClose~streak90Plus + lag1_streak90Plus + lag2_streak90Plus + lag3_streak90Plus
No Dice! lnStockClose~streak90Plus + lag1_streak90Plus + lag2_streak90Plus + lag3_streak90Plus
***************
lnOpIncNormd~propAboveTenThou + lag1_propAboveTenThou + lag2_propAboveTenThou + lag3_propAboveTenThou
1204.6434009075165
                              coeffs      pvals
p

  app.launch_new_instance()


(28532, 892)
***************
lnOpIncNormd~precip_zipQuarterquant_Extreme + lag1_precip_zipQuarterquant_Extreme + lag2_precip_zipQuarterquant_Extreme + lag3_precip_zipQuarterquant_Extreme
2091.1863720417023
                                            coeffs      pvals
precip_zipQuarterquant_Extreme           -0.011429   0.029054
lag1_precip_zipQuarterquant_Extreme      -0.008627   0.100308
lag2_precip_zipQuarterquant_Extreme        -0.0014   0.789117
lag3_precip_zipQuarterquant_Extreme      -0.007068   0.178095
upperVariable                        ^lnOpIncNormd  *********
lnRevNormd~precip_zipQuarterquant_Extreme + lag1_precip_zipQuarterquant_Extreme + lag2_precip_zipQuarterquant_Extreme + lag3_precip_zipQuarterquant_Extreme
2127.428983926773
                                          coeffs      pvals
precip_zipQuarterquant_Extreme         -0.001868    0.12388
lag1_precip_zipQuarterquant_Extreme     -0.00073   0.549666
lag2_precip_zipQuarterquant_Extreme    -0.002726    0.02513
lag3_pre

3137.1364970207214
                        coeffs      pvals
streak90Plus         -0.054797   0.115777
lag1_streak90Plus     0.005034    0.88536
lag2_streak90Plus    -0.029894   0.393288
lag3_streak90Plus    -0.034939   0.323256
upperVariable      ^lnRevNormd  *********
lnCostNormd~streak90Plus + lag1_streak90Plus + lag2_streak90Plus + lag3_streak90Plus
3171.0796768665314
                         coeffs      pvals
streak90Plus          -0.028438   0.549049
lag1_streak90Plus      0.018736   0.691876
lag2_streak90Plus      0.019684    0.67847
lag3_streak90Plus      0.037921   0.428049
upperVariable      ^lnCostNormd  *********
lnStockClose~streak90Plus + lag1_streak90Plus + lag2_streak90Plus + lag3_streak90Plus
3207.6925690174103
                          coeffs      pvals
streak90Plus           -0.005878   0.782506
lag1_streak90Plus       0.011553    0.58531
lag2_streak90Plus       0.003411   0.872343
lag3_streak90Plus       0.005335   0.803418
upperVariable      ^lnStockClose  ********

  app.launch_new_instance()


(20418, 892)
***************
lnOpIncNormd~precip_zipQuarterquant_Extreme + lag1_precip_zipQuarterquant_Extreme + lag2_precip_zipQuarterquant_Extreme + lag3_precip_zipQuarterquant_Extreme
3745.664139032364
                                            coeffs      pvals
precip_zipQuarterquant_Extreme           -0.008154   0.187437
lag1_precip_zipQuarterquant_Extreme       -0.00602   0.331032
lag2_precip_zipQuarterquant_Extreme       0.000511   0.933931
lag3_precip_zipQuarterquant_Extreme      -0.009062   0.142759
upperVariable                        ^lnOpIncNormd  *********
lnRevNormd~precip_zipQuarterquant_Extreme + lag1_precip_zipQuarterquant_Extreme + lag2_precip_zipQuarterquant_Extreme + lag3_precip_zipQuarterquant_Extreme
3766.857211112976
                                          coeffs      pvals
precip_zipQuarterquant_Extreme         -0.001907   0.233512
lag1_precip_zipQuarterquant_Extreme    -0.000953   0.552795
lag2_precip_zipQuarterquant_Extreme    -0.003118   0.051473
lag3_prec

4132.650290250778
                        coeffs      pvals
streak90Plus         -0.059724   0.207916
lag1_streak90Plus     0.016838   0.722736
lag2_streak90Plus    -0.057936   0.218793
lag3_streak90Plus    -0.048249   0.307739
upperVariable      ^lnRevNormd  *********
lnCostNormd~streak90Plus + lag1_streak90Plus + lag2_streak90Plus + lag3_streak90Plus
4160.570013999939
                         coeffs      pvals
streak90Plus          -0.035746   0.570303
lag1_streak90Plus      0.070684   0.258771
lag2_streak90Plus      0.054054   0.386608
lag3_streak90Plus      0.092539   0.139615
upperVariable      ^lnCostNormd  *********
lnStockClose~streak90Plus + lag1_streak90Plus + lag2_streak90Plus + lag3_streak90Plus
4186.679594993591
                          coeffs      pvals
streak90Plus            0.010993    0.67346
lag1_streak90Plus       0.018267   0.480058
lag2_streak90Plus       0.005625   0.826745
lag3_streak90Plus       0.029114   0.259712
upperVariable      ^lnStockClose  *********
*

  app.launch_new_instance()


(63087, 892)
***************
lnOpIncNormd~precip_zipQuarterquant_Extreme + lag1_precip_zipQuarterquant_Extreme + lag2_precip_zipQuarterquant_Extreme + lag3_precip_zipQuarterquant_Extreme
4648.251849889755
                                            coeffs      pvals
precip_zipQuarterquant_Extreme            -0.01209   0.000209
lag1_precip_zipQuarterquant_Extreme       -0.01146   0.000446
lag2_precip_zipQuarterquant_Extreme       -0.00658   0.043235
lag3_precip_zipQuarterquant_Extreme      -0.007516   0.021134
upperVariable                        ^lnOpIncNormd  *********
lnRevNormd~precip_zipQuarterquant_Extreme + lag1_precip_zipQuarterquant_Extreme + lag2_precip_zipQuarterquant_Extreme + lag3_precip_zipQuarterquant_Extreme
4798.914854049683
                                          coeffs      pvals
precip_zipQuarterquant_Extreme         -0.001467    0.01902
lag1_precip_zipQuarterquant_Extreme    -0.001669   0.007736
lag2_precip_zipQuarterquant_Extreme    -0.001487   0.017348
lag3_prec

No Dice! lnCostNormd~streak90Plus + lag1_streak90Plus + lag2_streak90Plus + lag3_streak90Plus
lnStockClose~streak90Plus + lag1_streak90Plus + lag2_streak90Plus + lag3_streak90Plus
54634.25126194954
                          coeffs      pvals
streak90Plus           -0.011658   0.400096
lag1_streak90Plus      -0.009562   0.487581
lag2_streak90Plus      -0.011704   0.395988
lag3_streak90Plus      -0.014419   0.296781
upperVariable      ^lnStockClose  *********
***************
lnOpIncNormd~propAboveTenThou + lag1_propAboveTenThou + lag2_propAboveTenThou + lag3_propAboveTenThou
54794.49238014221
                              coeffs      pvals
propAboveTenThou             0.07947   0.028235
lag1_propAboveTenThou       0.030511    0.40158
lag2_propAboveTenThou       0.027229   0.451427
lag3_propAboveTenThou         0.1107   0.002248
upperVariable          ^lnOpIncNormd  *********
lnRevNormd~propAboveTenThou + lag1_propAboveTenThou + lag2_propAboveTenThou + lag3_propAboveTenThou
54958.09838199

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [12]:
allResults = pd.DataFrame()

for result in resultList:
    if result.shape[1] == 9:
        allResults = pd.concat([allResults,result])
    print(allResults)

                                            coeffs      pvals      conc  \
precip_zipQuarterquant_Extreme           -0.013955   0.001611  _more25%   
lag1_precip_zipQuarterquant_Extreme      -0.009473   0.032546  _more25%   
lag2_precip_zipQuarterquant_Extreme      -0.003884   0.379477  _more25%   
lag3_precip_zipQuarterquant_Extreme      -0.005953   0.178952  _more25%   
upperVariable                        ^lnOpIncNormd  *********  _more25%   

                                          coeffs      pvals        coeffs  \
precip_zipQuarterquant_Extreme          -0.00132    0.17013      -0.00058   
lag1_precip_zipQuarterquant_Extreme    -0.000822   0.394243     -0.000487   
lag2_precip_zipQuarterquant_Extreme    -0.001947   0.043086     -0.002664   
lag3_precip_zipQuarterquant_Extreme    -0.003343   0.000533     -0.003296   
upperVariable                        ^lnRevNormd  *********  ^lnCostNormd   

                                         pvals         coeffs      pvals  
precip_zipQ

In [14]:
allResults.to_csv('../../data/companyData/allResults_byConc.csv')

Try to get the variance-covariance matrix, from https://www.statsmodels.org/dev/generated/statsmodels.regression.linear_model.RegressionResults.cov_params.html . We can use this in the calculation of MEs.

In [None]:
precipMod.cov_params()

In [None]:
start = time.time()

# + C(ageTercile) + C(profitTercile) + C(sizeTercile)
tempMod = smf.ols(formula = 'lnRevNormd ~ temp_zipQuarterquant_Extreme + lag1_temp_zipQuarterquant_Extreme + lag2_temp_zipQuarterquant_Extreme + lag3_temp_zipQuarterquant_Extreme + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) ', data = goodsData).fit()

print(time.time() - start) 

# convert this into a much more condensed version
coeffs = pd.DataFrame(tempMod.params,   columns = ['coeffs'])
pvalues = pd.DataFrame(tempMod.pvalues, columns = ['pvals'])

coeffs = coeffs[coeffs.index.str.contains('temp')]
pvalues = pvalues[pvalues.index.str.contains('temp')]

results = pd.concat([coeffs,pvalues],axis = 1)


print(results)


These results are influenced by the particular transformation. if we do 1 + the ratio, we have a particular problem with the second period here.

Look at "sustained" heat and rain. We can look at incidence of a heatwave or sustained temperatures above a given amount.

In [None]:
start = time.time()


precip5DaysMod = smf.ols(formula = 'lnRevNormd ~ precip5Days_zipQuarterquant_Extreme + lag1_precip5Days_zipQuarterquant_Extreme + lag2_precip5Days_zipQuarterquant_Extreme + lag3_precip5Days_zipQuarterquant_Extreme + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', data = goodsData).fit()

print(time.time() - start) 

# convert this into a much more condensed version
coeffs = pd.DataFrame(precip5DaysMod.params,   columns = ['coeffs'])
pvalues = pd.DataFrame(precip5DaysMod.pvalues, columns = ['pvals'])

coeffs = coeffs[coeffs.index.str.contains('precip')]
pvalues = pvalues[pvalues.index.str.contains('precip')]

results = pd.concat([coeffs,pvalues],axis = 1)


print(results)


In [None]:
start = time.time()


temp5DaysMod = smf.ols(formula = 'lnRevNormd ~ temp5Days_zipQuarterquant_Extreme + lag1_temp5Days_zipQuarterquant_Extreme + lag2_temp5Days_zipQuarterquant_Extreme + lag3_temp5Days_zipQuarterquant_Extreme + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', data = goodsData).fit()

print(time.time() - start) 

# convert this into a much more condensed version
coeffs  = pd.DataFrame(temp5DaysMod.params,  columns = ['coeffs'])
pvalues = pd.DataFrame(temp5DaysMod.pvalues, columns = ['pvals'])

coeffs  = coeffs[coeffs.index.str.contains('temp')]
pvalues = pvalues[pvalues.index.str.contains('temp')]

results = pd.concat([coeffs,pvalues],axis = 1)


print(results)


## Breakouts by tercile

See how the effect varies in places that are background hot // background wet.

Sort of inspired by the BS2016 tercile approach, we divide each place into terciles. I THINK (double check this) that this is based on annual average temperature and precipitation. 



In [None]:
start = time.time()


precipModTercile = smf.ols(formula = 'lnRevNormd ~ C(precipTercile)*(precip_zipQuarterquant_Extreme + lag1_precip_zipQuarterquant_Extreme + lag2_precip_zipQuarterquant_Extreme + lag3_precip_zipQuarterquant_Extreme) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', data = goodsData).fit()

print(time.time() - start) 

# convert this into a much more condensed version
coeffs  = pd.DataFrame(precipModTercile.params,  columns = ['coeffs'])
pvalues = pd.DataFrame(precipModTercile.pvalues, columns = ['pvals'])

coeffs  = coeffs[coeffs.index.str.contains('precip')]
pvalues = pvalues[pvalues.index.str.contains('precip')]

results = pd.concat([coeffs,pvalues],axis = 1)


print(results)

In [None]:
start = time.time()


tempModTercile = smf.ols(formula = 'lnRevNormd ~ C(tempTercile)*(temp_zipQuarterquant_Extreme + lag1_temp_zipQuarterquant_Extreme + lag2_temp_zipQuarterquant_Extreme + lag3_temp_zipQuarterquant_Extreme) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', data = goodsData).fit()

print(time.time() - start) 

# convert this into a much more condensed version
coeffs  = pd.DataFrame(tempModTercile.params,  columns = ['coeffs'])
pvalues = pd.DataFrame(tempModTercile.pvalues, columns = ['pvals'])

coeffs  = coeffs[coeffs.index.str.contains('temp')]
pvalues = pvalues[pvalues.index.str.contains('temp')]

results = pd.concat([coeffs,pvalues],axis = 1)


print(results)

Now try the sustained effects.

In [None]:
start = time.time()


precip5DaysModTercile = smf.ols(formula = 'lnRevNormd ~ C(precipTercile)*(precip5Days_zipQuarterquant_Extreme + lag1_precip5Days_zipQuarterquant_Extreme + lag2_precip5Days_zipQuarterquant_Extreme + lag3_precip5Days_zipQuarterquant_Extreme) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', data = goodsData).fit()

print(time.time() - start) 

# convert this into a much more condensed version
coeffs  = pd.DataFrame(precip5DaysModTercile.params,  columns = ['coeffs'])
pvalues = pd.DataFrame(precip5DaysModTercile.pvalues, columns = ['pvals'])

coeffs  = coeffs[coeffs.index.str.contains('precip')]
pvalues = pvalues[pvalues.index.str.contains('precip')]

results = pd.concat([coeffs,pvalues],axis = 1)


print(results)

In [None]:
start = time.time()


temp5DaysModTercile = smf.ols(formula = 'lnRevNormd ~ C(tempTercile)*(temp5Days_zipQuarterquant_Extreme + lag1_temp5Days_zipQuarterquant_Extreme + lag2_temp5Days_zipQuarterquant_Extreme + lag3_temp5Days_zipQuarterquant_Extreme) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', data = goodsData).fit()

print(time.time() - start) 

# convert this into a much more condensed version
coeffs  = pd.DataFrame(temp5DaysModTercile.params,  columns = ['coeffs'])
pvalues = pd.DataFrame(temp5DaysModTercile.pvalues, columns = ['pvals'])

coeffs  = coeffs[coeffs.index.str.contains('temp')]
pvalues = pvalues[pvalues.index.str.contains('temp')]

results = pd.concat([coeffs,pvalues],axis = 1)


print(results)


# Temperature
It seems like we're getting a pretty strong signal on precipitation: more precipitation is bad, and it's bad even (especially?) in places where background level of precipitation is high, maybe because the most extreme tail of it is that much more extreme in these places. We have a little bit more work to do with temperature. 

From the above, we find the following:
    - Temperature does NOT seem to matter on a 1-day fluctuation basis. 
    - Temperature DOES seem to matter on a 5-day moving average case.
    
We can seem to look at the following:
    - Total days above 90F (another extreme; maybe interact with quartiles of avg temperature too)
    - Y/N for whether there was a 7-day streak above 90F, matching PS.
    - Weeks, months, qtr at different t'hold
        - Maybe try different bins as well.


First, try the total number of days that are at least 90F. Weird result is that more days above 90 is associated with better results here. REVISIT THIS.

In [None]:
start = time.time()


tempDaysAbove90Mod = smf.ols(formula = 'lnRevNormd ~ days90Plus + lag1_days90Plus + lag2_days90Plus + lag3_days90Plus + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', data = goodsData).fit()

print(time.time() - start) 

# convert this into a much more condensed version
coeffs  = pd.DataFrame(tempDaysAbove90Mod.params,  columns = ['coeffs'])
pvalues = pd.DataFrame(tempDaysAbove90Mod.pvalues, columns = ['pvals'])

coeffs  = coeffs[coeffs.index.str.contains('days90')]
pvalues = pvalues[pvalues.index.str.contains('days90')]

results = pd.concat([coeffs,pvalues],axis = 1)


print(results)

If we look at the breakdown by days that are normally below, at, or above average, we see the strongest result is in places that are normally below average. This is a drop of almost 4\%.

In [None]:
start = time.time()


tempDaysAbove90Mod = smf.ols(formula = 'lnRevNormd ~ C(tempTercile)*(days90Plus + lag1_days90Plus + lag2_days90Plus + lag3_days90Plus) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', data = goodsData).fit()

print(time.time() - start) 

coeffs  = pd.DataFrame(tempDaysAbove90Mod.params,  columns = ['coeffs'])
pvalues = pd.DataFrame(tempDaysAbove90Mod.pvalues, columns = ['pvals'])

coeffs  = coeffs[coeffs.index.str.contains('days90')]
pvalues = pvalues[pvalues.index.str.contains('days90')]

results = pd.concat([coeffs,pvalues],axis = 1)


print(results)

Now let's try the same things by streaks. The effect sizes are large, but not statistically significantly estimated.

In [None]:
start = time.time()


tempStreakAbove90Mod = smf.ols(formula = 'lnRevNormd ~ streak90Plus + lag1_streak90Plus + lag2_streak90Plus + lag3_streak90Plus + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', data = goodsData).fit()

print(time.time() - start) 

coeffs  = pd.DataFrame(tempStreakAbove90Mod.params,  columns = ['coeffs'])
pvalues = pd.DataFrame(tempStreakAbove90Mod.pvalues, columns = ['pvals'])

coeffs  = coeffs[coeffs.index.str.contains('streak90')]
pvalues = pvalues[pvalues.index.str.contains('streak90')]

results = pd.concat([coeffs,pvalues],axis = 1)


print(results)

In [None]:
start = time.time()


tempStreakAbove90Mod_intxn = smf.ols(formula = 'lnRevNormd ~  C(tempTercile)*(streak90Plus + lag1_streak90Plus + lag2_streak90Plus + lag3_streak90Plus) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', data = goodsData).fit()

print(time.time() - start) 

coeffs  = pd.DataFrame(tempStreakAbove90Mod_intxn.params,  columns = ['coeffs'])
pvalues = pd.DataFrame(tempStreakAbove90Mod_intxn.pvalues, columns = ['pvals'])

coeffs  = coeffs[coeffs.index.str.contains('streak90')]
pvalues = pvalues[pvalues.index.str.contains('streak90')]

results = pd.concat([coeffs,pvalues],axis = 1)


print(results)

Let's try the things by weeks, month, quarter.

In [None]:
start = time.time()

#  + lag1_temp_zipWeek95_99 + lag2_temp_zipWeek95_99 + lag3_temp_zipWeek95_99
# 

tempWeekMod = smf.ols(formula = 'lnRevNormd ~  (temp_zipWeek95_99 + lag1_temp_zipWeek95_99 + lag2_temp_zipWeek95_99 ) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', data = goodsData).fit()

print(time.time() - start) 

coeffs  = pd.DataFrame(tempWeekMod.params,  columns = ['coeffs'])
pvalues = pd.DataFrame(tempWeekMod.pvalues, columns = ['pvals'])

coeffs  = coeffs[coeffs.index.str.contains('temp')]
pvalues = pvalues[pvalues.index.str.contains('temp')]

results = pd.concat([coeffs,pvalues],axis = 1)


print(results)

If we break this down by the background temperature of the place, though, it seems like we find a similar effect in the coldest places: a warm week in the coldest places is the most negative, in the quarter concurrent with when it's warmest.


[is this the same effect? other places, did we not see a positive effect of slightly warmer weather in cooler places?]

In [None]:
start = time.time()

#  + lag1_temp_zipWeek95_99 + lag2_temp_zipWeek95_99 + lag3_temp_zipWeek95_99
# + C(ageTercile) + C(profitTercile) + C(sizeTercile)

tempWeekMod_intxn = smf.ols(formula = 'lnRevNormd ~  C(tempTercile)*(temp_zipWeek95_99 + lag1_temp_zipWeek95_99 + lag2_temp_zipWeek95_99 ) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile) ', data = goodsData).fit()

print(time.time() - start) 

coeffs  = pd.DataFrame(tempWeekMod_intxn.params,  columns = ['coeffs'])
pvalues = pd.DataFrame(tempWeekMod_intxn.pvalues, columns = ['pvals'])

coeffs  = coeffs[coeffs.index.str.contains('temp')]
pvalues = pvalues[pvalues.index.str.contains('temp')]

results = pd.concat([coeffs,pvalues],axis = 1)


print(results)

Try months now.

In [None]:
start = time.time()

#  + lag1_temp_zipMonth95_99 + lag2_temp_zipMonth95_99 + lag3_temp_zipMonth95_99
# 

tempMonthMod = smf.ols(formula = 'lnRevNormd ~  (temp_zipMonth95_99 + lag1_temp_zipMonth95_99 + lag2_temp_zipMonth95_99 ) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', data = goodsData).fit()

print(time.time() - start) 

coeffs  = pd.DataFrame(tempMonthMod.params,  columns = ['coeffs'])
pvalues = pd.DataFrame(tempMonthMod.pvalues, columns = ['pvals'])

coeffs  = coeffs[coeffs.index.str.contains('temp')]
pvalues = pvalues[pvalues.index.str.contains('temp')]

results = pd.concat([coeffs,pvalues],axis = 1)


print(results)

In [None]:
start = time.time()

#  + lag1_temp_zipMonth95_99 + lag2_temp_zipMonth95_99 + lag3_temp_zipMonth95_99
# + C(ageTercile) + C(profitTercile) + C(sizeTercile)

tempMonthMod_intxn = smf.ols(formula = 'lnRevNormd ~  C(tempTercile)*(temp_zipMonth95_99 + lag1_temp_zipMonth95_99 + lag2_temp_zipMonth95_99 ) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile) ', data = goodsData).fit()

print(time.time() - start) 

coeffs  = pd.DataFrame(tempMonthMod_intxn.params,  columns = ['coeffs'])
pvalues = pd.DataFrame(tempMonthMod_intxn.pvalues, columns = ['pvals'])

coeffs  = coeffs[coeffs.index.str.contains('temp')]
pvalues = pvalues[pvalues.index.str.contains('temp')]

results = pd.concat([coeffs,pvalues],axis = 1)


print(results)

And quarters.

In [None]:
start = time.time()

#  + lag1_temp_zipWeek95_99 + lag2_temp_zipWeek95_99 + lag3_temp_zipWeek95_99
# + C(ageTercile) + C(profitTercile) + C(sizeTercile)

tempQuarterMod = smf.ols(formula = 'lnRevNormd ~  (temp_zipQuarter95_99 + lag1_temp_zipQuarter95_99 + lag2_temp_zipQuarter95_99 ) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile) ', data = goodsData).fit()

print(time.time() - start) 

coeffs  = pd.DataFrame(tempQuarterMod.params,  columns = ['coeffs'])
pvalues = pd.DataFrame(tempQuarterMod.pvalues, columns = ['pvals'])

coeffs  = coeffs[coeffs.index.str.contains('temp')]
pvalues = pvalues[pvalues.index.str.contains('temp')]

results = pd.concat([coeffs,pvalues],axis = 1)


print(results)

In [None]:
start = time.time()


tempQuarterMod_intxn = smf.ols(formula = 'lnRevNormd ~  C(tempTercile)*(temp_zipQuarter90_95 + lag1_temp_zipQuarter90_95 + lag2_temp_zipQuarter90_95 ) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile) ', data = goodsData).fit()

print(time.time() - start) 

coeffs  = pd.DataFrame(tempQuarterMod_intxn.params,  columns = ['coeffs'])
pvalues = pd.DataFrame(tempQuarterMod_intxn.pvalues, columns = ['pvals'])

coeffs  = coeffs[coeffs.index.str.contains('temp')]
pvalues = pvalues[pvalues.index.str.contains('temp')]

results = pd.concat([coeffs,pvalues],axis = 1)


print(results)

## Additional Tests
Now try a few other ones here. 
- Streak of days above 95th percentile, temperature and rain.
- By categories of days: 0-5, 5-10, 10-15, 15+

In [None]:
start = time.time()


precipStreakMod = smf.ols(formula   = 'lnRevNormd ~ C(wetStreak) + C(lag1_wetStreak) + C(lag2_wetStreak) + C(lag3_wetStreak) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', data = goodsData).fit()


print(time.time() - start) 

coeffs  = pd.DataFrame(precipStreakMod.params,  columns = ['coeffs'])
pvalues = pd.DataFrame(precipStreakMod.pvalues, columns = ['pvals'])

coeffs  = coeffs[coeffs.index.str.contains('wet')]
pvalues = pvalues[pvalues.index.str.contains('wet')]

results = pd.concat([coeffs,pvalues],axis = 1)


print(results)

In [None]:
start = time.time()


tempStreakMod = smf.ols(formula   = 'lnRevNormd ~ C(hotStreak) + C(lag1_hotStreak) + C(lag2_hotStreak) + C(lag3_hotStreak) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', data = goodsData).fit()

print(time.time() - start) 

coeffs  = pd.DataFrame(tempStreakMod.params,  columns = ['coeffs'])
pvalues = pd.DataFrame(tempStreakMod.pvalues, columns = ['pvals'])

coeffs  = coeffs[coeffs.index.str.contains('hot')]
pvalues = pvalues[pvalues.index.str.contains('hot')]

results = pd.concat([coeffs,pvalues],axis = 1)


print(results)

Try with the different breakout categories of what's coming together.

In [None]:
start = time.time()


precipCatMod = smf.ols(formula   = 'lnRevNormd ~ C(wetDaysCat) + C(lag1_wetDaysCat) + C(lag2_wetDaysCat) + C(lag3_wetDaysCat) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', data = goodsData).fit() # cov_type  = 'cluster',cov_kwds={'groups': firms},use_t=True)


print(time.time() - start) 

coeffs  = pd.DataFrame(precipCatMod.params,  columns = ['coeffs'])
pvalues = pd.DataFrame(precipCatMod.pvalues, columns = ['pvals'])

coeffs  = coeffs[coeffs.index.str.contains('wet')]
pvalues = pvalues[pvalues.index.str.contains('wet')]

results = pd.concat([coeffs,pvalues],axis = 1)


print(results)

In [None]:
start = time.time()


tempCatMod = smf.ols(formula   = 'lnRevNormd ~ C(hotDaysCat) + C(lag1_hotDaysCat) + C(lag2_hotDaysCat) + C(lag3_hotDaysCat) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', data = goodsData).fit()


print(time.time() - start) 


coeffs  = pd.DataFrame(tempCatMod.params,  columns = ['coeffs'])
pvalues = pd.DataFrame(tempCatMod.pvalues, columns = ['pvals'])

coeffs  = coeffs[coeffs.index.str.contains('hot')]
pvalues = pvalues[pvalues.index.str.contains('hot')]

results = pd.concat([coeffs,pvalues],axis = 1)


print(results)

In [None]:
start = time.time()


tempCatMod_intxn = smf.ols(formula   = 'lnRevNormd ~ C(tempTercile)*(C(hotDaysCat) + C(lag1_hotDaysCat) + C(lag2_hotDaysCat) + C(lag3_hotDaysCat)) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', data = goodsData).fit()


print(time.time() - start) 


coeffs  = pd.DataFrame(tempCatMod_intxn.params,  columns = ['coeffs'])
pvalues = pd.DataFrame(tempCatMod_intxn.pvalues, columns = ['pvals'])

coeffs  = coeffs[coeffs.index.str.contains('hot')]
pvalues = pvalues[pvalues.index.str.contains('hot')]

results = pd.concat([coeffs,pvalues],axis = 1)


print(results)

# Robustness Checks
Try playing with temperature a little bit more. Look at:
    - interaction with concentration
    - establishment-weighted vars

In [None]:
start = time.time()


tempStreakConcMod = smf.ols(formula   = 'lnRevNormd ~ C(firmConcTercile)*(C(hotDaysCat) + C(lag1_hotDaysCat) + C(lag2_hotDaysCat) + C(lag3_hotDaysCat)) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', 
                           data = goodsData)
tempStreakConcRes = tempStreakConcMod.fit() # cov_type  = 'cluster',cov_kwds={'groups': firms},use_t=True)


print(time.time() - start) 

tempStreakConcRes.summary()

Try the temperature as defined by super super hot days, anywhere in the country - 95th percentile anywhere. This will only happen in a few places in , or at least, there will be some geographic skew. But we can control for that by looking at the effect of hot temps given different baselines.

In [None]:
start = time.time()


tempModAnnual_noControls = smf.ols(formula   = 'lnRevNormd ~ temp_annualquant_Extreme + lag1_temp_annualquant_Extreme + lag2_temp_annualquant_Extreme + lag3_temp_annualquant_Extreme + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey)', data = goodsData)
tempResAnnual_noControls = tempModAnnual_noControls.fit(cov_type  = 'cluster',cov_kwds={'groups': firms},use_t=True)


print(time.time() - start) 

tempResAnnual_noControls.summary()


Let's try the standard interactions, controlling for the background climate in given places.

If we look at the below, we see that the places that are normally coolest are negatively impacted by extreme extremes. Specifically, using an across-the-country cutoff for temperature, we have that the biggest negative effect happens in the places that are normally the lowest-temperature.

This gives some promise that we might find an effect of temperature in some places, depending on expectation or baseline climate.

In [None]:
start = time.time()


tempEstMod_annual = smf.ols(formula   = 'lnRevNormd ~ C(tempTercile)*(temp_annualquant_Extreme + lag1_temp_annualquant_Extreme + lag2_temp_annualquant_Extreme + lag3_temp_annualquant_Extreme) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', 
                           data = goodsData)


tempResMod_annual = tempEstMod_annual.fit() # cov_type  = 'cluster',cov_kwds={'groups': firms},use_t=True)


print(time.time() - start) 

tempResMod_annual.summary()

Let's try it by precipitation quartile for comparison's sake.

In [None]:
start = time.time()


precipEstMod_annual = smf.ols(formula   = 'lnRevNormd ~ C(precipTercile)*(precip_annualquant_Extreme + lag1_precip_annualquant_Extreme + lag2_precip_annualquant_Extreme + lag3_precip_annualquant_Extreme) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', 
                           data = goodsData)


precipResMod_annual = precipEstMod_annual.fit() # cov_type  = 'cluster',cov_kwds={'groups': firms},use_t=True)


print(time.time() - start) 

precipResMod_annual.summary()

Now let's make sure we have the originals, the OGs, for comparison.

In [None]:
start = time.time()


tempEstMod_zipQuarter = smf.ols(formula   = 'lnRevNormd ~ C(tempTercile)*(temp_zipQuarterquant_Extreme + lag1_temp_zipQuarterquant_Extreme + lag2_temp_zipQuarterquant_Extreme + lag3_temp_zipQuarterquant_Extreme) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', 
                           data = goodsData)


tempResMod_zipQuarter = tempEstMod_zipQuarter.fit() # cov_type  = 'cluster',cov_kwds={'groups': firms},use_t=True)


print(time.time() - start) 

tempResMod_zipQuarter.summary()

In [None]:
start = time.time()


precipEstMod_zipQuarter = smf.ols(formula   = 'lnRevNormd ~ C(precipTercile)*(precip_zipQuarterquant_Extreme + lag1_precip_zipQuarterquant_Extreme + lag2_precip_zipQuarterquant_Extreme + lag3_precip_zipQuarterquant_Extreme) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', 
                           data = goodsData)


precipResMod_zipQuarter = precipEstMod_zipQuarter.fit() # cov_type  = 'cluster',cov_kwds={'groups': firms},use_t=True)


print(time.time() - start) 

precipResMod_zipQuarter.summary()

# Industry-Specific

Start to do some of the heterogeneity analysis.

In [None]:
precipMod_byInd       = smf.ols(formula = 'lnRevNormd ~ C(indGroup)*(precip_zipQuarterquant_Extreme + lag1_precip_zipQuarterquant_Extreme + lag2_precip_zipQuarterquant_Extreme + lag3_precip_zipQuarterquant_Extreme) + C(indGroup)*C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', data = goodsData).fit()
coeff = precipMod_byInd.params
pvals = precipMod_byInd.pvalues


In [None]:
precipMod_byInd.summary()

In [None]:
phrase    = 'precip_zipQuarterquant_Extreme'

condition = [s for s in coeff.index if phrase in s]
coeffs_ofInt = coeff[condition]
pvals_ofInt  = pvals[condition] 

results = pd.DataFrame()

# get coeffs, lags, for each of these
lag0   = [s for s in coeffs_ofInt.index if ('lag' not in s)]
# lag0   = ['t']*len(lag0)
coeff0 = coeffs_ofInt[lag0]
pval0  = pvals_ofInt[lag0]
lags0  = ['t']*len(lag0)

lag1   = [s for s in coeffs_ofInt.index if ('lag1' in s)]
coeff1 = coeffs_ofInt[lag1]
pval1  = pvals_ofInt[lag1]
lags1  = ['t-1']*len(lag0)

lag2   = [s for s in coeffs_ofInt.index if ('lag2' in s)]
coeff2 = coeffs_ofInt[lag2]
pval2  = pvals_ofInt[lag2]
lags2  = ['t-2']*len(lag0)

lag3   = [s for s in coeffs_ofInt.index if ('lag3' in s)]
coeff3 = coeffs_ofInt[lag3]
pval3  = pvals_ofInt[lag3]
lags3  = ['t-3']*len(lag3)

allNames = list(itertools.chain(lag0,lag1,lag2,lag3))
intxns   = [char.split(':')[0] for char in allNames]
allCoefs = list(itertools.chain(coeff0,coeff1,coeff2,coeff3))  
allPVals = list(itertools.chain(pval0,pval1,pval2,pval3))  
allLagLabels = list(itertools.chain(lags0,lags1,lags2,lags3))  
coefsWithPVals = []

for i in range(0,len(allCoefs)):
    next = str("%.4f" % allCoefs[i]) + ' (' + str("%.2f" % allPVals[i]) + ')'
    coefsWithPVals.append(next)
    
take2 = pd.DataFrame([intxns,allLagLabels,coefsWithPVals]).T
take2.columns = ['indInteraction','allLagLabels','coefsWithPVals']
take2.pivot(index='indInteraction', columns='allLagLabels', values='coefsWithPVals').reset_index().to_csv('take2.csv')


Now try with the total number of industries as described in the other doc.

In [None]:
precipTotal_byInd  = smf.ols(formula = 'lnRevNormd ~ C(indGroup)*(extremePrecip) + C(indGroup)*C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', data = goodsData).fit()
coeff              = precipTotal_byInd.params
pvals              = precipTotal_byInd.pvalues


In [None]:
precipTotal_byInd.summary()

In [None]:
phrase    = 'extremePrecip'

condition = [s for s in coeff.index if phrase in s]
coeffs_ofInt = coeff[condition]
pvals_ofInt  = pvals[condition] 


results = pd.DataFrame()


allNames = coeffs_ofInt.index
intxns   = [char.split(':')[0] for char in allNames]
allCoefs = list(coeffs_ofInt)  
allPVals = list(pvals_ofInt)  
coefsWithPVals = []

for i in range(0,len(allCoefs)):
    next = str("%.4f" % allCoefs[i]) + ' (' + str("%.2f" % allPVals[i]) + ')'
    coefsWithPVals.append(next)

print(coefsWithPVals)
    

take3 = pd.DataFrame([intxns,coefsWithPVals]).T
take3.columns = ['indInteraction','coefsWithPVals']

print(take3)

take3.to_csv('take3.csv')

'''take2.pivot(index='indInteraction', columns='allLagLabels', values='coefsWithPVals').reset_index().to_csv('take2.csv')
'''

In [None]:
Now try this for each regression separately.

Do the same for temperature.

In [None]:
tempMod_byInd       = smf.ols(formula = 'lnRevNormd ~ C(indGroup)*(temp_zipQuarterquant_Extreme + lag1_temp_zipQuarterquant_Extreme + lag2_temp_zipQuarterquant_Extreme + lag3_temp_zipQuarterquant_Extreme) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', data = goodsData)
tempMod_byInd_res   = tempMod_byInd.fit(cov_type='cluster',cov_kwds={'groups': firms},use_t=True)


tempMod_byInd_res.summary()


Try just the concurrent quarter:

In [None]:
precipMod_byInd       = smf.ols(formula = 'lnRevNormd ~ C(indGroup)*(precip_zipQuarterquant_Extreme) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', data = goodsData)
precipMod_byInd_res   = precipMod_byInd.fit()


precipMod_byInd_res.summary()


Try with the categories.

In [None]:
hotCat_byInd       = smf.ols(formula = 'lnRevNormd ~ C(indGroup)*(C(hotDaysCat) + C(lag1_hotDaysCat) + C(lag2_hotDaysCat) + C(lag3_hotDaysCat)) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', data = goodsData)
hotCat_byInd_res   = hotCat_byInd.fit()

hotCat_byInd_res.summary()

In [None]:
wetCat_byInd       = smf.ols(formula = 'lnRevNormd ~ C(indGroup)*(C(wetDaysCat) + C(lag1_wetDaysCat) + C(lag2_wetDaysCat) + C(lag3_wetDaysCat)) + C(indGroup):C(qtr) + C(yearQtr) + C(gvkey) + C(ageTercile) + C(profitTercile) + C(sizeTercile)', data = goodsData)
wetCat_byInd_res   = wetCat_byInd.fit()

wetCat_byInd_res.summary()




It seems like if we split hairs by dividing things up the last few quarters, everything starts to go a little haywire. The most generous description is something like, we can't separately identify the effects from different quarters, and there's a lot of fairly collinear effects. There are a few less generous descriptions as well, including that there's not necessarily much signal here. 


One of the understated pros of all of this is that the r-squared values are all very high - we're getting great identification here. We could potentially expand the data sample.

Things for Larry tomorrow:
    - emphasis on, here is the specific regression form. here's why i think it is good/bad
    - main precipitation + temperature plot
    - a sense of the heterogeneity, by types of place
    - a little discussion of what to do about temperature: focus on a higher cutoff, the effects in places that aren't quite used to it, and the effects on firms that have more of their operations concentrated in one place
           - the problem with our current definition (zip-quarter) is that for some quarters, we don't have high enough baselines to really register the types of high temperatures 
           - it seems like there might be more variability in precipitation? or at least, more zipcodes seem to trigger it than trigger the temperature threshold
    - some of the industry - intxn results
    - some of the specific industry results
    - discussino of future results: indirect effect results, stock results, by concentration of firm 
    - a discussion of the different time frames: the further back, the less insight we have into what businesses are saying about all of this. the different data sources to mention are: disclosures (8-Ks); PRISM; zipcodes; compustat

----------------------------------

In [None]:
goodsData.indGroup.unique()

In [None]:
cutoffVarsYr = ['0.95']  # , ] # ,'1xQtr''1x5Qtrs',
weatherVars  = ['precip_'] # , 'temp5Days_', 'precip5Days_'] # , 'precip_']#, , ] #[,]
statVarsYr   = ['zipQuarterquant_'] #  , , ]  #,'zipQuarterquant_']
outcomeVars  = ['lnRevNormd'] # , 'lnRev', 'lnCost', 'revenueChange', 'costChange']

goodsData = goodsData[~goodsData.lnRev.isna() & ~goodsData.lnCost.isna()] # & ~goodsData.lnCostNormd.isna()]


start = time.time()

results = pd.DataFrame()

i = 0
for outcomeVar in outcomeVars:
    for weatherVar in weatherVars:
        for statVar in statVarsYr:                     
            for cutoffVar in cutoffVarsYr:
                i = i + 1
                indVar = weatherVar + statVar + cutoffVar
                
                
                print(outcomeVar, "~", indVar)


                # find: concurrent ; or lagged supplier data
                X = goodsData.loc[:,((goodsData.columns.str.contains(indVar) & ~goodsData.columns.str.contains('empWt_')) | 
                                                (goodsData.columns.str.contains('indQtr_')) |
                                                (goodsData.columns.str.contains('gvkey_')))] #  | 
                                                (goodsData.columns.str.contains('ageTercile_')) |
                                                # (goodsData.columns.str.contains('sizeTercile_')) |
                                                # (goodsData.columns.str.contains('profitTercile_')))]
                
                
                X = sm.add_constant(X)

                
                firms = goodsData['gvkey']
        

                y = goodsData[outcomeVar]
                
                
                model = sm.OLS(y, X).fit(cov_type='cluster',cov_kwds={'groups': firms},use_t=True)
                coeff = model.params[1:     1 + len(goodsData.columns[goodsData.columns.str.contains(indVar) & ~goodsData.columns.str.contains('empWt_')])]
                pvals = model.pvalues[1:    1 + len(goodsData.columns[goodsData.columns.str.contains(indVar) & ~goodsData.columns.str.contains('empWt_')])]
                errs  = modelResults.bse[1: 1 + len(goodsData.columns[goodsData.columns.str.contains(indVar) & ~goodsData.columns.str.contains('empWt_')])]
                # print(model.summary())
                print(coeff)
                print(pvals)


                results.loc[i,'industry'] = ind

                results.loc[i,'outcomeVar'] = outcomeVar
                results.loc[i,'weatherVar'] = weatherVar

                results.loc[i,'lag0']       = coeff[0]
                results.loc[i,'lag1']       = coeff[1]
                results.loc[i,'lag2']       = coeff[2]
                results.loc[i,'lag3']       = coeff[3]
                results.loc[i,'lag4']       = coeff[4]
                
                
                results.loc[i,'pval0']      = pvals[0]
                results.loc[i,'pval1']      = pvals[1]
                results.loc[i,'pval2']      = pvals[2]
                results.loc[i,'pval3']      = pvals[3]
                results.loc[i,'pval4']      = pvals[4]
                
                
                results.loc[i,'bse0']       = errs[0]
                results.loc[i,'bse1']       = errs[1]
                results.loc[i,'bse2']       = errs[2]
                results.loc[i,'bse3']       = errs[3]
                results.loc[i,'bse4']       = errs[4]

                                
                # results.to_csv("../../data/utilitiesResults_rightInds_noCtrls.csv")
                
                print( time.time() - start)

In [None]:
weatherVars  = ['hotStreak', 'wetStreak'] # , 'temp5Days_', 'precip5Days_'] # , 'precip_']#, , ] #[,]
outcomeVars  = ['lnRevNormd', 'lnCostNormd'] # , 'lnRev', 'lnCost', 'revenueChange', 'costChange']

goodsData = goodsData[~goodsData.lnRev.isna() & ~goodsData.lnCost.isna()] # & ~goodsData.lnCostNormd.isna()]


start = time.time()

results = pd.DataFrame()

i = 0
for outcomeVar in outcomeVars:
    for weatherVar in weatherVars:
        i = i + 1
        indVar = weatherVar


        print(outcomeVar, "~", indVar)


        # find: concurrent ; or lagged supplier data
        X = goodsData.loc[:,((goodsData.columns.str.contains(indVar) & ~goodsData.columns.str.contains('empWt_')) | 
                                        (goodsData.columns.str.contains('indQtr_')) |
                                        (goodsData.columns.str.contains('gvkey_')))] #  | 
                                        # (goodsData.columns.str.contains('ageTercile_')) |
                                        # (goodsData.columns.str.contains('sizeTercile_')) |
                                        # (goodsData.columns.str.contains('profitTercile_')))]


        X = sm.add_constant(X)
        print(X.columns)

        firms = goodsData['gvkey']


        y = goodsData[outcomeVar]


        model = sm.OLS(y, X).fit(cov_type='cluster',cov_kwds={'groups': firms},use_t=True)
        coeff = model.params[1:     1 + len(goodsData.columns[goodsData.columns.str.contains(indVar) & ~goodsData.columns.str.contains('empWt_')])]
        pvals = model.pvalues[1:    1 + len(goodsData.columns[goodsData.columns.str.contains(indVar) & ~goodsData.columns.str.contains('empWt_')])]
        errs  = modelResults.bse[1: 1 + len(goodsData.columns[goodsData.columns.str.contains(indVar) & ~goodsData.columns.str.contains('empWt_')])]
        # print(model.summary())
        print(coeff)
        print(pvals)


        results.loc[i,'industry'] = ind

        results.loc[i,'outcomeVar'] = outcomeVar
        results.loc[i,'weatherVar'] = weatherVar

        results.loc[i,'lag0']       = coeff[0]
        results.loc[i,'lag1']       = coeff[1]
        results.loc[i,'lag2']       = coeff[2]
        results.loc[i,'lag3']       = coeff[3]
        results.loc[i,'lag4']       = coeff[4]


        results.loc[i,'pval0']      = pvals[0]
        results.loc[i,'pval1']      = pvals[1]
        results.loc[i,'pval2']      = pvals[2]
        results.loc[i,'pval3']      = pvals[3]
        results.loc[i,'pval4']      = pvals[4]


        results.loc[i,'bse0']       = errs[0]
        results.loc[i,'bse1']       = errs[1]
        results.loc[i,'bse2']       = errs[2]
        results.loc[i,'bse3']       = errs[3]
        results.loc[i,'bse4']       = errs[4]


        # results.to_csv("../../data/utilitiesResults_rightInds_noCtrls.csv")

        print( time.time() - start)


In [None]:
results.to_csv("../../data/utilitiesResults_rightInds.csv")

### Employment-Wtd Weather
Run the regressions using the emp-wtd data.

In [None]:
cutoffVar   = '0.95'
weatherVar  = 'precip_'
statVar  = 'zipquant_'
outcomeVar  = 'lnRevNormd'

indVar = weatherVar + statVar + cutoffVar


goodsData.columns[goodsData.columns.str.contains(indVar) & goodsData.columns.str.contains('empWt_')] 

In [None]:
cutoffVar   = '0.95'
weatherVar  = 'precip_'
statVarYr  = 'zipquant_'
outcomeVar  = 'lnRevNormd'

ind = 2


##################
filename = '../../data/companyData/igData_ind' + str(ind) + '.csv'           
goodsData = pd.read_csv(filename).drop(columns = {'Unnamed: 0'})


indVar = weatherVar + statVar + cutoffVar


print(outcomeVar, "~", indVar)


# find: concurrent ; or lagged supplier data
X = goodsData.loc[:,((goodsData.columns.str.contains(indVar) & goodsData.columns.str.contains('empWt_')) | 
                                (goodsData.columns.str.contains('indQtr_')) |
                                (goodsData.columns.str.contains('gvkey_'))  | 
                                (goodsData.columns.str.contains('ageTercile_')) |
                                (goodsData.columns.str.contains('sizeTercile_')) |
                                (goodsData.columns.str.contains('profitTercile_')))]


print(X.columns)

firms = goodsData['gvkey']


y = goodsData[outcomeVar]


model = sm.OLS(y, X).fit(cov_type='cluster',cov_kwds={'groups': firms},use_t=True)
pvals = model.pvalues[0:len(goodsData.columns[goodsData.columns.str.contains(indVar)])]
coeff =  model.params[0:len(goodsData.columns[goodsData.columns.str.contains(indVar)])]

print(model.summary())

In [None]:
results = pd.read_csv("../../allIndustryResults.csv").drop(columns = {'Unnamed: 0'})
industries = results.industry.unique()
yLim   = 0.01
numCol = 3
padding = 1
xdim = 20
ydim = 40
filePrefix = 'dirEffects'

makePlots(results, industries, filePrefix, yLim)


In [None]:
def makePlots(results, industries, filePrefix, yLim, numCol = 2, padding = 1, xdim = 20, ydim = 40):
    
    # loop over outcome variables and weather definitions
    weatherVars = results.weatherVar.unique()
    outcomeVars = results.outcomeVar.unique()


    for outcome in outcomeVars:
        for weather in weatherVars:
            # choose the elective parts of this - number of columns and the range of the axes
            numCols = numCol
            yLims   = yLim

            rowNum = len(industries) // numCols + padding
            colNum = numCols

            fig, ax = plt.subplots(rowNum, colNum, sharex='all', sharey='all',
                                  figsize=(xdim,ydim),
                                  constrained_layout=True)

            fig.suptitle('Direct Effects: ' + outcome + ' ~ ' + weather, fontsize=36)



            i = 0
            for ind in industries:
                rowIndex = i // numCols 
                colIndex = i % numCols


                i   = i + 1


                rev = results[(results.outcomeVar == outcome) & (results.weatherVar == weather) & 
                             (results.industry == ind)].reset_index()
                # indName = rev.industryName.unique()[0]
                x   = [0,1,2,3,4]
                y   = [rev.lag0,rev.lag1,rev.lag2,rev.lag3,rev.lag4]


                errors = [rev.bse0,rev.bse1,rev.bse2,rev.bse3,rev.bse4]


                ax[rowIndex, colIndex].errorbar(x,y,yerr = errors, fmt = '.k')
                ax[rowIndex, colIndex].xaxis.grid(False)
                ax[rowIndex, colIndex].yaxis.grid(False)
                ax[rowIndex, colIndex].axhline(y=0)
                ax[rowIndex, colIndex].set_ylim([-yLims, yLims])

                ax[rowIndex, colIndex].yaxis.set_ticks(np.arange(-yLims, yLims + yLims, yLims/2))
                ax[rowIndex, colIndex].xaxis.set_ticks(np.arange(0.0, 5.0, 1.0))

                ax[rowIndex, colIndex].tick_params(axis='both', labelsize = 16)
                ax[rowIndex, colIndex].set_title(ind, fontsize = 24)

            fig.savefig(filePrefix + outcome + weather + '.png')
            fig.show()


                # ax[rowIndex, colIndex].





## Industry-Specific
Go through every famafrench industry and run the regressions above. First do this by days of extremes at hqs.

### HQs

In [None]:
goodsData = pd.read_csv("../../data/companyData/goodsData_igData.csv").drop(columns = {'Unnamed: 0'})

industries = goodsData.indGroup.unique()

In [None]:
results

In [None]:
cutoffVarsYr = ['0.95'] 
weatherVars  = ['precip_'] # , 'temp_'] 
statVarsYr   = ['zipQuarterquant_']
outcomeVars  = ['lnRevNormd'] # , 'lnCostNormd']




start = time.time()

results = pd.DataFrame()

i = 0

for ind in industries:
    print('##########################################################')
    print(ind)
    filename = '../../data/companyData/igData_ind' + str(ind) + '.csv'           
    goodsData = pd.read_csv(filename).drop(columns = {'Unnamed: 0'})
    if goodsData.shape[0] > 0:
    
        for outcomeVar in outcomeVars:
            for weatherVar in weatherVars:
                for statVar in statVarsYr:                     
                    for cutoffVar in cutoffVarsYr:

                        i = i + 1


                        indVar = weatherVar + statVar + cutoffVar


                        print(outcomeVar, "~", indVar)


                        # find: concurrent ; or lagged supplier data
                        X = goodsData.loc[:,(goodsData.columns.str.contains(indVar) & ~goodsData.columns.str.contains('empWt_') & ~goodsData.columns.str.contains('lag4')) | 
                                                        (goodsData.columns.str.contains('indQtr_')) | #  |
                                                        (goodsData.columns.str.contains('gvkey_')) | #  | 
                                                        (goodsData.columns.str.contains('ageTercile_')) |
                                                        (goodsData.columns.str.contains('sizeTercile_')) |
                                                        (goodsData.columns.str.contains('profitTercile_'))]

                        X = sm.add_constant(X)

                        firms = goodsData['gvkey']


                        y = goodsData[outcomeVar]


                        model = sm.OLS(y, X).fit(cov_type='cluster',cov_kwds={'groups': firms},use_t=True)
                        pvals = model.pvalues[1: 1 + len(goodsData.columns[goodsData.columns.str.contains(indVar) & ~goodsData.columns.str.contains('empWt_')])]
                        coeff = model.params[1: 1 + len(goodsData.columns[goodsData.columns.str.contains(indVar)  & ~goodsData.columns.str.contains('empWt_')])]
                        errs  = model.bse[1: 1 + len(goodsData.columns[goodsData.columns.str.contains(indVar)     & ~goodsData.columns.str.contains('empWt_')])]
                
                        '''print(coeff)
                        print(pvals)'''


                        results.loc[i,'industry'] = ind

                        results.loc[i,'outcomeVar'] = outcomeVar
                        results.loc[i,'weatherVar'] = weatherVar

                        # str("%.4f" % allCoefs[i]) + ' (' + str("%.2f" % allPVals[i]) + ')'
                        
                        results.loc[i,'lag0']       = str("%.4f" % coeff[0]) + ' (' + str("%.2f" % pvals[0]) + ')'
                        results.loc[i,'lag1']       = str("%.4f" % coeff[1]) + ' (' + str("%.2f" % pvals[1]) + ')'
                        results.loc[i,'lag2']       = str("%.4f" % coeff[2]) + ' (' + str("%.2f" % pvals[2]) + ')'
                        results.loc[i,'lag3']       = str("%.4f" % coeff[3]) + ' (' + str("%.2f" % pvals[3]) + ')'
                        
                        results.loc[i,'n'] = X.shape[0]
                        # results.loc[i,'lag4']       = coeff[4]

                        '''results.loc[i,'pval0']      = pvals[0]
                        results.loc[i,'pval1']      = pvals[1]
                        results.loc[i,'pval2']      = pvals[2]
                        results.loc[i,'pval3']      = pvals[3]
                        # results.loc[i,'pval4']      = pvals[4]
                        
                        results.loc[i,'bse0']       = errs[0]
                        results.loc[i,'bse1']       = errs[1]
                        results.loc[i,'bse2']       = errs[2]
                        results.loc[i,'bse3']       = errs[3]'''
                        # results.loc[i,'bse4']       = errs[4]


                        results.to_csv("../../allIndustryResults.csv")

                        print( time.time() - start)
                        



In [None]:
results.to_csv("allIndustryResults.csv")


In [None]:
print(results)

'''# merge in the industry names
conversionTable = pd.read_csv("../../data/indMapping.csv")
conversionTable.dropna(inplace=True)
conversionTable.reset_index(drop = True, inplace = True)

conversionTable.head()

results = results.merge(conversionTable)

results.to_csv("../../allIndustryResults.csv")
'''

In [None]:
results

Try this with the streak data.

In [None]:
weatherVars  = ['hotStreak', 'wetStreak'] 
outcomeVars  = ['lnRevNormd', 'lnCostNormd']


industries = range(1,44)


start = time.time()

results = pd.DataFrame()

i = 0

for ind in industries:
    print('##########################################################')
    print(ind)
    filename = '../../data/companyData/igData_ind' + str(ind) + '.csv'           
    goodsData = pd.read_csv(filename).drop(columns = {'Unnamed: 0'})
    
    if goodsData.shape[0] > 0:
    
        for outcomeVar in outcomeVars:
            for weatherVar in weatherVars:
                i = i + 1


                indVar = weatherVar


                print(outcomeVar, "~", indVar)


                # find: concurrent ; or lagged supplier data
                X = goodsData.loc[:,((goodsData.columns.str.contains(indVar) & ~goodsData.columns.str.contains('empWt_')) | 
                                                (goodsData.columns.str.contains('indQtr_')) |
                                                (goodsData.columns.str.contains('gvkey_'))  | 
                                                (goodsData.columns.str.contains('ageTercile_')) |
                                                (goodsData.columns.str.contains('sizeTercile_')) |
                                                (goodsData.columns.str.contains('profitTercile_')))]
                
                X = sm.add_constant(X)



                firms = goodsData['gvkey']


                y = goodsData[outcomeVar]


                model = sm.OLS(y, X).fit(cov_type='cluster',cov_kwds={'groups': firms},use_t=True)
                pvals = model.pvalues[1: 1 + len(goodsData.columns[goodsData.columns.str.contains(indVar) & ~goodsData.columns.str.contains('empWt_')] )]
                coeff = model.params[1: 1 + len(goodsData.columns[goodsData.columns.str.contains(indVar)  & ~goodsData.columns.str.contains('empWt_')])]
                errs  = model.bse[1: 1 + len(goodsData.columns[goodsData.columns.str.contains(indVar)     & ~goodsData.columns.str.contains('empWt_')])]
                
                '''print(coeff)
                print(pvals)'''


                results.loc[i,'industry'] = ind

                results.loc[i,'outcomeVar'] = outcomeVar
                results.loc[i,'weatherVar'] = weatherVar

                results.loc[i,'lag0']       = coeff[0]
                results.loc[i,'lag1']       = coeff[1]
                results.loc[i,'lag2']       = coeff[2]
                results.loc[i,'lag3']       = coeff[3]
                results.loc[i,'lag4']       = coeff[4]
                
                results.loc[i,'bse0']       = errs[0]
                results.loc[i,'bse1']       = errs[1]
                results.loc[i,'bse2']       = errs[2]
                results.loc[i,'bse3']       = errs[3]
                results.loc[i,'bse4']       = errs[4]

                results.loc[i,'pval0']      = pvals[0]
                results.loc[i,'pval1']      = pvals[1]
                results.loc[i,'pval2']      = pvals[2]
                results.loc[i,'pval3']      = pvals[3]
                results.loc[i,'pval4']      = pvals[4]


                results.to_csv("../../allIndustryResults_streaks.csv")

                print( time.time() - start)
                

# merge in the industry names
conversionTable = pd.read_csv("../../data/indMapping.csv")
conversionTable.dropna(inplace=True)
conversionTable.reset_index(drop = True, inplace = True)

conversionTable.head()

results = results.merge(conversionTable)


results.to_csv("../../allIndustryResults_streaks.csv")

In [None]:
results.head()

In [None]:
results = pd.read_csv("../../allIndustryResults_streaks.csv").drop(columns = {'Unnamed: 0'})
results.head()

### Employment Weights

Now do this for the employment-weighted average of the days of extreme weather.

In [None]:
cutoffVarsYr = ['0.95'] # , '1x5Qtrs', '1x5Yrs'] # '1x5Qtrs',
weatherVars  = ['precip_', 'temp_']        #, 'temp5Days_', 'precip5Days_'] # , 'precip_']#, , ] #[,]
statVarsYr   = ['zipQuarterquant_']
outcomeVars  = ['lnRevNormd', 'lnCostNormd']

industries = range(1,44)

start = time.time()

results = pd.DataFrame()

i = 0



for ind in industries:
    print('##########################################################')
    print(ind)
    filename = '../../data/companyData/igData_ind' + str(ind) + '.csv'           
    goodsData = pd.read_csv(filename).drop(columns = {'Unnamed: 0'})
    if goodsData.shape[0] > 0:


        for outcomeVar in outcomeVars:
            for weatherVar in weatherVars:
                for statVar in statVarsYr:                     
                    for cutoffVar in cutoffVarsYr:

                        i = i + 1



                        '''goodsData = goodsData[~goodsData.lnRev.isna() & 
                                             ~goodsData.lnCost.isna() & 
                                             ~goodsData.revenueChange.isna() & 
                                             ~goodsData.costChange.isna()]'''


                        indVar = weatherVar + statVar + cutoffVar


                        print(outcomeVar, "~", indVar)


                        # find: concurrent ; or lagged supplier data
                        X = goodsData.loc[:,((goodsData.columns.str.contains(indVar) & goodsData.columns.str.contains('empWt_')) | 
                                                        (goodsData.columns.str.contains('indQtr_')) |
                                                        (goodsData.columns.str.contains('gvkey_'))  | 
                                                        (goodsData.columns.str.contains('ageTercile_')) |
                                                        (goodsData.columns.str.contains('sizeTercile_')) |
                                                        (goodsData.columns.str.contains('profitTercile_')))]

                        X = sm.add_constant(X)

                        firms = goodsData['gvkey']


                        y = goodsData[outcomeVar]


                        model = sm.OLS(y, X).fit(cov_type='cluster',cov_kwds={'groups': firms},use_t=True)
                        pvals = model.pvalues[1: 1 + len(goodsData.columns[goodsData.columns.str.contains(indVar) & goodsData.columns.str.contains('empWt_')])]
                        coeff = model.params[1: 1 + len(goodsData.columns[goodsData.columns.str.contains(indVar) & goodsData.columns.str.contains('empWt_')])]
                        errs  = model.bse[1: 1 + len(goodsData.columns[goodsData.columns.str.contains(indVar)  & goodsData.columns.str.contains('empWt_')])]
                
                        '''print(coeff)
                        print(pvals)'''


                        results.loc[i,'industry'] = ind

                        results.loc[i,'outcomeVar'] = outcomeVar
                        results.loc[i,'weatherVar'] = weatherVar

                        results.loc[i,'lag0']       = coeff[0]
                        results.loc[i,'lag1']       = coeff[1]
                        results.loc[i,'lag2']       = coeff[2]
                        results.loc[i,'lag3']       = coeff[3]
                        results.loc[i,'lag4']       = coeff[4]

                        results.loc[i,'bse0']       = errs[0]
                        results.loc[i,'bse1']       = errs[1]
                        results.loc[i,'bse2']       = errs[2]
                        results.loc[i,'bse3']       = errs[3]
                        results.loc[i,'bse4']       = errs[4]

                        results.loc[i,'pval0']      = pvals[0]
                        results.loc[i,'pval1']      = pvals[1]
                        results.loc[i,'pval2']      = pvals[2]
                        results.loc[i,'pval3']      = pvals[3]
                        results.loc[i,'pval4']      = pvals[4]


                        results.to_csv("../../results_byInds_withControls_empWts.csv")

                        print( time.time() - start)
                        

# merge in the industry names
conversionTable = pd.read_csv("../../data/indMapping.csv")
conversionTable.dropna(inplace=True)
conversionTable.reset_index(drop = True, inplace = True)

conversionTable.head()

results = results.merge(conversionTable)  

In [None]:
results

In [None]:
# loop over outcome variables and weather definitions
weather = results.weatherVar.unique()
outcome = results.outcomeVar.unique()


for weather in weatherVars:
    for outcome in outcomeVars:
        # choose the elective parts of this - number of columns and the range of the axes
        numCols = 4
        yLims   = 0.1

        industries = results.industryName.unique()
        rowNum = len(industries) // numCols + 1
        colNum = numCols

        fig, ax = plt.subplots(rowNum, colNum, sharex='all', sharey='all',
                              figsize=(20,40),
                              constrained_layout=True)

        fig.suptitle('Direct Effects: ' + outcome + ' ~ ' + weather + ' Employment Weights', fontsize=36)



        i = 0
        for ind in industries:
            rowIndex = i // numCols
            colIndex = i % numCols


            i   = i + 1


            rev = results[(results.outcomeVar == outcome) & (results.weatherVar == weather) & 
                         (results.industryName == ind)].reset_index()
            x   = [0,1,2,3,4]
            y   = [rev.lag0,rev.lag1,rev.lag2,rev.lag3,rev.lag4]


            errors = [rev.bse0,rev.bse1,rev.bse2,rev.bse3,rev.bse4]

            # plt.errorbar(x,y,yerr = errors, fmt = '.k')
            # plt.show()

            '''ax[rowIndex, colIndex].text(0.5, 0.5, str((i, j)),
                                  fontsize=18, ha='center')'''
            ax[rowIndex, colIndex].errorbar(x,y,yerr = errors, fmt = '.k')
            ax[rowIndex, colIndex].xaxis.grid(False)
            ax[rowIndex, colIndex].yaxis.grid(False)
            ax[rowIndex, colIndex].axhline(y=0)
            ax[rowIndex, colIndex].set_ylim([-yLims, yLims])

            ax[rowIndex, colIndex].yaxis.set_ticks(np.arange(-yLims, yLims + 0.1, 0.1))
            ax[rowIndex, colIndex].xaxis.set_ticks(np.arange(0.0, 5.0, 1.0))

            ax[rowIndex, colIndex].tick_params(axis='both', labelsize = 16)
            ax[rowIndex, colIndex].set_title(ind, fontsize = 24)


            # ax[rowIndex, colIndex].
            
        fig.savefig('dirEffects_' + outcome + '_' + weather + '_empWts' + '.png')

# Indirect Effects
This is almost exactly the same but with supplier information in place of the direct company information.

In [None]:
os.getcwd()

Can alter this so that we're doing it with the employment weights as well.

In [None]:
cutoffVarsYr = ['0.95'] 
weatherVars  = ['precip_', 'temp_'] 
statVarsYr   = ['zipQuarterquant_']
outcomeVars  = ['lnRevNormd', 'lnCostNormd']


industries = range(1,44)


start = time.time()

results = pd.DataFrame()

i = 0



for ind in industries:
    print('##########################################################')
    print(ind)
    
    filename = "../../data/companyData/supplier_igData_ind" + str(ind) + ".csv"
    goodsData = pd.read_csv(filename).drop(columns = {'Unnamed: 0'})

    if goodsData.shape[0] > 50:
        for outcomeVar in outcomeVars:
            for weatherVar in weatherVars:
                for statVar in statVarsYr:                     
                    for cutoffVar in cutoffVarsYr:

                        i = i + 1

                        indVar = weatherVar + statVar + cutoffVar


                        print(outcomeVar, "~", indVar)


                        # find: concurrent ; or lagged supplier data
                        X = goodsData.loc[:,(((goodsData.columns.str.contains(indVar)) & ~goodsData.columns.str.contains('empWt_')) | 
                                (goodsData.columns.str.contains('indQtr_')) |
                                (goodsData.columns.str.contains('gvkey_')) | #  | 
                                (goodsData.columns.str.contains('ageTercile_')) |
                                (goodsData.columns.str.contains('sizeTercile_')) |
                                (goodsData.columns.str.contains('profitTercile_')) | 
                                (goodsData.columns == 'supplierTercile'))] 
                        
                        X = sm.add_constant(X)

                        print(X.columns)
                        firms = goodsData['gvkey']


                        y = goodsData[outcomeVar]


                        model = sm.OLS(y, X).fit(cov_type='cluster',cov_kwds={'groups': firms},use_t=True)
                        pvals = model.pvalues[1: 1 + len(goodsData.columns[goodsData.columns.str.contains(indVar) & ~goodsData.columns.str.contains('empWt_')] )]
                        coeff = model.params[1: 1 + len(goodsData.columns[goodsData.columns.str.contains(indVar)  & ~goodsData.columns.str.contains('empWt_')])]
                        errs  = model.bse[1: 1 + len(goodsData.columns[goodsData.columns.str.contains(indVar)     & ~goodsData.columns.str.contains('empWt_')])]
                
                        '''print(coeff)
                        print(pvals)'''


                        results.loc[i,'industry'] = ind

                        results.loc[i,'outcomeVar'] = outcomeVar
                        results.loc[i,'weatherVar'] = weatherVar

                        results.loc[i,'lag0']       = coeff[0]
                        results.loc[i,'lag1']       = coeff[1]
                        results.loc[i,'lag2']       = coeff[2]
                        results.loc[i,'lag3']       = coeff[3]
                        results.loc[i,'lag4']       = coeff[4]

                        results.loc[i,'bse0']       = errs[0]
                        results.loc[i,'bse1']       = errs[1]
                        results.loc[i,'bse2']       = errs[2]
                        results.loc[i,'bse3']       = errs[3]
                        results.loc[i,'bse4']       = errs[4]

                        results.loc[i,'pval0']      = pvals[0]
                        results.loc[i,'pval1']      = pvals[1]
                        results.loc[i,'pval2']      = pvals[2]
                        results.loc[i,'pval3']      = pvals[3]
                        results.loc[i,'pval4']      = pvals[4]


                        results.to_csv("../../indirResults_hqs.csv")

                        print( time.time() - start)


# merge in the industry names
conversionTable = pd.read_csv("../../data/indMapping.csv")
conversionTable.dropna(inplace=True)
conversionTable.reset_index(drop = True, inplace = True)

conversionTable.head()

results = results.merge(conversionTable)

results.to_csv("../../indirResults_hqs.csv")


In [None]:
results = pd.read_csv("../../indirResults_hqs.csv").drop(columns = {'Unnamed: 0'})
print(results.industry.unique())
results.head()


In [None]:
print(outcome, weather, ind)

rev = results[(results.outcomeVar == outcome) & (results.weatherVar == weather) & 
                         (results.industry == ind)].reset_index()

In [None]:
# loop over outcome variables and weather definitions
weatherVars = results.weatherVar.unique()
outcomeVars = results.outcomeVar.unique()

industries = [2,17,18,28,31,40,41,42] # results.industryName.unique()

for outcome in outcomeVars:
    for weather in weatherVars:
        # choose the elective parts of this - number of columns and the range of the axes
        numCols = 3
        yLims   = 0.03

        # industries = results.industryName.unique()
        rowNum = len(industries) // numCols + 1
        colNum = numCols

        fig, ax = plt.subplots(rowNum, colNum, sharex='all', sharey='all',
                              figsize=(20,20),
                              constrained_layout=True)

        fig.suptitle('Indirect Effects: ' + outcome + ' ~ ' + weather, fontsize=36)



        i = 0
        for ind in industries:
            rowIndex = i // numCols
            colIndex = i % numCols


            i   = i + 1


            rev = results[(results.outcomeVar == outcome) & (results.weatherVar == weather) & 
                         (results.industry == ind)].reset_index()
            indName = rev.industryName.unique()[0]
            x   = [0,1,2,3,4]
            y   = [rev.lag0,rev.lag1,rev.lag2,rev.lag3,rev.lag4]


            errors = [rev.bse0,rev.bse1,rev.bse2,rev.bse3,rev.bse4]

            # plt.errorbar(x,y,yerr = errors, fmt = '.k')
            # plt.show()

            '''ax[rowIndex, colIndex].text(0.5, 0.5, str((i, j)),
                                  fontsize=18, ha='center')'''
            ax[rowIndex, colIndex].errorbar(x,y,yerr = errors, fmt = '.k')
            ax[rowIndex, colIndex].xaxis.grid(False)
            ax[rowIndex, colIndex].yaxis.grid(False)
            ax[rowIndex, colIndex].axhline(y=0)
            ax[rowIndex, colIndex].set_ylim([-yLims, yLims])

            ax[rowIndex, colIndex].yaxis.set_ticks(np.arange(-yLims, yLims + 0.1, 0.1))
            ax[rowIndex, colIndex].xaxis.set_ticks(np.arange(0.0, 5.0, 1.0))

            ax[rowIndex, colIndex].tick_params(axis='both', labelsize = 16)
            ax[rowIndex, colIndex].set_title(indName, fontsize = 24)


            # ax[rowIndex, colIndex].
    
        fig.savefig('indirEffects_' + outcome + '_' + weather + '.png')




Now do this by streaks - consecutive days with at least 95th percentile temp or rain.

In [None]:
weatherVars  = ['hotStreak',  'wetStreak']   #[,]
outcomeVars  = ['lnRevNormd', 'lnCostNormd'] # ['revenueChange'] #[, 'costChange']#,'lnCost','lnInc','lnRev']

# if we wanted to do the regressions below for all industries, we would use the following
'''filename = "../../data/companyData/goodsData_supplierData.csv"
goodsData = pd.read_csv(filename).drop(columns = {'Unnamed: 0'})
'''

# goodsData = goodsData[~goodsData.lnRev.isna() & ~goodsData.lnCost.isna() & ~goodsData.lnCostNormd.isna()]
goodsData['scTercile']  = pd.qcut(goodsData['suppliers'], 3, labels=False, duplicates = 'drop')


start = time.time()
results = pd.DataFrame()
i = 0

industries = range(1,44)

for ind in industries:
    filename = "../../data/companyData/supplier_igData_ind" + str(ind) + ".csv"
    goodsData = pd.read_csv(filename).drop(columns = {'Unnamed: 0'})

    if goodsData.shape[0] > 50:

        for outcomeVar in outcomeVars:
            for weatherVar in weatherVars:
                
                i = i + 1
                
                indVar = weatherVar


                print(outcomeVar, "~", indVar)


                # find: concurrent ; or lagged supplier datawet
                X = goodsData.loc[:,(((goodsData.columns.str.contains(indVar))) | 
                                (goodsData.columns.str.contains('indQtr_')) |
                                (goodsData.columns.str.contains('gvkey_')) | #  | 
                                (goodsData.columns.str.contains('ageTercile_')) |
                                (goodsData.columns.str.contains('sizeTercile_')) |
                                (goodsData.columns.str.contains('profitTercile_')) | 
                                (goodsData.columns == 'supplierTercile'))]     

                X = sm.add_constant(X)

                
                firms = goodsData['gvkey']


                y = goodsData[outcomeVar]


                modelResults = sm.OLS(y, X).fit(cov_type='cluster',cov_kwds={'groups': firms},use_t=True)
                pvals = modelResults.pvalues[1: 1 + len(goodsData.columns[goodsData.columns.str.contains(indVar) & goodsData.columns.str.contains('supplier_')])]
                coeff = modelResults.params[1: 1 + len(goodsData.columns[goodsData.columns.str.contains(indVar)  & goodsData.columns.str.contains('supplier_')])]
                errs  = modelResults.bse[1: 1 + len(goodsData.columns[goodsData.columns.str.contains(indVar)  & goodsData.columns.str.contains('supplier_')])]
                
                '''print(coeff)
                print(pvals)'''


                results.loc[i,'industry'] = ind

                results.loc[i,'outcomeVar'] = outcomeVar
                results.loc[i,'weatherVar'] = weatherVar

                results.loc[i,'lag0']       = coeff[0]
                results.loc[i,'lag1']       = coeff[1]
                results.loc[i,'lag2']       = coeff[2]
                results.loc[i,'lag3']       = coeff[3]
                results.loc[i,'lag4']       = coeff[4]
                
                results.loc[i,'bse0']       = errs[0]
                results.loc[i,'bse1']       = errs[1]
                results.loc[i,'bse2']       = errs[2]
                results.loc[i,'bse3']       = errs[3]
                results.loc[i,'bse4']       = errs[4]

                results.loc[i,'pval0']      = pvals[0]
                results.loc[i,'pval1']      = pvals[1]
                results.loc[i,'pval2']      = pvals[2]
                results.loc[i,'pval3']      = pvals[3]
                results.loc[i,'pval4']      = pvals[4]
                
                
                
                print( time.time() - start)

                results.to_csv("../../data/indirResults_hqs_streaks.csv")

# merge in the industry names
conversionTable = pd.read_csv("../../data/indMapping.csv")
conversionTable.dropna(inplace=True)
conversionTable.reset_index(drop = True, inplace = True)

conversionTable.head()

results = results.merge(conversionTable)


results.to_csv("../../data/indirResults_hqs_streaks.csv")


In [None]:
results = pd.read_csv("../../data/indirResults_hqs_streaks.csv")

In [None]:
weatherVars = results.weatherVar.unique()
outcomeVars = results.outcomeVar.unique()

industries = [2,17,18,28,31,40,41,42] # results.industryName.unique()

for outcome in outcomeVars:
    for weather in weatherVars:
        # choose the elective parts of this - number of columns and the range of the axes
        numCols = 3
        yLims   = 0.2

        # industries = results.industryName.unique()
        rowNum = len(industries) // numCols + 1
        colNum = numCols

        fig, ax = plt.subplots(rowNum, colNum, sharex='all', sharey='all',
                              figsize=(20,20),
                              constrained_layout=True)

        fig.suptitle('Indirect Effects: ' + outcome + ' ~ ' + weather, fontsize=36)



        i = 0
        for ind in industries:
            rowIndex = i // numCols
            colIndex = i % numCols


            i   = i + 1


            rev = results[(results.outcomeVar == outcome) & (results.weatherVar == weather) & 
                         (results.industry == ind)].reset_index()
            indName = rev.industryName.unique()[0]
            x   = [0,1,2,3,4]
            y   = [rev.lag0,rev.lag1,rev.lag2,rev.lag3,rev.lag4]


            errors = [rev.bse0,rev.bse1,rev.bse2,rev.bse3,rev.bse4]

            # plt.errorbar(x,y,yerr = errors, fmt = '.k')
            # plt.show()

            '''ax[rowIndex, colIndex].text(0.5, 0.5, str((i, j)),
                                  fontsize=18, ha='center')'''
            ax[rowIndex, colIndex].errorbar(x,y,yerr = errors, fmt = '.k')
            ax[rowIndex, colIndex].xaxis.grid(False)
            ax[rowIndex, colIndex].yaxis.grid(False)
            ax[rowIndex, colIndex].axhline(y=0)
            ax[rowIndex, colIndex].set_ylim([-yLims, yLims])

            ax[rowIndex, colIndex].yaxis.set_ticks(np.arange(-yLims, yLims + 0.1, 0.1))
            ax[rowIndex, colIndex].xaxis.set_ticks(np.arange(0.0, 5.0, 1.0))

            ax[rowIndex, colIndex].tick_params(axis='both', labelsize = 16)
            ax[rowIndex, colIndex].set_title(indName, fontsize = 24)

            # ax[rowIndex, colIndex].
    
        fig.savefig('indirEffects_' + outcome + '_' + weather + '.png')













----------------













### Faster and More Heuristic
The below gives us unclustered standard errors, output to a csv file.

In [None]:
def findSE(X,reg,y):
    N = len(X)
    p = len(X.columns) + 1  # plus one because LinearRegression adds an intercept term

    X_with_intercept = np.empty(shape=(N, p), dtype=np.float)
    X_with_intercept[:, 0] = 1
    X_with_intercept[:, 1:p] = X.values

    y_hat = reg.predict(X)
    residuals = y.values - y_hat
    residual_sum_of_squares = residuals.T @ residuals
    sigma_squared_hat = residual_sum_of_squares / (N - p)
    var_beta_hat = np.linalg.inv(X_with_intercept.T @ X_with_intercept) * sigma_squared_hat

    se0 = var_beta_hat[1, 1] ** 0.5
    se1 = var_beta_hat[2, 2] ** 0.5
    se2 = var_beta_hat[3, 3] ** 0.5
    se3 = var_beta_hat[4, 4] ** 0.5
    se4 = var_beta_hat[5, 5] ** 0.5
    se5 = var_beta_hat[6, 6] ** 0.5
    '''se6 = var_beta_hat[7, 7] ** 0.5
    se7 = var_beta_hat[8, 8] ** 0.5
    se8 = var_beta_hat[9, 9] ** 0.5'''
    return([abs(reg.coef_[0]/se0),abs(reg.coef_[1]/se1),abs(reg.coef_[2]/se2),
            abs(reg.coef_[3]/se3),abs(reg.coef_[4]/se4),abs(reg.coef_[5]/se5)]
          )

'''        
abs(reg.coef_[0]/se0),
          abs(reg.coef_[1]/se1),
          abs(reg.coef_[2]/se2),
          abs(reg.coef_[3]/se3),
          abs(reg.coef_[4]/se4),
          abs(reg.coef_[5]/se5),
          "SE0: ", se0,
          "SE1: ", se1,
          "SE2: ", se2,
          "SE3: ", se3,
          "SE4: ", se4,
          "SE5: ", se5,

'''


'''cutoffVarsYr = ['0.95'] # ,'1xYr']                                    #,'1x5Yrs'] #, ] # ,'1xQtr', '1x5Qtrs'
weatherVars  = ['precip_', 'temp_', 'precip5Days_', 'temp5Days_'] #[,]
statVarsYr   = ['zipquant_','zipQuarterquant_']
outcomeVars  = ['lnRev', 'revenueChange'] # ,'lnCost',  'costChange'] # [,'lnRevNormd','lnCostNormd'] # 'revenueChange' 'costChange',
firmVars     = ['firmQtr_'] # 'gvkey'
'''

# try this by industry
cutoffVarsYr = ['0.95'] # ,'1xYr']                                    #,'1x5Yrs'] #, ] # ,'1xQtr', '1x5Qtrs'
weatherVars  = ['precip_', 'temp_', 'precip5Days_', 'temp5Days_'] #[,]
statVarsYr   = ['ffquant_','indQuarterquant_']
outcomeVars  = ['lnRev', 'revenueChange',  'lnCost',  'costChange'] # [,'lnRevNormd','lnCostNormd'] # 'revenueChange' 'costChange',
firmVars     = ['firmQtr_']


inds = [1, 2, 6, 7, 18, 31, 41, 42]

goodsData = goodsData[~goodsData.lnRev.isna() & ~goodsData.lnCost.isna() &
                      ~goodsData.lnCostNormd.isna() & ~goodsData.lnRevNormd.isna()]

start = time.time()

results = pd.DataFrame()
i = 0
for ind in inds:
    print('#######################################################################################',ind)
    for outcomeVar in outcomeVars:
        for weatherVar in weatherVars:
            for statVar in statVarsYr:                     
                for cutoffVar in cutoffVarsYr:
                    for firmVar in firmVars:
                        tempData = goodsData[goodsData.famafrench == ind]
                        
                        i = i + 1
                        indVar = weatherVar + statVar + cutoffVar


                        print(outcomeVar, "~", indVar, "|", firmVar)


                        # find: concurrent ; or lagged supplier data
                        X = tempData.loc[:,((tempData.columns.str.contains(indVar)) |
                                          (tempData.columns.str.contains('indQtr_')) |
                                          # (goodsData.columns.str.contains('gvkey_'))) |   # &   
                                          # (goodsData.columns.str.contains('firmQtr_'))) |
                                          (tempData.columns.str.contains(firmVar)))] # |
                        '''(tempData.columns.str.contains('ageQtr_')) |
                          (tempData.columns.str.contains('sizeQtr_')) |
                          (tempData.columns.str.contains('profitQtr_'))]   #  & '''

                                          # (goodsData.columns.str.contains('firmQtr_')))       & 
                                        # ~(goodsData.columns.str.contains('lag4')) &
                                                                        # ~(goodsData.columns.str.contains('lag2')) & 


                        X = X[X.columns[(X.sum(axis = 0) >= 4)]]
                        # print(X.columns)
                        firms = tempData['gvkey']


                        y = tempData[outcomeVar]


                        ######################################
                        # fit the model on this subset
                        reg = linear_model.LinearRegression()
                        reg.fit(X,y)


                        # print('Coeff: ' , reg.coef_[0:5], 'SE type (looking >2): ', findSE(X,reg,y))
                        results.loc[i,'ind'] = ind


                        results.loc[i,'outcomeVar'] = outcomeVar
                        results.loc[i,'weatherVar'] = weatherVar
                        results.loc[i,'statVar']    = statVar
                        results.loc[i,'cutoffVar']  = cutoffVar
                        results.loc[i,'firmVar']    = firmVar


                        results.loc[i,'lag0']       = reg.coef_[0]
                        results.loc[i,'lag1']       = reg.coef_[1]
                        results.loc[i,'lag2']       = reg.coef_[2]
                        results.loc[i,'lag3']       = reg.coef_[3]
                        results.loc[i,'lag4']       = reg.coef_[4]



                        seratios = findSE(X,reg,y)

                        results.loc[i,'ratio0']       = seratios[0]
                        results.loc[i,'ratio1']       = seratios[1]
                        results.loc[i,'ratio2']       = seratios[2]
                        results.loc[i,'ratio3']       = seratios[3]
                        results.loc[i,'ratio4']       = seratios[4]

                        # print(results)

                        print(time.time() - start)

                        print('*******************************************************************')
                    
results.to_csv("../../data/results_notNormd.csv")


# merge in the industry names
conversionTable = pd.read_csv("../../data/indMapping.csv")
conversionTable.dropna(inplace=True)
conversionTable.reset_index(drop = True, inplace = True)

conversionTable.head()

results = results.merge(conversionTable)