In [None]:
import pickle
import pandas as pd
import matplotlib
import os
import re

import scipy

import collections
import datetime
import time

import geopandas as gpd

import numpy as np
 
from difflib import get_close_matches

from fuzzywuzzy import process
from fuzzywuzzy import fuzz
from sklearn import linear_model
import statsmodels.api as sm

from linearmodels import PanelOLS, FamaMacBeth
from scipy import stats

import itertools


## Grab Data

In [None]:
# Read the stocks-with-controls-and-weather CSV, then discard the
# 'Unnamed: 0' column (presumably an index that was saved with the file).
stockData = pd.read_csv("../../data/companyData/stocksWithControlsWeather.csv")
stockData = stockData.drop(columns=['Unnamed: 0'])

stockData.head()

In [None]:
# Composite string keys: famafrench code + quarter, and year + quarter.
# Both are plain string concatenations with no separator.
stockData = stockData.assign(
    indQtr=stockData['famafrench'].astype('str') + stockData['qtr'].astype('str'),
    yearQtr=stockData['year'].astype('str') + stockData['qtr'].astype('str'),
)

In [None]:
# Keep the firm identifiers in their own Series before the frame is expanded;
# they are used as the cluster groups when models are fitted.
firms = stockData['gvkey']

# Expand the categorical keys into indicator columns.
# dtype is pinned to uint8 (the pandas < 2.0 default). pandas >= 2.0 would
# otherwise create bool columns, and a design matrix mixing bool dummies with
# integer regressors is converted to an object array that statsmodels rejects.
stockData = pd.get_dummies(
    stockData,
    columns=['year', 'yearQtr', 'indQtr', 'famafrench'],  # , 'gvkey'
    dtype=np.uint8,
)

In [None]:
# Display the first rows of stockData to check its current columns.
stockData.head()

In [None]:
# First fifty column names of the frame.
stockData.columns[:50]

In [None]:
# 0/1 event indicators built from the *Last5 weather columns:
#   heatwave      -> the temperature column equals 5
#   extremePrecip -> the precipitation column is above 0
# Multiplying the boolean comparison by 1 yields integer 0/1 values.
stockData['heatwave_annual']          = stockData['temp_annualLast5'].eq(5) * 1
stockData['extremePrecip_annual']     = stockData['precip_annualLast5'].gt(0) * 1

stockData['heatwave_zipQuarter']      = stockData['temp_zipQuarterLast5'].eq(5) * 1
stockData['extremePrecip_zipQuarter'] = stockData['precip_zipQuarterLast5'].gt(0) * 1

indVars    = ['heatwave_annual', 'extremePrecip_annual', 'heatwave_zipQuarter', 'extremePrecip_zipQuarter']
outcomeVar = 'RET'


# One regression per indicator: outcome ~ indicator + year-quarter dummies.
for indVar in indVars:
    print(outcomeVar, "~", indVar)

    # Move the indicator to the front of stockData so that it is also the
    # first column of the design matrix (and the first row of the summary).
    stockData.insert(0, indVar, stockData.pop(indVar))

    # Design matrix: every column whose name contains indVar, plus every
    # 'yearQtr_' dummy.
    matchesIndVar = stockData.columns.str.contains(indVar)
    isYearQtrFE   = stockData.columns.str.contains('yearQtr_')
    X = stockData.loc[:, matchesIndVar | isYearQtrFE]

    print(X.columns)

    y = stockData[outcomeVar]

    # An sklearn alternative was sketched here previously and left disabled:
    #     reg = linear_model.LinearRegression()
    #     reg.fit(X,y)
    #     reg.coef_[-1]

    # statsmodels OLS with standard errors clustered on `firms`; the fit is
    # timed and the elapsed seconds printed after the summary.
    start = time.time()

    model = sm.OLS(y, X).fit(
        cov_type='cluster',
        cov_kwds={'groups': firms},
        use_t=True,
    )
    print(model.summary())

    print(time.time() - start)

In [None]:
# Column names of X as left by the most recent assignment (the last loop
# iteration of the regression cell, when cells are run in order).
X.columns

------------------------------------------------------

# Direct Effects
Look at the effects on the suppliers when they're affected directly.

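The cell below fits each specification with statsmodels OLS. The firm-clustered covariance option is currently commented out in the `fit` call, so the reported p-values come from the default (non-clustered) standard errors.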

Get the weather data to start.

In [None]:
# First fifteen column names of the frame.
stockData.columns[:15]

In [None]:
# Number of rows with a missing RET value.
stockData['RET'].isna().sum()

In [None]:
# Summary statistics of the outcome column.
stockData['RET'].describe()

In [None]:
weatherVars  = ['precip_', 'temp_']
statVarsYr   = ['zipQuarterLast5', 'annualLast5']
outcomeVars  = ['RET']

# Number of coefficient / p-value pairs stored per specification
# (result columns lag0..lag4 and pval0..pval4).
nLags = 5

start = time.time()

results = pd.DataFrame()

i = 0

for outcomeVar in outcomeVars:
    for weatherVar in weatherVars:
        for statVar in statVarsYr:

            i = i + 1

            indVar = weatherVar + statVar

            print(outcomeVar, "~", indVar)

            # Design matrix: every column whose name contains indVar, plus the
            # 'indQtr_' dummies.
            isWeatherCol = stockData.columns.str.contains(indVar)
            isIndQtrFE   = stockData.columns.str.contains('indQtr_')
            weatherCols  = stockData.columns[isWeatherCol]

            if len(weatherCols) < nLags:
                raise ValueError(
                    "expected at least %d columns matching %r, found %d"
                    % (nLags, indVar, len(weatherCols))
                )

            X = stockData.loc[:, isWeatherCol | isIndQtrFE]
            y = stockData[outcomeVar]

            # Plain OLS. The firm-clustered variant is deliberately left
            # disabled here:
            #   fit(cov_type='cluster', cov_kwds={'groups': firms}, use_t=True)
            model = sm.OLS(y, X).fit()

            # Select the weather coefficients by column name, not by position,
            # so they stay correct whatever the column order of stockData is.
            coeff = model.params.loc[weatherCols]
            pvals = model.pvalues.loc[weatherCols]

            results.loc[i, 'outcomeVar'] = outcomeVar
            results.loc[i, 'weatherVar'] = weatherVar
            results.loc[i, 'statVar']    = statVar

            # .iloc is explicit positional access; plain coeff[0] on a
            # label-indexed Series is deprecated in recent pandas.
            # NOTE(review): the k-th matching column is stored as "lag k",
            # which assumes the columns are ordered lag0..lag4 - confirm.
            for lag in range(nLags):
                results.loc[i, 'lag' + str(lag)] = coeff.iloc[lag]

            for lag in range(nLags):
                results.loc[i, 'pval' + str(lag)] = pvals.iloc[lag]

            # Checkpoint the table after every specification.
            results.to_csv("../../data/stocks_rightInds_noCtrls.csv")

            print(time.time() - start)

In [None]:
# Display the current contents of the results table.
results

In [None]:
# Write the results table to a second CSV file.
# NOTE(review): the loop cell above already saves `results` to
# stocks_rightInds_noCtrls.csv; confirm that this additional
# "utilitiesResults" export is still wanted.
results.to_csv("../../data/utilitiesResults_rightInds.csv")

# Faster and More Heuristic

The below gives us unclustered standard errors, output to a csv file.

In [None]:
def findSE(X, reg, y, n_coefs=6):
    """Return |coefficient / standard error| for the leading coefficients.

    The standard errors are the classical (homoskedastic, unclustered) OLS
    ones: sigma^2 * (Z'Z)^-1, where Z is X with a leading column of ones.

    Parameters
    ----------
    X : pandas.DataFrame
        Regressors the model was fitted on (without an intercept column).
    reg : fitted estimator
        Object exposing ``predict(X)`` and ``coef_``. It is assumed to have
        been fitted with an intercept, as ``LinearRegression`` does by default.
    y : pandas.Series
        Outcome values aligned with the rows of X.
    n_coefs : int, default 6
        Number of leading coefficients to report. If X has fewer columns, one
        ratio per available column is returned instead of raising.

    Returns
    -------
    list of float
        Absolute t-ratios for the first ``n_coefs`` columns of X, in order.

    Raises
    ------
    numpy.linalg.LinAlgError
        If Z'Z is singular (e.g. a full set of dummies plus the intercept).
    """
    n_obs = len(X)
    n_params = len(X.columns) + 1  # plus one because LinearRegression adds an intercept term

    # Builtin float replaces the np.float alias, which was removed from NumPy.
    design = np.empty(shape=(n_obs, n_params), dtype=float)
    design[:, 0] = 1
    design[:, 1:] = X.values

    residuals = y.values - reg.predict(X)
    residual_sum_of_squares = residuals.T @ residuals
    sigma_squared_hat = residual_sum_of_squares / (n_obs - n_params)
    var_beta_hat = np.linalg.inv(design.T @ design) * sigma_squared_hat

    # Entry 0 of the diagonal belongs to the intercept; skip it so that
    # std_errs[k] lines up with reg.coef_[k].
    std_errs = np.sqrt(np.diag(var_beta_hat))[1:]

    n_report = min(n_coefs, len(std_errs))
    return [abs(reg.coef_[k] / std_errs[k]) for k in range(n_report)]


In [None]:
# try this by industry
weatherVars  = ['precip_', 'temp_']
statVarsYr   = ['zipQuarterLast5', 'annualLast5']
outcomeVars  = ['RET']

nLags     = 5   # coefficients stored per specification (lag0..lag4)
minColSum = 4   # dummy columns whose values sum to less than this are dropped


start = time.time()


results = pd.DataFrame()
i = 0
for outcomeVar in outcomeVars:
    for weatherVar in weatherVars:
        for statVar in statVarsYr:

            i = i + 1
            indVar = weatherVar + statVar

            print(outcomeVar, "~", indVar)

            # Design matrix: every column whose name contains indVar, plus the
            # 'indQtr_' dummies.
            X = stockData.loc[:, stockData.columns.str.contains(indVar) |
                                 stockData.columns.str.contains('indQtr_')]

            # Drop sparse columns, but never the weather regressors. If one of
            # them were filtered out, the positions read below would silently
            # point at fixed-effect coefficients.
            isWeatherCol = X.columns.str.contains(indVar)
            keepCol = (X.sum(axis=0) >= minColSum).values | isWeatherCol
            X = X.loc[:, keepCol]

            isWeatherCol = X.columns.str.contains(indVar)
            if isWeatherCol.sum() < nLags:
                raise ValueError(
                    "expected at least %d columns matching %r, found %d"
                    % (nLags, indVar, isWeatherCol.sum())
                )

            y = stockData[outcomeVar]

            ######################################
            # fit the model on this subset
            reg = linear_model.LinearRegression()
            reg.fit(X, y)

            # Coefficients of the weather columns only, in column order.
            # NOTE(review): the k-th matching column is stored as "lag k",
            # which assumes the columns are ordered lag0..lag4 - confirm.
            coeff = reg.coef_[isWeatherCol]

            results.loc[i, 'outcomeVar'] = outcomeVar
            results.loc[i, 'weatherVar'] = weatherVar
            results.loc[i, 'statVar']    = statVar

            for lag in range(nLags):
                results.loc[i, 'lag' + str(lag)] = coeff[lag]

            # The |coef / SE| ratios from findSE were stored here previously
            # and are left disabled:
            #     seratios = findSE(X,reg,y)
            #     results.loc[i,'ratio0'] = seratios[0]
            #     results.loc[i,'ratio1'] = seratios[1]
            #     results.loc[i,'ratio2'] = seratios[2]
            #     results.loc[i,'ratio3'] = seratios[3]
            #     results.loc[i,'ratio4'] = seratios[4]

            # print(results)

            print(time.time() - start)

            print('*******************************************************************')

results.to_csv("../../data/stockResults_notNormd.csv")

In [None]:
# Display the current contents of the results table.
results

# Indirect Effects
This is almost exactly the same but with supplier information in place of the direct company information.

In [None]:
cutoffVarsYr = ['1xYr'] #, ] # ,'1xQtr''1x5Qtrs',
weatherVars  = ['precip_', 'temp_', 'precip5Days_', 'temp5Days_'] #[,]
statVarsYr   = ['annualquant_']  #,'zipQuarterquant_']
outcomeVars  = ['costChange', 'revenueChange', 'lnRevNormd', 'lnCostNormd'] # ['revenueChange'] #[, 'costChange']#,'lnCost','lnInc','lnRev']

# NOTE(review): goodsData is not created in any of the cells shown above, so
# it must already exist in the kernel; confirm where it is loaded.
# Keep only rows where lnRev, lnCost and lnCostNormd are all present. The
# explicit .copy() makes the filtered frame independent, so the column
# assignment below does not trigger SettingWithCopyWarning.
goodsData = goodsData[~goodsData.lnRev.isna() & ~goodsData.lnCost.isna() & ~goodsData.lnCostNormd.isna()].copy()
goodsData['scTercile']  = pd.qcut(goodsData['suppliers'], 3, labels=False, duplicates = 'drop')


start = time.time()

for outcomeVar in outcomeVars:
    for weatherVar in weatherVars:
        for statVar in statVarsYr:
            for cutoffVar in cutoffVarsYr:
                indVar = weatherVar + statVar + cutoffVar

                print(outcomeVar, "~", indVar)

                goodsCols = goodsData.columns

                # Supplier weather regressors for this specification.
                isSupplierWeather = goodsCols.str.contains(indVar) & goodsCols.str.contains('supplier')

                # Fixed-effect / control dummies plus the raw 'suppliers' column.
                isControl = (goodsCols.str.contains('indQtr_') |
                             goodsCols.str.contains('gvkey_') |
                             goodsCols.str.contains('ageQtr_') |
                             goodsCols.str.contains('sizeQtr_') |
                             goodsCols.str.contains('profitQtr_') |
                             (goodsCols == 'suppliers'))
                             # goodsCols.str.contains('firmQtr_')

                # Reference categories left out of the dummy sets ("1 minus the FEs").
                # NOTE(review): str.contains matches substrings, so these
                # patterns also remove any other column whose name merely
                # contains 'gvkey_1013' or 'indQtr_12010_1' - confirm that
                # only one column per set is meant to be dropped.
                isOmittedFE = (goodsCols.str.contains('gvkey_1013') |
                               goodsCols.str.contains('indQtr_12010_1'))
                               # goodsCols.str.contains('lag4')

                X = goodsData.loc[:, (isSupplierWeather | isControl) & ~isOmittedFE]

                firms = goodsData['gvkey']

                y = goodsData[outcomeVar]

                # OLS with standard errors clustered on firm (gvkey).
                results = sm.OLS(y, X).fit(cov_type='cluster',cov_kwds={'groups': firms},use_t=True)

                # Report the supplier weather terms, selected by column name
                # rather than by assuming they are the first rows of the
                # parameter vector.
                reportCols = goodsCols[goodsCols.str.contains(indVar) & goodsCols.str.contains('supplier_')]
                pvals = results.pvalues.loc[reportCols]
                coeff = results.params.loc[reportCols]
                print(coeff)
                print(pvals)

                print( time.time() - start)
                
                
                

In [None]:
# Specification grid for the fast (sklearn) version of the supplier regressions.
cutoffVarsYr = ['1xYr']            # other cutoffs tried: '1xQtr', '1x5Qtrs'
weatherVars  = ['precip_', 'temp_', 'precip5Days_', 'temp5Days_']
statVarsYr   = ['annualquant_']    # also available: 'zipQuarterquant_'
outcomeVars  = ['revenueChange',  'costChange']
# other outcomes used elsewhere: 'incomeChange', 'lnRevNormd', 'lnCostNormd',
# 'lnCost', 'lnInc', 'lnRev'

# NOTE(review): goodsData is not created in any of the cells shown above -
# confirm where it is loaded.
# Keep only rows where lnRev, lnCost and lnCostNormd are all present.
hasAllOutcomes = goodsData.lnRev.notna() & goodsData.lnCost.notna() & goodsData.lnCostNormd.notna()
goodsData = goodsData[hasAllOutcomes]


start = time.time()

# itertools.product walks the grid in the same order as four nested loops.
for outcomeVar, weatherVar, statVar, cutoffVar in itertools.product(
        outcomeVars, weatherVars, statVarsYr, cutoffVarsYr):

    indVar = weatherVar + statVar + cutoffVar
    supplierIndVar = 'supplier_' + indVar

    print(outcomeVar, "~", indVar, supplierIndVar)

    colNames = goodsData.columns

    # Columns to include: supplier weather terms for this specification,
    # industry-quarter dummies and firm dummies.
    isWanted = ((colNames.str.contains(indVar) & colNames.str.contains('supplier'))
                | colNames.str.contains('indQtr_')
                | colNames.str.contains('gvkey_'))

    # Reference categories left out of the dummy sets ("1 minus the FEs").
    # NOTE(review): these are substring matches, so any other column name
    # containing 'gvkey_1045' or 'indQtr_12013_1' is excluded too - confirm.
    isOmitted = (colNames.str.contains('gvkey_1045')
                 | colNames.str.contains('indQtr_12013_1'))

    X = goodsData.loc[:, isWanted & ~isOmitted]

    print(X.columns[0:5])

    # Keep only columns whose values sum to at least 4.
    # NOTE(review): this filter applies to the weather columns as well; if one
    # is dropped, the leading coefficients printed below shift - confirm.
    denseCols = X.columns[X.sum(axis=0) >= 4]
    X = X[denseCols]

    firms = goodsData['gvkey']
    y = goodsData[outcomeVar]

    # Fit with sklearn and print the first five coefficients next to the
    # |coef / SE| ratios computed by findSE.
    reg = linear_model.LinearRegression()
    reg.fit(X, y)

    print('Coeff: ' , reg.coef_[0:5], 'SE type (looking >2): ', findSE(X,reg,y))

    print(time.time() - start)

    print('*******************************************************************')
                


# Quarterly-Specific Effects
Look at what is going on within a quarter, for a specific zipcode. 

Note: this is effectively deprecated at this point because the zip-level effects mask the differences in the underlying weather distributions, i.e., some regions would likely benefit from getting warmer. If we want to do a zip-level study, we should also include some sort of threshold here.

Do this for all qtrs.