In [1]:
import pickle
import pandas as pd
import matplotlib
import os
import re

import scipy

import collections
import datetime
import time

import geopandas as gpd

import numpy as np

from difflib import get_close_matches

from fuzzywuzzy import process
from fuzzywuzzy import fuzz
from sklearn import linear_model
import statsmodels.api as sm

from linearmodels import PanelOLS, FamaMacBeth
from scipy import stats

import itertools


# Grab the Company Weather-Location Data

In [2]:
# Load the company weather/location panel. The CSV carries a leftover
# 'Unnamed: 0' column, which is dropped immediately.
# NOTE(review): the saved run printed a warning for this cell (its text is
# truncated in the stored output). If it is pandas' mixed-dtype warning,
# pass an explicit dtype= to read_csv -- TODO confirm.
goodsData = pd.read_csv("../goodsData_igData.csv").drop(columns=['Unnamed: 0'])

# Keep only rows where both revenueChange and costChange are present.
# incomeChange is not part of this filter.
goodsData = goodsData.dropna(subset=['revenueChange', 'costChange'])

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Turn every column whose name contains '5Days_' into a 0/1 indicator:
# 1 where the stored value is strictly positive, 0 otherwise.
relevantVars = [name for name in goodsData.columns if '5Days_' in name]
for var in relevantVars:
    goodsData[var] = goodsData[var].gt(0).mul(1)

In [98]:
# Read the industry listing as a single column of raw text (no header row).
famafrench = pd.read_csv("../../data/famafrench.csv", header=None).iloc[:, 0]

# Drop every line that contains a "-" (presumably the SIC-range detail lines
# -- verify against the file); missing entries are dropped as well.
famafrench = pd.DataFrame(
    famafrench[~famafrench.str.contains("-", na=True)]
).reset_index(drop=True)

# Split each remaining line on single spaces and keep the first two tokens:
# the numeric industry code and the short industry name.
famafrench = (
    famafrench[0]
    .str.split(' ', expand=True)
    .iloc[:, :2]
    .set_axis(['famafrench', 'industryName'], axis=1)
    .astype({'famafrench': 'int64'})
)

famafrench.head()

Unnamed: 0,famafrench,industryName
0,1,Agric
1,2,Food
2,3,Soda
3,4,Beer
4,5,Smoke


Count how many companies the database contains in each industry.

In [149]:
# Tally the rows of goodsData per Fama-French industry code, then attach the
# industry names from the lookup table.
# NOTE(review): this counts rows; it equals the number of companies only if
# each company appears once in goodsData -- verify.
counts = (
    goodsData[['famafrench']]
    .value_counts()
    .reset_index()
    .set_axis(['famafrench', 'indCounts'], axis=1)
    .merge(famafrench)
)

# value_counts sorts descending, so these are the ten largest industries.
counts.head(10)

Unnamed: 0,famafrench,indCounts,industryName
0,35,13818,Comps
1,48,9261,Other
2,13,6698,Drugs
3,37,6365,LabEq
4,31,4803,Util
5,32,4359,Telcm
6,12,3766,MedEq
7,42,3745,Rtail
8,43,3600,Meals
9,21,3536,Mach


The absolute number of companies experiencing an extreme matters, because that is the number of observations we'll have for each industry. So let's look at the totals first.

In [161]:
var = 'temp_annualquant_0.95'

# Sum each temperature-extreme indicator within every Fama-French industry,
# then attach the per-industry row counts and the industry names.
extremes = (
    goodsData
    .groupby('famafrench')[[var,
                            'temp_annualquant_1xYr',
                            'temp_annualquant_1x5Yrs',
                            'temp_annualquant_1x10Yrs']]
    .sum()
    .reset_index()
    .merge(counts)
    .merge(famafrench)
)

# Ten industries with the largest 1x10Yrs totals.
extremes.sort_values('temp_annualquant_1x10Yrs', ascending=False).head(10)

Unnamed: 0,famafrench,temp_annualquant_0.95,temp_annualquant_1xYr,temp_annualquant_1x5Yrs,temp_annualquant_1x10Yrs,indCounts,industryName
33,35,61608,4336,812,499,13818,Comps
6,7,16668,3039,602,418,1636,Fun
35,37,20710,1787,334,212,6365,LabEq
27,29,6337,1241,265,174,544,Coal
40,42,19838,1209,254,158,3745,Rtail
30,32,26360,1489,247,158,4359,Telcm
32,34,6450,1083,226,146,1020,BusSv
39,41,16341,1242,234,132,2596,Whlsl
43,48,40189,1525,215,130,9261,Other
16,17,11130,862,179,118,2211,BldMt


In [163]:
var = 'precip_annualquant_0.95'

# Sum each precipitation-extreme indicator within every Fama-French industry,
# then attach the per-industry row counts and the industry names.
extremes = (
    goodsData
    .groupby('famafrench')[[var,
                            'precip_annualquant_1xYr',
                            'precip_annualquant_1x5Yrs',
                            'precip_annualquant_1x10Yrs']]
    .sum()
    .reset_index()
    .merge(counts)
    .merge(famafrench)
)

# Ten industries with the largest 0.95-quantile totals.
extremes.sort_values(var, ascending=False).head(10)

Unnamed: 0,famafrench,precip_annualquant_0.95,precip_annualquant_1xYr,precip_annualquant_1x5Yrs,precip_annualquant_1x10Yrs,indCounts,industryName
33,35,62820,4245,905,568,13818,Comps
43,48,48706,3519,788,518,9261,Other
12,13,29409,1854,322,174,6698,Drugs
29,31,24006,3072,1092,777,4803,Util
35,37,23052,1451,337,186,6365,LabEq
30,32,21319,1585,430,269,4359,Telcm
40,42,19721,1563,424,279,3745,Rtail
41,43,19685,1459,330,200,3600,Meals
42,46,17723,1338,285,185,3124,RlEst
20,21,17395,1177,250,172,3536,Mach


We can also imagine normalizing by number of companies, as below.

In [110]:
var = 'temp_annualquant_1x10Yrs'

# Per-industry total of the 1x10Yrs temperature indicator, joined with the
# industry row counts and names, plus the total divided by the row count.
extremes = (
    goodsData
    .groupby('famafrench')[[var]]
    .sum()
    .reset_index()
    .merge(counts)
    .merge(famafrench)
    .assign(perCompany=lambda frame: frame[var] / frame['indCounts'])
)

# Rank industries by the normalised rate, highest first.
extremes.sort_values('perCompany', ascending=False)


Unnamed: 0,famafrench,temp_annualquant_1x10Yrs,indCounts,industryName,perCompany
27,29,174,544,Coal,0.319853
6,7,418,1636,Fun,0.255501
32,34,146,1020,BusSv,0.143137
22,23,89,848,Autos,0.104953
2,3,30,377,Soda,0.079576
17,18,84,1559,Cnstr,0.053881
16,17,118,2211,BldMt,0.05337
39,41,132,2596,Whlsl,0.050847
5,6,31,651,Toys,0.047619
8,9,61,1367,Hshld,0.044623


# Extreme Extremes
By the time we get to the 1/365 or 1/(365·x) events (the once-per-year and once-per-x-years extremes), we're effectively filtering on the couple of states that actually see extremes this high. This is slightly less true for precipitation than for temperature.

In [151]:
var = 'precip_annualquant_0.95'

# Total each precipitation-extreme indicator by state.
extremesByState = (
    goodsData
    .groupby('state')[[var,
                       'precip_annualquant_1xYr',
                       'precip_annualquant_1x5Yrs',
                       'precip_annualquant_1x10Yrs']]
    .sum()
    .reset_index()
)

# Ten states with the largest 1x10Yrs totals.
extremesByState.sort_values('precip_annualquant_1x10Yrs', ascending=False).head(10)

Unnamed: 0,state,precip_annualquant_0.95,precip_annualquant_1xYr,precip_annualquant_1x5Yrs,precip_annualquant_1x10Yrs
41,TX,55953,8522,3073,2127
8,FL,31055,2650,726,466
32,NY,56633,3259,459,376
12,IL,22898,1503,329,228
36,PA,23545,1340,299,190
16,LA,6284,846,270,182
29,NJ,24761,1648,289,170
43,VA,15195,1078,292,170
18,MD,10831,812,239,155
40,TN,10917,781,199,132


In [152]:
var = 'temp_annualquant_0.95'

# Total each temperature-extreme indicator by state.
extremesByState = (
    goodsData
    .groupby('state')[[var,
                       'temp_annualquant_1xYr',
                       'temp_annualquant_1x5Yrs',
                       'temp_annualquant_1x10Yrs']]
    .sum()
    .reset_index()
)

# Ten states with the largest 1x10Yrs totals.
extremesByState.sort_values('temp_annualquant_1x10Yrs', ascending=False).head(10)

Unnamed: 0,state,temp_annualquant_0.95,temp_annualquant_1xYr,temp_annualquant_1x5Yrs,temp_annualquant_1x10Yrs
2,AZ,52287,15731,3250,2057
31,NV,22860,5807,1154,772
3,CA,44670,1367,140,97
34,OK,11286,501,104,80
42,UT,7252,191,29,12
1,AR,4591,80,10,8
41,TX,189552,3749,88,7
45,WA,1948,30,1,1
14,KS,2896,74,2,1
0,AL,3390,0,0,0


# Seasonality?
Guessing most of the hottest temperatures occur during the summer, maybe with some in the spring and whatnot.

In [153]:
var = 'temp_annualquant_0.95'

# Total each temperature-extreme indicator by quarter.
extremesBySeason = (
    goodsData
    .groupby(['qtr'])[[var,
                       'temp_annualquant_1xYr',
                       'temp_annualquant_1x5Yrs',
                       'temp_annualquant_1x10Yrs']]
    .sum()
    .reset_index()
)

# Quarters ordered by the 0.95-quantile total, largest first.
extremesBySeason.sort_values(var, ascending=False).head(20)

Unnamed: 0,qtr,temp_annualquant_0.95,temp_annualquant_1xYr,temp_annualquant_1x5Yrs,temp_annualquant_1x10Yrs
2,3,375017,20762,3427,2125
1,2,115836,6998,1351,910
3,4,13003,101,0,0
0,1,266,0,0,0


In [154]:
var = 'precip_annualquant_0.95'

# Total each precipitation-extreme indicator by quarter.
extremesBySeason = (
    goodsData
    .groupby(['qtr'])[[var,
                       'precip_annualquant_1xYr',
                       'precip_annualquant_1x5Yrs',
                       'precip_annualquant_1x10Yrs']]
    .sum()
    .reset_index()
)

# Quarters ordered by the 0.95-quantile total, largest first.
extremesBySeason.sort_values(var, ascending=False).head(20)

Unnamed: 0,qtr,precip_annualquant_0.95,precip_annualquant_1xYr,precip_annualquant_1x5Yrs,precip_annualquant_1x10Yrs
1,2,133806,10014,2107,1230
2,3,124662,11353,3327,2312
3,4,116801,8389,1948,1210
0,1,107827,5139,759,389


We might also be interested in extremes by year, to see whether the counts of extremes show any trend over time.