In [None]:
import pickle
import pandas as pd
import matplotlib
import os
import re

import scipy

import collections
import datetime
import time

import geopandas as gpd

import numpy as np

from difflib import get_close_matches

from fuzzywuzzy import process
from fuzzywuzzy import fuzz
from sklearn import linear_model
import statsmodels.api as sm

from linearmodels import PanelOLS, FamaMacBeth
from scipy import stats

import itertools


# Grab the Company Weather-Location Data

In [None]:
# Load the goods panel and drop the 'Unnamed: 0' column that comes in with the CSV
# (presumably a saved index -- confirm against whatever wrote the file).
goodsData = pd.read_csv("../goodsData_igData.csv").drop(columns=['Unnamed: 0'])

# Keep only rows where both revenueChange and costChange are present.  The matching
# incomeChange filter is currently disabled, so rows with a missing incomeChange stay in.
hasRevenueAndCost = goodsData.revenueChange.notna() & goodsData.costChange.notna()

# .copy() detaches the filtered rows from the frame they were selected from, so that
# assigning columns on goodsData afterwards does not raise SettingWithCopyWarning.
goodsData = goodsData[hasRevenueAndCost].copy()

In [None]:
# Turn every column whose name contains '5Days_' into a 0/1 indicator:
# 1 where the value is strictly positive, 0 otherwise (a NaN compares False, so it becomes 0).
relevantVars = list(filter(lambda colName: '5Days_' in colName, goodsData.columns))
for var in relevantVars:
    isPositive = goodsData[var] > 0
    goodsData[var] = isPositive * 1

In [None]:
# Build the industry lookup (integer code -> short name) from the first column of the
# Fama-French listing file.
famafrench = pd.read_csv("../../data/famafrench.csv", header=None).iloc[:, 0]

# Drop every line that contains a '-' (presumably the SIC-range rows -- confirm against
# the file).  Missing values are dropped as well, exactly as the `== False` test did.
# NOTE(review): an industry header whose text contains '-' is dropped too; verify that
# no industry goes missing from the lookup.
keepLine = ~famafrench.str.contains("-", na=True)
famafrench = famafrench[keepLine].reset_index(drop=True)

# The first two space-separated tokens of each remaining line are the code and the name.
# .copy() makes the two-column slice its own frame before columns are assigned into it,
# which avoids SettingWithCopyWarning.
famafrench = famafrench.str.split(' ', expand=True).loc[:, 0:1].copy()

famafrench.columns = ['famafrench', 'industryName']
famafrench['famafrench'] = famafrench['famafrench'].astype('int64')

famafrench.head()

Figure out how many companies there are in the database in each industry.

In [None]:
# Number of goodsData rows per Fama-French code, largest first, with the industry name attached.
# NOTE(review): this counts rows; if a company appears in several rows it is counted once
# per row -- confirm this is the intended notion of "number of companies".
rowsPerIndustry = goodsData[['famafrench']].value_counts()
counts = rowsPerIndustry.reset_index(name='indCounts').merge(famafrench, on='famafrench')
counts.head(10)

The absolute number of companies experiencing the extreme is important, because it is the number of observations we'll have for each industry. So let's look at it by total companies first.

In [None]:
var = 'temp_annualquant_0.95'

# Sum each temperature-extreme column within every Fama-French code, then join on the
# per-industry row counts and the industry names.
extremes = (
    goodsData[['famafrench', var,
               'temp_annualquant_1xYr',
               'temp_annualquant_1x5Yrs',
               'temp_annualquant_1x10Yrs']]
    .groupby('famafrench')
    .sum()
    .reset_index()
    .merge(counts)
    .merge(famafrench)
)

# Ten industries with the largest 1x10Yrs totals.
extremes.sort_values(by='temp_annualquant_1x10Yrs', ascending=False).head(10)

In [None]:
var = 'precip_annualquant_0.95'

# Sum each precipitation-extreme column within every Fama-French code, then join on the
# per-industry row counts and the industry names.
extremes = (
    goodsData[['famafrench', var,
               'precip_annualquant_1xYr',
               'precip_annualquant_1x5Yrs',
               'precip_annualquant_1x10Yrs']]
    .groupby('famafrench')
    .sum()
    .reset_index()
    .merge(counts)
    .merge(famafrench)
)

# Ten industries with the largest totals of `var`.
# NOTE(review): the temperature table is ranked by its 1x10Yrs column, while this one is
# ranked by the 0.95 column -- confirm the difference is intended.
extremes.sort_values(by=var, ascending=False).head(10)

We can also normalize by the number of companies in each industry, as below.

In [None]:
var = 'temp_annualquant_1x10Yrs'

# Per-industry total of `var`, joined to the industry row counts and names.
extremes = (
    goodsData[['famafrench', var]]
    .groupby('famafrench')
    .sum()
    .reset_index()
    .merge(counts)
    .merge(famafrench)
)

# Normalise the total by the industry's row count (indCounts).
extremes['perCompany'] = extremes[var].div(extremes['indCounts'])

# Show every industry, highest normalised value first.
extremes.sort_values(by='perCompany', ascending=False)


# Extreme Extremes
By the time we get to 1/365 (once-a-year) or 1/(365·x) (once-in-x-years) events, we're effectively filtering on the couple of states that actually see extremes this high. This is slightly less true for precipitation than for temperature.

In [None]:
var = 'precip_annualquant_0.95'

# Sum each precipitation-extreme column within every state.
extremesByState = (
    goodsData[['state', var,
               'precip_annualquant_1xYr',
               'precip_annualquant_1x5Yrs',
               'precip_annualquant_1x10Yrs']]
    .groupby('state')
    .sum()
    .reset_index()
)

# Ten states with the largest 1x10Yrs totals.
extremesByState.sort_values(by='precip_annualquant_1x10Yrs', ascending=False).head(10)

In [None]:
var = 'temp_annualquant_0.95'

# Sum each temperature-extreme column within every state.
extremesByState = (
    goodsData[['state', var,
               'temp_annualquant_1xYr',
               'temp_annualquant_1x5Yrs',
               'temp_annualquant_1x10Yrs']]
    .groupby('state')
    .sum()
    .reset_index()
)

# Ten states with the largest 1x10Yrs totals.
extremesByState.sort_values(by='temp_annualquant_1x10Yrs', ascending=False).head(10)

# Seasonality?
My guess is that most of the hottest temperatures occur during the summer, with perhaps a few in the spring.

In [None]:
var = 'temp_annualquant_0.95'

# Sum each temperature-extreme column within every value of `qtr`.
extremesBySeason = (
    goodsData[['qtr', var,
               'temp_annualquant_1xYr',
               'temp_annualquant_1x5Yrs',
               'temp_annualquant_1x10Yrs']]
    .groupby(['qtr'])
    .sum()
    .reset_index()
)

# Up to twenty `qtr` values, ranked by their total of `var`.
extremesBySeason.sort_values(by=var, ascending=False).head(20)

In [None]:
var = 'precip_annualquant_0.95'

# Sum each precipitation-extreme column within every value of `qtr`.
extremesBySeason = (
    goodsData[['qtr', var,
               'precip_annualquant_1xYr',
               'precip_annualquant_1x5Yrs',
               'precip_annualquant_1x10Yrs']]
    .groupby(['qtr'])
    .sum()
    .reset_index()
)

# Up to twenty `qtr` values, ranked by their total of `var`.
extremesBySeason.sort_values(by=var, ascending=False).head(20)

We might also be interested in extremes by year, to see whether the counts of extremes show any trend over time.

# Correlations

In [None]:
# Preview the first rows of the industry lookup table (code and name columns).
famafrench.head(n=5)

In [None]:
# Attach the industry lookup to the goods panel (default merge: inner join on the columns
# the two frames share), then print the shapes before and after so that any rows lost or
# duplicated by the join are visible.
goodsWithIndName = pd.merge(goodsData, famafrench)
print(goodsData.shape, goodsWithIndName.shape)

In [None]:
# Within-industry correlation between the two precipitation measures.
# groupby(...).corr() yields one 2x2 block per industry; taking every second row and the
# last column keeps, for each industry, the entry that pairs the first measure with the second.
corrByIndustry = (
    goodsWithIndName
    .merge(famafrench)
    .groupby('industryName')[['precip_annualquant_0.95', 'precip_annualquant_1xYr']]
    .corr()
    .iloc[0::2, -1]
    .reset_index()
)

# NOTE(review): the first column holds the industryName group key even though it is
# labelled 'famafrench' here -- confirm before relying on it as an industry code.
corrByIndustry.columns = ['famafrench', 'varName', 'corr']

corrByIndustry.sort_values(by='corr', ascending=False).head(20)

In [None]:
# Within-industry correlation between the two temperature measures.
# groupby(...).corr() yields one 2x2 block per industry; taking every second row and the
# last column keeps, for each industry, the entry that pairs the first measure with the second.
corrByIndustry = (
    goodsWithIndName
    .merge(famafrench)
    .groupby('industryName')[['temp_annualquant_0.95', 'temp_annualquant_1xYr']]
    .corr()
    .iloc[0::2, -1]
    .reset_index()
)

# NOTE(review): the first column holds the industryName group key even though it is
# labelled 'famafrench' here -- confirm before relying on it as an industry code.
corrByIndustry.columns = ['famafrench', 'varName', 'corr']

corrByIndustry.sort_values(by='corr', ascending=False).head(20)