In [74]:
import pickle as pkl
import pandas as pd
import matplotlib
import os
import re

import collections
import datetime
import time

import matplotlib.pyplot as plt
import seaborn as sns
    
import geopandas as gpd

import numpy as np

import rasterio

from difflib import get_close_matches

from fuzzywuzzy import process
from fuzzywuzzy import fuzz

import dask.dataframe as dd

import gc
import sys

from multiprocessing import Pool
import multiprocessing

from dask.diagnostics import ProgressBar


# Sample

In [2]:
def getData(weatherType, yearRange1, yearRange2,
            pathPrefix="../../../../../../../Volumes/backup2/dissData/prism/zipcode"):
    """Lazily load daily zip-code weather for the years yearRange1..yearRange2 (inclusive).

    One CSV per year is read from ``pathPrefix + weatherType + <year> + ".csv"``.
    Only the ZIP, date and weather columns are kept, ZIP is cast to int64, the
    yearly frames are stacked in year order, and rows whose weather value is
    missing are dropped.

    Parameters
    ----------
    weatherType : str
        "Tmax" selects the "temperature" column; any other value selects
        "precipitation".
    yearRange1, yearRange2 : int
        First and last year to load. yearRange1 is always loaded, even when
        yearRange2 is smaller.
    pathPrefix : str, optional
        Directory plus file-name stem that precedes weatherType in each file name.

    Returns
    -------
    dask.dataframe.DataFrame
        A lazy frame with columns ['ZIP', 'date', <weather column>]; nothing is
        computed here.
    """
    if weatherType == "Tmax":
        weatherVar = "temperature"
    else:
        weatherVar = "precipitation"

    def _readYear(year):
        # assume_missing=True makes dask read integer-looking columns as float,
        # hence the explicit cast of ZIP back to int64.
        filename = pathPrefix + weatherType + str(year) + ".csv"
        yearly = dd.read_csv(filename, assume_missing=True)[['ZIP', 'date', weatherVar]]
        yearly['ZIP'] = yearly.ZIP.astype('int64')
        return yearly

    years = [yearRange1] + list(range(yearRange1 + 1, yearRange2 + 1))

    # dd.concat replaces the repeated DataFrame.append calls, which newer dask
    # releases no longer provide.
    data = dd.concat([_readYear(year) for year in years])

    data = data[~(data[weatherVar].isna())]

    return(data)


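I think only a few of these quantiles will be relevant. Let's look at 0, 0.9, 0.95, 1.0. (Note: the list in the next cell currently also includes 0.05 and uses 0.9978 and 0.9995 in place of 1.0.)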

In [3]:
# Cumulative-probability cut points; the quantile values at these points are
# later used as bin edges for pd.cut.
quantiles = [
    0.0,
    0.05,
    0.90,
    0.95,
    0.9978,   # presumably ~1 - 1/450, i.e. once per five 90-day quarters — TODO confirm
    0.9995,   # presumably ~1 - 1/1825, i.e. once per five 365-day years — TODO confirm
]

# One label per interval between consecutive cut points (six edges, five bins).
# The rarest bins are expressed per year-quarter, which is about 90 days long
# rather than 365.
quant_labels = [
    'quant_0.05',
    'quant_tossThisOne',
    'quant_0.95',
    'quant_1x5Qtrs',
    'quant_1x5Yrs',
]

# Precipitation

In [4]:
weatherType = "Precip" # Tmax
precipData = getData(weatherType, 1981, 2008)

In [5]:
precipData['quarter']    = dd.to_datetime(precipData['date'], format='%Y%m%d').dt.quarter

In [6]:
precip = precipData.compute(workers = 100)
precip.shape

(333972912, 4)

In [7]:
precip.to_csv("../../data/companyData/allZipsPrecip.csv")

In [11]:
del precip
gc.collect()

470

## Temperature

In [8]:
weatherType = "Tmax"
tempData = getData(weatherType, 1981, 2008)

In [9]:
tempData['quarter']  = dd.to_datetime(tempData['date'], format='%Y%m%d').dt.quarter

In [12]:
temp = tempData.compute(workers = 100)
temp.shape

(333972912, 4)

In [13]:
temp.to_csv("../../data/companyData/allZipsTemp.csv")

In [14]:
del temp
gc.collect()

40

## Merge and save
Merge these two and save them as one CSV that we can read in and reindex. I think the above approach reads in all the files separately, so we get a multi-index problem.

In [None]:
# Copy the precipitation column onto the temperature frame (the assignment aligns on the index).
# NOTE(review): `precip` and `temp` are both removed with `del` in earlier cells, so on a
# top-to-bottom run this cell raises NameError — confirm whether it should run before
# those `del` cells or be removed.
temp['precipitation'] = precip.precipitation
temp.head()

In [None]:
temp.to_csv("../../data/companyData/allZipsTemp.csv")

# Load all data
## Get stats

### Describe Climate

In [81]:
# Re-load the full-history extracts from disk. 'Unnamed: 0' is the index column
# that DataFrame.to_csv wrote alongside the data.
weatherData = dd.read_csv("../../data/companyData/allZipsTemp.csv").drop(columns = ['Unnamed: 0'])
precipData  = dd.read_csv("../../data/companyData/allZipsPrecip.csv").drop(columns = ['Unnamed: 0'])
with ProgressBar():
    precipData  = precipData.repartition(npartitions=225)
    # Repartition the frame that was just read. This line used to repartition the
    # lazy `tempData` from the earlier cells instead, which silently discarded the
    # CSV read above.
    weatherData = weatherData.repartition(npartitions=225)
# NOTE(review): the column-wise assign in the next cell presumably relies on both
# frames having the same row order and partition boundaries — confirm.
# weatherData = tempData.merge(precipData)

In [82]:
# tempData['precipitation'] = precipData.precipitation
weatherData = weatherData.assign(precipitation=precipData.precipitation)
weatherData.head()

Unnamed: 0,ZIP,date,temperature,quarter,precipitation
0,1001,19810101.0,-5.148,1,0.0
1,1002,19810101.0,-5.678,1,0.0
2,1003,19810101.0,-5.46,1,0.0
3,1005,19810101.0,-6.078,1,0.0
4,1007,19810101.0,-5.815,1,0.0


In [52]:
start = time.time()

# tempData['temp5Days']     = tempData.groupby(by='ZIP').apply(lambda df_g: df_g['temperature'].rolling(5).mean(), meta=('temperature', 'f8'))
# precipData['precip5Days']    = precipData.groupby(by='ZIP').apply(lambda df_g: df_g['precipitation'].rolling(5).mean(), meta=('precipitation', 'f8'))
# precipData['precipQtrMean']  = precipData.groupby(by='quarter').apply(lambda df_g: df_g['precipitation'].mean(), meta=('precipitation', 'f8'))

print(time.time() - start)

0.00014066696166992188


In [95]:
# Mean and variance of daily temperature and precipitation per calendar quarter,
# pooled over every row of weatherData.
# Only the two weather columns are aggregated; the other columns' statistics were
# computed and then thrown away before.
overallStatColumns = ['temperature', 'precipitation']
byQuarter = weatherData.groupby('quarter')[overallStatColumns]

# num_workers is the keyword dask's local schedulers read; `workers=` is not.
quarterlyAvg = byQuarter.mean().compute(num_workers = 100).reset_index().\
    rename(columns = {'temperature': 'quarterly_avg_temp',
                      'precipitation': 'quarterly_avg_precip'})

quarterlyVar = byQuarter.var().compute(num_workers = 100).reset_index().\
    rename(columns = {'temperature': 'quarterly_var_temp',
                      'precipitation': 'quarterly_var_precip'})
print(quarterlyVar,quarterlyAvg)


# Both frames share only the 'quarter' column, so the default merge joins on it.
quarterlyStatsOverall = quarterlyVar.merge(quarterlyAvg)

   quarter  quarterly_var_temp  quarterly_var_precip
0        1           90.610895             55.668947
1        2           56.110724             55.242613
2        3           28.900500             56.412663
3        4           90.470189             57.189735    quarter  quarterly_avg_temp  quarterly_avg_precip
0        1            8.435430              2.614472
1        2           22.821419              2.687989
2        3           28.293629              2.790931
3        4           13.120395              2.766938


In [175]:
weatherData.groupby(['ZIP','quarter']).percentile([0.5])

AttributeError: 'Column not found: percentile'

In [96]:
# Mean and variance of daily temperature and precipitation for every
# (ZIP, calendar quarter) pair.
# Only the two weather columns are aggregated instead of every column.
zipQuarterStatColumns = ['temperature', 'precipitation']
byZipQuarter = weatherData.groupby(['ZIP','quarter'])[zipQuarterStatColumns]

# num_workers is the keyword dask's local schedulers read; `workers=` is not.
quarterlyAvgByZip = byZipQuarter.mean().compute(num_workers = 100).reset_index().\
    rename(columns = {'temperature': 'quarterly_avg_temp',
                      'precipitation': 'quarterly_avg_precip'})

quarterlyVarByZip = byZipQuarter.var().compute(num_workers = 100).reset_index().\
    rename(columns = {'temperature': 'quarterly_var_temp',
                      'precipitation': 'quarterly_var_precip'})
print(quarterlyVarByZip,quarterlyAvgByZip)


# The shared columns are ZIP and quarter, so the default merge joins on both.
quarterlyStatsByZip = quarterlyAvgByZip.merge(quarterlyVarByZip)

          ZIP  quarter  quarterly_var_temp  quarterly_var_precip
0        1001        1           42.033604             52.755125
1        1002        1           41.171977             56.631223
2        1003        1           41.356117             56.334398
3        1005        1           41.334653             52.233328
4        1007        1           41.264102             57.015382
...       ...      ...                 ...                   ...
130619  99363        4           60.124202             67.076533
130620  99371        4           63.550457             64.528976
130621  99401        4           52.100346             78.005397
130622  99402        4           49.997708             75.491973
130623  99403        4           52.217880             69.550950

[130624 rows x 4 columns]           ZIP  quarter  quarterly_avg_temp  quarterly_avg_precip
0        1001        1            4.253855              2.372867
1        1002        1            2.991732              2.41859

In [97]:
quarterlyStatsOverall.to_csv("../../data/companyData/quarterlyStatsOverall_allZips.csv")
quarterlyStatsByZip.to_csv("../../data/companyData/quarterlyStatsByZip_allZips.csv")

In [98]:
quarterlyStatsByZip.head()

Unnamed: 0,ZIP,quarter,quarterly_avg_temp,quarterly_avg_precip,quarterly_var_temp,quarterly_var_precip
0,1001,1,4.253855,2.372867,42.033604,52.755125
1,1002,1,2.991732,2.418594,41.171977,56.631223
2,1003,1,3.906219,2.519859,41.356117,56.334398
3,1005,1,2.10741,2.463722,41.334653,52.233328
4,1007,1,3.398788,2.501641,41.264102,57.015382


In [None]:
del quarterlyStatsByZip
gc.collect()

### Find Quartiles

In [112]:
def getPivotQuantsOverall(weatherType, weatherData, quantileLevels=None):
    """Return the quantile values of one weather column over the whole frame.

    Parameters
    ----------
    weatherType : str
        Name of the column to summarise (e.g. 'temperature').
    weatherData : dask or pandas DataFrame
        Frame containing that column.
    quantileLevels : list of float, optional
        Probabilities to evaluate. Defaults to the notebook-level `quantiles` list.

    Returns
    -------
    list of float
        One value per requested level, in the order the levels were given.
    """
    if quantileLevels is None:
        quantileLevels = quantiles

    quants_overall = weatherData[weatherType].quantile(q = quantileLevels)

    # Only lazy (dask) results need materialising. num_workers is the keyword the
    # local dask schedulers read; `workers=` is not.
    if hasattr(quants_overall, 'compute'):
        quants_overall = quants_overall.compute(num_workers = 100)

    # The old code also built 'q_<level>' labels and then sliced them away again;
    # only the values were ever returned.
    return(quants_overall.tolist())

In [126]:
weatherData.head()

Unnamed: 0,ZIP,date,temperature,quarter,precipitation
0,1001,19810101.0,-5.148,1,0.0
1,1002,19810101.0,-5.678,1,0.0
2,1003,19810101.0,-5.46,1,0.0
3,1005,19810101.0,-6.078,1,0.0
4,1007,19810101.0,-5.815,1,0.0


In [170]:
 # Experiment: wrap a median as a custom dask groupby aggregation.
 # NOTE(review): the recorded output shows this fails with
 # "TypeError: __init__() missing 1 required positional argument: 'chunk'",
 # so `quants` is never defined by this cell.
 def qDask(s):
        return s.quantile(q = 0.5)
    
quants = dd.Aggregation('quants', agg = qDask)


TypeError: __init__() missing 1 required positional argument: 'chunk'

In [167]:


identifiers = ['ZIP']
weatherType = 'precipitation'

print(weatherData.groupby(identifiers)[weatherType].agg([qDask]).compute(workers = 100))

'''.quantile(q = quantiles).reset_index().\
    rename(columns=lambda x: re.sub('level_[0-9]$','quartile',x)).compute(workers = 100)
'''
'''[weatherType].quantile(q = quantiles).reset_index().\
        rename(columns=lambda x: re.sub('level_[0-9]$','quartile',x)).compute(workers = 100)'''

ValueError: unknown aggregate qDask

In [157]:
quantiles
quant_labels

['quant_0.05',
 'quant_tossThisOne',
 'quant_0.95',
 'quant_1x5Qtrs',
 'quant_1x5Yrs']

In [116]:
def getPivotQuants(weatherType, identifiers, weatherData, quantileLevels=None):
    """Collect per-group quantiles of one weather column into one list per group.

    Parameters
    ----------
    weatherType : str
        Column to summarise (e.g. 'precipitation').
    identifiers : list of str
        Grouping columns, e.g. ['ZIP'] or ['ZIP', 'quarter'].
    weatherData : DataFrame
        Frame holding the identifiers and the weather column.
        NOTE(review): the notebook's recorded output shows
        "'SeriesGroupBy' object has no attribute 'quantile'" when a dask frame is
        passed; a pandas frame works.
    quantileLevels : list of float, optional
        Probabilities to evaluate. Defaults to the notebook-level `quantiles` list.

    Returns
    -------
    pandas.DataFrame
        One row per group with the identifier columns plus
        'quartileList' (quantile values, ordered by their 'q_<level>' label) and
        'quartileLabelList' (the matching labels).
    """
    if quantileLevels is None:
        quantileLevels = quantiles

    # reset_index names the quantile level 'level_<n>'; rename it to 'quartile'.
    quants_overallByZip = weatherData.groupby(identifiers)[weatherType].quantile(q = quantileLevels).reset_index().\
        rename(columns=lambda x: re.sub('level_[0-9]$','quartile',x))

    # Only lazy (dask) results need materialising. num_workers is the keyword the
    # local dask schedulers read; `workers=` is not.
    if hasattr(quants_overallByZip, 'compute'):
        quants_overallByZip = quants_overallByZip.compute(num_workers = 100)

    # Labels are the first six characters of the level, prefixed with 'q_'.
    quants_overallByZip['quartile'] = 'q_' + quants_overallByZip['quartile'].astype(str).str.slice(0,6)

    pivot_quants_overallByZip = quants_overallByZip.pivot(index=identifiers,
            columns='quartile', values=str(weatherType)).reset_index().\
            rename_axis(None, axis=1)

    # Everything after the identifier columns is a quantile column. The label list
    # used to start at a hard-coded position 2, which dropped the first label
    # whenever there was a single identifier.
    quantColumns = list(pivot_quants_overallByZip.columns[len(identifiers):])

    pivot_quants_overallByZip['quartileList']      = pivot_quants_overallByZip[quantColumns].values.tolist()
    pivot_quants_overallByZip['quartileLabelList'] = [list(quantColumns) for _ in range(pivot_quants_overallByZip.shape[0])]

    # Drop exactly the pivoted quantile columns rather than anything matching 'q_'.
    pivot_quants_overallByZip = pivot_quants_overallByZip.drop(columns = quantColumns)

    return(pivot_quants_overallByZip)

In [103]:
def getQuartCounts(df, weatherType):
    """Count rows per ZIP, year-quarter and bin label.

    Adds (or overwrites) the columns 'month', 'quarter' and 'yearQuarter' on `df`
    itself, derived from the text form of `df.date` (characters 0-3 are the year,
    4-5 the month). The bin label is read from the column named
    `<weatherType>Cutlabels`.

    Returns a frame with columns
    ['zip', 'yearQuarter', '<weatherType>Cutlabels', '<weatherType>Occurrences'].
    """
    labelColumn = weatherType + "Cut" + 'labels'
    countColumn = weatherType + "Occurrences"

    # Months outside April-December (including anything unrecognised) fall in 'q1'.
    quarterOfMonth = {'04': 'q2', '05': 'q2', '06': 'q2',
                      '07': 'q3', '08': 'q3', '09': 'q3',
                      '10': 'q4', '11': 'q4', '12': 'q4'}

    dateText = df.date.astype(str)
    df['month'] = dateText.str.slice(4,6)
    df['quarter'] = df['month'].map(lambda month: quarterOfMonth.get(month, 'q1'))
    df['yearQuarter'] = dateText.str.slice(0,4) + df.quarter

    groupSizes = df.groupby(['ZIP','yearQuarter',labelColumn]).size()
    summaryDF = groupSizes.reset_index()
    summaryDF.columns = ['zip','yearQuarter',labelColumn,countColumn]

    return(summaryDF)

Now get all the quartile data:
    - Overall
    - By zip
    - Quarterly by zip

In [114]:
quant_labels

['quant_0.05',
 'quant_tossThisOne',
 'quant_0.95',
 'quant_1x5Qtrs',
 'quant_1x5Yrs']

In [115]:
###################
# OVERALL
# Quantile values of each weather variable over the whole of weatherData.
tempQuants   = getPivotQuantsOverall('temperature',weatherData)
precipQuants = getPivotQuantsOverall('precipitation',weatherData)

# The two 5-day assignments below sit inside a string literal, so they do not run.
# NOTE(review): a later cell reads temp5DaysQuants and precip5DaysQuants, which are
# therefore undefined on a fresh run — confirm whether the 5-day variables are still wanted.
'''temp5DaysQuants   = getPivotQuantsOverall('temp5Days',weatherData)
precip5DaysQuants = getPivotQuantsOverall('precip5Days',weatherData)'''


"temp5DaysQuants   = getPivotQuantsOverall('temp5Days',weatherData)\nprecip5DaysQuants = getPivotQuantsOverall('precip5Days',weatherData)"

AttributeError: 'SeriesGroupBy' object has no attribute 'quantile'

In [None]:
# (column in weatherData, fragment used in the output column name), in merge order.
# NOTE(review): the cell that would add 'temp5Days' / 'precip5Days' to weatherData is
# commented out earlier in the notebook, so those two entries presumably fail until
# the columns exist — confirm.
quantVariables = [('temperature',   'Temperature'),
                  ('precipitation', 'Precipitation'),
                  ('temp5Days',     'Temp5Days'),
                  ('precip5Days',   'Precip5Days')]


def buildQuantLists(identifiers, suffix):
    """Merge the per-group quantile lists of every weather variable into one frame.

    For each variable, getPivotQuants is evaluated on weatherData for the given
    identifiers, the label list is dropped, and 'quartileList' is renamed to
    'quartileList<Variable>_<suffix>'. The frames are then merged on their shared
    identifier columns.
    """
    merged = None
    for weatherType, nameFragment in quantVariables:
        pivoted = getPivotQuants(weatherType, identifiers, weatherData).\
            drop(columns = ['quartileLabelList']).\
            rename(columns = {'quartileList': 'quartileList' + nameFragment + '_' + suffix})
        merged = pivoted if merged is None else merged.merge(pivoted)
    return merged


####################
# BY ZIP
quants_zip = buildQuantLists(['ZIP'], 'zip')

##########################################
# BY ZIP-QUARTER
quants_quarterlyByZip = buildQuantLists(['ZIP','quarter'], 'quarterlyByZip')



Construct a record of all the relevant quantiles by combining all of the above. Rough idea is:
    - Start with the zip-quarter data
    - Merge in the less-specific quarter information
    - For each row, put in the overall quartile information

In [None]:
# Start from the (ZIP, quarter) quantile lists and attach the ZIP-level lists;
# the default merge joins on the columns the two frames share.
quantsAll                 = quants_quarterlyByZip.merge(quants_zip)



# Every row receives a reference to the same overall-quantile list object
# (the list is not copied per row).
quantsAll['precipitationQuants'] = [precipQuants for i in quantsAll.index]
quantsAll['temperatureQuants']   = [tempQuants   for i in quantsAll.index]



# NOTE(review): precip5DaysQuants and temp5DaysQuants are only assigned inside a
# string literal in the "OVERALL" cell, so these two lines raise NameError on a
# fresh run — confirm.
quantsAll['precip5DaysQuants']   = [precip5DaysQuants for i in quantsAll.index]
quantsAll['temp5DaysQuants']     = [temp5DaysQuants   for i in quantsAll.index]



quantsAll.head()

In [None]:
# relevantZips = allCustomerData.zipcode.append(allSupplierData.zipcode).unique()
outfile =  '../../data/companyData/quantsAll_allZips.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(quantsAll, pickle_file)

In [None]:
del weatherData
gc.collect()

### Convert 2009-2019 data into quartiles

In [None]:
recentDecadeTmax   = getData("Tmax",2009,2019)
recentDecadePrecip = getData("Precip",2009,2019)

In [None]:
# Join temperature and precipitation on their shared keys.
recentDecadeWeather = recentDecadeTmax.merge(recentDecadePrecip, on = ['ZIP','date'])

# getData returns lazy dask frames, but the following cells use pandas-only
# features (.loc assignment, groupby().rolling(), pd.cut), so materialise here.
if hasattr(recentDecadeWeather, 'compute'):
    recentDecadeWeather = recentDecadeWeather.compute()

# Row order is not guaranteed after the merge. Put each ZIP's days in date order
# (needed by the rolling windows below) and give every row a unique index.
recentDecadeWeather = recentDecadeWeather.sort_values(['ZIP','date']).reset_index(drop = True)
recentDecadeWeather.head()

In [None]:
recentDecadeWeather['month'] = recentDecadeWeather.date.astype(str).str.slice(4,6)

recentDecadeWeather['quarter'] = 'q1'

recentDecadeWeather.loc[recentDecadeWeather['month'].isin(['04','05','06']), 'quarter'] = 'q2'
recentDecadeWeather.loc[recentDecadeWeather['month'].isin(['07','08','09']), 'quarter'] = 'q3'
recentDecadeWeather.loc[recentDecadeWeather['month'].isin(['10','11','12']), 'quarter'] = 'q4'

recentDecadeWeather['yearQuarter'] = recentDecadeWeather.date.astype(str).str.slice(0,4) + recentDecadeWeather.quarter
recentDecadeWeather.head()

In [None]:
# Trailing 5-row windows within each ZIP: mean temperature and total precipitation.
# The windows only cover consecutive days if each ZIP's rows are in date order,
# and the assignment below aligns on the index, so sort and re-index first.
recentDecadeWeather = recentDecadeWeather.sort_values(['ZIP','date']).reset_index(drop = True)

weatherByZipGroup = recentDecadeWeather.groupby('ZIP')

# groupby().rolling() returns a (ZIP, original row) index. Dropping only the ZIP
# level keeps the row labels, so each value lands on its own row. The earlier
# reset_index(drop=True) renumbered the values in group order instead, which
# misaligns them whenever the frame is not already sorted by ZIP.
recentDecadeWeather['temp5Days']   = weatherByZipGroup['temperature'].rolling(5).mean().reset_index(level = 0, drop = True)
recentDecadeWeather['precip5Days'] = weatherByZipGroup['precipitation'].rolling(5).sum().reset_index(level = 0, drop = True)

In [None]:
del recentDecadePrecip
del recentDecadeTmax
gc.collect()

recentDecadeWeather[recentDecadeWeather.yearQuarter == '2010q1']

Merge the intervals into the recent weather data.

In [None]:
quantsAll.columns

In [None]:
recentDecadeWeather.columns

In [None]:
# recentDecadeWeather.quarter holds the strings 'q1'..'q4', while the historical
# pipeline derives `quarter` with .dt.quarter (integers 1-4). Bring quantsAll onto
# the string form before joining so the keys can match.
quantsForMerge = quantsAll
if pd.api.types.is_numeric_dtype(quantsForMerge['quarter']):
    quantsForMerge = quantsForMerge.assign(
        quarter = 'q' + quantsForMerge['quarter'].astype(int).astype(str))

# ZIP and quarter are the only columns the two frames share.
recentDecadeWeather = recentDecadeWeather.merge(quantsForMerge, on = ['ZIP','quarter'])

recentDecadeWeather.head()

Get the annual quants.

In [None]:
recentDecadeWeather.temperatureQuants[0]

In [None]:
# Add a small, increasing offset (i/10000) to the i-th edge of each overall
# quantile list, presumably so that tied quantiles become strictly increasing bin
# edges for pd.cut.
# The loop length now comes from the edge list itself. It used to come from
# temp_annualQuants, which is only created in the next cell and holds a bin label
# rather than the edges.
# NOTE: the lists are modified in place, so re-running this cell shifts the edges again.
for quantColumn in ['temperatureQuants', 'temp5DaysQuants',
                    'precipitationQuants', 'precip5DaysQuants']:
    edges = recentDecadeWeather[quantColumn][0]
    for i in range(0, len(edges)):
        edges[i] = edges[i] + i/10000


In [None]:
# Bin every observation against the overall (all-ZIP) quantile edges.
# Each entry is (new label column, value column, column holding the edge list);
# the edge list is read from row 0.
annualCutSpecs = [
    ('temp_annualQuants',        'temperature',   'temperatureQuants'),
    ('temp5Days_annualQuants',   'temp5Days',     'temp5DaysQuants'),
    ('precip_annualQuants',      'precipitation', 'precipitationQuants'),
    ('precip5Days_annualQuants', 'precip5Days',   'precip5DaysQuants'),
]

for labelColumn, valueColumn, edgeColumn in annualCutSpecs:
    recentDecadeWeather[labelColumn] = pd.cut(
        recentDecadeWeather[valueColumn],
        bins = recentDecadeWeather[edgeColumn][0],
        labels = quant_labels,
        include_lowest = True)

recentDecadeWeather.head()


Now do the same but for zips. For some reason the filtering operations are about 10x faster when ZIP and quarter are converted to categories.

In [None]:
recentDecadeWeather = recentDecadeWeather.astype({'ZIP': 'category', 'quarter': 'category'})

In [None]:
def getZipQuarts(zipcode):
    """Bin one ZIP code's recent weather against that ZIP's own historical quantiles.

    Reads the notebook-level `recentDecadeWeather` and `quant_labels`. The bin
    edges are taken from row 0 of the ZIP's 'quartileList..._zip' columns.

    Returns the ZIP's rows (index reset) with four added label columns:
    temp_zipQuants, temp5Days_zipQuants, precip_zipQuants, precip5Days_zipQuants.
    If the ZIP has no rows, the empty selection is returned unchanged.
    """
    tempData = recentDecadeWeather[recentDecadeWeather.ZIP == zipcode].reset_index()

    if tempData.shape[0] > 0:

        def _jittered(edges):
            # Add i/10000 to the i-th edge, presumably so tied quantiles become
            # strictly increasing bin edges. A new list is built instead of editing
            # the stored one, which is shared with recentDecadeWeather and would
            # drift further on every repeated call.
            return [edge + i/10000 for i, edge in enumerate(edges)]

        # NOTE(review): only the precipitation edges are jittered here, whereas
        # getZipQuarterQuarts jitters all four lists — confirm this is intended.
        cutSpecs = [
            ('temp_zipQuants',        'temperature',   tempData.quartileListTemperature_zip[0]),
            ('temp5Days_zipQuants',   'temp5Days',     tempData.quartileListTemp5Days_zip[0]),
            ('precip_zipQuants',      'precipitation', _jittered(tempData.quartileListPrecipitation_zip[0])),
            ('precip5Days_zipQuants', 'precip5Days',   _jittered(tempData.quartileListPrecip5Days_zip[0])),
        ]

        for labelColumn, valueColumn, edges in cutSpecs:
            tempData[labelColumn] = pd.cut(tempData[valueColumn],
                   bins = edges,
                   labels = quant_labels,include_lowest=True)

    return(tempData)

In [None]:
def getZipQuarterQuarts(zipcode):
    """Bin one ZIP code's recent weather against its per-quarter historical quantiles.

    Reads the notebook-level `recentDecadeWeather` and `quant_labels`. For every
    quarter value present in recentDecadeWeather, the ZIP's rows for that quarter
    are binned using the edges in row 0 of the
    'quartileList..._quarterlyByZip' columns.

    Returns the stacked per-quarter frames with four added label columns
    (temp_, temp5Days_, precip_, precip5Days_ + 'zipQuarterQuants'), or an empty
    DataFrame when the ZIP has no rows.
    """
    def _jittered(edges):
        # Add i/10000 to the i-th edge, presumably so tied quantiles become strictly
        # increasing bin edges. A new list is built instead of editing the stored
        # one, which is shared with recentDecadeWeather.
        return [edge + i/10000 for i, edge in enumerate(edges)]

    cutSpecs = [
        ('temp_zipQuarterQuants',        'temperature',   'quartileListTemperature_quarterlyByZip'),
        ('temp5Days_zipQuarterQuants',   'temp5Days',     'quartileListTemp5Days_quarterlyByZip'),
        ('precip_zipQuarterQuants',      'precipitation', 'quartileListPrecipitation_quarterlyByZip'),
        ('precip5Days_zipQuarterQuants', 'precip5Days',   'quartileListPrecip5Days_quarterlyByZip'),
    ]

    quarters = recentDecadeWeather.quarter.unique()

    # Select the ZIP once instead of re-scanning the full frame for every quarter.
    zipRows = recentDecadeWeather[recentDecadeWeather.ZIP == zipcode]

    quarterFrames = []
    for quarter in quarters:
        tempData = zipRows[zipRows.quarter == quarter].reset_index()
        if tempData.shape[0] > 0:
            for labelColumn, valueColumn, edgeColumn in cutSpecs:
                tempData[labelColumn] = pd.cut(tempData[valueColumn],
                       bins = _jittered(tempData[edgeColumn][0]),
                       labels = quant_labels,include_lowest=True)
            quarterFrames.append(tempData)

    # DataFrame.append was removed in pandas 2.0; collect the pieces and concatenate once.
    if not quarterFrames:
        return(pd.DataFrame())
    return(pd.concat(quarterFrames))

In [None]:
ZIPs = recentDecadeWeather.ZIP.unique()
len(ZIPs)

In [None]:
# Bin every ZIP against its own quantiles in parallel and time the run.
# NOTE(review): getZipQuarts reads the notebook-level recentDecadeWeather, so this
# presumably only works when the worker processes are forked from this kernel;
# under the 'spawn' start method, workers cannot see objects defined in the
# notebook — confirm on the target platform.
start = time.time()




with multiprocessing.Pool() as pool:
    zipQuarts = pool.map(getZipQuarts, ZIPs)

    
    
# One frame per ZIP comes back; stack them into a single frame.
weatherByZip = pd.concat(zipQuarts)


print(time.time() - start)

In [None]:
# Bin every ZIP against its per-quarter quantiles in parallel and time the run.
# NOTE(review): getZipQuarterQuarts reads the notebook-level recentDecadeWeather, so
# this presumably relies on forked worker processes — confirm on the target platform.
start = time.time()



with multiprocessing.Pool() as pool:
    zipQuarterQuarts = pool.map(getZipQuarterQuarts, ZIPs)

    
    
# One frame per ZIP comes back; stack them into a single frame.
weatherByZipQuarter = pd.concat(zipQuarterQuarts)


print(time.time() - start)

In [None]:
weatherByZipQuarter['yearQuarter'] = weatherByZipQuarter.date.astype('str').str.slice(0,4) + weatherByZipQuarter.quarter.astype('str')
weatherByZip['yearQuarter']        = weatherByZip.date.astype('str').str.slice(0,4) + weatherByZip.quarter.astype('str')

In [None]:
pt1 = weatherByZip[['ZIP','date','quarter','yearQuarter',
              'temp_annualQuants', 'temp5Days_annualQuants',
              'precip_annualQuants', 'precip5Days_annualQuants', 
              'temp_zipQuants', 'temp5Days_zipQuants',
              'precip_zipQuants', 'precip5Days_zipQuants']]

pt2 = weatherByZipQuarter[['ZIP','date','quarter','yearQuarter',
                           'temp_zipQuarterQuants','temp5Days_zipQuarterQuants', 
                     'precip_zipQuarterQuants','precip5Days_zipQuarterQuants']]


outfile =  '../../data/companyData/pt1_allZips.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(pt1, pickle_file)
    
outfile =  '../../data/companyData/pt2_allZips.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(pt2, pickle_file)




In [None]:
allQuarts = pt1.merge(pt2)

In [None]:
allQuarts.head()

------------------

### Now go through the data from 2009 - 2019 and find time spent in each bin

In [None]:
def getCounts(quartType,df):
    """Count, per ZIP and year-quarter, how many rows fall in each bin of `quartType`.

    Parameters
    ----------
    quartType : str
        Name of the bin-label column, e.g. 'temp_annualQuants'. Its last six
        characters are stripped to form the prefix of the output column names.
    df : pandas.DataFrame
        Must contain 'ZIP', 'yearQuarter' and the `quartType` column.

    Returns
    -------
    pandas.DataFrame
        Columns 'zip', 'yearQuarter', then one column per bin label named
        `quartType[:-6] + <label>` holding the row count (NaN where a
        ZIP/year-quarter/bin combination does not occur in the grouped result).
    """
    # groupby().size() yields one row per key combination, so no de-duplication is needed.
    counts = df.groupby(['ZIP','yearQuarter',quartType]).size().reset_index()
    counts.columns = ['zip', 'yearQuarter', quartType, 'occurrences']

    countsWide = counts.pivot(index=['zip','yearQuarter'],
            columns = quartType, values= 'occurrences').reset_index().\
            rename_axis(None, axis=1)

    # Assign a fresh list of names instead of writing into columns.values, which
    # mutates the Index's backing array behind pandas' back.
    prefix = quartType[:-6]
    countsWide.columns = list(countsWide.columns[:2]) + \
        [prefix + str(label) for label in countsWide.columns[2:]]

    return(countsWide)

In [None]:
allQuarts

In [None]:
countData = getCounts(allQuarts.columns[4],allQuarts)

for quart in allQuarts.columns[5:]:
    print(quart)
    countData = countData.merge(getCounts(quart,allQuarts))

In [None]:
print(countData.columns)

Reformat the column names so they're consistent.

In [None]:
countData['year'] = countData.yearQuarter.str.slice(0,4)
countData['qtr']  = countData.yearQuarter.str.slice(5,6).astype('float')
countData['zipcode']  = countData.zip.astype('int64')

In [None]:
countData.zipcode.min()

In [None]:
countData.head()

In [None]:
countData.to_csv("../../data/companyData/allWeatherBins_2009to2019_allZips.csv")

In [None]:
len(countData.zipcode.unique())

In [None]:
countData.columns

In [None]:
'''countData = pd.read_csv("../../data/companyData/allWeatherBins_2009to2019.csv").drop(columns = 'Unnamed: 0')

countData.head()'''

# Take an explicit copy of the key columns: new columns are assigned to this frame
# later, and assigning into a slice of countData triggers SettingWithCopyWarning.
countDataRevised = countData[['zipcode','year','qtr']].copy()


In [None]:

# Bin labels ordered from least to most extreme. For each cutoff, the revised count
# is the count in that bin plus the counts in every more extreme bin.
cutoffOrder = ['0.95','1xQtr','1xYr','1x5Qtrs','1x10Qtrs','1x5Yrs','1x10Yrs','1.0']

# cutoff -> labels of all more extreme bins (the last label gets no entry of its own).
cdf = {cutoff: cutoffOrder[position + 1:]
       for position, cutoff in enumerate(cutoffOrder[:-1])}


weatherVars = ['precip_','temp_','precip5Days_','temp5Days_']
statVars    = ['annualquant_','zipquant_','zipQuarterquant_']

# Bin columns named above but absent from countData. quant_labels currently defines
# fewer bins than cutoffOrder lists, and looking the absent ones up raises KeyError.
missingBinColumns = []

for weatherVar in weatherVars:
    for statVar in statVars:
        prefix = weatherVar + statVar

        for cutoff, moreExtreme in cdf.items():
            wanted  = [prefix + label for label in [cutoff] + moreExtreme]
            present = [column for column in wanted if column in countData.columns]
            missingBinColumns.extend(column for column in wanted if column not in present)

            # Without the cutoff's own bin there is nothing to revise.
            if prefix + cutoff not in present:
                continue

            # skipna=False keeps the NaN propagation of the earlier column-by-column `+`.
            # NOTE(review): a missing bin count presumably means zero occurrences — confirm
            # whether NaN should be filled with 0 before summing.
            countDataRevised[prefix + cutoff] = countData[present].sum(axis = 1, skipna = False)

if missingBinColumns:
    print("Bin columns not found in countData (skipped):", sorted(set(missingBinColumns)))
 

In [None]:
countDataRevised['temp_annualquant_0.95']

In [None]:
countData['temp_annualquant_0.95']

In [None]:
countDataRevised.to_csv("../../data/companyData/revised_allWeatherBins_2009to2019_allZips.csv")

In [None]:
# Reload the revised bin counts from disk.
# NOTE(review): this reads "revised_allWeatherBins_2009to2019.csv", while the cell above
# writes "revised_allWeatherBins_2009to2019_allZips.csv" — confirm which file is intended.
countDataRevised = pd.read_csv("../../data/companyData/revised_allWeatherBins_2009to2019.csv")

In [None]:
countDataRevised['temp5Days_zipquant_0.95'].describe()