In [74]:
import pickle as pkl
import pandas as pd
import matplotlib
import os
import re

import collections
import datetime
import time

import matplotlib.pyplot as plt
import seaborn as sns
    
import geopandas as gpd

import numpy as np

import rasterio

from difflib import get_close_matches

from fuzzywuzzy import process
from fuzzywuzzy import fuzz

import dask.dataframe as dd

import gc
import sys

from multiprocessing import Pool
import multiprocessing

from dask.diagnostics import ProgressBar


# Sample

In [2]:
def getData(weatherType,yearRange1,yearRange2):
    """Lazily load the per-ZIP daily PRISM files for a range of years.

    One CSV per year is read with dask and the yearly frames are stacked
    into a single dask DataFrame. Nothing is computed here.

    Parameters
    ----------
    weatherType : str
        File-name token of the variable. "Tmax" selects the ``temperature``
        column; any other value selects ``precipitation``.
    yearRange1 : int
        First year to load (inclusive). This year is always read.
    yearRange2 : int
        Last year to load (inclusive).

    Returns
    -------
    dask.dataframe.DataFrame
        Columns ``ZIP`` (int64), ``date`` and the selected weather column,
        with rows whose weather value is missing filtered out.
    """
    if weatherType == "Tmax":
        weatherVar = "temperature"
    else:
        weatherVar = "precipitation"

    def _readYear(year):
        # One file per year. The location is hard-coded to an external
        # volume, so the notebook only runs where that volume is mounted.
        filename = "../../../../../../../Volumes/backup2/dissData/prism/zipcode" + weatherType + str(year) + ".csv"
        yearData = dd.read_csv(filename, assume_missing=True)[['ZIP','date',weatherVar]]
        # ZIP is cast back to an integer after being read with assume_missing=True.
        yearData['ZIP'] = yearData.ZIP.astype('int64')
        return yearData

    # The first year is always loaded, even if yearRange2 < yearRange1.
    years = [yearRange1] + list(range(yearRange1 + 1, yearRange2 + 1))
    yearlyFrames = [_readYear(year) for year in years]

    # dd.concat replaces the deprecated DataFrame.append and builds the
    # stacked frame in one step instead of one append per year.
    if len(yearlyFrames) == 1:
        data = yearlyFrames[0]
    else:
        data = dd.concat(yearlyFrames)

    # Drop days without a value for the selected variable (still lazy).
    data = data[~(data[weatherVar].isna())]

    return(data)


I think only a few of these quantiles will be relevant. Let's look at 0, 0.9, 0.95, 1.0

In [3]:
# Quantile levels evaluated on the historical weather data. Later cells use
# the resulting values as bin edges for pd.cut, so six edges define the five
# bins named in quant_labels.
quantiles = [
    0.0,
    0.05,
    0.90,
    0.95,
    0.9978,
    0.9995,
]

# need to account for fact that once a year-quarter is 90 days, not 365
# One label per bin, in bin order.
# NOTE(review): the "1x5Qtrs" / "1x5Yrs" names presumably mean "once per five
# quarters / five years" (1 - 1/450 and 1 - 1/1825) -- confirm.
quant_labels = [
    'quant_0.05',
    'quant_tossThisOne',
    'quant_0.95',
    'quant_1x5Qtrs',
    'quant_1x5Yrs',
]

# Precipitation

In [4]:
weatherType = "Precip" # Tmax
precipData = getData(weatherType, 1981, 2008)

In [5]:
precipData['quarter']    = dd.to_datetime(precipData['date'], format='%Y%m%d').dt.quarter

In [6]:
precip = precipData.compute(workers = 100)
precip.shape

(333972912, 4)

In [7]:
precip.to_csv("../../data/companyData/allZipsPrecip.csv")

In [11]:
del precip
gc.collect()

470

## Temperature

In [8]:
weatherType = "Tmax"
tempData = getData(weatherType, 1981, 2008)

In [9]:
tempData['quarter']  = dd.to_datetime(tempData['date'], format='%Y%m%d').dt.quarter

In [12]:
temp = tempData.compute(workers = 100)
temp.shape

(333972912, 4)

In [13]:
temp.to_csv("../../data/companyData/allZipsTemp.csv")

In [14]:
del temp
gc.collect()

40

## Merge and save
Merge these two and save them as one CSV that we can read back in and reindex. I think the approach above reads each yearly file separately, so the row index restarts for every file and we end up with repeated index values.

In [None]:
# NOTE(review): `temp` and `precip` are both removed with `del` in earlier
# cells, so on a top-to-bottom run this cell raises NameError unless the two
# frames are recreated first.
# The assignment aligns the two frames on their row index.
temp['precipitation'] = precip.precipitation
temp.head()

In [None]:
temp.to_csv("../../data/companyData/allZipsTemp.csv")

# Load all data
## Get stats

### Describe Climate

In [81]:
# Reload the cached daily frames from CSV. 'Unnamed: 0' is the index column
# that to_csv wrote out, so it is dropped again here.
weatherData   = dd.read_csv("../../data/companyData/allZipsTemp.csv").drop(columns = ['Unnamed: 0'])
precipData = dd.read_csv("../../data/companyData/allZipsPrecip.csv").drop(columns = ['Unnamed: 0'])
with ProgressBar():
    precipData = precipData.repartition(npartitions=225)
    # Repartition the frame that was just read from disk. The previous code
    # repartitioned `tempData` instead, which threw away the CSV read above
    # and only worked while that variable was still alive in the kernel.
    weatherData   = weatherData.repartition(npartitions=225)
# weatherData = tempData.merge(precipData)

In [82]:
# tempData['precipitation'] = precipData.precipitation
weatherData = weatherData.assign(precipitation=precipData.precipitation)
weatherData.head()

Unnamed: 0,ZIP,date,temperature,quarter,precipitation
0,1001,19810101.0,-5.148,1,0.0
1,1002,19810101.0,-5.678,1,0.0
2,1003,19810101.0,-5.46,1,0.0
3,1005,19810101.0,-6.078,1,0.0
4,1007,19810101.0,-5.815,1,0.0


In [52]:
start = time.time()

# tempData['temp5Days']     = tempData.groupby(by='ZIP').apply(lambda df_g: df_g['temperature'].rolling(5).mean(), meta=('temperature', 'f8'))
# precipData['precip5Days']    = precipData.groupby(by='ZIP').apply(lambda df_g: df_g['precipitation'].rolling(5).mean(), meta=('precipitation', 'f8'))
# precipData['precipQtrMean']  = precipData.groupby(by='quarter').apply(lambda df_g: df_g['precipitation'].mean(), meta=('precipitation', 'f8'))

print(time.time() - start)

0.00014066696166992188


In [95]:
quarterlyAvg = weatherData.groupby('quarter').mean().compute(workers = 100).\
    reset_index()[['quarter','temperature','precipitation']].rename(columns = {'temperature': 'quarterly_avg_temp',
                                                                              'precipitation': 'quarterly_avg_precip'})


quarterlyVar = weatherData.groupby('quarter').var().compute(workers = 100).\
    reset_index()[['quarter','temperature','precipitation']].rename(columns = {'temperature': 'quarterly_var_temp',
                                                                              'precipitation': 'quarterly_var_precip'})
print(quarterlyVar,quarterlyAvg)


quarterlyStatsOverall = quarterlyVar.merge(quarterlyAvg)

   quarter  quarterly_var_temp  quarterly_var_precip
0        1           90.610895             55.668947
1        2           56.110724             55.242613
2        3           28.900500             56.412663
3        4           90.470189             57.189735    quarter  quarterly_avg_temp  quarterly_avg_precip
0        1            8.435430              2.614472
1        2           22.821419              2.687989
2        3           28.293629              2.790931
3        4           13.120395              2.766938


In [175]:
weatherData.groupby(['ZIP','quarter']).percentile([0.5])

AttributeError: 'Column not found: percentile'

In [96]:
quarterlyAvgByZip = weatherData.groupby(['ZIP','quarter']).mean().compute(workers = 100).\
    reset_index()[['ZIP','quarter','temperature','precipitation']].rename(columns = {'temperature': 'quarterly_avg_temp',
                                                                              'precipitation': 'quarterly_avg_precip'})


quarterlyVarByZip = weatherData.groupby(['ZIP','quarter']).var().compute(workers = 100).\
    reset_index()[['ZIP','quarter','temperature','precipitation']].rename(columns = {'temperature': 'quarterly_var_temp',
                                                                              'precipitation': 'quarterly_var_precip'})
print(quarterlyVarByZip,quarterlyAvgByZip)


quarterlyStatsByZip = quarterlyAvgByZip.merge(quarterlyVarByZip)

          ZIP  quarter  quarterly_var_temp  quarterly_var_precip
0        1001        1           42.033604             52.755125
1        1002        1           41.171977             56.631223
2        1003        1           41.356117             56.334398
3        1005        1           41.334653             52.233328
4        1007        1           41.264102             57.015382
...       ...      ...                 ...                   ...
130619  99363        4           60.124202             67.076533
130620  99371        4           63.550457             64.528976
130621  99401        4           52.100346             78.005397
130622  99402        4           49.997708             75.491973
130623  99403        4           52.217880             69.550950

[130624 rows x 4 columns]           ZIP  quarter  quarterly_avg_temp  quarterly_avg_precip
0        1001        1            4.253855              2.372867
1        1002        1            2.991732              2.41859

In [97]:
quarterlyStatsOverall.to_csv("../../data/companyData/quarterlyStatsOverall_allZips.csv")
quarterlyStatsByZip.to_csv("../../data/companyData/quarterlyStatsByZip_allZips.csv")

In [98]:
quarterlyStatsByZip.head()

Unnamed: 0,ZIP,quarter,quarterly_avg_temp,quarterly_avg_precip,quarterly_var_temp,quarterly_var_precip
0,1001,1,4.253855,2.372867,42.033604,52.755125
1,1002,1,2.991732,2.418594,41.171977,56.631223
2,1003,1,3.906219,2.519859,41.356117,56.334398
3,1005,1,2.10741,2.463722,41.334653,52.233328
4,1007,1,3.398788,2.501641,41.264102,57.015382


In [None]:
del quarterlyStatsByZip
gc.collect()

### Find Quartiles

In [112]:
def getPivotQuantsOverall(weatherType,weatherData,q=None):
    """Return the quantile values of one weather column over all rows.

    Parameters
    ----------
    weatherType : str
        Name of the column to summarise, e.g. 'temperature'.
    weatherData : dask or pandas DataFrame
        Frame holding that column. A dask result is computed here; a pandas
        result is used as is.
    q : list of float, optional
        Quantile levels. Defaults to the notebook-level ``quantiles`` list.

    Returns
    -------
    list of float
        One value per requested level, in the order of ``q``.
    """
    if q is None:
        q = quantiles

    quants_overall = weatherData[weatherType].quantile(q = q)

    if hasattr(quants_overall, 'compute'):
        # NOTE(review): dask's local schedulers document `num_workers`;
        # confirm that `workers = 100` has the intended effect.
        quants_overall = quants_overall.compute(workers = 100)

    # Only the values are returned. The former 'q_<level>' label column was
    # built and then discarded, so it is no longer created.
    return(quants_overall.tolist())

In [126]:
weatherData.head()

Unnamed: 0,ZIP,date,temperature,quarter,precipitation
0,1001,19810101.0,-5.148,1,0.0
1,1002,19810101.0,-5.678,1,0.0
2,1003,19810101.0,-5.46,1,0.0
3,1005,19810101.0,-6.078,1,0.0
4,1007,19810101.0,-5.815,1,0.0


In [157]:
quantiles
quant_labels

['quant_0.05',
 'quant_tossThisOne',
 'quant_0.95',
 'quant_1x5Qtrs',
 'quant_1x5Yrs']

In [197]:
weatherFull.temperature

0         -5.148
1         -5.678
2         -5.460
3         -6.078
4         -5.815
           ...  
1045933    6.954
1045934    3.349
1045935    1.645
1045936    2.105
1045937    3.796
Name: temperature, Length: 333972912, dtype: float64

In [196]:
weatherFull = weatherData.compute()
print(weatherFull.head())

    ZIP        date  temperature  quarter  precipitation
0  1001  19810101.0       -5.148        1            0.0
1  1002  19810101.0       -5.678        1            0.0
2  1003  19810101.0       -5.460        1            0.0
3  1005  19810101.0       -6.078        1            0.0
4  1007  19810101.0       -5.815        1            0.0


In [203]:
def getPivotQuants(weatherType, identifiers,weatherData,q=None):
    """Compute per-group quantiles and pack them into one list per group.

    Parameters
    ----------
    weatherType : str
        Name of the column to summarise, e.g. 'temperature'.
    identifiers : list of str
        Grouping columns, e.g. ['ZIP'] or ['ZIP', 'quarter'].
    weatherData : pandas.DataFrame
        Frame holding the identifier columns and ``weatherType``.
    q : list of float, optional
        Quantile levels. Defaults to the notebook-level ``quantiles`` list.

    Returns
    -------
    pandas.DataFrame
        One row per group with the identifier columns plus
        ``quartileList`` (the quantile values) and ``quartileLabelList``
        (the matching 'q_<level>' labels). Elapsed times are printed.
    """
    if q is None:
        q = quantiles

    start2 = time.time()
    # reset_index names the unnamed quantile level 'level_<n>'; the regex
    # renames it whatever the number of identifier columns is.
    quants_overallByZip = weatherData.groupby(identifiers)[weatherType].quantile(q = q).reset_index().\
        rename(columns=lambda x: re.sub('level_[0-9]$','quartile',x))

    print("got grouped")
    print(time.time() - start2)
    # Levels become string labels truncated to six characters ('0.9978').
    quants_overallByZip['quartile'] = 'q_' + quants_overallByZip['quartile'].astype(str).str.slice(0,6)

    pivot_quants_overallByZip = quants_overallByZip.pivot(index=identifiers,
            columns='quartile', values=str(weatherType)).reset_index().\
            rename_axis(None, axis=1)

    # Everything after the identifier columns is a quantile column.
    # NOTE(review): their order is the pivoted column order of the string
    # labels, not the order of `q`; the two agree for the default levels.
    quantCols = list(pivot_quants_overallByZip.columns[len(identifiers):])

    pivot_quants_overallByZip['quartileList'] = pivot_quants_overallByZip[quantCols].values.tolist()
    # Use len(identifiers) rather than a fixed offset of 2, which dropped the
    # first label whenever there was a single identifier column.
    pivot_quants_overallByZip['quartileLabelList'] = [quantCols] * pivot_quants_overallByZip.shape[0]

    # Drop the wide quantile columns by name now that they are packed.
    pivot_quants_overallByZip = pivot_quants_overallByZip.drop(columns = quantCols)
    print(time.time() - start2)

    return(pivot_quants_overallByZip)

Now get all the quartile data:
    - Overall
    - By zip
    - Quarterly by zip

In [176]:
###################
# OVERALL
tempQuants   = getPivotQuantsOverall('temperature',weatherData)
precipQuants = getPivotQuantsOverall('precipitation',weatherData)

'''temp5DaysQuants   = getPivotQuantsOverall('temp5Days',weatherData)
precip5DaysQuants = getPivotQuantsOverall('precip5Days',weatherData)'''


"temp5DaysQuants   = getPivotQuantsOverall('temp5Days',weatherData)\nprecip5DaysQuants = getPivotQuantsOverall('precip5Days',weatherData)"

In [204]:
####################
# BY ZIP
start = time.time()

# Temperature quantiles per ZIP: keep only the packed value list, give it a
# variable-specific name and cache the result.
pivot_temperatureQuants_zip = (
    getPivotQuants('temperature',['ZIP'],weatherFull)
    .drop(columns = ['quartileLabelList'])
    .rename(columns = {'quartileList': 'quartileListTemperature_zip'})
)
pivot_temperatureQuants_zip.to_csv('../../data/companyData/pivot_temperatureQuants_zip.csv')

print("done with temperature")
print(time.time() - start)

# Same for precipitation.
pivot_precipitationQuants_zip = (
    getPivotQuants('precipitation',['ZIP'],weatherFull)
    .drop(columns = ['quartileLabelList'])
    .rename(columns = {'quartileList': 'quartileListPrecipitation_zip'})
)
pivot_precipitationQuants_zip.to_csv('../../data/companyData/pivot_precipitationQuants_zip.csv')

print("done with precipitation")
print(time.time() - start)

# Join the two per-ZIP tables on their shared column(s) and cache the result.
quants_zip = pivot_temperatureQuants_zip.merge(pivot_precipitationQuants_zip)
quants_zip.to_csv('../../data/companyData/quants_zip.csv')


got grouped
631.7389612197876
632.0849342346191
done with temperature
632.2959599494934
got grouped
292.7123990058899
292.93355894088745
done with precipitation
925.3941698074341


In [205]:
##########################################
# BY ZIP-QUARTER
start = time.time()

# Temperature quantiles per (ZIP, quarter): keep only the packed value list,
# give it a variable-specific name and cache the result.
pivot_temperatureQuants_quarterlyByZip = (
    getPivotQuants('temperature',['ZIP','quarter'],weatherFull)
    .drop(columns = ['quartileLabelList'])
    .rename(columns = {'quartileList': 'quartileListTemperature_quarterlyByZip'})
)
pivot_temperatureQuants_quarterlyByZip.to_csv('../../data/companyData/pivot_temperatureQuants_quarterlyByZip.csv')

print("done with temperature")
print(time.time() - start)

# Same for precipitation.
pivot_precipitationQuants_quarterlyByZip = (
    getPivotQuants('precipitation',['ZIP','quarter'],weatherFull)
    .drop(columns = ['quartileLabelList'])
    .rename(columns = {'quartileList': 'quartileListPrecipitation_quarterlyByZip'})
)
pivot_precipitationQuants_quarterlyByZip.to_csv('../../data/companyData/pivot_precipitationQuants_quarterlyByZip.csv')


print("done with precipitation")
print(time.time() - start)

# Join the two tables on their shared column(s) and cache the result.
quants_quarterlyByZip = pivot_temperatureQuants_quarterlyByZip.merge(pivot_precipitationQuants_quarterlyByZip)
quants_quarterlyByZip.to_csv('../../data/companyData/quants_quarterlyByZip.csv')



got grouped
648.972855091095
649.9847962856293
done with temperature
650.8118278980255
got grouped
312.0215871334076
312.9980471134186
done with precipitation
964.4727628231049


Construct a record of all the relevant quantiles by combining all of the above. Rough idea is:
    - Start with the zip-quarter data
    - Merge in the less-specific quarter information
    - For each row, put in the overall quartile information for each row

In [206]:
# Start from the (ZIP, quarter) quantiles and attach the per-ZIP quantiles;
# merge joins on the columns the two frames share.
quantsAll                 = quants_quarterlyByZip.merge(quants_zip)

# Repeat the pooled quantiles on every row. Each row gets its own copy of
# the list: previously all rows pointed at the same list object, so editing
# one row in place silently changed every row and the source list as well.
quantsAll['precipitationQuants'] = [list(precipQuants) for _ in quantsAll.index]
quantsAll['temperatureQuants']   = [list(tempQuants)   for _ in quantsAll.index]

quantsAll.head()

Unnamed: 0,ZIP,quarter,quartileListTemperature_quarterlyByZip,quartileListPrecipitation_quarterlyByZip,quartileListTemperature_zip,quartileListPrecipitation_zip,precipitationQuants,temperatureQuants
0,1001,1,"[-16.4400005340576, -6.171400165557857, 12.670...","[0.0, 0.0, 7.123299980163575, 13.6725496292113...","[-16.4400005340576, -1.892299997806547, 29.442...","[0.0, 0.0, 8.764499855041514, 15.7616001129149...","[0.0, 0.0, 12.9139995574951, 28.25805445671174...","[-38.8950004577637, 16.3610000610352, 35.25099..."
1,1001,2,"[-4.23000001907349, 8.991749811172483, 29.6447...","[0.0, 0.0, 8.873199844360355, 15.4460996627807...","[-16.4400005340576, -1.892299997806547, 29.442...","[0.0, 0.0, 8.764499855041514, 15.7616001129149...","[0.0, 0.0, 12.9139995574951, 28.25805445671174...","[-38.8950004577637, 16.3610000610352, 35.25099..."
2,1001,3,"[10.0860004425049, 18.485250473022475, 32.1564...","[0.0, 0.0, 9.89400005340576, 17.36649990081785...","[-16.4400005340576, -1.892299997806547, 29.442...","[0.0, 0.0, 8.764499855041514, 15.7616001129149...","[0.0, 0.0, 12.9139995574951, 28.25805445671174...","[-38.8950004577637, 16.3610000610352, 35.25099..."
3,1001,4,"[-13.2650003433228, -1.171749949455265, 20.800...","[0.0, 0.0, 8.884600067138667, 16.7939498901366...","[-16.4400005340576, -1.892299997806547, 29.442...","[0.0, 0.0, 8.764499855041514, 15.7616001129149...","[0.0, 0.0, 12.9139995574951, 28.25805445671174...","[-38.8950004577637, 16.3610000610352, 35.25099..."
4,1002,1,"[-18.0459995269775, -7.398400068283082, 11.230...","[0.0, 0.0, 6.826399946212764, 14.2662997722625...","[-18.0459995269775, -3.088499951362608, 28.561...","[0.0, 0.0, 8.711199951171883, 16.0927503585815...","[0.0, 0.0, 12.9139995574951, 28.25805445671174...","[-38.8950004577637, 16.3610000610352, 35.25099..."


In [207]:
# relevantZips = allCustomerData.zipcode.append(allSupplierData.zipcode).unique()
outfile =  '../../data/companyData/quantsAll_allZips.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(quantsAll, pickle_file)

In [208]:
del weatherData
gc.collect()

60

### Convert 2009-2019 data into quantile bins

In [235]:
recentDecadeTmax   = getData("Tmax",2009,2019)
recentDecadePrecip = getData("Precip",2009,2019)

In [236]:
recentDecadeWeather = recentDecadeTmax.merge(recentDecadePrecip)
recentDecadeWeather.head()

Unnamed: 0,ZIP,date,temperature,precipitation
0,4255,20151205.0,4.576,0.0
1,4258,20151205.0,5.761,0.0
2,4459,20151205.0,1.812,0.0
3,4611,20151205.0,5.226,0.0
4,5408,20151205.0,7.123,0.0


In [237]:
recentDecadeWeather['quarter']    = dd.to_datetime(recentDecadeWeather['date'], format='%Y%m%d').dt.quarter

In [238]:
del recentDecadePrecip
del recentDecadeTmax
gc.collect()

# recentDecadeWeather[recentDecadeWeather.yearQuarter == '2010q1']

67

Merge the intervals into the recent weather data.

In [239]:
quantsAll.columns

Index(['ZIP', 'quarter', 'quartileListTemperature_quarterlyByZip',
       'quartileListPrecipitation_quarterlyByZip',
       'quartileListTemperature_zip', 'quartileListPrecipitation_zip',
       'precipitationQuants', 'temperatureQuants'],
      dtype='object')

In [240]:
recentDecadeWeather.columns

Index(['ZIP', 'date', 'temperature', 'precipitation', 'quarter'], dtype='object')

In [241]:
quantsAll.head()

Unnamed: 0,ZIP,quarter,quartileListTemperature_quarterlyByZip,quartileListPrecipitation_quarterlyByZip,quartileListTemperature_zip,quartileListPrecipitation_zip,precipitationQuants,temperatureQuants
0,1001,1,"[-16.4400005340576, -6.171400165557857, 12.670...","[0.0, 0.0, 7.123299980163575, 13.6725496292113...","[-16.4400005340576, -1.892299997806547, 29.442...","[0.0, 0.0, 8.764499855041514, 15.7616001129149...","[0.0, 0.0, 12.9139995574951, 28.25805445671174...","[-38.8950004577637, 16.3610000610352, 35.25099..."
1,1001,2,"[-4.23000001907349, 8.991749811172483, 29.6447...","[0.0, 0.0, 8.873199844360355, 15.4460996627807...","[-16.4400005340576, -1.892299997806547, 29.442...","[0.0, 0.0, 8.764499855041514, 15.7616001129149...","[0.0, 0.0, 12.9139995574951, 28.25805445671174...","[-38.8950004577637, 16.3610000610352, 35.25099..."
2,1001,3,"[10.0860004425049, 18.485250473022475, 32.1564...","[0.0, 0.0, 9.89400005340576, 17.36649990081785...","[-16.4400005340576, -1.892299997806547, 29.442...","[0.0, 0.0, 8.764499855041514, 15.7616001129149...","[0.0, 0.0, 12.9139995574951, 28.25805445671174...","[-38.8950004577637, 16.3610000610352, 35.25099..."
3,1001,4,"[-13.2650003433228, -1.171749949455265, 20.800...","[0.0, 0.0, 8.884600067138667, 16.7939498901366...","[-16.4400005340576, -1.892299997806547, 29.442...","[0.0, 0.0, 8.764499855041514, 15.7616001129149...","[0.0, 0.0, 12.9139995574951, 28.25805445671174...","[-38.8950004577637, 16.3610000610352, 35.25099..."
4,1002,1,"[-18.0459995269775, -7.398400068283082, 11.230...","[0.0, 0.0, 6.826399946212764, 14.2662997722625...","[-18.0459995269775, -3.088499951362608, 28.561...","[0.0, 0.0, 8.711199951171883, 16.0927503585815...","[0.0, 0.0, 12.9139995574951, 28.25805445671174...","[-38.8950004577637, 16.3610000610352, 35.25099..."


In [243]:
quantsAll.precipitationQuants[0]

[0.0,
 0.0,
 12.9139995574951,
 28.25805445671174,
 89.66024061584802,
 151.74903869629478]

Get the annual quants.

In [245]:
# pd.cut needs strictly increasing bin edges, but the pooled precipitation
# quantiles repeat (0.0, 0.0, ...). Shift edge i by i/10000 to break ties.
# Every edge is shifted, including the last one, so a tie between the two
# top edges cannot produce a decreasing sequence.
temperatureBins   = [edge + i/10000 for i, edge in enumerate(tempQuants)]
precipitationBins = [edge + i/10000 for i, edge in enumerate(precipQuants)]

# Store the shifted edges as whole columns, built from the untouched pooled
# lists, rather than editing the stored lists in place. Re-running this cell
# therefore gives the same edges instead of shifting them a second time.
quantsAll['temperatureQuants']   = [temperatureBins   for _ in quantsAll.index]
quantsAll['precipitationQuants'] = [precipitationBins for _ in quantsAll.index]


In [252]:
# Classify each recent daily observation into the pooled ("annual") bins.
# The bin edges are the list stored in row 0 of quantsAll; labels come from
# quant_labels.
# NOTE(review): pd.cut returns NaN for values outside the outer edges, so
# days beyond the top edge get no label -- confirm this is intended.
# NOTE(review): these assignments look lazy (the timing below is about
# 0.02 s), so the elapsed time probably excludes the actual binning.
start = time.time()

recentDecadeWeather['temp_annualQuants'] = recentDecadeWeather.temperature.map_partitions(pd.cut, 
           bins = quantsAll.temperatureQuants[0],
           labels = quant_labels,include_lowest=True)

recentDecadeWeather['precip_annualQuants'] = recentDecadeWeather.precipitation.map_partitions(pd.cut, 
           bins = quantsAll.precipitationQuants[0],
           labels = quant_labels,include_lowest=True)

time.time() - start

0.020370960235595703

Now do the same, but per ZIP code. For some reason, the filtering operations are roughly 10x faster when the ZIP and quarter columns are stored as categoricals.

In [None]:
recentDecadeWeather = recentDecadeWeather.astype({'ZIP': 'category', 'quarter': 'category'})

In [255]:
quantsAll[quantsAll.ZIP == 94025]

Unnamed: 0,ZIP,quarter,quartileListTemperature_quarterlyByZip,quartileListPrecipitation_quarterlyByZip,quartileListTemperature_zip,quartileListPrecipitation_zip,precipitationQuants,temperatureQuants
123284,94025,1,"[5.94500017166138, 11.04019975662229, 20.50420...","[0.0, 0.0, 8.076000213623043, 16.4664993286132...","[3.07500004768372, 12.3079996109009, 27.574799...","[0.0, 0.0, 8.734200191497802, 16.1083499908447...","[0.0, 0.0001, 12.914199557495099, 28.258354456...","[-38.8950004577637, 16.3611000610352, 35.25119..."
123285,94025,2,"[10.2259998321533, 15.585149717330966, 28.3531...","[0.0, 0.0, 7.96350002288818, 14.51049971580507...","[3.07500004768372, 12.3079996109009, 27.574799...","[0.0, 0.0, 8.734200191497802, 16.1083499908447...","[0.0, 0.0001, 12.914199557495099, 28.258354456...","[-38.8950004577637, 16.3611000610352, 35.25119..."
123286,94025,3,"[16.7360000610352, 20.64999914169315, 30.44299...","[0.0, 0.0, 10.263199996948257, 18.585449981689...","[3.07500004768372, 12.3079996109009, 27.574799...","[0.0, 0.0, 8.734200191497802, 16.1083499908447...","[0.0, 0.0001, 12.914199557495099, 28.258354456...","[-38.8950004577637, 16.3611000610352, 35.25119..."
123287,94025,4,"[3.07500004768372, 11.60374975204465, 24.80200...","[0.0, 0.0, 8.285199928283692, 15.3236502170562...","[3.07500004768372, 12.3079996109009, 27.574799...","[0.0, 0.0, 8.734200191497802, 16.1083499908447...","[0.0, 0.0001, 12.914199557495099, 28.258354456...","[-38.8950004577637, 16.3611000610352, 35.25119..."


In [None]:
quantBins

In [259]:
# Scratch run of the per-ZIP binning for a single ZIP code.
# NOTE(review): this rebinds the notebook-level name `tempData`.
zipcode = 94025

tempData = recentDecadeWeather[recentDecadeWeather.ZIP == zipcode].reset_index()
quantBins = quantsAll[quantsAll.ZIP == zipcode].reset_index()

# Shift edge i by i/10000 so repeated precipitation edges become strictly
# increasing. A new list is built: editing the stored list in place also
# changed quantsAll and shifted the edges again on every re-run.
precipBins = [edge + i/10000 for i, edge in enumerate(quantBins.quartileListPrecipitation_zip[0])]

tempData['temp_zipQuants'] = tempData.temperature.map_partitions(pd.cut,
       bins = quantBins.quartileListTemperature_zip[0],
       labels = quant_labels,include_lowest=True)

tempData['precip_zipQuants'] = tempData.precipitation.map_partitions(pd.cut,
       bins = precipBins,
       labels = quant_labels,include_lowest=True)

In [284]:
def getZipQuarts(zipcode):
    """Bin one ZIP code's recent daily weather with that ZIP's own quantiles.

    Reads the notebook-level ``recentDecadeWeather`` (dask), ``quantsAll``
    and ``quant_labels``.

    Parameters
    ----------
    zipcode : int
        ZIP code to process. It must be present in ``quantsAll``.

    Returns
    -------
    dask DataFrame
        The ZIP's rows of ``recentDecadeWeather`` with two added columns,
        ``temp_zipQuants`` and ``precip_zipQuants``.
    """
    zipWeather = recentDecadeWeather[recentDecadeWeather.ZIP == zipcode].reset_index()
    zipBins = quantsAll[quantsAll.ZIP == zipcode].reset_index()

    temperatureBins = zipBins.quartileListTemperature_zip[0]
    # Shift precipitation edge i by i/10000 so repeated edges (0.0, 0.0, ...)
    # become strictly increasing, as pd.cut requires. A new list is built so
    # the lists stored in quantsAll are left untouched and repeated calls for
    # the same ZIP use the same edges.
    precipitationBins = [edge + i/10000
                         for i, edge in enumerate(zipBins.quartileListPrecipitation_zip[0])]

    zipWeather['temp_zipQuants'] = zipWeather.temperature.map_partitions(pd.cut,
           bins = temperatureBins,
           labels = quant_labels,include_lowest=True)

    zipWeather['precip_zipQuants'] = zipWeather.precipitation.map_partitions(pd.cut,
           bins = precipitationBins,
           labels = quant_labels,include_lowest=True)

    return(zipWeather)

In [None]:
def getZipQuarterQuarts(zipcode):
    """Bin one ZIP code's recent daily weather with its per-quarter quantiles.

    Reads the notebook-level ``recentDecadeWeather``, ``quantsAll`` and
    ``quant_labels``. The bin edges are looked up in ``quantsAll``, since
    ``recentDecadeWeather`` itself carries no quantile-list columns.

    Parameters
    ----------
    zipcode : int
        ZIP code to process.

    Returns
    -------
    pandas.DataFrame
        The ZIP's rows with ``temp_zipQuarterQuants`` and
        ``precip_zipQuarterQuants`` added; empty if nothing could be binned.
    """
    zipWeather = recentDecadeWeather[recentDecadeWeather.ZIP == zipcode]
    if hasattr(zipWeather, 'compute'):
        # pd.cut below needs concrete pandas data, so a dask selection is
        # materialised first. This scans the frame once per call.
        zipWeather = zipWeather.compute()
    zipBins = quantsAll[quantsAll.ZIP == zipcode]

    binnedQuarters = []

    for quarter in sorted(zipWeather.quarter.unique()):
        quarterWeather = zipWeather[zipWeather.quarter == quarter].reset_index()
        quarterBins = zipBins[zipBins.quarter == quarter]
        # Skip quarters with no observations or no historical bin edges.
        if quarterWeather.shape[0] == 0 or quarterBins.shape[0] == 0:
            continue

        # Shift edge i by i/10000 so repeated edges become strictly
        # increasing; new lists keep quantsAll unchanged.
        temperatureBins = [edge + i/10000 for i, edge in
                           enumerate(quarterBins.quartileListTemperature_quarterlyByZip.iloc[0])]
        precipitationBins = [edge + i/10000 for i, edge in
                             enumerate(quarterBins.quartileListPrecipitation_quarterlyByZip.iloc[0])]

        quarterWeather['temp_zipQuarterQuants'] = pd.cut(quarterWeather.temperature,
               bins = temperatureBins,
               labels = quant_labels,include_lowest=True)

        quarterWeather['precip_zipQuarterQuants'] = pd.cut(quarterWeather.precipitation,
               bins = precipitationBins,
               labels = quant_labels,include_lowest=True)

        binnedQuarters.append(quarterWeather)

    if not binnedQuarters:
        return(pd.DataFrame())

    # One concat at the end replaces the removed DataFrame.append, which
    # also copied the growing frame on every iteration.
    weatherByZipByQuarter = pd.concat(binnedQuarters)

    return(weatherByZipByQuarter)

In [262]:
ZIPs = quantsAll.ZIP.unique()
len(ZIPs)

32656

In [281]:
# Scratch run of the per-ZIP binning for a single ZIP code.
# NOTE(review): this rebinds the notebook-level name `tempData`.
zipcode = 97014

tempData = recentDecadeWeather[recentDecadeWeather.ZIP == zipcode].reset_index()
quantBins = quantsAll[quantsAll.ZIP == zipcode].reset_index()
print(quantBins.quartileListPrecipitation_zip[0])

# Shift edge i by i/10000 so repeated precipitation edges become strictly
# increasing. A new list is built: editing the stored list in place also
# changed quantsAll and shifted the edges again on every re-run.
precipBins = [edge + i/10000 for i, edge in enumerate(quantBins.quartileListPrecipitation_zip[0])]

tempData['temp_zipQuants'] = tempData.temperature.map_partitions(pd.cut,
       bins = quantBins.quartileListTemperature_zip[0],
       labels = quant_labels,include_lowest=True)

tempData['precip_zipQuants'] = tempData.precipitation.map_partitions(pd.cut,
       bins = precipBins,
       labels = quant_labels,include_lowest=True)


[0.0, 0.0, 9.668099689483652, 17.637650159454342, 55.247680297088536, 85.42371673202545]


In [301]:
# Run getZipQuarts for each ZIP code in a process pool and time it.
# NOTE(review): only the first 50 ZIP codes are processed (ZIPs[0:50]) --
# confirm whether the full list is intended for the final run.
start = time.time()




with multiprocessing.Pool() as pool:
    zipQuarts = pool.map(getZipQuarts, ZIPs[0:50])

print(time.time() - start)
    
# NOTE(review): `weatherByZip` is only assigned in the commented-out line
# below, but a later cell reads it.
# weatherByZip = pd.concat(zipQuarts)


print(time.time() - start)

1.2387702465057373
1.240393877029419


In [None]:
start = time.time()
allZipQuarts = dd.concat(zipQuarts)
zipQuartsCompd = allZipQuarts.compute(workers = 100)
print(time.time() - start)

In [300]:
zipQuartsCompd[zipQuartsCompd.ZIP == 1001].sort_values(by=['date'])

Unnamed: 0,index,ZIP,date,temperature,precipitation,quarter,temp_annualQuants,precip_annualQuants,temp_zipQuants,precip_zipQuants
0,3932,1001,20090101.0,-2.941,4.491000,1,quant_0.05,quant_tossThisOne,quant_0.05,quant_tossThisOne
0,4052,1001,20090102.0,-7.090,0.000000,1,quant_0.05,quant_0.05,quant_0.05,quant_0.05
0,4298,1001,20090103.0,-0.701,0.263000,1,quant_0.05,quant_tossThisOne,quant_tossThisOne,quant_tossThisOne
0,4470,1001,20090104.0,0.804,0.000000,1,quant_0.05,quant_0.05,quant_tossThisOne,quant_0.05
1,4751,1001,20090105.0,1.763,0.731000,1,quant_0.05,quant_tossThisOne,quant_tossThisOne,quant_tossThisOne
...,...,...,...,...,...,...,...,...,...,...
23,701266,1001,20191227.0,3.783,0.000000,4,quant_0.05,quant_0.05,quant_tossThisOne,quant_0.05
20,675989,1001,20191228.0,9.824,0.000000,4,quant_0.05,quant_0.05,quant_tossThisOne,quant_0.05
16,676937,1001,20191229.0,7.894,0.000000,4,quant_0.05,quant_0.05,quant_tossThisOne,quant_0.05
21,677513,1001,20191230.0,5.229,20.445000,4,quant_0.05,quant_0.95,quant_tossThisOne,quant_1x5Qtrs


In [None]:
start = time.time()



with multiprocessing.Pool() as pool:
    zipQuarterQuarts = pool.map(getZipQuarterQuarts, ZIPs)

    
    
weatherByZipQuarter = pd.concat(zipQuarterQuarts)


print(time.time() - start)

In [None]:
'''weatherByZipQuarter['yearQuarter'] = weatherByZipQuarter.date.astype('str').str.slice(0,4) + weatherByZipQuarter.quarter.astype('str')
weatherByZip['yearQuarter']        = weatherByZip.date.astype('str').str.slice(0,4) + weatherByZip.quarter.astype('str')'''

In [None]:
# Select the bin columns of the per-ZIP and per-ZIP-quarter results and
# pickle both selections.
# NOTE(review): `weatherByZip` and the 'yearQuarter' column are only created
# in commented-out code in this notebook, so this cell fails on a clean
# top-to-bottom run until they are restored.
pt1 = weatherByZip[['ZIP','date','quarter','yearQuarter',
              'temp_annualQuants',
              'precip_annualQuants',
              'temp_zipQuants', 
              'precip_zipQuants']]

pt2 = weatherByZipQuarter[['ZIP','date','quarter','yearQuarter',
                           'temp_zipQuarterQuants', 
                           'precip_zipQuarterQuants']]


outfile =  '../../data/companyData/pt1_allZips.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(pt1, pickle_file)
    
outfile =  '../../data/companyData/pt2_allZips.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(pt2, pickle_file)




In [None]:
allQuarts = pt1.merge(pt2)

In [None]:
allQuarts.head()

------------------

### Now go through the data from 2009 - 2019 and find time spent in each bin

In [None]:
def getCounts(quartType,df):
    """Count the days each (ZIP, yearQuarter) spent in every bin of a column.

    Parameters
    ----------
    quartType : str
        Name of a bin-label column, e.g. 'temp_annualQuants'. Its last six
        characters are stripped to form the prefix of the output columns.
    df : pandas.DataFrame
        Frame with 'ZIP', 'yearQuarter' and the ``quartType`` column.

    Returns
    -------
    pandas.DataFrame
        One row per (zip, yearQuarter) and one count column per bin label,
        named ``quartType[:-6] + label``. Combinations that never occur are
        NaN.
    """
    # One row per (ZIP, yearQuarter, bin); groups are already unique, so no
    # de-duplication is needed.
    counts = df.groupby(['ZIP','yearQuarter',quartType]).size().reset_index()
    counts.columns = ['zip', 'yearQuarter', quartType, 'occurrences']
    # Plain string labels keep the pivoted column index an ordinary Index
    # even when the bin column is categorical.
    counts[quartType] = counts[quartType].astype(str)

    countsPivot = counts.pivot(index=['zip','yearQuarter'],
            columns = quartType, values= 'occurrences').reset_index().\
            rename_axis(None, axis=1)

    # Prefix the bin columns by assigning a new column list. Writing into
    # `columns.values` edits the Index buffer behind pandas' back.
    prefix = quartType[:-6]
    countsPivot.columns = list(countsPivot.columns[:2]) + \
        [prefix + str(label) for label in countsPivot.columns[2:]]

    return(countsPivot)

In [None]:
allQuarts

In [None]:
countData = getCounts(allQuarts.columns[4],allQuarts)

for quart in allQuarts.columns[5:]:
    print(quart)
    countData = countData.merge(getCounts(quart,allQuarts))

In [None]:
print(countData.columns)

Reformat the column names so they're consistent.

In [None]:
# Split yearQuarter into its parts. The quarter is taken as the last
# character so that both a 'YYYYQ' and a 'YYYYqQ' style label work; a fixed
# slice(5,6) is empty for the five-character form.
countData['year'] = countData.yearQuarter.str.slice(0,4)
countData['qtr']  = countData.yearQuarter.str.slice(-1).astype('float')
countData['zipcode']  = countData['zip'].astype('int64')

In [None]:
countData.zipcode.min()

In [None]:
countData.head()

In [None]:
countData.to_csv("../../data/companyData/allWeatherBins_2009to2019_allZips.csv")

In [None]:
len(countData.zipcode.unique())

In [None]:
countData.columns

In [None]:
'''countData = pd.read_csv("../../data/companyData/allWeatherBins_2009to2019.csv").drop(columns = 'Unnamed: 0')

countData.head()'''

# Take an explicit copy of the key columns: new columns are added to this
# frame later, which on a plain slice triggers SettingWithCopyWarning.
countDataRevised = countData[['zipcode','year','qtr']].copy()


In [None]:

# Bin-label suffixes ordered from least to most extreme. For each cutoff,
# cdf lists every more extreme suffix, so adding those columns to the
# cutoff's own column gives "days at or beyond this cutoff".
cutoffOrder = ['0.95','1xQtr','1xYr','1x5Qtrs','1x10Qtrs','1x5Yrs','1x10Yrs','1.0']
cdf = {cutoff: cutoffOrder[pos + 1:] for pos, cutoff in enumerate(cutoffOrder[:-1])}


weatherVars = ['precip_','temp_','precip5Days_','temp5Days_']
statVars    = ['annualquant_','zipquant_','zipQuarterquant_']

# NOTE(review): the column names built here ('..._1xQtr', 'precip5Days_...',
# ...) do not match the labels in quant_labels at the top of the notebook;
# confirm that countData has these columns before running.
for weatherVar in weatherVars:
    for statVar in statVars:
        print(weatherVar + statVar, "*************************")

        # 0.95
        for cutoff in list(cdf.keys()):

            varHere = weatherVar + statVar + cutoff
            print(cutoff, "******")
            tailVars = [weatherVar + statVar + greaterThanCutoffs
                        for greaterThanCutoffs in cdf[cutoff]]
            for varCDF in tailVars:
                print(varCDF)

            # A missing count means zero days in that bin. Without fillna a
            # single NaN turned the whole cumulative count into NaN.
            countDataRevised[varHere] = countData[[varHere] + tailVars].fillna(0).sum(axis=1)
 

In [None]:
countDataRevised['temp_annualquant_0.95']

In [None]:
countData['temp_annualquant_0.95']

In [None]:
countDataRevised.to_csv("../../data/companyData/revised_allWeatherBins_2009to2019_allZips.csv")

In [None]:
countDataRevised = pd.read_csv("../../data/companyData/revised_allWeatherBins_2009to2019.csv")

In [None]:
countDataRevised['temp5Days_zipquant_0.95'].describe()