In [2]:
import pickle as pkl
import pandas as pd
import matplotlib
import os
import re

import collections
import datetime
import time

import matplotlib.pyplot as plt
import seaborn as sns
    
import geopandas as gpd

import numpy as np

import rasterio

from difflib import get_close_matches

from fuzzywuzzy import process
from fuzzywuzzy import fuzz

import dask.dataframe as dd

import gc
import sys

from multiprocessing import Pool
import multiprocessing

from dask.diagnostics import ProgressBar


# Sample

In [2]:
def getData(weatherType,yearRange1,yearRange2):

    if weatherType == "Tmax":
        weatherVar = "temperature"
    else: 
        weatherVar = "precipitation"
    
    year = yearRange1
    filename = "../../../../../../../Volumes/backup2/dissData/prism/zipcode" + weatherType + str(year) + ".csv"
    data = dd.read_csv(filename, assume_missing=True)[['ZIP','date',weatherVar]]
    data['ZIP'] = data.ZIP.astype('int64')

    
   
    years = range(yearRange1 + 1,yearRange2 + 1)
    for year in years:
        filename = "../../../../../../../Volumes/backup2/dissData/prism/zipcode" + weatherType + str(year) + ".csv"
        tempData = dd.read_csv(filename, assume_missing=True)[['ZIP','date',weatherVar]]
        tempData['ZIP'] = tempData.ZIP.astype('int64')

        # tempData = tempData[tempData.ZIP.isin(relevantZips)]
        # tempData = tempData[~(tempData[weatherVar].isna().compute())]
        data = data.append(tempData)

    data = data[~(data[weatherVar].isna())] # .compute()
    
    return(data)


I think only a few of these quantiles will be relevant. Let's look at 0, 0.9, 0.95, 1.0

In [15]:
quantiles = [0.0, 0.50, 0.95, 0.99, 1.0]

quant_labels = ['1','2','3','4']

# Precipitation

In [None]:
weatherType = "Precip" # Tmax
precipData = getData(weatherType, 1981, 1999)

In [None]:
precipData['quarter']    = dd.to_datetime(precipData['date'], format='%Y%m%d').dt.quarter

In [None]:
precip = precipData.compute(workers = 100)
precip.shape

In [None]:
precip.to_csv("../../data/companyData/allZipsPrecip_8199.csv")

In [None]:
precip = pd.read_csv("../../data/companyData/allZipsPrecip_8199.csv")
precip.precipitation.isna().sum()

## Temperature

In [None]:
weatherType = "Tmax"
tempData = getData(weatherType, 1981, 1999)

In [None]:
tempData['quarter']  = dd.to_datetime(tempData['date'], format='%Y%m%d').dt.quarter

In [None]:
temp = tempData.compute(workers = 100)
temp.shape

In [None]:
temp.to_csv("../../data/companyData/allZipsTemp_8199.csv")

In [None]:
del temp
gc.collect()

In [None]:
del tempData
gc.collect()

# Load all data
## Get stats

### Describe Climate

In [148]:
weatherData   = dd.read_csv("../../data/companyData/allZipsTemp_8199.csv").drop(columns = {'Unnamed: 0'})
precipData = dd.read_csv("../../data/companyData/allZipsPrecip_8199.csv").drop(columns = {'Unnamed: 0'})

with ProgressBar():
    precipData    = precipData.repartition(npartitions=225)
    weatherData   = weatherData.repartition(npartitions=225)

weatherData = weatherData.merge(precipData).compute(workers = 100)

In [149]:
print(weatherData.dtypes,sys.getsizeof(weatherData)/1e9)

weatherData.ZIP           = weatherData.ZIP.astype('int32')
weatherData.date          = weatherData.date.astype('int32')
weatherData.temperature   = weatherData.temperature.astype('float32')
weatherData.precipitation = weatherData.precipitation.astype('float32')


print(weatherData.dtypes)

sys.getsizeof(weatherData)/1e9

ZIP                int64
date             float64
temperature      float64
quarter            int64
precipitation    float64
dtype: object 10.876799264
ZIP                int32
date               int32
temperature      float32
quarter            int64
precipitation    float32
dtype: object


7.25119952

In [150]:
sum(weatherData.precipitation.isna())

0

In [None]:
start = time.time()

# tempData['temp5Days']     = tempData.groupby(by='ZIP').apply(lambda df_g: df_g['temperature'].rolling(5).mean(), meta=('temperature', 'f8'))
# precipData['precip5Days']    = precipData.groupby(by='ZIP').apply(lambda df_g: df_g['precipitation'].rolling(5).mean(), meta=('precipitation', 'f8'))
# precipData['precipQtrMean']  = precipData.groupby(by='quarter').apply(lambda df_g: df_g['precipitation'].mean(), meta=('precipitation', 'f8'))

print(time.time() - start)

In [152]:
quarterlyAvg = weatherData.groupby('quarter').mean().\
    reset_index()[['quarter','temperature','precipitation']].rename(columns = {'temperature': 'quarterly_avg_temp',
                                                                              'precipitation': 'quarterly_avg_precip'})

quarterlyVar = weatherData.groupby('quarter').var().\
    reset_index()[['quarter','temperature','precipitation']].rename(columns = {'temperature': 'quarterly_var_temp',
                                                                              'precipitation': 'quarterly_var_precip'})
print(quarterlyVar,quarterlyAvg)

quarterlyStatsOverall = quarterlyVar.merge(quarterlyAvg)

   quarter  quarterly_var_temp  quarterly_var_precip
0        1           91.269073             48.202000
1        2           56.208214             54.826199
2        3           29.719254             56.482014
3        4           91.051224             55.745884    quarter  quarterly_avg_temp  quarterly_avg_precip
0        1            8.354369              2.489144
1        2           22.756006              2.941178
2        3           28.211340              2.745266
3        4           13.034486              2.557915


In [154]:
quarterlyAvgByZip = weatherData.groupby(['ZIP','quarter']).mean().\
    reset_index()[['ZIP','quarter','temperature','precipitation']].rename(columns = {'temperature': 'quarterly_avg_temp',
                                                                              'precipitation': 'quarterly_avg_precip'})


quarterlyVarByZip = weatherData.groupby(['ZIP','quarter']).var().\
    reset_index()[['ZIP','quarter','temperature','precipitation']].rename(columns = {'temperature': 'quarterly_var_temp',
                                                                              'precipitation': 'quarterly_var_precip'})
print(quarterlyVarByZip,quarterlyAvgByZip)


quarterlyStatsByZip = quarterlyAvgByZip.merge(quarterlyVarByZip)

          ZIP  quarter  quarterly_var_temp  quarterly_var_precip
0        1001        1           42.544899             48.074295
1        1001        2           47.229797             75.400520
2        1001        3           21.485365             89.049629
3        1001        4           55.028980             74.808990
4        1002        1           41.621262             45.362576
...       ...      ...                 ...                   ...
130619  99402        4           51.216682              5.795041
130620  99403        1           30.013823              4.163934
130621  99403        2           39.442696              6.055612
130622  99403        3           33.110489              3.752725
130623  99403        4           54.520615              5.092979

[130624 rows x 4 columns]           ZIP  quarter  quarterly_avg_temp  quarterly_avg_precip
0        1001        1            4.321893              2.869608
1        1001        2           20.888279              3.41000

In [155]:
quarterlyStatsOverall.to_csv("../../data/companyData/quarterlyStatsOverall_allZips_8199.csv")
quarterlyStatsByZip.to_csv("../../data/companyData/quarterlyStatsByZip_allZips_8199.csv")

In [156]:
quarterlyStatsByZip.head()

Unnamed: 0,ZIP,quarter,quarterly_avg_temp,quarterly_avg_precip,quarterly_var_temp,quarterly_var_precip
0,1001,1,4.321893,2.869608,42.544899,48.074295
1,1001,2,20.888279,3.410002,47.229797,75.40052
2,1001,3,26.69635,3.358637,21.485365,89.049629
3,1001,4,10.687009,3.41844,55.02898,74.80899
4,1002,1,2.982886,2.856695,41.621262,45.362576


In [157]:
del quarterlyStatsByZip
del quarterlyStatsOverall
gc.collect()

148

### Find Quartiles

In [165]:
def getPivotQuantsOverall(weatherType,weatherData):

    quants_overall  =  weatherData[weatherType].quantile(q = quantiles).reset_index().\
        rename(columns = {'index': 'quartile'})    
    
    quants_overall['quartile'] = 'q_' + quants_overall['quartile'].astype(str).str.slice(0,6)
    quants_overall = quants_overall.iloc[:,1:].values.flatten().tolist()

    return(quants_overall)

In [166]:
quantiles
quant_labels

['1', '2', '3', '4']

In [167]:
def getPivotQuants(weatherType, identifiers,weatherData):
    start2 = time.time()
    quants_overallByZip   = weatherData.groupby(identifiers)[weatherType].quantile(q = quantiles).reset_index().\
        rename(columns=lambda x: re.sub('level_[0-9]$','quartile',x))
    # rename(columns = {'level_1': 'quartile'})
    # print(quants_overallByZip.head())
    
    print("got grouped")
    print(time.time() - start2)
    quants_overallByZip['quartile'] = 'q_' + quants_overallByZip['quartile'].astype(str).str.slice(0,6)

    ## 

    pivot_quants_overallByZip = quants_overallByZip.pivot(index=identifiers, 
            columns='quartile', values=str(weatherType)).reset_index().\
            rename_axis(None, axis=1)

    pivot_quants_overallByZip['quartileList']      = pivot_quants_overallByZip.iloc[:,len(identifiers):].values.tolist()
    pivot_quants_overallByZip['quartileLabelList'] = [list(pivot_quants_overallByZip.columns[2:-1])] * \
                                                                    pivot_quants_overallByZip.shape[0]

    pivot_quants_overallByZip = pivot_quants_overallByZip[pivot_quants_overallByZip.\
                                                            columns.drop(list(pivot_quants_overallByZip.filter(regex='q_')))]
    print(time.time() - start2)
    # pivot_quants_overallByZip.head()

    return(pivot_quants_overallByZip)

Now get all the quartile data:
    - Overall
    - By zip
    - Quarterly by zip

In [168]:
###################
# OVERALL
tempQuants   = getPivotQuantsOverall('temperature',weatherData)
precipQuants = getPivotQuantsOverall('precipitation',weatherData)

'''temp5DaysQuants   = getPivotQuantsOverall('temp5Days',weatherData)
precip5DaysQuants = getPivotQuantsOverall('precip5Days',weatherData)'''


"temp5DaysQuants   = getPivotQuantsOverall('temp5Days',weatherData)\nprecip5DaysQuants = getPivotQuantsOverall('precip5Days',weatherData)"

In [169]:
####################
# BY ZIP
start = time.time()


pivot_temperatureQuants_zip = getPivotQuants('temperature',['ZIP'],weatherData).\
    drop(columns = {'quartileLabelList'})
pivot_temperatureQuants_zip.rename(columns = {'quartileList':     'quartileListTemperature_zip'},
                                      inplace = True)

pivot_temperatureQuants_zip.to_csv('../../data/companyData/pivot_temperatureQuants_zip_8199.csv')

print("done with temperature")
print(time.time() - start)



pivot_precipitationQuants_zip = getPivotQuants('precipitation',['ZIP'],weatherData).\
    drop(columns = {'quartileLabelList'})
pivot_precipitationQuants_zip.rename(columns = {'quartileList':     'quartileListPrecipitation_zip'},
                                      inplace = True)

pivot_precipitationQuants_zip.to_csv('../../data/companyData/pivot_precipitationQuants_zip_8199.csv')

print("done with precipitation")
print(time.time() - start)

quants_zip = pivot_temperatureQuants_zip.merge(pivot_precipitationQuants_zip)
quants_zip.to_csv('../../data/companyData/quants_zip_8199.csv')


got grouped
429.86352920532227
430.0418531894684
done with temperature
430.21997332572937
got grouped
269.9787509441376
270.1444528102875
done with precipitation
700.500654220581


In [170]:
##########################################
# BY ZIP-QUARTER
start = time.time()
pivot_temperatureQuants_quarterlyByZip = getPivotQuants('temperature',['ZIP','quarter'],weatherData).\
    drop(columns = {'quartileLabelList'})
pivot_temperatureQuants_quarterlyByZip.rename(columns = {'quartileList':     'quartileListTemperature_quarterlyByZip'},
                                      inplace = True)
pivot_temperatureQuants_quarterlyByZip.to_csv('../../data/companyData/pivot_temperatureQuants_quarterlyByZip_8199.csv')

print("done with temperature")
print(time.time() - start)

pivot_precipitationQuants_quarterlyByZip = getPivotQuants('precipitation',['ZIP','quarter'],weatherData).\
    drop(columns = {'quartileLabelList'})
pivot_precipitationQuants_quarterlyByZip.rename(columns = {'quartileList':     'quartileListPrecipitation_quarterlyByZip'},
                                      inplace = True)
pivot_precipitationQuants_quarterlyByZip.to_csv('../../data/companyData/pivot_precipitationQuants_quarterlyByZip_8199.csv')


print("done with precipitation")
print(time.time() - start)

quants_quarterlyByZip = pivot_temperatureQuants_quarterlyByZip.merge(pivot_precipitationQuants_quarterlyByZip)
quants_quarterlyByZip.to_csv('../../data/companyData/quants_quarterlyByZip_8199.csv')



got grouped
462.33906292915344
463.4253659248352
done with temperature
464.2069938182831
got grouped
295.2009069919586
295.8981897830963
done with precipitation
760.6708028316498


Construct a record of all the relevant quantiles by combining all of the above. Rough idea is:
    - Start with the zip-quarter data
    - Merge in the less-specific quarter information
    - For each row, put in the overall quartile information for each row

In [171]:
quantsAll                 = quants_quarterlyByZip.merge(quants_zip)



quantsAll['precipitationQuants'] = [precipQuants for i in quantsAll.index]
quantsAll['temperatureQuants']   = [tempQuants   for i in quantsAll.index]



quantsAll.head()

Unnamed: 0,ZIP,quarter,quartileListTemperature_quarterlyByZip,quartileListPrecipitation_quarterlyByZip,quartileListTemperature_zip,quartileListPrecipitation_zip,precipitationQuants,temperatureQuants
0,1001,1,"[-16.440000534057617, 4.105000019073486, 15.20...","[0.0, 0.0, 16.808500385284397, 35.242360839843...","[-16.440000534057617, 16.34600067138672, 31.28...","[0.0, 0.0, 19.81119995117186, 40.8881196594238...","[0.0, 0.0, 15.678999900817871, 35.342998504638...","[-38.89500045776367, 19.79400062561035, 33.562..."
1,1001,2,"[-4.230000019073486, 21.31399917602539, 31.188...","[0.0, 0.0, 18.654600143432596, 41.514800415039...","[-16.440000534057617, 16.34600067138672, 31.28...","[0.0, 0.0, 19.81119995117186, 40.8881196594238...","[0.0, 0.0, 15.678999900817871, 35.342998504638...","[-38.89500045776367, 19.79400062561035, 33.562..."
2,1001,3,"[12.298999786376953, 27.302000045776367, 33.56...","[0.0, 0.0, 20.011549854278552, 44.116429595947...","[-16.440000534057617, 16.34600067138672, 31.28...","[0.0, 0.0, 19.81119995117186, 40.8881196594238...","[0.0, 0.0, 15.678999900817871, 35.342998504638...","[-38.89500045776367, 19.79400062561035, 33.562..."
3,1001,4,"[-13.265000343322754, 10.868000030517578, 22.7...","[0.0, 0.0, 21.844349861145, 41.527550010681146...","[-16.440000534057617, 16.34600067138672, 31.28...","[0.0, 0.0, 19.81119995117186, 40.8881196594238...","[0.0, 0.0, 15.678999900817871, 35.342998504638...","[-38.89500045776367, 19.79400062561035, 33.562..."
4,1002,1,"[-18.04599952697754, 2.7734999656677246, 13.81...","[0.0, 0.0, 17.24770002365111, 33.0675295639038...","[-18.04599952697754, 15.095000267028809, 30.31...","[0.0, 0.0, 19.22520065307615, 39.9705391693114...","[0.0, 0.0, 15.678999900817871, 35.342998504638...","[-38.89500045776367, 19.79400062561035, 33.562..."


In [172]:
# relevantZips = allCustomerData.zipcode.append(allSupplierData.zipcode).unique()
outfile =  '../../data/companyData/quantsAll_allZips_8199.pkl'
with open(outfile, 'wb') as pickle_file:
    pkl.dump(quantsAll, pickle_file)

# Conversion
### Convert 2000-2018 data into quartiles



MAKE SURE THAT BY THE END, WE HAVE NO DATE, NO QUARTER, JUST yyyq#.

In [None]:
weatherType = "Tmax" # Tmax
tempData = getData(weatherType, 2000, 2019)

tempData['quarter']    = dd.to_datetime(tempData['date'], format='%Y%m%d').dt.quarter

temp = tempData.compute(workers = 100)
print(temp.shape)


temp.to_csv("../../data/companyData/allZipsTemp_0019.csv")

In [None]:
del temp
del tempData
gc.collect()

In [None]:
weatherType = "Precip" # Tmax
precipData = getData(weatherType, 2000, 2019)

precipData['quarter']    = dd.to_datetime(precipData['date'], format='%Y%m%d').dt.quarter

precip = precipData.compute(workers = 100)
print(precip.shape)


precip.to_csv("../../data/companyData/allZipsPrecip_0019.csv")

In [None]:
del precip
del precipData
gc.collect()

In [4]:
file = open('../../data/companyData/quantsAll_allZips_8199.pkl','rb')
quantsAll = pkl.load(file)

In [5]:
recentDecadeWeather   = dd.read_csv("../../data/companyData/allZipsTemp_0019.csv").drop(columns = {'Unnamed: 0'}).reset_index(drop = True)
precipData = dd.read_csv("../../data/companyData/allZipsPrecip_0019.csv").drop(columns = {'Unnamed: 0'}).reset_index(drop = True)
with ProgressBar():
    precipData    = precipData.repartition(npartitions=225)
    recentDecadeWeather   = recentDecadeWeather.repartition(npartitions=225)


# tempData['precipitation'] = precipData.precipitation
recentDecadeWeather = recentDecadeWeather.merge(precipData).compute(workers = 100) # assign(precipitation=precipData.precipitation)
# recentDecadeWeather.head()

print(recentDecadeWeather.dtypes,sys.getsizeof(recentDecadeWeather)/1e9)

recentDecadeWeather.ZIP           = recentDecadeWeather.ZIP.astype('int32')
recentDecadeWeather.date          = recentDecadeWeather.date.astype('int32')
recentDecadeWeather.temperature   = recentDecadeWeather.temperature.astype('float32')
recentDecadeWeather.quarter       = recentDecadeWeather.quarter.astype('float32')
recentDecadeWeather.precipitation = recentDecadeWeather.precipitation.astype('float32')


print(recentDecadeWeather.dtypes)

sys.getsizeof(recentDecadeWeather)/1e9

ZIP                int64
date             float64
temperature      float64
quarter            int64
precipitation    float64
dtype: object 11.450499872
ZIP                int32
date               int32
temperature      float32
quarter          float32
precipitation    float32
dtype: object


6.679458272

In [6]:
quantsAll.head()

Unnamed: 0,ZIP,quarter,quartileListTemperature_quarterlyByZip,quartileListPrecipitation_quarterlyByZip,quartileListTemperature_zip,quartileListPrecipitation_zip,precipitationQuants,temperatureQuants
0,1001,1,"[-16.440000534057617, 4.105000019073486, 15.20...","[0.0, 0.0, 16.808500385284397, 35.242360839843...","[-16.440000534057617, 16.34600067138672, 31.28...","[0.0, 0.0, 19.81119995117186, 40.8881196594238...","[0.0, 0.0, 15.678999900817871, 35.342998504638...","[-38.89500045776367, 19.79400062561035, 33.562..."
1,1001,2,"[-4.230000019073486, 21.31399917602539, 31.188...","[0.0, 0.0, 18.654600143432596, 41.514800415039...","[-16.440000534057617, 16.34600067138672, 31.28...","[0.0, 0.0, 19.81119995117186, 40.8881196594238...","[0.0, 0.0, 15.678999900817871, 35.342998504638...","[-38.89500045776367, 19.79400062561035, 33.562..."
2,1001,3,"[12.298999786376953, 27.302000045776367, 33.56...","[0.0, 0.0, 20.011549854278552, 44.116429595947...","[-16.440000534057617, 16.34600067138672, 31.28...","[0.0, 0.0, 19.81119995117186, 40.8881196594238...","[0.0, 0.0, 15.678999900817871, 35.342998504638...","[-38.89500045776367, 19.79400062561035, 33.562..."
3,1001,4,"[-13.265000343322754, 10.868000030517578, 22.7...","[0.0, 0.0, 21.844349861145, 41.527550010681146...","[-16.440000534057617, 16.34600067138672, 31.28...","[0.0, 0.0, 19.81119995117186, 40.8881196594238...","[0.0, 0.0, 15.678999900817871, 35.342998504638...","[-38.89500045776367, 19.79400062561035, 33.562..."
4,1002,1,"[-18.04599952697754, 2.7734999656677246, 13.81...","[0.0, 0.0, 17.24770002365111, 33.0675295639038...","[-18.04599952697754, 15.095000267028809, 30.31...","[0.0, 0.0, 19.22520065307615, 39.9705391693114...","[0.0, 0.0, 15.678999900817871, 35.342998504638...","[-38.89500045776367, 19.79400062561035, 33.562..."


In [7]:
quantsAll.precipitationQuants[0]

[0.0, 0.0, 15.678999900817871, 35.34299850463867, 459.0589904785156]

## Annual Quants
Get the annual quants.

In [8]:
quantsAll.temperatureQuants[0][0]   = -1000   
quantsAll.precipitationQuants[0][0] = -1000

quantsAll.temperatureQuants[0][len(quantsAll.temperatureQuants[0])     - 1] = 1000
quantsAll.precipitationQuants[0][len(quantsAll.precipitationQuants[0]) - 1] = 1000

for i in range(0,len(quant_labels)):
    quantsAll.temperatureQuants[0][i]  = quantsAll.temperatureQuants[0][i] + i/10000
    
    quantsAll.precipitationQuants[0][i]     = quantsAll.precipitationQuants[0][i] + i/10000
    print(quantsAll.precipitationQuants[0][i])
    


-1000.0
0.0001
15.67919990081787
35.343298504638675


In [9]:
recentDecadeWeather.precipitation

0           0.486000
1           0.015000
2           0.096000
3           0.434000
4           0.000000
             ...    
1059446     0.000000
1059447     0.000000
1059448     0.000000
1059449    19.341999
1059450     0.000000
Name: precipitation, Length: 238552080, dtype: float32

In [10]:
annualQuants = recentDecadeWeather[['ZIP','date']]

annualQuants['temp_annualQuants'] = pd.cut(recentDecadeWeather.temperature, 
           bins   = quantsAll.temperatureQuants[0],
           labels = quant_labels,include_lowest=True)

annualQuants['precip_annualQuants'] = pd.cut(recentDecadeWeather.precipitation, 
           bins   = quantsAll.precipitationQuants[0],
           labels = quant_labels,include_lowest=True)


annualQuants.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0,ZIP,date,temp_annualQuants,precip_annualQuants
0,51530,20191112,1,2
1,53569,20191112,1,2
2,53901,20191112,1,2
3,54245,20191112,1,2
4,54476,20191112,1,1


In [11]:
annualQuants.temp_annualQuants

0          1
1          1
2          1
3          1
4          1
          ..
1059446    2
1059447    2
1059448    2
1059449    2
1059450    2
Name: temp_annualQuants, Length: 238552080, dtype: category
Categories (4, object): ['1' < '2' < '3' < '4']

In [12]:
np.sum(annualQuants.memory_usage(deep = True))/1e9

4.293938248

In [13]:
annualQuants.to_csv("../../data/companyData/annualQuants_allZips_0019.csv")

In [14]:
del annualQuants
gc.collect()

120

In [3]:
annualQuants = pd.read_csv("../../data/companyData/annualQuants_allZips_0019.csv").drop(columns = {'Unnamed: 0'})

In [4]:
start = time.time()


annualQuants['date'] = pd.to_datetime(annualQuants.date, format='%Y%m%d')


print(time.time() - start)


annualQuants['yearQuarter'] = annualQuants.date.dt.year.astype('str') + 'q' + annualQuants.date.dt.quarter.astype('str')
annualQuants.drop(columns = {'date'}, inplace = True)
annualQuants.insert(0, 'yearQuarter', annualQuants.pop('yearQuarter'))


print(time.time() - start)


annualQuants.to_csv("../../data/companyData/annualQuants_allZips_0019_noDate.csv")
print(time.time() - start)

5.601507902145386
396.81701588630676
743.8985450267792


## Zip Quants
Get the quants by zipcode.

Now do the same but for zips. For some reason the filtering operations are like 10x faster with making the ZIP and quarter out to be categories.

In [2]:
file = open('../../data/companyData/quantsAll_allZips_8199.pkl','rb')
quantsAll = pkl.load(file)
np.sum(quantsAll.memory_usage(deep = True))/1e9

0.090914304

In [3]:
print(quantsAll.columns)

quantsAll = quantsAll[quantsAll.quarter == 1]

quantsAll = quantsAll[['ZIP','quartileListTemperature_zip','quartileListPrecipitation_zip']]

np.sum(quantsAll.memory_usage(deep = True))/1e9

Index(['ZIP', 'quarter', 'quartileListTemperature_quarterlyByZip',
       'quartileListPrecipitation_quarterlyByZip',
       'quartileListTemperature_zip', 'quartileListPrecipitation_zip',
       'precipitationQuants', 'temperatureQuants'],
      dtype='object')


0.00783744

In [4]:
quantsAll.head()

Unnamed: 0,ZIP,quartileListTemperature_zip,quartileListPrecipitation_zip
0,1001,"[-16.440000534057617, 16.34600067138672, 31.28...","[0.0, 0.0, 19.81119995117186, 40.8881196594238..."
4,1002,"[-18.04599952697754, 15.095000267028809, 30.31...","[0.0, 0.0, 19.22520065307615, 39.9705391693114..."
8,1003,"[-17.95400047302246, 15.871999740600586, 30.98...","[0.0, 0.0, 19.300100326538075, 40.136859664916..."
12,1005,"[-18.68199920654297, 13.8100004196167, 28.6433...","[0.0, 0.0, 19.074000167846652, 38.257340774536..."
16,1007,"[-17.974000930786133, 15.494000434875488, 30.5...","[0.0, 0.0, 19.155999183654757, 40.110000534057..."


In [29]:
recentDecadeWeather   = dd.read_csv("../../data/companyData/allZipsTemp_0019.csv").drop(columns = {'Unnamed: 0'}).reset_index(drop = True)
precipData = dd.read_csv("../../data/companyData/allZipsPrecip_0019.csv").drop(columns = {'Unnamed: 0'}).reset_index(drop = True)
with ProgressBar():
    precipData    = precipData.repartition(npartitions=225)
    recentDecadeWeather   = recentDecadeWeather.repartition(npartitions=225)


# tempData['precipitation'] = precipData.precipitation
recentDecadeWeather = recentDecadeWeather.merge(precipData).compute(workers = 100) # assign(precipitation=precipData.precipitation)
# recentDecadeWeather = recentDecadeWeather.drop(columns = {'quarter'})

del precipData
gc.collect()


'''
print(recentDecadeWeather.dtypes,sys.getsizeof(recentDecadeWeather)/1e9)

recentDecadeWeather.ZIP           = recentDecadeWeather.ZIP.astype('int32')
recentDecadeWeather.date          = recentDecadeWeather.date.astype('int32')
recentDecadeWeather.temperature   = recentDecadeWeather.temperature.astype('float32')
recentDecadeWeather.precipitation = recentDecadeWeather.precipitation.astype('float32')


print(recentDecadeWeather.dtypes)

sys.getsizeof(recentDecadeWeather)/1e9
'''

"print(recentDecadeWeather.dtypes,sys.getsizeof(recentDecadeWeather)/1e9)\n\nrecentDecadeWeather.ZIP           = recentDecadeWeather.ZIP.astype('int32')\nrecentDecadeWeather.date          = recentDecadeWeather.date.astype('int32')\nrecentDecadeWeather.temperature   = recentDecadeWeather.temperature.astype('float32')\nrecentDecadeWeather.precipitation = recentDecadeWeather.precipitation.astype('float32')\n\n\nprint(recentDecadeWeather.dtypes)\n\nsys.getsizeof(recentDecadeWeather)/1e9"

In [30]:
def getZipQuarts(zipcode):
    
    tempData = recentDecadeQuarter[recentDecadeQuarter.ZIP == zipcode]# .compute(workers = 100) # reset_index()
    quantBins = quantsAll[quantsAll.ZIP == zipcode].reset_index()
    # print(tempData.head)
    # print(quantBins)
    if tempData.shape[0] > 0:

        quantBins.quartileListTemperature_zip[0][0]    = -50   
        quantBins.quartileListPrecipitation_zip[0][0]  = -50

        quantBins.quartileListTemperature_zip[0][-1]   = 1000
        quantBins.quartileListPrecipitation_zip[0][-1] = 1000

        for i in range(0,len(quantBins.quartileListPrecipitation_zip[0])):
                quantBins.quartileListPrecipitation_zip[0][i] = quantBins.quartileListPrecipitation_zip[0][i] + i/10000


        tempData['temp_zipQuants'] = pd.cut(tempData.temperature, 
               bins   = quantBins.quartileListTemperature_zip[0],
               labels = quant_labels,include_lowest=True)


        tempData['precip_zipQuants'] = pd.cut(tempData.precipitation, 
               bins   = quantBins.quartileListPrecipitation_zip[0],
               labels = quant_labels,include_lowest=True)        
        # print(tempData.head(), quantBins)
    
    return(tempData[['ZIP','date','temp_zipQuants','precip_zipQuants']])




Go through with this.

In [31]:
recentDecadeWeather = recentDecadeWeather.astype({'ZIP': 'category'})

quantsAll = quantsAll.astype({'ZIP': 'category'})

In [32]:
ZIPs = quantsAll.ZIP.unique()
len(ZIPs)

32656

In [33]:
start = time.time()
tempList = []


quarters = [1,2,3,4]
for quarter in quarters:
    print(quarter)
    recentDecadeQuarter = recentDecadeWeather[recentDecadeWeather.quarter == quarter]
    recentDecadeQuarter = recentDecadeQuarter.drop(columns = {'quarter'})
        
    with multiprocessing.Pool() as pool:
        zipQuarts = pool.map(getZipQuarts, ZIPs)
        pool.close()



    weatherByZip = pd.concat(zipQuarts)
    # weatherByZipQuarter['quarter'] = quarter
    tempList.append(weatherByZip)
    
    print(time.time() - start)

1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

139.9699878692627
2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

274.8180260658264
3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

410.3380010128021
4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

552.620197057724


In [None]:
# this crashed the comp everytime

'''start = time.time()

with multiprocessing.Pool() as pool:
    zipQuarts = pool.map(getZipQuarts, ZIPs)

print(time.time() - start)

quantsByZip = pd.concat(zipQuarts)

print(time.time() - start)

quantsByZip.head()'''

In [34]:
allWeatherByZip = pd.concat(tempList)

In [35]:
allWeatherByZip.to_csv("../../data/companyData/quantsByZip_allZips_0019.csv")

In [5]:
allWeatherByZip = pd.read_csv("../../data/companyData/quantsByZip_allZips_0019.csv").drop(columns = {'Unnamed: 0'})

In [6]:
allWeatherByZip.head()

Unnamed: 0,ZIP,date,temp_zipQuants,precip_zipQuants
0,1001,20000117.0,1,1
1,1001,20010222.0,1,1
2,1001,20030213.0,1,2
3,1001,20070328.0,2,1
4,1001,20110104.0,1,1


In [7]:
start = time.time()


allWeatherByZip['date'] = pd.to_datetime(allWeatherByZip.date, format='%Y%m%d')


print(time.time() - start)


allWeatherByZip['yearQuarter'] = allWeatherByZip.date.dt.year.astype('str') + 'q' + allWeatherByZip.date.dt.quarter.astype('str')
allWeatherByZip.drop(columns = {'date'}, inplace = True)
allWeatherByZip.insert(0, 'yearQuarter', allWeatherByZip.pop('yearQuarter'))


print(time.time() - start)


allWeatherByZip.to_csv("../../data/companyData/quantsByZip_allZips_0019_noDate.csv")
print(time.time() - start)

211.21769905090332
579.1642029285431
924.1530439853668


In [None]:
del quantsByZip
gc.collect()

## ZIP - Quarters
Now do this at a zip-quarter level.

In [16]:
recentDecadeWeather   = dd.read_csv("../../data/companyData/allZipsTemp_0019.csv").drop(columns = {'Unnamed: 0'}).reset_index(drop = True)
precipData = dd.read_csv("../../data/companyData/allZipsPrecip_0019.csv").drop(columns = {'Unnamed: 0'}).reset_index(drop = True)
with ProgressBar():
    precipData    = precipData.repartition(npartitions=225)
    recentDecadeWeather   = recentDecadeWeather.repartition(npartitions=225)


# tempData['precipitation'] = precipData.precipitation
recentDecadeWeather = recentDecadeWeather.merge(precipData).compute(workers = 100) # assign(precipitation=precipData.precipitation)


# recentDecadeWeather.head()

print(recentDecadeWeather.dtypes,sys.getsizeof(recentDecadeWeather)/1e9)

'''recentDecadeWeather.ZIP           = recentDecadeWeather.ZIP.astype('int32')
recentDecadeWeather.date          = recentDecadeWeather.date.astype('int32')
recentDecadeWeather.temperature   = recentDecadeWeather.temperature.astype('float32')
recentDecadeWeather.quarter       = recentDecadeWeather.quarter.astype('float32')
recentDecadeWeather.precipitation = recentDecadeWeather.precipitation.astype('float32')


print(recentDecadeWeather.dtypes)

sys.getsizeof(recentDecadeWeather)/1e9'''

ZIP                int64
date             float64
temperature      float64
quarter            int64
precipitation    float64
dtype: object 11.450499872


"recentDecadeWeather.ZIP           = recentDecadeWeather.ZIP.astype('int32')\nrecentDecadeWeather.date          = recentDecadeWeather.date.astype('int32')\nrecentDecadeWeather.temperature   = recentDecadeWeather.temperature.astype('float32')\nrecentDecadeWeather.quarter       = recentDecadeWeather.quarter.astype('float32')\nrecentDecadeWeather.precipitation = recentDecadeWeather.precipitation.astype('float32')\n\n\nprint(recentDecadeWeather.dtypes)\n\nsys.getsizeof(recentDecadeWeather)/1e9"

In [17]:
file = open('../../data/companyData/quantsAll_allZips_8199.pkl','rb')
quantsAll = pkl.load(file)
np.sum(quantsAll.memory_usage(deep = True))/1e9
print(quantsAll.columns)

Index(['ZIP', 'quarter', 'quartileListTemperature_quarterlyByZip',
       'quartileListPrecipitation_quarterlyByZip',
       'quartileListTemperature_zip', 'quartileListPrecipitation_zip',
       'precipitationQuants', 'temperatureQuants'],
      dtype='object')


In [18]:
recentDecadeWeather = recentDecadeWeather.astype({'ZIP': 'category',
                                                 'quarter': 'category'})
quantsAll = quantsAll.astype({'ZIP': 'category',
                                                 'quarter': 'category'})

In [19]:
ZIPs = quantsAll.ZIP.unique()
len(ZIPs)

32656

In [20]:
def getZipQuarterQuarts(zipcode):

    tempData = recentDecadeQuarter[(recentDecadeQuarter.ZIP  == zipcode)]

    quantBins = quantsQuarter[(quantsQuarter.ZIP == zipcode)].reset_index()


    if tempData.shape[0] > 0:
        
        quantBins.quartileListTemperature_quarterlyByZip[0][0]    = -50   
        quantBins.quartileListPrecipitation_quarterlyByZip[0][0]  = -50

        quantBins.quartileListTemperature_quarterlyByZip[0][-1]   = 1000
        quantBins.quartileListPrecipitation_quarterlyByZip[0][-1] = 1000

        for i in range(0,len(quantBins.quartileListPrecipitation_quarterlyByZip[0])):
            quantBins.quartileListPrecipitation_quarterlyByZip[0][i] = quantBins.quartileListPrecipitation_quarterlyByZip[0][i] + i/10000

        tempData['temp_zipQuarterQuants'] = pd.cut(tempData.temperature, 
               bins = quantBins.quartileListTemperature_quarterlyByZip[0],
               labels = quant_labels,include_lowest=True)


        tempData['precip_zipQuarterQuants'] = pd.cut(tempData.precipitation, 
               bins = quantBins.quartileListPrecipitation_quarterlyByZip[0],
               labels = quant_labels,include_lowest=True)

    return(tempData[['ZIP','date','temp_zipQuarterQuants','precip_zipQuarterQuants']])

In [21]:
start = time.time()
tempQuarterList = []


quarters = [1,2,3,4]
for quarter in quarters:
    print(quarter)
    recentDecadeQuarter = recentDecadeWeather[recentDecadeWeather.quarter == quarter]
    recentDecadeQuarter = recentDecadeQuarter.drop(columns = {'quarter'})
    
    
    
    quantsQuarter       = quantsAll[quantsAll.quarter == quarter][['ZIP','quartileListTemperature_quarterlyByZip','quartileListPrecipitation_quarterlyByZip']]
    
    
    
    with multiprocessing.Pool() as pool:
        zipQuarterQuarts = pool.map(getZipQuarterQuarts, ZIPs)
        pool.close()



    weatherByZipQuarter = pd.concat(zipQuarterQuarts)
    # weatherByZipQuarter['quarter'] = quarter
    tempQuarterList.append(weatherByZipQuarter)
    
    print(time.time() - start)

1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

141.10142970085144
2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

280.70473980903625
3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

424.99824690818787
4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

568.0929379463196


In [22]:
allWeatherByZipQuarter = pd.concat(tempQuarterList)

In [23]:
allWeatherByZipQuarter.head()

Unnamed: 0,ZIP,date,temp_zipQuarterQuants,precip_zipQuarterQuants
9455,1001,20000117.0,2,1
67891,1001,20010222.0,1,1
186256,1001,20030213.0,1,2
397578,1001,20070328.0,3,1
590931,1001,20110104.0,1,1


In [24]:
# allWeatherByZipQuarter.drop(columns = {'quarter'})

In [25]:
allWeatherByZipQuarter.head()

Unnamed: 0,ZIP,date,temp_zipQuarterQuants,precip_zipQuarterQuants
9455,1001,20000117.0,2,1
67891,1001,20010222.0,1,1
186256,1001,20030213.0,1,2
397578,1001,20070328.0,3,1
590931,1001,20110104.0,1,1


In [26]:
allWeatherByZipQuarter.to_csv("../../data/companyData/zipQuartsCompd_allZips_0019.csv")

In [27]:
start = time.time()

allWeatherByZipQuarter['date'] = pd.to_datetime(allWeatherByZipQuarter.date, format='%Y%m%d')

print(time.time() - start)

220.1553430557251


In [28]:
allWeatherByZipQuarter['date'] = pd.to_datetime(allWeatherByZipQuarter.date, format='%Y%m%d')

print(time.time() - start)

allWeatherByZipQuarter['yearQuarter'] = allWeatherByZipQuarter.date.dt.year.astype('str') + 'q' + allWeatherByZipQuarter.date.dt.quarter.astype('str')


allWeatherByZipQuarter.drop(columns = {'date'}, inplace = True)


allWeatherByZipQuarter.insert(0, 'yearQuarter', allWeatherByZipQuarter.pop('yearQuarter'))
print(time.time() - start)

allWeatherByZipQuarter.to_csv("../../data/companyData/zipQuartsCompd_allZips_0019_noDate.csv")
print(time.time() - start)

607.9552299976349
959.5053398609161


In [5]:
weather.head()

Unnamed: 0,yearQuarter,ZIP,temp_annualQuants,precip_annualQuants
0,2019q3,52626,2,1
1,2019q3,53048,1,1
2,2019q3,53105,1,1
3,2019q3,54443,1,1
4,2019q3,55014,2,1


In [15]:
del weather
gc.collect()

95

# See if we can put it all together
### Now go through the data from 2010 - 2019 and find time spent in each bin
We'll have to do this for separate quartiles because they altogether take too much memory. First, find the quartile that the weather for each day falls within, and save it.

In [8]:
def getCounts(quartType,df):

    test = df.groupby(['ZIP','yearQuarter',quartType]).size().reset_index()
    test = test.drop_duplicates()
    test.columns = ['zip', 'yearQuarter', quartType, 'occurrences']

    
    
    return(test)

In [9]:
start = time.time()

fileList = ["annualQuants_allZips",
            "zipQuartsCompd_allZips",
           "quantsByZip_allZips"]



countList = []



for file in fileList:
    print(file)
    
    filename = "../../data/companyData/" + file + "_0019_noDate.csv"
    allWeather = pd.read_csv(filename).drop(columns = {'Unnamed: 0'})
    print(allWeather.columns)
    print(allWeather.shape)
    print(time.time() - start)
    
    print("loaded")
    
    
    
    # print(allWeather.head())
    
    
    # countData = getCounts(allWeather.columns[2],allWeather)
    
    
    for quart in allWeather.columns[2:]:
        print(quart)
        
        countData = getCounts(quart,allWeather)
        
        outFileName = "../../data/companyData/" + file + quart + ".csv"
        countData.to_csv(outFileName)
        print("all done")
        
        del countData
        gc.collect()
        
    
    del allWeather
    gc.collect()
    print("all done 2")
    
    # countList.append(countData)
    
    
    
    

annualQuants_allZips
Index(['yearQuarter', 'ZIP', 'temp_annualQuants', 'precip_annualQuants'], dtype='object')
(238552080, 4)
72.05372405052185
loaded
temp_annualQuants
all done
precip_annualQuants
all done
all done 2
zipQuartsCompd_allZips
Index(['yearQuarter', 'ZIP', 'temp_zipQuarterQuants',
       'precip_zipQuarterQuants'],
      dtype='object')
(238552080, 4)
242.90518617630005
loaded
temp_zipQuarterQuants
all done
precip_zipQuarterQuants
all done
all done 2
quantsByZip_allZips
Index(['yearQuarter', 'ZIP', 'temp_zipQuants', 'precip_zipQuants'], dtype='object')
(238552080, 4)
404.1059031486511
loaded
temp_zipQuants
all done
precip_zipQuants
all done
all done 2


Next, find the number of days in each quartile, for each zip, for each quarter.

In [66]:
# allCounts = pd.read_csv("../../data/companyData/" + quartType + ".csv")

In [11]:
def getCounts2(quartType):
    test = pd.read_csv("../../data/companyData/" + quartType + ".csv").drop(columns = {'Unnamed: 0'})
    
    
    
    testPivot = test.pivot(index=['zip','yearQuarter'], 
            columns = quartType, values= 'occurrences').reset_index().\
            rename_axis(None, axis=1)
    
    testPivot = testPivot.fillna(0)
    testPivot.columns = testPivot.columns.astype(str)
    
    
    testPivot.columns.values[2:] = quartType[:-6] + testPivot.columns.values[2:]
    testPivot = testPivot.reset_index(drop=True)

    # print(testPivot.head())
    return(testPivot)

In [12]:
precip = getCounts2('precip_zipQuants')
temp   = getCounts2('temp_zipQuants')

print(precip.head(), temp.head())

    zip yearQuarter  precip_zip1  precip_zip2  precip_zip3  precip_zip4
0  1001      2000q1         64.0         21.0          5.0          1.0
1  1001      2000q2         48.0         35.0          5.0          3.0
2  1001      2000q3         53.0         34.0          2.0          3.0
3  1001      2000q4         68.0         19.0          4.0          1.0
4  1001      2001q1         55.0         30.0          3.0          2.0     zip yearQuarter  temp_zip1  temp_zip2  temp_zip3  temp_zip4
0  1001      2000q1       82.0        9.0        0.0        0.0
1  1001      2000q2       28.0       55.0        7.0        1.0
2  1001      2000q3        4.0       87.0        1.0        0.0
3  1001      2000q4       71.0       21.0        0.0        0.0
4  1001      2001q1       90.0        0.0        0.0        0.0


In [99]:
precip.merge(temp)

Unnamed: 0,zip,yearQuarter,precip_zip1,precip_zip2,precip_zip3,precip_zip4,temp_zip1,temp_zip2,temp_zip3,temp_zip4
0,1001,2000q1,64.0,21.0,5.0,1.0,82.0,9.0,0.0,0.0
1,1001,2000q2,48.0,35.0,5.0,3.0,28.0,55.0,7.0,1.0
2,1001,2000q3,53.0,34.0,2.0,3.0,4.0,87.0,1.0,0.0
3,1001,2000q4,68.0,19.0,4.0,1.0,71.0,21.0,0.0,0.0
4,1001,2001q1,55.0,30.0,3.0,2.0,90.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
2612475,99403,2018q4,45.0,47.0,0.0,0.0,68.0,24.0,0.0,0.0
2612476,99403,2019q1,41.0,49.0,0.0,0.0,88.0,2.0,0.0,0.0
2612477,99403,2019q2,53.0,38.0,0.0,0.0,21.0,70.0,0.0,0.0
2612478,99403,2019q3,67.0,25.0,0.0,0.0,2.0,78.0,10.0,2.0


In [13]:
quarts = ['precip_zipQuants',       'temp_zipQuants',
         'precip_zipQuarterQuants', 'temp_zipQuarterQuants',
         'precip_annualQuants',     'temp_annualQuants']

allCounts = pd.DataFrame()

for quart in quarts:
    print(quart)
    
    
    if quart == 'precip_zipQuants':
        allCounts = getCounts2(quart)
    
    else: allCounts = allCounts.merge(getCounts2(quart))

allCounts.head()


precip_zipQuants
temp_zipQuants
precip_zipQuarterQuants
temp_zipQuarterQuants
precip_annualQuants
temp_annualQuants


Unnamed: 0,zip,yearQuarter,precip_zip1,precip_zip2,precip_zip3,precip_zip4,temp_zip1,temp_zip2,temp_zip3,temp_zip4,...,temp_zipQuarter3,temp_zipQuarter4,precip_annual1,precip_annual2,precip_annual3,precip_annual4,temp_annual1,temp_annual2,temp_annual3,temp_annual4
0,1001,2000q1,64.0,21.0,5.0,1.0,82.0,9.0,0.0,0.0,...,12.0,0.0,76.0,12.0,3.0,0.0,91.0,0.0,0.0,0.0
1,1001,2000q2,48.0,35.0,5.0,3.0,28.0,55.0,7.0,1.0,...,7.0,1.0,68.0,17.0,5.0,1.0,70.0,21.0,0.0,0.0
2,1001,2000q3,53.0,34.0,2.0,3.0,4.0,87.0,1.0,0.0,...,0.0,0.0,78.0,10.0,2.0,2.0,40.0,52.0,0.0,0.0
3,1001,2000q4,68.0,19.0,4.0,1.0,71.0,21.0,0.0,0.0,...,4.0,2.0,79.0,12.0,1.0,0.0,90.0,2.0,0.0,0.0
4,1001,2001q1,55.0,30.0,3.0,2.0,90.0,0.0,0.0,0.0,...,0.0,0.0,78.0,9.0,3.0,0.0,90.0,0.0,0.0,0.0


In [14]:
sys.getsizeof(allCounts)/1e9

0.707982112

Reformat the column names so they're consistent.

In [15]:
allCounts['year']     = allCounts.yearQuarter.str.slice(0,4).astype('int32')
allCounts['qtr']      = allCounts.yearQuarter.str.slice(5,6).astype('int32')
allCounts['zipcode']  = allCounts.zip.astype('int32')

allCounts.drop(columns = {'zip'}, inplace = True)

In [16]:
allCounts.zipcode.min()

1001

In [17]:
allCounts.head()

Unnamed: 0,yearQuarter,precip_zip1,precip_zip2,precip_zip3,precip_zip4,temp_zip1,temp_zip2,temp_zip3,temp_zip4,precip_zipQuarter1,...,precip_annual2,precip_annual3,precip_annual4,temp_annual1,temp_annual2,temp_annual3,temp_annual4,year,qtr,zipcode
0,2000q1,64.0,21.0,5.0,1.0,82.0,9.0,0.0,0.0,64.0,...,12.0,3.0,0.0,91.0,0.0,0.0,0.0,2000,1,1001
1,2000q2,48.0,35.0,5.0,3.0,28.0,55.0,7.0,1.0,48.0,...,17.0,5.0,1.0,70.0,21.0,0.0,0.0,2000,2,1001
2,2000q3,53.0,34.0,2.0,3.0,4.0,87.0,1.0,0.0,53.0,...,10.0,2.0,2.0,40.0,52.0,0.0,0.0,2000,3,1001
3,2000q4,68.0,19.0,4.0,1.0,71.0,21.0,0.0,0.0,68.0,...,12.0,1.0,0.0,90.0,2.0,0.0,0.0,2000,4,1001
4,2001q1,55.0,30.0,3.0,2.0,90.0,0.0,0.0,0.0,55.0,...,9.0,3.0,0.0,90.0,0.0,0.0,0.0,2001,1,1001


In [18]:
allCounts.to_csv("../../data/companyData/allWeatherBins_2000to2019_allZips.csv")

In [19]:
del allCounts
gc.collect()

145

# Revise this
So we have all bins.

In [20]:
countData                = pd.read_csv("../../data/companyData/allWeatherBins_2000to2019_allZips.csv").\
    drop(columns = 'Unnamed: 0')
# countData.rename(columns = {'zip': 'zipcode'}, inplace = True)

countData['year']    = countData.yearQuarter.str.slice(0,4)
countData['quarter'] = countData.yearQuarter.str.slice(5,6)

countDataRevised = countData[['zipcode','year','quarter']]

countDataRevised.head() 


Unnamed: 0,zipcode,year,quarter
0,1001,2000,1
1,1001,2000,2
2,1001,2000,3
3,1001,2000,4
4,1001,2001,1


In [21]:
quantiles = [0.0, 0.50, 0.95, 0.99, 1.0]

quant_labels = ['1','2','3','4']

In [22]:
countData.head()

Unnamed: 0,yearQuarter,precip_zip1,precip_zip2,precip_zip3,precip_zip4,temp_zip1,temp_zip2,temp_zip3,temp_zip4,precip_zipQuarter1,...,precip_annual3,precip_annual4,temp_annual1,temp_annual2,temp_annual3,temp_annual4,year,qtr,zipcode,quarter
0,2000q1,64.0,21.0,5.0,1.0,82.0,9.0,0.0,0.0,64.0,...,3.0,0.0,91.0,0.0,0.0,0.0,2000,1,1001,1
1,2000q2,48.0,35.0,5.0,3.0,28.0,55.0,7.0,1.0,48.0,...,5.0,1.0,70.0,21.0,0.0,0.0,2000,2,1001,2
2,2000q3,53.0,34.0,2.0,3.0,4.0,87.0,1.0,0.0,53.0,...,2.0,2.0,40.0,52.0,0.0,0.0,2000,3,1001,3
3,2000q4,68.0,19.0,4.0,1.0,71.0,21.0,0.0,0.0,68.0,...,1.0,0.0,90.0,2.0,0.0,0.0,2000,4,1001,4
4,2001q1,55.0,30.0,3.0,2.0,90.0,0.0,0.0,0.0,55.0,...,3.0,0.0,90.0,0.0,0.0,0.0,2001,1,1001,1


In [23]:
cdf = {}


cdf['2']   = ['2','3','4']
cdf['3']   = ['3','4']
cdf['4']   = ['4']


weatherVars = ['precip_','temp_']
statVars    = ['annual','zip','zipQuarter']


for weatherVar in weatherVars:
    for statVar in statVars:
        print(weatherVar + statVar, "*************************")
        
        
        for cutoff in list(cdf.keys()):
            varHere = weatherVar + statVar + cutoff
            print(varHere)
            if cutoff == '2':
                varHereRev = weatherVar + statVar + '_50'
            if (cutoff == '3'):
                varHereRev = weatherVar + statVar + '_95'
            if cutoff == '4':
                varHereRev = weatherVar + statVar + '_99'
       
            
            countDataRevised[varHereRev] = 0 
            
            
            print(varHereRev, "******")
            for greaterThanCutoffs in cdf[cutoff]:
                varCDF = weatherVar + statVar + greaterThanCutoffs 
                countDataRevised[varHereRev] = countDataRevised[varHereRev] + countData[varCDF] 
                print(weatherVar + statVar + greaterThanCutoffs)
 

precip_annual *************************
precip_annual2
precip_annual_50 ******
precip_annual2
precip_annual3
precip_annual4
precip_annual3
precip_annual_95 ******
precip_annual3
precip_annual4
precip_annual4
precip_annual_99 ******
precip_annual4
precip_zip *************************
precip_zip2
precip_zip_50 ******
precip_zip2
precip_zip3
precip_zip4
precip_zip3
precip_zip_95 ******
precip_zip3
precip_zip4
precip_zip4
precip_zip_99 ******
precip_zip4
precip_zipQuarter *************************
precip_zipQuarter2
precip_zipQuarter_50 ******
precip_zipQuarter2
precip_zipQuarter3
precip_zipQuarter4
precip_zipQuarter3
precip_zipQuarter_95 ******
precip_zipQuarter3
precip_zipQuarter4
precip_zipQuarter4
precip_zipQuarter_99 ******
precip_zipQuarter4
temp_annual *************************
temp_annual2
temp_annual_50 ******
temp_annual2
temp_annual3
temp_annual4
temp_annual3
temp_annual_95 ******
temp_annual3
temp_annual4
temp_annual4
temp_annual_99 ******
temp_annual4
temp_zip ****************

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [24]:
countDataRevised.head()

Unnamed: 0,zipcode,year,quarter,precip_annual_50,precip_annual_95,precip_annual_99,precip_zip_50,precip_zip_95,precip_zip_99,precip_zipQuarter_50,...,precip_zipQuarter_99,temp_annual_50,temp_annual_95,temp_annual_99,temp_zip_50,temp_zip_95,temp_zip_99,temp_zipQuarter_50,temp_zipQuarter_95,temp_zipQuarter_99
0,1001,2000,1,15.0,3.0,0.0,27.0,6.0,1.0,27.0,...,1.0,0.0,0.0,0.0,9.0,0.0,0.0,51.0,12.0,0.0
1,1001,2000,2,23.0,6.0,1.0,43.0,8.0,3.0,43.0,...,3.0,21.0,0.0,0.0,63.0,8.0,1.0,39.0,8.0,1.0
2,1001,2000,3,14.0,4.0,2.0,39.0,5.0,3.0,39.0,...,3.0,52.0,0.0,0.0,88.0,1.0,0.0,30.0,0.0,0.0
3,1001,2000,4,13.0,1.0,0.0,24.0,5.0,1.0,24.0,...,0.0,2.0,0.0,0.0,21.0,0.0,0.0,38.0,6.0,2.0
4,1001,2001,1,12.0,3.0,0.0,35.0,5.0,2.0,35.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0,0.0,0.0


In [25]:
len(countDataRevised.zipcode.unique())

32656

In [26]:
countDataRevised.to_csv("../../data/companyData/revised_allWeatherBins_2000to2019_allZips.csv")