In [None]:
import pickle as pkl
import pandas as pd
import matplotlib
import os
import re

import collections
import datetime
import time
 
import geopandas as gpd

import numpy as np

import rasterio

from difflib import get_close_matches

from fuzzywuzzy import process
from fuzzywuzzy import fuzz

import dask.dataframe as dd

import gc
import sys


# Sample

In [None]:
def getData(weatherType,yearRange1,yearRange2):
    """Load daily zip-code weather for a span of years, restricted to the relevant ZIPs.

    Parameters
    ----------
    weatherType : str
        Tag used in the CSV file names. "Tmax" selects the ``temperature``
        column; any other value selects ``precipitation``.
    yearRange1, yearRange2 : int
        First and last year to load (both inclusive). One CSV is read per
        year; the file for ``yearRange1`` is always read.

    Returns
    -------
    pandas.DataFrame
        Columns ``ZIP``, ``date`` and the selected weather column, limited to
        ZIPs listed in ``relevantZips.pkl`` and with missing readings removed.
    """
    if weatherType == "Tmax":
        weatherVar = "temperature"
    else:
        weatherVar = "precipitation"

    callLocation = '../../data/companyData/relevantZips.pkl'

    # NOTE(review): pickle.load can execute code embedded in the file; only
    # point this at a trusted relevantZips.pkl.
    with open(callLocation,"rb") as file:
        relevantZips = list(pkl.load(file))

    # yearRange1 is always loaded, matching the original behaviour even when
    # yearRange2 < yearRange1.
    years = [yearRange1] + list(range(yearRange1 + 1, yearRange2 + 1))

    yearlyFrames = []
    for year in years:
        filename = "../../../../../../../Volumes/backup2/dissData/prism/zipcode" + weatherType + str(year) + ".csv"
        yearData = dd.read_csv(filename, assume_missing=True)[['ZIP','date',weatherVar]]
        yearlyFrames.append(yearData[yearData.ZIP.isin(relevantZips)])

    # Concatenate lazily once (DataFrame.append no longer exists), then drop
    # missing readings and materialise as a pandas frame.
    data = dd.concat(yearlyFrames)
    data = data[~(data[weatherVar].isna())].compute()

    return(data)


In [None]:
# Fine quantile grid in steps of 1/numberQuants, for a first look before the
# list is narrowed down in a later cell. The definitions must be live: the
# print below (and the following cell) fail on a fresh kernel otherwise.
numberQuants    = 20

quantiles       = np.arange(0.0, 1.05, 1/numberQuants)

# Each label is 'quant_' plus the first four characters of the quantile value.
quant_labels   = ['quant_' + str(n)[0:4] for n in quantiles]

print(quant_labels)

In [None]:
# Number of quantile cut points; requires `quantiles` to already exist in the kernel.
len(quantiles)

I think only a few of these quantiles will be relevant. Let's look at 0, 0.05, 0.9, 0.95, and 1.0.

In [None]:
# Five cut points delimit four bins. When quant_labels is passed to pd.cut
# alongside the matching quantile values, each label names one bin in order;
# 'quant_tossThisOne' is the 0.05-0.90 middle bin.
quantiles = [
    0.0,
    0.05,
    0.90,
    0.95,
    1.0,
]

quant_labels = [
    'quant_0.05',
    'quant_tossThisOne',
    'quant_0.95',
    'quant_1.0',
]

# Precipitation

In [None]:
# Historical baseline (1981-2009 inclusive). Any weatherType other than "Tmax"
# makes getData select the precipitation column.
weatherType = "Precip" # Tmax
precipData = getData(weatherType, 1981, 2009)

In [None]:
# Size reported by sys.getsizeof for precipData, in units of 1e6 bytes.
print(sys.getsizeof(precipData)/1e6)

## Temperature

In [None]:
# Historical baseline (1981-2009 inclusive); "Tmax" makes getData select the
# temperature column.
weatherType = "Tmax"
tempData = getData(weatherType, 1981, 2009)

In [None]:
# Size reported by sys.getsizeof for tempData, in units of 1e6 bytes.
print(sys.getsizeof(tempData)/1e6)

## Get stats

### Describe Climate

In [None]:
# Join temperature and precipitation on their shared columns, then derive
# calendar fields from the date's text form (characters 0-3 year, 4-5 month).
weatherData = tempData.merge(precipData)

dateText = weatherData.date.astype('str')
weatherData['month'] = dateText.str.slice(4,6)
weatherData['year'] = dateText.str.slice(0,4)

# Months 04-12 map to q2-q4; anything else falls back to 'q1'.
quarterByMonth = {
    '04': 'q2', '05': 'q2', '06': 'q2',
    '07': 'q3', '08': 'q3', '09': 'q3',
    '10': 'q4', '11': 'q4', '12': 'q4',
}
weatherData['quarter'] = weatherData['month'].map(quarterByMonth).fillna('q1')

# Compare row counts before and after the merge.
print(weatherData.shape,tempData.shape,precipData.shape)

weatherData.head()

Get the quarterly stats.

In [None]:
# Shared aggregation spec: mean, median and variance of each weather variable.
quarterlyAggSpec = {
    'quarterly_avg_precip':      ('precipitation', "mean"),
    'quarterly_median_precip':   ('precipitation', "median"),
    'quarterly_variance_precip': ('precipitation', "var"),
    'quarterly_avg_temp':        ('temperature', "mean"),
    'quarterly_median_temp':     ('temperature', "median"),
    'quarterly_variance_temp':   ('temperature', "var"),
}

# Pooled over every ZIP, one row per quarter.
quarterlyStatsOverall = (
    weatherData.drop(columns = {'date'})
    .groupby(['quarter'])
    .agg(**quarterlyAggSpec)
    .reset_index()
)

# One row per (ZIP, quarter).
quarterlyStatsByZip = (
    weatherData.drop(columns = {'date'})
    .groupby(['ZIP','quarter'])
    .agg(**quarterlyAggSpec)
    .reset_index()
)

'''precipQuantsOverall = precipData.groupby(['month']).precipitation.quantile(q = quantiles).\
    reset_index().rename(columns = {'level_1': 'quantile'})'''


In [None]:
# Persist the per-ZIP quarterly statistics; to_csv's default also writes the index column.
quarterlyStatsByZip.to_csv("../../data/companyData/quarterlyStatsByZip.csv")

And the annual ones.

In [None]:
# Shared aggregation spec: mean, median and variance of each weather variable.
annualAggSpec = {
    'annual_avg_precip':      ('precipitation', "mean"),
    'annual_median_precip':   ('precipitation', "median"),
    'annual_variance_precip': ('precipitation', "var"),
    'annual_avg_temp':        ('temperature', "mean"),
    'annual_median_temp':     ('temperature', "median"),
    'annual_variance_temp':   ('temperature', "var"),
}

# Pooled over every ZIP, one row per year.
annualStatsOverall = (
    weatherData.drop(columns = {'date'})
    .groupby(['year'])
    .agg(**annualAggSpec)
    .reset_index()
)

# One row per (ZIP, year).
annualStatsByZip = (
    weatherData.drop(columns = {'date'})
    .groupby(['ZIP','year'])
    .agg(**annualAggSpec)
    .reset_index()
)
annualStatsByZip.head()

In [None]:
# Preview the per-ZIP quarterly statistics.
quarterlyStatsByZip.head()

### Find Quartiles

In [None]:
# NOTE(review): getPivotQuantsOverall is defined in the cell after this one, so
# on a fresh kernel that cell has to be run first or this raises NameError.
quants_overall = getPivotQuantsOverall('temperature',weatherData)

quants_overall

In [None]:
def getPivotQuantsOverall(weatherType,weatherData,quants=None):
    """Return the pooled quantile values of one weather column as a plain list.

    Parameters
    ----------
    weatherType : str
        Name of the column in ``weatherData`` to summarise.
    weatherData : pandas.DataFrame
        Frame containing that column.
    quants : sequence of float, optional
        Quantile levels to evaluate. Defaults to the notebook-level
        ``quantiles`` list, which is what the original implementation read.

    Returns
    -------
    list of float
        One value per requested level, in the order given.
    """
    if quants is None:
        # Backward-compatible default: the cut points defined in the notebook.
        quants = quantiles

    # The original also built 'q_…' labels and then discarded them; only the
    # values were ever returned.
    return(weatherData[weatherType].quantile(q = quants).tolist())

In [None]:
def getPivotQuants(weatherType, identifiers,weatherData,quants=None):
    """Compute per-group quantiles of one weather column, one row per group.

    Parameters
    ----------
    weatherType : str
        Name of the column in ``weatherData`` to summarise.
    identifiers : str or list of str
        Grouping column(s), e.g. ``['ZIP', 'month']``.
    weatherData : pandas.DataFrame
        Frame containing the grouping columns and ``weatherType``.
    quants : sequence of float, optional
        Quantile levels to evaluate. Defaults to the notebook-level
        ``quantiles`` list, which is what the original implementation read.

    Returns
    -------
    pandas.DataFrame
        The identifier columns plus ``quartileList`` (the group's quantile
        values) and ``quartileLabelList`` (the matching ``'q_<level>'`` names,
        one per value, identical on every row).
    """
    if quants is None:
        # Backward-compatible default: the cut points defined in the notebook.
        quants = quantiles

    # Normalise so len(identifiers) counts columns, not characters of a string.
    if isinstance(identifiers, str):
        identifiers = [identifiers]
    else:
        identifiers = list(identifiers)

    # Long form: one row per (group, level). reset_index names the unnamed
    # quantile level 'level_<n>', which is renamed to 'quartile'.
    quantsLong = weatherData.groupby(identifiers)[weatherType].quantile(q = quants).reset_index().\
        rename(columns=lambda x: re.sub('level_[0-9]$','quartile',x))

    # Labels keep only the first four characters of the level, e.g. 'q_0.95'.
    quantsLong['quartile'] = 'q_' + quantsLong['quartile'].astype(str).str.slice(0,4)

    # Wide form: one column per quantile label.
    pivoted = quantsLong.pivot(index=identifiers,
            columns='quartile', values=str(weatherType)).reset_index().\
            rename_axis(None, axis=1)

    # Every column after the identifiers is a quantile column. The original
    # sliced labels with a fixed offset of 2, which dropped the first label
    # whenever a single identifier was used.
    quartileColumns = list(pivoted.columns[len(identifiers):])

    result = pivoted[identifiers].copy()
    result['quartileList']      = pivoted[quartileColumns].values.tolist()
    result['quartileLabelList'] = [list(quartileColumns) for _ in range(len(result))]

    return(result)

In [None]:
def getQuartCounts(df, weatherType):
    """Count rows per (ZIP, year-quarter, bin label).

    Adds ``month``, ``quarter`` and ``yearQuarter`` columns to ``df`` in place
    (derived from the text form of ``df.date``), then counts rows for each
    combination of ``ZIP``, ``yearQuarter`` and the ``<weatherType>Cutlabels``
    column, which must already exist in ``df``.

    Returns a frame with columns ``zip``, ``yearQuarter``,
    ``<weatherType>Cutlabels`` and ``<weatherType>Occurrences``.
    """
    labelColumn = weatherType + "Cut" + 'labels'
    countColumn = weatherType + "Occurrences"

    dateText = df.date.astype(str)
    df['month'] = dateText.str.slice(4,6)

    # Months 04-12 map to q2-q4; anything else falls back to 'q1'.
    quarterByMonth = {
        '04': 'q2', '05': 'q2', '06': 'q2',
        '07': 'q3', '08': 'q3', '09': 'q3',
        '10': 'q4', '11': 'q4', '12': 'q4',
    }
    df['quarter'] = df['month'].map(quarterByMonth).fillna('q1')

    df['yearQuarter'] = dateText.str.slice(0,4) + df.quarter

    summaryDF = df.groupby(['ZIP','yearQuarter',labelColumn]).size().reset_index()
    summaryDF.columns = ['zip','yearQuarter',labelColumn,countColumn]

    return(summaryDF)

### Convert 2009-2018 data into quartiles

In [None]:
# Load the 2009-2018 daily series that the following cells merge and bin.
# These must actually run: the next cell reads both names and fails on a
# fresh kernel otherwise. getData reads one CSV per year for each variable.
recentDecadeTmax   = getData("Tmax",2009,2018)
recentDecadePrecip = getData("Precip",2009,2018)


In [None]:
# Join recent temperature and precipitation on the columns the two frames share.
recentDecadeWeather = recentDecadeTmax.merge(recentDecadePrecip)
recentDecadeWeather.head()

In [None]:
# Derive calendar fields from the date's text form (characters 0-3 year, 4-5 month).
recentDateText = recentDecadeWeather.date.astype(str)
recentDecadeWeather['month'] = recentDateText.str.slice(4,6)

# Months 04-12 map to q2-q4; anything else falls back to 'q1'.
recentQuarterByMonth = {
    '04': 'q2', '05': 'q2', '06': 'q2',
    '07': 'q3', '08': 'q3', '09': 'q3',
    '10': 'q4', '11': 'q4', '12': 'q4',
}
recentDecadeWeather['quarter'] = recentDecadeWeather['month'].map(recentQuarterByMonth).fillna('q1')

recentDecadeWeather['yearQuarter'] = recentDateText.str.slice(0,4) + recentDecadeWeather.quarter
recentDecadeWeather.head()

In [None]:
# Spot-check: rows tagged as the first quarter of 2009.
recentDecadeWeather[recentDecadeWeather.yearQuarter == '2009q1']

Now get all the quartile data:

- Overall
- Monthly
- By zip
- Monthly by zip

In [None]:
# Pooled quantile values for each variable.
tempQuants   = getPivotQuantsOverall('temperature',weatherData)
precipQuants = getPivotQuantsOverall('precipitation',weatherData)

# ---- By month ----
pivot_temperatureQuants_monthly = (
    getPivotQuants('temperature',['month'],weatherData)
    .drop(columns = {'quartileLabelList'})
    .rename(columns = {'quartileList':     'quartileListTemperature_monthly'})
)

# The precipitation table keeps quartileLabelList here; it is dropped after the merge.
pivot_precipitationQuants_monthly = (
    getPivotQuants('precipitation',['month'],weatherData)
    .rename(columns = {'quartileList':     'quartileListPrecipitation_monthly'})
)

quants_monthly = (
    pivot_temperatureQuants_monthly
    .merge(pivot_precipitationQuants_monthly)
    .drop(columns = {'quartileLabelList'})
)

# ---- By ZIP ----
pivot_temperatureQuants_zip = (
    getPivotQuants('temperature',['ZIP'],weatherData)
    .drop(columns = {'quartileLabelList'})
    .rename(columns = {'quartileList':     'quartileListTemperature_zip'})
)

pivot_precipitationQuants_zip = (
    getPivotQuants('precipitation',['ZIP'],weatherData)
    .drop(columns = {'quartileLabelList'})
    .rename(columns = {'quartileList':     'quartileListPrecipitation_zip'})
)

quants_zip = pivot_temperatureQuants_zip.merge(pivot_precipitationQuants_zip)

# ---- By ZIP and month ----
pivot_temperatureQuants_monthlyByZip = (
    getPivotQuants('temperature',['ZIP','month'],weatherData)
    .drop(columns = {'quartileLabelList'})
    .rename(columns = {'quartileList':     'quartileListTemperature_monthlyByZip'})
)

pivot_precipitationQuants_monthlyByZip = (
    getPivotQuants('precipitation',['ZIP','month'],weatherData)
    .drop(columns = {'quartileLabelList'})
    .rename(columns = {'quartileList':     'quartileListPrecipitation_monthlyByZip'})
)

quants_monthlyByZip = pivot_temperatureQuants_monthlyByZip.merge(pivot_precipitationQuants_monthlyByZip)



In [None]:
# Combine the three grouped quantile tables on the columns they share.
quantsAll = quants_monthlyByZip.merge(quants_zip).merge(quants_monthly)

# Attach the pooled quantile lists to every row (each row holds the same list).
quantsAll['precipitationQuants'] = [precipQuants] * len(quantsAll.index)
quantsAll['temperatureQuants']   = [tempQuants] * len(quantsAll.index)

quantsAll.head()

Merge the intervals into the recent weather data.

In [None]:
# Attach the quantile lists to each recent observation; merge joins on the
# columns the two frames have in common.
recentDecadeWeather = recentDecadeWeather.merge(quantsAll)

recentDecadeWeather.head()

In [None]:
# The pooled quantile lists were stored on every row of quantsAll under
# 'temperatureQuants' / 'precipitationQuants' (not 'tempQuants' /
# 'precipQuants', which are not columns), so any row supplies the bin edges.
temperatureBins   = recentDecadeWeather['temperatureQuants'].iloc[0]
precipitationBins = recentDecadeWeather['precipitationQuants'].iloc[0]

recentDecadeWeather['temp_annualQuants'] = pd.cut(recentDecadeWeather.temperature,
           bins = temperatureBins,
           labels = quant_labels)

# Precipitation skips the lowest edge and its label.
# NOTE(review): presumably because the lowest precipitation quantiles coincide,
# which pd.cut rejects as duplicate edges - confirm.
recentDecadeWeather['precip_annualQuants'] = pd.cut(recentDecadeWeather.precipitation,
           bins = precipitationBins[1:],
           labels = quant_labels[1:])

recentDecadeWeather.head()

In [None]:
# Bin each month's observations against that month's historical quantiles.
# Chunks are collected in a list and concatenated once (DataFrame.append no
# longer exists), and the loop variable no longer overwrites the historical
# `tempData` frame loaded earlier.
monthlyChunks = []

# sort=False visits months in order of first appearance.
for month, monthData in recentDecadeWeather.groupby('month', sort=False):
    print(month)
    monthData = monthData.copy()

    # Every row of a month carries the same bin list, so the first row is enough.
    monthData['temp_monthlyQuants'] = pd.cut(monthData.temperature,
           bins = monthData['quartileListTemperature_monthly'].iloc[0],
           labels = quant_labels)

    # Precipitation skips the lowest edge and its label, as in the annual bins.
    monthData['precip_monthlyQuants'] = pd.cut(monthData.precipitation,
           bins = monthData['quartileListPrecipitation_monthly'].iloc[0][1:],
           labels = quant_labels[1:])

    monthlyChunks.append(monthData)

# ignore_index gives a clean RangeIndex instead of a leftover 'index' column.
weatherByMonth = pd.concat(monthlyChunks, ignore_index=True) if monthlyChunks else pd.DataFrame()

In [None]:
def getQuarts(row, weatherType=None, weatherCutLabels=None):
    """Assign one row's weather value to a bin and store the bin's label.

    Parameters
    ----------
    row : pandas.Series
        Must contain the value under ``weatherType``, the bin edges under
        ``quartileList`` and the labels under ``quartileLabelList``.
    weatherType : str, optional
        Column holding the value to bin. When omitted, the notebook-level
        ``weatherType`` is used (the original one-argument behaviour).
    weatherCutLabels : str, optional
        Name of the output field. When omitted, a notebook-level
        ``weatherCutLabels`` is used if present, else ``<weatherType>Cutlabels``.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with the label added (NaN if the value lies outside
        the bins).
    """
    scope = globals()
    if weatherType is None:
        if 'weatherType' not in scope:
            raise NameError("name 'weatherType' is not defined")
        weatherType = scope['weatherType']
    if weatherCutLabels is None:
        weatherCutLabels = scope.get('weatherCutLabels', weatherType + "Cut" + 'labels')

    bins   = list(row['quartileList'])
    labels = list(row['quartileLabelList'])

    # pd.cut needs one label per bin (one fewer than edges). If there is one
    # label per edge instead, name each bin after its upper edge.
    if len(labels) == len(bins):
        labels = labels[1:]

    # Work on a copy: mutating the object handed over by DataFrame.apply is unsupported.
    row = row.copy()
    row[weatherCutLabels] = pd.cut([row[weatherType]],
           bins = bins,
           labels = labels)[0]
    return(row)


def getQuartType(data, weatherType):
    """Apply ``getQuarts`` to every row, adding a ``<weatherType>Cutlabels`` column.

    The column names are passed to ``getQuarts`` explicitly; previously they
    were locals of this function that ``getQuarts`` could not see.
    """
    weatherCutLabels = weatherType + "Cut" + 'labels'

    return(data.apply(getQuarts, axis=1,
                      weatherType=weatherType,
                      weatherCutLabels=weatherCutLabels))

In [None]:
# Bin each ZIP's observations against that ZIP's historical quantiles.
# The original read the *monthly* bin columns here (under names that do not
# exist), so the "zip" bins were never ZIP-specific.
zipChunks = []

# sort=False visits ZIPs in order of first appearance.
for zipcode, zipData in weatherByMonth.groupby('ZIP', sort=False):
    print(zipcode)
    zipData = zipData.copy()

    # Every row of a ZIP carries the same bin list, so the first row is enough.
    zipData['temp_zipQuants'] = pd.cut(zipData.temperature,
           bins = zipData['quartileListTemperature_zip'].iloc[0],
           labels = quant_labels)

    # Precipitation skips the lowest edge and its label, as in the annual bins.
    zipData['precip_zipQuants'] = pd.cut(zipData.precipitation,
           bins = zipData['quartileListPrecipitation_zip'].iloc[0][1:],
           labels = quant_labels[1:])

    zipChunks.append(zipData)

# One concat instead of repeated DataFrame.append (which no longer exists);
# ignore_index avoids accumulating leftover index columns.
weatherByZip = pd.concat(zipChunks, ignore_index=True) if zipChunks else pd.DataFrame()

This one takes a while, so I didn't wait for it to finish.

In [None]:
# Bin each (ZIP, month) group against that group's own historical quantiles.
# A single groupby replaces the nested loops that re-masked the full frame for
# every ZIP x month pair, and only visits combinations that actually occur.
zipMonthChunks = []

# sort=False visits groups in order of first appearance, so the row order can
# differ from the original nested-loop order.
for (zipcode, month), groupData in weatherByZip.groupby(['ZIP', 'month'], sort=False):
    groupData = groupData.copy()

    # Every row of a group carries the same bin list, so the first row is enough.
    groupData['temp_zipMonthQuants'] = pd.cut(groupData.temperature,
           bins = groupData['quartileListTemperature_monthlyByZip'].iloc[0],
           labels = quant_labels)

    # Precipitation skips the lowest edge and its label, as in the annual bins.
    groupData['precip_zipMonthQuants'] = pd.cut(groupData.precipitation,
           bins = groupData['quartileListPrecipitation_monthlyByZip'].iloc[0][1:],
           labels = quant_labels[1:])

    zipMonthChunks.append(groupData)

# One concat instead of repeated DataFrame.append (which no longer exists).
weatherByZipByMonth = pd.concat(zipMonthChunks, ignore_index=True) if zipMonthChunks else pd.DataFrame()

In [None]:
# Spot-check one ZIP (matched against the float 1238.0) in the second quarter of 2009.
weatherByZipByMonth[(weatherByZipByMonth.ZIP == 1238.0) & (weatherByZipByMonth.yearQuarter == '2009q2')]

In [None]:
# Preview the per-ZIP-per-month binned data.
weatherByZipByMonth.head()

In [None]:
# Keep the identifying columns plus the annual, monthly and per-ZIP bin labels.
# This reads from weatherByZip, so the zip-by-month bins are not included.
allQuartsColumns = [
    'ZIP', 'date', 'quarter', 'yearQuarter',
    'temp_annualQuants', 'precip_annualQuants',
    'temp_monthlyQuants', 'precip_monthlyQuants',
    'temp_zipQuants', 'precip_zipQuants',
]
allQuarts = weatherByZip.loc[:, allQuartsColumns]

allQuarts.head()

In [None]:
# Spot-check: rows tagged as the second quarter of 2009.
allQuarts[allQuarts.yearQuarter == '2009q2']

------------------

## Now go through the data from 2009 - 2018 and find time spent in each bin

In [None]:
def getCounts(quartType,df):
    """Count observations per bin for each (ZIP, year-quarter), one column per bin.

    Parameters
    ----------
    quartType : str
        Name of the bin-label column in ``df``, e.g. ``'temp_annualQuants'``.
    df : pandas.DataFrame
        Must contain ``ZIP``, ``yearQuarter`` and the ``quartType`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``zip``, ``yearQuarter`` and one count column per bin label.
        Each count column is named ``quartType`` minus its last six characters
        (the ``'Quants'`` suffix in this notebook) followed by the label.
        Combinations absent from the grouped counts are NaN.
    """
    counts = df.groupby(['ZIP','yearQuarter',quartType]).size().reset_index()
    # groupby keys are already unique, so no de-duplication is needed.
    counts.columns = ['zip', 'yearQuarter', quartType, 'occurrences']

    countsWide = counts.pivot(index=['zip','yearQuarter'],
            columns = quartType, values= 'occurrences').reset_index().\
            rename_axis(None, axis=1)

    # Assign a fresh list of names rather than writing into Index.values,
    # which mutates the Index's backing array behind pandas' back.
    prefix = quartType[:-6]
    countsWide.columns = list(countsWide.columns[:2]) + \
        [prefix + str(label) for label in countsWide.columns[2:]]

    return(countsWide)

In [None]:
# Columns from position 4 onward of allQuarts are the bin-label columns. Build
# the count table for the first and merge in the rest one at a time.
quartColumns = list(allQuarts.columns[4:])

countData = getCounts(quartColumns[0],allQuarts)

for quart in quartColumns[1:]:
    print(quart)
    nextCounts = getCounts(quart,allQuarts)
    countData = countData.merge(nextCounts)

In [None]:
# List the merged count columns.
print(countData.columns)



Reformat the column names so they're consistent.

In [None]:
# yearQuarter is '<yyyy>q<n>': characters 0-3 are the year, character 5 is the quarter digit.
countData['year']    = countData['yearQuarter'].str[:4]
countData['qtr']     = countData['yearQuarter'].str[5:6].astype(float)
countData['zipcode'] = countData['zip'].astype('int64')

In [None]:
# Smallest integer ZIP code present in the count table.
countData.zipcode.min()

In [None]:
# Persist the per-ZIP, per-quarter bin counts; to_csv's default also writes the index column.
countData.to_csv("../../data/companyData/allWeatherBins_2009to2018.csv")