In [101]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta
import seaborn as sns

import statsmodels.api as sm
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose

%matplotlib inline

In [102]:
# Load the three input tables. In each one the 'Date' column is parsed as a
# datetime and used as the index.
# NOTE(review): `spray` is loaded here but not referenced in the cells visible
# in this part of the notebook.
train, spray, weather = (
    pd.read_csv(csv_path, index_col='Date', parse_dates=['Date'])
    for csv_path in (
        '../data/train_clean.csv',
        '../data/updated_spray.csv',
        '../data/clean_weather.csv',
    )
)

In [103]:
# Remove the 'Unnamed: 0' column from the weather table
# (presumably a row counter written out with the CSV — confirm).
weather = weather.drop(columns=['Unnamed: 0'])

In [104]:
# Size of the training table as (rows, columns).
train.shape

(10506, 12)

In [105]:
# Size of the weather table as (rows, columns) before the stations are split.
weather.shape

(2944, 19)

In [106]:
# Split the weather table by station, drop the now-constant 'Station' column,
# and join the two halves side by side on Date. Station 1 columns receive the
# default '_x' suffix and station 2 columns the '_y' suffix.
# Note: this overwrites `weather`, so re-running the cell without reloading
# fails because 'Station' no longer exists.
weather_stn1 = weather.loc[weather['Station'] == 1].drop(columns='Station')
weather_stn2 = weather.loc[weather['Station'] == 2].drop(columns='Station')
weather = pd.merge(weather_stn1, weather_stn2, on='Date')

In [107]:
# Size of the station-merged weather table as (rows, columns).
weather.shape

(1472, 36)

In [108]:
# List the merged column names ('_x' = station 1, '_y' = station 2).
weather.columns

Index(['Tmax_x', 'Tmin_x', 'Tavg_x', 'Depart_x', 'DewPoint_x', 'WetBulb_x',
       'Heat_x', 'Cool_x', 'Sunrise_x', 'Sunset_x', 'CodeSum_x', 'Depth_x',
       'PrecipTotal_x', 'StnPressure_x', 'SeaLevel_x', 'ResultSpeed_x',
       'ResultDir_x', 'AvgSpeed_x', 'Tmax_y', 'Tmin_y', 'Tavg_y', 'Depart_y',
       'DewPoint_y', 'WetBulb_y', 'Heat_y', 'Cool_y', 'Sunrise_y', 'Sunset_y',
       'CodeSum_y', 'Depth_y', 'PrecipTotal_y', 'StnPressure_y', 'SeaLevel_y',
       'ResultSpeed_y', 'ResultDir_y', 'AvgSpeed_y'],
      dtype='object')

In [109]:
# Day length taken as station 1's sunset value minus its sunrise value.
# NOTE(review): if Sunrise/Sunset are stored as HHMM integers this difference
# is not a duration in minutes — confirm the encoding used in clean_weather.csv.
weather['Day_length'] = weather['Sunset_x'].sub(weather['Sunrise_x'])

In [110]:
# Confirm that 'Day_length' was appended to the weather table.
weather.columns

Index(['Tmax_x', 'Tmin_x', 'Tavg_x', 'Depart_x', 'DewPoint_x', 'WetBulb_x',
       'Heat_x', 'Cool_x', 'Sunrise_x', 'Sunset_x', 'CodeSum_x', 'Depth_x',
       'PrecipTotal_x', 'StnPressure_x', 'SeaLevel_x', 'ResultSpeed_x',
       'ResultDir_x', 'AvgSpeed_x', 'Tmax_y', 'Tmin_y', 'Tavg_y', 'Depart_y',
       'DewPoint_y', 'WetBulb_y', 'Heat_y', 'Cool_y', 'Sunrise_y', 'Sunset_y',
       'CodeSum_y', 'Depth_y', 'PrecipTotal_y', 'StnPressure_y', 'SeaLevel_y',
       'ResultSpeed_y', 'ResultDir_y', 'AvgSpeed_y', 'Day_length'],
      dtype='object')

In [111]:
# Maximum temperature per date, averaged across the two stations.
# The mean must be taken row-wise (axis=1): np.mean over a list of both columns
# with no axis collapses everything into one scalar, which would then be
# broadcast to every row. DataFrame.mean skips NaN, so a date with only one
# station reading keeps that reading.
weather['Tmax'] = weather[['Tmax_x', 'Tmax_y']].mean(axis=1)

In [112]:
# Minimum temperature per date, averaged across the two stations.
# The mean must be taken row-wise (axis=1): np.mean over a list of both columns
# with no axis collapses everything into one scalar, which would then be
# broadcast to every row. DataFrame.mean skips NaN, so a date with only one
# station reading keeps that reading.
weather['Tmin'] = weather[['Tmin_x', 'Tmin_y']].mean(axis=1)

In [113]:
# Average temperature per date, averaged across the two stations.
# The mean must be taken row-wise (axis=1): np.mean over a list of both columns
# with no axis collapses everything into one scalar, which would then be
# broadcast to every row (and would make the rolling 'Tavg_shift' feature a
# constant). DataFrame.mean skips NaN, so a date with only one station reading
# keeps that reading.
weather['Tavg'] = weather[['Tavg_x', 'Tavg_y']].mean(axis=1)

In [114]:
# Lagged day-length feature: 7-row rolling mean of 'Day_length', pushed back
# by 35 rows. The first rows are NaN until the window and the lag are filled.
# NOTE(review): window and lag count rows, not calendar days — if the Date
# index has gaps, the lag covers more than 35 days; confirm this is intended.
weather['Day_length_shift'] = (
    weather['Day_length']
    .rolling(window=7)
    .mean()
    .shift(periods=35)
)

In [115]:
# Lagged temperature feature: 3-row rolling mean of 'Tavg', pushed back by
# 14 rows. The first rows are NaN until the window and the lag are filled.
# NOTE(review): window and lag count rows, not calendar days — confirm the
# Date index has no gaps if a 14-day lag is intended.
weather['Tavg_shift'] = (
    weather['Tavg']
    .rolling(window=3)
    .mean()
    .shift(periods=14)
)

In [116]:
# Resultant wind speed per date, averaged across the two stations.
# The mean must be taken row-wise (axis=1): np.mean over a list of both columns
# with no axis collapses everything into one scalar, which would then be
# broadcast to every row. DataFrame.mean skips NaN, so a date with only one
# station reading keeps that reading.
weather['ResultSpeed'] = weather[['ResultSpeed_x', 'ResultSpeed_y']].mean(axis=1)

In [117]:
# Lagged wind feature: 3-row rolling mean of 'ResultSpeed', pushed back by
# 21 rows. The first rows are NaN until the window and the lag are filled.
# NOTE(review): window and lag count rows, not calendar days — confirm.
weather['ResultSpeed_shift'] = (
    weather['ResultSpeed']
    .rolling(window=3)
    .mean()
    .shift(periods=21)
)

In [118]:
# Resultant wind direction per date, averaged across the two stations.
# The mean must be taken row-wise (axis=1): np.mean over a list of both columns
# with no axis collapses everything into one scalar, which would then be
# broadcast to every row. DataFrame.mean skips NaN, so a date with only one
# station reading keeps that reading.
# NOTE(review): if ResultDir is an angle, an arithmetic mean is misleading
# across the wrap-around (e.g. just either side of north) — confirm the units
# and consider a vector/circular mean instead.
weather['ResultDir'] = weather[['ResultDir_x', 'ResultDir_y']].mean(axis=1)

In [119]:
# Average wind speed per date, averaged across the two stations.
# The mean must be taken row-wise (axis=1): np.mean over a list of both columns
# with no axis collapses everything into one scalar, which would then be
# broadcast to every row. DataFrame.mean skips NaN, so a date with only one
# station reading keeps that reading.
weather['AvgSpeed'] = weather[['AvgSpeed_x', 'AvgSpeed_y']].mean(axis=1)

In [120]:
# Lagged wind-direction feature: 3-row rolling mean of 'ResultDir', pushed
# back by 21 rows. The first rows are NaN until the window and lag are filled.
# NOTE(review): window and lag count rows, not calendar days — confirm.
weather['ResultDir_shift'] = (
    weather['ResultDir']
    .rolling(window=3)
    .mean()
    .shift(periods=21)
)

In [121]:
# Lagged wind-speed feature: 3-row rolling mean of 'AvgSpeed', pushed back
# by 21 rows. The first rows are NaN until the window and the lag are filled.
# NOTE(review): window and lag count rows, not calendar days — confirm.
weather['AvgSpeed_shift'] = (
    weather['AvgSpeed']
    .rolling(window=3)
    .mean()
    .shift(periods=21)
)

In [122]:
# Keep station 1's sunset value under the suffix-free name 'Sunset'.
weather = weather.assign(Sunset=weather['Sunset_x'])

In [123]:
# Keep station 1's sunrise value under the suffix-free name 'Sunrise'.
weather = weather.assign(Sunrise=weather['Sunrise_x'])

In [124]:
# Attach the per-date weather features to every trap observation.
# Inner join on the shared 'Date' index level (the pandas default `how`).
df = train.merge(right=weather, how='inner', on='Date')

In [125]:
# Size of the merged table as (rows, columns); compare the row count with train.shape.
df.shape

(10506, 62)

In [126]:
# 'Heat' per observation, averaged across the two stations.
# Two things matter here:
#  * the source columns are read from `df` itself, so the result lines up
#    row-for-row with the merged table (not with the shorter `weather` frame);
#  * the mean is row-wise (axis=1): np.mean over a list of both columns with no
#    axis yields a single scalar that would be broadcast to every row.
# DataFrame.mean skips NaN, so a row with only one station reading keeps it.
df['Heat'] = df[['Heat_x', 'Heat_y']].mean(axis=1)

In [127]:
# Drop identifier/location columns that are not used as features, together
# with the per-station source columns whose combined versions already exist
# (Tmax, Tmin, Tavg, ResultSpeed, ResultDir, AvgSpeed, Heat, Sunset, Sunrise).
# `columns=` is used instead of a positional axis argument: recent pandas
# releases accept only `labels` positionally in DataFrame.drop.
df = df.drop(columns=[
    'Unnamed: 0',
    'Block',
    'Latitude',
    'Longitude',
    'AddressAccuracy',
    'SeaLevel_x',
    'SeaLevel_y',
    'Tavg_x',
    'Tavg_y',
    'ResultSpeed_x',
    'ResultSpeed_y',
    'ResultDir_x',
    'ResultDir_y',
    'AvgSpeed_x',
    'AvgSpeed_y',
    'Heat_x',
    'Heat_y',
    'Tmax_x',
    'Tmax_y',
    'Tmin_x',
    'Tmin_y',
    'Sunset_y',
    'Sunrise_y',
])

In [128]:
# List the columns that remain after the drop.
df.columns

Index(['Address', 'Species', 'Street', 'Trap', 'AddressNumberAndStreet',
       'NumMosquitos', 'WnvPresent', 'Depart_x', 'DewPoint_x', 'WetBulb_x',
       'Cool_x', 'Sunrise_x', 'Sunset_x', 'CodeSum_x', 'Depth_x',
       'PrecipTotal_x', 'StnPressure_x', 'Depart_y', 'DewPoint_y', 'WetBulb_y',
       'Cool_y', 'CodeSum_y', 'Depth_y', 'PrecipTotal_y', 'StnPressure_y',
       'Day_length', 'Tmax', 'Tmin', 'Tavg', 'Day_length_shift', 'Tavg_shift',
       'ResultSpeed', 'ResultSpeed_shift', 'ResultDir', 'AvgSpeed',
       'ResultDir_shift', 'AvgSpeed_shift', 'Sunset', 'Sunrise', 'Heat'],
      dtype='object')

In [129]:
# Use station 1's 'Depart' value as the single 'Depart' feature
# (station 2's value is not used here).
df = df.assign(Depart=df['Depart_x'])

In [130]:
# Dew point per observation, averaged across the two stations.
# The mean must be taken row-wise (axis=1): np.mean over a list of both columns
# with no axis collapses everything into one scalar, which would then be
# broadcast to every row. DataFrame.mean skips NaN, so a row with only one
# station reading keeps that reading.
df['DewPoint'] = df[['DewPoint_x', 'DewPoint_y']].mean(axis=1)

In [131]:
# Wet-bulb value per observation, averaged across the two stations.
# The mean must be taken row-wise (axis=1): np.mean over a list of both columns
# with no axis collapses everything into one scalar, which would then be
# broadcast to every row. DataFrame.mean skips NaN, so a row with only one
# station reading keeps that reading.
df['WetBulb'] = df[['WetBulb_x', 'WetBulb_y']].mean(axis=1)

In [132]:
# 'Cool' per observation, averaged across the two stations.
# The mean must be taken row-wise (axis=1): np.mean over a list of both columns
# with no axis collapses everything into one scalar, which would then be
# broadcast to every row. DataFrame.mean skips NaN, so a row with only one
# station reading keeps that reading.
df['Cool'] = df[['Cool_x', 'Cool_y']].mean(axis=1)

In [133]:
# Use station 1's 'CodeSum' value as the single 'CodeSum' feature
# (station 2's value is not used here).
df = df.assign(CodeSum=df['CodeSum_x'])

In [134]:
# Precipitation total per observation, averaged across the two stations.
# The mean must be taken row-wise (axis=1): np.mean over a list of both columns
# with no axis collapses everything into one scalar, which would then be
# broadcast to every row. DataFrame.mean skips NaN, so a row with only one
# station reading keeps that reading.
df['PrecipTotal'] = df[['PrecipTotal_x', 'PrecipTotal_y']].mean(axis=1)

In [135]:
# Station pressure per observation, averaged across the two stations.
# The mean must be taken row-wise (axis=1): np.mean over a list of both columns
# with no axis collapses everything into one scalar, which would then be
# broadcast to every row. DataFrame.mean skips NaN, so a row with only one
# station reading keeps that reading.
df['StnPressure'] = df[['StnPressure_x', 'StnPressure_y']].mean(axis=1)

In [136]:
# NOTE(review): this repeats the identical assignment made in cell In [129];
# it rewrites 'Depart' with the same values and can be removed.
df['Depart'] = df['Depart_x']

In [137]:
# Drop the remaining per-station ('_x' / '_y') columns now that their combined
# or station-1 versions (Depart, DewPoint, WetBulb, Cool, CodeSum, PrecipTotal,
# StnPressure, Sunrise, Sunset) exist. 'Depth_x' / 'Depth_y' are dropped
# without a combined replacement.
# Every label is followed by a comma: a missing comma makes Python join two
# adjacent string literals into one label (e.g. 'Sunset_xCodeSum_x'), which
# raises KeyError. Each label appears exactly once.
# `columns=` is used instead of a positional axis argument: recent pandas
# releases accept only `labels` positionally in DataFrame.drop.
df = df.drop(columns=[
    'Depart_x',
    'Depart_y',
    'DewPoint_x',
    'DewPoint_y',
    'WetBulb_x',
    'WetBulb_y',
    'Cool_x',
    'Cool_y',
    'Sunrise_x',
    'Sunset_x',
    'CodeSum_x',
    'CodeSum_y',
    'Depth_x',
    'Depth_y',
    'PrecipTotal_x',
    'PrecipTotal_y',
    'StnPressure_x',
    'StnPressure_y',
])

In [None]:
# Persist the assembled feature table; the index is written as the first
# CSV column (pandas default, spelled out here).
df.to_csv(path_or_buf='../data/dataframe.csv', index=True)