In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta
import seaborn as sns

import statsmodels.api as sm
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose

%matplotlib inline

In [2]:
# Read the three cleaned CSV inputs. Each file's 'Date' column is parsed as
# datetimes and used as the frame's index.
train = pd.read_csv(
    '../data/train_clean.csv',
    parse_dates=['Date'],
    index_col='Date',
)
spray = pd.read_csv(
    '../data/updated_spray.csv',
    parse_dates=['Date'],
    index_col='Date',
)
weather = pd.read_csv(
    '../data/clean_weather.csv',
    parse_dates=['Date'],
    index_col='Date',
)

In [3]:
# Discard the 'Unnamed: 0' column that was read in from the weather CSV.
weather = weather.drop(columns=['Unnamed: 0'])

In [4]:
# Show the (rows, columns) shape of the training frame.
train.shape

(10506, 12)

In [5]:
# Show the (rows, columns) shape of the weather frame before the stations are split.
weather.shape

(2944, 19)

In [6]:
# Separate the rows of station 1 and station 2, dropping the 'Station'
# column, which is constant within each subset.
weather_stn1 = weather.loc[weather['Station'] == 1].drop(columns='Station')
weather_stn2 = weather.loc[weather['Station'] == 2].drop(columns='Station')

# Join the two stations side by side on Date (the index set at load time).
# This is an inner join, so only dates present for both stations remain.
# Station-1 columns get the '_x' suffix and station-2 columns get '_y'.
weather = pd.merge(weather_stn1, weather_stn2, on='Date', suffixes=('_x', '_y'))

In [7]:
# Show the shape after the per-station merge (one row per date, both stations' columns).
weather.shape

(1472, 36)

In [8]:
# List the merged column names ('_x' = station 1, '_y' = station 2).
weather.columns

Index(['Tmax_x', 'Tmin_x', 'Tavg_x', 'Depart_x', 'DewPoint_x', 'WetBulb_x',
       'Heat_x', 'Cool_x', 'Sunrise_x', 'Sunset_x', 'CodeSum_x', 'Depth_x',
       'PrecipTotal_x', 'StnPressure_x', 'SeaLevel_x', 'ResultSpeed_x',
       'ResultDir_x', 'AvgSpeed_x', 'Tmax_y', 'Tmin_y', 'Tavg_y', 'Depart_y',
       'DewPoint_y', 'WetBulb_y', 'Heat_y', 'Cool_y', 'Sunrise_y', 'Sunset_y',
       'CodeSum_y', 'Depth_y', 'PrecipTotal_y', 'StnPressure_y', 'SeaLevel_y',
       'ResultSpeed_y', 'ResultDir_y', 'AvgSpeed_y'],
      dtype='object')

In [9]:
# Day length = station-1 sunset minus station-1 sunrise.
# NOTE(review): this is a plain subtraction of the stored values. If
# Sunrise/Sunset are HHMM-encoded clock times rather than minutes or
# timestamps, the result is not a true duration -- confirm against
# clean_weather.csv.
weather['Day_length'] = weather['Sunset_x'].sub(weather['Sunrise_x'])

In [10]:
# Confirm that 'Day_length' was appended to the weather columns.
weather.columns

Index(['Tmax_x', 'Tmin_x', 'Tavg_x', 'Depart_x', 'DewPoint_x', 'WetBulb_x',
       'Heat_x', 'Cool_x', 'Sunrise_x', 'Sunset_x', 'CodeSum_x', 'Depth_x',
       'PrecipTotal_x', 'StnPressure_x', 'SeaLevel_x', 'ResultSpeed_x',
       'ResultDir_x', 'AvgSpeed_x', 'Tmax_y', 'Tmin_y', 'Tavg_y', 'Depart_y',
       'DewPoint_y', 'WetBulb_y', 'Heat_y', 'Cool_y', 'Sunrise_y', 'Sunset_y',
       'CodeSum_y', 'Depth_y', 'PrecipTotal_y', 'StnPressure_y', 'SeaLevel_y',
       'ResultSpeed_y', 'ResultDir_y', 'AvgSpeed_y', 'Day_length'],
      dtype='object')

In [11]:
# Average the two stations' daily maximum temperature.
# Vectorised column arithmetic replaces the row-wise apply/np.mean: it gives
# the same values (NaN in either station still yields NaN) without calling a
# Python lambda once per row.
weather['Tmax'] = (weather['Tmax_x'] + weather['Tmax_y']) / 2

In [12]:
# Average the two stations' daily minimum temperature.
# Vectorised column arithmetic replaces the row-wise apply/np.mean: it gives
# the same values (NaN in either station still yields NaN) without calling a
# Python lambda once per row.
weather['Tmin'] = (weather['Tmin_x'] + weather['Tmin_y']) / 2

In [13]:
# Average the two stations' daily average temperature.
# Vectorised column arithmetic replaces the row-wise apply/np.mean: it gives
# the same values (NaN in either station still yields NaN) without calling a
# Python lambda once per row.
weather['Tavg'] = (weather['Tavg_x'] + weather['Tavg_y']) / 2

In [14]:
# Lagged day-length feature: 7-row rolling mean, pushed back by 35 rows.
# NOTE(review): window and lag count rows, not calendar days -- confirm the
# Date index has no gaps, otherwise the lag spans them.
weather['Day_length_shift'] = (
    weather['Day_length']
    .rolling(window=7)
    .mean()
    .shift(periods=35)
)

In [15]:
# Lagged temperature feature: 3-row rolling mean of Tavg, pushed back by 14 rows.
# NOTE(review): window and lag count rows, not calendar days -- confirm the
# Date index has no gaps, otherwise the lag spans them.
weather['Tavg_shift'] = (
    weather['Tavg']
    .rolling(window=3)
    .mean()
    .shift(periods=14)
)

In [16]:
# Average the two stations' ResultSpeed readings.
# Vectorised column arithmetic replaces the row-wise apply/np.mean: it gives
# the same values (NaN in either station still yields NaN) without calling a
# Python lambda once per row.
weather['ResultSpeed'] = (weather['ResultSpeed_x'] + weather['ResultSpeed_y']) / 2

In [17]:
# Lagged ResultSpeed feature: 3-row rolling mean, pushed back by 21 rows.
# NOTE(review): window and lag count rows, not calendar days -- confirm the
# Date index has no gaps, otherwise the lag spans them.
weather['ResultSpeed_shift'] = (
    weather['ResultSpeed']
    .rolling(window=3)
    .mean()
    .shift(periods=21)
)

In [18]:
# Average the two stations' ResultDir readings.
# Vectorised column arithmetic replaces the row-wise apply/np.mean: it gives
# the same values (NaN in either station still yields NaN) without calling a
# Python lambda once per row.
# NOTE(review): if ResultDir is a compass bearing, a plain arithmetic mean is
# misleading across the wrap-around point (values just either side of north
# average to roughly south) -- confirm whether a circular mean is needed.
weather['ResultDir'] = (weather['ResultDir_x'] + weather['ResultDir_y']) / 2

In [19]:
# Average the two stations' AvgSpeed readings.
# Vectorised column arithmetic replaces the row-wise apply/np.mean: it gives
# the same values (NaN in either station still yields NaN) without calling a
# Python lambda once per row.
weather['AvgSpeed'] = (weather['AvgSpeed_x'] + weather['AvgSpeed_y']) / 2

In [20]:
# Lagged ResultDir feature: 3-row rolling mean, pushed back by 21 rows.
# NOTE(review): window and lag count rows, not calendar days -- confirm the
# Date index has no gaps, otherwise the lag spans them.
weather['ResultDir_shift'] = (
    weather['ResultDir']
    .rolling(window=3)
    .mean()
    .shift(periods=21)
)

In [21]:
# Lagged AvgSpeed feature: 3-row rolling mean, pushed back by 21 rows.
# NOTE(review): window and lag count rows, not calendar days -- confirm the
# Date index has no gaps, otherwise the lag spans them.
weather['AvgSpeed_shift'] = (
    weather['AvgSpeed']
    .rolling(window=3)
    .mean()
    .shift(periods=21)
)

In [22]:
# Expose station 1's sunset value under the unsuffixed name 'Sunset'.
weather = weather.assign(Sunset=weather['Sunset_x'])

In [23]:
# Expose station 1's sunrise value under the unsuffixed name 'Sunrise'.
weather = weather.assign(Sunrise=weather['Sunrise_x'])

In [24]:
# Attach each date's weather features to every training row of that date
# (inner join on the shared Date index).
# validate='many_to_one' makes pandas raise a MergeError if any date occurs
# more than once in `weather`, which would otherwise silently duplicate
# training rows.
df = pd.merge(train, weather, on='Date', validate='many_to_one')

In [25]:
# Show the shape of the merged frame; the row count can be compared with train.shape.
df.shape

(10506, 62)

In [26]:
# Average the two stations' Heat readings.
# Vectorised column arithmetic replaces the row-wise apply/np.mean: it gives
# the same values (NaN in either station still yields NaN) without calling a
# Python lambda once per row.
df['Heat'] = (df['Heat_x'] + df['Heat_y']) / 2

In [27]:
# Drop columns that are not carried forward: 'Unnamed: 0', 'Block' and
# 'AddressAccuracy', both SeaLevel readings, station 2's sunrise/sunset, and
# the per-station inputs of the averaged columns (Tavg, ResultSpeed,
# ResultDir, AvgSpeed, Heat, Tmax, Tmin).
# The labels are passed as `columns=` because pandas 2.x no longer accepts
# `axis` as a positional argument to DataFrame.drop.
df = df.drop(columns=[
    'Unnamed: 0',
    'Block',
    'AddressAccuracy',
    'SeaLevel_x',
    'SeaLevel_y',
    'Tavg_x',
    'Tavg_y',
    'ResultSpeed_x',
    'ResultSpeed_y',
    'ResultDir_x',
    'ResultDir_y',
    'AvgSpeed_x',
    'AvgSpeed_y',
    'Heat_x',
    'Heat_y',
    'Tmax_x',
    'Tmax_y',
    'Tmin_x',
    'Tmin_y',
    'Sunset_y',
    'Sunrise_y',
])

In [28]:
# List the columns that remain after the first drop.
df.columns

Index(['Address', 'Species', 'Street', 'Trap', 'AddressNumberAndStreet',
       'Latitude', 'Longitude', 'NumMosquitos', 'WnvPresent', 'Depart_x',
       'DewPoint_x', 'WetBulb_x', 'Cool_x', 'Sunrise_x', 'Sunset_x',
       'CodeSum_x', 'Depth_x', 'PrecipTotal_x', 'StnPressure_x', 'Depart_y',
       'DewPoint_y', 'WetBulb_y', 'Cool_y', 'CodeSum_y', 'Depth_y',
       'PrecipTotal_y', 'StnPressure_y', 'Day_length', 'Tmax', 'Tmin', 'Tavg',
       'Day_length_shift', 'Tavg_shift', 'ResultSpeed', 'ResultSpeed_shift',
       'ResultDir', 'AvgSpeed', 'ResultDir_shift', 'AvgSpeed_shift', 'Sunset',
       'Sunrise', 'Heat'],
      dtype='object')

In [29]:
# Take 'Depart' from the '_x' (station 1) column only; station 2's value is not used.
df = df.assign(Depart=df['Depart_x'])

In [30]:
# Average the two stations' DewPoint readings.
# Vectorised column arithmetic replaces the row-wise apply/np.mean: it gives
# the same values (NaN in either station still yields NaN) without calling a
# Python lambda once per row.
df['DewPoint'] = (df['DewPoint_x'] + df['DewPoint_y']) / 2

In [31]:
# Average the two stations' WetBulb readings.
# Vectorised column arithmetic replaces the row-wise apply/np.mean: it gives
# the same values (NaN in either station still yields NaN) without calling a
# Python lambda once per row.
df['WetBulb'] = (df['WetBulb_x'] + df['WetBulb_y']) / 2

In [32]:
# Average the two stations' Cool readings.
# Vectorised column arithmetic replaces the row-wise apply/np.mean: it gives
# the same values (NaN in either station still yields NaN) without calling a
# Python lambda once per row.
df['Cool'] = (df['Cool_x'] + df['Cool_y']) / 2

In [33]:
# Take 'CodeSum' from the '_x' (station 1) column only; station 2's value is not used.
df = df.assign(CodeSum=df['CodeSum_x'])

In [34]:
# Average the two stations' PrecipTotal readings.
# Vectorised column arithmetic replaces the row-wise apply/np.mean: it gives
# the same values (NaN in either station still yields NaN) without calling a
# Python lambda once per row.
df['PrecipTotal'] = (df['PrecipTotal_x'] + df['PrecipTotal_y']) / 2

In [35]:
# Average the two stations' StnPressure readings.
# Vectorised column arithmetic replaces the row-wise apply/np.mean: it gives
# the same values (NaN in either station still yields NaN) without calling a
# Python lambda once per row.
df['StnPressure'] = (df['StnPressure_x'] + df['StnPressure_y']) / 2

In [36]:
# NOTE(review): this repeats the identical 'Depart' assignment made a few
# cells earlier; it is harmless but redundant.
df['Depart'] = df['Depart_x']

In [37]:
# Drop the remaining per-station ('_x' / '_y') columns now that their
# combined or station-1 versions exist. Both Depth columns are dropped
# without a replacement.
# Each label is listed once (the original repeated 'Depart_y', 'DewPoint_x'
# and 'DewPoint_y'). The labels are passed as `columns=` because pandas 2.x
# no longer accepts `axis` as a positional argument to DataFrame.drop.
df = df.drop(columns=[
    'Depart_x',
    'Depart_y',
    'DewPoint_x',
    'DewPoint_y',
    'WetBulb_x',
    'WetBulb_y',
    'Cool_x',
    'Cool_y',
    'Sunrise_x',
    'Sunset_x',
    'CodeSum_x',
    'CodeSum_y',
    'Depth_x',
    'Depth_y',
    'PrecipTotal_x',
    'PrecipTotal_y',
    'StnPressure_x',
    'StnPressure_y',
])

In [38]:
# Write the combined train + weather frame to disk; the index is written as well (to_csv default).
df.to_csv('../data/dataframe.csv')

In [39]:
# List the final set of columns in the combined frame.
df.columns

Index(['Address', 'Species', 'Street', 'Trap', 'AddressNumberAndStreet',
       'Latitude', 'Longitude', 'NumMosquitos', 'WnvPresent', 'Day_length',
       'Tmax', 'Tmin', 'Tavg', 'Day_length_shift', 'Tavg_shift', 'ResultSpeed',
       'ResultSpeed_shift', 'ResultDir', 'AvgSpeed', 'ResultDir_shift',
       'AvgSpeed_shift', 'Sunset', 'Sunrise', 'Heat', 'Depart', 'DewPoint',
       'WetBulb', 'Cool', 'CodeSum', 'PrecipTotal', 'StnPressure'],
      dtype='object')