In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta
import seaborn as sns

import statsmodels.api as sm
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose

%matplotlib inline

In [2]:
# Load the three input tables with identical options: 'Date' is parsed as a
# datetime and used as the index of each frame.
train, spray, weather = (
    pd.read_csv(csv_path, index_col='Date', parse_dates=['Date'])
    for csv_path in (
        '../data/train_clean.csv',
        '../data/updated_spray.csv',
        '../data/clean_weather.csv',
    )
)

In [3]:
# Discard the 'Unnamed: 0' column from both frames
# (presumably an index column written by an earlier to_csv -- confirm).
weather = weather.drop(columns='Unnamed: 0')
train = train.drop(columns='Unnamed: 0')

In [4]:
# (rows, columns) of the training frame after 'Unnamed: 0' was dropped.
train.shape

(10506, 11)

In [5]:
# (rows, columns) of the weather frame; it still holds the rows of every
# station at this point (the per-station split happens in the next cell).
weather.shape

(2944, 19)

In [6]:
# Split the weather table by reporting station and discard the 'Station'
# column from each half, since it is constant within a half.
weather_stn1 = weather.loc[weather['Station'] == 1].drop(columns='Station')
weather_stn2 = weather.loc[weather['Station'] == 2].drop(columns='Station')

# Join the halves side by side on Date. With pandas' default suffixes the
# station 1 columns end in '_x' and the station 2 columns end in '_y'.
weather = weather_stn1.merge(weather_stn2, on='Date')

In [7]:
# Shape after joining the two station frames side by side on Date.
weather.shape

(1472, 36)

In [8]:
# Column names after the merge: '_x' marks the left frame (station 1) and
# '_y' the right frame (station 2).
weather.columns

Index(['Tmax_x', 'Tmin_x', 'Tavg_x', 'Depart_x', 'DewPoint_x', 'WetBulb_x',
       'Heat_x', 'Cool_x', 'Sunrise_x', 'Sunset_x', 'CodeSum_x', 'Depth_x',
       'PrecipTotal_x', 'StnPressure_x', 'SeaLevel_x', 'ResultSpeed_x',
       'ResultDir_x', 'AvgSpeed_x', 'Tmax_y', 'Tmin_y', 'Tavg_y', 'Depart_y',
       'DewPoint_y', 'WetBulb_y', 'Heat_y', 'Cool_y', 'Sunrise_y', 'Sunset_y',
       'CodeSum_y', 'Depth_y', 'PrecipTotal_y', 'StnPressure_y', 'SeaLevel_y',
       'ResultSpeed_y', 'ResultDir_y', 'AvgSpeed_y'],
      dtype='object')

In [9]:
# Show the first station-1 sunset value as a string to inspect its format.
str(weather['Sunset_x'].iloc[0])

'1849'

In [10]:
from datetime import datetime

In [11]:
def _hhmm_to_seconds(value):
    """Convert an integer-like HHMM clock reading to seconds after midnight.

    A minutes field of exactly 60 (e.g. 1860) is rolled over to the next
    hour (1900). Any other out-of-range reading raises ValueError.
    """
    # Split arithmetically instead of parsing text: an unpadded string such
    # as '123' is ambiguous for strptime('%H%M') (read as 12:03, not 01:23).
    hours, minutes = divmod(int(value), 100)
    if minutes == 60:
        hours, minutes = hours + 1, 0
    if not (0 <= hours <= 23 and 0 <= minutes <= 59):
        raise ValueError('invalid HHMM time: {!r}'.format(value))
    return hours * 3600 + minutes * 60


def day_length(row):
    """Return the number of seconds between sunrise and sunset for one row.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row or dict)
        Must provide 'Sunrise_x' and 'Sunset_x' as integer-like HHMM values
        (for example 416 for 04:16 and 1849 for 18:49).

    Returns
    -------
    int
        Seconds from sunrise to sunset, in the range [0, 86400).

    Raises
    ------
    ValueError
        If either value is not a valid HHMM clock reading.
    """
    sunset = _hhmm_to_seconds(row['Sunset_x'])
    sunrise = _hhmm_to_seconds(row['Sunrise_x'])
    # Wrap modulo one day so the result is never negative, matching what
    # timedelta.seconds reports for a negative difference.
    return (sunset - sunrise) % 86400

In [12]:
# Apply day_length to every row and attach the result as 'Day_length'.
weather = weather.assign(Day_length=weather.apply(day_length, axis=1))

In [13]:
# Per-date mean of the two stations' maximum temperature.
weather['Tmax'] = (weather['Tmax_x'] + weather['Tmax_y']) / 2

In [14]:
# Per-date mean of the two stations' minimum temperature.
weather['Tmin'] = (weather['Tmin_x'] + weather['Tmin_y']) / 2

In [15]:
# Per-date mean of the two stations' average temperature.
weather['Tavg'] = (weather['Tavg_x'] + weather['Tavg_y']) / 2

In [16]:
# Per-date mean of the two stations' ResultSpeed readings.
weather['ResultSpeed'] = (weather['ResultSpeed_x'] + weather['ResultSpeed_y']) / 2

In [17]:
# Per-date arithmetic mean of the two stations' ResultDir readings.
# NOTE(review): if ResultDir is a compass direction, a plain mean ignores
# wrap-around (e.g. readings either side of north) -- confirm this is intended.
weather['ResultDir'] = (weather['ResultDir_x'] + weather['ResultDir_y']) / 2

In [18]:
# Per-date mean of the two stations' AvgSpeed readings.
weather['AvgSpeed'] = (weather['AvgSpeed_x'] + weather['AvgSpeed_y']) / 2

In [19]:
# Sunset is taken from the '_x' (station 1) column only.
weather = weather.assign(Sunset=weather['Sunset_x'])

In [20]:
# Sunrise is taken from the '_x' (station 1) column only.
weather = weather.assign(Sunrise=weather['Sunrise_x'])

In [21]:
# Per-date mean of the two stations' Heat values.
weather['Heat'] = (weather['Heat_x'] + weather['Heat_y']) / 2

In [22]:
# Depart is taken from the '_x' (station 1) column only.
weather = weather.assign(Depart=weather['Depart_x'])

In [23]:
# Per-date mean of the two stations' dew point. Computed column-wise so each
# date gets the average of its own two readings, not one frame-wide mean.
weather['DewPoint'] = (weather['DewPoint_x'] + weather['DewPoint_y']) / 2

In [24]:
# Per-date mean of the two stations' wet-bulb readings.
weather['WetBulb'] = (weather['WetBulb_x'] + weather['WetBulb_y']) / 2

In [25]:
# Per-date mean of the two stations' Cool values.
weather['Cool'] = (weather['Cool_x'] + weather['Cool_y']) / 2

In [26]:
# CodeSum is taken from the '_x' (station 1) column only.
weather = weather.assign(CodeSum=weather['CodeSum_x'])

In [27]:
# Per-date mean of the two stations' precipitation totals.
weather['PrecipTotal'] = (weather['PrecipTotal_x'] + weather['PrecipTotal_y']) / 2

In [28]:
# Per-date mean of the two stations' station-pressure readings.
weather['StnPressure'] = (weather['StnPressure_x'] + weather['StnPressure_y']) / 2

In [29]:
# Column names after adding the combined columns; the per-station
# '_x' / '_y' columns are still present at this point.
weather.columns

Index(['Tmax_x', 'Tmin_x', 'Tavg_x', 'Depart_x', 'DewPoint_x', 'WetBulb_x',
       'Heat_x', 'Cool_x', 'Sunrise_x', 'Sunset_x', 'CodeSum_x', 'Depth_x',
       'PrecipTotal_x', 'StnPressure_x', 'SeaLevel_x', 'ResultSpeed_x',
       'ResultDir_x', 'AvgSpeed_x', 'Tmax_y', 'Tmin_y', 'Tavg_y', 'Depart_y',
       'DewPoint_y', 'WetBulb_y', 'Heat_y', 'Cool_y', 'Sunrise_y', 'Sunset_y',
       'CodeSum_y', 'Depth_y', 'PrecipTotal_y', 'StnPressure_y', 'SeaLevel_y',
       'ResultSpeed_y', 'ResultDir_y', 'AvgSpeed_y', 'Day_length', 'Tmax',
       'Tmin', 'Tavg', 'ResultSpeed', 'ResultDir', 'AvgSpeed', 'Sunset',
       'Sunrise', 'Heat', 'Depart', 'DewPoint', 'WetBulb', 'Cool', 'CodeSum',
       'PrecipTotal', 'StnPressure'],
      dtype='object')

In [30]:
# Remove every per-station ('_x' / '_y') column now that the combined
# columns exist. Each field is named once and expanded to both suffixes, so
# no label is listed twice. `columns=` is passed by keyword because
# DataFrame.drop rejects a positional axis argument in pandas 2.x.
weather = weather.drop(columns=[
    field + suffix
    for field in (
        'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb',
        'Heat', 'Cool', 'Sunrise', 'Sunset', 'CodeSum', 'Depth',
        'PrecipTotal', 'StnPressure', 'SeaLevel', 'ResultSpeed',
        'ResultDir', 'AvgSpeed',
    )
    for suffix in ('_x', '_y')
])

In [31]:
# Check which columns remain after dropping the per-station columns.
weather.columns

Index(['Day_length', 'Tmax', 'Tmin', 'Tavg', 'ResultSpeed', 'ResultDir',
       'AvgSpeed', 'Sunset', 'Sunrise', 'Heat', 'Depart', 'DewPoint',
       'WetBulb', 'Cool', 'CodeSum', 'PrecipTotal', 'StnPressure'],
      dtype='object')

In [32]:
# Column names of the training frame, shown before it is joined with weather.
train.columns

Index(['Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'NumMosquitos', 'WnvPresent'],
      dtype='object')

In [33]:
# Attach the combined weather columns to each training row by Date
# (inner join, which is pandas' default).
df = train.merge(weather, how='inner', on='Date')

In [34]:
# (rows, columns) of the joined train + weather frame.
df.shape

(10506, 28)

In [35]:
# Column names of the joined frame: training columns followed by weather columns.
df.columns

Index(['Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'NumMosquitos', 'WnvPresent', 'Day_length', 'Tmax', 'Tmin', 'Tavg',
       'ResultSpeed', 'ResultDir', 'AvgSpeed', 'Sunset', 'Sunrise', 'Heat',
       'Depart', 'DewPoint', 'WetBulb', 'Cool', 'CodeSum', 'PrecipTotal',
       'StnPressure'],
      dtype='object')

In [36]:
# Write the joined train + weather frame to disk. to_csv includes the
# DataFrame index by default (presumably Date here -- confirm).
df.to_csv('../data/dataframe.csv')