## Merging the test data with the clean weather data

In [691]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [692]:
# Load both inputs the same way: the first CSV column becomes each frame's index.
test, weather = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../data/test_2.csv', '../data/clean_weather.csv')
)

In [693]:
# Move the current index into a regular column and fall back to a default integer index.
# Presumably that index is 'Date', the key used by the merge that follows — verify.
test = test.reset_index()

In [694]:
# Attach the weather columns to every test record by date.
# how='left' keeps every test row: an inner join would silently drop records whose
# Date has no weather match. validate='many_to_one' raises MergeError if the weather
# table holds more than one row per Date, which would otherwise duplicate test records.
df = pd.merge(test, weather, on='Date', how='left', validate='many_to_one')

In [695]:
# Use the Date column as the row index of the merged frame.
df = df.set_index('Date')

In [696]:
# Parse the index labels into datetimes so date attributes are available on the index.
parsed_dates = pd.to_datetime(df.index)
df.index = parsed_dates

Adding a `Month` column, as in the train data

In [697]:
# Month number taken from the datetime index.
# The vectorised accessor replaces a per-element Python lambda over every row.
df['Month'] = df.index.month

In [698]:
# Display the column labels of the merged frame.
df.columns

Index(['Id', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'Day_length', 'Tmax', 'Tmin', 'Tavg', 'ResultSpeed', 'ResultDir',
       'AvgSpeed', 'Heat', 'DewPoint', 'WetBulb', 'Cool', 'PrecipTotal',
       'StnPressure', 'Sunset', 'Sunrise', 'Depart', 'CodeSum', 'Month'],
      dtype='object')

Saving my test data

In [699]:
# Persist the merged frame as CSV; the index is written too (pandas default).
df.to_csv(path_or_buf='../data/test_final_dataframe.csv')

# Feature Engineering

Description: Engineering features to further refine our data and prepare it for preprocessing.

In [700]:
import numpy as np
import pandas as pd

In [701]:
# Reload the merged test frame; the first CSV column becomes the index and
# parse_dates=True asks pandas to parse that index as dates.
df = pd.read_csv(
    '../data/test_final_dataframe.csv',
    parse_dates=True,
    index_col=0,
)

In [702]:
# Turn the index back into a regular column so rows are labelled 0..n-1;
# the positional back-fills in the feature cells rely on those integer labels.
df = df.reset_index()

**Adding Rolling & Expanded Means for Features with Shifted Dates**

The optimal shifts for these features were found in earlier analysis (TODO: reference where they were determined) and are applied below.

In [1]:
# Smooth Day_length with a 3-row rolling mean, then lag the result by 28 rows.
# NOTE(review): rolling/shift count rows, not calendar days — the frame appears to hold
# many rows per Date, so confirm that a row-based window is what is intended.
df['Day_length_shift'] = (
    df['Day_length']
    .rolling(window=3)
    .mean()
    .shift(periods=28)
)
# Rows 0..29 are NaN after rolling + shift; fill them with the first computed value (row 30).
df.loc[:29, 'Day_length_shift'] = df.at[30, 'Day_length_shift']

NameError: name 'df' is not defined

In [2]:
# 3-row rolling mean of Tavg, lagged by 14 rows.
window, lag = 3, 14
# rolling(window) leaves the first window-1 rows NaN and shift(lag) adds lag more,
# so the first computed value sits at row (window - 1) + lag = 16.
first_valid_row = (window - 1) + lag
df['Tavg_shift'] = df['Tavg'].rolling(window).mean().shift(lag)
# Back-fill only the leading NaN rows so that no computed value is overwritten.
df.loc[0:first_valid_row - 1, 'Tavg_shift'] = df.loc[first_valid_row, 'Tavg_shift']

NameError: name 'df' is not defined

In [722]:
# 3-row rolling mean of Heat, lagged by 14 rows.
window, lag = 3, 14
# rolling(window) leaves the first window-1 rows NaN and shift(lag) adds lag more,
# so the first computed value sits at row (window - 1) + lag = 16.
first_valid_row = (window - 1) + lag
df['Heat_shift'] = df['Heat'].rolling(window).mean().shift(lag)
# Back-fill only the leading NaN rows so that no computed value is overwritten.
df.loc[0:first_valid_row - 1, 'Heat_shift'] = df.loc[first_valid_row, 'Heat_shift']

In [723]:
# 3-row rolling mean of Cool, lagged by 14 rows.
window, lag = 3, 14
# rolling(window) leaves the first window-1 rows NaN and shift(lag) adds lag more,
# so the first computed value sits at row (window - 1) + lag = 16.
first_valid_row = (window - 1) + lag
df['Cool_shift'] = df['Cool'].rolling(window).mean().shift(lag)
# Back-fill only the leading NaN rows so that no computed value is overwritten.
df.loc[0:first_valid_row - 1, 'Cool_shift'] = df.loc[first_valid_row, 'Cool_shift']

In [724]:
# 3-row rolling mean of Tmax, lagged by 14 rows.
window, lag = 3, 14
# rolling(window) leaves the first window-1 rows NaN and shift(lag) adds lag more,
# so the first computed value sits at row (window - 1) + lag = 16.
first_valid_row = (window - 1) + lag
df['Tmax_shift'] = df['Tmax'].rolling(window).mean().shift(lag)
# Back-fill only the leading NaN rows so that no computed value is overwritten.
df.loc[0:first_valid_row - 1, 'Tmax_shift'] = df.loc[first_valid_row, 'Tmax_shift']

In [725]:
# 3-row rolling mean of Tmin, lagged by 14 rows.
window, lag = 3, 14
# rolling(window) leaves the first window-1 rows NaN and shift(lag) adds lag more,
# so the first computed value sits at row (window - 1) + lag = 16.
first_valid_row = (window - 1) + lag
df['Tmin_shift'] = df['Tmin'].rolling(window).mean().shift(lag)
# Back-fill only the leading NaN rows so that no computed value is overwritten.
df.loc[0:first_valid_row - 1, 'Tmin_shift'] = df.loc[first_valid_row, 'Tmin_shift']

In [726]:
# 3-row rolling mean of Depart, lagged by 14 rows.
window, lag = 3, 14
# rolling(window) leaves the first window-1 rows NaN and shift(lag) adds lag more,
# so the first computed value sits at row (window - 1) + lag = 16.
first_valid_row = (window - 1) + lag
df['Depart_shift'] = df['Depart'].rolling(window).mean().shift(lag)
# Back-fill only the leading NaN rows so that no computed value is overwritten.
df.loc[0:first_valid_row - 1, 'Depart_shift'] = df.loc[first_valid_row, 'Depart_shift']

In [727]:
# ResultSpeed lagged by 21 rows; the 1-row rolling mean does no smoothing and is kept
# only so this feature is built the same way as the others.
window, lag = 1, 21
# rolling(window) leaves the first window-1 rows NaN and shift(lag) adds lag more,
# so the first computed value sits at row (window - 1) + lag = 21.
first_valid_row = (window - 1) + lag
df['ResultSpeed_shift'] = df['ResultSpeed'].rolling(window).mean().shift(lag)
# Back-fill only the leading NaN rows so that no computed value is overwritten.
df.loc[0:first_valid_row - 1, 'ResultSpeed_shift'] = df.loc[first_valid_row, 'ResultSpeed_shift']

In [728]:
# Smooth ResultDir with a 3-row rolling mean, then lag the result by 21 rows.
df['ResultDir_shift'] = (
    df['ResultDir']
    .rolling(window=3)
    .mean()
    .shift(periods=21)
)
# Rows 0..22 are NaN after rolling + shift; fill them with the first computed value (row 23).
df.loc[:22, 'ResultDir_shift'] = df.at[23, 'ResultDir_shift']

In [729]:
# Smooth WetBulb with a 3-row rolling mean, then lag the result by 14 rows.
df['WetBulb_shift'] = (
    df['WetBulb']
    .rolling(window=3)
    .mean()
    .shift(periods=14)
)
# Rows 0..15 are NaN after rolling + shift; fill them with the first computed value (row 16).
df.loc[:15, 'WetBulb_shift'] = df.at[16, 'WetBulb_shift']

**Checking Resultant DataFrame**

In [730]:
# Preview the first rows to confirm the new *_shift columns are populated.
df.head()

Unnamed: 0,Date,Id,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,...,Day_length_shift,Tavg_shift,Heat_shift,Cool_shift,Tmax_shift,Tmin_shift,Depart_shift,ResultSpeed_shift,ResultDir_shift,WetBulb_shift
0,2008-06-11,1,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,54600.0,75.0,0.0,10.0,86.0,63.5,7.0,9.15,18.0,64.0
1,2008-06-11,2,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,54600.0,75.0,0.0,10.0,86.0,63.5,7.0,9.15,18.0,64.0
2,2008-06-11,3,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,54600.0,75.0,0.0,10.0,86.0,63.5,7.0,9.15,18.0,64.0
3,2008-06-11,4,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,54600.0,75.0,0.0,10.0,86.0,63.5,7.0,9.15,18.0,64.0
4,2008-06-11,5,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,54600.0,75.0,0.0,10.0,86.0,63.5,7.0,9.15,18.0,64.0


In [731]:
# Row and column counts of the engineered frame.
df.shape

(116293, 39)

In [732]:
# Row and column counts of the engineered frame.
# NOTE(review): repeats the shape check in the preceding cell — candidate for removal.
df.shape

(116293, 39)

In [733]:
# Display the column labels, including the engineered *_shift features.
df.columns

Index(['Date', 'Id', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'Day_length', 'Tmax', 'Tmin', 'Tavg', 'ResultSpeed', 'ResultDir',
       'AvgSpeed', 'Heat', 'DewPoint', 'WetBulb', 'Cool', 'PrecipTotal',
       'StnPressure', 'Sunset', 'Sunrise', 'Depart', 'CodeSum', 'Month',
       'Day_length_shift', 'Tavg_shift', 'Heat_shift', 'Cool_shift',
       'Tmax_shift', 'Tmin_shift', 'Depart_shift', 'ResultSpeed_shift',
       'ResultDir_shift', 'WetBulb_shift'],
      dtype='object')

Saving our resulting dataframe

In [734]:
# Persist the engineered frame as CSV; the index is written too (pandas default).
df.to_csv(path_or_buf='../data/test_final_dataframe_features.csv')