## Merging the test data with the clean weather data

In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [53]:
# Load the test set and the cleaned weather table; in both files the first
# CSV column is used as the DataFrame index.
test_path = '../data/test_2.csv'
weather_path = '../data/clean_weather.csv'
test = pd.read_csv(test_path, index_col=0)
weather = pd.read_csv(weather_path, index_col=0)

In [54]:
# Move the index back into an ordinary column (presumably 'Date', which the
# merge below uses as its key -- confirm against the CSV).
test = test.reset_index()

In [55]:
# Attach the weather readings to every test row by Date.
# how='left' keeps every test row even when a Date has no weather record
# (the default inner join would silently drop those rows); the affected
# rows then carry NaN weather values instead of disappearing.
# validate='many_to_one' raises if `weather` holds more than one row per
# Date, which would otherwise silently duplicate test rows.
df = pd.merge(test, weather, on='Date', how='left', validate='many_to_one')

In [56]:
# Use the Date column as the row index of the merged frame.
df = df.set_index('Date')

In [57]:
# Convert the Date index to datetimes so that date components (e.g. the
# month) can be read from it.
parsed_dates = pd.to_datetime(df.index)
df.index = parsed_dates

Adding a `Month` column, matching the one in the training data

In [58]:
# Month number (1-12) of each row's date. The index was converted to a
# DatetimeIndex in the previous cell, so the vectorized `.month` accessor
# replaces a Python-level lambda call per row.
df['Month'] = df.index.month

In [59]:
# Display the column labels of the merged frame as a sanity check.
df.columns

Index(['Id', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'Day_length', 'Tmax', 'Tmin', 'Tavg', 'ResultSpeed', 'ResultDir',
       'AvgSpeed', 'Heat', 'DewPoint', 'WetBulb', 'Cool', 'PrecipTotal',
       'StnPressure', 'Sunset', 'Sunrise', 'Depart', 'CodeSum', 'Month'],
      dtype='object')

Saving my test data

In [61]:
# Persist the merged frame; the Date index is written as the first CSV column.
merged_output_path = '../data/test_final_dataframe.csv'
df.to_csv(merged_output_path)

# Feature Engineering

Description: Engineering features to further refine our data and prepare it for preprocessing.

In [26]:
import numpy as np
import pandas as pd

In [27]:
# Reload the merged test data; the first CSV column becomes the index and
# is parsed as dates.
merged_input_path = '../data/test_final_dataframe.csv'
df = pd.read_csv(merged_input_path, index_col=0, parse_dates=True)

In [28]:
# Turn the date index back into a column so the rows are labelled 0..n-1.
df = df.reset_index()

**Adding Rolling & Expanding Means for Features with Shifted Dates**

These features were optimized using our function that tests a range of rolling and expanding means and picks the ones that have the highest correlation with the number of mosquitoes.

In [29]:
# Engineered weather features, listed in the order their columns are appended
# to `df`. Each entry is (source column, lag):
#   * lag None -> expanding mean, stored as '<column>_exp'
#   * lag k    -> rolling mean shifted down by k rows, stored as '<column>_shift'
ROLLING_WINDOW = 1  # a one-row window leaves the values themselves unchanged
FEATURE_SPECS = [
    ('Day_length', None),
    ('Tavg', 7),
    ('Heat', None),
    ('Cool', 7),
    ('Tmax', 7),
    ('Tmin', 10),
    ('Depart', 7),
    ('ResultSpeed', 28),
    ('ResultDir', None),
    ('PrecipTotal', None),
    ('WetBulb', None),
]


def expanding_mean(series):
    """Return the mean of all values from the first row up to each row."""
    return series.expanding().mean()


def lagged_rolling_mean(series, lag, window=ROLLING_WINDOW):
    """Return the rolling mean of ``series`` shifted down by ``lag`` rows.

    Rolling and shifting leave the first ``lag + window - 1`` rows undefined;
    those rows are overwritten with the value of the first row after them, so
    the feature has no leading gap. If the series is too short to contain
    that row, the leading rows are left as they are instead of raising.

    Parameters
    ----------
    series : pandas.Series
        Numeric source column.
    lag : int
        Number of rows (not calendar days) to shift by.
    window : int, default ROLLING_WINDOW
        Size of the rolling-mean window.

    Returns
    -------
    pandas.Series
        New series aligned with ``series``.
    """
    lagged = series.rolling(window).mean().shift(lag)
    first_defined = lag + window - 1
    if len(lagged) > first_defined:
        # Positional assignment: does not depend on the index labels.
        lagged.iloc[:first_defined] = lagged.iloc[first_defined]
    return lagged


# NOTE(review): shift() and expanding() operate on ROWS. `df` holds many rows
# per Date (the df.head() output below shows five rows all dated 2008-06-11),
# so a lag of 7 is seven rows, not seven days, and the expanding means weight
# each date by its row count. Behaviour is kept as-is here; confirm it matches
# how the training features were built before changing it.
for column, lag in FEATURE_SPECS:
    if lag is None:
        df[column + '_exp'] = expanding_mean(df[column])
    else:
        df[column + '_shift'] = lagged_rolling_mean(df[column], lag)

**Checking Resultant DataFrame**

In [46]:
# Preview the first five rows, including the newly added feature columns.
df.head()

Unnamed: 0,Date,Id,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,...,Tavg_shift,Heat_exp,Cool_shift,Tmax_shift,Tmin_shift,Depart_shift,ResultSpeed_shift,ResultDir_exp,PrecipTotal_exp,WetBulb_exp
0,2008-06-11,1,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,75.0,0.0,10.0,86.0,63.5,7.0,9.15,18.0,0.0,64.0
1,2008-06-11,2,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,75.0,0.0,10.0,86.0,63.5,7.0,9.15,18.0,0.0,64.0
2,2008-06-11,3,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,75.0,0.0,10.0,86.0,63.5,7.0,9.15,18.0,0.0,64.0
3,2008-06-11,4,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,75.0,0.0,10.0,86.0,63.5,7.0,9.15,18.0,0.0,64.0
4,2008-06-11,5,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,75.0,0.0,10.0,86.0,63.5,7.0,9.15,18.0,0.0,64.0


In [47]:
# (rows, columns) of the frame after feature engineering.
df.shape

(116293, 39)

In [48]:
# NOTE(review): this repeats the previous cell's `df.shape` check with no
# change to `df` in between; the cell can be removed.
df.shape

(116293, 39)

In [49]:
# List the column labels to confirm the engineered features are present.
df.columns

Index(['Date', 'Id', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'Day_length', 'Tmax', 'Tmin', 'Tavg', 'ResultSpeed', 'ResultDir',
       'AvgSpeed', 'Heat', 'DewPoint', 'WetBulb', 'Cool', 'PrecipTotal',
       'StnPressure', 'Sunset', 'Depart', 'CodeSum', 'Month', 'Day_length_exp',
       'Tavg_shift', 'Heat_exp', 'Cool_shift', 'Tmax_shift', 'Tmin_shift',
       'Depart_shift', 'ResultSpeed_shift', 'ResultDir_exp', 'PrecipTotal_exp',
       'WetBulb_exp'],
      dtype='object')

Saving our resulting dataframe

In [50]:
# Write the feature-engineered frame to disk; the integer row index is
# written as the first (unnamed) CSV column.
features_output_path = '../data/test_final_dataframe_features.csv'
df.to_csv(features_output_path)