# Preprocessing
Description:
This notebook prepares our engineered feature data for use in the modeling process.
A few considerations to note are explained in the sections below.

In [1]:
from sklearn.model_selection import train_test_split
import numpy as np 
import pandas as pd
import pickle
from sklearn.model_selection import TimeSeriesSplit

In [2]:
# Read the engineered feature table; the first CSV column is used as the row index.
df = pd.read_csv(
    '../data/final_dataframe_features.csv',
    index_col=0,
)


In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8643 entries, 0 to 8642
Data columns (total 34 columns):
Trap                 8643 non-null object
Species              8643 non-null object
NumMosquitos         8643 non-null int64
WnvPresent           8643 non-null int64
Day_length           8643 non-null int64
Tmax                 8643 non-null float64
Tmin                 8643 non-null float64
Tavg                 8643 non-null float64
ResultSpeed          8643 non-null float64
ResultDir            8643 non-null float64
AvgSpeed             8643 non-null float64
Sunset               8643 non-null int64
Sunrise              8643 non-null int64
Heat                 8643 non-null float64
Depart               8643 non-null int64
DewPoint             8643 non-null float64
WetBulb              8643 non-null float64
Cool                 8643 non-null float64
CodeSum              8643 non-null object
PrecipTotal          8643 non-null float64
StnPressure          8643 non-null float64
Latit

# Dropping the CodeSum column
After running our first model, we noticed that this column introduced a significant amount of noise, so we drop it here.

In [4]:
# Remove the CodeSum column. `df` is rebound instead of mutated with inplace=True,
# and errors='ignore' lets this cell be re-run without a KeyError once the
# column is already gone.
df = df.drop(columns=['CodeSum'], errors='ignore')

In [5]:
# Weather columns to discard from the frame.
# NOTE(review): the `*_shift` columns that remain look like lagged versions of
# some of these readings — confirm against the feature-engineering notebook.
raw_weather_columns = [
    'Day_length', 'Tmax', 'Tmin', 'Tavg', 'ResultSpeed', 'ResultDir',
    'AvgSpeed', 'Sunset', 'Sunrise', 'Heat', 'Depart', 'DewPoint',
    'WetBulb', 'Cool', 'PrecipTotal', 'StnPressure',
]

# Rebind rather than mutate in place; errors='ignore' keeps the cell safe to
# re-run after the columns have already been removed.
df = df.drop(columns=raw_weather_columns, errors='ignore')

In [6]:
# Show which columns are left in the frame at this point.
df.columns

Index(['Trap', 'Species', 'NumMosquitos', 'WnvPresent', 'Latitude',
       'Longitude', 'Month', 'Day_length_shift', 'Tavg_shift', 'Heat_shift',
       'Cool_shift', 'Tmax_shift', 'Tmin_shift', 'Depart_shift',
       'ResultSpeed_shift', 'ResultDir_shift', 'WetBulb_shift'],
      dtype='object')

# Making dummies of the species
`Species` was the only column in the engineered data that needed to be converted into dummy (one-hot) variables for use in the modeling process.

In [7]:
# One-hot encode Species into a separate frame; each new column is named
# 'Species_<value>'.
df_object = pd.get_dummies(
    df[['Species']],
    columns=['Species'],
)

In [8]:
# (rows, dummy columns) of the encoded Species frame.
df_object.shape

(8643, 7)

In [9]:
# Attach the Species dummy columns to the main frame, side by side; rows are
# matched on the index.
df = pd.concat(
    [df, df_object],
    axis='columns',
    join='outer',
)

In [10]:
# Drop the two string (object-dtype) columns; Species is now carried by its
# dummy columns. Rebinding with errors='ignore' keeps the cell re-runnable
# instead of raising KeyError on a second execution.
df = df.drop(columns=['Trap', 'Species'], errors='ignore')

In [11]:
# Target: the WnvPresent column.
y = df['WnvPresent']

# Features: every remaining column except the target and the NumMosquitos count.
X = df.drop(columns=['WnvPresent', 'NumMosquitos'])


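# Time series split
We use scikit-learn's `TimeSeriesSplit` with three splits and keep only the last fold as our train/test split, so the test set is the final block of rows and the training set is every row before it.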


In [12]:
# Three-fold time-series splitter; the folds are materialised in the next cell.
tss = TimeSeriesSplit(n_splits=3)

In [13]:
# Keep only the final fold, taken directly instead of looping over every fold
# and overwriting the variables each time. The final fold has the largest
# training window and holds out the last block of rows.
# NOTE(review): TimeSeriesSplit splits purely by row position, so this assumes
# the rows of X are already in chronological order — confirm upstream.
train_index, test_index = list(tss.split(X))[-1]

X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Sanity checks: all training rows precede the test rows, and together the two
# parts cover every row of X.
assert train_index.max() < test_index.min()
assert len(X_train) + len(X_test) == len(X)

In [14]:
# Number of rows in the training target.
len(y_train)

6483

In [15]:
# Preview the first rows only, rather than rendering the whole training frame.
X_train.head(10)

Unnamed: 0,Latitude,Longitude,Month,Day_length_shift,Tavg_shift,Heat_shift,Cool_shift,Tmax_shift,Tmin_shift,Depart_shift,ResultSpeed_shift,ResultDir_shift,WetBulb_shift,Species_CULEX ERRATICUS,Species_CULEX PIPIENS,Species_CULEX PIPIENS/RESTUANS,Species_CULEX RESTUANS,Species_CULEX SALINARIUS,Species_CULEX TARSALIS,Species_CULEX TERRITANS
0,41.954690,-87.800991,5,53760.0,75.5,0.0,10.5,88.0,62.5,10.0,5.8,17.0,65.5,0,0,1,0,0,0,0
1,41.954690,-87.800991,5,53760.0,75.5,0.0,10.5,88.0,62.5,10.0,5.8,17.0,65.5,0,0,0,1,0,0,0
2,41.994991,-87.769279,5,53760.0,75.5,0.0,10.5,88.0,62.5,10.0,5.8,17.0,65.5,0,0,0,1,0,0,0
3,41.974089,-87.824812,5,53760.0,75.5,0.0,10.5,88.0,62.5,10.0,5.8,17.0,65.5,0,0,1,0,0,0,0
4,41.974089,-87.824812,5,53760.0,75.5,0.0,10.5,88.0,62.5,10.0,5.8,17.0,65.5,0,0,0,1,0,0,0
5,41.921600,-87.666455,5,53760.0,75.5,0.0,10.5,88.0,62.5,10.0,5.8,17.0,65.5,0,0,0,1,0,0,0
6,41.891118,-87.654491,5,53760.0,75.5,0.0,10.5,88.0,62.5,10.0,5.8,17.0,65.5,0,0,0,1,0,0,0
7,41.867108,-87.654224,5,53760.0,75.5,0.0,10.5,88.0,62.5,10.0,5.8,17.0,65.5,0,0,1,0,0,0,0
8,41.867108,-87.654224,5,53760.0,75.5,0.0,10.5,88.0,62.5,10.0,5.8,17.0,65.5,0,0,0,1,0,0,0
9,41.896282,-87.655232,5,53760.0,75.5,0.0,10.5,88.0,62.5,10.0,5.8,17.0,65.5,0,0,0,1,0,0,0


# Pickling

In [16]:
# Persist the training features for the modeling step.
X_train.to_pickle('../assets/X_train.pkl')

In [17]:
# Persist the held-out features for the modeling step.
X_test.to_pickle('../assets/X_test.pkl')

In [18]:
# Persist the training target for the modeling step.
y_train.to_pickle('../assets/y_train.pkl')

In [19]:
# Persist the held-out target for the modeling step.
y_test.to_pickle('../assets/y_test.pkl')