# 03 Preprocessing

Description: Processing the engineered data in preparation for modeling.

In [1]:
from sklearn.model_selection import train_test_split
import numpy as np 
import pandas as pd
import pickle
from sklearn.model_selection import TimeSeriesSplit

In [2]:
# Load the engineered feature table; the first CSV column is used as the row index.
features_csv = '../../data/final_dataframe_features.csv'
df = pd.read_csv(features_csv, index_col=0)
# Print dtypes and non-null counts as a quick check on the load.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8643 entries, 0 to 8642
Data columns (total 34 columns):
Trap                 8643 non-null object
Species              8643 non-null object
NumMosquitos         8643 non-null int64
WnvPresent           8643 non-null int64
Day_length           8643 non-null int64
Tmax                 8643 non-null float64
Tmin                 8643 non-null float64
Tavg                 8643 non-null float64
ResultSpeed          8643 non-null float64
ResultDir            8643 non-null float64
AvgSpeed             8643 non-null float64
Sunset               8643 non-null int64
Sunrise              8643 non-null int64
Heat                 8643 non-null float64
Depart               8643 non-null int64
DewPoint             8643 non-null float64
WetBulb              8643 non-null float64
Cool                 8643 non-null float64
CodeSum              8643 non-null object
PrecipTotal          8643 non-null float64
StnPressure          8643 non-null float64
Latit

# Dropping the CodeSum column
We noticed after we ran our first model that this column introduced a significant amount of noise.

In [3]:
# Remove CodeSum, which added noise in the first modelling pass.
df = df.drop(columns=['CodeSum'])

In [4]:
# Show the column labels that remain after dropping CodeSum.
# NOTE(review): the recorded output lacks Sunrise and WetBulb, which the earlier
# df.info() output lists, although only CodeSum is dropped in this notebook.
# The saved outputs may come from different runs; confirm with Restart & Run All.
df.columns

Index(['Trap', 'Species', 'NumMosquitos', 'WnvPresent', 'Day_length', 'Tmax',
       'Tmin', 'Tavg', 'ResultSpeed', 'ResultDir', 'AvgSpeed', 'Sunset',
       'Heat', 'Depart', 'DewPoint', 'Cool', 'PrecipTotal', 'StnPressure',
       'Latitude', 'Longitude', 'Month', 'Day_length_exp', 'Tavg_shift',
       'Heat_exp', 'Cool_shift', 'Tmax_shift', 'Tmin_shift', 'Depart_shift',
       'ResultSpeed_shift', 'ResultDir_exp', 'PrecipTotal_exp'],
      dtype='object')

# Making dummies of the species
This was the only column in the engineered data that needed to be one-hot encoded (dummied) for use in the modeling process.

In [5]:
# One-hot encode Species: one indicator column per distinct species value.
species_frame = df[['Species']]
df_object = pd.get_dummies(species_frame, columns=['Species'])

In [6]:
# Sanity check: one row per row of df and one indicator column per distinct Species value.
df_object.shape

(8643, 7)

In [7]:
# Append the Species indicator columns next to the existing features,
# aligning the two frames on their row index.
df = pd.concat([df, df_object], axis='columns', join='outer')

In [8]:
# Drop the raw categorical columns: Species is now represented by its indicator
# columns, and Trap is not carried forward as a feature.
df = df.drop(columns=['Trap', 'Species'])

In [9]:
# Target vector: the WnvPresent column.
y = df['WnvPresent']
# Feature matrix: every remaining column except the target and NumMosquitos.
X = df.drop(columns=['WnvPresent', 'NumMosquitos'])

# Time series split

In [10]:
# Splitter producing 3 successive train/test folds; it splits by row position and does not shuffle.
# NOTE(review): assumes the rows are already in chronological order — confirm upstream sorting.
tss = TimeSeriesSplit(n_splits = 3)

In [11]:
# Materialise every fold and keep only the last one, so the pickled train/test
# sets below come from the final split.
# NOTE(review): assumes rows of X are in chronological order — confirm upstream sorting.
train_index, test_index = list(tss.split(X))[-1]
X_train = X.iloc[train_index]
X_test = X.iloc[test_index]
y_train = y.iloc[train_index]
y_test = y.iloc[test_index]

# Pickling

In [12]:
# Persist the training features for the modelling notebook.
X_train.to_pickle('../assets/X_train.pkl')

In [13]:
# Persist the held-out test features.
X_test.to_pickle('../assets/X_test.pkl')

In [14]:
# Persist the training target values.
y_train.to_pickle('../assets/y_train.pkl')

In [15]:
# Persist the held-out test target values.
y_test.to_pickle('../assets/y_test.pkl')