# Import Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
import datetime

# Loading Data

In [2]:
# Read the dataset produced by the EDA step into a DataFrame.
df = pd.read_csv(filepath_or_buffer='project2_df.csv')

In [3]:
# Preview the first 20 rows of the loaded data.
df.head(20)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,DewPoint,WetBulb,CodeSum,...,year_weather,week,Species_virus,7day_Tavg,10day_Tavg,7day_Precip,10day_Precip,tavg_celsius,wetbulb_celsius,relative_humidity
0,0,0,1,2007-05-29,88,60,74,58,65,BR HZ,...,2007,22,1,74.0,74.0,0.0,0.0,23.333333,18.333333,59.899166
1,1,1,1,2007-05-29,88,60,74,58,65,BR HZ,...,2007,22,1,74.0,74.0,0.0,0.0,23.333333,18.333333,59.899166
2,2,2,1,2007-05-29,88,60,74,58,65,BR HZ,...,2007,22,1,74.0,74.0,0.0,0.0,23.333333,18.333333,59.899166
3,3,3,1,2007-05-29,88,60,74,58,65,BR HZ,...,2007,22,1,74.0,74.0,0.0,0.0,23.333333,18.333333,59.899166
4,4,4,1,2007-05-29,88,60,74,58,65,BR HZ,...,2007,22,1,74.0,74.0,0.0,0.0,23.333333,18.333333,59.899166
5,5,5,1,2007-05-29,88,60,74,58,65,BR HZ,...,2007,22,1,74.0,74.0,0.0,0.0,23.333333,18.333333,59.899166
6,6,6,1,2007-05-29,88,60,74,58,65,BR HZ,...,2007,22,1,74.0,74.0,0.0,0.0,23.333333,18.333333,59.899166
7,7,7,2,2007-05-29,88,65,77,59,66,BR HZ,...,2007,22,1,74.0,74.0,0.0,0.0,25.0,18.888889,53.704207
8,8,8,2,2007-05-29,88,65,77,59,66,BR HZ,...,2007,22,1,74.0,74.0,0.0,0.0,25.0,18.888889,53.704207
9,9,9,2,2007-05-29,88,65,77,59,66,BR HZ,...,2007,22,1,74.0,74.0,0.0,0.0,25.0,18.888889,53.704207


# Prepare the data for the train-test split

In [4]:
# List every column name before choosing which ones to drop.
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Station', 'Date', 'Tmax', 'Tmin', 'Tavg',
       'DewPoint', 'WetBulb', 'CodeSum', 'PrecipTotal', 'StnPressure',
       'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed', 'year_train',
       'month_train', 'day_train', 'Address', 'Species', 'Trap', 'Latitude',
       'Longitude', 'NumMosquitos', 'WnvPresent', 'day_weather',
       'month_weather', 'year_weather', 'week', 'Species_virus', '7day_Tavg',
       '10day_Tavg', '7day_Precip', '10day_Precip', 'tavg_celsius',
       'wetbulb_celsius', 'relative_humidity'],
      dtype='object')

In [6]:
# Columns removed before modeling:
#   - 'Unnamed: ...'  : leftover row-index columns picked up when the CSV was
#                       saved and re-read. They are matched by prefix rather
#                       than listed by name because their number varies (the
#                       preview of the file shows 'Unnamed: 0.2' as well as
#                       'Unnamed: 0.1' and 'Unnamed: 0'); any that slipped
#                       through would end up in X as a row-counter feature.
#   - year_train, month_train, day_train : date parts from the train dataset
#   - Address, CodeSum, Trap
#   - Date            : the date object column
#   - Species         : the per-type species column; 'Species_virus' already
#                       encodes whether the mosquito type carries the virus
index_cols = [col for col in df.columns if str(col).startswith('Unnamed:')]
unused_cols = ['CodeSum', 'Address', 'Trap', 'year_train', 'month_train', 'day_train', 'Date', 'Species']

df1 = df.drop(columns=index_cols + unused_cols)

In [7]:
# Rows and columns remaining after the drop.
df1.shape

(10506, 28)

In [8]:
# Separate the target column from the features; both are passed to
# train_test_split below.
y = df1['WnvPresent']
X = df1.drop(columns=['WnvPresent'])

In [9]:
# Split the features and target into train (70%) and test (30%) sets.
# random_state fixes the shuffle so the split is reproducible.
# stratify=y keeps the proportion of each WnvPresent value the same in both
# sets, so the held-out score is not skewed by an unlucky draw of the target.
# NOTE(review): assumes WnvPresent is a 0/1 presence label -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=47,
    stratify=y,
)


In [10]:
# Sanity check: within each split, X and y must have the same number of rows.
# First line printed is the feature shapes, second line the target shapes.
for train_part, test_part in ((X_train, X_test), (y_train, y_test)):
    print(train_part.shape, test_part.shape)

(7354, 27) (3152, 27)
(7354,) (3152,)


In [11]:
# Inspect the dtype of each feature column in the training set.
X_train.dtypes

Station                int64
Tmax                   int64
Tmin                   int64
Tavg                   int64
DewPoint               int64
WetBulb                int64
PrecipTotal          float64
StnPressure          float64
SeaLevel             float64
ResultSpeed          float64
ResultDir              int64
AvgSpeed             float64
Latitude             float64
Longitude            float64
NumMosquitos           int64
day_weather            int64
month_weather          int64
year_weather           int64
week                   int64
Species_virus          int64
7day_Tavg            float64
10day_Tavg           float64
7day_Precip          float64
10day_Precip         float64
tavg_celsius         float64
wetbulb_celsius      float64
relative_humidity    float64
dtype: object

# Questions

In [None]:
# Open questions (kept as comments so this code cell does not raise a
# SyntaxError on Restart & Run All):
#
# 1. Can I use a single pipeline in the modeling part of the project that also
#    handles the scaling of the data?
#
# 2. Should I have kept the actual 'Date' column (e.g. 07-27-2009), or is it
#    okay to drop it since I already have the day, week, month, and year of
#    the data?

