# Benchmark

## Import Library

In [1]:
# import libraries


import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

sns.set_style('whitegrid')

import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn #ignore annoying warning (from sklearn and seaborn)

## Import Data

In [30]:
train = pd.read_csv('./data/train_weather.csv')

In [31]:
train.dtypes

month             int64
Species          object
NumMosquitos      int64
Sunset            int64
Street           object
DaylightHrs       int64
Tmax            float64
Tmin            float64
Tavg            float64
Depart          float64
DewPoint        float64
WetBulb         float64
Cool            float64
PrecipTotal     float64
StnPressure     float64
SeaLevel        float64
ResultSpeed     float64
ResultDir       float64
AvgSpeed        float64
WnvPresent        int64
dtype: object

In [32]:
train_dummies = pd.get_dummies(train,drop_first=True,columns=['Species','Street'])

In [11]:
scaler = StandardScaler()
scaler.fit(train_dummies)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [15]:
train_dummies['Street_ W MONTANA ST'].value_counts()

0    10446
1       60
Name: Street_ W MONTANA ST, dtype: int64

## Train Test Split

In [26]:
y = train_dummies['WnvPresent']
X = train_dummies[[col for col in train_dummies.columns if col != 'WnvPresent']]


In [27]:
train_x, test_x, train_y, test_y = train_test_split(X,y,test_size = 0.3, random_state = 666)



In [28]:
baseline = Pipeline([
    ('SS', StandardScaler()),
    ('lr', LogisticRegression())
])

cross_val_score(baseline, train_x, train_y, cv=5) 



array([0.94361413, 0.94629504, 0.94425561, 0.9462585 , 0.94013605])

## Address class imbalance using SMOTE

In [68]:
sm = SMOTE(sampling_strategy=1,random_state=666)
sampledX,sampledy = sm.fit_sample(train_x,train_y)

In [70]:
baseline = Pipeline([
    ('SS', StandardScaler()),
    ('lr', LogisticRegression())
])

cross_val_score(baseline, strain_x, strain_y, cv=5) 



array([0.79805228, 0.79753972, 0.80666667, 0.80974359, 0.80410256])

In [65]:
def get_report(pipe,xtrain,xtest,ytrain,ytest):
    fit_pipe = pipe.fit(xtrain,ytrain)
    y_predict = fit_pipe.predict(xtest)
    print(classification_report(ytest, y_predict, target_names=['class 0','class 1']))

In [66]:
get_report(baseline,train_x, test_x, train_y, test_y)



              precision    recall  f1-score   support

     class 0       0.95      1.00      0.97      2989
     class 1       0.56      0.06      0.10       163

    accuracy                           0.95      3152
   macro avg       0.76      0.53      0.54      3152
weighted avg       0.93      0.95      0.93      3152



In [71]:
get_report(baseline,sampledX, stest_x, sampledy, stest_y)



              precision    recall  f1-score   support

     class 0       0.84      0.77      0.80      2104
     class 1       0.78      0.86      0.82      2076

    accuracy                           0.81      4180
   macro avg       0.81      0.81      0.81      4180
weighted avg       0.81      0.81      0.81      4180

