In [17]:
import pandas as pd
import numpy as np
from sklearn import cross_validation
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import SelectFromModel
import sklearn.metrics
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import FeatureUnion
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA

In [18]:
df=pd.read_csv('df.csv')

In [19]:
_df=pd.read_csv('test.csv')

In [20]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,species,latitude,longitude,nummosquitos,month,day,Tmax,Tmin,Tavg,...,WetBulb,Cool,Sunrise,Sunset,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,wnvpresent
0,0,2,41.95469,-87.800991,1,5,29,88,60,74,...,65.0,9,421,1917,29.39,30.11,5.8,18,6.5,0
1,1,3,41.95469,-87.800991,1,5,29,88,60,74,...,65.0,9,421,1917,29.39,30.11,5.8,18,6.5,0


In [21]:
df=df.drop('Unnamed: 0', axis=1)
df.head(2)

Unnamed: 0,species,latitude,longitude,nummosquitos,month,day,Tmax,Tmin,Tavg,Depart,...,WetBulb,Cool,Sunrise,Sunset,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,wnvpresent
0,2,41.95469,-87.800991,1,5,29,88,60,74,10,...,65.0,9,421,1917,29.39,30.11,5.8,18,6.5,0
1,3,41.95469,-87.800991,1,5,29,88,60,74,10,...,65.0,9,421,1917,29.39,30.11,5.8,18,6.5,0


In [22]:
df.columns

Index([u'species', u'latitude', u'longitude', u'nummosquitos', u'month',
       u'day', u'Tmax', u'Tmin', u'Tavg', u'Depart', u'DewPoint', u'WetBulb',
       u'Cool', u'Sunrise', u'Sunset', u'StnPressure', u'SeaLevel',
       u'ResultSpeed', u'ResultDir', u'AvgSpeed', u'wnvpresent'],
      dtype='object')

In [23]:
#set up X and Y values & split validation set
validation_size= 0.20
data=df.values
X=data[:,0:20]
Y=data[:,20]
print X.shape, len(X)
Y=df.wnvpresent.values
print Y.shape, len(Y)
X_train, X_validation, Y_train,Y_validation=cross_validation.train_test_split(X,Y,test_size=validation_size, random_state=10)
print X_train.shape
print Y_train.shape

(10506, 20) 10506
(10506,) 10506
(8404, 20)
(8404,)


In [24]:
models=[]
models.append(('LOG', LogisticRegression()))
models.append(('LDA',LinearDiscriminantAnalysis()))
models.append(('CART',DecisionTreeClassifier()))
models.append(('KNN',KNeighborsClassifier()))
models.append(('SVM',SVC()))
models.append(('NB',GaussianNB()))

In [26]:
results=[]
names=[]

for name, model in models:
    kfold=KFold(n=8404,n_folds=10, random_state=10)
    _results=cross_validation.cross_val_score(model,X_train,Y_train,cv=kfold,scoring='accuracy')
    results.append(_results)
    names.append(name)
    scores="%s: %f (%f)" % (name, _results.mean(), _results.std())
    print scores

LOG: 0.946693 (0.005334)
LDA: 0.946693 (0.005842)
CART: 0.914328 (0.010493)
KNN: 0.939672 (0.006624)
SVM: 0.944313 (0.005518)
NB: 0.814613 (0.013983)


The linear models performed best.

In [27]:
#create a pipeline 
pipelines=[]
pipelines.append(('ScaledLOG', Pipeline([('Scaler', StandardScaler()),('LOG', LogisticRegression())])))
pipelines.append(('ScaledLDA', Pipeline([('Scaler', StandardScaler()),('LDA', LinearDiscriminantAnalysis())])))
pipelines.append(('ScaledCART', Pipeline([('Scaler', StandardScaler()),('CART', DecisionTreeClassifier())])))
pipelines.append(('ScaledKNN', Pipeline([('Scaler', StandardScaler()),('KNN', KNeighborsClassifier())])))
pipelines.append(('ScaledSVM', Pipeline([('Scaler', StandardScaler()),('SVM', SVC())])))
pipelines.append(('ScaledGNB', Pipeline([('Scaler', StandardScaler()),('GNB', GaussianNB())])))

In [31]:
results=[]
names=[]

for name, model in pipelines:
    kfold=KFold(n=8404,n_folds=10, random_state=10)
    _results=cross_validation.cross_val_score(model,X_train,Y_train,cv=kfold,scoring='accuracy')
    results.append(_results)
    names.append(name)
    scores="%s: %f (%f)" % (name, _results.mean(), _results.std())
    print scores

ScaledLOG: 0.946574 (0.005879)
ScaledLDA: 0.946693 (0.005842)
ScaledCART: 0.915637 (0.009697)
ScaledKNN: 0.941934 (0.006999)
ScaledSVM: 0.946693 (0.005334)
ScaledGNB: 0.814732 (0.014066)


After scaling our dataset most of the models performed better. We used pipeline so that each model is perfomed on unseen data.

In [34]:
#improve our knn model

scaler=StandardScaler().fit(X_train)
_X=scaler.transform(X_train)
k_values= np.array(range(1,21))
param=dict(n_neighbors=k_values)
model=KNeighborsClassifier()
kfold=cross_validation.KFold(n=8404,n_folds=10, random_state=10)
grid= GridSearchCV(estimator=model, param_grid=param, scoring='accuracy', cv=kfold)
grid_result=grid.fit(_X, Y_train)


In [35]:
print ("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for params, mean_score, scores in grid_result.grid_scores_:
    print ("%f (%f) with: %r" % (scores.mean(), scores.std(), params))

Best: 0.947525 using {'n_neighbors': 20}
0.922657 (0.008668) with: {'n_neighbors': 1}
0.944075 (0.007516) with: {'n_neighbors': 2}
0.938245 (0.008510) with: {'n_neighbors': 3}
0.944671 (0.005747) with: {'n_neighbors': 4}
0.941577 (0.006786) with: {'n_neighbors': 5}
0.947050 (0.004787) with: {'n_neighbors': 6}
0.944433 (0.004713) with: {'n_neighbors': 7}
0.945860 (0.004729) with: {'n_neighbors': 8}
0.944908 (0.004603) with: {'n_neighbors': 9}
0.946693 (0.005143) with: {'n_neighbors': 10}
0.945742 (0.006721) with: {'n_neighbors': 11}
0.946693 (0.005307) with: {'n_neighbors': 12}
0.946336 (0.005324) with: {'n_neighbors': 13}
0.947288 (0.005648) with: {'n_neighbors': 14}
0.946812 (0.005522) with: {'n_neighbors': 15}
0.946931 (0.005156) with: {'n_neighbors': 16}
0.947169 (0.004903) with: {'n_neighbors': 17}
0.946812 (0.005123) with: {'n_neighbors': 18}
0.947169 (0.005419) with: {'n_neighbors': 19}
0.947526 (0.005131) with: {'n_neighbors': 20}


In [38]:
# Tune SVM
scaler=StandardScaler().fit(X_train)
_X=scaler.transform(X_train)
c_values= [0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.3, 1.5, 1.7, 2.0]
kernel_values=['linear','poly']
param=dict(C=c_values, kernel=kernel_values)
model=SVC()
kfold=cross_validation.KFold(n=8404,n_folds=10, random_state=10)
grid= GridSearchCV(estimator=model, param_grid=param, scoring='accuracy', cv=kfold)
grid_result=grid.fit(_X, Y_train)


In [39]:
print ("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for params, mean_score, scores in grid_result.grid_scores_:
    print ("%f (%f) with: %r" % (scores.mean(), scores.std(), params))

Best: 0.946692 using {'kernel': 'linear', 'C': 0.1}
0.946693 (0.005334) with: {'kernel': 'linear', 'C': 0.1}
0.946693 (0.005334) with: {'kernel': 'poly', 'C': 0.1}
0.946693 (0.005334) with: {'kernel': 'linear', 'C': 0.2}
0.946693 (0.005334) with: {'kernel': 'poly', 'C': 0.2}
0.946693 (0.005334) with: {'kernel': 'linear', 'C': 0.4}
0.946693 (0.005334) with: {'kernel': 'poly', 'C': 0.4}
0.946693 (0.005334) with: {'kernel': 'linear', 'C': 0.6}
0.946693 (0.005334) with: {'kernel': 'poly', 'C': 0.6}
0.946693 (0.005334) with: {'kernel': 'linear', 'C': 0.8}
0.946693 (0.005334) with: {'kernel': 'poly', 'C': 0.8}
0.946693 (0.005334) with: {'kernel': 'linear', 'C': 1.0}
0.946693 (0.005334) with: {'kernel': 'poly', 'C': 1.0}
0.946693 (0.005334) with: {'kernel': 'linear', 'C': 1.3}
0.946693 (0.005334) with: {'kernel': 'poly', 'C': 1.3}
0.946693 (0.005334) with: {'kernel': 'linear', 'C': 1.5}
0.946693 (0.005334) with: {'kernel': 'poly', 'C': 1.5}
0.946693 (0.005334) with: {'kernel': 'linear', 'C': 

KNN model improved. SVM models did not improve significantly

In [40]:
# use ensemble methods to increase performance
methods=[]
methods.append(('AB', AdaBoostClassifier()))
methods.append(('GBM',LinearDiscriminantAnalysis()))
methods.append(('RF',RandomForestClassifier()))
methods.append(('ET',ExtraTreesClassifier()))


In [42]:
results=[]
names=[]

for name, model in methods:
    kfold=KFold(n=8404,n_folds=10, random_state=10)
    _results=cross_validation.cross_val_score(model,X_train,Y_train,cv=kfold,scoring='accuracy')
    results.append(_results)
    names.append(name)
    scores="%s: %f (%f)" % (name, _results.mean(), _results.std())
    print scores

AB: 0.945503 (0.005113)
GBM: 0.946693 (0.005842)
RF: 0.941576 (0.007353)
ET: 0.942528 (0.005846)


The ensemble methods did not perform better than our linear & non linear models. 

In [62]:
# study the logistic regression (best model) further:
features=[]
features.append(('pca', PCA (n_components=3)))
features.append(('select_best', SelectKBest(k=6)))
feature_union= FeatureUnion(features)

In [65]:
estimators=[]
#knn=KNeighborsClassifier(n_neighbors=20)
estimators.append(('feature_union', feature_union))
estimators.append(('Knn', KNeighborsClassifier()))
model= Pipeline(estimators)

In [59]:
kfold=cross_validation.KFold(n=8404,n_folds=10, random_state=200)
scores=cross_validation.cross_val_score(model, X, Y, cv=kfold)
print scores.mean()

0.951711256441


In [66]:
#make prediction on validation dataset
#cart=DecisionTreeClassifier()
#knn= KNeighborsClassifier(n_neighbors=20)
model= Pipeline(estimators)
#knn.fit(X_train, Y_train)
model.fit(X_train, Y_train)
Y_pred=model.predict(X_validation)
print accuracy_score(Y_validation,Y_pred )
print confusion_matrix(Y_validation,Y_pred )
print classification_report(Y_validation, Y_pred)
print np.unique(Y_pred)

0.946241674596
[[1981   18]
 [  95    8]]
             precision    recall  f1-score   support

          0       0.95      0.99      0.97      1999
          1       0.31      0.08      0.12       103

avg / total       0.92      0.95      0.93      2102

[0 1]


In [67]:
data={'Y_val': Y_validation, 'Y_pred': Y_pred}
ydf=pd.DataFrame(data)
ydf.head(2)

Unnamed: 0,Y_pred,Y_val
0,0,1
1,0,0
