# West Nile Analysis

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.cross_validation import train_test_split, KFold, cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier



In [3]:
train = pd.read_csv('../data/trainw.csv')
#test  = pd.read_csv('../data/testcw.csv')
train.columns = [col.lower().replace(' ', '_') for col in train.columns]
#test.columns = [col.lower().replace(' ', '_') for col in test.columns]

                    

In [4]:
train.columns

Index(['unnamed:_0', 'date', 'address', 'block', 'street', 'trap',
       'addressnumberandstreet', 'latitude', 'longitude', 'addressaccuracy',
       'nummosquitos', 'wnvpresent', 'year', 'month', 'day',
       'species_culex_pipiens', 'species_culex_pipiens/restuans',
       'species_culex_restuans', 'species_culex_salinarius',
       'species_culex_tarsalis', 'species_culex_territans'],
      dtype='object')

In [3]:
#included features
features = [ 'trap', 'latitude', 'longitude', 'block',
            'species_culex_pipiens', 'species_culex_pipiens/restuans',
            'species_culex_restuans', 'species_culex_salinarius',
            'species_culex_tarsalis', 'species_culex_territans', 'distance', 'wetbulb', 
            'precipitation', 'max_temp']

X = train[features]
y = train['wnvpresent']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 6, stratify = y)

In [5]:
datasets = (X_train, y_train)
traincombined = pd.concat(datasets, axis = 1)
traincombined.head()

Unnamed: 0,trap,latitude,longitude,block,species_culex_pipiens,species_culex_pipiens/restuans,species_culex_restuans,species_culex_salinarius,species_culex_tarsalis,species_culex_territans,distance,wnvpresent
10477,102,41.750498,-87.605294,10,0,0,1,0,0,0,0.488413,0.0
1929,13,41.923738,-87.785288,61,1,0,0,0,0,0,0.539636,0.0
6,46,41.891118,-87.654491,25,0,0,1,0,0,0,0.358443,0.0
3078,11,41.944869,-87.832763,36,0,1,0,0,0,0,1.771727,1.0
3279,60,41.823065,-87.678378,22,0,1,0,0,0,0,0.622362,0.0


In [6]:
west_nile = traincombined[traincombined.wnvpresent == 1]

In [7]:
extra_west_nile = west_nile.sample(n = 5000, replace = True, random_state = 63, axis = 0)

In [8]:
merge = [traincombined, extra_west_nile]
traincombined = pd.concat(merge, axis = 0)

In [9]:
traincombined.shape

(12879, 12)

In [10]:
traincombined.corr()

Unnamed: 0,trap,latitude,longitude,block,species_culex_pipiens,species_culex_pipiens/restuans,species_culex_restuans,species_culex_salinarius,species_culex_tarsalis,species_culex_territans,distance,wnvpresent
trap,1.0,0.286085,-0.559703,-0.366928,-0.092077,0.066231,0.032452,0.001477,-0.00449,-0.026831,0.239564,0.097109
latitude,0.286085,1.0,-0.730875,0.108827,-0.130715,0.085107,0.062081,-0.022407,-0.010488,-0.037882,0.017634,0.077678
longitude,-0.559703,-0.730875,1.0,-0.070963,0.129929,-0.104066,-0.040424,0.01958,0.010407,0.051567,-0.091407,-0.148194
block,-0.366928,0.108827,-0.070963,1.0,-0.041489,0.038784,-0.001014,-0.006264,-0.001506,0.008825,-0.090224,0.014945
species_culex_pipiens,-0.092077,-0.130715,0.129929,-0.041489,1.0,-0.646863,-0.338064,-0.045736,-0.010564,-0.078113,-0.167452,0.196968
species_culex_pipiens/restuans,0.066231,0.085107,-0.104066,0.038784,-0.646863,1.0,-0.456552,-0.061766,-0.014267,-0.105491,0.068515,0.026525
species_culex_restuans,0.032452,0.062081,-0.040424,-0.001014,-0.338064,-0.456552,1.0,-0.03228,-0.007456,-0.055132,0.109178,-0.23005
species_culex_salinarius,0.001477,-0.022407,0.01958,-0.006264,-0.045736,-0.061766,-0.03228,1.0,-0.001009,-0.007459,0.001262,-0.05627
species_culex_tarsalis,-0.00449,-0.010488,0.010407,-0.001506,-0.010564,-0.014267,-0.007456,-0.001009,1.0,-0.001723,0.018884,-0.012997
species_culex_territans,-0.026831,-0.037882,0.051567,0.008825,-0.078113,-0.105491,-0.055132,-0.007459,-0.001723,1.0,0.006461,-0.096104


In [11]:
X_train = traincombined[features]
y_train = traincombined['wnvpresent']

In [21]:
rf = RandomForestClassifier()
rf_params = {
    'n_estimators'       : [ 30, 40, 60],
    'max_depth'          : [  100, 130],
    'min_samples_leaf'   : [1, 2, 3]
}



In [22]:
gs = GridSearchCV(rf, param_grid=rf_params)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)
gs.score(X_test, y_test)


0.7634133084866838
{'max_depth': 100, 'min_samples_leaf': 1, 'n_estimators': 30}


0.6867148838979825

In [14]:
y_hat = gs.predict(X_test)
y_hat_train = gs.predict(X_train)

In [15]:
print(classification_report( y_train, y_hat_train))

             precision    recall  f1-score   support

        0.0       0.87      0.72      0.79      7466
        1.0       0.69      0.85      0.76      5413

avg / total       0.79      0.78      0.78     12879



In [16]:
print(classification_report(y_test, y_hat))

             precision    recall  f1-score   support

        0.0       0.96      0.71      0.82      2489
        1.0       0.09      0.50      0.15       138

avg / total       0.92      0.70      0.78      2627



In [17]:
confusion_matrix(y_test, y_hat)

array([[1773,  716],
       [  69,   69]])

In [18]:
X = test[features]
predict = (gs.predict(X))

KeyError: "['distance'] not in index"

In [36]:
knn = KNeighborsClassifier()
knn_params = {
    'n_neighbors' : [7, 9, 11],
    'weights'      : ['uniform','distance']
}

gs = GridSearchCV(knn, param_grid=knn_params)
gs.fit(X_train, y_train)

print(gs.best_params_)
print(gs.best_score_)
print(gs.score(X_test,y_test))
y_hat = gs.predict(X_test)
y_hat_train = gs.predict(X_train)
print(classification_report(y_test, y_hat))

{'n_neighbors': 11, 'weights': 'distance'}
0.7464865284571783
0.7445755614769699
             precision    recall  f1-score   support

        0.0       0.96      0.76      0.85      2489
        1.0       0.09      0.40      0.14       138

avg / total       0.91      0.74      0.81      2627



In [None]:
submissions = pd.DataFrame(predict, columns = ['wnvpresent'])

In [None]:
submissions['id'] = [id +1 for id in range(len(submissions))]

In [None]:
submissions = submissions[['id','wnvpresent' ]]


In [None]:
submissions.to_csv('../data/submission.csv', index = False)