# West Nile Analysis

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.cross_validation import train_test_split, KFold, cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier



In [2]:
# Read the train and test CSVs and normalise every column label to
# lower-case with underscores instead of spaces, in a single pass per file.
train, test = (
    pd.read_csv(csv_path).rename(columns=lambda label: label.lower().replace(' ', '_'))
    for csv_path in ('../data/trainw.csv', '../data/testw.csv')
)

                    

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Display the test-set column labels (already lower-cased / underscored above).
test.columns

Index(['unnamed:_0', 'index', 'id', 'date', 'address', 'block', 'street',
       'trap', 'addressnumberandstreet', 'latitude', 'longitude',
       'addressaccuracy', 'year', 'month', 'day', 'tmax_x', 'tmin_x', 'tavg_x',
       'depart_x', 'dewpoint_x', 'wetbulb_x', 'heat_x', 'cool_x', 'sunrise_x',
       'sunset_x', 'codesum_x', 'depth_x', 'water1_x', 'snowfall_x',
       'preciptotal_x', 'stnpressure_x', 'sealevel_x', 'resultspeed_x',
       'resultdir_x', 'avgspeed_x', 'tmax_y', 'tmin_y', 'tavg_y', 'depart_y',
       'dewpoint_y', 'wetbulb_y', 'heat_y', 'cool_y', 'sunrise_y', 'sunset_y',
       'codesum_y', 'depth_y', 'water1_y', 'snowfall_y', 'preciptotal_y',
       'stnpressure_y', 'sealevel_y', 'resultspeed_y', 'resultdir_y',
       'avgspeed_y', 'species_culex_pipiens', 'species_culex_pipiens/restuans',
       'species_culex_restuans', 'species_culex_salinarius',
       'species_culex_tarsalis', 'species_culex_territans',
       'species_unspecified_culex', 'distance', 'dist_spec

In [4]:
# Display the training-set column labels (already lower-cased / underscored above).
train.columns

Index(['unnamed:_0', 'address', 'addressaccuracy', 'addressnumberandstreet',
       'avgspeed_x', 'avgspeed_y', 'block', 'codesum_x', 'codesum_y', 'cool_x',
       'cool_y', 'date', 'day', 'depart_x', 'depart_y', 'depth_x', 'depth_y',
       'dewpoint_x', 'dewpoint_y', 'distance', 'heat_x', 'heat_y', 'index',
       'latitude', 'longitude', 'month', 'nummosquitos', 'preciptotal_x',
       'preciptotal_y', 'resultdir_x', 'resultdir_y', 'resultspeed_x',
       'resultspeed_y', 'sealevel_x', 'sealevel_y', 'snowfall_x', 'snowfall_y',
       'species_culex_pipiens', 'species_culex_pipiens/restuans',
       'species_culex_restuans', 'species_culex_salinarius',
       'species_culex_tarsalis', 'species_culex_territans', 'stnpressure_x',
       'stnpressure_y', 'street', 'sunrise_x', 'sunrise_y', 'sunset_x',
       'sunset_y', 'tavg_x', 'tavg_y', 'tmax_x', 'tmax_y', 'tmin_x', 'tmin_y',
       'trap', 'water1_x', 'water1_y', 'wetbulb_x', 'wetbulb_y', 'wnvpresent',
       'year', 'dist_species_c

In [5]:
# Model inputs. Each species indicator column has a matching 'dist_<column>'
# column, so the distance names are derived from the species list to keep the
# two groups in step. The resulting order is: trap/block, species indicators,
# weather + distance, per-species distances.
species_cols = [
    'species_culex_pipiens',
    'species_culex_pipiens/restuans',
    'species_culex_restuans',
    'species_culex_salinarius',
    'species_culex_tarsalis',
    'species_culex_territans',
]
features = (
    ['trap', 'block']
    + species_cols
    + ['wetbulb_y', 'tmax_y', 'distance']
    + ['dist_' + species for species in species_cols]
)

# Target vector and feature matrix taken from the training frame.
y = train['wnvpresent']
X = train[features]

In [6]:
# Hold-out split with the default test fraction, stratified on the target
# and seeded so the same rows land in each part on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=6,
)

In [7]:
# First five training rows, transposed so that each feature is shown as a row.
X_train.head(5).transpose()

Unnamed: 0,10477,1929,6,3078,3279
trap,102.0,13.0,46.0,11.0,60.0
block,10.0,61.0,25.0,36.0,22.0
species_culex_pipiens,0.0,1.0,0.0,0.0,0.0
species_culex_pipiens/restuans,0.0,0.0,0.0,1.0,1.0
species_culex_restuans,1.0,0.0,1.0,0.0,0.0
species_culex_salinarius,0.0,0.0,0.0,0.0,0.0
species_culex_tarsalis,0.0,0.0,0.0,0.0,0.0
species_culex_territans,0.0,0.0,0.0,0.0,0.0
wetbulb_y,58.0,71.0,66.0,50.0,65.0
tmax_y,75.0,81.0,88.0,66.0,89.0


In [8]:
# Put the target back next to the training features (column-wise concat) so
# rows can be filtered and resampled by class in the following cells.
datasets = X_train, y_train
traincombined = pd.concat(objs=datasets, axis=1)
traincombined.head()

Unnamed: 0,trap,block,species_culex_pipiens,species_culex_pipiens/restuans,species_culex_restuans,species_culex_salinarius,species_culex_tarsalis,species_culex_territans,wetbulb_y,tmax_y,distance,dist_species_culex_pipiens,dist_species_culex_pipiens/restuans,dist_species_culex_restuans,dist_species_culex_salinarius,dist_species_culex_tarsalis,dist_species_culex_territans,wnvpresent
10477,102,10,0,0,1,0,0,0,58,75,0.488413,0.0,0.0,0.488413,0.0,0.0,0.0,0.0
1929,13,61,1,0,0,0,0,0,71,81,0.539636,0.539636,0.0,0.0,0.0,0.0,0.0,0.0
6,46,25,0,0,1,0,0,0,66,88,0.358443,0.0,0.0,0.358443,0.0,0.0,0.0,0.0
3078,11,36,0,1,0,0,0,0,50,66,1.771727,0.0,1.771727,0.0,0.0,0.0,0.0,1.0
3279,60,22,0,1,0,0,0,0,65,89,0.622362,0.0,0.622362,0.0,0.0,0.0,0.0,0.0


In [9]:
# Training rows where the target flag equals 1.
west_nile = traincombined.loc[traincombined['wnvpresent'] == 1]

In [10]:
# Draw 5000 rows from the positive class with replacement (seeded),
# to be appended to the training frame as extra positive examples.
extra_west_nile = west_nile.sample(
    n=5000,
    replace=True,
    axis=0,
    random_state=63,
)

In [11]:
# Stack the resampled positive rows under the training frame.
# NOTE: `traincombined` is rebound to the enlarged frame, so running this cell
# a second time appends the extra rows again.
merge = [traincombined, extra_west_nile]
traincombined = pd.concat(objs=merge, axis=0)

In [12]:
# (rows, columns) of the training frame after the resampled rows were appended.
traincombined.shape

(12879, 18)

In [13]:
# Pairwise Pearson correlations (the default method, spelled out here),
# computed on the frame that already includes the resampled positive rows.
traincombined.corr(method='pearson')

Unnamed: 0,trap,block,species_culex_pipiens,species_culex_pipiens/restuans,species_culex_restuans,species_culex_salinarius,species_culex_tarsalis,species_culex_territans,wetbulb_y,tmax_y,distance,dist_species_culex_pipiens,dist_species_culex_pipiens/restuans,dist_species_culex_restuans,dist_species_culex_salinarius,dist_species_culex_tarsalis,dist_species_culex_territans,wnvpresent
trap,1.0,-0.366928,-0.092077,0.066231,0.032452,0.001477,-0.00449,-0.026831,-0.054215,-0.001669,0.239564,-0.009229,0.156444,0.072445,0.007136,-0.004767,-0.022404,0.097109
block,-0.366928,1.0,-0.041489,0.038784,-0.001014,-0.006264,-0.001506,0.008825,-0.016363,-0.027312,-0.090224,-0.036543,-0.037503,-0.015248,-0.008027,0.001645,0.004396,0.014945
species_culex_pipiens,-0.092077,-0.041489,1.0,-0.646863,-0.338064,-0.045736,-0.010564,-0.078113,0.140113,0.066304,-0.167452,0.842751,-0.51481,-0.290456,-0.039382,-0.009734,-0.070025,0.196968
species_culex_pipiens/restuans,0.066231,0.038784,-0.646863,1.0,-0.456552,-0.061766,-0.014267,-0.105491,-0.05712,-0.022955,0.068515,-0.545145,0.795856,-0.392257,-0.053185,-0.013146,-0.094568,0.026525
species_culex_restuans,0.032452,-0.001014,-0.338064,-0.456552,1.0,-0.03228,-0.007456,-0.055132,-0.096811,-0.056254,0.109178,-0.284904,-0.36335,0.859174,-0.027795,-0.00687,-0.049423,-0.23005
species_culex_salinarius,0.001477,-0.006264,-0.045736,-0.061766,-0.03228,1.0,-0.001009,-0.007459,-0.007041,-0.005511,0.001262,-0.038544,-0.049157,-0.027734,0.861064,-0.000929,-0.006686,-0.05627
species_culex_tarsalis,-0.00449,-0.001506,-0.010564,-0.014267,-0.007456,-0.001009,1.0,-0.001723,-0.020642,-0.024813,0.018884,-0.008903,-0.011354,-0.006406,-0.000869,0.921428,-0.001544,-0.012997
species_culex_territans,-0.026831,0.008825,-0.078113,-0.105491,-0.055132,-0.007459,-0.001723,1.0,0.017245,0.029853,0.006461,-0.06583,-0.083956,-0.047368,-0.006422,-0.001587,0.896452,-0.096104
wetbulb_y,-0.054215,-0.016363,0.140113,-0.05712,-0.096811,-0.007041,-0.020642,0.017245,1.0,0.843967,-0.182143,0.065477,-0.114214,-0.11199,-0.010456,-0.017799,0.009221,0.184128
tmax_y,-0.001669,-0.027312,0.066304,-0.022955,-0.056254,-0.005511,-0.024813,0.029853,0.843967,1.0,-0.155992,0.010885,-0.083068,-0.075042,-0.007891,-0.023631,0.019034,0.117919


In [14]:
# Rebuild the training target and feature matrix from the enlarged frame;
# this replaces the X_train / y_train produced by the original split.
y_train = traincombined['wnvpresent']
X_train = traincombined.loc[:, features]

In [15]:

# First five rows of the rebuilt training features, one feature per row.
X_train.head(5).transpose()

Unnamed: 0,10477,1929,6,3078,3279
trap,102.0,13.0,46.0,11.0,60.0
block,10.0,61.0,25.0,36.0,22.0
species_culex_pipiens,0.0,1.0,0.0,0.0,0.0
species_culex_pipiens/restuans,0.0,0.0,0.0,1.0,1.0
species_culex_restuans,1.0,0.0,1.0,0.0,0.0
species_culex_salinarius,0.0,0.0,0.0,0.0,0.0
species_culex_tarsalis,0.0,0.0,0.0,0.0,0.0
species_culex_territans,0.0,0.0,0.0,0.0,0.0
wetbulb_y,58.0,71.0,66.0,50.0,65.0
tmax_y,75.0,81.0,88.0,66.0,89.0


In [16]:
# Print the training and hold-out column labels one after the other
# so they can be compared by eye.
print(X_train.columns, X_test.columns, sep='\n')

Index(['trap', 'block', 'species_culex_pipiens',
       'species_culex_pipiens/restuans', 'species_culex_restuans',
       'species_culex_salinarius', 'species_culex_tarsalis',
       'species_culex_territans', 'wetbulb_y', 'tmax_y', 'distance',
       'dist_species_culex_pipiens', 'dist_species_culex_pipiens/restuans',
       'dist_species_culex_restuans', 'dist_species_culex_salinarius',
       'dist_species_culex_tarsalis', 'dist_species_culex_territans'],
      dtype='object')
Index(['trap', 'block', 'species_culex_pipiens',
       'species_culex_pipiens/restuans', 'species_culex_restuans',
       'species_culex_salinarius', 'species_culex_tarsalis',
       'species_culex_territans', 'wetbulb_y', 'tmax_y', 'distance',
       'dist_species_culex_pipiens', 'dist_species_culex_pipiens/restuans',
       'dist_species_culex_restuans', 'dist_species_culex_salinarius',
       'dist_species_culex_tarsalis', 'dist_species_culex_territans'],
      dtype='object')


In [32]:
# Base estimator for the grid search. The seed is pinned so that the fitted
# forests — and with them the reported scores and best parameters — come out
# the same on every Restart & Run All; 6 is the seed already used for the
# train/test split.
rf = RandomForestClassifier(random_state=6)

# Hyper-parameter grid: 3 x 3 x 2 = 18 candidate settings.
rf_params = {
    'n_estimators'       : [ 30, 40, 60],
    'max_depth'          : [80,100, 130],
    'min_samples_leaf'   : [2, 3]
}



In [33]:
# Exhaustive search over rf_params using GridSearchCV's default CV and scoring.
# fit() returns the fitted search object, so construction and fitting are chained.
# NOTE(review): X_train includes rows drawn with replacement from the positive
# class, so identical rows can sit on both sides of a CV split — best_score_ is
# presumably optimistic; compare it with the hold-out score shown below.
gs = GridSearchCV(estimator=rf, param_grid=rf_params).fit(X_train, y_train)
print(gs.best_score_, gs.best_params_, sep='\n')
# Last expression: score on the hold-out split, rendered as the cell output.
gs.score(X_test, y_test)


0.9416880192561534
{'max_depth': 100, 'min_samples_leaf': 2, 'n_estimators': 60}


0.894175866006852

In [34]:
# Class predictions from the fitted search object, for the training rows
# and for the hold-out rows.
y_hat_train = gs.predict(X_train)
y_hat = gs.predict(X_test)

In [35]:
# Per-class precision / recall / F1 on the training rows.
print(classification_report(y_true=y_train, y_pred=y_hat_train))

             precision    recall  f1-score   support

        0.0       0.99      0.95      0.97      7466
        1.0       0.93      0.98      0.96      5413

avg / total       0.96      0.96      0.96     12879



In [36]:
# Per-class precision / recall / F1 on the hold-out rows.
print(classification_report(y_true=y_test, y_pred=y_hat))

             precision    recall  f1-score   support

        0.0       0.96      0.93      0.94      2489
        1.0       0.16      0.23      0.19       138

avg / total       0.91      0.89      0.90      2627



In [37]:
# Confusion matrix on the training rows (rows: true class, columns: predicted).
confusion_matrix(y_true=y_train, y_pred=y_hat_train)

array([[7084,  382],
       [ 106, 5307]])

In [38]:
# Confusion matrix on the hold-out rows (rows: true class, columns: predicted).
confusion_matrix(y_true=y_test, y_pred=y_hat)

array([[2317,  172],
       [ 106,   32]])

In [24]:
# NOTE: `X` is rebound here — from this point on it holds the test-set
# features, not the training features it held earlier.
X = test.loc[:, features]
# Predictions of the currently fitted search object for every test-set row.
predict = gs.predict(X)

In [25]:
# k-nearest-neighbours: grid over neighbourhood size and vote weighting
# (3 x 2 = 6 candidate settings).
knn_params = {
    'n_neighbors': [7, 9, 11],
    'weights': ['uniform', 'distance'],
}
knn = KNeighborsClassifier()

# NOTE: `gs`, `y_hat` and `y_hat_train` are rebound in this cell, replacing
# the random-forest search object and its predictions.
gs = GridSearchCV(estimator=knn, param_grid=knn_params).fit(X_train, y_train)

# Best setting, its cross-validated score, then the hold-out score.
print(gs.best_params_, gs.best_score_, gs.score(X_test, y_test), sep='\n')

y_hat_train = gs.predict(X_train)
y_hat = gs.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_hat))

{'n_neighbors': 7, 'weights': 'distance'}
0.878329062815436
0.829082603730491
             precision    recall  f1-score   support

        0.0       0.96      0.86      0.90      2489
        1.0       0.12      0.34      0.17       138

avg / total       0.91      0.83      0.87      2627



In [26]:
# Predict the test-set rows with the refit best estimator of the current
# search (what gs.predict delegates to under the default refit=True).
predict_knn = gs.best_estimator_.predict(X)

In [39]:
# One-column frame holding the KNN predictions, in test-row order.
submissions = pd.DataFrame({'wnvpresent': predict_knn})

In [40]:
# How many test rows were predicted as each class.
submissions['wnvpresent'].value_counts()

0.0    99012
1.0    17281
Name: wnvpresent, dtype: int64

In [41]:
# Take the ids from the test set itself rather than counting 1..N by position:
# the rows of `submissions` are predictions for test[features], so test['id']
# lines up with them row for row even if the test rows are ever reordered.
# `.values` assigns positionally (no index alignment). This also removes the
# loop variable that shadowed the builtin `id`.
submissions['id'] = test['id'].values

In [42]:
# Put the id column first, followed by the prediction.
submissions = submissions.loc[:, ['id', 'wnvpresent']]


In [43]:
# Write the submission file without the frame's index column.
submissions.to_csv(path_or_buf='../data/submission.csv', index=False)