# West Nile Analysis

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.cross_validation import train_test_split, KFold, cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns



In [2]:
# Read the trainclean / testclean CSVs and normalise every column label to
# lower case with underscores in place of spaces, so both frames can be
# addressed with the same column names.
train = pd.read_csv('../data/trainclean.csv')
test = pd.read_csv('../data/testclean.csv')
for frame in (train, test):
    frame.columns = [label.lower().replace(' ', '_') for label in frame.columns]

                    

In [3]:
# Columns fed to the model: trap/location fields followed by the
# species_* columns.
location_cols = ['trap', 'latitude', 'longitude', 'block']
species_cols = [
    'species_culex_pipiens',
    'species_culex_pipiens/restuans',
    'species_culex_restuans',
    'species_culex_salinarius',
    'species_culex_tarsalis',
    'species_culex_territans',
]
features = location_cols + species_cols

# Design matrix and binary target (West Nile virus present or not).
X, y = train[features], train['wnvpresent']

In [4]:
# Hold out a validation split, stratified on the target so both partitions
# keep the same class balance; the seed pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=6,
)

In [5]:
# Put the training target back next to its feature rows (column-wise concat)
# so rows can be filtered and resampled by class, then preview the result.
traincombined = pd.concat([X_train, y_train], axis=1)
traincombined.head()

Unnamed: 0,trap,latitude,longitude,block,species_culex_pipiens,species_culex_pipiens/restuans,species_culex_restuans,species_culex_salinarius,species_culex_tarsalis,species_culex_territans,wnvpresent
10477,102,41.750498,-87.605294,10,0,0,1,0,0,0,0
1929,13,41.923738,-87.785288,61,1,0,0,0,0,0,0
6,46,41.891118,-87.654491,25,0,0,1,0,0,0,0
3078,11,41.944869,-87.832763,36,0,1,0,0,0,0,1
3279,60,41.823065,-87.678378,22,0,1,0,0,0,0,0


In [6]:
# Keep only the training rows where the virus was detected.
west_nile = traincombined.loc[traincombined['wnvpresent'] == 1]

In [7]:
# Draw 5000 positive rows with replacement (fixed seed) to upsample the
# positive class before fitting.
extra_west_nile = west_nile.sample(
    n=5000,
    replace=True,
    axis=0,
    random_state=63,
)

In [8]:
# Stack the resampled positives underneath the existing training rows.
# NOTE: this rebinds `traincombined`, so running the cell more than once
# appends the resampled rows again each time.
traincombined = pd.concat([traincombined, extra_west_nile], axis=0)

In [9]:
# Show (rows, columns) of the training frame after the resampled positives
# were appended.
traincombined.shape

(12879, 11)

In [10]:
# Rebind the training features/target to the upsampled frame; from here on
# X_train / y_train no longer refer to the raw split.
X_train, y_train = traincombined[features], traincombined['wnvpresent']

In [11]:
# Base estimator for the grid search. The seed is pinned so that a fresh
# Restart-and-Run-All grows the same forests; the split and the resampling
# earlier in the notebook are already seeded, and an unseeded forest was the
# remaining source of run-to-run variation.
rf = RandomForestClassifier(random_state=6)

# Hyper-parameter grid searched exhaustively (3 x 3 x 3 = 27 combinations).
# NOTE(review): the max_depth candidates are all very deep and may behave
# the same once min_samples_leaf >= 7 -- confirm whether shallower values
# were intended.
rf_params = {
    'n_estimators': [10, 20, 30],
    'max_depth': [100, 110, 150],
    'min_samples_leaf': [7, 10, 20],
}



In [12]:
# Exhaustive search over rf_params using GridSearchCV's default
# cross-validation and scoring, fitted on the upsampled training data.
# NOTE(review): the upsampled rows are duplicates, so the same observation
# can land in both the fitting and validation folds -- treat best_score_
# with caution.
gs = GridSearchCV(estimator=rf, param_grid=rf_params).fit(X_train, y_train)

# Best mean cross-validated score, then the parameters that achieved it.
print(gs.best_score_, gs.best_params_, sep='\n')

# Score of the refit best model on the hold-out split, which was not resampled.
gs.score(X_test, y_test)


0.7153505706964827
{'max_depth': 110, 'min_samples_leaf': 7, 'n_estimators': 20}


0.7297297297297297

In [13]:
# Hard class predictions from the best estimator: first for the (upsampled)
# training rows, then for the hold-out split.
y_hat_train = gs.predict(X_train)
y_hat = gs.predict(X_test)

In [14]:
# In-sample precision/recall/F1; the supports reflect the upsampled class mix.
print(classification_report(y_true=y_train, y_pred=y_hat_train))

             precision    recall  f1-score   support

          0       0.78      0.75      0.76      7466
          1       0.67      0.70      0.68      5413

avg / total       0.73      0.73      0.73     12879



In [15]:
# Hold-out precision/recall/F1 on the original (not resampled) class balance.
print(classification_report(y_true=y_test, y_pred=y_hat))

             precision    recall  f1-score   support

          0       0.96      0.75      0.84      2489
          1       0.09      0.43      0.14       138

avg / total       0.91      0.73      0.80      2627



In [16]:
# Hold-out confusion matrix: rows are true classes, columns are predictions.
confusion_matrix(y_true=y_test, y_pred=y_hat)

array([[1858,  631],
       [  79,   59]])

In [30]:
# Select the same feature columns from the submission set and predict a
# class label for every row. Note that `X` is rebound here and no longer
# refers to the training design matrix.
X = test.loc[:, features]
predict = gs.predict(X)

In [32]:
# Wrap the predicted labels in a single-column frame.
submissions = pd.DataFrame({'wnvpresent': predict})

In [33]:
# Number the rows 1..N in their current order to serve as the submission id.
submissions['id'] = list(range(1, len(submissions) + 1))

In [34]:
# Put the id column first, followed by the predicted label.
submissions = submissions.loc[:, ['id', 'wnvpresent']]


In [35]:
# Preview only the first rows; displaying the whole prediction frame bloats
# the saved notebook without adding information.
submissions.head(10)

Unnamed: 0,id,wnvpresent
0,1,1
1,2,0
2,3,1
3,4,0
4,5,0
5,6,0
6,7,0
7,8,0
8,9,0
9,10,0
