In [25]:
import numpy as np
import pandas as pd
import re
from sklearn import svm
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix as cm
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

In [26]:
train_data   = pd.read_csv('./assets/train.csv')
test_data    = pd.read_csv('./assets/test.csv')
weather_data = pd.read_csv('./weather_station_avg.csv')

In [27]:
train_data.shape, test_data.shape

((10506, 12), (116293, 11))

In [28]:
# train_data = pd.merge(train_data, weather_data, how='inner', on='Date', sort=False, validate='m:1')
train_data = train_data.merge(weather_data, on='Date')
test_data  = test_data.merge(weather_data, on="Date")

train_data.shape, test_data.shape

((10506, 39), (116293, 38))

In [29]:
test_data.head()

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,...,CodeSum_DZ,CodeSum_DZ BR,CodeSum_RA,CodeSum_RA BR,CodeSum_RA DZ,CodeSum_RA DZ BR,CodeSum_TS,CodeSum_TS BR,CodeSum_TS RA,CodeSum_TS RA BR
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,0,0,0,0,0,0,0,0,0,0
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,0,0,0,0,0,0,0,0,0,0
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,0,0,0,0,0,0,0,0,0,0
3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,0,0,0,0,0,0,0,0,0,0
4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# to sort the values of the bugs that can have west nile from the ones that don't.
def processing_species(df):
    for bug in df:
        if bug == 'CULEX PIPIENS/RESTUANS':
            return 'wnv_bug'
        elif bug == 'CULEX PIPIENS':
            return 'wnv_bug'
        elif bug == 'CULEX RESTUANS':
            return 'wnv_bug'
        else:
            return 'no_wnv_bug'

train_data['Species'] = train_data[['Species']].apply(processing_species, axis=1)
test_data['Species']  = test_data[['Species']].apply(processing_species, axis=1)

In [31]:
def processing(df):
    df.drop(['Date', 'Address', 'Street', 'AddressNumberAndStreet', 'Unnamed: 0'], axis=1, inplace=True)
    df['Trap'] = [x.strip('TABCabc') for x in df['Trap']]
    df['Trap'].astype(int)
    df = pd.get_dummies(df, columns=['Species'])
    return df

train_data = processing(train_data)
test_data  = processing(test_data)

In [32]:
train_data.head()

Unnamed: 0,Block,Trap,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,Tmax,Tmin,Tavg,...,CodeSum_RA,CodeSum_RA BR,CodeSum_RA DZ,CodeSum_RA DZ BR,CodeSum_TS,CodeSum_TS BR,CodeSum_TS RA,CodeSum_TS RA BR,Species_no_wnv_bug,Species_wnv_bug
0,41,2,41.95469,-87.800991,9,1,0,88,60,74,...,0,0,0,0,0,0,0,0,0,1
1,41,2,41.95469,-87.800991,9,1,0,88,60,74,...,0,0,0,0,0,0,0,0,0,1
2,62,7,41.994991,-87.769279,9,1,0,88,60,74,...,0,0,0,0,0,0,0,0,0,1
3,79,15,41.974089,-87.824812,8,1,0,88,60,74,...,0,0,0,0,0,0,0,0,0,1
4,79,15,41.974089,-87.824812,8,4,0,88,60,74,...,0,0,0,0,0,0,0,0,0,1


In [33]:
# the difference in shapes is fine because we have the Target still in the Training Data.
train_data.shape, test_data.shape

((10506, 35), (116293, 34))

In [34]:
target   = train_data.WnvPresent
features = train_data.drop('WnvPresent', axis=1) 

In [35]:
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=42, stratify=target)

In [36]:
sm = SMOTE(random_state=42, ratio=0.5)

X_train_res, y_train_res = sm.fit_sample(X_train, y_train)



In [37]:
ss  = StandardScaler()

log = LogisticRegression()
rfc = RandomForestClassifier(n_estimators=14, random_state=42)
abc = AdaBoostClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
# svc = svm.SVC()

In [38]:
pipe_log = Pipeline([
    ('ss', ss),
    ('log', log)
])
pipe_rfc = Pipeline([
    ('ss', ss),
    ('rfc', rfc),
])
pipe_abc = Pipeline([
    ('ss', ss),
    ('abc', abc)
])
# pipe_svc = Pipeline([
#     ('ss', ss),
#     ('svc', svc)
# ])

In [39]:
# grid_log = GridSearchCV(estimator=pipe_log,
#                         param_grid=params_log,
#                         scoring='roc_auc',
#                         cv=5)
# grid_rfc = GridSearchCV(estimator=pipe_rfc,
#                         param_grid=params_rfc,
#                         scoring='roc_auc',
#                         cv=5)
# grid_abc = GridSearchCV(estimator=pipe_abc,
#                         param_grid=params_abc,
#                         scoring='roc_auc',
#                         cv=5)
# # grid_svc = GridSearchCV(estimator=pipe_svc,
# #                         param_grid=params_svc,
# #                         scoring='roc_auc',
# #                         cv=5)

In [40]:
%%time
pipes    = [ 
    pipe_log, 
    pipe_rfc, 
    pipe_abc,
#     grid_svc  # after testing this it was not worth it and took way to long.
]
pipe_idx = {0: 'Logistic Regression', 
            1: 'Random Forest', 
            2: 'Adaboost',
#             3: 'Support Vector Machine'
           }

for idx, pipe in enumerate(pipes):
    pipe.fit(X_train_res, y_train_res)
    print('\nScore Train/Test: %s' % pipe_idx[idx])
    print(pipe.score(X_train, y_train))
    print(pipe.score(X_test, y_test))
#     print('Best params: %s' % pipe.best_params_)


Score Train/Test: Logistic Regression
0.8525193552481279
0.851541682527598

Score Train/Test: Random Forest
0.9812158903414139
0.9288161400837457

Score Train/Test: Adaboost
0.8733341794643991
0.871716787209745
CPU times: user 3.09 s, sys: 83.3 ms, total: 3.17 s
Wall time: 2.54 s


In [41]:
predictions = pipe_rfc.predict(test_data)

In [42]:
len(predictions)

116293

In [43]:
sample_submission = pd.read_csv('./assets/sampleSubmission.csv')
sample_submission['WnvPresent'] = predictions
sample_submission.to_csv('submission_randomforest.csv', index=False)

In [44]:
from collections import Counter

print(Counter(predictions).values())
print(Counter(predictions).keys())

dict_values([109660, 6633])
dict_keys([0, 1])
