In [292]:
import numpy as np
import pandas as pd
import itertools as it
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution1D, MaxPooling1D
from keras import regularizers
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import *
from keras.optimizers import Adam
from keras.initializers import *

%matplotlib inline

In [71]:
# Read the four CSV inputs. Paths are relative to the notebook's working directory.
# NOTE(review): `test`, `sample` and `weather` are not referenced again in the
# cells visible here — confirm whether they are still needed.
train = pd.read_csv('../WnV2.csv')
test = pd.read_csv('../assets/test.csv')
sample = pd.read_csv('../assets/sampleSubmission.csv')
weather = pd.read_csv('../assets/weather.csv')

In [82]:
# Engineered binary flags: every comparison below produces a boolean Series
# that is stored as 0/1 integers.

# Temperature flags derived from Tmax.
train['hot_temp'] = (train['Tmax'] > 40).astype(int)
train['low_temp'] = (train['Tmax'] < 32).astype(int)

# Does the spray record line up with the weather record in year / month,
# and does the spray day fall 1-13 days after the weather day?
train['year_spray'] = (train['year_spry'] == train['year_wthr']).astype(int)
train['month_spray'] = (train['month_spry'] == train['month_wthr']).astype(int)
day_gap = train['day_spry'] - train['day_wthr']
train['day_spray'] = ((day_gap > 0) & (day_gap < 14)).astype(int)

# All three date flags must hold. The year term has to be the 0/1 flag
# `year_spray`: AND-ing the raw calendar year `year_spry` only tests the
# year's lowest bit, so the year match was never actually checked.
train['spray_effect'] = train['year_spray'] & train['month_spray'] & train['day_spray']

# Wind flags.
# NOTE(review): `low_wsp` is set when ResultSpeed > 11, so the name and the
# comparison point in opposite directions — confirm which one is intended.
train['wind_speed'] = (train['AvgSpeed'] > 10).astype(int)
train['low_wsp'] = (train['ResultSpeed'] > 11).astype(int)

# Keep only non-object columns whose name does not contain 'index'.
train = train[[column for column in train.columns
               if train[column].dtype != 'object' and 'index' not in column]]

In [73]:
# Preview the first rows of `train`.
train.head()

Unnamed: 0,level_0,index,Date_spry,Time_spry,Latitude_spry,Longitude_spry,day_spry,month_spry,year_spry,index.1,...,month_cat,DewPoint:Sunrise,hot_temp,low_temp,year_spray,month_spray,day_spray,spray_effect,wind_speed,low_wsp
0,0,0,2011-09-07,210358.0,41.990164,-87.805663,7,9,2011,0,...,1,22848.0,1,0,0,0,1,0,0,0
1,1,1,2011-09-07,212510.0,41.987535,-87.821307,7,9,2011,1,...,1,22848.0,1,0,0,0,1,0,0,0
2,2,2,2011-09-07,204500.0,41.976982,-87.819747,7,9,2011,2,...,1,18774.0,1,0,0,0,1,0,1,1
3,3,3,2011-09-07,194432.0,41.98646,-87.794225,7,9,2011,3,...,1,18774.0,1,0,0,0,1,0,1,1
4,4,4,2011-09-07,213420.0,41.9877,-87.821792,7,9,2011,4,...,1,17840.0,1,0,0,0,1,0,1,1


In [362]:
# Two candidate feature sets, both lists of `train` column names.
# `total_feat` is the wider set; `feat` is a smaller alternative.
# NOTE(review): both include 'NumMosquitos' — confirm that column is
# available at prediction time.
total_feat = [
    # columns carrying the `_spry` suffix
    'Latitude_spry', 'Longitude_spry', 'day_spry', 'month_spry', 'year_spry',
    # columns carrying the `_trn` suffix and related fields
    'Latitude_trn', 'Longitude_trn', 'AddressAccuracy', 'NumMosquitos',
    'day_trn', 'month_trn', 'Street_effect', 'Species_lb',
    # weather measurements
    'Station', 'Heat', 'Cool', 'Depth', 'SnowFall', 'PrecipTotal',
    'StnPressure', 'ResultSpeed', 'ResultDir', 'AvgSpeed',
    'day_wthr', 'month_wthr', 'year_wthr', 'CodeSum_lb',
    # combined / derived columns already present in the table
    'Tmax:Tmin:Sunst', 'P:sea:WB', 'T_diff', 'Dry', 'Dry_Wet_Diff',
    'month_cat', 'DewPoint:Sunrise',
    # 0/1 flags engineered in this notebook
    'hot_temp', 'low_temp', 'year_spray', 'month_spray', 'day_spray',
    'spray_effect', 'wind_speed', 'low_wsp',
]

feat = [
    'Latitude_spry', 'Longitude_spry', 'day_spry', 'month_spry',
    'Longitude_trn', 'AddressAccuracy', 'day_trn', 'month_trn',
    'Street_effect', 'Species_lb', 'Address_lb',
    'Station', 'NumMosquitos', 'spray_effect', 'wind_speed',
    'Sunrise', 'Sunset', 'low_wsp',
    'PrecipTotal', 'AvgSpeed',
    'day_wthr', 'month_wthr', 'year_wthr', 'CodeSum_lb',
    'Tmax:Tmin:Sunst', 'Tavg:WB', 'DP:Tmin',
    'DewPoint:Sunrise',
]

In [363]:
# Feature matrix and target used by every model below.
# `total_feat` is the active feature set; swap in `feat` here to try the
# smaller one (the previous `X = train[feat]` line was a dead store that was
# overwritten immediately).
X = train[total_feat]
y = train.WnvPresent.values

In [223]:
def split_balance(X, y, balance=True, scale=True, n_copies=11, random_state=None):
    """Make a stratified 80/20 split, optionally oversampling and scaling.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table (``.values`` and ``.columns`` are used).
    y : array-like
        Labels aligned with ``X``; the value ``1`` marks the class that is
        oversampled.
    balance : bool, default True
        If true, append ``n_copies`` extra copies of every positive training
        row to the training set. The test set is never resampled.
    scale : bool, default True
        If true, fit a ``StandardScaler`` on the (possibly oversampled)
        training set and apply it to both sets.
    n_copies : int, default 11
        Number of extra copies of each positive training row.
    random_state : int, RandomState or None, default None
        Forwarded to ``train_test_split`` so a split can be reproduced.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Feature sets are numpy arrays when ``scale`` is true, otherwise
        DataFrames; ``y_train`` is a numpy array when ``balance`` is true.

    Raises
    ------
    ValueError
        If ``n_copies`` is negative.
    """
    if n_copies < 0:
        raise ValueError("n_copies must be non-negative")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=random_state
    )

    if balance:
        train_rows = X_train.values
        train_labels = np.asarray(y_train)
        pos_rows = train_rows[train_labels == 1]
        # Original rows first, then the positive rows repeated n_copies times.
        X_train = pd.DataFrame(
            np.vstack([train_rows] + [pos_rows] * n_copies),
            columns=X_test.columns,
        )
        y_train = np.concatenate(
            [train_labels,
             np.ones(n_copies * len(pos_rows), dtype=train_labels.dtype)]
        )

    if scale:
        # Fit on the training data only so test statistics never leak in.
        ss = StandardScaler()
        X_train = ss.fit_transform(X_train)
        X_test = ss.transform(X_test)
    return X_train, X_test, y_train, y_test

In [375]:
# One stratified split with the defaults (oversampling and scaling both on);
# the single-model cells below all use this split.
X_train, X_test, y_train, y_test = split_balance(X, y)

In [380]:
# Logistic regression with default settings, fitted on the training split;
# the cell displays its accuracy on the test split.
lr = LogisticRegression().fit(X_train, y_train)
lr.score(X_test, y_test)

0.8488964346349746

In [381]:
# ROC AUC of the logistic model on the test split, scored from the second
# column of predict_proba.
roc_auc_score(y_test, lr.predict_proba(X_test)[:,1])

0.7839954163483576

In [387]:
# Confusion matrix of the logistic model on the test split
# (scikit-learn layout: rows are true labels, columns are predicted labels).
confusion_matrix(y_test, lr.predict(X_test))

array([[472,  89],
       [  9,  19]])

In [383]:
# k-nearest-neighbours with default settings: print test accuracy, then AUC.
knn = KNeighborsClassifier().fit(X_train, y_train)
print(knn.score(X_test, y_test))
roc = roc_auc_score(y_test, knn.predict_proba(X_test)[:, 1])
# The printed value is the larger of AUC and 1 - AUC, i.e. a below-chance
# ranking is reported as if it were flipped.
print(roc if roc >= 1 - roc else 1 - roc)

0.8285229202037352
0.56951871657754


In [49]:
# List the non-object columns of `train` whose name does not contain 'index'
# (the same filter used when `train` is narrowed after feature engineering).
[column for column in train.columns if train[column].dtype != 'object' and 'index' not in column]

['level_0',
 'Time_spry',
 'Latitude_spry',
 'Longitude_spry',
 'day_spry',
 'month_spry',
 'year_spry',
 'Block',
 'Latitude_trn',
 'Longitude_trn',
 'AddressAccuracy',
 'NumMosquitos',
 'WnvPresent',
 'day_trn',
 'month_trn',
 'year_trn',
 'Street_effect',
 'Species_lb',
 'Address_lb',
 'Station',
 'Tmax',
 'Tmin',
 'Tavg',
 'DewPoint',
 'WetBulb',
 'Heat',
 'Cool',
 'Sunrise',
 'Sunset',
 'Depth',
 'SnowFall',
 'PrecipTotal',
 'StnPressure',
 'SeaLevel',
 'ResultSpeed',
 'ResultDir',
 'AvgSpeed',
 'day_wthr',
 'month_wthr',
 'year_wthr',
 'CodeSum_lb',
 'Tmax:Tmin:Sunst',
 'Tavg:WB',
 'Hot_wet',
 'DP:Tmin',
 'P:sea:WB',
 'T_diff',
 'Dry',
 'Dry_Wet_Diff',
 'month_cat',
 'DewPoint:Sunrise',
 'hot_temp',
 'low_temp',
 'year_spray',
 'month_spray',
 'day_spray',
 'spray_effect',
 'wind_speed',
 'low_wsp']

In [152]:
# Share of non-positive labels in the test split. Assuming labels are 0/1,
# this is the accuracy of always predicting 0, so it is the baseline any
# accuracy figure in this notebook should be compared against.
1 - y_test.mean()

0.9524617996604414

In [108]:
# Scratch: positions of the samples labelled 1 in the current y_train.
ind = [i for i, x in enumerate(y_train) if x == 1]
# np.array([X_train[i] for i in ind])

In [121]:
# Sanity check: number of training rows (should equal len(y_train)).
len(X_train)

3465

In [122]:
# Sanity check: number of training labels (should equal len(X_train)).
len(y_train)

3465

In [172]:
# Display the current `knn` estimator.
# NOTE(review): the saved output is a plain parameter dict, which a bare
# `knn` expression would not print — presumably the cell was edited (e.g.
# from `knn.get_params()`) after it last ran.
knn

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': 1,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [367]:
# Repeated hold-out evaluation of an ExtraTrees model: 50 fresh splits from
# split_balance, storing [test accuracy, test AUC] for each one.
outs = []
for _ in range(50):
    X_train, X_test, y_train, y_test = split_balance(X, y)
    extc = ExtraTreesClassifier(
        criterion='gini',
        n_estimators=150,
        min_impurity_decrease=0.08,
        class_weight='balanced',
        n_jobs=-1,
    )
    extc.fit(X_train, y_train)
    roc = roc_auc_score(y_test, extc.predict_proba(X_test)[:, 1])
    # AUC is recorded as max(roc, 1 - roc), so a below-chance run is stored
    # as its mirror image.
    outs.append([extc.score(X_test, y_test), max(roc, 1 - roc)])
# Mean +/- standard deviation across runs: accuracy first, AUC second.
['{} +/- {}'.format(col.mean(), col.std()) for col in np.array(outs).T]

['0.7208828522920204 +/- 0.067780008108757',
 '0.802399414311179 +/- 0.039833983175969034']

In [368]:
# Repeated hold-out evaluation of a RandomForest model: 50 fresh splits from
# split_balance, storing [test accuracy, test AUC] for each one.
outs = []
for _ in range(50):
    X_train, X_test, y_train, y_test = split_balance(X, y)
    forest1 = RandomForestClassifier(
        n_estimators=60,
        criterion='entropy',
        max_depth=100,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.1,
        min_impurity_decrease=0.001,
        class_weight='balanced',
        n_jobs=-1,
    )
    forest1.fit(X_train, y_train)
    roc = roc_auc_score(y_test, forest1.predict_proba(X_test)[:, 1])
    # AUC is recorded as max(roc, 1 - roc), so a below-chance run is stored
    # as its mirror image.
    outs.append([forest1.score(X_test, y_test), max(roc, 1 - roc)])

In [369]:
# Mean +/- standard deviation over the runs stored in `outs`:
# first entry is accuracy, second is AUC.
['{} +/- {}'.format(x.mean(), x.std()) for x in np.array(outs).transpose()]

['0.7764006791171478 +/- 0.019869980142730486',
 '0.8321950598421188 +/- 0.0336396128400698']

In [384]:
# Unfitted base estimators for the voting ensemble built in later cells.
forest1 = RandomForestClassifier(
    n_estimators=60,
    criterion='entropy',
    max_depth=100,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.1,
    min_impurity_decrease=0.001,
    class_weight='balanced',
    n_jobs=-1,
)
extc = ExtraTreesClassifier(
    criterion='gini',
    n_estimators=150,
    min_impurity_decrease=0.08,
    class_weight='balanced',
    n_jobs=-1,
)
# NOTE(review): despite its name, `knn` now holds a LogisticRegression (it
# replaces the earlier KNeighborsClassifier). The name is kept because later
# cells refer to it — consider renaming everywhere.
knn = LogisticRegression()

In [385]:
# Repeated hold-out evaluation of the soft-voting ensemble: 50 fresh splits
# from split_balance, storing [test accuracy, test AUC] for each one.
outs = []
for _ in range(50):
    vcl = VotingClassifier(
        estimators=[('f1', forest1), ('extra', extc), ('nearest', knn)],
        voting='soft',
    )
    X_train, X_test, y_train, y_test = split_balance(X, y)
    vcl.fit(X_train, y_train)
    roc = roc_auc_score(y_test, vcl.predict_proba(X_test)[:, 1])
    # AUC is recorded as max(roc, 1 - roc), so a below-chance run is stored
    # as its mirror image.
    outs.append([vcl.score(X_test, y_test), max(roc, 1 - roc)])
# Mean +/- standard deviation across runs: accuracy first, AUC second.
['{} +/- {}'.format(col.mean(), col.std()) for col in np.array(outs).T]

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


['0.8243803056027162 +/- 0.015956218329406763',
 '0.8164323911382735 +/- 0.033890487167125456']

In [282]:
# Raw per-run [accuracy, AUC] pairs from the most recent evaluation loop.
outs

[[0.8726655348047538, 0.7683982683982684],
 [0.8760611205432938, 0.8060860707919532],
 [0.8930390492359932, 0.7426152279093456],
 [0.8590831918505942, 0.7799210593328241],
 [0.8794567062818336, 0.8160809778456837],
 [0.8607809847198642, 0.7482174688057041],
 [0.8692699490662139, 0.7511459129106188],
 [0.865874363327674, 0.7798573975044564],
 [0.8590831918505942, 0.7374586198115609],
 [0.8353140916808149, 0.7125031830914185],
 [0.8522920203735145, 0.7449707155589509],
 [0.8811544991511036, 0.8038579067990832],
 [0.8624787775891342, 0.7696078431372549],
 [0.8455008488964346, 0.7619684237331297],
 [0.8726655348047538, 0.7887063916475681],
 [0.8505942275042445, 0.768270944741533],
 [0.8641765704584041, 0.7461166284695697],
 [0.8505942275042445, 0.7263177998472117],
 [0.8505942275042445, 0.7626687038451744],
 [0.8641765704584041, 0.766934046345811],
 [0.8624787775891342, 0.7941176470588236],
 [0.8421052631578947, 0.7361853832442068],
 [0.8692699490662139, 0.8370257193786605],
 [0.8573853989

In [359]:
# Collect every pair of feature columns whose correlation coefficient
# exceeds 0.95 in magnitude. Pairs come out in column order, so each
# unordered pair appears once.
# The loop variables are deliberately NOT called x / y: `y` is the global
# target vector, and rebinding it here would break every later cell that
# uses it when the notebook is run top to bottom.
corrs = []
for col_a, col_b in it.combinations(list(X.columns), 2):
    # A constant column gives NaN here (numpy warns about the division);
    # NaN fails the comparison below, so such pairs are skipped.
    pair_corr = np.corrcoef(X[col_a].values, X[col_b].values)[0][1]
    if abs(pair_corr) > .95:
        print(col_a, col_b, pair_corr)
        corrs.append((col_a, col_b))

DewPoint DP:Tmin 0.9714932969901715
DewPoint P:sea:WB 0.972028621855331
DP:Tmin P:sea:WB 0.9786346478985921


  c /= stddev[:, None]
  c /= stddev[None, :]


In [341]:
# Display the collected highly correlated column pairs.
corrs

[('Tmax', 'Tavg'),
 ('Tmin', 'Tavg'),
 ('Tmin', 'Tavg:WB'),
 ('Tmin', 'DP:Tmin'),
 ('Tmin', 'P:sea:WB'),
 ('Tavg', 'Tavg:WB'),
 ('Tavg', 'P:sea:WB'),
 ('DewPoint', 'DP:Tmin'),
 ('DewPoint', 'P:sea:WB'),
 ('Sunrise', 'Sunset'),
 ('Tavg:WB', 'DP:Tmin'),
 ('Tavg:WB', 'P:sea:WB'),
 ('DP:Tmin', 'P:sea:WB')]

In [342]:
def transitive_closure(a):
    """Return the transitive closure of a binary relation.

    ``a`` is an iterable of 2-tuples. The result is a new set containing
    every input pair plus every pair ``(x, z)`` reachable by chaining pairs
    ``(x, y)`` and ``(y, z)``. Pairs are treated as directed; the input is
    not modified.
    """
    closure = set(a)
    size_before = -1
    # Each pass can only add pairs, so an unchanged size means a fixed point.
    while len(closure) != size_before:
        size_before = len(closure)
        closure |= {
            (head, tail)
            for head, link in closure
            for start, tail in closure
            if start == link
        }
    return closure

In [360]:
# Replace the pair list with its transitive closure. `corrs` is a set of
# tuples from here on, no longer a list.
corrs = transitive_closure(corrs)

In [361]:
# Group the correlated columns into classes: two columns share a class when
# a chain of pairs in `corrs` links them. Pairs are treated as undirected
# edges and each class is one connected component, so the grouping does not
# depend on set iteration order and works on `corrs` as a list or a set.
# (The previous version gathered only the pairs touching the first label it
# met, so pairs such as (A, C), (B, C) could yield overlapping classes.)
neighbours = {}
for left, right in corrs:
    neighbours.setdefault(left, set()).add(right)
    neighbours.setdefault(right, set()).add(left)

all_labels = set(neighbours)

eclasses = []
done = set()
# Sorted so the classes are printed in a reproducible order.
for label in sorted(all_labels, key=str):
    if label in done:
        continue
    component = set()
    frontier = [label]
    while frontier:
        current = frontier.pop()
        if current in component:
            continue
        component.add(current)
        frontier.extend(neighbours[current] - component)
    print(component)
    eclasses += [component]
    done.update(component)
# eclasses

{'DP:Tmin', 'DewPoint', 'P:sea:WB'}


In [326]:
# Scratch check of set.update: called with no arguments it leaves the set unchanged.
# NOTE(review): the saved traceback below mentions add(), so it presumably
# belongs to an earlier version of this cell — consider deleting the cell.
s = set()
s.update()

TypeError: add() takes exactly one argument (2 given)

In [356]:
# Eyeball four of the columns flagged as highly correlated.
# NOTE(review): this displays the whole frame; `.head()` would keep the output short.
X[['Tavg:WB', 'DewPoint', 'P:sea:WB', 'DP:Tmin']]

Unnamed: 0,Tavg:WB,DewPoint,P:sea:WB,DP:Tmin
0,3752.0,51,54.647887,2550.0
1,3876.0,51,55.776660,2652.0
2,2397.0,42,45.890994,1764.0
3,2444.0,42,46.000000,1806.0
4,2688.0,40,46.836653,1840.0
5,2900.0,40,48.904382,1920.0
6,2900.0,41,48.768719,2009.0
7,3450.0,42,48.868176,2142.0
8,2940.0,38,47.860465,2014.0
9,3000.0,39,48.953141,2106.0


In [392]:
# Build the final training set from ALL rows (no hold-out): append 11 extra
# copies of every positive row, then standardise the features.
Xs = list(X.values)
ind = [pos for pos, label in enumerate(y) if label == 1]
pos_vals = [Xs[pos] for pos in ind]

# Original rows first, followed by the repeated positive rows.
Xs = pd.DataFrame(np.array(Xs + 11 * pos_vals), columns=X.columns)
ys = np.array(list(y) + [1] * (11 * len(pos_vals)))

# The fitted scaler stays available as `ss`, e.g. for transforming another
# feature frame later.
ss = StandardScaler()
Xs = ss.fit_transform(Xs)

In [399]:
# Fit the soft-voting ensemble on the full oversampled, scaled data set.
vcl = VotingClassifier(
    estimators=[('f1', forest1), ('extra', extc), ('nearest', knn)],
    voting='soft',
)
vcl.fit(Xs, ys)

VotingClassifier(estimators=[('f1', RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='entropy', max_depth=100, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.001,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [400]:
# Accuracy on the same oversampled data the ensemble was just fitted on —
# a training-set figure, not an estimate of generalisation.
vcl.score(Xs,ys)

  if diff:


0.7914151576123407