In [58]:
%matplotlib inline

import numpy as np
from sklearn.datasets import load_digits, load_breast_cancer
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB, ComplementNB
from sklearn.preprocessing import StandardScaler

import gc

import pandas as pd

## Import Data

In [38]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [39]:
y = train_df['target']
X = train_df.drop(['ID_code', 'target'], axis=1)

X_test = test_df.drop(['ID_code'], axis=1)
X_test.fillna(999, inplace = True)

In [40]:
bnb = BernoulliNB()
gnb = GaussianNB()

In [41]:
#Cross-validation of classifiers
score_bnb = cross_val_score(bnb, X, y, cv =5, scoring='roc_auc',)

score_gnb= cross_val_score(gnb, X, y, cv = 5, scoring='roc_auc',)

In [42]:
print('Standarter dataset')
print('BernoulliNB mean cross_val_score: ' + str(score_bnb.mean()))
print('GaussianNB mean cross_val_score: ' + str(score_gnb.mean()))

Standarter dataset
BernoulliNB mean cross_val_score: 0.7113721430459774
GaussianNB mean cross_val_score: 0.8883825634475349
GaussianNB mean cross_val_score: 0.8883825634475349


In [43]:
scaler = StandardScaler().fit(X)

In [44]:
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test)

In [45]:
#Cross-validation of classifiers
score_bnb_scaled = cross_val_score(bnb, X_scaled, y, cv =5, scoring='roc_auc',)

score_gnb_scaled = cross_val_score(gnb, X_scaled, y, cv = 5, scoring='roc_auc',)

print('Standarter dataset scaled')
print('BernoulliNB mean cross_val_score: ' + str(score_bnb_scaled.mean()))
print('GaussianNB mean cross_val_score: ' + str(score_gnb_scaled.mean()))

Standarter dataset scaled
BernoulliNB mean cross_val_score: 0.7849518970228012
GaussianNB mean cross_val_score: 0.8883828469044662


## Data Augumentation

In [48]:
santander_data = pd.read_csv('train.csv')
santander_data_test = pd.read_csv('test.csv')

In [49]:
# Taking the labels (price)
label_df = santander_data['target']

In [50]:
santander_data.drop(['ID_code','target'], axis=1, inplace=True)

santander_data_test.drop('ID_code', axis=1, inplace=True)
santander_data.head(10)

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,5.747,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,8.0851,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,5.9525,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,-5.8609,8.245,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,7.6784,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104
5,11.4763,-2.3182,12.608,8.6264,10.9621,3.5609,4.5322,15.2255,3.5855,5.979,...,-6.3068,6.6025,5.2912,0.4403,14.9452,1.0314,-3.6241,9.767,12.5809,-4.7602
6,11.8091,-0.0832,9.3494,4.2916,11.1355,-8.0198,6.1961,12.0771,-4.3781,7.9232,...,8.783,6.4521,3.5325,0.1777,18.3314,0.5845,9.1104,9.1143,10.8869,-3.2097
7,13.558,-7.9881,13.8776,7.5985,8.6543,0.831,5.689,22.3262,5.0647,7.1971,...,13.17,6.5491,3.9906,5.8061,23.1407,-0.3776,4.2178,9.4237,8.6624,3.4806
8,16.1071,2.4426,13.9307,5.6327,8.8014,6.163,4.4514,10.1854,-3.1882,9.0827,...,1.4298,14.751,1.6395,1.4181,14.837,-1.994,-1.0733,8.1975,19.5114,4.8453
9,12.5088,1.9743,8.896,5.4508,13.6043,-16.2859,6.0637,16.841,0.1287,7.9682,...,0.5543,6.316,1.0371,3.6885,14.8344,0.4467,14.1287,7.9133,16.2375,14.2514


In [51]:
len_train = len(santander_data)
len_train

200000

In [52]:
#Merge test and train
merged = pd.concat([santander_data, santander_data_test])
#Saving the list of original features in a new list `original_features`.
original_features = merged.columns
merged.shape

(400000, 200)

In [53]:
idx = features = merged.columns.values[0:200]
for df in [merged]:
    df['sum'] = df[idx].sum(axis=1)  
    df['min'] = df[idx].min(axis=1)
    df['max'] = df[idx].max(axis=1)
    df['mean'] = df[idx].mean(axis=1)
    df['std'] = df[idx].std(axis=1)
    df['skew'] = df[idx].skew(axis=1)
    df['kurt'] = df[idx].kurtosis(axis=1)
    df['med'] = df[idx].median(axis=1)

In [None]:
print("Total number of features: ",merged.shape[1])

In [54]:
train_df = merged.iloc[:len_train]
train_df.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_198,var_199,sum,min,max,mean,std,skew,kurt,med
0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,5.747,...,12.7803,-1.0914,1456.3182,-21.4494,43.1127,7.281591,9.33154,0.10158,1.331023,6.7704
1,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,8.0851,...,18.356,1.9518,1415.3636,-47.3797,40.5632,7.076818,10.33613,-0.351734,4.110215,7.22315
2,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,5.9525,...,14.7222,0.3965,1240.8966,-22.4038,33.882,6.204483,8.753387,-0.056957,0.546438,5.8994
3,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,-5.8609,8.245,...,17.9697,-8.9996,1288.2319,-35.1659,38.1015,6.441159,9.594064,-0.480116,2.630499,6.7026
4,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,7.6784,...,17.9974,-8.8104,1354.231,-65.4863,41.1037,6.771155,11.287122,-1.463426,9.787399,6.94735


In [55]:
X_test = merged.iloc[len_train:]
X_test.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_198,var_199,sum,min,max,mean,std,skew,kurt,med
0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,8.81,...,15.4722,-8.7197,1416.6404,-31.9891,42.0248,7.083202,9.910632,-0.088518,1.871262,7.3144
1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,-4.4131,5.9739,...,19.1293,-20.976,1249.686,-41.1924,35.602,6.24843,9.541267,-0.559785,3.391068,6.4396
2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,1.5233,8.3442,...,19.8956,-23.1794,1430.2599,-34.3488,39.3654,7.151299,9.967466,-0.135084,2.326901,7.26355
3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,7.4578,...,13.0168,-4.2108,1411.4447,-21.4797,40.3383,7.057223,8.257204,-0.167741,2.253054,6.89675
4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.989,7.1437,...,13.926,-9.1846,1423.7364,-24.8254,45.551,7.118682,10.043542,0.293484,2.044943,6.83375


In [56]:
def augment(x,y,t=2):
    xs,xn = [],[]
    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xs.append(x1)

    for i in range(t//2):
        mask = y==0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        for c in range(x1.shape[1]):
            np.random.shuffle(ids)
            x1[:,c] = x1[ids][:,c]
        xn.append(x1)

    xs = np.vstack(xs)
    xn = np.vstack(xn)
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs,xn])
    y = np.concatenate([y,ys,yn])
    return x,y

In [59]:
"""train_df = santander_data
X_test = santander_data_test"""
del santander_data
del santander_data_test
gc.collect()

NameError: name 'santander_data' is not defined

In [61]:
train_df.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_198,var_199,sum,min,max,mean,std,skew,kurt,med
0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,5.747,...,12.7803,-1.0914,1456.3182,-21.4494,43.1127,7.281591,9.33154,0.10158,1.331023,6.7704
1,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,8.0851,...,18.356,1.9518,1415.3636,-47.3797,40.5632,7.076818,10.33613,-0.351734,4.110215,7.22315
2,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,5.9525,...,14.7222,0.3965,1240.8966,-22.4038,33.882,6.204483,8.753387,-0.056957,0.546438,5.8994
3,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,-5.8609,8.245,...,17.9697,-8.9996,1288.2319,-35.1659,38.1015,6.441159,9.594064,-0.480116,2.630499,6.7026
4,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,7.6784,...,17.9974,-8.8104,1354.231,-65.4863,41.1037,6.771155,11.287122,-1.463426,9.787399,6.94735


In [62]:
label_df.head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

In [63]:
#Cross-validation of classifiers
score_bnb_scaled = cross_val_score(bnb, train_df, label_df, cv =5, scoring='roc_auc',)

score_gnb_scaled = cross_val_score(gnb, train_df, label_df, cv = 5, scoring='roc_auc',)

print('Standarter dataset scaled')
print('BernoulliNB mean cross_val_score: ' + str(score_bnb_scaled.mean()))
print('GaussianNB mean cross_val_score: ' + str(score_gnb_scaled.mean()))

Standarter dataset scaled
BernoulliNB mean cross_val_score: 0.7117847418098446
GaussianNB mean cross_val_score: 0.8828356856149263


In [64]:
X_tr, y_tr = augment(train_df.values, label_df.values)

In [65]:
#Cross-validation of classifiers
score_bnb_scaled = cross_val_score(bnb, X_tr, y_tr, cv =5, scoring='roc_auc',)

score_gnb_scaled = cross_val_score(gnb, X_tr, y_tr, cv = 5, scoring='roc_auc',)

print('Standarter dataset scaled')
print('BernoulliNB mean cross_val_score: ' + str(score_bnb_scaled.mean()))
print('GaussianNB mean cross_val_score: ' + str(score_gnb_scaled.mean()))

Standarter dataset scaled
BernoulliNB mean cross_val_score: 0.7140307516800914
GaussianNB mean cross_val_score: 0.887824949077924


In [66]:
scaler = preprocessing.StandardScaler().fit(X_tr)
X_augmented_scaled = scaler.transform(X_tr)

In [72]:
#Cross-validation of classifiers
score_bnb_scaled = cross_val_score(bnb, X_augmented_scaled, y_tr, cv =5, scoring='roc_auc',)

score_gnb_scaled = cross_val_score(gnb, X_augmented_scaled, y_tr, cv = 5, scoring='roc_auc',)

print('Standarter dataset scaled')
print('BernoulliNB mean cross_val_score: ' + str(score_bnb_scaled.mean()))
print('GaussianNB mean cross_val_score: ' + str(score_gnb_scaled.mean()))

Standarter dataset scaled
BernoulliNB mean cross_val_score: 0.7886837719846864
GaussianNB mean cross_val_score: 0.8878297855032461


In [69]:
from sklearn.linear_model import Ridge, LogisticRegression

In [70]:
rdg = Ridge(alpha=0.1, random_state = 1)
lgr = LogisticRegression(random_state= 1)