In [3]:
from glob import glob
import re 
import numpy as np
np.random.seed(0) # ensure reproducibility
np.set_printoptions(suppress = True)
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import log_loss
# Models
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# NN
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
# Stacking
from vecstack import stacking
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np

Using TensorFlow backend.


In [4]:
train = pd.read_csv('train.csv')

In [5]:
y = train['target'] 
X = train.drop(["ID_code", "target"], axis=1)


In [6]:
test = pd.read_csv('test.csv')
test1 = test.drop(["ID_code"], axis=1)

In [7]:
n_classes = 3

# Create data: 500 example, 5 feature, 3 classes
X, y = make_classification(n_samples=200000, n_features=200, 
                           n_informative=3, n_redundant=1, 
                           n_classes=n_classes, flip_y=0, 
                           random_state=0)

# Make train/test split
# As usual in machine learning task we have X_train, y_train, and X_test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print('Train shape:', X_train.shape)
print('Test shape: ', X_test.shape)

Train shape: (160000, 200)
Test shape:  (40000, 200)


In [8]:
def build_keras_model_1():
    model = Sequential()
    model.add(Dense(64, 
                    input_dim=X_train.shape[1], 
                    kernel_initializer='normal', 
                    activation='relu'))
    model.add(Dense(n_classes, 
                    kernel_initializer='normal', 
                    activation='softmax'))
    model.compile(optimizer='rmsprop', 
                  loss='categorical_crossentropy', 
                  metrics=['categorical_accuracy'])
    return model

# Caution! All models and parameter values are just 
# demonstrational and shouldn't be considered as recommended.
models_1 = [ 
    GaussianNB(),
    
    LogisticRegression(random_state=0),
    
    ExtraTreesClassifier(random_state=0, n_jobs=-1, 
                         n_estimators=100, max_depth=3),
                         
    RandomForestClassifier(random_state=0, n_jobs=-1, 
                           n_estimators=100, max_depth=3),
        
    XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.1, 
                  n_estimators=100, max_depth=3),
                  
    LGBMClassifier(random_state=0, n_jobs=-1, learning_rate=0.1, 
                   n_estimators=100, max_depth=3),
                  
    KerasClassifier(build_fn=build_keras_model_1, epochs=2, 
                    batch_size=32, verbose=0)
]

In [9]:
S_train_1, S_test_1 = stacking(models_1,                   # list of models
                               X_train, y_train, X_test,   # data
                               regression=False,           # classification task (if you need 
                                                           #     regression - set to True)
                               mode='oof_pred',            # mode: oof for train set, fit on full 
                                                           #     train and predict test set once
                               needs_proba=True,           # predict probabilities (if you need 
                                                           #     class labels - set to False) 
                               save_dir='.',               # save result and log in current dir 
                                                           #     (to disable saving - set to None)
                               metric=log_loss,            # metric: callable
                               n_folds=5,                  # number of folds
                               stratified=True,            # stratified split for folds
                               shuffle=True,               # shuffle the data
                               random_state=0,             # ensure reproducibility
                               verbose=2)                  # print all info

task:         [classification]
n_classes:    [3]
metric:       [log_loss]
mode:         [oof_pred]
n_models:     [7]

model  0:     [GaussianNB]
    fold  0:  [0.60107452]
    fold  1:  [0.62014706]
    fold  2:  [0.61672544]
    fold  3:  [0.61218361]
    fold  4:  [0.61952161]
    ----
    MEAN:     [0.61393045] + [0.00701492]
    FULL:     [0.61393026]

    Fitting on full train set...

model  1:     [LogisticRegression]




    fold  0:  [0.56465399]




    fold  1:  [0.58025074]




    fold  2:  [0.57469431]




    fold  3:  [0.57474876]




    fold  4:  [0.58078008]
    ----
    MEAN:     [0.57502557] + [0.00579953]
    FULL:     [0.57502541]

    Fitting on full train set...





model  2:     [ExtraTreesClassifier]
    fold  0:  [1.06074423]
    fold  1:  [1.06235617]
    fold  2:  [1.04947944]
    fold  3:  [1.06130249]
    fold  4:  [1.06374782]
    ----
    MEAN:     [1.05952603] + [0.00512654]
    FULL:     [1.05952601]

    Fitting on full train set...

model  3:     [RandomForestClassifier]
    fold  0:  [0.92345645]
    fold  1:  [0.90625637]
    fold  2:  [0.92339716]
    fold  3:  [0.91650931]
    fold  4:  [0.91757033]
    ----
    MEAN:     [0.91743792] + [0.00628716]
    FULL:     [0.91743800]

    Fitting on full train set...

model  4:     [XGBClassifier]
    fold  0:  [0.43016298]
    fold  1:  [0.43888331]
    fold  2:  [0.43915033]
    fold  3:  [0.43740928]
    fold  4:  [0.44352498]
    ----
    MEAN:     [0.43782618] + [0.00434156]
    FULL:     [0.43782605]

    Fitting on full train set...

model  5:     [LGBMClassifier]
    fold  0:  [0.43093772]
    fold  1:  [0.43751131]
    fold  2:  [0.43960360]
    fold  3:  [0.43950096]
    fold  4

In [10]:
def build_keras_model_2():
    model = Sequential()
    model.add(Dense(256, 
                    input_dim=X_train.shape[1], 
                    kernel_initializer='normal', 
                    activation='relu'))
    model.add(Dense(64, 
                    kernel_initializer='normal', 
                    activation='relu'))
    model.add(Dense(n_classes, 
                    kernel_initializer='normal', 
                    activation='softmax'))
    model.compile(optimizer='rmsprop', 
                  loss='categorical_crossentropy', 
                  metrics=['categorical_accuracy'])
    return model

# Caution! All models and parameter values are just 
# demonstrational and shouldn't be considered as recommended.
models_2 = [        
    KerasClassifier(build_fn=build_keras_model_2, epochs=5, 
                    batch_size=32, verbose=0)
]

In [12]:
S_train_1_1, S_test_1_1 = stacking(models_1,                   # list of models
                               X_train, y_train, test1,   # data
                               regression=False,           # classification task (if you need 
                                                           #     regression - set to True)
                               mode='oof_pred',            # mode: oof for train set, fit on full 
                                                           #     train and predict test set once
                               needs_proba=True,           # predict probabilities (if you need 
                                                           #     class labels - set to False) 
                               save_dir='.',               # save result and log in current dir 
                                                           #     (to disable saving - set to None)
                               metric=log_loss,            # metric: callable
                               n_folds=5,                  # number of folds
                               stratified=True,            # stratified split for folds
                               shuffle=True,               # shuffle the data
                               random_state=0,             # ensure reproducibility
                               verbose=2)                  # print all info

task:         [classification]
n_classes:    [3]
metric:       [log_loss]
mode:         [oof_pred]
n_models:     [7]

model  0:     [GaussianNB]
    fold  0:  [0.60107452]
    fold  1:  [0.62014706]
    fold  2:  [0.61672544]
    fold  3:  [0.61218361]
    fold  4:  [0.61952161]
    ----
    MEAN:     [0.61393045] + [0.00701492]
    FULL:     [0.61393026]

    Fitting on full train set...

model  1:     [LogisticRegression]




    fold  0:  [0.56465399]




    fold  1:  [0.58025074]




    fold  2:  [0.57469431]




    fold  3:  [0.57474876]




    fold  4:  [0.58078008]
    ----
    MEAN:     [0.57502557] + [0.00579953]
    FULL:     [0.57502541]

    Fitting on full train set...





model  2:     [ExtraTreesClassifier]
    fold  0:  [1.06074423]
    fold  1:  [1.06235617]
    fold  2:  [1.04947944]
    fold  3:  [1.06130249]
    fold  4:  [1.06374782]
    ----
    MEAN:     [1.05952603] + [0.00512654]
    FULL:     [1.05952601]

    Fitting on full train set...

model  3:     [RandomForestClassifier]
    fold  0:  [0.92345645]
    fold  1:  [0.90625637]
    fold  2:  [0.92339716]
    fold  3:  [0.91650931]
    fold  4:  [0.91757033]
    ----
    MEAN:     [0.91743792] + [0.00628716]
    FULL:     [0.91743800]

    Fitting on full train set...

model  4:     [XGBClassifier]
    fold  0:  [0.43016298]
    fold  1:  [0.43888331]
    fold  2:  [0.43915033]
    fold  3:  [0.43740928]
    fold  4:  [0.44352498]
    ----
    MEAN:     [0.43782618] + [0.00434156]
    FULL:     [0.43782605]

    Fitting on full train set...

model  5:     [LGBMClassifier]
    fold  0:  [0.43093772]
    fold  1:  [0.43751131]
    fold  2:  [0.43960360]
    fold  3:  [0.43950096]
    fold  4

In [34]:
S_train_2, S_test_2 = stacking(models_2,                   # list of models
                               X_train, y_train, X_test,   # data
                               regression=False,           # classification task (if you need 
                                                           #     regression - set to True)
                               mode='oof_pred',            # mode: oof for train set, fit on full 
                                                           #     train and predict test set once
                               needs_proba=True,           # predict probabilities (if you need 
                                                           #     class labels - set to False) 
                               save_dir='.',               # save result and log in current dir 
                                                           #     (to disable saving - set to None)
                               metric=log_loss,            # metric: callable
                               n_folds=5,                  # number of folds
                               stratified=True,            # stratified split for folds
                               shuffle=True,               # shuffle the data
                               random_state=0,             # ensure reproducibility
                               verbose=2)                  # print all info

task:         [classification]
n_classes:    [3]
metric:       [log_loss]
mode:         [oof_pred]
n_models:     [1]

model  0:     [KerasClassifier]
    fold  0:  [0.46798640]
    fold  1:  [0.47907916]
    fold  2:  [0.46979569]
    fold  3:  [0.48248938]
    fold  4:  [0.48195481]
    ----
    MEAN:     [0.47626109] + [0.00615505]
    FULL:     [0.47626091]

    Fitting on full train set...

Result was saved to [.\[2019.04.11].[00.34.34].161993.7493e8.npy]


In [36]:
# Create empty arrays
S_train_all = np.zeros((X_train.shape[0], 0))
S_test_all = np.zeros((X_test.shape[0], 0))

# Load results
for name in sorted(glob('*.npy')):
    print('Loading: %s' % name)
    S = np.load(name)
    S_train_all = np.c_[S_train_all, S[0]]
    S_test_all = np.c_[S_test_all, S[1]]
    
print('\nS_train_all shape:', S_train_all.shape)
print('S_test_all shape: ', S_test_all.shape)

Loading: [2019.04.10].[17.49.09].227856.a81bd6.npy
Loading: [2019.04.10].[17.53.42].608953.073cde.npy

S_train_all shape: (160000, 24)
S_test_all shape:  (40000, 24)


In [36]:
# Initialize 2nd level model
model = XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.1, 
                      n_estimators=100, max_depth=3)
    
# Fit 2nd level model
model = model.fit(S_train_1, y_train)

# Predict
y_pred = model.predict_proba(S_test_1)



In [37]:
# Final prediction score
print('Final prediction score: %.8f' % log_loss(y_test, y_pred))

Final prediction score: 0.40105395


In [38]:
final_predictions = model.predict_proba(S_test_1_1)

In [39]:
final_predictions

array([[0.5550094 , 0.40934965, 0.0356409 ],
       [0.6725949 , 0.32483917, 0.00256592],
       [0.00668239, 0.9925132 , 0.00080449],
       ...,
       [0.07744649, 0.9045954 , 0.01795811],
       [0.99821055, 0.00164519, 0.00014433],
       [0.71130955, 0.2876673 , 0.00102313]], dtype=float32)

In [29]:
sample = pd.read_csv('sample_submission.csv')

In [30]:
sample.head()

Unnamed: 0,ID_code,target
0,test_0,0
1,test_1,0
2,test_2,0
3,test_3,0
4,test_4,0


In [40]:
sample["target"] = final_predictions

In [41]:
sample.head()

Unnamed: 0,ID_code,target
0,test_0,0.555009
1,test_1,0.672595
2,test_2,0.006682
3,test_3,0.0087
4,test_4,0.00121


In [42]:
sample.to_csv('submission_10th_April.csv', index=False)