In [9]:
from glob import glob
import re 
import numpy as np
np.random.seed(0) # ensure reproducibility
np.set_printoptions(suppress = True)
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import log_loss
# Models
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# NN
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
# Stacking
from vecstack import stacking
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np

In [10]:
train = pd.read_csv('train.csv')

In [11]:
y = train['target'] 
X = train.drop(["ID_code", "target"], axis=1)

In [14]:
n_classes = 3

# Create data: 500 example, 5 feature, 3 classes
X, y = make_classification(n_samples=200000, n_features=200, 
                           n_informative=3, n_redundant=1, 
                           n_classes=n_classes, flip_y=0, 
                           random_state=0)

# Make train/test split
# As usual in machine learning task we have X_train, y_train, and X_test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print('Train shape:', X_train.shape)
print('Test shape: ', X_test.shape)

Train shape: (160000, 200)
Test shape:  (40000, 200)


In [18]:
def build_keras_model_1():
    model = Sequential()
    model.add(Dense(64, 
                    input_dim=X_train.shape[1], 
                    kernel_initializer='normal', 
                    activation='relu'))
    model.add(Dense(n_classes, 
                    kernel_initializer='normal', 
                    activation='softmax'))
    model.compile(optimizer='rmsprop', 
                  loss='categorical_crossentropy', 
                  metrics=['categorical_accuracy'])
    return model

# Caution! All models and parameter values are just 
# demonstrational and shouldn't be considered as recommended.
models_1 = [ 
    GaussianNB(),
    
    LogisticRegression(random_state=0),
    
    ExtraTreesClassifier(random_state=0, n_jobs=-1, 
                         n_estimators=100, max_depth=3),
                         
    RandomForestClassifier(random_state=0, n_jobs=-1, 
                           n_estimators=100, max_depth=3),
        
    XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.1, 
                  n_estimators=100, max_depth=3),
                  
    LGBMClassifier(random_state=0, n_jobs=-1, learning_rate=0.1, 
                   n_estimators=100, max_depth=3),
                  
    KerasClassifier(build_fn=build_keras_model_1, epochs=2, 
                    batch_size=32, verbose=0)
]

In [20]:
S_train_1, S_test_1 = stacking(models_1,                   # list of models
                               X_train, y_train, X_test,   # data
                               regression=False,           # classification task (if you need 
                                                           #     regression - set to True)
                               mode='oof_pred',            # mode: oof for train set, fit on full 
                                                           #     train and predict test set once
                               needs_proba=True,           # predict probabilities (if you need 
                                                           #     class labels - set to False) 
                               save_dir='.',               # save result and log in current dir 
                                                           #     (to disable saving - set to None)
                               metric=log_loss,            # metric: callable
                               n_folds=5,                  # number of folds
                               stratified=True,            # stratified split for folds
                               shuffle=True,               # shuffle the data
                               random_state=0,             # ensure reproducibility
                               verbose=2)                  # print all info

task:         [classification]
n_classes:    [3]
metric:       [log_loss]
mode:         [oof_pred]
n_models:     [7]

model  0:     [GaussianNB]
    fold  0:  [0.60107452]
    fold  1:  [0.62014706]
    fold  2:  [0.61672544]
    fold  3:  [0.61218361]
    fold  4:  [0.61952161]
    ----
    MEAN:     [0.61393045] + [0.00701492]
    FULL:     [0.61393026]

    Fitting on full train set...

model  1:     [LogisticRegression]




    fold  0:  [0.56465399]




    fold  1:  [0.58025074]




    fold  2:  [0.57469431]




    fold  3:  [0.57474876]




    fold  4:  [0.58078008]
    ----
    MEAN:     [0.57502557] + [0.00579953]
    FULL:     [0.57502541]

    Fitting on full train set...





model  2:     [ExtraTreesClassifier]
    fold  0:  [1.06074423]
    fold  1:  [1.06235617]
    fold  2:  [1.04947944]
    fold  3:  [1.06130249]
    fold  4:  [1.06374782]
    ----
    MEAN:     [1.05952603] + [0.00512654]
    FULL:     [1.05952601]

    Fitting on full train set...

model  3:     [RandomForestClassifier]
    fold  0:  [0.92345645]
    fold  1:  [0.90625637]
    fold  2:  [0.92339716]
    fold  3:  [0.91650931]
    fold  4:  [0.91757033]
    ----
    MEAN:     [0.91743792] + [0.00628716]
    FULL:     [0.91743800]

    Fitting on full train set...

model  4:     [XGBClassifier]
    fold  0:  [0.43016298]
    fold  1:  [0.43888331]
    fold  2:  [0.43915033]
    fold  3:  [0.43740928]
    fold  4:  [0.44352498]
    ----
    MEAN:     [0.43782618] + [0.00434156]
    FULL:     [0.43782605]

    Fitting on full train set...

model  5:     [LGBMClassifier]
    fold  0:  [0.43093772]
    fold  1:  [0.43751131]
    fold  2:  [0.43960360]
    fold  3:  [0.43950096]
    fold  4

In [21]:
print('We have %d classes and %d models so in resulting arrays \
we expect to see %d columns.' % (n_classes, len(models_1), n_classes * len(models_1)))
print('S_train_1 shape:', S_train_1.shape)
print('S_test_1 shape: ', S_test_1.shape)

We have 3 classes and 7 models so in resulting arrays we expect to see 21 columns.
S_train_1 shape: (160000, 21)
S_test_1 shape:  (40000, 21)


In [22]:
S_train_1[:2]

array([[0.98996697, 0.0014692 , 0.00856383, 0.90755247, 0.06482565,
        0.02762189, 0.3509121 , 0.32470061, 0.32438728, 0.47477919,
        0.26467815, 0.26054266, 0.98931456, 0.00764197, 0.00304349,
        0.98757663, 0.00978849, 0.00263487, 0.99689096, 0.0031091 ,
        0.00000001],
       [0.11227576, 0.65222713, 0.23549711, 0.17150716, 0.50036272,
        0.32813011, 0.33411975, 0.33864272, 0.32723754, 0.28745166,
        0.41626246, 0.29628587, 0.15967681, 0.57350618, 0.266817  ,
        0.15763446, 0.59017378, 0.25219176, 0.32695445, 0.43366429,
        0.23938128]])

In [23]:
S_test_1[:2]

array([[0.19756544, 0.69596677, 0.1064678 , 0.2823499 , 0.57946909,
        0.138181  , 0.33480539, 0.34222773, 0.32296688, 0.29637844,
        0.42388314, 0.27973842, 0.22797398, 0.64218599, 0.12984006,
        0.24673295, 0.62236016, 0.13090689, 0.13497773, 0.68432266,
        0.18069959],
       [0.10157364, 0.61288917, 0.28553719, 0.14327163, 0.49265634,
        0.36407202, 0.32829869, 0.34242709, 0.32927422, 0.29610252,
        0.41331745, 0.29058003, 0.14889233, 0.58053714, 0.27057058,
        0.14877604, 0.58005014, 0.27117382, 0.15368925, 0.41763222,
        0.42867848]])

In [29]:
names = sorted(glob('*.npy'))
npy_1_name = names[0] # for later use

print('Arrays:')
for name in names:
    print(name)

names = sorted(glob('*.log.txt'))
log_1_name = names[0] # for later use

print('\nLogs:')
for name in names:
    print(name)

Arrays:
[2019.04.10].[17.49.09].227856.a81bd6.npy
[2019.04.10].[17.53.42].608953.073cde.npy

Logs:
[2019.04.10].[17.49.09].227856.a81bd6.log.txt
[2019.04.10].[17.53.42].608953.073cde.log.txt


In [24]:
def build_keras_model_2():
    model = Sequential()
    model.add(Dense(256, 
                    input_dim=X_train.shape[1], 
                    kernel_initializer='normal', 
                    activation='relu'))
    model.add(Dense(64, 
                    kernel_initializer='normal', 
                    activation='relu'))
    model.add(Dense(n_classes, 
                    kernel_initializer='normal', 
                    activation='softmax'))
    model.compile(optimizer='rmsprop', 
                  loss='categorical_crossentropy', 
                  metrics=['categorical_accuracy'])
    return model

# Caution! All models and parameter values are just 
# demonstrational and shouldn't be considered as recommended.
models_2 = [        
    KerasClassifier(build_fn=build_keras_model_2, epochs=5, 
                    batch_size=32, verbose=0)
]

In [25]:
S_train_2, S_test_2 = stacking(models_2,                   # list of models
                               X_train, y_train, X_test,   # data
                               regression=False,           # classification task (if you need 
                                                           #     regression - set to True)
                               mode='oof_pred',            # mode: oof for train set, fit on full 
                                                           #     train and predict test set once
                               needs_proba=True,           # predict probabilities (if you need 
                                                           #     class labels - set to False) 
                               save_dir='.',               # save result and log in current dir 
                                                           #     (to disable saving - set to None)
                               metric=log_loss,            # metric: callable
                               n_folds=5,                  # number of folds
                               stratified=True,            # stratified split for folds
                               shuffle=True,               # shuffle the data
                               random_state=0,             # ensure reproducibility
                               verbose=2)                  # print all info

task:         [classification]
n_classes:    [3]
metric:       [log_loss]
mode:         [oof_pred]
n_models:     [1]

model  0:     [KerasClassifier]
    fold  0:  [0.45979184]
    fold  1:  [0.49483895]
    fold  2:  [0.46820496]
    fold  3:  [0.48360031]
    fold  4:  [0.49763215]
    ----
    MEAN:     [0.48081364] + [0.01475284]
    FULL:     [0.48081326]

    Fitting on full train set...

Result was saved to [.\[2019.04.10].[17.53.42].608953.073cde.npy]


In [30]:
print("opening this log: %s" % log_1_name)
with open(log_1_name) as f:
    lines = f.readlines()

print("models build in those session.\n")
for line in lines:
    if re.search(r'^model [0-9]+', line):
        print(line)

opening this log: [2019.04.10].[17.49.09].227856.a81bd6.log.txt
models build in those session.



In [31]:
print(" .npy file: %s" % npy_1_name)
S = np.load(npy_1_name)
S_train_lgbm = S[0][:, 15:18]
S_test_lgbm = S[1][:, 15:18]

 .npy file: [2019.04.10].[17.49.09].227856.a81bd6.npy


In [32]:
S_train_lgbm[:5]

array([[0.98757663, 0.00978849, 0.00263487],
       [0.15763446, 0.59017378, 0.25219176],
       [0.28626021, 0.27927242, 0.43446736],
       [0.63731884, 0.30184423, 0.06083693],
       [0.86489797, 0.03197194, 0.10313009]])

In [33]:
S_test_lgbm[:5]

array([[0.24673295, 0.62236016, 0.13090689],
       [0.14877604, 0.58005014, 0.27117382],
       [0.0518705 , 0.01172231, 0.93640719],
       [0.00227678, 0.02780158, 0.96992164],
       [0.04190374, 0.49007427, 0.46802199]])

In [34]:
print('LGBMCLassifier log loss: %.8f' % log_loss(y_train, S_train_lgbm))

LGBMCLassifier log loss: 0.43810202


In [35]:
print('We have %d classes and %d models TOTAL so in resulting arrays \
we expect to see %d columns.' % (n_classes, len(models_1) + len(models_2), 
                                 n_classes * (len(models_1) + len(models_2))))

We have 3 classes and 8 models TOTAL so in resulting arrays we expect to see 24 columns.


In [36]:
# Create empty arrays
S_train_all = np.zeros((X_train.shape[0], 0))
S_test_all = np.zeros((X_test.shape[0], 0))

# Load results
for name in sorted(glob('*.npy')):
    print('Loading: %s' % name)
    S = np.load(name)
    S_train_all = np.c_[S_train_all, S[0]]
    S_test_all = np.c_[S_test_all, S[1]]
    
print('\nS_train_all shape:', S_train_all.shape)
print('S_test_all shape: ', S_test_all.shape)

Loading: [2019.04.10].[17.49.09].227856.a81bd6.npy
Loading: [2019.04.10].[17.53.42].608953.073cde.npy

S_train_all shape: (160000, 24)
S_test_all shape:  (40000, 24)


In [42]:
# Initialize 2nd level model
model = XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.1, 
                      n_estimators=100, max_depth=3)
    
# Fit 2nd level model
model = model.fit(S_train_all, y_train)

# Predict
y_pred = model.predict_proba(S_test_all)

# Final prediction score
print('Final prediction score: %.8f' % log_loss(y_test, y_pred))

Final prediction score: 0.39562066


In [39]:
test = pd.read_csv('test.csv')

In [44]:
test = test.drop(["ID_code"], axis=1)
final = model.predict_proba(test)

ValueError: feature_names mismatch: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23'] ['var_0', 'var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'var_6', 'var_7', 'var_8', 'var_9', 'var_10', 'var_11', 'var_12', 'var_13', 'var_14', 'var_15', 'var_16', 'var_17', 'var_18', 'var_19', 'var_20', 'var_21', 'var_22', 'var_23', 'var_24', 'var_25', 'var_26', 'var_27', 'var_28', 'var_29', 'var_30', 'var_31', 'var_32', 'var_33', 'var_34', 'var_35', 'var_36', 'var_37', 'var_38', 'var_39', 'var_40', 'var_41', 'var_42', 'var_43', 'var_44', 'var_45', 'var_46', 'var_47', 'var_48', 'var_49', 'var_50', 'var_51', 'var_52', 'var_53', 'var_54', 'var_55', 'var_56', 'var_57', 'var_58', 'var_59', 'var_60', 'var_61', 'var_62', 'var_63', 'var_64', 'var_65', 'var_66', 'var_67', 'var_68', 'var_69', 'var_70', 'var_71', 'var_72', 'var_73', 'var_74', 'var_75', 'var_76', 'var_77', 'var_78', 'var_79', 'var_80', 'var_81', 'var_82', 'var_83', 'var_84', 'var_85', 'var_86', 'var_87', 'var_88', 'var_89', 'var_90', 'var_91', 'var_92', 'var_93', 'var_94', 'var_95', 'var_96', 'var_97', 'var_98', 'var_99', 'var_100', 'var_101', 'var_102', 'var_103', 'var_104', 'var_105', 'var_106', 'var_107', 'var_108', 'var_109', 'var_110', 'var_111', 'var_112', 'var_113', 'var_114', 'var_115', 'var_116', 'var_117', 'var_118', 'var_119', 'var_120', 'var_121', 'var_122', 'var_123', 'var_124', 'var_125', 'var_126', 'var_127', 'var_128', 'var_129', 'var_130', 'var_131', 'var_132', 'var_133', 'var_134', 'var_135', 'var_136', 'var_137', 'var_138', 'var_139', 'var_140', 'var_141', 'var_142', 'var_143', 'var_144', 'var_145', 'var_146', 'var_147', 'var_148', 'var_149', 'var_150', 'var_151', 'var_152', 'var_153', 'var_154', 'var_155', 'var_156', 'var_157', 'var_158', 'var_159', 'var_160', 'var_161', 'var_162', 'var_163', 'var_164', 'var_165', 'var_166', 'var_167', 'var_168', 'var_169', 'var_170', 'var_171', 'var_172', 'var_173', 'var_174', 'var_175', 'var_176', 'var_177', 'var_178', 'var_179', 'var_180', 'var_181', 'var_182', 'var_183', 'var_184', 'var_185', 'var_186', 'var_187', 'var_188', 'var_189', 'var_190', 'var_191', 'var_192', 'var_193', 'var_194', 'var_195', 'var_196', 'var_197', 'var_198', 'var_199']
expected f16, f11, f5, f10, f15, f14, f23, f18, f2, f1, f4, f12, f21, f22, f20, f9, f8, f0, f3, f19, f13, f17, f7, f6 in input data
training data did not have the following fields: var_78, var_114, var_9, var_19, var_147, var_177, var_134, var_90, var_21, var_30, var_141, var_67, var_102, var_151, var_133, var_136, var_188, var_195, var_113, var_149, var_43, var_2, var_126, var_112, var_10, var_27, var_138, var_44, var_168, var_132, var_7, var_158, var_165, var_123, var_127, var_1, var_8, var_29, var_156, var_111, var_140, var_37, var_14, var_80, var_56, var_92, var_55, var_118, var_22, var_142, var_157, var_198, var_95, var_79, var_16, var_26, var_49, var_72, var_96, var_175, var_13, var_189, var_69, var_159, var_73, var_125, var_41, var_85, var_121, var_107, var_148, var_18, var_74, var_116, var_128, var_150, var_11, var_119, var_129, var_53, var_64, var_131, var_99, var_110, var_144, var_117, var_5, var_172, var_180, var_82, var_184, var_185, var_36, var_98, var_169, var_143, var_162, var_84, var_145, var_51, var_6, var_166, var_45, var_87, var_164, var_137, var_197, var_171, var_193, var_199, var_66, var_58, var_122, var_178, var_70, var_167, var_182, var_47, var_187, var_35, var_108, var_91, var_20, var_86, var_34, var_139, var_160, var_183, var_23, var_154, var_194, var_0, var_48, var_60, var_77, var_83, var_103, var_33, var_100, var_93, var_50, var_68, var_75, var_54, var_101, var_24, var_176, var_155, var_130, var_88, var_25, var_192, var_39, var_97, var_163, var_62, var_17, var_186, var_59, var_76, var_191, var_106, var_57, var_170, var_42, var_152, var_46, var_61, var_81, var_89, var_105, var_135, var_28, var_52, var_63, var_174, var_4, var_104, var_15, var_146, var_173, var_115, var_65, var_40, var_161, var_179, var_190, var_32, var_124, var_38, var_3, var_12, var_153, var_109, var_71, var_120, var_94, var_31, var_181, var_196