In [1]:
import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import ExtraTreesClassifier
from tqdm.notebook import tqdm

In [2]:
train = pd.read_csv('../data/input/train_fold_v000.csv')
test = pd.read_csv('../data/input/sample_submission.csv')
train.head(3)

Unnamed: 0,id,target,Fold
0,00000e74ad,1,2
1,00001f4945,0,1
2,0000661522,0,3


In [3]:
y_train = train['target']
n_splits = train['Fold'].nunique()

In [4]:
experiments = [e.split('/')[-1] for e in sorted(glob.glob('../data/output/predictions/test/*'))]

In [5]:
X_train = {}
for exp in experiments:
    paths = sorted(glob.glob(f'../data/output/predictions/oof/{exp}/*.npy'))
    if len(paths) != 5:
        continue

    oof = np.zeros((len(train),))
    for i, path in zip(range(n_splits), paths):
        valid_idx = train.Fold == i
        oof[valid_idx] = np.load(path).flatten()
        
    X_train[exp] = oof
X_train = pd.DataFrame(X_train)
X_train.head(3)

Unnamed: 0,default,trainer=exp001,trainer=exp002
0,0.545408,0.442724,0.547303
1,0.160439,0.169753,0.157171
2,0.381329,0.260785,0.252459


In [6]:
X_test = {}
for exp in experiments:
    paths = sorted(glob.glob(f'../data/output/predictions/test/{exp}/*.npy'))
    if len(paths) != 5:
        continue
    X_test[exp] = np.mean([np.load(path) for path in paths], axis=0).flatten()
X_test = pd.DataFrame(X_test)
X_test.head(3)

Unnamed: 0,default,trainer=exp001,trainer=exp002
0,1.0,1.0,0.999995
1,0.949274,0.9293,0.941273
2,0.299642,0.317033,0.323706


In [7]:
print(X_train.shape, y_train.shape)
print(X_test.shape)

(560000, 3) (560000,)
(226000, 3)


In [8]:
# simple averaging
for c in X_train.columns:
        print(f"{c}: ", roc_auc_score(y_train, X_train[c]))

preds = (0.3 * X_train['default'] + 0.3 * X_train['trainer=exp001'] + 0.4 * X_train['trainer=exp002'])
print("averaging: ", roc_auc_score(y_train, preds))

default:  0.8717325161414454
trainer=exp001:  0.8736307917702919
trainer=exp002:  0.8745796744380153
averaging:  0.8757118014794365


In [9]:
# meta-features
X_train_features = pd.read_pickle('../data/output/features/create_feets_train.pkl')
X_test_features = pd.read_pickle('../data/output/features/create_feets_test.pkl')

X_train = pd.concat([X_train, X_train_features], axis=1)
X_test = pd.concat([X_test, X_test_features], axis=1)

In [11]:
!pip install pycaret
from pycaret.classification import *

Collecting pycaret
  Downloading pycaret-2.3.3-py3-none-any.whl (264 kB)
[K     |████████████████████████████████| 264 kB 6.7 MB/s eta 0:00:01
Collecting scipy<=1.5.4
  Downloading scipy-1.5.4-cp37-cp37m-manylinux1_x86_64.whl (25.9 MB)
[K     |████████████████████████████████| 25.9 MB 743 kB/s eta 0:00:01
Collecting mlflow
  Downloading mlflow-1.19.0-py3-none-any.whl (14.4 MB)
[K     |████████████████████████████████| 14.4 MB 8.7 MB/s eta 0:00:01
Collecting numpy==1.19.5
  Downloading numpy-1.19.5-cp37-cp37m-manylinux2010_x86_64.whl (14.8 MB)
[K     |████████████████████████████████| 14.8 MB 8.9 MB/s eta 0:00:01
[?25hCollecting pandas-profiling>=2.8.0
  Downloading pandas_profiling-3.0.0-py2.py3-none-any.whl (248 kB)
[K     |████████████████████████████████| 248 kB 7.8 MB/s eta 0:00:01
Collecting pyod
  Downloading pyod-0.9.1.tar.gz (105 kB)
[K     |████████████████████████████████| 105 kB 9.3 MB/s eta 0:00:01
Collecting alembic<=1.4.1
  Downloading alembic-1.4.1.tar.gz (1.1 MB)

Collecting typing-extensions>=3.7.4.3
  Downloading typing_extensions-3.10.0.0-py3-none-any.whl (26 kB)
Collecting multimethod==1.4
  Downloading multimethod-1.4-py2.py3-none-any.whl (7.3 kB)


Building wheels for collected packages: pyod, alembic, prometheus-flask-exporter, databricks-cli
  Building wheel for pyod (setup.py) ... [?25ldone
[?25h  Created wheel for pyod: filename=pyod-0.9.1-py3-none-any.whl size=123038 sha256=7f40cb8d838d653601bf5095e31e8b4543a1364eab8e75e18a3903ebbaf77580
  Stored in directory: /root/.cache/pip/wheels/4f/18/13/eb2a2aafe68004098ade2032e72aad192643f6440462c4de19
  Building wheel for alembic (setup.py) ... [?25ldone
[?25h  Created wheel for alembic: filename=alembic-1.4.1-py2.py3-none-any.whl size=158154 sha256=9cb27d7e2b36e98dfcb6c8dbff986fa6c1617d89d90fc9353060c100dd3c917c
  Stored in directory: /root/.cache/pip/wheels/be/5d/0a/9e13f53f4f5dfb67cd8d245bb7cdffe12f135846f491a283e3
  Building wheel for prometheus-flask-exporter (setup.py) ... [?25ldone
[?25h  Created wheel for prometheus-flask-exporter: filename=prometheus_flask_exporter-0.18.2-py3-none-any.whl size=17398 sha256=d073fffe6c8c6c23ef0e6ad65e2d5a61ab65f681101faa36219699ce7b0d0cc

In [12]:
X_train['target'] = y_train

In [13]:
exp = setup(data=X_train, target='target', session_id=123, use_gpu=True)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,target
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(560000, 70)"
5,Missing Values,False
6,Numeric Features,69
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [14]:
best_top5_model = compare_models(sort='AUC', n_select=5)
best_model = best_top5_model[0]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8098,0.8756,0.6995,0.8972,0.7861,0.6196,0.6352,488.992
lda,Linear Discriminant Analysis,0.8099,0.8756,0.7037,0.893,0.7871,0.6197,0.634,2.033
catboost,CatBoost Classifier,0.8097,0.8754,0.6994,0.897,0.786,0.6193,0.6349,10.71
lightgbm,Light Gradient Boosting Machine,0.8098,0.8752,0.6974,0.8993,0.7856,0.6196,0.6358,10.976
ada,Ada Boost Classifier,0.8093,0.8749,0.6961,0.8995,0.7848,0.6186,0.635,111.927
xgboost,Extreme Gradient Boosting,0.8068,0.8716,0.7,0.8899,0.7836,0.6136,0.628,10.739
et,Extra Trees Classifier,0.8079,0.871,0.6966,0.8958,0.7837,0.6158,0.6316,10.265
rf,Random Forest Classifier,0.808,0.8697,0.6975,0.895,0.784,0.6159,0.6315,14.669
nb,Naive Bayes,0.4999,0.7955,0.8952,0.4992,0.5999,0.0005,0.0032,0.287
qda,Quadratic Discriminant Analysis,0.5261,0.7608,0.9681,0.526,0.674,0.0529,0.057,0.642


In [15]:
print(best_top5_model)
predict_model(best_model);

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=123, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.8098,0.8758,0.7014,0.8961,0.7869,0.6197,0.6349


In [None]:
# stack models dynamically
stacker = stack_models(estimator_list = top5, meta_model = top5[0])
predict_model(stacker);

In [16]:
# # model = create_model('lr')
# tuned_model = tune_model(best_model)
# predict_model(tuned_model);

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8117,0.8778,0.6882,0.9134,0.785,0.6232,0.6431
1,0.8074,0.876,0.6827,0.9093,0.7799,0.6148,0.6348
2,0.8082,0.8739,0.6858,0.9076,0.7813,0.6162,0.6355
3,0.8091,0.8725,0.6877,0.9079,0.7826,0.6182,0.6372
4,0.8125,0.878,0.69,0.9135,0.7862,0.6249,0.6444
5,0.8041,0.8726,0.6778,0.9065,0.7756,0.6081,0.6284
6,0.8094,0.8735,0.6887,0.9074,0.7831,0.6186,0.6374
7,0.8107,0.8774,0.6875,0.9119,0.784,0.6213,0.641
8,0.8097,0.8734,0.6863,0.9108,0.7828,0.6193,0.639
9,0.8108,0.8766,0.6915,0.9078,0.785,0.6215,0.6399


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.8086,0.8753,0.6864,0.9089,0.7821,0.6173,0.6366


In [17]:
final_model = finalize_model(stacker)
print(final_model)
predict_model(final_model);

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.01, loss='deviance', max_depth=7,
                           max_features='sqrt', max_leaf_nodes=None,
                           min_impurity_decrease=0.05, min_impurity_split=None,
                           min_samples_leaf=2, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=140,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=123, subsample=0.35, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.8097,0.8772,0.6861,0.9119,0.783,0.6195,0.6394


In [18]:
test_predictions = predict_model(final_model, data=X_test)
test_predictions.head(10)

Unnamed: 0,default,trainer=exp001,trainer=exp002,Amplitude_channel_0,AndersonDarling_channel_0,Autocor_length_channel_0,Con_channel_0,FluxPercentileRatioMid20_channel_0,FluxPercentileRatioMid35_channel_0,FluxPercentileRatioMid50_channel_0,...,PairSlopeTrend_channel_2,PercentAmplitude_channel_2,PercentDifferenceFluxPercentile_channel_2,Q31_channel_2,Rcs_channel_2,Skew_channel_2,SmallKurtosis_channel_2,Std_channel_2,Label,Score
0,1.0,1.0,0.999995,1.245491e-20,1.0,37.0,0.004397,0.23314,0.410092,0.571434,...,-0.3,5083.149047,5870.19634,1.837661e-21,0.040239,-0.136037,0.805226,1.640129e-21,1,0.8567
1,0.949274,0.9293,0.941273,1.507626e-20,1.0,37.0,0.006351,0.198532,0.359877,0.524786,...,-0.1,47.964207,65.453515,3.0128709999999998e-21,0.03298,-0.017745,-0.769401,2.011631e-21,1,0.8014
2,0.299642,0.317033,0.323706,1.423221e-20,0.999999,36.0,0.044944,0.147675,0.268725,0.388563,...,-0.166667,-187.658314,-261.172741,2.683478e-21,0.034005,-0.004698,-0.494688,1.966682e-21,0,0.616
3,0.940216,0.97153,0.857765,1.114562e-20,1.0,35.0,0.019541,0.180043,0.315167,0.456299,...,-0.033333,112.974427,119.070032,1.909733e-21,0.037433,-0.054739,-0.001572,1.3981359999999999e-21,1,0.7944
4,0.096719,0.086012,0.088665,1.826976e-20,1.0,36.0,0.053493,0.122437,0.20807,0.390958,...,-0.1,82.176701,102.214637,3.970641e-21,0.036467,-0.02731,-0.383499,2.751361e-21,0,0.7419
5,0.083963,0.069213,0.094772,1.134843e-20,1.0,35.0,0.048119,0.13649,0.234846,0.370131,...,0.033333,-409.892261,-490.45066,3.034565e-21,0.03824,0.000303,-0.539766,2.083575e-21,0,0.7415
6,0.265098,0.221418,0.22204,1.1579099999999999e-20,1.0,35.0,0.027357,0.190709,0.322695,0.465799,...,0.033333,65.657631,79.625906,1.82594e-21,0.032359,-0.047572,-0.576621,1.239589e-21,0,0.6741
7,0.070537,0.050988,0.07692,1.395155e-20,1.0,35.0,0.043723,0.153672,0.276826,0.411214,...,0.1,297.580455,358.149333,2.224414e-21,0.036621,0.061415,-0.351115,1.567246e-21,0,0.7427
8,1.0,1.0,1.0,1.920512e-20,1.0,36.0,0.058622,0.152663,0.267893,0.392197,...,-0.166667,61.350979,62.158987,2.27775e-21,0.040153,-0.004881,-0.037544,1.639358e-21,1,0.857
9,0.097741,0.089601,0.115504,1.1503129999999999e-20,1.0,35.0,0.065706,0.130493,0.22156,0.321801,...,-0.233333,-148.701851,-186.550254,2.865774e-21,0.032666,0.056124,-0.607491,1.896375e-21,0,0.7392


In [19]:
sub = pd.read_csv('../data/input/sample_submission.csv')
sub['target'] = test_predictions['Score']
sub.to_csv('../data/output/submission/ensebmle.csv', index=None)
sub.head()

Unnamed: 0,id,target
0,00005bced6,0.8567
1,0000806717,0.8014
2,0000ef4fe1,0.616
3,00020de251,0.7944
4,00024887b5,0.7419


In [20]:
# !kaggle competitions submit g2net-gravitational-wave-detection -f ../data/output/submission/ensebmle.csv -m ""

In [21]:
# help(setup)