In [None]:
!pip install catboost
!pip install optuna

In [None]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import VotingRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import roc_auc_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn import linear_model
from sklearn import ensemble
from sklearn import svm
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import optuna

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the training set, the test set and the sample submission from the
# GitHub mirror of the competition files.
train, test, sub = (
    pd.read_csv(url)
    for url in (
        'https://raw.githubusercontent.com/AkemiRiemann/Machine/main/train.csv',
        'https://raw.githubusercontent.com/AkemiRiemann/Machine/main/test.csv',
        'https://raw.githubusercontent.com/AkemiRiemann/Machine/main/sample_submission.csv',
    )
)

# Separate the label from the training frame and discard the row identifier,
# so that `train` and `test` hold feature columns only.
target = train.pop('Machine failure')
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

In [None]:
# Learn the 'Type' categories from the training data only, then map the column
# to ordinal codes in both frames with the same fitted encoder.
enc = OrdinalEncoder().fit(train[['Type']])
for frame in (train, test):
    frame['Type'] = enc.transform(frame[['Type']])

In [None]:
# Mapping from the raw sensor column names (with units in brackets) to
# identifier-friendly names used in the rest of the notebook.
cols = {
    'Air temperature [K]': 'Air_Temperature',
    'Process temperature [K]': 'Process_Temperature',
    'Rotational speed [rpm]': 'Rotational_Speed',
    'Torque [Nm]': 'Torque',
    'Tool wear [min]': 'Tool_Wear',
}

# Apply the same renaming to both frames so their columns stay aligned.
train, test = (frame.rename(columns=cols) for frame in (train, test))

In [None]:
# Untuned baseline classifiers, keyed by display name, for a quick
# cross-validated comparison.
# The estimators are built from the classes imported at the top of the
# notebook; the module aliases `xgb`, `lgbm` and `cb` are never imported, so
# referring to them raises NameError on a fresh kernel.
models = {
    # Other candidates that were tried and are currently disabled:
    #'logistic_regression': linear_model.LogisticRegression(),
    #'lasso': linear_model.Lasso(),
    #'ridge': linear_model.RidgeClassifier(),
    #'svc': svm.SVC(probability=True),
    #'randomforest': ensemble.RandomForestClassifier(),
    #'gradientboosting': ensemble.GradientBoostingClassifier(),
    'xgboost': XGBClassifier(),
    'lightgbm': LGBMClassifier(),
    'catboost' : CatBoostClassifier(iterations=300, depth=6, silent=True)
}

In [None]:
# 5-fold cross-validated ROC AUC for every baseline model.
# Only numeric columns are used here: `train` still carries 'Product ID',
# which is treated as a categorical feature later in the notebook and is
# presumably a raw string identifier that these untuned estimators cannot fit
# directly. If every column is already numeric this selection changes nothing.
baseline_features = train.select_dtypes(include="number")

for name, model in models.items():
    print(name, ':', cross_val_score(model, baseline_features, target, cv=5, scoring="roc_auc"))

xgboost : [0.96517917 0.94718261 0.96481926 0.96224498 0.96222138]
lightgbm : [0.96533306 0.95740686 0.96367054 0.96389846 0.96565686]
catboost : [0.96581758 0.94584831 0.95783394 0.96192086 0.9631287 ]


In [None]:
# Continuous sensor readings that get robust scaling; the remaining columns
# are left untouched.
scaler_column = [
    "Air_Temperature",
    "Process_Temperature",
    "Rotational_Speed",
    "Torque",
    "Tool_Wear",
]

# Fit the scaling statistics on the training data only and reuse them for the
# test data, so both frames are transformed identically.
scaler = RobustScaler().fit(train[scaler_column])
train[scaler_column] = scaler.transform(train[scaler_column])
test[scaler_column] = scaler.transform(test[scaler_column])

In [None]:
# Hyperparameters for the tuned LightGBM classifier (binary objective,
# log-loss metric, row and column subsampling, fixed seed).
lgbm_params = dict(
    n_estimators=1100,
    num_leaves=12,
    max_depth=8,
    min_child_samples=12,
    learning_rate=0.044,
    colsample_bytree=0.8,  # 0.87 was also tried
    reg_alpha=0.001,
    reg_lambda=0.002,
    subsample=0.7,
    objective='binary',
    metric='binary_logloss',
    boosting_type='gbdt',
    max_bin=1000,
    n_jobs=-1,
    bagging_freq=1,
    random_state=42,
)

# Hyperparameters for the tuned CatBoost classifier.
# The model is trained on Logloss and monitored with AUC on the eval set.
# `objective` is CatBoost's alias for `loss_function`, so the two must not be
# given together with different values; the former "loss_function": "AUC"
# entry conflicted with 'objective': 'Logloss' and has been removed. AUC is
# kept only as the evaluation metric.
cb_params = {
    'n_estimators': 1200,
    'depth': 13,
    'learning_rate': 0.0633180843164835,
    'random_strength': 0.22,
    'grow_policy': 'Lossguide',
    'bootstrap_type': 'Bayesian',
    'objective':'Logloss',
    'eval_metric': "AUC",
    'l2_leaf_reg': 3.0,
    'min_child_samples': 3,
    'random_state': 42,
    'silent': True
}

# Hyperparameters for the tuned XGBoost classifier.
# NOTE(review): 'gpu_hist' / 'gpu_predictor' request GPU training; confirm
# that the runtime has a GPU and that the installed XGBoost version still
# accepts these settings.
xgb_params = dict(
    reg_alpha=0.18727857702097278,
    reg_lambda=0.77217672456579,
    learning_rate=0.043011675696849064,
    max_depth=15,
    subsample=0.8370545840097189,
    n_estimators=1500,
    gamma=0,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
)

# (name, estimator) pairs for the tuned models, in the format expected by
# scikit-learn voting ensembles. Note that this rebinds `models`, which was a
# dict of baseline estimators earlier in the notebook.
# The estimators are built from the classes imported at the top of the
# notebook; the module aliases `xgb`, `lgbm` and `cb` are never imported, so
# referring to them raises NameError on a fresh kernel.
models = [
    ('xgb', XGBClassifier(**xgb_params)),
    ('lgb', LGBMClassifier(**lgbm_params)),
    ('cb', CatBoostClassifier(**cb_params))
]

In [None]:
# 5-fold cross-validation of the tuned CatBoost model, scored with ROC AUC on
# both the training part and the held-out part of every fold.
kf = KFold(n_splits=5, shuffle=True, random_state = 13)
train_roc, valid_roc = [], []

# Pool metadata is identical for every fold, so build it once.
cb_feature_names = train.columns.tolist()
cb_cat_features = ['Product ID', 'Type']

for i, (train_index, valid_index) in enumerate(kf.split(train, target)):
    print(f"Fold {i}:")
    # KFold.split yields positional indices, so select rows with .iloc; .loc
    # only gives the same rows while the frame has a default RangeIndex.
    X_train = train.iloc[train_index].to_numpy()
    y_train = target.iloc[train_index].to_numpy()

    X_valid = train.iloc[valid_index].to_numpy()
    y_valid = target.iloc[valid_index].to_numpy()

    pool_train = Pool(X_train, y_train, feature_names = cb_feature_names, cat_features = cb_cat_features)
    pool_valid = Pool(X_valid, y_valid, feature_names = cb_feature_names, cat_features = cb_cat_features)

    # `cb` is rebound on every fold; after the loop it holds the model fitted
    # on the last fold, which is the one later cells use for prediction.
    cb = CatBoostClassifier(**cb_params)
    cb.fit(pool_train, eval_set=pool_valid)

    # I gave up using VotingClassifier because it did not perform as expected.

    train_pred = cb.predict_proba(pool_train)[:,1]
    valid_pred = cb.predict_proba(pool_valid)[:,1]

    train_roc_score = roc_auc_score(y_train, train_pred)
    valid_roc_score = roc_auc_score(y_valid, valid_pred)

    train_roc.append(train_roc_score)
    valid_roc.append(valid_roc_score)

    # Running mean ± std over the folds seen so far. The labels now match the
    # lists they report and name the metric that is actually computed.
    print(f'Train ROC AUC: {np.mean(train_roc):.5f} ± {np.std(train_roc):.5f} | Valid ROC AUC: {np.mean(valid_roc):.5f} ± {np.std(valid_roc):.5f}')
    print('')

# Report the plain mean of the validation scores with the spread shown
# separately; the previous expression added the standard deviation to every
# score before averaging, which inflated the reported value.
print(f"Average valid ROC AUC: {np.mean(valid_roc):.5f} ± {np.std(valid_roc):.5f}")

Fold 0:
Valid MAE Score: 0.99099 ± 0.00000 | Train MAE Score: 0.98112 ± 0.00000

Fold 1:
Valid MAE Score: 0.99216 ± 0.00117 | Train MAE Score: 0.97785 ± 0.00327

Fold 2:
Valid MAE Score: 0.99242 ± 0.00103 | Train MAE Score: 0.97800 ± 0.00268

Fold 3:
Valid MAE Score: 0.99215 ± 0.00101 | Train MAE Score: 0.97626 ± 0.00380

Fold 4:
Valid MAE Score: 0.99332 ± 0.00250 | Train MAE Score: 0.97612 ± 0.00341

Average MAE Score: 0.97953


In [None]:
# Wrap the test features in a CatBoost Pool, reusing the training column
# names and the same categorical feature declaration as the training pools.
test_feature_names = list(train.columns)
pool_test = Pool(
    test,
    cat_features=['Product ID', 'Type'],
    feature_names=test_feature_names,
)

In [None]:
# Predict class probabilities for the test pool with the current `cb` model
# and keep the second column, i.e. the probability of the positive class.
test_proba = cb.predict_proba(pool_test)
y_test_pred = test_proba[:, 1]

In [None]:
# Store the predicted failure probabilities in the submission frame and write
# it to disk without the pandas index column.
submission_path = 'cb_pool.csv'
sub['Machine failure'] = y_test_pred
sub.to_csv(submission_path, index=False)