# Import Libraries

In [73]:
import numpy as np
import pandas as pd

import optuna
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Data Loading

In [55]:
# train: training dataset
# test: testing dataset
# original: original dataset
train = pd.read_csv('Dataset/SteelPlateDefectPrediction/train.csv', encoding='utf-8')
test = pd.read_csv('Dataset/SteelPlateDefectPrediction/test.csv', encoding='utf-8')
submission = pd.read_csv('Dataset/SteelPlateDefectPrediction/sample_submission.csv', encoding='utf-8')
original = pd.read_csv('Dataset/SteelPlateDefectPrediction/faults.csv', encoding='utf-8')
TARGET_FEATURES = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains','Dirtiness', 'Bumps', 'Other_Faults']
train.shape, test.shape, original.shape

((19219, 35), (12814, 28), (1941, 34))

# Data Checking
* Check features
* Check null values
* Check data

In [41]:
train.columns

Index(['id', 'X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum',
       'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity',
       'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer',
       'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness',
       'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index',
       'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas',
       'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index',
       'SigmoidOfAreas', 'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
       'Dirtiness', 'Bumps', 'Other_Faults'],
      dtype='object')

In [42]:
# Check null values (training dataset)
train.isnull().sum()

id                       0
X_Minimum                0
X_Maximum                0
Y_Minimum                0
Y_Maximum                0
Pixels_Areas             0
X_Perimeter              0
Y_Perimeter              0
Sum_of_Luminosity        0
Minimum_of_Luminosity    0
Maximum_of_Luminosity    0
Length_of_Conveyer       0
TypeOfSteel_A300         0
TypeOfSteel_A400         0
Steel_Plate_Thickness    0
Edges_Index              0
Empty_Index              0
Square_Index             0
Outside_X_Index          0
Edges_X_Index            0
Edges_Y_Index            0
Outside_Global_Index     0
LogOfAreas               0
Log_X_Index              0
Log_Y_Index              0
Orientation_Index        0
Luminosity_Index         0
SigmoidOfAreas           0
Pastry                   0
Z_Scratch                0
K_Scatch                 0
Stains                   0
Dirtiness                0
Bumps                    0
Other_Faults             0
dtype: int64

In [43]:
# Check data
train.head(5).T

Unnamed: 0,0,1,2,3,4
id,0.0,1.0,2.0,3.0,4.0
X_Minimum,584.0,808.0,39.0,781.0,1540.0
X_Maximum,590.0,816.0,192.0,789.0,1560.0
Y_Minimum,909972.0,728350.0,2212076.0,3353146.0,618457.0
Y_Maximum,909977.0,728372.0,2212144.0,3353173.0,618502.0
Pixels_Areas,16.0,433.0,11388.0,210.0,521.0
X_Perimeter,8.0,20.0,705.0,16.0,72.0
Y_Perimeter,5.0,54.0,420.0,29.0,67.0
Sum_of_Luminosity,2274.0,44478.0,1311391.0,3202.0,48231.0
Minimum_of_Luminosity,113.0,70.0,29.0,114.0,82.0


# Data Preprocessing
* Combine training dataset and original dataset
* Drop duplicates

In [56]:
train.drop(columns=['id'], axis=1, inplace=True)
train = pd.concat([train, original], axis=0)
train = train.drop_duplicates()
train.reset_index(drop=True, inplace=True)

In [57]:
targets_bin = train[TARGET_FEATURES]
targets_bin[targets_bin.sum(axis=1)==2]

Unnamed: 0,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
634,0,0,1,0,0,0,1
752,0,0,1,0,0,0,1
3572,0,0,1,0,0,1,0
4416,0,0,1,0,0,0,1
4530,0,0,1,0,0,0,1
4664,0,0,1,0,0,0,1
6448,0,0,1,0,0,0,1
7589,0,0,1,0,0,0,1
8075,0,0,1,0,0,0,1
8538,0,0,1,0,0,0,1


In [58]:
train['Target'] = np.argmax(train[TARGET_FEATURES].values, axis=1) + 1
train.loc[train[TARGET_FEATURES].sum(axis=1) == 0, 'Target'] = 0
train.drop(TARGET_FEATURES, inplace=True,axis =1)

# XGBoost Baseline

In [59]:
X = train.drop(['Target'], axis=1)
y = train['Target']

In [60]:
X.shape

(21160, 27)

In [61]:
y.shape

(21160,)

In [63]:
test.drop(['id'], inplace=True, axis=1)

In [64]:
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
auc_scores = []
y_prob_test = []

for train_idx, valid_idx in cv.split(X, y):

    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

    model = XGBClassifier()
    model.fit(X_train_fold, y_train_fold)

    y_prob = model.predict_proba(X_valid_fold)
    
    y_prob_test.append(model.predict_proba(test))

    average_auc = roc_auc_score(targets_bin.iloc[valid_idx], y_prob[:, 1:], multi_class="ovr", average="macro")
    auc_scores.append(average_auc)

print(np.mean(auc_scores))
y_prob_test_array = np.array(y_prob_test)
pred = np.mean(y_prob_test_array, axis=0)

0.8907445225714771


In [68]:
submission.iloc[:, 1:] = pred[:,1:]
submission.to_csv('Dataset/SteelPlateDefectPrediction/Prediction/xgb_submission_Baseline.csv', index=False)

# LightGBM Baseline

In [70]:
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
auc_scores = []
y_prob_test = []

for train_idx, valid_idx in cv.split(X, y):

    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

    model = LGBMClassifier()
    model.fit(X_train_fold, y_train_fold)

    y_prob = model.predict_proba(X_valid_fold)

    y_prob_test.append(model.predict_proba(test))

    average_auc = roc_auc_score(targets_bin.iloc[valid_idx], y_prob[:, 1:], multi_class="ovr", average="macro")
    auc_scores.append(average_auc)

print(np.mean(auc_scores))
y_prob_test_array = np.array(y_prob_test)
pred = np.mean(y_prob_test_array, axis=0)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001303 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5258
[LightGBM] [Info] Number of data points in the train set: 19044, number of used features: 27
[LightGBM] [Info] Start training from score -3.251919
[LightGBM] [Info] Start training from score -2.567631
[LightGBM] [Info] Start training from score -2.759443
[LightGBM] [Info] Start training from score -1.711281
[LightGBM] [Info] Start training from score -3.498400
[LightGBM] [Info] Start training from score -3.668299
[LightGBM] [Info] Start training from score -1.410530
[LightGBM] [Info] Start training from score -1.076335
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001860 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5262
[LightGBM] [Info] Number of data points in the train set: 19044, number of used fea

In [71]:
submission.iloc[:, 1:] = pred[:,1:]
submission.to_csv('Dataset/SteelPlateDefectPrediction/Prediction/lgbm_submission_Baseline.csv', index=False)

# CatBoost Baseline

In [74]:
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
auc_scores = []
y_prob_test = []

for train_idx, valid_idx in cv.split(X, y):

    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

    model = CatBoostClassifier()
    model.fit(X_train_fold, y_train_fold)

    y_prob = model.predict_proba(X_valid_fold)

    y_prob_test.append(model.predict_proba(test))

    average_auc = roc_auc_score(targets_bin.iloc[valid_idx], y_prob[:, 1:], multi_class="ovr", average="macro")
    auc_scores.append(average_auc)

print(np.mean(auc_scores))
y_prob_test_array = np.array(y_prob_test)
pred = np.mean(y_prob_test_array, axis=0)

Learning rate set to 0.091958
0:	learn: 1.8938123	total: 78.3ms	remaining: 1m 18s
1:	learn: 1.7650493	total: 91.1ms	remaining: 45.5s
2:	learn: 1.6668859	total: 102ms	remaining: 34s
3:	learn: 1.5910022	total: 112ms	remaining: 27.9s
4:	learn: 1.5251807	total: 122ms	remaining: 24.3s
5:	learn: 1.4773204	total: 132ms	remaining: 21.9s
6:	learn: 1.4321445	total: 141ms	remaining: 20s
7:	learn: 1.3897762	total: 151ms	remaining: 18.8s
8:	learn: 1.3581875	total: 161ms	remaining: 17.7s
9:	learn: 1.3292293	total: 171ms	remaining: 17s
10:	learn: 1.3052890	total: 182ms	remaining: 16.3s
11:	learn: 1.2841066	total: 191ms	remaining: 15.8s
12:	learn: 1.2631310	total: 202ms	remaining: 15.3s
13:	learn: 1.2434835	total: 213ms	remaining: 15s
14:	learn: 1.2214273	total: 223ms	remaining: 14.6s
15:	learn: 1.2083184	total: 235ms	remaining: 14.4s
16:	learn: 1.1922937	total: 250ms	remaining: 14.5s
17:	learn: 1.1766067	total: 265ms	remaining: 14.5s
18:	learn: 1.1626525	total: 277ms	remaining: 14.3s
19:	learn: 1.151

In [75]:
submission.iloc[:, 1:] = pred[:,1:]
submission.to_csv('Dataset/SteelPlateDefectPrediction/Prediction/cat_submission_Baseline.csv', index=False)