<a href="https://colab.research.google.com/github/antbartash/product_failure/blob/main/lgbm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading libraries and data <br>
Tree-based models do not require feature scaling. Since LightGBM handles missing values natively, we don't need to impute them ourselves, so KNN imputation is not applied. Therefore, the models will be built on unscaled variables.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from scipy.stats import uniform, randint

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the relative 'drive/MyDrive/...' paths read in the next cell
# presumably resolve against /content (the default Colab working dir) - confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the pre-split features and targets; the first CSV column is used
# as the row index in every file.
X_train, X_test, y_train, y_test = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'drive/MyDrive/product_failure/data/X_train.csv',
        'drive/MyDrive/product_failure/data/X_test.csv',
        'drive/MyDrive/product_failure/data/y_train.csv',
        'drive/MyDrive/product_failure/data/y_test.csv',
    )
]

Check the first 5 observations and the data shapes to make sure that the data was read correctly.

In [None]:
# Preview the first five rows of the training features
X_train.head(n=5)

Unnamed: 0,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,...,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,measurement_0_grouped,measurement_1_grouped,measurement_2_grouped
1630,A,107.53,material_7,material_8,9,5,16,2,4,,...,18.014,14.962,16.743,14.487,14.351,15.737,705.885,"(13.0, 29.0]","(-0.001, 4.0]","(3.0, 4.0]"
18030,D,128.99,material_7,material_5,6,6,3,9,5,17.242,...,18.873,11.384,15.002,,19.237,,841.809,"(-0.001, 3.0]","(7.0, 9.0]","(4.0, 5.0]"
26078,E,128.33,material_7,material_6,6,9,4,16,5,16.094,...,19.696,10.48,15.294,16.715,15.292,15.861,632.835,"(3.0, 4.0]","(13.0, 29.0]","(4.0, 5.0]"
19823,D,125.21,material_7,material_5,6,6,11,9,6,16.677,...,21.13,13.625,15.095,15.394,12.846,19.156,643.322,"(9.0, 11.0]","(7.0, 9.0]","(5.0, 6.0]"
15788,C,106.12,material_7,material_8,5,8,5,6,6,16.963,...,17.362,10.523,15.199,17.476,,15.472,545.206,"(4.0, 5.0]","(4.0, 6.0]","(5.0, 6.0]"


In [None]:
# Report the dimensions of every loaded frame (labels kept exactly as before)
for label, frame in (
    ("X_train.shape: ", X_train),
    ("y_train.shape: ", y_train),
    ("X_test.shape: ", X_test),
    ("y_test.shape:", y_test),
):
    print(label, frame.shape)

X_train.shape:  (19927, 27)
y_train.shape:  (19927, 1)
X_test.shape:  (6643, 27)
y_test.shape: (6643, 1)


In [None]:
# Flatten the single-column target frames into 1-D arrays
y_train, y_test = np.ravel(y_train), np.ravel(y_test)

# Confirm that the targets are now one-dimensional
for label, target in (("y_train.shape: ", y_train), ("y_test.shape:", y_test)):
    print(label, target.shape)

y_train.shape:  (19927,)
y_test.shape: (6643,)


# Encoding categorical features <br>
LightGBM supports categorical features, but they need to be encoded as integers. To achieve that, `OrdinalEncoder` will be used.

In [None]:
# Names of the categorical features, selected by their column positions
X_train.columns.take([0, 2, 3, 4, 5, 24, 25, 26])

Index(['product_code', 'attribute_0', 'attribute_1', 'attribute_2',
       'attribute_3', 'measurement_0_grouped', 'measurement_1_grouped',
       'measurement_2_grouped'],
      dtype='object')

In [None]:
# Encode the categorical features as ordinal codes and mark them as pandas
# categoricals so that LightGBM treats them as categorical features.

# positions of the categorical columns in X_train / X_test
cat_idx = [0, 2, 3, 4, 5, 24, 25, 26]
cat_cols = list(X_train.columns[cat_idx])
# every remaining column is passed through untouched, in its original order
num_cols = [name for name in X_train.columns if name not in cat_cols]

transformer = ColumnTransformer(
    [('encoder', OrdinalEncoder(), cat_idx)],
    remainder='passthrough'
)

# restore column names: ColumnTransformer emits the encoded columns first,
# followed by the passthrough columns
enc_columns = cat_cols + num_cols
X_train_enc = pd.DataFrame(transformer.fit_transform(X_train), columns=enc_columns)
X_test_enc = pd.DataFrame(transformer.transform(X_test), columns=enc_columns)

# change dtype of categorical features from float to category.
# DataFrame.astype returns new columns with the requested dtype; assigning
# through `.iloc[:, col] = ...` sets values in place on pandas >= 2.0 and would
# silently leave the columns as float64.
cat_dtypes = {name: 'category' for name in cat_cols}
X_train_enc = X_train_enc.astype(cat_dtypes)
X_test_enc = X_test_enc.astype(cat_dtypes)


# check if categorical features were encoded as integers
X_train_enc.head()

Unnamed: 0,product_code,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0_grouped,measurement_1_grouped,measurement_2_grouped,loading,measurement_0,...,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
0,0.0,1.0,2.0,3.0,0.0,2.0,0.0,3.0,107.53,16.0,...,19.361,11.316,16.426,18.014,14.962,16.743,14.487,14.351,15.737,705.885
1,3.0,1.0,0.0,1.0,1.0,0.0,5.0,4.0,128.99,3.0,...,18.963,10.382,15.601,18.873,11.384,15.002,,19.237,,841.809
2,4.0,1.0,1.0,1.0,3.0,3.0,2.0,4.0,128.33,4.0,...,19.338,11.776,18.614,19.696,10.48,15.294,16.715,15.292,15.861,632.835
3,3.0,1.0,0.0,1.0,1.0,9.0,5.0,5.0,125.21,11.0,...,19.947,10.667,15.419,21.13,13.625,15.095,15.394,12.846,19.156,643.322
4,2.0,1.0,2.0,0.0,2.0,4.0,3.0,5.0,106.12,5.0,...,18.009,11.512,,17.362,10.523,15.199,17.476,,15.472,545.206


In [None]:
# check features dtypes: the encoded categorical columns are expected to be
# 'category', all other columns float64
X_train_enc.dtypes

product_code             category
attribute_0              category
attribute_1              category
attribute_2              category
attribute_3              category
measurement_0_grouped    category
measurement_1_grouped    category
measurement_2_grouped    category
loading                   float64
measurement_0             float64
measurement_1             float64
measurement_2             float64
measurement_3             float64
measurement_4             float64
measurement_5             float64
measurement_6             float64
measurement_7             float64
measurement_8             float64
measurement_9             float64
measurement_10            float64
measurement_11            float64
measurement_12            float64
measurement_13            float64
measurement_14            float64
measurement_15            float64
measurement_16            float64
measurement_17            float64
dtype: object

# Baseline model <br>
Build a baseline LightGBM model and evaluate its performance on the train and test sets. AUC will be used as the performance metric.

In [None]:
# Baseline: LightGBM with default hyperparameters
model_baseline = LGBMClassifier(random_state=42)
model_baseline.fit(X_train_enc, y_train)

# ROC AUC measures how well the model ranks observations, so it has to be
# computed from the positive-class probabilities. Hard 0/1 labels from
# predict() collapse the ROC curve to a single point and understate the AUC.
train_scores = model_baseline.predict_proba(X_train_enc)[:, 1]
test_scores = model_baseline.predict_proba(X_test_enc)[:, 1]

print("Train set AUC: {}".format(roc_auc_score(y_train, train_scores)))
print("Test set AUC: {}".format(roc_auc_score(y_test, test_scores)))

Train set AUC: 0.5396319886765747
Test set AUC: 0.502658591887544


The AUC values indicate that the model may underfit the dataset. We can try to tune its parameters to improve the quality of the model.

# Tuning parameters values <br>
LGBM models have many parameters, and some of them may not have a significant influence on the quality of the model. Because of that, RandomizedSearchCV will be used at the start of the parameter tuning process.

Round 1

In [None]:
# Round 1: broad randomized search over the main LightGBM hyperparameters
lgbm = LGBMClassifier(random_state=42)

# learning_rate and n_estimators are sampled from scipy distributions, the
# other parameters from explicit candidate lists.
# Note: scipy's uniform(loc, scale) covers [loc, loc + scale],
# i.e. [0.001, 1.001] for learning_rate here.
distr = dict(
    learning_rate=uniform(0.001, 1),
    n_estimators=randint(50, 1000),
    max_depth=[-1, 10, 6, 3, 1],
    reg_alpha=[0, 0.5, 1],
    reg_lambda=[0, 0.5, 1],
    colsample_bytree=[0.8, 0.9, 1],
)

# 500 sampled candidates, each scored by 5-fold cross-validated ROC AUC
clf = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=distr,
    n_iter=500,
    cv=5,
    scoring='roc_auc',
    random_state=42,
    verbose=1,
)
clf.fit(X_train_enc, y_train)

print('Best score: ', clf.best_score_)
print('Best params: ', clf.best_params_)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
Best score:  0.584960624457363
Best params:  {'colsample_bytree': 1, 'learning_rate': 0.010485433628498075, 'max_depth': 1, 'n_estimators': 526, 'reg_alpha': 0.5, 'reg_lambda': 0}


As a result of the randomized search, the lowest value of reg_lambda and the highest possible value of colsample_bytree were found to be optimal. The optimal learning_rate is about 0.01 with n_estimators=526, max_depth=1, reg_alpha=0.5. <br>
In the second round we'll test parameter values that are close to the values found to be optimal in the first round.

In [None]:
# Round 2: exhaustive grid around the best values found by the randomized search
grid = dict(
    learning_rate=[0.005, 0.01, 0.015, 0.02],
    n_estimators=[600, 500, 400, 300],
    max_depth=[3, 2, 1],
    reg_alpha=[0.3, 0.4, 0.5, 0.6],
    reg_lambda=[0, 0.1, 0.2],
    colsample_bytree=[1],
)

# every grid combination is scored by 5-fold cross-validated ROC AUC
clf = GridSearchCV(
    estimator=lgbm,
    param_grid=grid,
    cv=5,
    scoring='roc_auc',
    verbose=1,
)
clf.fit(X_train_enc, y_train)

print('Best score: ', clf.best_score_)
print('Best params: ', clf.best_params_)

Fitting 5 folds for each of 576 candidates, totalling 2880 fits
Best score:  0.585539577801774
Best params:  {'colsample_bytree': 1, 'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 400, 'reg_alpha': 0.4, 'reg_lambda': 0.1}


The optimal parameter values for the LightGBM model are the following:
* n_estimators: 400
* learning_rate: 0.01
* max_depth: 2
* reg_alpha: 0.4
* reg_lambda: 0.1
* colsample_bytree: 1 <br>
With these parameter values, a cross-validated AUC of 0.5855396 was achieved.