In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import early_stopping,log_evaluation, Dataset
import lightgbm as lgb
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn import set_config
import warnings
import optuna
from sklearn.preprocessing import PolynomialFeatures,OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from scipy.stats import mode
from sklearn.preprocessing import LabelEncoder
from pprint import pprint

# --- Notebook-wide configuration -------------------------------------------
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Plot styling: white background with the viridis palette; `pal` keeps the
# palette colours around for explicit use in later plots.
sns.set_theme(style = 'white', palette = 'viridis')
pal = sns.color_palette('viridis')

# Show up to 100 rows when a frame is rendered.
pd.options.display.max_rows = 100
# (sklearn's set_config(transform_output = 'pandas') is intentionally left off.)
# Disable pandas' chained-assignment check (no SettingWithCopyWarning).
pd.set_option('mode.chained_assignment', None)

In [2]:
# Read the labelled training set in a single pass (low_memory=False) and
# preview its first five rows.
data = pd.read_csv(filepath_or_buffer='train.csv', low_memory=False)
data.head(n=5)

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,0,584,590,909972,909977,16,8,5,2274,113,...,-0.5,-0.0104,0.1417,0,0,0,1,0,0,0
1,1,808,816,728350,728372,433,20,54,44478,70,...,0.7419,-0.2997,0.9491,0,0,0,0,0,0,1
2,2,39,192,2212076,2212144,11388,705,420,1311391,29,...,-0.0105,-0.0944,1.0,0,0,1,0,0,0,0
3,3,781,789,3353146,3353173,210,16,29,3202,114,...,0.6667,-0.0402,0.4025,0,0,1,0,0,0,0
4,4,1540,1560,618457,618502,521,72,67,48231,82,...,0.9158,-0.2455,0.9998,0,0,0,0,0,0,1


In [3]:
# The seven binary fault indicator columns that make up the prediction target.
label_cols = ['Pastry','Z_Scratch','K_Scatch','Stains','Dirtiness','Bumps','Other_Faults']

# Keep only rows with at most one fault flag set; rows carrying several labels
# are dropped. `.copy()` detaches the result from the original frame so that
# the columns added in later cells are written to an independent DataFrame
# rather than to a slice (which only works silently because the
# chained-assignment check is disabled in the config cell).
data = data[data[label_cols].sum(axis=1) <= 1].copy()

In [4]:
# Read the unlabelled test set in a single pass (low_memory=False) and
# preview its first five rows.
test = pd.read_csv(filepath_or_buffer='test.csv', low_memory=False)
test.head(n=5)

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,...,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas
0,19219,1015,1033,3826564,3826588,659,23,46,62357,67,...,0.0095,0.5652,1.0,1.0,2.841,1.1139,1.6628,0.6727,-0.2261,0.9172
1,19220,1257,1271,419960,419973,370,26,28,39293,92,...,0.0047,0.2414,1.0,1.0,2.5682,0.9031,1.4472,0.9063,-0.1453,0.9104
2,19221,1358,1372,117715,117724,289,36,32,29386,101,...,0.0155,0.6,0.75,0.0,2.4609,1.3222,1.3222,-0.5238,-0.0435,0.6514
3,19222,158,168,232415,232440,80,10,11,8586,107,...,0.0037,0.8,1.0,1.0,1.9031,0.699,1.0414,0.1818,-0.0738,0.2051
4,19223,559,592,544375,544389,140,19,15,15524,103,...,0.0158,0.8421,0.5333,0.0,2.1461,1.3222,1.1461,-0.5714,-0.0894,0.417


In [5]:
def add_engineered_features(df):
    """Return a copy of ``df`` with five derived geometry/luminosity columns.

    Added columns (appended in this order):
        XRange               -- X_Maximum - X_Minimum
        YRange               -- Y_Maximum - Y_Minimum
        Area_Perimeter_Ratio -- Pixels_Areas / (X_Perimeter + Y_Perimeter)
        Luminosity_Range     -- Maximum_of_Luminosity - Minimum_of_Luminosity
        Average_Luminosity   -- mean of Maximum_of_Luminosity and Minimum_of_Luminosity

    The input frame is not modified, so the cell can be re-run safely.
    """
    # A zero perimeter sum would turn the ratio into +/-inf; map the
    # denominator to NaN instead so the ratio becomes a missing value.
    perimeter = (df['X_Perimeter'] + df['Y_Perimeter']).replace(0, np.nan)
    return df.assign(
        XRange=df["X_Maximum"] - df["X_Minimum"],
        YRange=df["Y_Maximum"] - df["Y_Minimum"],
        Area_Perimeter_Ratio=df['Pixels_Areas'] / perimeter,
        Luminosity_Range=df['Maximum_of_Luminosity'] - df['Minimum_of_Luminosity'],
        Average_Luminosity=(df['Maximum_of_Luminosity'] + df['Minimum_of_Luminosity']) / 2,
    )


# Apply exactly the same transformation to both frames so train and test
# features can never drift apart.
data = add_engineered_features(data)
test = add_engineered_features(test)

In [6]:
# A cell only renders its final expression, so a bare `data.head()` on the
# first line would be computed and thrown away. `display` (provided by the
# IPython kernel) renders the training preview explicitly; the test preview
# stays as the cell's final expression.
display(data.head())
test.head()

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,...,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,XRange,YRange,Area_Perimeter_Ratio,Luminosity_Range,Average_Luminosity
0,19219,1015,1033,3826564,3826588,659,23,46,62357,67,...,1.1139,1.6628,0.6727,-0.2261,0.9172,18,24,9.550725,60,97.0
1,19220,1257,1271,419960,419973,370,26,28,39293,92,...,0.9031,1.4472,0.9063,-0.1453,0.9104,14,13,6.851852,40,112.0
2,19221,1358,1372,117715,117724,289,36,32,29386,101,...,1.3222,1.3222,-0.5238,-0.0435,0.6514,14,9,4.25,33,117.5
3,19222,158,168,232415,232440,80,10,11,8586,107,...,0.699,1.0414,0.1818,-0.0738,0.2051,10,25,3.809524,33,123.5
4,19223,559,592,544375,544389,140,19,15,15524,103,...,1.3222,1.1461,-0.5714,-0.0894,0.417,33,14,4.117647,31,118.5


In [7]:
# Print the column list with non-null counts and dtypes for the filtered,
# feature-augmented training frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19198 entries, 0 to 19218
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     19198 non-null  int64  
 1   X_Minimum              19198 non-null  int64  
 2   X_Maximum              19198 non-null  int64  
 3   Y_Minimum              19198 non-null  int64  
 4   Y_Maximum              19198 non-null  int64  
 5   Pixels_Areas           19198 non-null  int64  
 6   X_Perimeter            19198 non-null  int64  
 7   Y_Perimeter            19198 non-null  int64  
 8   Sum_of_Luminosity      19198 non-null  int64  
 9   Minimum_of_Luminosity  19198 non-null  int64  
 10  Maximum_of_Luminosity  19198 non-null  int64  
 11  Length_of_Conveyer     19198 non-null  int64  
 12  TypeOfSteel_A300       19198 non-null  int64  
 13  TypeOfSteel_A400       19198 non-null  int64  
 14  Steel_Plate_Thickness  19198 non-null  int64  
 15  Ed

In [8]:
class Model:
    """Repeated multilabel-stratified cross-validation around ``XGBClassifier``.

    The fault indicator columns in ``TARGET_COLS`` are fitted jointly as one
    multi-label target. Every column of ``train`` that is neither a target nor
    listed in ``DROP_COLS`` is used as a feature.

    Attributes:
        train: labelled frame holding the feature, ``id`` and target columns.
        test: frame to predict on; must contain the same feature columns.
        seed: base seed for fold splitting, model seeds and the Optuna sampler.
        model_dict: maps ``'fold_<k>'`` to the model fitted on that fold by
            ``fit``. Keys do not include the repeat number, so each repeat
            overwrites the previous one and only the last repeat's models
            are retained.
        test_predict_list: one test-set probability array per fold fitted by
            ``fit``. It is appended to and never cleared, so use a fresh
            ``Model`` for every parameter set.
    """

    TARGET_COLS = ['Pastry','Z_Scratch','K_Scatch','Stains','Dirtiness','Bumps','Other_Faults']
    DROP_COLS = ['id']

    def __init__(self, train, test, seed=0):
        self.train = train
        self.test = test
        self.seed = seed
        self.model_dict = dict()
        self.test_predict_list = list()

    def _feature_columns(self):
        """Return the training columns that are neither targets nor dropped."""
        excluded = set(self.TARGET_COLS) | set(self.DROP_COLS)
        return [col for col in self.train.columns.to_list() if col not in excluded]

    def _cross_validate(self, params, n_repeats, n_splits, early_stopping_rounds,
                        collect=False, verbose=False):
        """Run ``n_repeats`` x ``n_splits`` stratified CV; return one OOF AUC per repeat.

        Args:
            params: keyword arguments forwarded to ``XGBClassifier``.
            n_repeats: number of independent K-fold repetitions.
            n_splits: number of folds per repetition.
            early_stopping_rounds: default early-stopping patience; an
                ``early_stopping_rounds`` entry in ``params`` takes precedence.
            collect: when True, store each fold's model in ``self.model_dict``
                and append its test-set probabilities to
                ``self.test_predict_list``.
            verbose: when True, print the OOF score after every repeat.
        """
        feature_cols = self._feature_columns()
        X_all = self.train[feature_cols]
        y_all = self.train[self.TARGET_COLS]
        X_test = self.test[feature_cols] if collect else None

        # Early stopping is configured on the estimator itself: passing it to
        # ``fit()`` is deprecated in XGBoost and rejected by newer releases.
        model_kwargs = {'early_stopping_rounds': early_stopping_rounds, **params}

        scores = list()
        for repeat in range(n_repeats):
            # Seeded so that scores are reproducible and, during tuning, every
            # trial is evaluated on the same folds.
            mskf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True,
                                             random_state=self.seed + repeat)
            oof_valid_preds = np.zeros((X_all.shape[0], len(self.TARGET_COLS)))

            for fold, (train_idx, valid_idx) in enumerate(mskf.split(X_all, y_all)):
                X_train, y_train = X_all.iloc[train_idx], y_all.iloc[train_idx]
                X_valid, y_valid = X_all.iloc[valid_idx], y_all.iloc[valid_idx]

                # Distinct model seed for every (repeat, fold) pair.
                model = XGBClassifier(random_state=self.seed + repeat * n_splits + fold,
                                      **model_kwargs)
                model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

                oof_valid_preds[valid_idx] = model.predict_proba(X_valid)
                if collect:
                    self.test_predict_list.append(model.predict_proba(X_test))
                    self.model_dict[f'fold_{fold}'] = model

            oof_score = roc_auc_score(y_all, oof_valid_preds, multi_class="ovr")
            if verbose:
                print(f"The OOF auc score for iteration {repeat+1} is {oof_score}")
            scores.append(oof_score)
        return scores

    def fit(self, params, n_repeats=4, n_splits=25, early_stopping_rounds=250):
        """Cross-validate ``params`` and collect per-fold test predictions.

        Args:
            params: keyword arguments for ``XGBClassifier``.
            n_repeats: number of K-fold repetitions (default 4).
            n_splits: folds per repetition (default 25).
            early_stopping_rounds: early-stopping patience (default 250).

        Returns:
            ``(scores, test_predict_list)``: the OOF ROC-AUC of every repeat
            and the list of test-set probability arrays, one per fitted fold.
        """
        scores = self._cross_validate(params, n_repeats, n_splits, early_stopping_rounds,
                                      collect=True, verbose=True)
        return scores, self.test_predict_list

    def objective(self, trial):
        """Optuna objective: mean OOF ROC-AUC of a single 5-fold CV run.

        Only out-of-fold scores are needed while tuning, so no test-set
        predictions are computed and no models are retained.
        """
        params = {'grow_policy': 'depthwise',
                  'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
                  'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1, log=True),
                  'gamma': trial.suggest_float('gamma', 0.1, 1),
                  'subsample': trial.suggest_float('subsample', 0.5, 1),
                  'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1),
                  'max_depth': trial.suggest_int('max_depth', 5, 30),
                  'min_child_weight': trial.suggest_int('min_child_weight', 1, 25),
                  'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10, log=True),
                  'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10, log=True),
                  'booster': 'gbtree',
                  'verbosity': 0,
                  # 'device_type' is not an XGBoost parameter and was dropped;
                  # the search runs with the 'hist' tree method.
                  'tree_method': 'hist',
                  }

        scores = self._cross_validate(params, n_repeats=1, n_splits=5,
                                      early_stopping_rounds=100)
        return np.mean(np.array(scores))

    def find_params(self, n_trials=250):
        """Run an Optuna study maximising ``objective``; return the best parameters.

        Args:
            n_trials: number of trials to run (default 250).
        """
        study = optuna.create_study(direction='maximize',
                                    sampler=optuna.samplers.TPESampler(seed=self.seed))
        study.optimize(self.objective, n_trials=n_trials)

        best_params = study.best_params
        return best_params

In [25]:
# Hyper-parameter search (long-running). Bind the result to a name so the
# tuned parameters remain available to later cells instead of existing only
# as rendered cell output.
model = Model(data,test)
best_params = model.find_params()
best_params

[I 2024-03-13 09:52:51,840] A new study created in memory with name: no-name-bf6df6aa-f896-4a98-9162-bbc504aab681
[I 2024-03-13 09:53:01,634] Trial 0 finished with value: 0.8603110919972591 and parameters: {'n_estimators': 574, 'learning_rate': 0.8292481265017502, 'gamma': 0.19311772961450085, 'subsample': 0.8613314660590904, 'colsample_bytree': 0.8236034627043969, 'max_depth': 26, 'min_child_weight': 15, 'reg_lambda': 0.0015947802854123603, 'reg_alpha': 3.7962776205603535e-06}. Best is trial 0 with value: 0.8603110919972591.
[I 2024-03-13 09:53:27,000] Trial 1 finished with value: 0.8823628246495895 and parameters: {'n_estimators': 1425, 'learning_rate': 0.08761640777194388, 'gamma': 0.8586303758602387, 'subsample': 0.9200702052439076, 'colsample_bytree': 0.44974301072199363, 'max_depth': 21, 'min_child_weight': 10, 'reg_lambda': 1.320124643866866e-06, 'reg_alpha': 0.002784359624916031}. Best is trial 1 with value: 0.8823628246495895.
[I 2024-03-13 09:54:19,945] Trial 2 finished with 

[I 2024-03-13 10:39:16,175] Trial 20 finished with value: 0.8798092090020335 and parameters: {'n_estimators': 503, 'learning_rate': 0.2029555314180289, 'gamma': 0.3286670188505891, 'subsample': 0.5325848649872309, 'colsample_bytree': 0.7045058619702104, 'max_depth': 8, 'min_child_weight': 17, 'reg_lambda': 0.011620108752380065, 'reg_alpha': 2.9947855773636326e-07}. Best is trial 16 with value: 0.8871243541369676.
[I 2024-03-13 10:41:49,063] Trial 21 finished with value: 0.8860772731112997 and parameters: {'n_estimators': 1141, 'learning_rate': 0.006591374666670015, 'gamma': 0.5511821464052484, 'subsample': 0.6356949890986726, 'colsample_bytree': 0.6384518243045629, 'max_depth': 11, 'min_child_weight': 7, 'reg_lambda': 0.03556071234771811, 'reg_alpha': 2.057901090150283e-08}. Best is trial 16 with value: 0.8871243541369676.
[I 2024-03-13 10:42:46,883] Trial 22 finished with value: 0.8867104186624529 and parameters: {'n_estimators': 1049, 'learning_rate': 0.020151588385550888, 'gamma': 0

[I 2024-03-13 11:19:42,231] Trial 40 finished with value: 0.8625081311205387 and parameters: {'n_estimators': 784, 'learning_rate': 0.001449720750591069, 'gamma': 0.20954967083892836, 'subsample': 0.5342544124539431, 'colsample_bytree': 0.6882652214652172, 'max_depth': 6, 'min_child_weight': 3, 'reg_lambda': 0.023124674955874724, 'reg_alpha': 9.497134208701699}. Best is trial 16 with value: 0.8871243541369676.
[I 2024-03-13 11:20:38,352] Trial 41 finished with value: 0.8859934440327665 and parameters: {'n_estimators': 1126, 'learning_rate': 0.016920193002379326, 'gamma': 0.25942053840674417, 'subsample': 0.5711918004610368, 'colsample_bytree': 0.5400119591789041, 'max_depth': 10, 'min_child_weight': 10, 'reg_lambda': 0.023135568756935957, 'reg_alpha': 5.8651702446688296e-08}. Best is trial 16 with value: 0.8871243541369676.
[I 2024-03-13 11:21:55,933] Trial 42 finished with value: 0.886991882797311 and parameters: {'n_estimators': 1244, 'learning_rate': 0.012426229826726531, 'gamma': 0

[I 2024-03-13 11:59:40,326] Trial 60 finished with value: 0.8804221966311067 and parameters: {'n_estimators': 1485, 'learning_rate': 0.15331151264193194, 'gamma': 0.8463505978236521, 'subsample': 0.6123189398011528, 'colsample_bytree': 0.36442479196184446, 'max_depth': 12, 'min_child_weight': 15, 'reg_lambda': 0.0001245242178438948, 'reg_alpha': 5.579983752654627e-07}. Best is trial 43 with value: 0.8873365282344767.
[I 2024-03-13 12:02:46,289] Trial 61 finished with value: 0.8869257006814646 and parameters: {'n_estimators': 1206, 'learning_rate': 0.004113927920703816, 'gamma': 0.2197874588922074, 'subsample': 0.6810608507366293, 'colsample_bytree': 0.48243333443010034, 'max_depth': 12, 'min_child_weight': 14, 'reg_lambda': 7.747794271875371e-06, 'reg_alpha': 1.845883848032154e-07}. Best is trial 43 with value: 0.8873365282344767.
[I 2024-03-13 12:04:15,910] Trial 62 finished with value: 0.8860427747486781 and parameters: {'n_estimators': 1182, 'learning_rate': 0.011266678629435086, 'g

KeyboardInterrupt: 

In [9]:
# Four hand-picked XGBClassifier configurations evaluated below.
# All run with XGBoost's default device settings (no GPU flags are set).

# Best configuration found by the Optuna search, at full precision.
params1 = dict(
    grow_policy='depthwise',
    n_estimators=1800,
    learning_rate=0.00676793896727872,
    gamma=0.4425433619561816,
    subsample=0.6782713902375049,
    colsample_bytree=0.38371870139739117,
    max_depth=5,
    min_child_weight=4,
    reg_lambda=1.7864788262454325e-06,
    reg_alpha=0.5400111178318557,
    booster='gbtree',
    verbosity=0,
)

# Same configuration as params1 with the values rounded.
params2 = dict(
    n_estimators=1800,
    learning_rate=0.006,
    gamma=0.44,
    subsample=0.7,
    colsample_bytree=0.38,
    max_depth=5,
    min_child_weight=4,
    reg_lambda=1.8e-06,
    reg_alpha=0.54,
    booster='gbtree',
    grow_policy='depthwise',
    verbosity=0,
)

# Deeper trees, stronger L2 regularisation, higher row subsampling.
params3 = dict(
    n_estimators=1235,
    learning_rate=0.008352405007099802,
    gamma=0.6499918347241912,
    subsample=0.9116532305497375,
    colsample_bytree=0.49334879814671045,
    max_depth=7,
    min_child_weight=1,
    reg_lambda=1.7005084366184795,
    reg_alpha=0.0059679946773570774,
)

# Shallow trees with a very large boosting-round budget.
params4 = dict(
    learning_rate=0.011889807711203291,
    reg_lambda=0.0104654434883515,
    reg_alpha=7.240532858657039e-06,
    subsample=0.9423567155919341,
    colsample_bytree=0.25200372739674615,
    max_depth=4,
    n_estimators=15000,
    tree_method='hist',
    booster='gbtree',
    gamma=0.026568352248073718,
    grow_policy='depthwise',
)


In [10]:
def fit_and_report(params):
    """Cross-validate one parameter set on a fresh ``Model`` and print its mean score.

    A new ``Model`` is built for every call because ``Model.fit`` appends its
    test predictions to the instance, so reusing one instance would mix the
    predictions of different parameter sets.

    Returns:
        ``(model, scores, preds)``: the fitted wrapper, the per-repeat OOF
        ROC-AUC scores and the list of per-fold test-set probability arrays.
    """
    cv_model = Model(data,test)
    cv_scores, test_preds = cv_model.fit(params)
    print(f'The average roc-auc score is {np.mean(cv_scores)}')
    return cv_model, cv_scores, test_preds


# One statement per parameter set: if a long run is interrupted, the results
# of the runs that already finished stay bound to their names.
model, scores1, preds1 = fit_and_report(params1)
model, scores2, preds2 = fit_and_report(params2)
model, scores3, preds3 = fit_and_report(params3)
model, scores4, preds4 = fit_and_report(params4)


The OOF auc score for iteration 1 is 0.88967585149105
The OOF auc score for iteration 2 is 0.8894738677958499
The OOF auc score for iteration 3 is 0.8895977504670556
The OOF auc score for iteration 4 is 0.8899075556942824
The average roc-auc score is 0.8896637563620595
The OOF auc score for iteration 1 is 0.8894864401876744
The OOF auc score for iteration 2 is 0.8897944132671504
The OOF auc score for iteration 3 is 0.8895637125307294
The OOF auc score for iteration 4 is 0.8896599965301409
The average roc-auc score is 0.8896261406289239
The OOF auc score for iteration 1 is 0.8892255637753733
The OOF auc score for iteration 2 is 0.8888296166853726
The OOF auc score for iteration 3 is 0.8885796506992409
The OOF auc score for iteration 4 is 0.8890157467047557
The average roc-auc score is 0.8889126444661857
The OOF auc score for iteration 1 is 0.8893747725057555


KeyboardInterrupt: 

In [11]:
# Average the per-fold test predictions of each parameter set, then blend the
# two sets with equal weight. Only params1 and params2 are blended here; to
# include further sets, average their `preds` lists the same way and add them
# to the blend.
predictions1 = np.mean(preds1,axis=0)
predictions2 = np.mean(preds2,axis=0)
predictions = (predictions1+predictions2)/2

# Guard against a silent mismatch between predictions, test rows and labels.
assert predictions.shape == (len(test), len(label_cols)), predictions.shape

submit = pd.DataFrame(predictions, columns=label_cols)
# Assign ids positionally: a plain Series assignment aligns on the index and
# would produce NaN ids if `test` were ever filtered or re-indexed.
submit['id'] = test['id'].to_numpy()
submit.to_csv('submission.csv',index=False)
submit

Unnamed: 0,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults,id
0,0.524939,0.001387,0.003156,0.000155,0.018526,0.178946,0.353649,19219
1,0.263710,0.019338,0.005684,0.000278,0.190273,0.191540,0.319063,19220
2,0.002285,0.030872,0.037802,0.000478,0.006693,0.317903,0.474316,19221
3,0.154613,0.002109,0.000675,0.001306,0.010979,0.388347,0.401031,19222
4,0.002510,0.002373,0.000847,0.003352,0.005766,0.605596,0.380000,19223
...,...,...,...,...,...,...,...,...
12809,0.065394,0.087221,0.003145,0.000176,0.019179,0.195208,0.374086,32028
12810,0.171199,0.004164,0.021707,0.007809,0.095875,0.212166,0.412548,32029
12811,0.000628,0.000670,0.929590,0.000226,0.000597,0.001184,0.055205,32030
12812,0.386315,0.009535,0.016834,0.000184,0.051069,0.171652,0.329820,32031
