In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file below the Kaggle input mount and echo its full path.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e3/sample_submission.csv
/kaggle/input/playground-series-s4e3/train.csv
/kaggle/input/playground-series-s4e3/test.csv
/kaggle/input/faulty-steel-plates/faults.csv


In [2]:
def _read_utf8_csv(csv_path):
    """Load one CSV file into a DataFrame, decoding it as UTF-8."""
    return pd.read_csv(csv_path, encoding='utf-8')


# Competition files (train / test / sample submission) plus the original dataset.
train_df = _read_utf8_csv('/kaggle/input/playground-series-s4e3/train.csv')
test_df = _read_utf8_csv('/kaggle/input/playground-series-s4e3/test.csv')
sub_df = _read_utf8_csv('/kaggle/input/playground-series-s4e3/sample_submission.csv')
original_df = _read_utf8_csv('/kaggle/input/faulty-steel-plates/faults.csv')

In [3]:
# Keep the test ids aside before the 'id' column is removed from both frames.
test_df_id = test_df['id']
train_df = train_df.drop(columns='id')
test_df = test_df.drop(columns='id')

# Stack the competition training rows on top of the original dataset.
combined_df = pd.concat([train_df, original_df], ignore_index=True)

In [4]:
# (rows, columns) of train_df stacked with original_df.
combined_df.shape

(21160, 34)

In [5]:
# Keep only the first occurrence of each fully identical row, then re-check the shape.
is_repeat_row = combined_df.duplicated()
combined_df = combined_df[~is_repeat_row]
combined_df.shape

(21160, 34)

In [6]:
# The seven label columns to predict (spelling follows the data's column names).
TARGET_COLUMNS = [
    'Pastry',
    'Z_Scratch',
    'K_Scatch',
    'Stains',
    'Dirtiness',
    'Bumps',
    'Other_Faults',
]

In [7]:
# Split the competition frame into the label matrix and the feature matrix.
# NOTE(review): combined_df (train + original data) is not used here; only
# train_df feeds the model. Confirm that this is intended.
y_train = train_df.loc[:, TARGET_COLUMNS]
X_train = train_df.drop(columns=TARGET_COLUMNS)

# X_test is the same object as test_df (no copy is made).
X_test = test_df

In [8]:
# Sanity check: shapes of the train features, test features and train labels.
X_train.shape, X_test.shape,y_train.shape

((19219, 27), (12814, 27), (19219, 7))

In [9]:
# Feature names are the column labels of X_train, in frame order.
features = X_train.columns.tolist()
features

['X_Minimum',
 'X_Maximum',
 'Y_Minimum',
 'Y_Maximum',
 'Pixels_Areas',
 'X_Perimeter',
 'Y_Perimeter',
 'Sum_of_Luminosity',
 'Minimum_of_Luminosity',
 'Maximum_of_Luminosity',
 'Length_of_Conveyer',
 'TypeOfSteel_A300',
 'TypeOfSteel_A400',
 'Steel_Plate_Thickness',
 'Edges_Index',
 'Empty_Index',
 'Square_Index',
 'Outside_X_Index',
 'Edges_X_Index',
 'Edges_Y_Index',
 'Outside_Global_Index',
 'LogOfAreas',
 'Log_X_Index',
 'Log_Y_Index',
 'Orientation_Index',
 'Luminosity_Index',
 'SigmoidOfAreas']

In [10]:
# Print per-column non-null counts and dtypes of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19219 entries, 0 to 19218
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   X_Minimum              19219 non-null  int64  
 1   X_Maximum              19219 non-null  int64  
 2   Y_Minimum              19219 non-null  int64  
 3   Y_Maximum              19219 non-null  int64  
 4   Pixels_Areas           19219 non-null  int64  
 5   X_Perimeter            19219 non-null  int64  
 6   Y_Perimeter            19219 non-null  int64  
 7   Sum_of_Luminosity      19219 non-null  int64  
 8   Minimum_of_Luminosity  19219 non-null  int64  
 9   Maximum_of_Luminosity  19219 non-null  int64  
 10  Length_of_Conveyer     19219 non-null  int64  
 11  TypeOfSteel_A300       19219 non-null  int64  
 12  TypeOfSteel_A400       19219 non-null  int64  
 13  Steel_Plate_Thickness  19219 non-null  int64  
 14  Edges_Index            19219 non-null  float64
 15  Em

In [12]:
# Columns treated as categorical; they are excluded from scaling further down.
cat_features = [
    'TypeOfSteel_A300',
    'TypeOfSteel_A400',
]

In [13]:
# Show how often each value of every categorical column occurs in the test set.
for flag_column in cat_features:
    flag_counts = X_test[flag_column].value_counts()
    print(flag_counts)

TypeOfSteel_A300
0    7622
1    5192
Name: count, dtype: int64
TypeOfSteel_A400
1    7610
0    5204
Name: count, dtype: int64


In [14]:
# Columns to scale: every feature except the categorical ones, in feature order.
scale_cols = [name for name in features if name not in cat_features]

In [15]:
# Min-max scale every column in scale_cols using statistics from X_train only,
# then apply the same shift and divisor to X_test so both frames share one mapping.
train_min = X_train[scale_cols].min()
train_span = X_train[scale_cols].max() - train_min

# A column that is constant in X_train has a span of zero; dividing by it would
# fill the column with NaN/inf. Fall back to a divisor of 1 for such columns
# (the train column then becomes all zeros).
train_span = train_span.where(train_span != 0, 1)

X_train[scale_cols] = (X_train[scale_cols] - train_min) / train_span
X_test[scale_cols] = (X_test[scale_cols] - train_min) / train_span

In [16]:
from xgboost import XGBClassifier

In [17]:
class Model:
    """Repeated multilabel-stratified cross-validation around ``XGBClassifier``.

    Attributes:
        train: training frame holding the feature columns together with the
            seven label columns listed in ``fit``.
        test: frame to predict on; it must contain every feature column of
            ``train``.
        model_dict: fitted models keyed ``'fold_<k>'``. The key does not include
            the repetition, so once ``fit`` returns only the models of the last
            repetition remain.
        test_predict_list: one test-set probability array per fitted model, in
            fitting order. It is never cleared, so calling ``fit`` again on the
            same instance keeps appending to it.
    """

    def __init__(self, train, test):
        self.train = train
        self.test = test
        self.model_dict = dict()
        self.test_predict_list = list()

    def fit(self,params):
        """Run 5 repetitions of 5-fold CV and collect scores and test predictions.

        Args:
            params: keyword arguments forwarded to ``XGBClassifier``. It must not
                contain ``random_state`` (that is set per fold here). It may
                contain ``early_stopping_rounds`` to override the default of 250.

        Returns:
            A tuple ``(scores, test_predict_list)``: the out-of-fold ROC AUC of
            each repetition, and ``self.test_predict_list`` (the same list
            object, not a copy).
        """
        label_columns = ['Pastry','Z_Scratch','K_Scatch','Stains','Dirtiness','Bumps','Other_Faults']

        train_cols = [col for col in self.train.columns.to_list() if col not in label_columns]

        # These slices do not change between folds, so take them once.
        features_df = self.train[train_cols]
        labels_df = self.train[label_columns]
        test_features = self.test[train_cols]

        # Early stopping is configured on the estimator: XGBoost has deprecated
        # fit(early_stopping_rounds=...) in favour of the constructor argument.
        # A value supplied through `params` takes precedence over the default.
        model_kwargs = {'early_stopping_rounds': 250, **params}

        scores = list()

        for i in range(5):
            # Seed the splitter with the repetition index: every run (and every
            # parameter set) sees the same folds, while each repetition still
            # gets a different partition.
            mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=i)
            oof_valid_preds = np.zeros((features_df.shape[0], len(label_columns)))

            for fold, (train_idx, valid_idx) in enumerate(mskf.split(features_df, labels_df)):
                X_train, y_train = features_df.iloc[train_idx], labels_df.iloc[train_idx]
                X_valid, y_valid = features_df.iloc[valid_idx], labels_df.iloc[valid_idx]

                # A distinct model seed for every (repetition, fold) pair.
                model = XGBClassifier(random_state=5*i+13*fold, **model_kwargs)
                model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

                # Fill this fold's rows of the out-of-fold matrix.
                oof_valid_preds[valid_idx] = model.predict_proba(X_valid)
                self.test_predict_list.append(model.predict_proba(test_features))
                self.model_dict[f'fold_{fold}'] = model
            oof_score = roc_auc_score(labels_df, oof_valid_preds, multi_class="ovr")
            print(f"The OOF auc score for iteration {i+1} is {oof_score}")
            scores.append(oof_score)
        return scores,self.test_predict_list

In [18]:
# Put features and labels side by side in one frame and label the columns
# explicitly: feature names first, then the target names.
X_train_ = pd.concat([X_train, y_train], axis=1).set_axis(features + TARGET_COLUMNS, axis=1)

In [19]:
%%capture
pip install iterative-stratification

In [20]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split


In [None]:
'''
# 0.8977401379416714
params1 = {'learning_rate': 0.005219114924743408, 
           'n_estimators': 845, 
           'reg_alpha': 4.1208708173103436e-07, 
           'reg_lambda': 7.383739881532837e-07, 
           'max_depth': 8, 
           'colsample_bytree': 0.7721306125437644, 
           'subsample': 0.8034210605432881, 
           'min_child_weight': 3
}

# 0.8977815340512961
params2 = {
    'learning_rate': 0.021238372596665726, 
    'n_estimators': 819, 
    'reg_alpha': 1.3552789792917862e-05, 
    'reg_lambda': 1.1928764306963244e-08, 
    'max_depth': 3, 
    'colsample_bytree': 0.9981971488939648, 
    'subsample': 0.8064267876137938, 
    'min_child_weight': 2
}

# 0.8978573594246791
params3 = {
    'learning_rate': 0.012777805624303109, 
    'n_estimators': 726, 
    'reg_alpha': 0.01050679869014073, 
    'reg_lambda': 2.424514529138804e-07, 
    'max_depth': 8, 
    'colsample_bytree': 0.7579961991392175, 
    'subsample': 0.7554753336650764, 
    'min_child_weight': 4
}

# 0.898744485456195
params4 = {
    'learning_rate': 0.010205134363734072,
    'n_estimators': 634, 
    'reg_alpha': 0.003964012396877028, 
    'reg_lambda': 1.2413530223279242e-07, 
    'max_depth': 9, 
    'colsample_bytree': 0.4836297966993244, 
    'subsample': 0.7291200081945567, 
    'min_child_weight': 5
}

# 0.898805334945633
params5 = {
    'learning_rate': 0.011751566020194271, 
    'n_estimators': 536, 
    'reg_alpha': 7.741987948617281e-05, 
    'reg_lambda': 2.4007011141974187e-07, 
    'max_depth': 8, 
    'colsample_bytree': 0.3703232993289221, 
    'subsample': 0.9132925438658243, 
    'min_child_weight': 5
}

# 0.898825313695891
params6 = {
    'learning_rate': 0.011851728060067561, 
    'n_estimators': 521, 
    'reg_alpha': 0.00010228678348252551,
    'reg_lambda': 1.6881416217009295e-07, 
    'max_depth': 6, 
    'colsample_bytree': 0.37782090107730304, 
    'subsample': 0.9025003211807061, 
    'min_child_weight': 5
}

# 0.8990034152716699
params7 = {
    'learning_rate': 0.011770354467970834, 
    'n_estimators': 505, 
    'reg_alpha': 0.002875633409957358, 
    'reg_lambda': 2.767980359261517e-07, 
    'max_depth': 8, 
    'colsample_bytree': 0.39240027531269284, 
    'subsample': 0.9383088435896358, 
    'min_child_weight': 5
}

# 0.8990614383722505
params8 = {
    'learning_rate': 0.013668191000455281, 
    'n_estimators': 614, 
    'reg_alpha': 4.540850025656145e-06, 
    'reg_lambda': 0.0028959885973129425,
    'max_depth': 7, 
    'colsample_bytree': 0.35906808942691126, 
    'subsample': 0.9467015479498415, 
    'min_child_weight': 5
}

# 0.8992612492589878
params9 = {
    'learning_rate': 0.013783280433915454, 
    'n_estimators': 615, 
    'reg_alpha': 6.627990245442222e-06,
    'reg_lambda': 0.003777510971731547, 
    'max_depth': 7, 
    'colsample_bytree': 0.34139037786138204, 
    'subsample': 0.9151254322057188, 
    'min_child_weight': 5
}

# 0.8994207353666404
params10 = {
    'learning_rate': 0.01653460618062825, 
    'n_estimators': 701, 
    'reg_alpha': 7.273708238403666e-08,
    'reg_lambda': 0.005460603687341128, 
    'max_depth': 5,
    'colsample_bytree': 0.35776753567288, 
    'subsample': 0.9532191723985007, 
    'min_child_weight': 5
}
'''

In [21]:
# XGBoost parameter sets used for the ensemble.
# Every set shares the same execution settings: histogram tree method on CUDA.
_GPU_SETTINGS = {'device': "cuda", 'tree_method': "hist"}

params_1 = dict(
    learning_rate=0.011770354467970834,
    n_estimators=505,
    reg_alpha=0.002875633409957358,
    reg_lambda=2.767980359261517e-07,
    max_depth=8,
    colsample_bytree=0.39240027531269284,
    subsample=0.9383088435896358,
    min_child_weight=5,
    **_GPU_SETTINGS,
)

params_2 = dict(
    learning_rate=0.013668191000455281,
    n_estimators=614,
    reg_alpha=4.540850025656145e-06,
    reg_lambda=0.0028959885973129425,
    max_depth=7,
    colsample_bytree=0.35906808942691126,
    subsample=0.9467015479498415,
    min_child_weight=5,
    **_GPU_SETTINGS,
)

params_3 = dict(
    learning_rate=0.013783280433915454,
    n_estimators=615,
    reg_alpha=6.627990245442222e-06,
    reg_lambda=0.003777510971731547,
    max_depth=7,
    colsample_bytree=0.34139037786138204,
    subsample=0.9151254322057188,
    min_child_weight=5,
    **_GPU_SETTINGS,
)

params_4 = dict(
    learning_rate=0.01653460618062825,
    n_estimators=701,
    reg_alpha=7.273708238403666e-08,
    reg_lambda=0.005460603687341128,
    max_depth=5,
    colsample_bytree=0.35776753567288,
    subsample=0.9532191723985007,
    min_child_weight=5,
    **_GPU_SETTINGS,
)

In [22]:
# Run the repeated cross-validation once per parameter set. Each run gets a
# fresh Model so that its test predictions are collected in a separate list.
cv_results = []
for params in (params_1, params_2, params_3, params_4):
    model = Model(X_train_,X_test)
    scores, test_preds = model.fit(params)
    mean_score = np.mean(scores)
    print(f'The average roc-auc score is {mean_score}')
    cv_results.append((mean_score, test_preds))

# Expose the results under the names the following cells read.
(score1, preds_1), (score2, preds_2), (score3, preds_3), (score4, preds_4) = cv_results

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




The OOF auc score for iteration 1 is 0.8874624542010338




The OOF auc score for iteration 2 is 0.8871333249395751




The OOF auc score for iteration 3 is 0.8877217106055787




The OOF auc score for iteration 4 is 0.8871870362762859




The OOF auc score for iteration 5 is 0.8876638449676238
The average roc-auc score is 0.8874336741980194




The OOF auc score for iteration 1 is 0.8875769296679269




The OOF auc score for iteration 2 is 0.8870109045856986




The OOF auc score for iteration 3 is 0.8874172902821208




The OOF auc score for iteration 4 is 0.8875066395217793




The OOF auc score for iteration 5 is 0.8873787138447339
The average roc-auc score is 0.8873780955804518




The OOF auc score for iteration 1 is 0.8868382958087261




The OOF auc score for iteration 2 is 0.8876676555310163




The OOF auc score for iteration 3 is 0.8876725257641764




The OOF auc score for iteration 4 is 0.8871948917890883




The OOF auc score for iteration 5 is 0.8883359895422339
The average roc-auc score is 0.8875418716870482




The OOF auc score for iteration 1 is 0.8884994187097898




The OOF auc score for iteration 2 is 0.8878209513356159




The OOF auc score for iteration 3 is 0.8883110053277969




The OOF auc score for iteration 4 is 0.8887897899212966




The OOF auc score for iteration 5 is 0.8885332209007265
The average roc-auc score is 0.8883908772390452


In [23]:
# Mean OOF ROC AUC of each parameter set, one value per line.
print(score1, score2, score3, score4, sep='\n')

0.8874336741980194
0.8873780955804518
0.8875418716870482
0.8883908772390452


In [24]:
# Re-read the raw test file; only its 'id' column is used when the submission
# is built (test_df had 'id' dropped earlier, and test_df_id holds the same column).
df_test = pd.read_csv('/kaggle/input/playground-series-s4e3/test.csv')

In [25]:
# Collapse each parameter set's list of test predictions into one averaged matrix.
predictions_1, predictions_2, predictions_3, predictions_4 = (
    np.mean(fold_preds, axis=0) for fold_preds in (preds_1, preds_2, preds_3, preds_4)
)

# Blend weights, one per parameter set (an earlier attempt used [0.4, 0.4, 0.05, 0.15]).
# A five-way blend that also mixed in an external `df_sub_result` was tried and
# left disabled.
w = [0.23, 0.23, 0.23, 0.31]
weighted_parts = [
    weight * averaged
    for weight, averaged in zip(w, (predictions_1, predictions_2, predictions_3, predictions_4))
]
predictions = weighted_parts[0] + weighted_parts[1] + weighted_parts[2] + weighted_parts[3]

# Build the submission: one column per target, followed by the test ids.
sub = pd.DataFrame(predictions, columns=TARGET_COLUMNS).assign(id=df_test['id'])
sub.to_csv('submission.csv', index=False)
sub


Unnamed: 0,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults,id
0,0.483978,0.002372,0.004987,0.000341,0.018948,0.180337,0.375603,19219
1,0.276798,0.021054,0.006618,0.000430,0.159691,0.182243,0.320358,19220
2,0.002060,0.035959,0.044174,0.000741,0.005124,0.303522,0.463414,19221
3,0.140093,0.004293,0.000998,0.001491,0.011252,0.395883,0.429165,19222
4,0.003175,0.005030,0.001082,0.003597,0.007563,0.603717,0.353919,19223
...,...,...,...,...,...,...,...,...
12809,0.080314,0.085836,0.003871,0.000354,0.017826,0.200598,0.358597,32028
12810,0.157425,0.003838,0.024778,0.008816,0.090540,0.198065,0.413408,32029
12811,0.000794,0.000829,0.916961,0.000393,0.000647,0.001111,0.073937,32030
12812,0.336935,0.009050,0.020231,0.000358,0.057441,0.164736,0.312102,32031
