In [1]:
import gc
import random as python_random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold, RepeatedStratifiedKFold
import lightgbm as lgb
from lightgbm import LGBMClassifier, plot_importance
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import backend as K
import optuna
from optuna.pruners import HyperbandPruner

# Seed TensorFlow, NumPy and the stdlib RNG (in that order) with the same value.
for _seed_rng in (tf.random.set_seed, np.random.seed, python_random.seed):
    _seed_rng(42)


2024-04-01 00:40:38.888030: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-01 00:40:38.888131: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-01 00:40:39.025382: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# The seven binary fault-indicator columns used as prediction targets.
TARGET_FEATURES = [
    'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
    'Dirtiness', 'Bumps', 'Other_Faults',
]

# Competition train/test files, plus the separate faults.csv dataset that is
# appended to the training rows in the next cell.
train_data, test_data = (
    pd.read_csv(f'/kaggle/input/playground-series-s4e3/{split_name}.csv')
    for split_name in ('train', 'test')
)
original_data = pd.read_csv('/kaggle/input/faulty-steel-plates/faults.csv')

In [3]:
# Stack both sources row-wise, drop exact duplicate rows, and renumber the
# index from 0 so it is contiguous again.
train_data = (
    pd.concat([train_data, original_data], axis=0)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Feature Engineering

### 1. Data Cleaning
- Remove multi-label rows: they occur rarely, and including them doesn't affect the final score.
- `Outside_Global_Index` takes 3 unique values in the original dataset and 4 unique values in the synthetic dataset. The extra 4th value occurs only once in the entire dataset, so I mapped it to the closest existing `Outside_Global_Index` value.

In [4]:
# Keep only rows where at most one fault flag is set (multi-label rows are dropped).
# The explicit .copy() makes the filtered frame independent of the original, so
# the column assignment below is an unambiguous write rather than an assignment
# on a slice (which pandas flags with SettingWithCopyWarning).
single_label_rows = train_data[TARGET_FEATURES].sum(axis=1) <= 1
train_data = train_data.loc[single_label_rows].copy()

# Fold the stray 0.7 level of Outside_Global_Index into 0.5.
train_data['Outside_Global_Index'] = train_data['Outside_Global_Index'].replace(0.7, 0.5)

### 2. Add more expressive features

In [5]:
def log_transformation(data, columns, reference=None):
    """Add a shifted natural-log copy of each column as ``<column>_log``.

    Every value is shifted so that the minimum of the *reference* column maps
    to 1 (log value 0) before the logarithm is taken.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to transform. It is not modified.
    columns : iterable of str
        Names of the columns to transform.
    reference : pandas.DataFrame, optional
        Frame whose column minima define the shift. Defaults to ``data``
        itself, which reproduces the original single-frame behaviour. Pass the
        training frame when transforming test data so that both sets share the
        same shift and a given raw value always yields the same feature value.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with one extra ``<column>_log`` column per entry in
        ``columns``.
    """
    if reference is None:
        reference = data
    transformed = data.copy()
    for column in columns:
        shifted = data[column] - reference[column].min() + 1
        # Values below the reference minimum would shift to <= 0; clamp them to
        # 1 so the logarithm stays finite. This is a no-op when reference is data.
        transformed[f'{column}_log'] = np.log(shifted.clip(lower=1))
    return transformed


train_data = log_transformation(train_data, ['Pixels_Areas'])
# Shift the test set by the training minimum so both sets use the same mapping
# (the raw Pixels_Areas column is still intact in train_data at this point).
test_data = log_transformation(test_data, ['Pixels_Areas'], reference=train_data)

### 3. Add label encoded target column 

- It will have one extra class to account for no defect labels

In [6]:
# Snapshot of the binary fault columns, taken before they are dropped from
# train_data; used later as y_true when computing AUC.
targets_bin = train_data.loc[:, TARGET_FEATURES]

In [7]:
# Label-encode the fault flags: 1..7 is the position (1-based) of the first
# maximal flag in TARGET_FEATURES order, and 0 is reserved for rows where no
# flag is set.
fault_flags = train_data[TARGET_FEATURES].to_numpy()
train_data['Target'] = np.where(
    fault_flags.sum(axis=1) == 0,
    0,
    fault_flags.argmax(axis=1) + 1,
)

### 4. Drop unimportant columns

In [8]:
# Columns removed from both train and test before modelling.
col_to_drop = [
    'id', 'Square_Index', 'Sum_of_Luminosity', 'X_Minimum', 'X_Perimeter',
    'SigmoidOfAreas', 'Edges_X_Index', 'Y_Minimum', 'Y_Maximum', 'TypeOfSteel_A400',
]
# The training frame additionally loses the one-hot fault columns; the
# label-encoded 'Target' column stays.
train_data = train_data.drop(columns=col_to_drop + TARGET_FEATURES)
test_data = test_data.drop(columns=col_to_drop)

### 5. Scaling features

In [9]:
col_to_robust_scale = ['Steel_Plate_Thickness', 'Empty_Index']
col_to_min_max_scale = ['Pixels_Areas', 'Outside_X_Index']

robustscaler = RobustScaler()
minmaxscaler = MinMaxScaler()

# Each scaler is fitted on the training columns only and then applied,
# unchanged, to the same columns of the test set.
for _scaler, _scaled_cols in (
    (robustscaler, col_to_robust_scale),
    (minmaxscaler, col_to_min_max_scale),
):
    train_data[_scaled_cols] = _scaler.fit_transform(train_data[_scaled_cols])
    test_data[_scaled_cols] = _scaler.transform(test_data[_scaled_cols])

# Model Training

In [10]:
# y: label-encoded class; X: every remaining column.
y = train_data['Target']
X = train_data.drop(columns=['Target'])

## Random Forest Model

In [11]:
RETRAIN_RF_MODEL = False


def objective(trial):
    """Optuna objective for RandomForestClassifier.

    Samples one hyper-parameter set from ``trial``, fits a forest on each of
    five stratified folds of the notebook-level ``X``/``y``, and returns the
    mean macro AUC over the folds. AUC is computed against the binary fault
    columns in ``targets_bin`` using probability columns 1 onward (the first
    probability column is skipped).
    """
    rf_kwargs = dict(
        n_estimators=trial.suggest_int('n_estimators', 1100, 5500),
        max_depth=trial.suggest_int('max_depth', 12, 40),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 40),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 40),
        max_samples=trial.suggest_float('max_samples', 0.1, 1.0),
        random_state=42,
        n_jobs=-1,
    )

    def fold_auc(fit_rows, holdout_rows):
        # Fit on the training part of the fold, score on the held-out part.
        forest = RandomForestClassifier(**rf_kwargs)
        forest.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        holdout_proba = forest.predict_proba(X.iloc[holdout_rows])
        return roc_auc_score(targets_bin.iloc[holdout_rows], holdout_proba[:, 1:], multi_class="ovr", average="macro")

    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    return np.mean([fold_auc(fit_rows, holdout_rows) for fit_rows, holdout_rows in splitter.split(X, y)])


# The study object is always created; the search itself only runs when the
# RETRAIN flag is switched on.
study = optuna.create_study(direction='maximize', study_name="rf_model_training")
if RETRAIN_RF_MODEL:
    study.optimize(objective, n_trials=200)
    print(f"Best trial average AUC: {study.best_value:.4f}")
    for key, value in study.best_params.items():
        print(f"{key}: {value}")


[I 2024-04-01 00:40:49,211] A new study created in memory with name: rf_model_training


## HGBC Model

In [12]:
RETRAIN_HGBC_MODEL = False


def objective(trial):
    """Optuna objective for HistGradientBoostingClassifier.

    Draws a hyper-parameter set from ``trial`` and returns the mean macro AUC
    over five stratified folds of the notebook-level ``X``/``y``. Scoring uses
    the binary fault columns in ``targets_bin`` and probability columns 1
    onward.
    """
    hgbc_kwargs = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.005, 0.1),
        max_iter=trial.suggest_int('max_iter', 100, 2500),
        max_depth=trial.suggest_int('max_depth', 3, 20),
        l2_regularization=trial.suggest_float('l2_regularization', 1e-8, 10.0, log=True),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 20, 300),
        max_bins=trial.suggest_int('max_bins', 25, 255),
        random_state=42,
    )

    fold_scores = []
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for fit_rows, holdout_rows in folds.split(X, y):
        booster = HistGradientBoostingClassifier(**hgbc_kwargs)
        booster.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        holdout_proba = booster.predict_proba(X.iloc[holdout_rows])
        fold_scores.append(
            roc_auc_score(targets_bin.iloc[holdout_rows], holdout_proba[:, 1:], multi_class="ovr", average="macro")
        )

    return np.mean(fold_scores)


# Study creation is unconditional; optimisation is gated by the RETRAIN flag.
study = optuna.create_study(direction='maximize', study_name="HistGradientBoostingClassifier_model_training")
if RETRAIN_HGBC_MODEL:
    study.optimize(objective, n_trials=200)
    print(f"Best trial average AUC: {study.best_value:.4f}")
    for key, value in study.best_params.items():
        print(f"{key}: {value}")


[I 2024-04-01 00:40:49,270] A new study created in memory with name: HistGradientBoostingClassifier_model_training


## CAT Boost Model

In [13]:
RETRAIN_CATBOOST_MODEL = False

def objective(trial):
    """Optuna objective for CatBoostClassifier.

    Samples one hyper-parameter set from ``trial``, trains a model on each of
    ten stratified folds of the notebook-level ``X``/``y``, and returns the
    mean macro AUC over the folds.

    The AUC is computed on the seven binary fault columns in ``targets_bin``
    against probability columns 1 onward, i.e. the same metric used by the
    other objectives and by the final cross-validation loop in this notebook.
    (Previously this objective scored an 8-class multiclass AUC on the encoded
    labels, which made its values incomparable with the other models.)
    """
    param = {
        "loss_function": "MultiClass",
        "eval_metric": "MultiClass",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "n_estimators": trial.suggest_int("n_estimators", 300, 1000),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.0, 10.0),
        "depth": trial.suggest_int("depth", 3, 10),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.1, 1.0),
        "bootstrap_type": "Bernoulli",
        "subsample": trial.suggest_float("subsample", 0.1, 1.0),
        "random_state": 42,
        "verbose": False
    }

    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    auc_scores = []

    for train_idx, valid_idx in cv.split(X, y):
        X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
        y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]
        model = CatBoostClassifier(**param)
        # NOTE: the validation fold doubles as the early-stopping eval_set, so
        # the fold score below is measured on data that also chose the
        # stopping point and is therefore somewhat optimistic.
        model.fit(X_train_fold, y_train_fold, eval_set=(X_valid_fold, y_valid_fold), verbose=False, early_stopping_rounds=100)
        y_prob = model.predict_proba(X_valid_fold)
        # Score the binary fault columns only; the first probability column is skipped.
        average_auc = roc_auc_score(targets_bin.iloc[valid_idx], y_prob[:, 1:], multi_class="ovr", average="macro")
        auc_scores.append(average_auc)

    return np.mean(auc_scores)


# The study is always created; the search only runs when the RETRAIN flag is on.
study = optuna.create_study(direction='maximize', study_name="catboost_model_training")

if RETRAIN_CATBOOST_MODEL:
    study.optimize(objective, n_trials=100)
    print(f"Best trial average AUC: {study.best_value:.4f}")
    for key, value in study.best_params.items():
        print(f"{key}: {value}")


[I 2024-04-01 00:40:49,330] A new study created in memory with name: catboost_model_training


## LGBM Model

In [14]:
RETRAIN_LGBM_MODEL = False

def objective(trial):
    """Optuna objective for LGBMClassifier.

    Samples one hyper-parameter set from ``trial`` and returns the mean macro
    AUC over ten stratified folds of the notebook-level ``X``/``y``. Scoring
    uses the binary fault columns in ``targets_bin`` and probability columns 1
    onward.
    """
    param = {
    'objective': 'multiclass',
    'num_class': 8,
    'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1),
    'n_estimators': trial.suggest_int('n_estimators', 300, 1200),
    'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
    'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
    'max_depth': trial.suggest_int('max_depth', 3, 10),
    'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
    # NOTE(review): LightGBM's parameter docs state that bagging only takes
    # effect when bagging_freq (subsample_freq) is non-zero; it is not set
    # here, so this search dimension may be inert - confirm.
    'subsample': trial.suggest_float('subsample', 0.5, 1.0),
    'min_child_weight': trial.suggest_int('min_child_weight', 1, 8),
    'device_type': 'gpu',
    'num_leaves': trial.suggest_int('num_leaves', 4, 2048),
    # Pin the seed explicitly, matching lgbm_params and the other objectives,
    # so trials are evaluated under the seed the final model uses.
    'random_state': 42,
    "verbosity": -1
    }

    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    auc_scores = []

    for train_idx, valid_idx in cv.split(X, y):
        X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
        y_train_fold = y.iloc[train_idx]
        model = LGBMClassifier(**param)
        model.fit(X_train_fold, y_train_fold)
        y_prob = model.predict_proba(X_valid_fold)
        # Score the binary fault columns only; the first probability column is skipped.
        average_auc = roc_auc_score(targets_bin.iloc[valid_idx], y_prob[:, 1:], multi_class="ovr", average="macro")
        auc_scores.append(average_auc)

    return np.mean(auc_scores)

# The study is always created; the search only runs when the RETRAIN flag is on.
study = optuna.create_study(direction='maximize',study_name = "lgbm_model_training")
if RETRAIN_LGBM_MODEL:
    study.optimize(objective, n_trials=180)
    print(f"Best trial average AUC: {study.best_value:.4f}")
    for key, value in study.best_params.items():
        print(f"{key}: {value}")


[I 2024-04-01 00:40:49,394] A new study created in memory with name: lgbm_model_training


## XGB MODEL

In [15]:
RETRAIN_XGB_MODEL = False


def objective(trial):
    """Optuna objective for XGBClassifier.

    Samples one hyper-parameter set from ``trial`` and returns the mean macro
    AUC over ten stratified folds of the notebook-level ``X``/``y``. Scoring
    uses the binary fault columns in ``targets_bin`` and probability columns 1
    onward.
    """
    param = {
        # multi:softprob is the objective that outputs per-class probabilities,
        # which is what predict_proba() below needs. multi:softmax outputs
        # class labels and only works with predict_proba on XGBoost versions
        # that special-case it.
        'objective':'multi:softprob',
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1),
        'n_estimators': trial.suggest_int('n_estimators',250,1000),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0,log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0,log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 8),
        'device' : "cuda",
        'tree_method':"hist"
    }
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    auc_scores = []

    for train_idx, valid_idx in cv.split(X, y):
        X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
        y_train_fold = y.iloc[train_idx]
        model = XGBClassifier(**param)
        model.fit(X_train_fold, y_train_fold)
        y_prob = model.predict_proba(X_valid_fold)
        # Score the binary fault columns only; the first probability column is skipped.
        average_auc = roc_auc_score(targets_bin.iloc[valid_idx], y_prob[:, 1:], multi_class="ovr", average="macro")
        auc_scores.append(average_auc)

    return np.mean(auc_scores)


# The study is always created; the search only runs when the RETRAIN flag is on.
study = optuna.create_study(direction='maximize',study_name = "xgb_model_training")
if RETRAIN_XGB_MODEL:
    study.optimize(objective, n_trials=200)
    print(f"Best trial average AUC: {study.best_value:.4f}")
    for key, value in study.best_params.items():
        print(f"{key}: {value}")


[I 2024-04-01 00:40:49,456] A new study created in memory with name: xgb_model_training


## Best Model Params:

In [16]:
# Fixed LightGBM settings passed to LGBMClassifier for the ensemble.
lgbm_params = dict(
    objective="multiclass",
    verbosity=-1,
    random_state=42,
    num_class=8,
    learning_rate=0.014752843955666523,
    n_estimators=618,
    lambda_l1=1.4050330996128146,
    lambda_l2=8.85983526709923e-08,
    max_depth=4,
    colsample_bytree=0.3787120691002838,
    subsample=0.5858336045495702,
    min_child_weight=2,
    num_leaves=1840,
    device='gpu',
)


# Fixed CatBoost settings passed to CatBoostClassifier for the ensemble.
cat_params = dict(
    loss_function='MultiClass',
    learning_rate=0.06497097587405798,
    iterations=1392,
    l2_leaf_reg=6.242622838509466,
    depth=4,
    task_type='GPU',
    devices='0',
    verbose=False,
    random_state=42,
)

# Fixed XGBoost settings passed to XGBClassifier for the ensemble.
xgb_params = {
    # multi:softprob outputs per-class probabilities, which soft voting and
    # predict_proba() require. multi:softmax outputs class labels and only
    # works with predict_proba on XGBoost versions that special-case it.
    'objective': 'multi:softprob',
    'learning_rate': 0.014525601125449565,
    'n_estimators': 981,
    'reg_alpha': 1.4566882667398338,
    'reg_lambda': 5.86255409028864e-08,
    'max_depth': 5,
    'colsample_bytree': 0.3032477181611612,
    'subsample': 0.8720613168628836,
    'min_child_weight': 4,
    'device': "cuda",
    'tree_method': "hist",
}



# Fixed settings passed to HistGradientBoostingClassifier for the ensemble.
histGradient_params = dict(
    random_state=42,
    learning_rate=0.019944410547905925,
    max_iter=2309,
    max_depth=6,
    l2_regularization=5.611274355555922e-08,
    min_samples_leaf=299,
    max_bins=101,
)




# Fixed settings for RandomForestClassifier (kept for reference; the forest is
# currently commented out of the ensemble list below).
rf_params = dict(
    random_state=42,
    n_estimators=3996,
    max_depth=18,
    min_samples_split=13,
    min_samples_leaf=3,
    max_samples=0.8220916127612791,
    n_jobs=-1,
)

In [17]:
# (name, estimator) pairs handed to VotingClassifier, in this order.
# RandomForestClassifier(**rf_params) under the name 'RF' is deliberately
# left out of the ensemble.
cv_estimators = list(zip(
    ('XGB', 'LGBM', 'CAT', 'HGBC'),
    (
        XGBClassifier(**xgb_params),
        LGBMClassifier(**lgbm_params),
        CatBoostClassifier(**cat_params),
        HistGradientBoostingClassifier(**histGradient_params),
    ),
))

## Finding best weights for voting classifier

In [18]:
WEIGHT_TUNE = False

def objective(trial):
    """Optuna objective that tunes the soft-voting weights of the ensemble.

    One weight in [0, 5] is sampled per entry of ``cv_estimators`` under the
    name ``<estimator name>_Weight``. The weighted VotingClassifier is fitted
    on each of five stratified folds of the notebook-level ``X``/``y`` and the
    mean macro AUC over the folds is returned. Scoring uses the binary fault
    columns in ``targets_bin`` and probability columns 1 onward.

    The weights are derived from ``cv_estimators`` so their count always
    matches the estimator count. A hard-coded five-weight list made
    VotingClassifier fail once RF was commented out of the estimator list.
    """
    weights = [
        trial.suggest_float(f'{estimator_name}_Weight', 0.0, 5.0)
        for estimator_name, _ in cv_estimators
    ]

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = []

    for train_idx, valid_idx in cv.split(X, y):
        X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
        y_train_fold = y.iloc[train_idx]
        voting_classifier = VotingClassifier(
            estimators=cv_estimators,
            voting='soft',
            weights=weights
        )
        voting_classifier.fit(X_train_fold, y_train_fold)
        y_prob = voting_classifier.predict_proba(X_valid_fold)
        # Score the binary fault columns only; the first probability column is skipped.
        average_auc = roc_auc_score(targets_bin.iloc[valid_idx], y_prob[:, 1:], multi_class="ovr", average="macro")
        auc_scores.append(average_auc)

    return np.mean(auc_scores)


# The study is always created; the search only runs when WEIGHT_TUNE is on.
weight_study = optuna.create_study(direction='maximize', study_name="voting_classifier_training")
if WEIGHT_TUNE:
    weight_study.optimize(objective, n_trials=200)
    print(f"Best trial average AUC: {weight_study.best_value:.4f}")
    for key, value in weight_study.best_params.items():
        print(f"{key}: {value}")


[I 2024-04-01 00:40:49,639] A new study created in memory with name: voting_classifier_training


## Final Voting classifier training based on best params

In [19]:
# Soft-voting weights, keyed as '<estimator name>_Weight'.
weight_best_params = dict(
    XGB_Weight=4.588446199271165,
    LGBM_Weight=1.0464853862618422,
    CAT_Weight=2.305591559147083,
    RF_Weight=1.575194973453438,
    HGBC_Weight=1.096656838999795,
)

In [20]:
# Soft-voting ensemble over cv_estimators. The weights are looked up by
# estimator name ('<name>_Weight') so the list always has one entry per
# estimator, in the same order, even when a model is added to or removed from
# cv_estimators. Weights for absent models (e.g. RF_Weight) are simply unused.
voting_classifier = VotingClassifier(
    estimators=cv_estimators,
    voting='soft',
    weights=[weight_best_params[f'{estimator_name}_Weight'] for estimator_name, _ in cv_estimators],
)

In [21]:
# Set DO_FINAL_TRAINING to False to skip retraining. In that case `pred` is not
# created here and must be supplied separately (e.g. loaded from a previously
# saved prediction file) before the blending cells below are run.
DO_FINAL_TRAINING = True

if DO_FINAL_TRAINING:
    # True  -> RepeatedStratifiedKFold (10 splits x 5 repeats)
    # False -> a single shuffled StratifiedKFold (10 splits)
    USE_REPEATED_FOLDS = False
    if USE_REPEATED_FOLDS:
        cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=42)
    else:
        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    # Select the test columns by the training column names so the model is
    # always given the same features in the same order it was fitted on. A
    # missing column fails here with a clear KeyError.
    test_features = test_data[X.columns]

    auc_scores = []
    y_prob_test = []

    for i, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        print('-' * 25)
        print(f'FOLD {i + 1} STARTED')
        X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
        y_train_fold = y.iloc[train_idx]
        voting_classifier.fit(X_train_fold, y_train_fold)
        y_prob = voting_classifier.predict_proba(X_valid_fold)
        # Keep this fold's test-set probabilities; they are averaged after the loop.
        y_prob_test.append(voting_classifier.predict_proba(test_features))
        # Score the binary fault columns only; the first probability column is skipped.
        average_auc = roc_auc_score(targets_bin.iloc[valid_idx], y_prob[:, 1:], multi_class="ovr", average="macro")
        auc_scores.append(average_auc)
        print(f'FOLD {i + 1} - AUC computed: {average_auc:.4f}')

    average_auc_score = np.mean(auc_scores)
    print(f'Average AUC score across all folds: {average_auc_score:.4f}')
    # Element-wise mean of the per-fold test probabilities.
    y_prob_test_array = np.mean(y_prob_test, axis=0)
    pred = y_prob_test_array

-------------------------
FOLD 1 STARTED


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




FOLD 1 - AUC computed: 0.9015
-------------------------
FOLD 2 STARTED
FOLD 2 - AUC computed: 0.8989
-------------------------
FOLD 3 STARTED
FOLD 3 - AUC computed: 0.8985
-------------------------
FOLD 4 STARTED
FOLD 4 - AUC computed: 0.9064
-------------------------
FOLD 5 STARTED
FOLD 5 - AUC computed: 0.8975
-------------------------
FOLD 6 STARTED
FOLD 6 - AUC computed: 0.9054
-------------------------
FOLD 7 STARTED
FOLD 7 - AUC computed: 0.8930
-------------------------
FOLD 8 STARTED
FOLD 8 - AUC computed: 0.8999
-------------------------
FOLD 9 STARTED
FOLD 9 - AUC computed: 0.9002
-------------------------
FOLD 10 STARTED
FOLD 10 - AUC computed: 0.9039
Average AUC score across all folds: 0.9005


In [22]:
# pred_without_trees_final = np.loadtxt('/kaggle/input/pred-new-feature/pred_new_feature.txt')
# pred_with_trees_final = np.loadtxt('/kaggle/input/pred-with-trees/pred_with_trees_final.txt')


# Four externally produced submission files, each indexed by 'id'.
voting_probs2, voting_probs3, voting_probs4, voting_probs5 = (
    pd.read_csv(f'/kaggle/input/publicly-available-submissions/{file_name}', index_col='id')
    for file_name in ('add_sub.csv', 'sub1.csv', 'sub2.csv', 'submission_pure.csv')
)

In [23]:
# Submissions 2-4 carry an extra 'Unnamed: 0' column; strip it and convert to
# plain arrays. Submission 5 is converted as-is.
voting_probs2 = voting_probs2.drop(columns=['Unnamed: 0']).to_numpy()
voting_probs3 = voting_probs3.drop(columns=['Unnamed: 0']).to_numpy()
voting_probs4 = voting_probs4.drop(columns=['Unnamed: 0']).to_numpy()
voting_probs5 = voting_probs5.to_numpy()

# This notebook's own ensemble output, without its first probability column.
voting_probs1 = pred[:, 1:]

In [24]:
# Weighted blend of the five probability arrays (terms added left to right).
prob_weighted_mean = (
    0.25 * voting_probs1
    + 0.12 * voting_probs2
    + 0.17 * voting_probs3
    + 0.26 * voting_probs4
    + 0.2 * voting_probs5
)

In [25]:
# Sample submission file; its 'id' column is reused for the final submission.
sample_data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/playground-series-s4e3/sample_submission.csv',
)

In [26]:
# Pass the flat list of target names. Wrapping it in another list
# (columns=[TARGET_FEATURES]) makes pandas build a MultiIndex header instead of
# plain column labels.
predictions = pd.DataFrame(prob_weighted_mean, columns=TARGET_FEATURES)
# Put 'id' first and assign it positionally (to_numpy) so the values cannot be
# re-aligned by index; a row-count mismatch raises instead of silently
# producing NaN ids.
predictions.insert(0, 'id', sample_data['id'].to_numpy())
predictions.to_csv('my_submission.csv', index=False)

# EXTRAS

## Models that didn't perform well / didn't work

### 1. Neural Networks

In [27]:
# num_classes = 7
# y_categorical = to_categorical(y, num_classes=num_classes)

# def make_dataset(X, y, batch_size=512, mode='train'):
#     dataset = tf.data.Dataset.from_tensor_slices((X, y))
#     if mode == 'train':
#         dataset = dataset.shuffle(buffer_size=len(X)).cache()
#     dataset = dataset.batch(batch_size)
#     dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
#     return dataset



# def create_model(trial, input_shape, num_classes):
#     model = Sequential()
#     model.add(Input(shape=input_shape))
    
#     n_layers = trial.suggest_int('n_layers', 2, 4)
#     for i in range(n_layers):
#         num_units = trial.suggest_int(f'n_units_l{i}', 16, 128)
#         activation = trial.suggest_categorical(f'activation_l{i}', ['relu', 'elu', 'selu'])
#         dropout_rate = trial.suggest_float(f'dropout_l{i}', 0.2, 0.5)
#         model.add(Dense(num_units, activation=activation))
#         model.add(BatchNormalization())
#         model.add(Dropout(dropout_rate))
    
#     model.add(Dense(num_classes, activation='softmax'))
    
#     lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
#     model.compile(optimizer=Adam(learning_rate=lr),
#                   loss='categorical_crossentropy',
#                   metrics=[tf.keras.metrics.AUC(name='auc', multi_label=True)])
#     return model

# def objective(trial):
#     auc_scores = []
#     skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
#     batch_size = trial.suggest_categorical('batch_size', [64, 128, 256, 512])
    
#     for train_idx, valid_idx in skf.split(X, y):
#         X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
#         y_train, y_valid = to_categorical(y.iloc[train_idx], num_classes=num_classes), to_categorical(y.iloc[valid_idx], num_classes=num_classes)
        
#         train_dataset = make_dataset(X_train, y_train, batch_size=batch_size, mode='train')
#         valid_dataset = make_dataset(X_valid, y_valid, batch_size=batch_size, mode='eval')
        
#         model = create_model(trial, input_shape=(X_train.shape[1],), num_classes=num_classes)
        
#         early_stopping = EarlyStopping(monitor='val_auc', patience=25, mode='max', restore_best_weights=True)
        
#         steps_per_epoch = len(X_train) // batch_size
        
#         model.fit(train_dataset.repeat(), epochs=2500, steps_per_epoch=steps_per_epoch, validation_data=valid_dataset, callbacks=[early_stopping], verbose=0)

#         results = model.evaluate(valid_dataset, verbose=0, return_dict=True)
#         auc_score = results['auc']
#         auc_scores.append(auc_score)
#         # Clear the Keras session after each trial
#         K.clear_session()
#         gc.collect()
#     return np.mean(auc_scores)

# study = optuna.create_study(direction='maximize',pruner = HyperbandPruner())
# study.optimize(objective, n_trials=250) 

# print('Number of finished trials:', len(study.trials))
# print('Best trial parameters:', study.best_trial.params)

In [28]:
# import numpy as np
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, BatchNormalization, Dropout, Input
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.callbacks import EarlyStopping
# from sklearn.model_selection import StratifiedKFold
# from tensorflow.keras.utils import to_categorical
# import random as python_random

# # Set seeds for reproducibility
# tf.random.set_seed(42)
# np.random.seed(42)
# python_random.seed(42)

# # Assuming X and y are your features and labels respectively
# num_classes = 7
# y_categorical = to_categorical(y, num_classes=num_classes)

# # Define the model creation function with the best hyperparameters
# def create_model(input_shape, num_classes, n_units, activation, dropout_rate, lr):
#     model = Sequential()
#     model.add(Input(shape=input_shape))
    
#     for units, act, drop in zip(n_units, activation, dropout_rate):
#         model.add(Dense(units, activation=act))
#         model.add(BatchNormalization())
#         model.add(Dropout(drop))
    
#     model.add(Dense(num_classes, activation='softmax'))
    
#     optimizer = Adam(learning_rate=lr)
#     model.compile(optimizer=optimizer,
#                   loss='categorical_crossentropy',
#                   metrics=[tf.keras.metrics.AUC(name='auc', multi_label=True)])
#     return model

# # Best hyperparameters
# best_params = {
#     'batch_size': 128,
#     'n_units': [107, 115, 94],
#     'activation': ['selu', 'relu', 'selu'],
#     'dropout_rate': [0.23171122368292765, 0.21511239568756282, 0.41684411583006736],
#     'lr': 0.0002473770028925611
# }

# # Prepare for Stratified Cross-Validation
# skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# auc_scores = []
# y_prob_test = []
# for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
#     print(f"Training on fold {fold+1}/10...")
#     X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
#     y_train, y_valid = y_categorical[train_idx], y_categorical[valid_idx]

#     model = create_model(input_shape=(X.shape[1],), num_classes=num_classes, 
#                          n_units=best_params['n_units'],
#                          activation=best_params['activation'], 
#                          dropout_rate=best_params['dropout_rate'], lr=best_params['lr'])
    
#     early_stopping = EarlyStopping(monitor='val_auc', patience=40, mode='max', restore_best_weights=True)

#     model.fit(X_train, y_train, batch_size=best_params['batch_size'], epochs=200, 
#               validation_data=(X_valid, y_valid), callbacks=[early_stopping], verbose=0)
#     y_prob_test.append(model.predict(test_data))

#     results = model.evaluate(X_valid, y_valid, verbose=0, return_dict=True)
#     auc_score = results['auc']
#     auc_scores.append(auc_score)
#     print(f"Fold {fold+1} AUC: {auc_score:.4f}\n")

# # Display average AUC across all folds
# print(f"Average AUC across all folds: {np.mean(auc_scores):.4f}")

# # Average the predictions on the test set across all folds
# y_prob_test_array = np.mean(y_prob_test, axis=0)

# # Output the final prediction
# pred = y_prob_test_array


### 2. SVM
- The Kaggle interface kept getting stuck, possibly due to the high computational cost of a kernel-based SVM.

# Credits:

### Discussions:
- Discussion here was gold : https://www.kaggle.com/competitions/playground-series-s4e3/discussion/482401
- https://www.kaggle.com/competitions/playground-series-s4e3/discussion/483320

### Notebooks:
- https://www.kaggle.com/code/arunklenin/ps4e3-steel-plate-fault-prediction-multilabel
- https://www.kaggle.com/code/ravi20076/playgrounds4e03-eda-binaryclassifier
- https://www.kaggle.com/code/noepinefrin/0-89484-weighted-ensemble-xgb-lgbm-cat
- https://www.kaggle.com/code/lucamassaron/steel-plate-eda-xgboost-is-all-you-need