In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the read-only input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data import 

In [None]:
# Names of the seven target columns; the same list is reused later to encode
# the label and to name the submission columns.
TARGET_FEATURES = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains','Dirtiness', 'Bumps', 'Other_Faults']

# Competition train / test files.
train_data = pd.read_csv('/kaggle/input/playground-series-s4e3/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s4e3/test.csv')

# Separate "faulty-steel-plates" file; it is concatenated onto the training rows in a later cell.
original_data = pd.read_csv('/kaggle/input/faulty-steel-plates/faults.csv')

In [None]:
# Merge the competition training rows with the original dataset, remove exact
# duplicate rows and rebuild a contiguous 0..n-1 index (later cells slice
# positionally with .iloc).
# The chain rebinds `train_data` instead of mutating it in place, and
# errors='ignore' lets the cell be re-run once 'id' has already been removed
# (the in-place drop raised KeyError on a second run).
train_data = (
    pd.concat(
        [train_data.drop(columns=['id'], errors='ignore'), original_data],
        axis=0,
    )
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Distinct combinations of the target columns and the number of rows carrying each.
# The columns are selected by name rather than by position ("last seven columns"),
# so the result does not depend on the column order of the frame.
np.unique(train_data[TARGET_FEATURES].to_numpy(), axis=0, return_counts=True)

In [None]:
# Keep the target columns as their own frame; later cells use it as the ground
# truth when scoring cross-validation predictions.
targets_bin = train_data[TARGET_FEATURES]

# Display the rows whose target columns sum to 2
# (i.e. two labels set at once, assuming the columns are 0/1 indicators).
rows_summing_to_two = targets_bin.sum(axis=1).eq(2)
targets_bin[rows_summing_to_two]

In [None]:
# Collapse the target columns into a single integer label:
#   0     -> the row's target columns sum to 0
#   1..7  -> 1 + position (within TARGET_FEATURES) of the first column holding the row maximum
target_flags = train_data[TARGET_FEATURES]
has_no_label = target_flags.sum(axis=1) == 0

train_data['Target'] = np.argmax(target_flags.values, axis=1) + 1
train_data.loc[has_no_label, 'Target'] = 0

# The individual target columns are no longer needed in the training frame.
train_data.drop(columns=TARGET_FEATURES, inplace=True)

In [None]:
import numpy as np
import optuna
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Separate the encoded label from the remaining columns, which serve as model inputs.
y = train_data['Target']
X = train_data.drop(columns=['Target'])

# XGBoost with Optuna: model creation and evaluation

In [None]:
# Set to True to run the Optuna search; while False the `if RETRAIN_MODEL:` guard skips it.
RETRAIN_MODEL = False


def objective(trial):
    """Optuna objective: mean cross-validated AUC of an XGBClassifier.

    Draws one hyperparameter set from ``trial``, fits a fresh classifier on
    each of 10 stratified folds of the notebook-level ``X`` / ``y``, and scores
    the held-out rows against the matching rows of ``targets_bin``.

    Args:
        trial: Optuna trial used to sample the hyperparameter values.

    Returns:
        Mean of the per-fold macro-averaged ROC AUC values.
    """
    # Sampling order is kept fixed: the sequence of suggest_* calls is part of the search.
    search_params = {
        'objective':'multi:softmax',
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05),
        'n_estimators': trial.suggest_int('n_estimators',250,1000),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0,log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0,log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 5),
        'device' : "cuda",
        'tree_method':"hist"
    }

    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    def _fold_auc(fit_rows, holdout_rows):
        # Fit on the training rows of this fold and score the held-out rows.
        fold_model = XGBClassifier(**search_params)
        fold_model.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        holdout_prob = fold_model.predict_proba(X.iloc[holdout_rows])
        # The first probability column (class 0) is skipped so the remaining
        # columns are compared against the columns of targets_bin.
        # NOTE(review): assumes every class 0..7 is present in each training fold — confirm.
        return roc_auc_score(
            targets_bin.iloc[holdout_rows],
            holdout_prob[:, 1:],
            multi_class="ovr",
            average="macro",
        )

    fold_aucs = [_fold_auc(fit_rows, holdout_rows) for fit_rows, holdout_rows in splitter.split(X, y)]
    return np.mean(fold_aucs)

# The study object is created unconditionally, so `study` exists even when the search is skipped.
study = optuna.create_study(study_name="xgb_model_training", direction='maximize')

# The (expensive) search only runs when RETRAIN_MODEL is switched on.
if RETRAIN_MODEL:
    study.optimize(objective, n_trials=100)  # Adjust the number of trials as necessary

    print(f"Best trial average AUC: {study.best_value:.4f}")
    tuned_params = study.best_params
    for key, value in tuned_params.items():
        print(f"{key}: {value}")

In [None]:
#study.optimize(objective, n_trials=150)  # Adjust the number of trials as necessary

#print(f"Best trial average AUC: {study.best_value:.4f}")
#for key, value in study.best_params.items():
#    print(f"{key}: {value}")

In [None]:
# Earlier parameter set, kept for reference only (not used by the notebook):
#     'objective': 'multi:softmax',
#     'learning_rate': 0.02767540293640535,
#     'n_estimators': 494,
#     'reg_alpha': 1.5855453969671037e-06,
#     'reg_lambda': 1.4155529076600075,
#     'max_depth': 5,
#     'colsample_bytree': 0.46589178614541227,
#     'subsample': 0.8504122771965839,
#     'min_child_weight': 3,
#     'device': "cuda",
#     'tree_method': "hist",
#
# Parameter set used for the final model.
# Value recorded alongside it: 0.8994042250342729
# (presumably the search objective's score for this set — confirm).
best_params = {
    'objective': 'multi:softmax',
    'learning_rate': 0.014298793081072316,
    'n_estimators': 913,
    'reg_alpha': 0.07843106592857683,
    'reg_lambda': 4.313270299391389,
    'max_depth': 6,
    'colsample_bytree': 0.32355661455856666,
    'subsample': 0.6604581212188047,
    'min_child_weight': 1,
    'tree_method': "hist",
}

In [None]:
# Remove the identifier column so it is not passed to the model as a feature.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell safe to re-run once 'id' is already gone.
test_data = test_data.drop(columns=['id'], errors='ignore')

In [None]:
# Final evaluation and inference with the fixed `best_params`:
# 20 stratified folds, one freshly fitted model per fold. Each model is scored
# on its held-out rows and also predicts the test set; the per-fold test
# probabilities are averaged at the end.
cv = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)
auc_scores, y_prob_test = [], []

for train_idx, valid_idx in cv.split(X, y):
    model = XGBClassifier(**best_params)
    model.fit(X.iloc[train_idx], y.iloc[train_idx])

    # Held-out score: the first probability column (class 0) is skipped so the
    # remaining columns are compared against the columns of targets_bin.
    y_prob = model.predict_proba(X.iloc[valid_idx])
    auc_scores.append(
        roc_auc_score(targets_bin.iloc[valid_idx], y_prob[:, 1:], multi_class="ovr", average="macro")
    )

    # Test-set probabilities from this fold's model.
    y_prob_test.append(model.predict_proba(test_data))

print(np.mean(auc_scores))

# Stack to (n_folds, n_test_rows, n_classes) and average over the fold axis.
y_prob_test_array = np.stack(y_prob_test)
pred = y_prob_test_array.mean(axis=0)

In [None]:
# Sample submission file; in the cells shown here only its 'id' column is used,
# when the submission frame is assembled.
sample_prediction = pd.read_csv('/kaggle/input/playground-series-s4e3/sample_submission.csv')

In [None]:
# Build the submission: one probability column per target plus the row id.
# pred[:, 1:] skips the first probability column (class 0) so the remaining
# columns are labelled with TARGET_FEATURES.
# `columns` receives the flat list of names; wrapping it in another list makes
# pandas build a one-level MultiIndex instead of a plain column index.
if len(pred) != len(sample_prediction):
    # The ids are attached by index alignment; a length mismatch would
    # otherwise silently produce missing or dropped ids.
    raise ValueError(
        f"prediction rows ({len(pred)}) do not match sample submission rows ({len(sample_prediction)})"
    )

predictions = pd.DataFrame(pred[:, 1:], columns=TARGET_FEATURES)
predictions['id'] = sample_prediction['id']
predictions.to_csv('submission.csv', index=False)