In [None]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, f1_score,roc_curve, precision_score, recall_score,roc_auc_score
from sklearn import linear_model, tree, ensemble
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter(action="ignore")

In [None]:
# Competition frames (indexed by their first column) are flagged as generated data.
train = pd.read_csv('/kaggle/input/playground-series-s3e12/train.csv', index_col=[0]).assign(is_generated=1)
df_test = pd.read_csv('/kaggle/input/playground-series-s3e12/test.csv', index_col=[0]).assign(is_generated=1)

# The original dataset gets ids that continue right after the last test id,
# then is stacked under the competition training rows.
next_free_id = df_test.index[-1] + 1
original = (
    pd.read_csv('/kaggle/input/kidney-stone-prediction-based-on-urine-analysis/kindey stone urine analysis.csv')
    .assign(is_generated=0)
    .reset_index()
    .assign(id=lambda frame: frame['index'] + next_free_id)
    .drop(columns=['index'])
    .set_index('id')
)
df_train = pd.concat([train, original])

In [None]:
class Preprocessing:
    """Feature engineering, one-hot encoding and scaling for the urine-analysis frames.

    Training frame (``is_test=False``): ``get_Xy`` returns a scaled NumPy feature matrix
    and the integer target; the fitted ``OneHotEncoder`` / ``StandardScaler`` are then
    available through ``get_encoders`` / ``get_scaler``.

    Test frame (``is_test=True``): ``get_Xy`` only adds the engineered columns and returns
    a DataFrame; applying the train-fitted transformers is left to the caller.
    """

    def __init__(self, df, is_test=False):
        """Keep a private copy of ``df`` so the caller's frame is never modified.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame with the columns read by ``feature_engineering`` (``ph``, ``calc``,
            ``urea``, ``cond``, ``osmo``, ``gravity``) and, for training data, ``target``.
        is_test : bool, default False
            When True no target is split off and no encoder/scaler is fitted.
        """
        # Copy: feature_engineering adds columns, which must not leak into the raw input.
        self.df = df.copy()
        self.is_test = is_test

    def shape(self):
        """Print the shape of the working frame."""
        print(f'shape: {self.df.shape}')

    def dtypes(self, pr=False):
        """Print a "Types" header and, when ``pr`` is True, the column dtypes."""
        print("Types")
        if pr:
            print(self.df.dtypes)

    def isNaN(self, pr=False):
        """Report missing values.

        With ``pr=True`` the per-column null counts are printed and ``None`` is returned;
        otherwise the list of column names containing at least one NaN is returned.
        """
        if pr:
            print("Contain NaN")
            print(self.df.isnull().sum())
        else:
            return self.df.columns[self.df.isna().any()].tolist()

    def isObject(self):
        """Return the names of all columns with ``object`` dtype."""
        return [column for column in self.df.columns if self.df[column].dtype == 'object']

    def check_dataframe(self):
        """Print shape, dtypes and null counts of the working frame."""
        self.shape()
        self.dtypes(True)
        self.isNaN(True)

    # Thanks to https://www.kaggle.com/code/lusfernandotorres/s03e12-stacking-tuned-models
    # Also: https://www.kaggle.com/code/tetsutani/ps3e12-eda-ensemble-baseline#Pre-Processing
    def feature_engineering(self):
        """Add the derived columns to ``self.df``.

        Every new column is computed from the raw input columns only, so calling this
        method more than once (as the other methods do) yields the same frame.
        """
        print("Feature Engineering")
        # pd.cut uses right-closed bins: (0, 6] -> Acid, (6, 8] -> Normal, (8, 14] -> Base.
        self.df['pH_cat'] = pd.cut(self.df['ph'], bins=[0, 6, 8, 14], labels=['Acid' , 'Normal', 'Base'])
        # Disabled experiments, kept for reference:
#         self.df['osmo-to-urea-ratio'] = self.df['osmo']/self.df['urea']
#         self.df['osmo-to-cond-diff'] = self.df['osmo']-self.df['cond']
#         self.df['calc-to-ph-ratio'] = self.df['calc']/self.df['osmo']
#         self.df['osmo-to-urea-diff'] = self.df['osmo']-self.df['urea']
        self.df["ion_product"] = self.df["calc"] * self.df["urea"]
        self.df["calcium_to_urea_ratio"] = self.df["calc"] / self.df["urea"]
        # cond divided by 10**(-ph)
        self.df["electrolyte_balance"] = self.df["cond"] / (10 ** (-self.df["ph"]))
        self.df["osmolality_to_sg_ratio"] = self.df["osmo"] / self.df["gravity"]
        self.df['osmo_density'] = self.df['osmo'] * self.df['gravity']

    def get_df(self):
        """Return the working frame after feature engineering (target column included)."""
        self.feature_engineering()
        return self.df

    def split_target(self):
        """Run feature engineering and set ``self.X`` (and ``self.y`` for training data).

        For training data ``self.X`` is the frame without ``target`` and ``self.y`` the
        target as an int NumPy array. For test data ``self.X`` is the whole working frame.
        """
        print("Split Target")
        if not self.is_test:
            self.feature_engineering()
            self.X = self.df.drop('target', axis=1)
            self.y = self.df['target'].astype(int).to_numpy()
        else:
            self.feature_engineering()
            self.X = self.df
#     def get_X(self):
#         self.split_target()
#         return self.X

    def find_enc_method(self):
        """Return the names of the ``category`` / ``object`` columns of ``self.X``."""
        print("Find Encoding Method")
        self.split_target()
        one_hot_cols = [column for column in self.X.columns if self.X[column].dtype == 'category' or self.X[column].dtype == 'object']
        return one_hot_cols

    def encoding(self):
        """Fit a ``OneHotEncoder`` on the categorical columns.

        Returns
        -------
        tuple of numpy.ndarray
            ``(one_hot_matrix, numeric_matrix)``; the numeric part is returned unscaled.
        """
        print("Encoding")
        one_hot_cols = self.find_enc_method()
        num_cols = [col for col in self.X.columns if col not in one_hot_cols]
        print(one_hot_cols, num_cols)
        X_OHE, X_NUM = self.X[one_hot_cols].copy(), self.X[num_cols].copy()
        # drop='first' removes one level per feature; unseen levels raise at transform time.
        self.OHE = OneHotEncoder(drop='first', handle_unknown='error')
        X_OHE = self.OHE.fit_transform(X_OHE).toarray()
        return X_OHE, X_NUM.to_numpy()

    def scaling(self):
        """Fit a ``StandardScaler`` on the numeric block and build ``self.X_total``.

        ``self.X_total`` has the one-hot columns first, followed by the scaled numeric ones.
        """
        print("Scaling")
        X_OHE, X_num = self.encoding()
        self.SS = StandardScaler()
        X_num = self.SS.fit_transform(X_num)
        self.X_total = np.concatenate((X_OHE, X_num), axis=1)

    def get_encoders(self):
        """Return the fitted ``OneHotEncoder`` (set by ``encoding``, i.e. after ``get_Xy`` on training data)."""
        return self.OHE

    def get_scaler(self):
        """Return the fitted ``StandardScaler`` (set by ``scaling``, i.e. after ``get_Xy`` on training data)."""
        return self.SS

    def get_Xy(self):
        """Return the model inputs.

        Returns
        -------
        (numpy.ndarray, numpy.ndarray)
            ``(X_total, y)`` for training data: encoded + scaled features and int target.
        pandas.DataFrame
            For test data: the feature-engineered frame, not yet encoded or scaled.
        """
        if not self.is_test:
            self.scaling()
            return self.X_total, self.y
        else:
            self.split_target()
            return self.X

# Test frame: feature engineering only; encoding/scaling happens in the next cell
# with the transformers fitted on the training frame.
pre_test = Preprocessing(df_test, is_test=True)
X_test = pre_test.get_Xy()
print('-' * 50)

# Training frame: full pipeline, then keep the fitted transformers for reuse on the test set.
pre_train = Preprocessing(df_train)
X, y = pre_train.get_Xy()
SS, OHE = pre_train.get_scaler(), pre_train.get_encoders()

In [None]:
# Encode and scale the test features with the transformers fitted on the training data.
# The engineered test frame is read from `pre_test.X` instead of `X_test`: this cell rebinds
# `X_test` to a NumPy array, so reading `X_test.columns` would fail when the cell is re-run.
test_features = pre_test.X
one_hot_cols = [column for column in test_features.columns if test_features[column].dtype == 'category' or test_features[column].dtype == 'object']
num_cols = [col for col in test_features.columns if col not in one_hot_cols]
X_OHE = OHE.transform(test_features[one_hot_cols]).toarray()
# The scaler was fitted on a plain array, so it is given a plain array here as well.
X_NUM = SS.transform(test_features[num_cols].to_numpy())
# Same layout as the training matrix: one-hot columns first, scaled numeric columns after.
X_test = np.concatenate((X_OHE, X_NUM), axis=1)

In [None]:
# Class distribution of the training target before any resampling.
label_counts = np.unique(y, return_counts=True)
unique, counts = label_counts
label_counts

In [None]:
from imblearn.over_sampling import SMOTE

def apply_smote(X, y, random_state=42):
    """
    Applies SMOTE to the input features (X) and target variable (y) to balance the dataset.

    Parameters:
    X: numpy array or pandas DataFrame with the input features
    y: numpy array or pandas Series with the target variable
    random_state: int or None, default=42, seed passed to SMOTE; the default keeps the
        behaviour of earlier calls that did not pass a seed

    Returns:
    X_resampled: the resampled input features
    y_resampled: the resampled target variable
    """
    smote = SMOTE(random_state=random_state)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    return X_resampled, y_resampled

# Oversample the training matrix, then show the new shapes and the class counts.
X_resampled, y_resampled = apply_smote(X, y)
for resampled in (X_resampled, y_resampled):
    print(resampled.shape)
unique, counts = np.unique(y_resampled, return_counts=True)
unique, counts

In [None]:
import optuna
from lightgbm import LGBMClassifier
def objective_lgbm(trial):
    """Optuna objective for LightGBM: score one sampled configuration by hold-out ROC AUC.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters. Each Optuna label equals the
        LightGBM keyword it feeds, so ``study.best_trial.params`` can be passed
        straight to ``LGBMClassifier(**params)``.

    Returns
    -------
    float
        ROC AUC on a fixed 20% hold-out of the resampled training data (to be maximised).
    """
    # Local names avoid `X_test` / `y_test` so the hold-out is not confused with the
    # competition test matrix defined at notebook level.
    # NOTE(review): the hold-out is split off after SMOTE, so synthetic points interpolated
    # from hold-out rows can end up in the fit part; the AUC is probably optimistic.
    X_fit, X_hold, y_fit, y_hold = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
    param = {
        'random_state': 42,
        'n_estimators': trial.suggest_categorical("n_estimators", [150, 200, 300, 3000]),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.001, 0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        # LightGBM requires num_leaves > 1, so the search starts at 2.
        'num_leaves' : trial.suggest_int('num_leaves', 2, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        # Label matches the keyword; a different label would put an unknown key in best_trial.params.
        'cat_smooth' : trial.suggest_int('cat_smooth', 1, 100)
    }
    model = LGBMClassifier(**param)
    model.fit(X_fit, y_fit, early_stopping_rounds=100, eval_set=[(X_hold, y_hold)], verbose=False)
    preds = model.predict_proba(X_hold)[:, 1]
    auc_score = roc_auc_score(y_hold, preds)

    return auc_score

# study = optuna.create_study(direction='maximize')
# study.optimize(objective_lgbm, n_trials=50)
# params_lgbm = study.best_trial.params
# print('Number of finished trials:', len(study.trials))
# print('Best trial:', params_lgbm)

In [None]:
# Observation: with a higher number of features, LGBM performance decreases.
# The tuned value 86 belongs to LightGBM's `cat_smooth`; `min_data_per_groups` is not a
# LightGBM parameter, so it is passed under the keyword the model actually understands.
params = {'n_estimators': 150, 'reg_alpha': 0.0020019026674570717, 'reg_lambda': 0.018189202721399202, 
          'colsample_bytree': 0.4, 'subsample': 0.6, 'learning_rate': 0.008, 
          'max_depth': 10, 'num_leaves': 776, 'min_child_samples': 1, 'cat_smooth': 86}
# Fixed 80/20 hold-out of the (non-resampled) training matrix; it also drives early stopping.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
lgbm_model = LGBMClassifier(**params)
lgbm_model.fit(X_train, y_train, early_stopping_rounds=50, eval_set=[(X_valid, y_valid)], verbose=False)   
y_pred = lgbm_model.predict_proba(X_valid)[:, 1]
# `y_test` holds the LightGBM probabilities for the competition test set (used in the final blend).
y_test = lgbm_model.predict_proba(X_test)[:, 1]
print(f"Auc: {round(roc_auc_score(y_valid,y_pred), 2)}")

In [None]:
import time
def cross_validate_model(model, X_test, n_splits=10, X=None, y=None):
    """Stratified K-fold evaluation that also averages the test-set predictions over folds.

    Parameters
    ----------
    model : estimator
        Classifier whose ``fit`` accepts ``early_stopping_rounds``, ``eval_set`` and
        ``verbose`` and which provides ``predict_proba``. It is refitted on every fold.
    X_test : numpy.ndarray
        Feature matrix to predict with each fold's model.
    n_splits : int, default 10
        Number of stratified folds (shuffled, seed 42).
    X, y : numpy.ndarray, optional
        Training features and labels. When omitted, the notebook-level
        ``X_resampled`` / ``y_resampled`` are used.

    Returns
    -------
    aucs : list of float
        Validation ROC AUC of every fold.
    test_pred : numpy.ndarray
        Positive-class probability for ``X_test``, averaged across the folds.
    """
    X = X_resampled if X is None else X
    y = y_resampled if y is None else y
    KF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    aucs = []
    test_preds = []
    for index, (train_index, val_index) in enumerate(KF.split(X, y)):
        print(f"Fold {index+1} out of {n_splits}")
        start = time.time()
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]
        model.fit(X_train, y_train, early_stopping_rounds=50, eval_set=[(X_val, y_val)], verbose=False)
        y_pred = model.predict_proba(X_val)[:, 1]
        # Keep every fold's test prediction so they can be averaged after the loop.
        test_preds.append(model.predict_proba(X_test)[:, 1])
        auc = roc_auc_score(y_val, y_pred)
        print(f"Auc: {round(auc, 2)}")

        aucs.append(auc)
        end = time.time()
        print(f'This Fold {index+1}, took {end - start} seconds.')

    # Mean over folds; dividing only the last fold's prediction by n_splits would shrink
    # the probabilities instead of averaging them.
    return aucs, np.mean(test_preds, axis=0)
# aucs, y_test = cross_validate_model(LGBMClassifier(**params), X_test=X_test)

In [None]:
# submission = pd.read_csv('/kaggle/input/playground-series-s3e12/sample_submission.csv')
# submission['target'] = y_test
# submission.to_csv('submission.csv', index = False)

In [None]:
import optuna
from xgboost import XGBClassifier
def objective_xgb(trial):
    """Optuna objective for XGBoost: hold-out ROC AUC of one sampled configuration.

    Takes an Optuna ``trial``, samples the XGBoost hyper-parameters from it, fits on 80%
    of the resampled training data with early stopping on the remaining 20%, and returns
    the ROC AUC on that 20%.
    """
    X_fit, X_hold, y_fit, y_hold = train_test_split(
        X_resampled, y_resampled, test_size=0.2, random_state=42
    )

    # Built key by key, in the same sampling order as before.
    param = {}
    param['lambda'] = trial.suggest_loguniform('lambda', 1e-3, 10.0)
    param['alpha'] = trial.suggest_loguniform('alpha', 1e-3, 10.0)
    param['colsample_bytree'] = trial.suggest_categorical('colsample_bytree', [0.5,0.6,0.7,0.8,0.9,1.0])
    param['subsample'] = trial.suggest_categorical('subsample', [0.6,0.7,0.8,1.0])
    param['learning_rate'] = trial.suggest_categorical('learning_rate', [0.008,0.009,0.01,0.012,0.014,0.016,0.018, 0.02])
    param['n_estimators'] = trial.suggest_categorical("n_estimators", [150, 200, 300, 3000])
    param['max_depth'] = trial.suggest_categorical('max_depth', [4,5,7,9,11,13,15,17])
    param['random_state'] = 42
    param['min_child_weight'] = trial.suggest_int('min_child_weight', 1, 300)

    booster = XGBClassifier(**param)
    booster.fit(
        X_fit, y_fit,
        early_stopping_rounds=100,
        eval_set=[(X_hold, y_hold)],
        verbose=False,
    )
    hold_proba = booster.predict_proba(X_hold)[:, 1]
    return roc_auc_score(y_hold, hold_proba)

# study = optuna.create_study(direction='maximize')
# study.optimize(objective_xgb, n_trials=50)
# params_xgb = study.best_trial.params
# print('Number of finished trials:', len(study.trials))
# print('Best trial:', params_xgb)

In [None]:
# Observation: XGB accuracy increases with more features.
params_xgb = {
    'lambda': 0.2147524663546028,
    'alpha': 0.5750492421465946,
    'colsample_bytree': 0.7,
    'subsample': 0.6,
    'learning_rate': 0.008,
    'n_estimators': 300,
    'max_depth': 15,
    'min_child_weight': 1,
}
# Fixed 80/20 hold-out of the resampled data; the hold-out also drives early stopping.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42
)
model = XGBClassifier(**params_xgb)
model.fit(
    X_train, y_train,
    early_stopping_rounds=50,
    eval_set=[(X_valid, y_valid)],
    verbose=False,
)
y_pred = model.predict_proba(X_valid)[:, 1]
# `y_test2` holds the XGBoost probabilities for the competition test set.
y_test2 = model.predict_proba(X_test)[:, 1]
valid_auc = roc_auc_score(y_valid, y_pred)
print(f"Auc: {round(valid_auc, 2)}")

In [None]:

# Blend the two models with an equal-weight average of their test-set probabilities.
submission = pd.read_csv('/kaggle/input/playground-series-s3e12/sample_submission.csv')
# Stack as (2, n_rows) and average over axis 0. With an extra pair of brackets the array is
# (1, 2, n_rows): the mean runs over the length-1 axis and `[0]` returns `y_test` unchanged.
submission['target'] = np.mean([y_test, y_test2], axis=0)
submission.to_csv('submission.csv', index = False)