# Kütüphanelerin İçeri Aktarılması

In [2]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


import warnings
warnings.filterwarnings('ignore')

# Verilerin Yüklenmesi

In [3]:
# Veri yükleme
data = pd.read_csv("/kaggle/input/playground-series-s4e2/train.csv", index_col="id")
test_data = pd.read_csv("/kaggle/input/playground-series-s4e2/test.csv", index_col="id")
submission = pd.read_csv("/kaggle/input/playground-series-s4e2/sample_submission.csv")

# Özellik Mühendisliği
- Elimizdeki özellikler ile yeni özellikler oluşturacağız.
- Test veri setinde "CALC" stünunun içersinde eğitim veri setinin içersinde olmayan "Always" kategorisi bulunuyor. Sorunun önüne geçebilmek adına "Always" kategorisini "Frequently" kategorisine dahil edeceğim.

In [4]:
def new_features(df):
    
    """ BMI (Body Mass Index) Hesaplama ve Kategorilendirme """
    df['BMI'] = df['Weight'] / (df['Height'] ** 2)
    df['BMI_Category'] = pd.cut(df['BMI'], bins=[0, 18.5, 25, 30, np.inf], labels=["Underweight", "Normal", "Overweight", "Obese"])

    """ Yaş Kategorileri """
    df['Age_Category'] = pd.cut(df['Age'], bins=[0, 18, 30, 50, np.inf], labels=['Teen', 'Young_Adult', 'Adult', 'Senior'])

    """ Sağlıklı Yaşam Skoru """
    df['Healthy_Lifestyle_Score'] = (df['FCVC'] + df['NCP'] + df['CH2O'] + df['FAF']) / 4

    """ Aktif ve Pasif Ulaşım Modları """
    df['Active_Transport'] = df['MTRANS'].apply(lambda x: 1 if x in ['Walking', 'Bike'] else 0)
    df['Passive_Transport'] = df['MTRANS'].apply(lambda x: 1 if x in ['Automobile', 'Motorbike'] else 0)

    """ Toplam Su Tüketimi """
    df['Total_Water_Intake'] = df['CH2O'] * df['FAF']
    
    return df

In [5]:
data = new_features(data)
test_data = new_features(test_data)

In [6]:
test_data["CALC"].replace({"Always": "Frequently"}, inplace=True)

# Label Encoding ve Get Dummies
- Hedef değişkenimizi numerik hale çevireceğiz
- İçersinde 2'den az veri bulunan kategorik stünları label encoder ile 2 den fazla veri bulunan stünları da get dummies ile dönüştüştüreceğiz.

In [7]:
# Hedef Değişkenin Dönüştürülmesi
data["NObeyesdad"].replace({
    "Insufficient_Weight": 0,
    "Normal_Weight": 1,
    "Overweight_Level_I": 2,
    "Overweight_Level_II": 3,
    "Obesity_Type_I": 4,
    "Obesity_Type_II": 5,
    "Obesity_Type_III": 6,
}, inplace=True)

In [8]:
# Kategorik Değişkenlerin Dönüştürülmesi
def transform_categorical_values(df):
    le = LabelEncoder()
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
    
    # Label ve One-Hot Encoding
    label_columns = df[categorical_columns].nunique()[df[categorical_columns].nunique() == 2].index.tolist()
    df[label_columns] = df[label_columns].apply(lambda col: le.fit_transform(col))
    
    one_hot_columns = df[categorical_columns].nunique()[df[categorical_columns].nunique() > 2].index.tolist()
    df = pd.get_dummies(df, columns=one_hot_columns, drop_first=True)
    
    return df

In [9]:
data = transform_categorical_values(data)
test_data = transform_categorical_values(test_data)

# Eğitim ve Test Seti Oluşturma
- Hedef değişkeni data veri seti dışında tutup eğitim verilerini x'e atayacağım ardından sadece bağımlı değişkeni sadece y'ye atayacağım.
- Train Test Split ile test_size ve random_state'i belirleyip eğitim ve test setlerini ayıracağım.

In [10]:
# Veri setini hazırlama
x = data.drop("NObeyesdad", axis=1)
y = data["NObeyesdad"]

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

# Verilerin boyutlarını kontrol etme
print(f"Eğitim seti boyutu: {x_train.shape}")
print(f"Test seti boyutu: {x_test.shape}")

Eğitim seti boyutu: (16606, 33)
Test seti boyutu: (4152, 33)


# OPTUNA
- Optuna ile CATBOOST, XGBOOST, LIGHTGBM, GRADIENTBOOSTING, RANDOMFOREST modelleri için en uygun parametreleri bulmaya çalışacağım

## 1. CatBoostClassifier

In [12]:
#def objective(trial):
#    # Hiperparametre aralıklarını tanımlama
#    param = {
#        'iterations': trial.suggest_int('iterations', 100, 2000),
#        'depth': trial.suggest_int('depth', 2, 10),
#        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
#        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.01, 10),
#        'border_count': trial.suggest_int('border_count', 32, 255),
#        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
#        'random_strength': trial.suggest_float('random_strength', 0.001, 1.0),
#        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
#        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 20)
#    }
#
#    # 'max_leaves' sadece 'Lossguide' grow_policy için geçerli
#    if param['grow_policy'] == 'Lossguide':
#        param['max_leaves'] = trial.suggest_int('max_leaves', 31, 64)
#
#    # CatBoostClassifier modelini oluşturma ve eğitme
#    model = CatBoostClassifier(**param, verbose=0, early_stopping_rounds=100)
#    model.fit(x_train, y_train, eval_set=[(x_test, y_test)], verbose=0)
#
#    # Modelin doğruluğunu hesaplama
#    preds = model.predict(x_test)
#    accuracy = accuracy_score(y_test, preds)
#    return accuracy
#
## Optuna study nesnesi oluşturma ve optimizasyon başlatma
#study = optuna.create_study(direction='maximize')
#study.optimize(objective, n_trials=2005)

#### Optuna ile CatBoost parametre aramasının sonucunda şunlara eriştim
- [I 2024-02-22 05:48:58,400] *Trial 871 finished with value:* **0.9135356454720617** *and parameters:* {'iterations': 1355, 'depth': 5, 'learning_rate': 0.1096384174794031, 'l2_leaf_reg': 5.010224281972051, 'border_count': 240, 'bagging_temperature': 0.10565271312223717, 'random_strength': 0.3295302174116903, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 16}. **Best is trial 871 with value: 0.9135356454720617.**

## 2. XGBClassifier

In [13]:
#def objective_xgboost(trial):
#    # Hiperparametre aralıklarını tanımlama
#    param = {
#        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
#        'max_depth': trial.suggest_int('max_depth', 3, 20),
#        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
#        'subsample': trial.suggest_float('subsample', 0.20, 1.0),
#        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.20, 1.0),
#        'gamma': trial.suggest_float('gamma', 1e-10, 1.0),
#        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 5),
#        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 5),
#        'min_child_weight': trial.suggest_int('min_child_weight', 1, 30),
#        
#
#    }
#
#    # XGBoost modelini oluşturma ve eğitme
#    model = XGBClassifier(**param, early_stopping_rounds=200, random_state=42)
#    model.fit(x_train, y_train, eval_set=[(x_test, y_test)], verbose=0)
#
#    # Modelin doğruluğunu hesaplama
#    preds = model.predict(x_test)
#    accuracy = accuracy_score(y_test, preds)
#    return accuracy
#
## Optuna study nesnesi oluşturma ve optimizasyon başlatma
#study_xgboost = optuna.create_study(direction='maximize')
#study_xgboost.optimize(objective_xgboost, n_trials=2005)

#### Optuna ile XGBoost parametre aramasının sonucunda şunlara eriştim
- [I 2024-02-22 14:27:03,497] *Trial 1033 finished with value:* **0.914980732177264** *and parameters:* {'n_estimators': 1653, 'max_depth': 3, 'learning_rate': 0.08346083239445595, 'subsample': 0.9473612065571799, 'colsample_bytree': 0.2727334724076542, 'gamma': 0.035093546773850515, 'reg_alpha': 2.0084596983174867, 'reg_lambda': 4.700706962671846, 'min_child_weight': 1}. **Best is trial 1033 with value: 0.914980732177264.**

## 3. LGBMClassifier

In [14]:
#def objective_lightgbm(trial):
#    param = {
#        'objective': 'multiclass',
#        'metric': 'multi_logloss',
#        'num_class': len(set(y_train)),  # Sınıf sayısını belirtin
#        'verbosity': -1,
#        'boosting_type': 'gbdt',
#        'lambda_l1': trial.suggest_float('lambda_l1', 1e-10, 20.0, log=True),
#        'lambda_l2': trial.suggest_float('lambda_l2', 1e-10, 20.0, log=True),
#        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
#        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
#        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
#        'bagging_freq': trial.suggest_int('bagging_freq', 0.001, 10),
#        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
#        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3),
#        'max_depth': trial.suggest_int('max_depth', -1, 100),  # -1 means no limit
#        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0, 50),
#        'min_child_weight': trial.suggest_float('min_child_weight', 1e-7, 1e-5, log=True),
#        'min_sum_hessian_in_leaf': trial.suggest_float('min_sum_hessian_in_leaf', 1e-8, 1e-5, log=True)
#    }
#
#    # Create and train LightGBM model
#    model = LGBMClassifier(**param, early_stopping_rounds=100, random_state=42, verbose=0)
#    model.fit(x_train, y_train, eval_set=[(x_test, y_test)])
#
#    # Evaluate model performance
#    preds = model.predict(x_test)
#    accuracy = accuracy_score(y_test, preds)
#    return accuracy
#
## Optuna study nesnesi oluşturma ve optimizasyon başlatma
#study_lightgbm = optuna.create_study(direction='maximize')
#study_lightgbm.optimize(objective_lightgbm, n_trials=4000)

#### Optuna ile LGBMClassifier parametre aramasının sonucunda şunlara eriştim
- [I 2024-02-22 13:37:47,196] *Trial 3253 finished with value:* **0.9123314065510597** *and parameters:* {'lambda_l1': 0.0034407365453225976, 'lambda_l2': 0.14618731005541508, 'num_leaves': 19, 'feature_fraction': 0.40031517145679507, 'bagging_fraction': 0.6394939715840073, 'bagging_freq': 10, 'min_child_samples': 1, 'learning_rate': 0.10868460654536086, 'max_depth': 88, 'min_gain_to_split': 0.015546065137711812, 'min_child_weight': 2.6133446037573905e-07, 'min_sum_hessian_in_leaf': 6.347168996577988e-06}. **Best is trial 3253 with value: 0.9123314065510597.**

## 4. GradientBoostingClassifier

In [15]:
#def objective_gradient_boosting(trial):
#    param = {
#        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
#        'max_depth': trial.suggest_int('max_depth', 3, 15),
#        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
#        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#        'min_samples_split': trial.suggest_int('min_samples_split', 2, 100),
#        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 50),
#        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
#        'validation_fraction': trial.suggest_float('validation_fraction', 0.1, 0.3),
#        'n_iter_no_change': trial.suggest_int('n_iter_no_change', 5, 20),
#        'tol': trial.suggest_float('tol', 1e-5, 1e-3, log=True),
#        'random_state': 42
#    }
#
#    # GradientBoostingClassifier modelini oluşturma ve eğitme
#    model = GradientBoostingClassifier(**param)
#    model.fit(x_train, y_train)
#
#    # Modelin doğruluğunu hesaplama
#    preds = model.predict(x_test)
#    accuracy = accuracy_score(y_test, preds)
#    return accuracy
#
## Optuna study nesnesi oluşturma ve optimizasyon başlatma
#study_gradient_boosting = optuna.create_study(direction='maximize')
#study_gradient_boosting.optimize(objective_gradient_boosting, n_trials=2005)

#### Optuna ile GradientBoostingClassifier parametre aramasının sonucunda şunlara eriştim
- [I 2024-02-22 04:21:59,956] *Trial 220 finished with value:* **0.9104046242774566** *and parameters:* {'n_estimators': 653, 'max_depth': 7, 'learning_rate': 0.01630624786154065, 'subsample': 0.9267674351619599, 'min_samples_split': 83, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'validation_fraction': 0.15505642008587056, 'n_iter_no_change': 7, 'tol': 1.1030772570725336e-05}. **Best is trial 220 with value: 0.9104046242774566.**

## 5. RandomForestClassifier

In [16]:
#from sklearn.ensemble import RandomForestClassifier
#
#def objective_random_forest(trial):
#    param = {
#        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
#        'max_depth': trial.suggest_int('max_depth', 5, 100),
#        'min_samples_split': trial.suggest_int('min_samples_split', 2, 50),
#        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
#        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
#        'class_weight': trial.suggest_categorical('class_weight', ['balanced', 'balanced_subsample', None]),
#        'random_state': 42
#    }
#
#    # RandomForest modelini oluşturma ve eğitme
#    model = RandomForestClassifier(**param)
#    model.fit(x_train, y_train)
#
#    # Modelin doğruluğunu hesaplama
#    preds = model.predict(x_test)
#    accuracy = accuracy_score(y_test, preds)
#    return accuracy
#
## Optuna study nesnesi oluşturma ve optimizasyon başlatma
#study_random_forest = optuna.create_study(direction='maximize')
#study_random_forest.optimize(objective_random_forest, n_trials=1000)

#### Optuna ile RandomForestClassifier parametre aramasının sonucunda şunlara eriştim
- [I 2024-02-22 08:38:06,226] *Trial 468 finished with value:* **0.901252408477842** *and parameters:* {'n_estimators': 1596, 'max_depth': 92, 'min_samples_split': 12, 'min_samples_leaf': 1, 'bootstrap': False, 'class_weight': 'balanced'}. **Best is trial 468 with value: 0.901252408477842.**

## Bulduğum en iyi parametreler ile modelleri oluşturma

In [17]:
#CATBOOST         :{'iterations': 1819, 'depth': 5, 'learning_rate': 0.1014969073551741, 'l2_leaf_reg': 5.34593664307217, 'border_count': 164, 'bagging_temperature': 0.8408771489965642, 'random_strength': 0.7008891194758375, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 16}. Best is trial 205 with value: 0.9135356454720617.
#XGBOOST          :{'n_estimators': 1653, 'max_depth': 3, 'learning_rate': 0.08346083239445595, 'subsample': 0.9473612065571799, 'colsample_bytree': 0.2727334724076542, 'gamma': 0.035093546773850515, 'reg_alpha': 2.0084596983174867, 'reg_lambda': 4.700706962671846, 'min_child_weight': 1}. Best is trial 1033 with value: 0.914980732177264.
#LIGHTGBM         :{'lambda_l1': 0.0034407365453225976, 'lambda_l2': 0.14618731005541508, 'num_leaves': 19, 'feature_fraction': 0.40031517145679507, 'bagging_fraction': 0.6394939715840073, 'bagging_freq': 10, 'min_child_samples': 1, 'learning_rate': 0.10868460654536086, 'max_depth': 88, 'min_gain_to_split': 0.015546065137711812, 'min_child_weight': 2.6133446037573905e-07, 'min_sum_hessian_in_leaf': 6.347168996577988e-06}. Best is trial 3253 with value: 0.9123314065510597.
#GRADIENTBOOSTING :{'n_estimators': 653, 'max_depth': 7, 'learning_rate': 0.01630624786154065, 'subsample': 0.9267674351619599, 'min_samples_split': 83, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'validation_fraction': 0.15505642008587056, 'n_iter_no_change': 7, 'tol': 1.1030772570725336e-05}. Best is trial 220 with value: 0.9104046242774566.
#RANDOMFOREST     :{'n_estimators': 1596, 'max_depth': 92, 'min_samples_split': 12, 'min_samples_leaf': 1, 'bootstrap': False, 'class_weight': 'balanced'}. Best is trial 972 with value: 0.901252408477842.

In [18]:
catboost_model = CatBoostClassifier(
    iterations=1819, depth=5, learning_rate=0.1014969073551741, l2_leaf_reg=5.34593664307217, 
    border_count=164, bagging_temperature=0.8408771489965642, random_strength=0.7008891194758375, grow_policy='Depthwise',
    min_data_in_leaf=16, verbose=0)

xgboost_model = XGBClassifier(
    n_estimators=1653, max_depth=3, learning_rate=0.08346083239445595, subsample=0.9473612065571799, 
    colsample_bytree=0.2727334724076542, gamma=0.035093546773850515, reg_alpha=2.0084596983174867, 
    reg_lambda=4.700706962671846, min_child_weight= 1, verbosity=0)

lightgbm_model = LGBMClassifier(
    lambda_l1=0.0034407365453225976, lambda_l2=0.14618731005541508, num_leaves=19, 
    feature_fraction=0.40031517145679507, bagging_fraction=0.6394939715840073, bagging_freq=10, 
    min_child_samples=1, learning_rate=0.10868460654536086, max_depth=88, min_gain_to_split=0.015546065137711812, 
    min_child_weight=2.6133446037573905e-07, min_sum_hessian_in_leaf=6.347168996577988e-06, verbose=-1)

gradient_boosting_model = GradientBoostingClassifier(
    n_estimators=653, max_depth=7, learning_rate=0.01630624786154065, subsample=0.9267674351619599, 
    min_samples_split=83, min_samples_leaf=10, max_features='sqrt',validation_fraction=0.15505642008587056, n_iter_no_change=7,
    tol=1.1030772570725336e-05)

random_forest_model = RandomForestClassifier(
    n_estimators=930, max_depth=326, min_samples_split=7, min_samples_leaf=1, bootstrap=True, class_weight=None
)

random_forest_model = RandomForestClassifier(
    n_estimators=1596, max_depth=92, min_samples_split=12, min_samples_leaf=1, bootstrap=False, class_weight='balanced')

## Oluştruduğumuz Modelleri Tek Tek Deneyelim

In [19]:
# Modelleri eğitme ve test veri seti üzerinde değerlendirme
def train_and_evaluate(model, x_train, y_train, x_test, y_test):
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    accuracy = accuracy_score(y_test, preds)
    return accuracy

# Oluşturduğunuz modelleri bir listeye ekleyin
models = [
    ('CatBoost', catboost_model),
    ('XGBoost', xgboost_model),
    ('LightGBM', lightgbm_model),
    ('Gradient Boosting', gradient_boosting_model),
    ('Random Forest',random_forest_model)
]

# Her model için eğitim ve değerlendirme yapın
for name, model in models:
    accuracy = train_and_evaluate(model, x_train, y_train, x_test, y_test)
    print(f"{name} Model Doğruluk: {accuracy:.4f}")

CatBoost Model Doğruluk: 0.9116
XGBoost Model Doğruluk: 0.9104
LightGBM Model Doğruluk: 0.9061
Gradient Boosting Model Doğruluk: 0.9058
Random Forest Model Doğruluk: 0.8986


# Voting Classifier
- Oluşturduğumuz modellerin gücünü Voting Classifier ile birleştirerek skoru daha da arttırmaya çalışacağım

In [20]:
from sklearn.ensemble import VotingClassifier

# Oluşturulan modelleri voting classifier'a eklemek
voting_clf = VotingClassifier(
    estimators=[
        ('catboost', catboost_model),
        ('xgboost', xgboost_model),
        #('lightgbm', lightgbm_model),
        #('random_forest', random_forest_model),
        #('gradient_boosting', gradient_boosting_model)
    ],
    voting='soft' # Soft voting kullanılıyor
)

# Voting classifier modelini eğitmek
voting_clf.fit(x_train, y_train)

In [21]:
#catboost+xgboost+lightgbm+gradient_boosting                SOFT = 0.9097
#catboost+xgboost+lightgbm+gradient_boosting+randomforest   SOFT = 0.9092
#catboost+xgboost+lightgbm+randomforest                     SOFT = 0.9094
#catboost+xgboost+lightgbm                                  SOFT = 0.9111
#catboost+xgboost                                           SOFT = 0.9135
#catboost                                                   SOFT = 0.9116
#catboost+lightgbm                                          SOFT = 0.9109
#xgboost+lightgbm                                           SOFT = 0.9098
#xgboost                                                    SOFT = 0.9104
#catboost+xgboost+gradient_boosting                         SOFT = 0.9094
#catboost+xgboost+randomforest                              SOFT = 0.9106
#catboost+randomforest                                      SOFT = 0.9109
#catboost+lightgbm+randomforest                             SOFT = 0.9075

accuracy = voting_clf.score(x_test, y_test) 
print(f"Voting Ensemble Model Doğruluk: {accuracy:.4f}")

Voting Ensemble Model Doğruluk: 0.9135


#### - En yüksek skoru catboost ve xgboost kombinasyonu ile aldım

# Submission Dosyasının Hazırlanması
- Test veri setini oluşturduğum voting_clf modeli ile tahmin edeceğim.
- Tahmin edilen stünü istenen formata dönüştüreceğim.
- Submission dosyasını kaydedeceğim. 

In [22]:
# Test veri seti üzerinde tahmin yapma
final_preds = voting_clf.predict(test_data)

In [23]:
# Sonuçları submission dosyasına yazma
submission["NObeyesdad"] = final_preds

In [24]:
# Tahminleri hedef değişkenin orijinal etiketlerine dönüştürme
submission["NObeyesdad"] = submission["NObeyesdad"].replace({
    0: "Insufficient_Weight",
    1: "Normal_Weight",
    2: "Overweight_Level_I",
    3: "Overweight_Level_II",
    4: "Obesity_Type_I",
    5: "Obesity_Type_II",
    6: "Obesity_Type_III"
})

In [25]:
submission.to_csv('submission.csv', index=False)