In [1]:
# Load libraries
import warnings
from tqdm import tqdm
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

# Suppress warnings
warnings.filterwarnings("ignore", category=FutureWarning)


In [2]:
# Read the two competition splits and the additional survey dataset
train_df = pd.read_csv("/kaggle/input/playground-series-s4e11/train.csv")
test_df = pd.read_csv("/kaggle/input/playground-series-s4e11/test.csv")
original_df = pd.read_csv("/kaggle/input/depression-surveydataset-for-analysis/final_depression_dataset_1.csv")

# Keep the test ids aside: the column is dropped from the features below
# but is needed again when the submission file is written.
test_ids = test_df["id"]

# The survey dataset stores the label as "Yes"/"No"; recode it to 1/0 so it
# can be stacked with the competition training rows.
original_df = original_df.assign(
    Depression=original_df["Depression"].map({"Yes": 1, "No": 0})
)

# Stack both labelled sources into one training frame and remove the id
# column from the training and test frames.
full_train = pd.concat([train_df, original_df], ignore_index=True).drop(columns=["id"])
test_df = test_df.drop(columns=["id"])


### Feature Engineering

In this section of the code, we create additional features for the dataset to enhance the predictive power of machine learning models. These new features are designed based on existing columns by combining them in meaningful ways or applying mathematical operations. Here's a breakdown of the newly engineered features:

#### Feature Creation

1. **satisfaction_by_work**  
   The ratio of `Work Pressure` to `Job Satisfaction`.  
   Formula:  
   `Satisfaction by Work = Work Pressure / (Job Satisfaction + 1e-8)`  
   The small constant `1e-8` is added to avoid division by zero.

2. **satisfaction_by_study**  
   The ratio of `Academic Pressure` to `Study Satisfaction`.  
   Formula:  
   `Satisfaction by Study = Academic Pressure / (Study Satisfaction + 1e-8)`

3. **age_work_satisfaction**  
   The ratio of `Age` to `Job Satisfaction`.  
   Formula:  
   `Age to Work Satisfaction = Age / (Job Satisfaction + 1e-8)`

4. **cgpa_study**  
   The ratio of `CGPA` to `Academic Pressure`.  
   Formula:  
   `CGPA to Study Pressure = CGPA / (Academic Pressure + 1e-8)`

5. **work_to_financial_stress_ratio**  
   The ratio of `Work Pressure` to `Financial Stress`.  
   Formula:  
   `Work to Financial Stress Ratio = Work Pressure / (Financial Stress + 1e-8)`

6. **academic_to_financial_stress_ratio**  
   The ratio of `Academic Pressure` to `Financial Stress`.  
   Formula:  
   `Academic to Financial Stress Ratio = Academic Pressure / (Financial Stress + 1e-8)`

7. **normalized_work_stress**  
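   A normalized measure of `Work Pressure` relative to `Job Satisfaction`. It is computed with the same formula as `satisfaction_by_work` (feature 1), so the two columns hold identical values.  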
   Formula:  
   `Normalized Work Stress = Work Pressure / (Job Satisfaction + 1e-8)`

8. **normalized_academic_stress**  
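   A normalized measure of `Academic Pressure` relative to `Study Satisfaction`. It is computed with the same formula as `satisfaction_by_study` (feature 2), so the two columns hold identical values.  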
   Formula:  
   `Normalized Academic Stress = Academic Pressure / (Study Satisfaction + 1e-8)`

9. **age_cgpa_interaction**  
   The interaction between `Age` and `CGPA`. This could capture patterns where both age and academic performance (CGPA) together influence the target variable.  
   Formula:  
   `Age CGPA Interaction = Age * CGPA`

10. **total_satisfaction**  
   The sum of `Study Satisfaction` and `Job Satisfaction`.  
   Formula:  
   `Total Satisfaction = Study Satisfaction + Job Satisfaction`

11. **total_stress**  
   The sum of `Academic Pressure` and `Work Pressure`.  
   Formula:  
   `Total Stress = Academic Pressure + Work Pressure`

12. **is_profession_missing**  
   A binary feature indicating if the `Profession` column has a missing value (NaN).  
   Formula:  
   `Is Profession Missing = Profession.isna().astype(int)`

13. **is_cgpa_missing**  
   A binary feature indicating if the `CGPA` column has a missing value (NaN).  
   Formula:  
   `Is CGPA Missing = CGPA.isna().astype(int)`


In [3]:
# Feature engineering
def new_feats(df):
    """Return a new frame: ``df`` plus thirteen engineered columns.

    The input frame is not modified. The added columns are, in order:
    six pressure/satisfaction/stress ratios, two "normalized" stress
    ratios, one product, two sums and two missing-value indicators.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Work Pressure", "Job Satisfaction",
        "Academic Pressure", "Study Satisfaction", "Age", "CGPA",
        "Financial Stress" and "Profession".

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by the engineered ones.

    Notes
    -----
    ``normalized_work_stress`` and ``normalized_academic_stress`` are computed
    with exactly the same expressions as ``satisfaction_by_work`` and
    ``satisfaction_by_study`` respectively, so each pair holds equal values.
    NaN inputs propagate through the ratios, the product and the sums.
    """
    eps = 1e-8  # added to every denominator so a zero value does not divide by zero

    work_pressure = df["Work Pressure"]
    academic_pressure = df["Academic Pressure"]
    job_satisfaction = df["Job Satisfaction"]
    study_satisfaction = df["Study Satisfaction"]
    financial_stress = df["Financial Stress"]
    age = df["Age"]
    cgpa = df["CGPA"]

    # These two ratios are each emitted under two different column names.
    work_per_job_satisfaction = work_pressure / (job_satisfaction + eps)
    academic_per_study_satisfaction = academic_pressure / (study_satisfaction + eps)

    # Insertion order of this dict fixes the order of the new columns.
    engineered = {
        "satisfaction_by_work": work_per_job_satisfaction,
        "satisfaction_by_study": academic_per_study_satisfaction,
        "age_work_satisfaction": age / (job_satisfaction + eps),
        "cgpa_study": cgpa / (academic_pressure + eps),
        "work_to_financial_stress_ratio": work_pressure / (financial_stress + eps),
        "academic_to_financial_stress_ratio": academic_pressure / (financial_stress + eps),
        "normalized_work_stress": work_per_job_satisfaction,
        "normalized_academic_stress": academic_per_study_satisfaction,
        "age_cgpa_interaction": age * cgpa,
        "total_satisfaction": study_satisfaction + job_satisfaction,
        "total_stress": academic_pressure + work_pressure,
        "is_profession_missing": df["Profession"].isna().astype(int),
        "is_cgpa_missing": cgpa.isna().astype(int),
    }
    return df.assign(**engineered)

# Engineer the same features on both frames; .copy() hands each name its own
# independent frame object.
full_train, test_df = (new_feats(frame).copy() for frame in (full_train, test_df))


### Code Explanation: Handling Missing Values and Encoding Categorical Features

The code below performs two main tasks:

1. **Identifying Numerical and Categorical Features**
2. **Handling Missing Values and Encoding Categorical Data**

#### 1. **Identifying Numerical and Categorical Features**

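```python
num_feats = full_train.select_dtypes(include="float64").columns
obj_feats = full_train.select_dtypes(include="object").columns
target = "Depression"
```

Columns of dtype `float64` are treated as numerical features and columns of dtype `object` as categorical features.

#### 2. **Handling Missing Values and Encoding Categorical Data**

Each categorical column is label-encoded with a `LabelEncoder` fitted on the combined train and test values (cast to strings), and missing values in the numerical columns are replaced with the column mean of the training data.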


In [4]:
# Define numerical and categorical features
target = "Depression"
# Exclude the label from the imputation list: it does not exist in test_df and
# must never be mean-filled, even if it happens to be stored as float64.
num_feats = full_train.select_dtypes(include="float64").columns.drop(target, errors="ignore")
obj_feats = full_train.select_dtypes(include="object").columns

# Encode categorical data.
# Train and test go through the SAME string conversion and the SAME fitted
# encoder, so a missing value (which astype(str) turns into "nan") receives the
# same integer code in both frames. The encoder is fitted on the union of the
# two frames, therefore every test value is a known class and transform()
# cannot fail on unseen labels.
for col in obj_feats:
    le = LabelEncoder()
    train_as_str = full_train[col].astype(str)
    test_as_str = test_df[col].astype(str)
    le.fit(pd.concat([train_as_str, test_as_str], axis=0))
    full_train[col] = le.transform(train_as_str)
    test_df[col] = le.transform(test_as_str)

# Handle missing numerical values: fill both frames with the training mean,
# computed once per column so the test frame uses the identical statistic.
for col in num_feats:
    train_mean = full_train[col].mean()
    full_train[col] = full_train[col].fillna(train_mean)
    test_df[col] = test_df[col].fillna(train_mean)


In [5]:
# Separate the training frame into the feature matrix X and the label vector y
X = full_train.drop(columns=["Depression"])
y = full_train["Depression"]

# Cross-validation layout for the repeated stacking:
# folds per repetition, and number of repetitions
n_splits, n_repeats = 10, 5

# Hyperparameters passed to CatBoostClassifier for the CatBoost base model
cat_params = dict(
    iterations=715,
    learning_rate=0.05009420761428966,
    rsm=0.5859169200239407,
    subsample=0.7705184727295318,
    min_data_in_leaf=30,
    depth=7,
    l2_leaf_reg=0.004379496536587387,
    random_strength=0.4519161767798322,
    bootstrap_type="Bernoulli",
    loss_function="Logloss",
    random_seed=42,
    verbose=False,
)

# Hyperparameters passed to XGBClassifier for the XGBoost base model
xgb_params = dict(
    n_estimators=190,
    learning_rate=0.09496932234009307,
    max_depth=9,
    min_child_weight=10,
    subsample=0.9433525544556154,
    colsample_bytree=0.986782619688853,
    colsample_bynode=0.933054684872868,
    colsample_bylevel=0.7217799408248594,
    reg_lambda=6.588710936371029,
    reg_alpha=0.8772425195518072,
    random_state=42,
    use_label_encoder=False,
    eval_metric="logloss",
)


### Stacking Predictions Across Repetitions

This section implements a stacking ensemble technique where predictions from two base models, XGBoost and CatBoost, are combined to improve model performance. The stacking process is repeated across multiple iterations and cross-validation folds to ensure robustness and reliability in the predictions.

#### Initializing Stacking Arrays

Before the models are trained, two placeholder arrays are initialized:

- **X_stack**: This array stores the out-of-fold predicted probabilities for the training data, averaged over the repetitions, with one column per base model (XGBoost first, CatBoost second).
- **test_stack**: This array stores the predicted probabilities for the test dataset, again for both base models.

These arrays are updated during the stacking process to collect predictions across repetitions and folds.

#### Repeated Stacking with Cross-Validation

The stacking process is repeated over several iterations (repetitions), and in each iteration, **Stratified K-Fold cross-validation** is used. This ensures that each fold maintains the distribution of the target variable across the training and validation sets.

- **Stratified K-Fold**: This cross-validation technique is applied to split the data into multiple training and validation sets, ensuring that each fold represents the overall distribution of the target variable.
- During each fold, both base models (XGBoost and CatBoost) are trained on the training portion of the split. Their predicted probabilities for the validation portion are added to `X_stack`. Because every sample falls into the validation fold exactly once per repetition, each sample ends up with the average of its out-of-fold predictions across the repetitions.

The predictions for the test dataset are averaged across all repetitions and folds and stored in `test_stack`; these averaged base-model probabilities are the inputs from which the meta-model later produces the final prediction.

By using multiple repetitions and folds, the stacking process helps mitigate overfitting and improves the generalization of the model, resulting in more reliable predictions.


In [6]:
# Placeholder for stacking predictions across repetitions
X_stack = np.zeros((X.shape[0], 2))  # 2 base models: XGBoost and CatBoost
test_stack = np.zeros((test_df.shape[0], 2))

# Loop-invariant inputs, built once instead of on every fold:
# the names of the categorical columns and the CatBoost pool of the test data.
cat_feature_names = list(obj_feats)
test_pool = Pool(test_df, cat_features=cat_feature_names)

# Weights that turn the running sums into averages.
# Each training row is in the validation fold once per repetition, so its
# out-of-fold prediction is averaged over n_repeats; every fitted model
# predicts the whole test set, so those are averaged over all fits.
oof_weight = 1.0 / n_repeats
test_weight = 1.0 / (n_splits * n_repeats)

# Repeated stacking
for repeat in range(n_repeats):
    print(f"Repetition {repeat + 1}/{n_repeats}")
    # A different seed per repetition yields a different fold assignment each time
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42 + repeat)
    fold_iter = tqdm(kf.split(X, y), total=n_splits, desc=f"Repetition {repeat + 1} Progress")

    for train_idx, valid_idx in fold_iter:
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train = y.iloc[train_idx]

        # Train XGBoost (column 0 of the stacking arrays)
        xgb_model = XGBClassifier(**xgb_params)
        xgb_model.fit(X_train, y_train)
        X_stack[valid_idx, 0] += xgb_model.predict_proba(X_valid)[:, 1] * oof_weight
        test_stack[:, 0] += xgb_model.predict_proba(test_df)[:, 1] * test_weight

        # Train CatBoost (column 1 of the stacking arrays)
        cat_model = CatBoostClassifier(**cat_params)
        cat_model.fit(Pool(X_train, y_train, cat_features=cat_feature_names))
        valid_pool = Pool(X_valid, cat_features=cat_feature_names)
        X_stack[valid_idx, 1] += cat_model.predict_proba(valid_pool)[:, 1] * oof_weight
        test_stack[:, 1] += cat_model.predict_proba(test_pool)[:, 1] * test_weight


Repetition 1/5


Repetition 1 Progress: 100%|██████████| 10/10 [13:46<00:00, 82.63s/it]


Repetition 2/5


Repetition 2 Progress: 100%|██████████| 10/10 [13:38<00:00, 81.84s/it]


Repetition 3/5


Repetition 3 Progress: 100%|██████████| 10/10 [13:30<00:00, 81.01s/it]


Repetition 4/5


Repetition 4 Progress: 100%|██████████| 10/10 [13:30<00:00, 81.01s/it]


Repetition 5/5


Repetition 5 Progress: 100%|██████████| 10/10 [13:31<00:00, 81.18s/it]


In [7]:
# Train optimized Gradient Boosting meta-model
best_params = {
    "n_estimators": 146,
    "learning_rate": 0.0298358509190979,
    "max_depth": 6,
    "subsample": 0.6920403072473079,
    "max_features": "sqrt",
}

# Probability cut-off used to turn predicted probabilities into 0/1 labels
decision_threshold = 0.5

# The meta-model learns from the stacked base-model probabilities
# (one column per base model) to predict the label.
best_meta_model = GradientBoostingClassifier(random_state=42, **best_params)
best_meta_model.fit(X_stack, y)

# Save predictions for the test set
test_meta_preds = best_meta_model.predict_proba(test_stack)[:, 1]
submission_path = "/kaggle/working/submission.csv"
submission = pd.DataFrame({"id": test_ids, "Depression": (test_meta_preds > decision_threshold).astype(int)})
submission.to_csv(submission_path, index=False)
print(f"Predictions saved to {submission_path}")

# Evaluate optimized meta-model
# The predictions are scored against the labels so the evaluation actually
# reports a number. NOTE: this is measured on the very rows the meta-model was
# fitted on, so it is an optimistic estimate, not a held-out score.
meta_preds = best_meta_model.predict_proba(X_stack)[:, 1]
meta_train_accuracy = np.mean((meta_preds > decision_threshold).astype(int) == y.to_numpy())
print(f"Meta-model in-sample accuracy: {meta_train_accuracy:.4f}")


Predictions saved to /kaggle/working/submission.csv
