In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score, confusion_matrix


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input/playground-series-s4e11'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/playground-series-s4e11/sample_submission.csv
/kaggle/input/playground-series-s4e11/train.csv
/kaggle/input/playground-series-s4e11/test.csv


In [2]:


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


# Load the datasets
train_df = pd.read_csv('/kaggle/input/playground-series-s4e11/train.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s4e11/test.csv')


# Columns with numerical data
numeric_cols = ['id', 'Age', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours', 'Financial Stress']

# Columns with categorical data
categorical_cols = ['Name', 'Gender', 'City', 'Working Professional or Student', 'Profession', 'Academic Pressure', 
                    'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']


In [3]:

# Handle missing values
imputer = SimpleImputer(strategy='most_frequent')

# Impute numerical columns in training data
train_df[numeric_cols] = imputer.fit_transform(train_df[numeric_cols])
# Impute numerical columns in test data
test_df[numeric_cols] = imputer.transform(test_df[numeric_cols])



In [4]:
# Impute categorical columns in training data
train_df[categorical_cols] = imputer.fit_transform(train_df[categorical_cols])
# Impute categorical columns in test data
test_df[categorical_cols] = imputer.transform(test_df[categorical_cols])

# Concatenate train and test data to fit OneHotEncoder
combined_categorical = pd.concat([train_df[categorical_cols], test_df[categorical_cols]], axis=0)

# Initialize OneHotEncoder
encoder = OneHotEncoder(drop='first', handle_unknown='ignore', sparse=False)



In [5]:
# Fit the encoder on combined categorical data
encoder.fit(combined_categorical)

# Transform the categorical data in training and test sets
train_encoded = encoder.transform(train_df[categorical_cols])
test_encoded = encoder.transform(test_df[categorical_cols])

# Create DataFrame from encoded features
train_encoded_df = pd.DataFrame(train_encoded, columns=encoder.get_feature_names_out(categorical_cols))
test_encoded_df = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out(categorical_cols))

# Concatenate encoded features with numerical features for training data
train_final_df = pd.concat([train_df[numeric_cols], train_encoded_df], axis=1)
# Concatenate encoded features with numerical features for test data
test_final_df = pd.concat([test_df[numeric_cols], test_encoded_df], axis=1)





In [6]:
# Separate features and target variable in training data
X = train_final_df
y = train_df['Depression']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBClassifier
xgb_classifier = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=42)

# Train the model
xgb_classifier.fit(X_train, y_train)



In [7]:
# Make predictions on validation data
val_predictions = xgb_classifier.predict(X_val)
accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", accuracy)

# Calculate Precision
precision = precision_score(y_val, val_predictions)
print("Precision:", precision)

# Calculate Recall
recall = recall_score(y_val, val_predictions)
print("Recall:", recall)

# Calculate F1 Score
f1 = f1_score(y_val, val_predictions)
print("F1 Score:", f1)

# Confusion Matrix (optional)
conf_matrix = confusion_matrix(y_val, val_predictions)
print("Confusion Matrix:\n", conf_matrix)

# Make predictions on test data
test_predictions = xgb_classifier.predict(test_final_df)

# Prepare the output DataFrame
output_df = test_df[['id']].copy()
output_df['Depression'] = test_predictions



Validation Accuracy: 0.9362117981520967
Precision: 0.8369107321965897
Recall: 0.8094683740783857
F1 Score: 0.8229608442647203
Confusion Matrix:
 [[22173   813]
 [  982  4172]]


In [8]:
# Save the output to a CSV file
output_df.to_csv('submission.csv', index=False)