In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Read both CSV splits. The frame named df_val is read from the file called
# 'test_set_encoded.csv'; it is the held-out data used for the later report.
df_train = pd.read_csv('../train_set_encoded.csv')
df_val = pd.read_csv('../test_set_encoded.csv')

# Separate the 'Depression' target column from the remaining feature columns.
X_val, y_val = df_val.drop(columns='Depression'), df_val['Depression']
X, y = df_train.drop(columns='Depression'), df_train['Depression']

In [8]:
# Hyperparameter search space for the decision tree:
# 6 * 4 * 7 * 2 * 2 = 672 candidate combinations.
param_grid = dict(
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10, 15],
    min_samples_leaf=[1, 2, 4, 6, 8, 10, 15],
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
)

# Base estimator; the fixed random_state keeps repeated runs comparable.
dt = DecisionTreeClassifier(random_state=42)

# Exhaustive search over param_grid with 5-fold cross-validation, ranked by
# accuracy. n_jobs=-1 runs the fits in parallel; verbose=1 logs the fit count.
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X, y)

# Winning model. With GridSearchCV's default refit=True this estimator has
# been refit on all of X, y using the best parameter combination.
best_dt = grid_search.best_estimator_

# Summary of the search result.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_}")

# Predictions on the data the model was fit on and on the held-out frame.
# X_val / y_val are defined in an earlier cell.
y_train_pred = best_dt.predict(X)
y_val_pred = best_dt.predict(X_val)

# Per-class precision / recall / F1 for each split. The training-set report
# scores data the model has already seen, so read it as an upper bound.
for report_title, truth, predicted in (
    ("\nClassification Report (Training Set):", y, y_train_pred),
    ("\nClassification Report (Validation Set):", y_val, y_val_pred),
):
    print(report_title)
    print(classification_report(truth, predicted))

# Pair every feature column with its importance in the fitted tree and rank
# the pairs from most to least important.
feature_importances = best_dt.feature_importances_
features = X.columns
importances = pd.DataFrame(
    {'feature': features, 'importance': feature_importances}
).sort_values(by='importance', ascending=False)

# Show only the leading rows of the ranking.
print("\nTop 15 Most Important Features:")
print(importances.head(15))

Fitting 5 folds for each of 672 candidates, totalling 3360 fits
Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 15, 'min_samples_split': 2, 'splitter': 'random'}
Best Cross-Validation Accuracy: 0.8294202852285256

Classification Report (Training Set):
              precision    recall  f1-score   support

           0       0.83      0.80      0.82      9215
           1       0.86      0.89      0.87     13103

    accuracy                           0.85     22318
   macro avg       0.85      0.84      0.84     22318
weighted avg       0.85      0.85      0.85     22318


Classification Report (Validation Set):
              precision    recall  f1-score   support

           0       0.82      0.75      0.78      2348
           1       0.83      0.88      0.85      3232

    accuracy                           0.82      5580
   macro avg       0.82      0.81      0.82      5580
weighted avg       0.82      0.82      0.82      5580


Top 15 Most Important