In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the two CSV splits: the held-out file first, then the training file.
df_val = pd.read_csv('../X_test_encoded.csv')
df_train = pd.read_csv('../X_train_encoded.csv')

# Split each frame into its feature columns and the 'Depression' target column.
X_val, y_val = df_val.drop(columns='Depression'), df_val['Depression']
X, y = df_train.drop(columns='Depression'), df_train['Depression']

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Two-step pipeline: min-max scale the features, then fit a decision tree.
pipe = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('dt', DecisionTreeClassifier(random_state=42)),
])

# Search space; the 'dt__' prefix addresses parameters of the 'dt' pipeline step.
param_grid = dict(
    dt__max_depth=[None, 10, 20, 30, 40, 50],
    dt__min_samples_split=[2, 5, 10, 15],
    dt__min_samples_leaf=[1, 2, 4, 6, 8, 10, 15],
    dt__criterion=['gini', 'entropy'],
    dt__splitter=['best', 'random'],
)

# Exhaustive search over the grid with 5-fold cross-validation, scored on accuracy.
grid_search = GridSearchCV(
    pipe,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X, y)

# Best pipeline found by the search.
best_model = grid_search.best_estimator_

# Report the winning parameter combination and its cross-validated accuracy.
for param, value in grid_search.best_params_.items():
    print(f"  {param}: {value}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_}")

# Predictions for the training data and for the held-out validation data.
y_train_pred, y_val_pred = best_model.predict(X), best_model.predict(X_val)

# Per-class precision / recall / F1 on the validation set, four decimals.
print("\nClassification Report (Validation Set):")
print(classification_report(y_true=y_val, y_pred=y_val_pred, digits=4))

# Pull the fitted tree out of the pipeline to read its feature importances.
best_dt = best_model.named_steps['dt']
feature_importances = best_dt.feature_importances_
features = X.columns

# One row per feature, ordered from most to least important.
importances = (
    pd.DataFrame({'feature': features, 'importance': feature_importances})
    .sort_values(by='importance', ascending=False)
)

# Show the 15 highest-ranked features.
print("\nTop 15 Most Important Features:")
print(importances.head(15))

Fitting 5 folds for each of 672 candidates, totalling 3360 fits
  dt__criterion: gini
  dt__max_depth: 10
  dt__min_samples_leaf: 15
  dt__min_samples_split: 2
  dt__splitter: best
Best Cross-Validation Accuracy: 0.8298683239410943

Classification Report (Validation Set):
              precision    recall  f1-score   support

           0     0.8106    0.7531    0.7808      2313
           1     0.8336    0.8754    0.8540      3267

    accuracy                         0.8247      5580
   macro avg     0.8221    0.8143    0.8174      5580
weighted avg     0.8241    0.8247    0.8237      5580


Top 15 Most Important Features:
                                      feature  importance
40  Have you ever had suicidal thoughts ?_Yes    0.502171
1                           Academic Pressure    0.212327
8                            Financial Stress    0.104541
0                                         Age    0.057819
7                            Work/Study Hours    0.035990
3                  

In [4]:
import pandas as pd
import os
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# After getting predictions and running classification_report
def save_metrics_to_csv(model_name, y_true, y_pred, filepath='../model_metrics.csv'):
    """
    Compute classification metrics and upsert them into a CSV file, one row per model.

    Accuracy, per-class (labels 0 and 1) precision/recall/F1 and macro-averaged
    precision/recall/F1 are computed from ``y_true`` and ``y_pred``. If
    ``filepath`` already exists, the row whose ``model`` column equals
    ``model_name`` is overwritten in place; if no such row exists a new row is
    appended. If the file does not exist it is created.

    Parameters:
    -----------
    model_name : str
        Name of the model; used as the row key in the ``model`` column
    y_true : array-like
        True labels
    y_pred : array-like
        Predicted labels
    filepath : str
        Path to CSV file

    Returns:
    --------
    pandas.DataFrame
        The full metrics table as written to ``filepath``.
    """
    # Build the row in the column order used by the CSV file.
    metrics_dict = {
        'model': model_name,
        'accuracy': accuracy_score(y_true, y_pred),
    }

    # Per-class metrics: treat each label in turn as the positive class.
    for label in (0, 1):
        metrics_dict[f'precision_class{label}'] = precision_score(y_true, y_pred, pos_label=label)
        metrics_dict[f'recall_class{label}'] = recall_score(y_true, y_pred, pos_label=label)
        metrics_dict[f'f1_class{label}'] = f1_score(y_true, y_pred, pos_label=label)

    # Macro averages (unweighted mean over the classes).
    metrics_dict['precision_avg'] = precision_score(y_true, y_pred, average='macro')
    metrics_dict['recall_avg'] = recall_score(y_true, y_pred, average='macro')
    metrics_dict['f1_avg'] = f1_score(y_true, y_pred, average='macro')

    if os.path.exists(filepath):
        metrics_df = pd.read_csv(filepath)
        is_same_model = metrics_df['model'] == model_name

        if is_same_model.any():
            # Assign column by column. Assigning a whole Series to
            # ``.loc[row_mask]`` is aligned against the row labels rather than
            # the column names, which blanks the matched row to NaN instead of
            # storing the new metrics.
            for column, value in metrics_dict.items():
                metrics_df.loc[is_same_model, column] = value
        else:
            # Model not recorded yet: append it as a new row.
            metrics_df = pd.concat([metrics_df, pd.DataFrame([metrics_dict])], ignore_index=True)
    else:
        # No file yet: start a new table with this single row.
        metrics_df = pd.DataFrame([metrics_dict])

    # Save to CSV
    metrics_df.to_csv(filepath, index=False)
    print(f"Metrics saved to {filepath}")

    return metrics_df

# Store the validation-set metrics of the tuned decision tree under the
# "decision_tree_lib" row; as the cell's last expression, the returned table is displayed.
save_metrics_to_csv(model_name="decision_tree_lib", y_true=y_val, y_pred=y_val_pred)

Metrics saved to ../model_metrics.csv


Unnamed: 0,model,accuracy,precision_class0,recall_class0,f1_class0,precision_class1,recall_class1,f1_class1,precision_avg,recall_avg,f1_avg
0,SVM_Linear,0.846953,0.833562,0.788154,0.810222,0.855585,0.888583,0.871772,0.844573,0.838368,0.840997
1,logictics_Regression_lib,0.846057,0.830455,0.789883,0.809661,0.856213,0.885828,0.870769,0.843334,0.837856,0.840215
2,decision_tree_lib,0.824731,0.81061,0.753134,0.780816,0.833576,0.875421,0.853986,0.822093,0.814278,0.817401
