In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the two CSV splits. Note that the hold-out frame comes from the
# *test-set* file even though the variables below are named `*_val`.
df_train = pd.read_csv('../train_set_encoded.csv')
df_val = pd.read_csv('../test_set_encoded.csv')

# Training split: 'Depression' is the target, every other column is a feature.
y = df_train['Depression']
X = df_train.drop(columns='Depression')

# Hold-out split, separated into target / features the same way.
y_val = df_val['Depression']
X_val = df_val.drop(columns='Depression')

In [3]:
# Hyper-parameter grid explored by GridSearchCV (5 x 2 x 2 = 20 candidates).
param_grid = {
    # Inverse of regularization strength: smaller C means stronger regularization.
    'C': [0.01, 0.1, 1, 10, 100],
    # Optimizers to try.
    'solver': ['liblinear', 'saga'],
    # Norm used for the regularization penalty.
    'penalty': ['l2', 'l1'],
}

# Base estimator; the seed is fixed so repeated runs use the same random state.
log_reg = LogisticRegression(max_iter=1000, random_state=42)

# 5-fold cross-validated search over the grid, scored by accuracy.
# `fit` returns the fitted search object itself, so it can be chained.
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
).fit(X, y)

# Best candidate found by the search (taken from `best_estimator_`).
best_log_reg = grid_search.best_estimator_

# Summary of the search outcome.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_}")

# Training-set performance of the selected model.
y_train_pred = best_log_reg.predict(X)
print("Classification Report (Training Set):")
print(classification_report(y, y_train_pred))

# Hold-out performance of the selected model (X_val / y_val).
y_valid_pred = best_log_reg.predict(X_val)
print("\nClassification Report (Validation Set):")
print(classification_report(y_val, y_valid_pred))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}
Best Cross-Validation Accuracy: 0.8468503774965326
Classification Report (Training Set):
              precision    recall  f1-score   support

           0       0.83      0.79      0.81      9215
           1       0.86      0.89      0.87     13103

    accuracy                           0.85     22318
   macro avg       0.84      0.84      0.84     22318
weighted avg       0.85      0.85      0.85     22318


Classification Report (Validation Set):
              precision    recall  f1-score   support

           0       0.84      0.78      0.81      2348
           1       0.85      0.90      0.87      3232

    accuracy                           0.85      5580
   macro avg       0.85      0.84      0.84      5580
weighted avg       0.85      0.85      0.84      5580

