In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
df = pd.read_csv('Student Depression Dataset.csv')  
df.head()

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,7-8 hours,Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,5-6 hours,Moderate,M.Tech,Yes,1.0,1.0,No,0


In [3]:
# Convert appropriate columns to 'category' types
df['Gender'] = df['Gender'].astype('category')
df['City'] = df['City'].astype('category')

# Rename the column for suicidal thoughts to a simpler name and convert it using a mapping
df.rename(columns={'Have you ever had suicidal thoughts ?': 'Suicidal_Thoughts'}, inplace=True)
df['Suicidal_Thoughts'] = df['Suicidal_Thoughts'].map({'Yes': 1, 'No': 0})

# Convert Yes/No in 'Family History of Mental Illness' to binary
df['Family History of Mental Illness'] = df['Family History of Mental Illness'].map({'Yes': 1, 'No': 0})

# ----- Clean "Sleep Duration" column -----
# Remove any double quotes and single quotes, plus extra whitespace.
df['Sleep Duration'] = df['Sleep Duration'].str.replace('"', '', regex=False)\
                                      .str.strip("'").str.strip()

# Debug: Print distinct cleaned values in Sleep Duration
print("\nDistinct values in 'Sleep Duration' after cleaning:")
print(df['Sleep Duration'].unique())

# Convert textual sleep duration descriptions to numeric values using a mapping.
sleep_mapping = {
    'Less than 5 hours': 4.0,
    '5-6 hours': 5.5,
    '7-8 hours': 7.5,
    'More than 8 hours': 9.0
}
df['Sleep_Duration_Num'] = df['Sleep Duration'].replace(sleep_mapping)

# For any entries that remain as "Others", convert them to NaN
df['Sleep_Duration_Num'] = df['Sleep_Duration_Num'].replace("Others", np.nan)

# Fill missing sleep duration values (from "Others" or other mapping issues) with the median.
median_sleep = df['Sleep_Duration_Num'].median()
df['Sleep_Duration_Num'] = df['Sleep_Duration_Num'].fillna(median_sleep)

# Explicitly convert Sleep_Duration_Num to float
df['Sleep_Duration_Num'] = df['Sleep_Duration_Num'].astype(float)

# Convert other numeric columns to proper numeric types.
numeric_cols = ['Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 
                'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours', 
                'Financial Stress', 'Depression']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Also ensure 'Family History of Mental Illness' is numeric.
df['Family History of Mental Illness'] = pd.to_numeric(df['Family History of Mental Illness'], errors='coerce')

df.head()


Distinct values in 'Sleep Duration' after cleaning:
['5-6 hours' 'Less than 5 hours' '7-8 hours' 'More than 8 hours' 'Others']


  df['Sleep_Duration_Num'] = df['Sleep_Duration_Num'].replace("Others", np.nan)


Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Suicidal_Thoughts,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,Sleep_Duration_Num
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,1,3.0,1.0,0,1,5.5
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,5-6 hours,Moderate,BSc,0,3.0,2.0,1,0,5.5
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,Less than 5 hours,Healthy,BA,0,9.0,1.0,1,0,4.0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,7-8 hours,Moderate,BCA,1,4.0,5.0,1,1,7.5
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,5-6 hours,Moderate,M.Tech,1,1.0,1.0,0,0,5.5


In [4]:
# Convert textual sleep duration descriptions to numeric values using a mapping
sleep_mapping = {
    'Less than 5 hours': 4.0,
    '5-6 hours': 5.5,
    '7-8 hours': 7.5,
    'More than 8 hours': 9.0,
    'Others': np.nan  # Map 'Others' to NaN initially
}

# Apply the mapping to create new column
df['Sleep Duration'] = df['Sleep Duration'].map(sleep_mapping)

# Calculate median excluding NaN values
median_sleep = df['Sleep Duration'].median()

# Fill NaN values (from "Others") with the median
df['Sleep Duration'] = df['Sleep Duration'].fillna(median_sleep)

# Convert to float type
df['Sleep Duration'] = df['Sleep Duration'].astype(float)

# Verify the results
print("\nUnique values in Sleep_Duration_Num after mapping:")
print(df['Sleep_Duration_Num'].unique())


Unique values in Sleep_Duration_Num after mapping:
[5.5 4.  7.5 9. ]


In [5]:
df

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Suicidal_Thoughts,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,Sleep_Duration_Num
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,5.5,Healthy,B.Pharm,1,3.0,1.0,0,1,5.5
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.90,5.0,0.0,5.5,Moderate,BSc,0,3.0,2.0,1,0,5.5
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,4.0,Healthy,BA,0,9.0,1.0,1,0,4.0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,7.5,Moderate,BCA,1,4.0,5.0,1,1,7.5
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,5.5,Moderate,M.Tech,1,1.0,1.0,0,0,5.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27896,140685,Female,27.0,Surat,Student,5.0,0.0,5.75,5.0,0.0,5.5,Unhealthy,Class 12,1,7.0,1.0,1,0,5.5
27897,140686,Male,27.0,Ludhiana,Student,2.0,0.0,9.40,3.0,0.0,4.0,Healthy,MSc,0,0.0,3.0,1,0,4.0
27898,140689,Male,31.0,Faridabad,Student,3.0,0.0,6.61,4.0,0.0,5.5,Unhealthy,MD,0,12.0,2.0,0,0,5.5
27899,140690,Female,18.0,Ludhiana,Student,5.0,0.0,6.88,2.0,0.0,4.0,Healthy,Class 12,1,10.0,5.0,0,1,4.0


In [6]:
df.drop(columns=['id'], inplace=True)
df.drop(columns=['City'], inplace=True)
df.drop(columns=['Sleep_Duration_Num'], inplace=True)

In [7]:
df

Unnamed: 0,Gender,Age,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Suicidal_Thoughts,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,Male,33.0,Student,5.0,0.0,8.97,2.0,0.0,5.5,Healthy,B.Pharm,1,3.0,1.0,0,1
1,Female,24.0,Student,2.0,0.0,5.90,5.0,0.0,5.5,Moderate,BSc,0,3.0,2.0,1,0
2,Male,31.0,Student,3.0,0.0,7.03,5.0,0.0,4.0,Healthy,BA,0,9.0,1.0,1,0
3,Female,28.0,Student,3.0,0.0,5.59,2.0,0.0,7.5,Moderate,BCA,1,4.0,5.0,1,1
4,Female,25.0,Student,4.0,0.0,8.13,3.0,0.0,5.5,Moderate,M.Tech,1,1.0,1.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27896,Female,27.0,Student,5.0,0.0,5.75,5.0,0.0,5.5,Unhealthy,Class 12,1,7.0,1.0,1,0
27897,Male,27.0,Student,2.0,0.0,9.40,3.0,0.0,4.0,Healthy,MSc,0,0.0,3.0,1,0
27898,Male,31.0,Student,3.0,0.0,6.61,4.0,0.0,5.5,Unhealthy,MD,0,12.0,2.0,0,0
27899,Female,18.0,Student,5.0,0.0,6.88,2.0,0.0,4.0,Healthy,Class 12,1,10.0,5.0,0,1


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 16 columns):
 #   Column                            Non-Null Count  Dtype   
---  ------                            --------------  -----   
 0   Gender                            27901 non-null  category
 1   Age                               27901 non-null  float64 
 2   Profession                        27901 non-null  object  
 3   Academic Pressure                 27901 non-null  float64 
 4   Work Pressure                     27901 non-null  float64 
 5   CGPA                              27901 non-null  float64 
 6   Study Satisfaction                27901 non-null  float64 
 7   Job Satisfaction                  27901 non-null  float64 
 8   Sleep Duration                    27901 non-null  float64 
 9   Dietary Habits                    27901 non-null  object  
 10  Degree                            27901 non-null  object  
 11  Suicidal_Thoughts                 27901 non-null  int6

In [9]:
# Create one-hot encoding for categorical columns
categorical_cols = ['Gender', 'Profession', 'Dietary Habits', 'Degree']

# Apply one-hot encoding using pandas get_dummies
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Verify the new encoded dataframe
print("\nShape after encoding:", df_encoded.shape)
print("\nNew columns created:")
new_cols = [col for col in df_encoded.columns if any(x in col for x in categorical_cols)]
print(new_cols)

# If you want to see the first few rows of the encoded dataframe
df_encoded.head()


Shape after encoding: (27901, 56)

New columns created:
['Gender_Male', 'Profession_Chef', 'Profession_Civil Engineer', 'Profession_Content Writer', 'Profession_Digital Marketer', 'Profession_Doctor', 'Profession_Educational Consultant', 'Profession_Entrepreneur', 'Profession_Lawyer', 'Profession_Manager', 'Profession_Pharmacist', 'Profession_Student', 'Profession_Teacher', 'Profession_UX/UI Designer', 'Dietary Habits_Moderate', 'Dietary Habits_Others', 'Dietary Habits_Unhealthy', 'Degree_B.Com', 'Degree_B.Ed', 'Degree_B.Pharm', 'Degree_B.Tech', 'Degree_BA', 'Degree_BBA', 'Degree_BCA', 'Degree_BE', 'Degree_BHM', 'Degree_BSc', 'Degree_Class 12', 'Degree_LLB', 'Degree_LLM', 'Degree_M.Com', 'Degree_M.Ed', 'Degree_M.Pharm', 'Degree_M.Tech', 'Degree_MA', 'Degree_MBA', 'Degree_MBBS', 'Degree_MCA', 'Degree_MD', 'Degree_ME', 'Degree_MHM', 'Degree_MSc', 'Degree_Others', 'Degree_PhD']


Unnamed: 0,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Suicidal_Thoughts,Work/Study Hours,Financial Stress,...,Degree_MA,Degree_MBA,Degree_MBBS,Degree_MCA,Degree_MD,Degree_ME,Degree_MHM,Degree_MSc,Degree_Others,Degree_PhD
0,33.0,5.0,0.0,8.97,2.0,0.0,5.5,1,3.0,1.0,...,False,False,False,False,False,False,False,False,False,False
1,24.0,2.0,0.0,5.9,5.0,0.0,5.5,0,3.0,2.0,...,False,False,False,False,False,False,False,False,False,False
2,31.0,3.0,0.0,7.03,5.0,0.0,4.0,0,9.0,1.0,...,False,False,False,False,False,False,False,False,False,False
3,28.0,3.0,0.0,5.59,2.0,0.0,7.5,1,4.0,5.0,...,False,False,False,False,False,False,False,False,False,False
4,25.0,4.0,0.0,8.13,3.0,0.0,5.5,1,1.0,1.0,...,False,False,False,False,False,False,False,False,False,False


In [27]:
df_encoded.head().T 

Unnamed: 0,0,1,2,3,4
Age,33.0,24.0,31.0,28.0,25.0
Academic Pressure,5.0,2.0,3.0,3.0,4.0
Work Pressure,0.0,0.0,0.0,0.0,0.0
CGPA,8.97,5.9,7.03,5.59,8.13
Study Satisfaction,2.0,5.0,5.0,2.0,3.0
Job Satisfaction,0.0,0.0,0.0,0.0,0.0
Sleep Duration,5.5,5.5,4.0,7.5,5.5
Suicidal_Thoughts,1,0,0,1,1
Work/Study Hours,3.0,3.0,9.0,4.0,1.0
Financial Stress,1.0,2.0,1.0,5.0,1.0


In [10]:
df_encoded.isnull().sum()

Age                                  0
Academic Pressure                    0
Work Pressure                        0
CGPA                                 0
Study Satisfaction                   0
Job Satisfaction                     0
Sleep Duration                       0
Suicidal_Thoughts                    0
Work/Study Hours                     0
Financial Stress                     3
Family History of Mental Illness     0
Depression                           0
Gender_Male                          0
Profession_Chef                      0
Profession_Civil Engineer            0
Profession_Content Writer            0
Profession_Digital Marketer          0
Profession_Doctor                    0
Profession_Educational Consultant    0
Profession_Entrepreneur              0
Profession_Lawyer                    0
Profession_Manager                   0
Profession_Pharmacist                0
Profession_Student                   0
Profession_Teacher                   0
Profession_UX/UI Designer

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 55 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                1 non-null      int64  
 1   Academic Pressure                  1 non-null      float64
 2   Work Pressure                      1 non-null      float64
 3   CGPA                               1 non-null      float64
 4   Study Satisfaction                 1 non-null      float64
 5   Job Satisfaction                   1 non-null      float64
 6   Sleep Duration                     1 non-null      float64
 7   Suicidal_Thoughts                  1 non-null      int64  
 8   Work/Study Hours                   1 non-null      float64
 9   Financial Stress                   1 non-null      float64
 10  Family History of Mental Illness   1 non-null      int64  
 11  Gender_Male                        1 non-null      int64  
 12

In [11]:
# Drop rows where Financial Stress is null
df_encoded = df_encoded.dropna(subset=['Financial Stress'])

# Verify the removal of null values
print("Number of null values in Financial Stress after dropping:", df_encoded['Financial Stress'].isnull().sum())
print("New shape of dataframe:", df_encoded.shape)

Number of null values in Financial Stress after dropping: 0
New shape of dataframe: (27898, 56)


In [12]:
# Import required libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from joblib import dump
import numpy as np
import os

# Separate features (X) and target variable (y)
X = df_encoded.drop('Depression', axis=1)
y = df_encoded['Depression']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = model.predict(X_test)

# Print model performance metrics
print("Model Performance Metrics:")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print(f"\nModel Accuracy: {model.score(X_test, y_test):.4f}")

# Print feature importance
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': abs(model.coef_[0])
})
print("\nTop 10 Most Important Features:")
print(feature_importance.sort_values('Importance', ascending=False).head(10))

# Create models directory if it doesn't exist
os.makedirs('models', exist_ok=True)

# Save the trained model
model_path = 'depression_model.joblib'
dump(model, model_path)

# Save feature names and their order
feature_data = {
    'feature_names': X.columns.tolist(),
    'target_name': 'Depression'
}
feature_path = 'feature_names.joblib'
dump(feature_data, feature_path)

print(f"\nModel saved to: {model_path}")
print(f"Feature information saved to: {feature_path}")

Model Performance Metrics:

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.77      0.81      2348
           1       0.85      0.89      0.87      3232

    accuracy                           0.84      5580
   macro avg       0.84      0.83      0.84      5580
weighted avg       0.84      0.84      0.84      5580


Confusion Matrix:
[[1818  530]
 [ 342 2890]]

Model Accuracy: 0.8437

Top 10 Most Important Features:
                     Feature  Importance
7          Suicidal_Thoughts    2.512583
22        Profession_Student    1.423366
27  Dietary Habits_Unhealthy    1.062622
1          Academic Pressure    0.846443
53             Degree_Others    0.666792
26     Dietary Habits_Others    0.555653
9           Financial Stress    0.548182
25   Dietary Habits_Moderate    0.491331
2              Work Pressure    0.391727
40                Degree_LLM    0.318814

Model saved to: depression_model.joblib
Feature information saved t

In [22]:
from joblib import load
import numpy as np
import pandas as pd
import os

# 1. Load the trained model and feature metadata
MODEL_PATH   = os.path.join('depression_model.joblib')
FEATURE_PATH = os.path.join('feature_names.joblib')

model        = load(MODEL_PATH)
feature_data = load(FEATURE_PATH)
feature_cols = feature_data['feature_names']

# Helper to build a single‐row DataFrame in the correct column order
def make_input_row(mapping: dict):
    # default all columns to 0
    row = {col: 0 for col in feature_cols}
    # overwrite with provided values
    for k, v in mapping.items():
        if k in row:
            row[k] = v
        else:
            print(f"Warning: '{k}' not in feature columns")
    return pd.DataFrame([row], columns=feature_cols)

# 2. Define a few sample cases
# Note: categorical one‑hot features must match your trained columns, e.g. 'Gender_Female', etc.
sample_1 = {
    'Age': 25,
    'Academic Pressure': 3.0,
    'Work Pressure': 1.0,
    'CGPA': 8.0,
    'Study Satisfaction': 4.0,
    'Job Satisfaction': 3.0,
    'Sleep Duration': 7.5,
    'Suicidal_Thoughts': 0,
    'Work/Study Hours': 5.0,
    'Financial Stress': 2.0,
    'Family History of Mental Illness': 0,
    'Gender_Female': 1,
    'Profession_Student': 1,
    'Dietary Habits_Healthy': 1,
    'Degree_BCA': 1
}


# 3. Build test DataFrames
df1 = make_input_row(sample_1)

# 4. Predict and display
for i, df in enumerate([df1], start=1):
    pred = model.predict(df)[0]
    proba = model.predict_proba(df)[0]
    print(f"--- Sample {i} ---")
    print("Input:")
    print(df.to_dict(orient='records')[0])
    print("Prediction:", pred)
    print("Probabilities:", proba)
    print()

--- Sample 1 ---
Input:
{'Age': 25, 'Academic Pressure': 3.0, 'Work Pressure': 1.0, 'CGPA': 8.0, 'Study Satisfaction': 4.0, 'Job Satisfaction': 3.0, 'Sleep Duration': 7.5, 'Suicidal_Thoughts': 0, 'Work/Study Hours': 5.0, 'Financial Stress': 2.0, 'Family History of Mental Illness': 0, 'Gender_Male': 0, 'Profession_Chef': 0, 'Profession_Civil Engineer': 0, 'Profession_Content Writer': 0, 'Profession_Digital Marketer': 0, 'Profession_Doctor': 0, 'Profession_Educational Consultant': 0, 'Profession_Entrepreneur': 0, 'Profession_Lawyer': 0, 'Profession_Manager': 0, 'Profession_Pharmacist': 0, 'Profession_Student': 1, 'Profession_Teacher': 0, 'Profession_UX/UI Designer': 0, 'Dietary Habits_Moderate': 0, 'Dietary Habits_Others': 0, 'Dietary Habits_Unhealthy': 0, 'Degree_B.Com': 0, 'Degree_B.Ed': 0, 'Degree_B.Pharm': 0, 'Degree_B.Tech': 0, 'Degree_BA': 0, 'Degree_BBA': 0, 'Degree_BCA': 1, 'Degree_BE': 0, 'Degree_BHM': 0, 'Degree_BSc': 0, 'Degree_Class 12': 0, 'Degree_LLB': 0, 'Degree_LLM': 0, '