In [348]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV


In [323]:
df = pd.read_csv('Student Depression Dataset.csv')
df.head()

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,7-8 hours,Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,5-6 hours,Moderate,M.Tech,Yes,1.0,1.0,No,0


In [324]:
df.info()
df.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     27901 non-null  int64  
 1   Gender                                 27901 non-null  object 
 2   Age                                    27901 non-null  float64
 3   City                                   27901 non-null  object 
 4   Profession                             27901 non-null  object 
 5   Academic Pressure                      27901 non-null  float64
 6   Work Pressure                          27901 non-null  float64
 7   CGPA                                   27901 non-null  float64
 8   Study Satisfaction                     27901 non-null  float64
 9   Job Satisfaction                       27901 non-null  float64
 10  Sleep Duration                         27901 non-null  object 
 11  Di

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
count,27901.0,27901,27901.0,27901,27901,27901.0,27901.0,27901.0,27901.0,27901.0,27901,27901,27901,27901,27901.0,27898.0,27901,27901.0
unique,,2,,52,14,,,,,,5,4,28,2,,,2,
top,,Male,,Kalyan,Student,,,,,,Less than 5 hours,Unhealthy,Class 12,Yes,,,No,
freq,,15547,,1570,27870,,,,,,8310,10317,6080,17656,,,14398,
mean,70442.149421,,25.8223,,,3.141214,0.00043,7.656104,2.943837,0.000681,,,,,7.156984,3.139867,,0.585499
std,40641.175216,,4.905687,,,1.381465,0.043992,1.470707,1.361148,0.044394,,,,,3.707642,1.437347,,0.492645
min,2.0,,18.0,,,0.0,0.0,0.0,0.0,0.0,,,,,0.0,1.0,,0.0
25%,35039.0,,21.0,,,2.0,0.0,6.29,2.0,0.0,,,,,4.0,2.0,,0.0
50%,70684.0,,25.0,,,3.0,0.0,7.77,3.0,0.0,,,,,8.0,3.0,,1.0
75%,105818.0,,30.0,,,4.0,0.0,8.92,4.0,0.0,,,,,10.0,4.0,,1.0


In [325]:
# Check for duplicates
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

Number of duplicate rows: 0


In [326]:
# Kiểm tra các cột có giá trị null
print(df.isnull().sum())

id                                       0
Gender                                   0
Age                                      0
City                                     0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         3
Family History of Mental Illness         0
Depression                               0
dtype: int64


In [327]:
# Xử lý missing values nếu có
df = df.dropna()  # hoặc dùng fillna nếu cần


In [328]:
print(df.isnull().sum())

id                                       0
Gender                                   0
Age                                      0
City                                     0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         0
Family History of Mental Illness         0
Depression                               0
dtype: int64


In [329]:
# Dùng LabelEncoder cho các cột dạng text
label_cols = ['Gender', 'City', 'Profession', 'Dietary Habits', 'Degree',
              'Have you ever had suicidal thoughts ?',
              'Family History of Mental Illness']

le = LabelEncoder()
for col in label_cols:
    df[col] = le.fit_transform(df[col])


In [330]:
# Mapping thời lượng ngủ thành số
sleep_map = {
    'Less than 5 hours': 4,
    '5-6 hours': 5.5,
    '7-8 hours': 7.5,
    'More than 8 hours': 9
}
df['Sleep Duration'] = df['Sleep Duration'].map(sleep_map)

# Kiểm tra và xử lý NaN
if df['Sleep Duration'].isnull().sum() > 0:
    print("Found NaN in 'Sleep Duration' after mapping.")
    df['Sleep Duration'].fillna(df['Sleep Duration'].mean(), inplace=True)  # Hoặc dùng phương pháp khác

Found NaN in 'Sleep Duration' after mapping.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Sleep Duration'].fillna(df['Sleep Duration'].mean(), inplace=True)  # Hoặc dùng phương pháp khác


In [331]:
X = df.drop(columns=['id', 'Depression', 'City', 'Profession'])  # loại bỏ City và Profession
y = df['Depression']


In [332]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [333]:
# Bước 1: Chia dữ liệu thành train (70%) và tạm thời (30%)
train_set, temp_set = train_test_split(df, test_size=0.3, random_state=42, stratify=df['Depression'])

# Bước 2: Chia phần tạm thời (30%) thành validation (15%) và test (15%)
valid_set, test_set = train_test_split(temp_set, test_size=0.5, random_state=42, stratify=temp_set['Depression'])

# Bước 3: Lưu ra file CSV
train_set.to_csv('train_set.csv', index=False)
valid_set.to_csv('valid_set.csv', index=False)
test_set.to_csv('test_set.csv', index=False)

print(f"Train set: {train_set.shape}")
print(f"Validation set: {valid_set.shape}")
print(f"Test set: {test_set.shape}")


Train set: (19528, 18)
Validation set: (4185, 18)
Test set: (4185, 18)


In [334]:
train_df = pd.read_csv('train_set.csv')
valid_df = pd.read_csv('valid_set.csv')
test_df = pd.read_csv('test_set.csv')


In [335]:
X_train = train_df.drop(columns=['Depression'])
y_train = train_df['Depression']

X_valid = valid_df.drop(columns=['Depression'])
y_valid = valid_df['Depression']

X_test = test_df.drop(columns=['Depression'])
y_test = test_df['Depression']


In [336]:
print(train_df.isnull().sum())
print(valid_df.isnull().sum())
print(test_df.isnull().sum())


id                                       0
Gender                                   0
Age                                      0
City                                     0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         0
Family History of Mental Illness         0
Depression                               0
dtype: int64
id                                       0
Gender                                   0
Age                                      0
City                                     0
Profession                               

In [337]:
# Danh sách các cột có dữ liệu Yes/No
yes_no_columns = [
    'Have you ever had suicidal thoughts ?',
    'Family History of Mental Illness'
]

# Chuyển đổi Yes/No thành 1/0
for col in yes_no_columns:
    df[col] = df[col].astype(str).str.strip().map({'Yes': 1, 'No': 0})



In [338]:
# 2. One-hot encode cho các cột dạng object
X_train_encoded = pd.get_dummies(X_train, drop_first=True)
X_valid_encoded = pd.get_dummies(X_valid, drop_first=True)
X_test_encoded = pd.get_dummies(X_test, drop_first=True)

In [339]:
# 3. Căn chỉnh cột giữa các tập
X_valid_encoded = X_valid_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)


In [340]:
print("X_train_encoded:")
display(X_train_encoded.head())

print("\nX_valid_encoded:")
display(X_valid_encoded.head())

print("\nX_test_encoded:")
display(X_test_encoded.head())


X_train_encoded:


Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness
0,113127,0,28.0,2,11,4.0,0.0,8.28,4.0,0.0,7.5,3,0,1,10.0,1.0,1
1,22943,0,19.0,32,11,5.0,0.0,7.09,5.0,0.0,5.5,1,11,1,11.0,5.0,1
2,88721,0,22.0,44,11,3.0,0.0,6.65,1.0,0.0,9.0,3,21,1,2.0,4.0,0
3,46602,1,59.0,37,11,1.0,0.0,8.14,1.0,0.0,5.5,3,27,1,10.0,4.0,1
4,94657,0,33.0,25,11,3.0,0.0,6.47,1.0,0.0,4.0,1,17,0,10.0,1.0,1



X_valid_encoded:


Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness
0,130472,0,25.0,39,11,5.0,0.0,5.87,4.0,0.0,4.0,3,21,1,9.0,2.0,0
1,45743,1,22.0,50,11,3.0,0.0,7.25,5.0,0.0,7.5,0,4,1,3.0,2.0,0
2,15264,1,24.0,14,11,2.0,0.0,9.54,2.0,0.0,5.5,3,1,0,10.0,3.0,0
3,115996,1,34.0,17,11,4.0,0.0,8.04,2.0,0.0,7.5,3,15,1,12.0,4.0,1
4,123354,1,31.0,44,11,5.0,0.0,9.21,4.0,0.0,4.0,3,13,1,6.0,3.0,1



X_test_encoded:


Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness
0,59771,0,24.0,33,11,4.0,0.0,9.93,2.0,0.0,7.5,0,11,1,6.0,5.0,0
1,64684,0,32.0,49,11,2.0,0.0,7.91,4.0,0.0,9.0,0,7,1,12.0,2.0,1
2,48614,0,18.0,50,11,4.0,0.0,9.6,5.0,0.0,4.0,1,11,0,6.0,1.0,0
3,75026,0,23.0,6,11,3.0,0.0,5.56,4.0,0.0,5.5,0,10,1,10.0,4.0,1
4,113495,0,22.0,37,11,3.0,0.0,8.95,1.0,0.0,4.0,0,7,1,10.0,4.0,0


In [341]:
X_train_encoded.info()
X_valid_encoded.info()
X_test_encoded.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19528 entries, 0 to 19527
Data columns (total 17 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     19528 non-null  int64  
 1   Gender                                 19528 non-null  int64  
 2   Age                                    19528 non-null  float64
 3   City                                   19528 non-null  int64  
 4   Profession                             19528 non-null  int64  
 5   Academic Pressure                      19528 non-null  float64
 6   Work Pressure                          19528 non-null  float64
 7   CGPA                                   19528 non-null  float64
 8   Study Satisfaction                     19528 non-null  float64
 9   Job Satisfaction                       19528 non-null  float64
 10  Sleep Duration                         19528 non-null  float64
 11  Di

In [342]:
train_df = pd.read_csv("train_set.csv")
valid_df = pd.read_csv("valid_set.csv")
test_df  = pd.read_csv("test_set.csv")

In [343]:
# Tách X và y
X_train = train_df.drop(columns=['Depression'])
y_train = train_df['Depression']

X_valid = valid_df.drop(columns=['Depression'])
y_valid = valid_df['Depression']

X_test = test_df.drop(columns=['Depression'])
y_test = test_df['Depression']


In [344]:
# One-hot encoding (nếu có cột phân loại)
X_train_encoded = pd.get_dummies(X_train, drop_first=True)
X_valid_encoded = pd.get_dummies(X_valid, drop_first=True)
X_test_encoded = pd.get_dummies(X_test, drop_first=True)

# Đảm bảo các tập có cùng cột sau one-hot
X_valid_encoded = X_valid_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

# Chuẩn hóa dữ liệu
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_valid_scaled = scaler.transform(X_valid_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

In [347]:
# Define and train the logistic regression model
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_scaled, y_train)

# Predict on training, validation, and test sets
y_train_pred = log_reg.predict(X_train_scaled)
y_valid_pred = log_reg.predict(X_valid_scaled)
y_test_pred = log_reg.predict(X_test_scaled)

# Generate classification reports
print("Classification Report (Training Set):")
print(classification_report(y_train, y_train_pred))

print("\nClassification Report (Validation Set):")
print(classification_report(y_valid, y_valid_pred))

print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred))



Classification Report (Training Set):
              precision    recall  f1-score   support

           0       0.83      0.79      0.81      8094
           1       0.86      0.89      0.87     11434

    accuracy                           0.85     19528
   macro avg       0.84      0.84      0.84     19528
weighted avg       0.85      0.85      0.85     19528


Classification Report (Validation Set):
              precision    recall  f1-score   support

           0       0.83      0.79      0.81      1734
           1       0.86      0.88      0.87      2451

    accuracy                           0.85      4185
   macro avg       0.84      0.84      0.84      4185
weighted avg       0.85      0.85      0.85      4185


Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.84      0.79      0.81      1735
           1       0.86      0.89      0.87      2450

    accuracy                           0.85      4185
   macro avg  

In [349]:

# Định nghĩa mô hình SVM
svm = SVC(random_state=42)

# Định nghĩa grid search với các giá trị kernel
param_grid = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}

# Sử dụng GridSearchCV để tìm kiếm tham số tốt nhất
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_train_scaled, y_train)

# In ra kết quả tốt nhất
print(f"Best kernel: {grid_search.best_params_['kernel']}")
print(f"Best accuracy: {grid_search.best_score_}")

# Dự đoán trên tập validation và test
best_svm = grid_search.best_estimator_
y_valid_pred_svm = best_svm.predict(X_valid_scaled)
y_test_pred_svm = best_svm.predict(X_test_scaled)

# Báo cáo kết quả
print("\nClassification Report (Validation Set):")
print(classification_report(y_valid, y_valid_pred_svm))

print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred_svm))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best kernel: linear
Best accuracy: 0.8450433326580532

Classification Report (Validation Set):
              precision    recall  f1-score   support

           0       0.83      0.79      0.81      1734
           1       0.86      0.89      0.87      2451

    accuracy                           0.85      4185
   macro avg       0.84      0.84      0.84      4185
weighted avg       0.85      0.85      0.85      4185


Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.84      0.79      0.81      1735
           1       0.86      0.89      0.87      2450

    accuracy                           0.85      4185
   macro avg       0.85      0.84      0.84      4185
weighted avg       0.85      0.85      0.85      4185

