In [199]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler



In [177]:
df = pd.read_csv('Student Depression Dataset.csv')
df.head()

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,7-8 hours,Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,5-6 hours,Moderate,M.Tech,Yes,1.0,1.0,No,0


In [178]:
df.info()
df.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     27901 non-null  int64  
 1   Gender                                 27901 non-null  object 
 2   Age                                    27901 non-null  float64
 3   City                                   27901 non-null  object 
 4   Profession                             27901 non-null  object 
 5   Academic Pressure                      27901 non-null  float64
 6   Work Pressure                          27901 non-null  float64
 7   CGPA                                   27901 non-null  float64
 8   Study Satisfaction                     27901 non-null  float64
 9   Job Satisfaction                       27901 non-null  float64
 10  Sleep Duration                         27901 non-null  object 
 11  Di

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
count,27901.0,27901,27901.0,27901,27901,27901.0,27901.0,27901.0,27901.0,27901.0,27901,27901,27901,27901,27901.0,27898.0,27901,27901.0
unique,,2,,52,14,,,,,,5,4,28,2,,,2,
top,,Male,,Kalyan,Student,,,,,,Less than 5 hours,Unhealthy,Class 12,Yes,,,No,
freq,,15547,,1570,27870,,,,,,8310,10317,6080,17656,,,14398,
mean,70442.149421,,25.8223,,,3.141214,0.00043,7.656104,2.943837,0.000681,,,,,7.156984,3.139867,,0.585499
std,40641.175216,,4.905687,,,1.381465,0.043992,1.470707,1.361148,0.044394,,,,,3.707642,1.437347,,0.492645
min,2.0,,18.0,,,0.0,0.0,0.0,0.0,0.0,,,,,0.0,1.0,,0.0
25%,35039.0,,21.0,,,2.0,0.0,6.29,2.0,0.0,,,,,4.0,2.0,,0.0
50%,70684.0,,25.0,,,3.0,0.0,7.77,3.0,0.0,,,,,8.0,3.0,,1.0
75%,105818.0,,30.0,,,4.0,0.0,8.92,4.0,0.0,,,,,10.0,4.0,,1.0


In [179]:
# Check for duplicates
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

Number of duplicate rows: 0


In [180]:
# Kiểm tra các cột có giá trị null
print(df.isnull().sum())

id                                       0
Gender                                   0
Age                                      0
City                                     0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         3
Family History of Mental Illness         0
Depression                               0
dtype: int64


In [181]:
# Xử lý missing values nếu có
df = df.dropna()  # hoặc dùng fillna nếu cần


In [182]:
print(df.isnull().sum())

id                                       0
Gender                                   0
Age                                      0
City                                     0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         0
Family History of Mental Illness         0
Depression                               0
dtype: int64


In [184]:
# Mapping thời lượng ngủ thành số
sleep_map = {
    'Less than 5 hours': 4,
    '5-6 hours': 5.5,
    '7-8 hours': 7.5,
    'More than 8 hours': 9
}
df['Sleep Duration'] = df['Sleep Duration'].map(sleep_map)

# Kiểm tra và xử lý NaN
if df['Sleep Duration'].isnull().sum() > 0:
    print("Found NaN in 'Sleep Duration' after mapping.")
    df['Sleep Duration'].fillna(df['Sleep Duration'].mean(), inplace=True)  # Hoặc dùng phương pháp khác

Found NaN in 'Sleep Duration' after mapping.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Sleep Duration'].fillna(df['Sleep Duration'].mean(), inplace=True)  # Hoặc dùng phương pháp khác


In [185]:
df = df.drop(columns=['City', 'Profession'])


In [186]:
categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
print("Categorical Columns:", categorical_columns)

Categorical Columns: ['Gender', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']


In [187]:
# Bước 1: Chia dữ liệu thành train (70%) và tạm thời (30%)
train_set, temp_set = train_test_split(df, test_size=0.3, random_state=42, stratify=df['Depression'])

# Bước 2: Chia phần tạm thời (30%) thành validation (15%) và test (15%)
valid_set, test_set = train_test_split(temp_set, test_size=0.5, random_state=42, stratify=temp_set['Depression'])

# Bước 3: Lưu ra file CSV
train_set.to_csv('train_set.csv', index=False)
valid_set.to_csv('valid_set.csv', index=False)
test_set.to_csv('test_set.csv', index=False)

print(f"Train set: {train_set.shape}")
print(f"Validation set: {valid_set.shape}")
print(f"Test set: {test_set.shape}")


Train set: (19528, 16)
Validation set: (4185, 16)
Test set: (4185, 16)


In [188]:
train_df = pd.read_csv('train_set.csv')
valid_df = pd.read_csv('valid_set.csv')
test_df = pd.read_csv('test_set.csv')


In [189]:
X_train = train_df.drop(columns=['Depression'])
y_train = train_df['Depression']

X_valid = valid_df.drop(columns=['Depression'])
y_valid = valid_df['Depression']

X_test = test_df.drop(columns=['Depression'])
y_test = test_df['Depression']


In [190]:
print(train_df.isnull().sum())
print(valid_df.isnull().sum())
print(test_df.isnull().sum())


id                                       0
Gender                                   0
Age                                      0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         0
Family History of Mental Illness         0
Depression                               0
dtype: int64
id                                       0
Gender                                   0
Age                                      0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       

In [191]:
# 2. One-hot encode cho các cột dạng object
X_train_encoded = pd.get_dummies(X_train, drop_first=True)
X_valid_encoded = pd.get_dummies(X_valid, drop_first=True)
X_test_encoded = pd.get_dummies(X_test, drop_first=True)

In [192]:
yes_no_columns = ['Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']
# One-hot encode các cột yes_no_columns
df_encoded = pd.get_dummies(df, columns=yes_no_columns, drop_first=True)

In [193]:
# 3. Căn chỉnh cột giữa các tập
X_valid_encoded = X_valid_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)


In [200]:
# Các cột số cần chuẩn hóa
columns_to_scale = ['Age', 'Academic Pressure', 'Work Pressure', 'CGPA',
                    'Study Satisfaction', 'Job Satisfaction',
                    'Work/Study Hours', 'Financial Stress', 'Sleep Duration']

# Khởi tạo và fit scaler với tập train
scaler = MinMaxScaler()
X_train_encoded[columns_to_scale] = scaler.fit_transform(X_train_encoded[columns_to_scale])
X_valid_encoded[columns_to_scale] = scaler.transform(X_valid_encoded[columns_to_scale])
X_test_encoded[columns_to_scale] = scaler.transform(X_test_encoded[columns_to_scale])


In [201]:
print("X_train_encoded:")
display(X_train_encoded.head())

print("\nX_valid_encoded:")
display(X_valid_encoded.head())

print("\nX_test_encoded:")
display(X_test_encoded.head())


X_train_encoded:


Unnamed: 0,id,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Work/Study Hours,Financial Stress,...,Degree_MBBS,Degree_MCA,Degree_MD,Degree_ME,Degree_MHM,Degree_MSc,Degree_Others,Degree_PhD,Have you ever had suicidal thoughts ?_Yes,Family History of Mental Illness_Yes
0,113127,0.243902,0.8,0.0,0.828,0.8,0.0,0.7,0.833333,0.0,...,False,False,False,False,False,False,False,False,True,True
1,22943,0.02439,1.0,0.0,0.709,1.0,0.0,0.3,0.916667,1.0,...,False,False,False,False,False,False,False,False,True,True
2,88721,0.097561,0.6,0.0,0.665,0.2,0.0,1.0,0.166667,0.75,...,False,True,False,False,False,False,False,False,True,False
3,46602,1.0,0.2,0.0,0.814,0.2,0.0,0.3,0.833333,0.75,...,False,False,False,False,False,False,False,True,True,True
4,94657,0.365854,0.6,0.0,0.647,0.2,0.0,0.0,0.833333,0.0,...,False,False,False,False,False,False,False,False,False,True



X_valid_encoded:


Unnamed: 0,id,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Work/Study Hours,Financial Stress,...,Degree_MBBS,Degree_MCA,Degree_MD,Degree_ME,Degree_MHM,Degree_MSc,Degree_Others,Degree_PhD,Have you ever had suicidal thoughts ?_Yes,Family History of Mental Illness_Yes
0,130472,0.170732,1.0,0.0,0.587,0.8,0.0,0.0,0.75,0.25,...,False,True,False,False,False,False,False,False,True,False
1,45743,0.097561,0.6,0.0,0.725,1.0,0.0,0.7,0.25,0.25,...,False,False,False,False,False,False,False,False,True,False
2,15264,0.146341,0.4,0.0,0.954,0.4,0.0,0.3,0.833333,0.5,...,False,False,False,False,False,False,False,False,False,False
3,115996,0.390244,0.8,0.0,0.804,0.4,0.0,0.7,1.0,0.75,...,False,False,False,False,False,False,False,False,True,True
4,123354,0.317073,1.0,0.0,0.921,0.8,0.0,0.0,0.5,0.5,...,False,False,False,False,False,False,False,False,True,True



X_test_encoded:


Unnamed: 0,id,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Work/Study Hours,Financial Stress,...,Degree_MBBS,Degree_MCA,Degree_MD,Degree_ME,Degree_MHM,Degree_MSc,Degree_Others,Degree_PhD,Have you ever had suicidal thoughts ?_Yes,Family History of Mental Illness_Yes
0,59771,0.146341,0.8,0.0,0.993,0.4,0.0,0.7,0.5,1.0,...,False,False,False,False,False,False,False,False,True,False
1,64684,0.341463,0.4,0.0,0.791,0.8,0.0,1.0,1.0,0.25,...,False,False,False,False,False,False,False,False,True,True
2,48614,0.0,0.8,0.0,0.96,1.0,0.0,0.0,0.5,0.0,...,False,False,False,False,False,False,False,False,False,False
3,75026,0.121951,0.6,0.0,0.556,0.8,0.0,0.3,0.833333,0.75,...,False,False,False,False,False,False,False,False,True,True
4,113495,0.097561,0.6,0.0,0.895,0.2,0.0,0.0,0.833333,0.75,...,False,False,False,False,False,False,False,False,True,False


In [202]:
X_train_encoded.info()
X_valid_encoded.info()
X_test_encoded.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19528 entries, 0 to 19527
Data columns (total 43 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   id                                         19528 non-null  int64  
 1   Age                                        19528 non-null  float64
 2   Academic Pressure                          19528 non-null  float64
 3   Work Pressure                              19528 non-null  float64
 4   CGPA                                       19528 non-null  float64
 5   Study Satisfaction                         19528 non-null  float64
 6   Job Satisfaction                           19528 non-null  float64
 7   Sleep Duration                             19528 non-null  float64
 8   Work/Study Hours                           19528 non-null  float64
 9   Financial Stress                           19528 non-null  float64
 10  Gender_Male           

In [203]:
# Chuyển đổi các cột categorical thành số
for col in categorical_columns:
    if col in X_train_encoded.columns:
        X_train_encoded[col] = X_train_encoded[col].astype('category').cat.codes
    if col in X_valid_encoded.columns:
        X_valid_encoded[col] = X_valid_encoded[col].astype('category').cat.codes
    if col in X_test_encoded.columns:
        X_test_encoded[col] = X_test_encoded[col].astype('category').cat.codes
# Chuẩn hóa dữ liệu
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_valid_scaled = scaler.transform(X_valid_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

In [204]:
print("Data after one-hot encoding (Training Set):")
display(X_train_encoded.head())

print("\nData after one-hot encoding (Validation Set):")
display(X_valid_encoded.head())

print("\nData after one-hot encoding (Test Set):")
display(X_test_encoded.head())

Data after one-hot encoding (Training Set):


Unnamed: 0,id,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Work/Study Hours,Financial Stress,...,Degree_MBBS,Degree_MCA,Degree_MD,Degree_ME,Degree_MHM,Degree_MSc,Degree_Others,Degree_PhD,Have you ever had suicidal thoughts ?_Yes,Family History of Mental Illness_Yes
0,113127,0.243902,0.8,0.0,0.828,0.8,0.0,0.7,0.833333,0.0,...,False,False,False,False,False,False,False,False,True,True
1,22943,0.02439,1.0,0.0,0.709,1.0,0.0,0.3,0.916667,1.0,...,False,False,False,False,False,False,False,False,True,True
2,88721,0.097561,0.6,0.0,0.665,0.2,0.0,1.0,0.166667,0.75,...,False,True,False,False,False,False,False,False,True,False
3,46602,1.0,0.2,0.0,0.814,0.2,0.0,0.3,0.833333,0.75,...,False,False,False,False,False,False,False,True,True,True
4,94657,0.365854,0.6,0.0,0.647,0.2,0.0,0.0,0.833333,0.0,...,False,False,False,False,False,False,False,False,False,True



Data after one-hot encoding (Validation Set):


Unnamed: 0,id,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Work/Study Hours,Financial Stress,...,Degree_MBBS,Degree_MCA,Degree_MD,Degree_ME,Degree_MHM,Degree_MSc,Degree_Others,Degree_PhD,Have you ever had suicidal thoughts ?_Yes,Family History of Mental Illness_Yes
0,130472,0.170732,1.0,0.0,0.587,0.8,0.0,0.0,0.75,0.25,...,False,True,False,False,False,False,False,False,True,False
1,45743,0.097561,0.6,0.0,0.725,1.0,0.0,0.7,0.25,0.25,...,False,False,False,False,False,False,False,False,True,False
2,15264,0.146341,0.4,0.0,0.954,0.4,0.0,0.3,0.833333,0.5,...,False,False,False,False,False,False,False,False,False,False
3,115996,0.390244,0.8,0.0,0.804,0.4,0.0,0.7,1.0,0.75,...,False,False,False,False,False,False,False,False,True,True
4,123354,0.317073,1.0,0.0,0.921,0.8,0.0,0.0,0.5,0.5,...,False,False,False,False,False,False,False,False,True,True



Data after one-hot encoding (Test Set):


Unnamed: 0,id,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Work/Study Hours,Financial Stress,...,Degree_MBBS,Degree_MCA,Degree_MD,Degree_ME,Degree_MHM,Degree_MSc,Degree_Others,Degree_PhD,Have you ever had suicidal thoughts ?_Yes,Family History of Mental Illness_Yes
0,59771,0.146341,0.8,0.0,0.993,0.4,0.0,0.7,0.5,1.0,...,False,False,False,False,False,False,False,False,True,False
1,64684,0.341463,0.4,0.0,0.791,0.8,0.0,1.0,1.0,0.25,...,False,False,False,False,False,False,False,False,True,True
2,48614,0.0,0.8,0.0,0.96,1.0,0.0,0.0,0.5,0.0,...,False,False,False,False,False,False,False,False,False,False
3,75026,0.121951,0.6,0.0,0.556,0.8,0.0,0.3,0.833333,0.75,...,False,False,False,False,False,False,False,False,True,True
4,113495,0.097561,0.6,0.0,0.895,0.2,0.0,0.0,0.833333,0.75,...,False,False,False,False,False,False,False,False,True,False


In [205]:
# Define and train the logistic regression model
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_scaled, y_train)

# Predict on training, validation, and test sets
y_train_pred = log_reg.predict(X_train_scaled)
y_valid_pred = log_reg.predict(X_valid_scaled)
y_test_pred = log_reg.predict(X_test_scaled)

# Generate classification reports
print("Classification Report (Training Set):")
print(classification_report(y_train, y_train_pred))

print("\nClassification Report (Validation Set):")
print(classification_report(y_valid, y_valid_pred))

print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred))

Classification Report (Training Set):
              precision    recall  f1-score   support

           0       0.83      0.79      0.81      8094
           1       0.86      0.89      0.87     11434

    accuracy                           0.85     19528
   macro avg       0.84      0.84      0.84     19528
weighted avg       0.85      0.85      0.85     19528


Classification Report (Validation Set):
              precision    recall  f1-score   support

           0       0.83      0.79      0.81      1734
           1       0.86      0.88      0.87      2451

    accuracy                           0.85      4185
   macro avg       0.84      0.84      0.84      4185
weighted avg       0.85      0.85      0.85      4185


Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.84      0.79      0.81      1735
           1       0.86      0.89      0.87      2450

    accuracy                           0.85      4185
   macro avg  

In [174]:

'''# Định nghĩa mô hình SVM
svm = SVC(random_state=42)

# Định nghĩa grid search với các giá trị kernel
param_grid = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}

# Sử dụng GridSearchCV để tìm kiếm tham số tốt nhất
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_train_scaled, y_train)

# In ra kết quả tốt nhất
print(f"Best kernel: {grid_search.best_params_['kernel']}")
print(f"Best accuracy: {grid_search.best_score_}")

# Dự đoán trên tập validation và test
best_svm = grid_search.best_estimator_
y_valid_pred_svm = best_svm.predict(X_valid_scaled)
y_test_pred_svm = best_svm.predict(X_test_scaled)

# Báo cáo kết quả
print("\nClassification Report (Validation Set):")
print(classification_report(y_valid, y_valid_pred_svm))

print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred_svm))'''

'# Định nghĩa mô hình SVM\nsvm = SVC(random_state=42)\n\n# Định nghĩa grid search với các giá trị kernel\nparam_grid = {\'kernel\': [\'linear\', \'poly\', \'rbf\', \'sigmoid\']}\n\n# Sử dụng GridSearchCV để tìm kiếm tham số tốt nhất\ngrid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, scoring=\'accuracy\', verbose=1)\ngrid_search.fit(X_train_scaled, y_train)\n\n# In ra kết quả tốt nhất\nprint(f"Best kernel: {grid_search.best_params_[\'kernel\']}")\nprint(f"Best accuracy: {grid_search.best_score_}")\n\n# Dự đoán trên tập validation và test\nbest_svm = grid_search.best_estimator_\ny_valid_pred_svm = best_svm.predict(X_valid_scaled)\ny_test_pred_svm = best_svm.predict(X_test_scaled)\n\n# Báo cáo kết quả\nprint("\nClassification Report (Validation Set):")\nprint(classification_report(y_valid, y_valid_pred_svm))\n\nprint("\nClassification Report (Test Set):")\nprint(classification_report(y_test, y_test_pred_svm))'

In [206]:
# Gắn lại nhãn 'Depression' vào các tập
train_encoded_df = X_train_encoded.copy()
train_encoded_df['Depression'] = y_train.values

valid_encoded_df = X_valid_encoded.copy()
valid_encoded_df['Depression'] = y_valid.values

test_encoded_df = X_test_encoded.copy()
test_encoded_df['Depression'] = y_test.values

# Ghi ra file CSV
train_encoded_df.to_csv("train_set_encoded.csv", index=False)
valid_encoded_df.to_csv("valid_set_encoded.csv", index=False)
test_encoded_df.to_csv("test_set_encoded.csv", index=False)

print("Đã lưu xong các tập dữ liệu đã được one-hot encoding.")


Đã lưu xong các tập dữ liệu đã được one-hot encoding.


In [213]:
# Reload the original dataset
df = pd.read_csv('Student Depression Dataset.csv')

df = df.dropna()  # Handle missing values

# Mapping thời lượng ngủ thành số
sleep_map = {
    'Less than 5 hours': 4,
    '5-6 hours': 5.5,
    '7-8 hours': 7.5,
    'More than 8 hours': 9
}
df['Sleep Duration'] = df['Sleep Duration'].map(sleep_map)

# Kiểm tra và xử lý NaN
if df['Sleep Duration'].isnull().sum() > 0:
    print("Found NaN in 'Sleep Duration' after mapping.")
    df['Sleep Duration'].fillna(df['Sleep Duration'].mean(), inplace=True)  # Hoặc dùng phương pháp khác

categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
print("Categorical Columns:", categorical_columns)

# Bước 1: Chia dữ liệu thành train (70%) và tạm thời (30%)
train_set, temp_set = train_test_split(df, test_size=0.3, random_state=42, stratify=df['Depression'])

# Bước 2: Chia phần tạm thời (30%) thành validation (15%) và test (15%)
valid_set, test_set = train_test_split(temp_set, test_size=0.5, random_state=42, stratify=temp_set['Depression'])

# Bước 3: Lưu ra file CSV
train_set.to_csv('full_train_set.csv', index=False)
valid_set.to_csv('full_valid_set.csv', index=False)
test_set.to_csv('full_test_set.csv', index=False)

print(f"Train set: {train_set.shape}")
print(f"Validation set: {valid_set.shape}")
print(f"Test set: {test_set.shape}")

train_df = pd.read_csv('full_train_set.csv')
valid_df = pd.read_csv('full_valid_set.csv')
test_df = pd.read_csv('full_test_set.csv')


X_train = train_df.drop(columns=['Depression'])
y_train = train_df['Depression']

X_valid = valid_df.drop(columns=['Depression'])
y_valid = valid_df['Depression']

X_test = test_df.drop(columns=['Depression'])
y_test = test_df['Depression']
# 2. One-hot encode cho các cột dạng object
X_train_encoded = pd.get_dummies(X_train, drop_first=True)
X_valid_encoded = pd.get_dummies(X_valid, drop_first=True)
X_test_encoded = pd.get_dummies(X_test, drop_first=True)

yes_no_columns = ['Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']
# One-hot encode các cột yes_no_columns
df_encoded = pd.get_dummies(df, columns=yes_no_columns, drop_first=True)

# 3. Căn chỉnh cột giữa các tập
X_valid_encoded = X_valid_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

# Các cột số cần chuẩn hóa
columns_to_scale = ['Age', 'Academic Pressure', 'Work Pressure', 'CGPA',
                    'Study Satisfaction', 'Job Satisfaction',
                    'Work/Study Hours', 'Financial Stress', 'Sleep Duration']

# Khởi tạo và fit scaler với tập train
scaler = MinMaxScaler()
X_train_encoded[columns_to_scale] = scaler.fit_transform(X_train_encoded[columns_to_scale])
X_valid_encoded[columns_to_scale] = scaler.transform(X_valid_encoded[columns_to_scale])
X_test_encoded[columns_to_scale] = scaler.transform(X_test_encoded[columns_to_scale])

# Chuyển đổi các cột categorical thành số
for col in categorical_columns:
    if col in X_train_encoded.columns:
        X_train_encoded[col] = X_train_encoded[col].astype('category').cat.codes
    if col in X_valid_encoded.columns:
        X_valid_encoded[col] = X_valid_encoded[col].astype('category').cat.codes
    if col in X_test_encoded.columns:
        X_test_encoded[col] = X_test_encoded[col].astype('category').cat.codes
# Chuẩn hóa dữ liệu
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_valid_scaled = scaler.transform(X_valid_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

# Define and train the logistic regression model
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_scaled, y_train)

# Predict on training, validation, and test sets
y_train_pred = log_reg.predict(X_train_scaled)
y_valid_pred = log_reg.predict(X_valid_scaled)
y_test_pred = log_reg.predict(X_test_scaled)

# Generate classification reports
print("Classification Report (Training Set):")
print(classification_report(y_train, y_train_pred))

print("\nClassification Report (Validation Set):")
print(classification_report(y_valid, y_valid_pred))

print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred))


Found NaN in 'Sleep Duration' after mapping.
Categorical Columns: ['Gender', 'City', 'Profession', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']
Train set: (19528, 18)
Validation set: (4185, 18)
Test set: (4185, 18)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Sleep Duration'].fillna(df['Sleep Duration'].mean(), inplace=True)  # Hoặc dùng phương pháp khác


Classification Report (Training Set):
              precision    recall  f1-score   support

           0       0.84      0.79      0.81      8094
           1       0.86      0.89      0.87     11434

    accuracy                           0.85     19528
   macro avg       0.85      0.84      0.84     19528
weighted avg       0.85      0.85      0.85     19528


Classification Report (Validation Set):
              precision    recall  f1-score   support

           0       0.90      0.66      0.76      1734
           1       0.80      0.95      0.87      2451

    accuracy                           0.83      4185
   macro avg       0.85      0.80      0.81      4185
weighted avg       0.84      0.83      0.82      4185


Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.83      0.79      0.81      1735
           1       0.86      0.89      0.87      2450

    accuracy                           0.85      4185
   macro avg  