In [None]:
!pip install lightgbm scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-24.9.0-py3-none-any.whl.metadata (11 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-24.9.0-py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-24.9.0 scikit-optimize-0.10.2


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from skopt import BayesSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Read the raw mental-health survey CSV into a DataFrame.
dataset_path = '/content/mental_health_dataset.csv'
data = pd.read_csv(dataset_path)




In [None]:
# Quick sanity check: (number of rows, number of columns) of the loaded frame.
data.shape

(1000, 12)

In [None]:
# Count the missing values in every column.
data.isna().sum()

Unnamed: 0,0
User_ID,0
Age,0
Gender,0
Occupation,0
Country,0
Mental_Health_Condition,0
Severity,501
Consultation_History,0
Stress_Level,0
Sleep_Hours,0


In [None]:
# Fill missing values in every text (object-dtype) column with that column's
# most frequent value.
for col in data.select_dtypes(include='object').columns:
    modes = data[col].mode()
    # mode() is empty when the whole column is missing; nothing to fill with then.
    if modes.empty:
        continue
    # Assign the result back instead of calling fillna(inplace=True) on data[col]:
    # the in-place call operates on an intermediate object, raises a pandas
    # warning, and stops modifying `data` in pandas 3.0.
    data[col] = data[col].fillna(modes.iloc[0])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[i].fillna(data[i].mode()[0], inplace = True)


In [None]:
# Preprocessing: integer-encode the categorical feature columns.
# NOTE: this overwrites the columns of `data` in place, so run it once per fresh load.
categorical_columns = ['Gender', 'Occupation', 'Country', 'Severity', 'Consultation_History', 'Stress_Level']

# One encoder per column so each mapping stays available for inverse_transform;
# a single shared encoder only remembers the last column it was fitted on.
feature_encoders = {}
for col in categorical_columns:
    feature_encoders[col] = LabelEncoder()
    data[col] = feature_encoders[col].fit_transform(data[col].astype(str))

# Splitting data into features (X) and target (y)
# 'User_ID' is assumed to be irrelevant for prediction.
non_feature_columns = ['User_ID', 'Mental_Health_Condition']
# 'Unnamed: 0' is presumably a row index written out with the CSV (TODO confirm);
# left in X it would be fed to the model as if it were a real feature.
if 'Unnamed: 0' in data.columns:
    non_feature_columns.append('Unnamed: 0')
X = data.drop(columns=non_feature_columns)

# `encoder` is the target encoder: encoder.classes_ holds the original labels.
encoder = LabelEncoder()
y = encoder.fit_transform(data['Mental_Health_Condition'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Base LightGBM classifier; its hyperparameters are tuned by the search below.
lgb_model = lgb.LGBMClassifier()

# Search space for Bayesian optimisation: (low, high) bounds per hyperparameter,
# with the learning rate sampled on a log-uniform scale.
param_space = dict(
    num_leaves=(10, 50),
    max_depth=(3, 12),
    learning_rate=(0.01, 0.3, 'log-uniform'),
    n_estimators=(50, 300),
    min_child_samples=(5, 30),
)

In [None]:
# Bayesian hyperparameter search: 32 candidate settings, 3-fold cross-validation,
# all available cores, fixed seed.
search_config = {
    'estimator': lgb_model,
    'search_spaces': param_space,
    'n_iter': 32,
    'cv': 3,
    'n_jobs': -1,
    'random_state': 42,
}
bayes_search = BayesSearchCV(**search_config)

# Run the search on the training split
bayes_search.fit(X_train, y_train)

# Take the best estimator and its hyperparameters from the finished search
best_model = bayes_search.best_estimator_
best_params = bayes_search.best_params_

# Score the best estimator on the held-out test split
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

# Report the results
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)
print("Classification Report:\n", report_text)

[LightGBM] [Info] Number of positive: 417, number of negative: 383
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000053 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 200
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.521250 -> initscore=0.085051
[LightGBM] [Info] Start training from score 0.085051
Best Parameters: OrderedDict([('learning_rate', 0.01), ('max_depth', 12), ('min_child_samples', 5), ('n_estimators', 117), ('num_leaves', 10)])
Accuracy: 0.51
Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.26      0.36       102
           1       0.50      0.77      0.60        98

    accuracy                           0.51       200
   macro avg       0.52      0.52      0.48       