In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read the preprocessed Dry Eye Disease CSV directly from its raw GitHub URL
file_path = "https://raw.githubusercontent.com/anvarbla/Dry-Eye-Disease/refs/heads/main/Preprocessed_Dry_Eye_Dataset.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# One-hot encode the "Sleep quality" and "Stress level" columns in a single call.
# dtype=int makes pandas emit 0/1 integers directly, so every level that occurs
# in the data is covered without naming (and casting) each dummy column by hand;
# the resulting columns are "Sleep quality_<level>" followed by "Stress level_<level>".
df = pd.get_dummies(df, columns=['Sleep quality', 'Stress level'], drop_first=False, dtype=int)

In [None]:
# Inspect the frame after the "Sleep quality" / "Stress level" one-hot encoding
df

Unnamed: 0,Age,Sleep duration,Heart rate,Daily steps,Physical activity,Height,Weight,Sleep disorder,Wake up during night,Feel sleepy during day,...,Sleep quality_1,Sleep quality_2,Sleep quality_3,Sleep quality_4,Sleep quality_5,Stress level_1,Stress level_2,Stress level_3,Stress level_4,Stress level_5
0,24,9.5,67,3000,31,161,69,1,0,0,...,0,1,0,0,0,1,0,0,0,0
1,39,9.6,60,12000,74,164,87,0,0,0,...,0,1,0,0,0,0,0,1,0,0
2,45,5.4,95,12000,93,179,94,1,1,0,...,1,0,0,0,0,0,0,0,0,1
3,45,5.4,78,19000,32,160,77,0,0,0,...,0,0,0,1,0,0,0,0,0,1
4,42,5.7,72,4000,173,179,99,0,1,0,...,0,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,26,9.4,91,20000,88,200,72,0,1,1,...,0,1,0,0,0,0,0,1,0,0
19996,39,7.3,65,2000,53,195,96,1,0,1,...,0,0,1,0,0,0,0,0,1,0
19997,20,8.0,78,10000,17,186,87,0,0,1,...,0,0,0,0,1,0,0,1,0,0
19998,38,4.5,60,3000,115,170,66,0,1,0,...,0,0,1,0,0,0,0,1,0,0


In [None]:

# Create "Age Group": ages of 25 and under are "Young Adult", everything else "Adult".
# The comparison is vectorised (Series.le) instead of a row-wise lambda; rows where
# the comparison is False (including a missing age) map to "Adult", as before.
df['Age Group'] = df['Age'].le(25).map({True: 'Young Adult', False: 'Adult'})

# One-hot encode "Age Group". dtype=int yields 0/1 integers directly, so the cell
# no longer fails with a KeyError when one of the two groups is absent from the data.
df = pd.get_dummies(df, columns=['Age Group'], drop_first=False, dtype=int)

# Display the first few rows to verify
print(df.head())

   Age  Sleep duration  Heart rate  Daily steps  Physical activity  Height  \
0   24             9.5          67         3000                 31     161   
1   39             9.6          60        12000                 74     164   
2   45             5.4          95        12000                 93     179   
3   45             5.4          78        19000                 32     160   
4   42             5.7          72         4000                173     179   

   Weight  Sleep disorder  Wake up during night  Feel sleepy during day  ...  \
0      69               1                     0                       0  ...   
1      87               0                     0                       0  ...   
2      94               1                     1                       0  ...   
3      77               0                     0                       0  ...   
4      99               0                     1                       0  ...   

   Sleep quality_3  Sleep quality_4  Sleep quality

In [None]:
# BMI = Weight / (Height / 100)^2, rounded to one decimal place.
# Height is divided by 100 before squaring (presumably cm -> m; confirm the dataset's units).
df['BMI'] = df['Weight'].div(df['Height'].div(100).pow(2)).round(1)

In [None]:
# BMI category edges. pd.cut is called with right=False below, so every bin is
# half-open: [lower, upper). The upper edges are therefore 25 and 30 rather than
# 24.9 and 29.9 -- with the old edges a BMI of exactly 24.9 landed in
# "BMI_overweight" and exactly 29.9 in "BMI_obesity", because the upper edge
# itself is excluded from a bin.
bmi_bins = [0, 18.5, 25, 30, float('inf')]  # BMI thresholds
bmi_labels = ['BMI_underweight', 'BMI_normal_weight', 'BMI_overweight', 'BMI_obesity']  # Labels for categories

# Create a new column 'BMI Category' based on the BMI value:
#   [0, 18.5) underweight, [18.5, 25) normal, [25, 30) overweight, [30, inf) obesity
df['BMI Category'] = pd.cut(df['BMI'], bins=bmi_bins, labels=bmi_labels, right=False)

In [None]:
# Initialize OneHotEncoder (sparse_output=False to get a dense array)
ohe = OneHotEncoder(sparse_output=False)

# Fit and transform the BMI Category column
encoded_bmi = ohe.fit_transform(df[['BMI Category']])

# Convert the encoded result into a DataFrame whose columns are the category names.
# index=df.index is essential: pd.concat(axis=1) aligns rows by index label, not by
# position, so a fresh 0..n-1 index would misalign rows (and introduce NaNs) whenever
# df's index is not the default range.
encoded_bmi_df = pd.DataFrame(
    encoded_bmi,
    columns=ohe.categories_[0],
    index=df.index,
).astype(int)

# Join the one-hot encoded columns with the original DataFrame
df = pd.concat([df, encoded_bmi_df], axis=1)

In [None]:
# Create the function
def sleep_group(hours):
    """Return the sleep-duration label for a number of hours.

    Upper bounds are inclusive: up to 4 -> "Very short", up to 6 -> "Short",
    up to 8 -> "Normal", up to 10 -> "Long". Anything that matches none of
    these bounds (values above 10, or NaN) is labelled "Very Long".
    """
    # Bounds are checked in ascending order, so the first match is the tightest one.
    upper_bounds = (
        (4, "Very short"),
        (6, "Short"),
        (8, "Normal"),
        (10, "Long"),
    )
    for limit, label in upper_bounds:
        if hours <= limit:
            return label
    return "Very Long"


# Apply the function to label every row's sleep duration
df["Sleep duration group"] = df["Sleep duration"].apply(sleep_group)

# One-hot encode the "Sleep duration group" column. dtype=int yields 0/1 integers
# for every group that occurs -- including "Very Long", which the previous
# per-column casts never converted -- and avoids a KeyError when a group is absent.
df = pd.get_dummies(df, columns=['Sleep duration group'], drop_first=False, dtype=int)

In [None]:
# Inspect the frame after adding the BMI and sleep-duration-group indicator columns
df

Unnamed: 0,Age,Sleep duration,Heart rate,Daily steps,Physical activity,Height,Weight,Sleep disorder,Wake up during night,Feel sleepy during day,...,BMI,BMI Category,BMI_normal_weight,BMI_obesity,BMI_overweight,BMI_underweight,Sleep duration group_Long,Sleep duration group_Normal,Sleep duration group_Short,Sleep duration group_Very short
0,24,9.5,67,3000,31,161,69,1,0,0,...,26.6,BMI_overweight,0,0,1,0,1,0,0,0
1,39,9.6,60,12000,74,164,87,0,0,0,...,32.3,BMI_obesity,0,1,0,0,1,0,0,0
2,45,5.4,95,12000,93,179,94,1,1,0,...,29.3,BMI_overweight,0,0,1,0,0,0,1,0
3,45,5.4,78,19000,32,160,77,0,0,0,...,30.1,BMI_obesity,0,1,0,0,0,0,1,0
4,42,5.7,72,4000,173,179,99,0,1,0,...,30.9,BMI_obesity,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,26,9.4,91,20000,88,200,72,0,1,1,...,18.0,BMI_underweight,0,0,0,1,1,0,0,0
19996,39,7.3,65,2000,53,195,96,1,0,1,...,25.2,BMI_overweight,0,0,1,0,0,1,0,0
19997,20,8.0,78,10000,17,186,87,0,0,1,...,25.1,BMI_overweight,0,0,1,0,0,1,0,0
19998,38,4.5,60,3000,115,170,66,0,1,0,...,22.8,BMI_normal_weight,1,0,0,0,0,0,1,0


In [None]:
# Remove the raw columns that the derived indicator columns were built from
# (age group, sleep-duration group, BMI category), all in one in-place call.
df.drop(
    columns=["Age", "Sleep duration", "Height", "Weight", "BMI", "BMI Category"],
    inplace=True,
)

In [None]:
# Columns that get rescaled; every other column is left as it is
columns_to_normalize = [
    "Heart rate",
    "Daily steps",
    "Physical activity",
    "Average screen time",
    "Systolic_BP",
    "Diastolic_BP",
]

# Learn each selected column's min/max, then overwrite those columns with the
# scaled values.
# NOTE(review): the scaler is fitted on the whole frame, before any train/test
# split, so the later test rows contribute to the min/max -- confirm this is acceptable.
scaler = MinMaxScaler()
scaler.fit(df[columns_to_normalize])
df[columns_to_normalize] = scaler.transform(df[columns_to_normalize])
df

Unnamed: 0,Heart rate,Daily steps,Physical activity,Sleep disorder,Wake up during night,Feel sleepy during day,Caffeine consumption,Alcohol consumption,Smoking,Medical issue,...,Age Group_Adult,Age Group_Young Adult,BMI_normal_weight,BMI_obesity,BMI_overweight,BMI_underweight,Sleep duration group_Long,Sleep duration group_Normal,Sleep duration group_Short,Sleep duration group_Very short
0,0.175,0.105263,0.172222,1,0,0,0,0,0,1,...,0,1,0,0,1,0,1,0,0,0
1,0.000,0.578947,0.411111,0,0,0,0,1,0,1,...,1,0,0,1,0,0,1,0,0,0
2,0.875,0.578947,0.516667,1,1,0,1,1,0,0,...,1,0,0,0,1,0,0,0,1,0
3,0.450,0.947368,0.177778,0,0,0,1,0,0,1,...,1,0,0,1,0,0,0,0,1,0
4,0.300,0.157895,0.961111,0,1,0,0,0,0,1,...,1,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0.775,1.000000,0.488889,0,1,1,0,0,1,1,...,1,0,0,0,0,1,1,0,0,0
19996,0.125,0.052632,0.294444,1,0,1,0,1,0,1,...,1,0,0,0,1,0,0,1,0,0
19997,0.450,0.473684,0.094444,0,0,1,1,1,1,1,...,0,1,0,0,1,0,0,1,0,0
19998,0.000,0.105263,0.638889,0,1,0,1,1,1,0,...,1,0,1,0,0,0,0,0,1,0


In [None]:
# Oversampling so that both "Dry Eye Disease" classes have the same number of training rows.
# The train/test split is made first because oversampling is applied to the train group only.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

# X-y split; features = X, target = y
features = df.drop(columns = ["Dry Eye Disease"])
target = df["Dry Eye Disease"]

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=0)

# Re-attach the target on a separate frame. X_train itself stays feature-only:
# writing the label into it would leak the target into any model later fitted on X_train.
train_labelled = X_train.assign(**{"Dry Eye Disease": y_train.values})
dry_eye = train_labelled[train_labelled["Dry Eye Disease"] == 1]
no_dry_eye = train_labelled[train_labelled["Dry Eye Disease"] == 0]

# Draw "no dry eye" rows with replacement until there are as many as "dry eye" rows.
# NOTE(review): this assumes class 0 is the smaller class (the classification reports
# in this notebook show fewer class-0 rows); if that ever flips, this call would
# down-sample with replacement instead of oversampling.
no_dry_eye_oversampled = resample(no_dry_eye,
                                  replace=True,
                                  n_samples=len(dry_eye),
                                  random_state=0)
# The frame holds class-0 rows, hence the clearer name above; the original name is
# kept as an alias in case other cells refer to it.
yes_dry_eye_oversampled = no_dry_eye_oversampled

train_over = pd.concat([dry_eye, no_dry_eye_oversampled])
train_over

Unnamed: 0,Heart rate,Daily steps,Physical activity,Sleep disorder,Wake up during night,Feel sleepy during day,Caffeine consumption,Alcohol consumption,Smoking,Medical issue,...,Age Group_Young Adult,BMI_normal_weight,BMI_obesity,BMI_overweight,BMI_underweight,Sleep duration group_Long,Sleep duration group_Normal,Sleep duration group_Short,Sleep duration group_Very short,Dry Eye Disease
17815,0.875,0.736842,0.783333,0,1,1,0,0,0,1,...,0,0,0,0,1,0,1,0,0,1
1379,0.750,0.368421,0.716667,1,0,0,1,0,1,0,...,0,1,0,0,0,0,1,0,0,1
7346,0.500,0.052632,0.450000,0,1,0,0,1,1,1,...,1,0,0,1,0,1,0,0,0,1
19550,0.325,0.473684,0.033333,1,0,1,1,0,1,0,...,1,1,0,0,0,0,0,1,0,1
19718,0.150,0.210526,0.544444,1,0,1,0,1,1,1,...,0,0,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15706,0.825,1.000000,0.577778,1,0,0,0,1,0,1,...,1,1,0,0,0,1,0,0,0,0
6490,0.825,0.368421,0.744444,1,1,1,1,0,1,0,...,0,1,0,0,0,0,0,1,0,0
19650,0.950,0.210526,0.850000,0,1,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2788,0.500,1.000000,0.766667,1,1,0,0,1,0,0,...,0,0,0,0,1,0,1,0,0,0


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Separate the "Dry Eye Disease" label (y) from the predictors (X)
y = df['Dry Eye Disease']  # Target variable
X = df.drop(columns=['Dry Eye Disease'])  # Features

# 80% train / 20% test, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardize the features: the scaler is fitted on the training portion only
# and the same fitted scaler is then applied to both portions
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the Logistic Regression model (fit returns the fitted estimator itself)
model = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)

# Predict the held-out rows
y_pred = model.predict(X_test_scaled)

# Score the predictions
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print results
print(f"Model Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", report)

Model Accuracy: 0.6793

Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.14      0.24      1393
           1       0.68      0.97      0.80      2607

    accuracy                           0.68      4000
   macro avg       0.68      0.55      0.52      4000
weighted avg       0.68      0.68      0.60      4000



In [None]:
#Apply the oversampling df on Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error, classification_report, accuracy_score

X_train_resampled = train_over.drop(columns=["Dry Eye Disease"])  # Features
y_train_resampled = train_over["Dry Eye Disease"]  # Target

# Rebuild the hold-out set that belongs to `train_over`.
# `train_over` was drawn from the training part of the test_size=0.20, random_state=0
# split, whereas the X_test / y_test in scope at this point come from a different split
# (random_state=42, stratified). Scoring on those would evaluate the forest on rows it
# was trained on and inflate every metric, so the matching split is recomputed here.
from sklearn.model_selection import train_test_split

matching_split = train_test_split(
    df.drop(columns=["Dry Eye Disease"]),
    df["Dry Eye Disease"],
    test_size=0.20,
    random_state=0,
)
X_test_over, y_test_over = matching_split[1], matching_split[3]

# Guard against train/test overlap: no evaluated row may appear in the training data
assert X_test_over.index.intersection(train_over.index).empty, \
    "hold-out rows overlap with the oversampled training data"

# Initialize Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the oversampled dataset
rf.fit(X_train_resampled, y_train_resampled)

# Predict on the hold-out rows, which were never oversampled
y_pred = rf.predict(X_test_over)

# Evaluate model performance (for 0/1 labels MAE and MSE both equal the error rate)
print(f"MAE: {mean_absolute_error(y_test_over, y_pred): .2f}")
print(f"MSE: {mean_squared_error(y_test_over, y_pred): .2f}")
print(f"Accuracy Score: {accuracy_score(y_test_over, y_pred):.2f}")
print(classification_report(y_test_over, y_pred))

MAE:  0.09
MSE:  0.09
Accuracy Score: 0.91
              precision    recall  f1-score   support

           0       0.97      0.75      0.85      1393
           1       0.88      0.99      0.93      2607

    accuracy                           0.91      4000
   macro avg       0.93      0.87      0.89      4000
weighted avg       0.91      0.91      0.90      4000



In [None]:
#Apply the original df on Random Forest Classifier
target = df["Dry Eye Disease"]
features = df.drop(columns=['Dry Eye Disease'])
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.20,
    random_state=42,
)

# 100-tree forest trained on the un-resampled training split
# (fit returns the fitted estimator itself)
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = rf.predict(X_test)

# Report the error metrics, the accuracy and the per-class breakdown
print(f"MAE, {mean_absolute_error(y_pred, y_test): .4f}")
print(f"MSE, {mean_squared_error(y_pred, y_test): .4f}")
print(f"Accuracy score: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

MAE,  0.3035
MSE,  0.3035
Accuracy score: 0.6965
              precision    recall  f1-score   support

           0       0.60      0.21      0.31      1307
           1       0.71      0.93      0.81      2693

    accuracy                           0.70      4000
   macro avg       0.66      0.57      0.56      4000
weighted avg       0.67      0.70      0.64      4000



In [None]:
#KNN
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt

X_train_over = train_over.drop(columns = ["Dry Eye Disease"])
y_train_over = train_over["Dry Eye Disease"]

# Rebuild the hold-out set that belongs to `train_over`.
# `train_over` was drawn from the training part of the test_size=0.20, random_state=0
# split, while the X_test / y_test in scope at this point come from a random_state=42
# split. Scoring on those would evaluate the model on rows it was trained on, so the
# matching split is recomputed here.
matching_split = train_test_split(
    df.drop(columns=["Dry Eye Disease"]),
    df["Dry Eye Disease"],
    test_size=0.20,
    random_state=0,
)
X_test_over, y_test_over = matching_split[1], matching_split[3]

# Guard against train/test overlap: no evaluated row may appear in the training data
assert X_test_over.index.intersection(train_over.index).empty, \
    "hold-out rows overlap with the oversampled training data"

knn = KNeighborsClassifier(n_neighbors=3) # n_neighbors = K
knn.fit(X_train_over, y_train_over)

# Predict once and reuse the predictions for the score (knn.score would predict again)
pred = knn.predict(X_test_over)

print(f"The accuracy of the model is {accuracy_score(y_test_over, pred)*100: .2f}%")

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Mak

In [None]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.model_selection import train_test_split

# Assumes df has already been loaded and preprocessed by the cells above
features = df.drop(columns=['Dry Eye Disease'])
target = df['Dry Eye Disease']

# Split into train and test datasets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=42)

# Carve a validation set out of the training data for the hyper-parameter search.
# Trials must not be scored on X_test: picking the parameters that happen to do best
# on the test rows makes the final test accuracy an optimistic, biased estimate.
X_tune, X_val, y_tune, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Define the objective function for Optuna optimization
def objective(trial):
    """Train a random forest with the trial's hyper-parameters and return its validation accuracy.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Accuracy on the validation split (X_val, y_val) of a forest fitted on
        (X_tune, y_tune); the study maximises this value.
    """
    # Define hyperparameters to optimize
    n_estimators = trial.suggest_int('n_estimators', 50, 500)  # number of trees in forest
    max_depth = trial.suggest_int('max_depth', 10, 50, log=True)  # max depth of each tree
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)  # min samples required to split an internal node
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)  # min samples required to be at a leaf node
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])  # max features to consider for splitting

    # Initialize Random Forest with these hyperparameters
    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42
    )

    # Train on the tuning portion only; the test set stays untouched during the search
    rf.fit(X_tune, y_tune)

    # Score the trial on the validation portion
    return accuracy_score(y_val, rf.predict(X_val))

# Create an Optuna study to optimize the objective function.
# The sampler is seeded so that a re-run explores the same sequence of trials.
study = optuna.create_study(
    direction='maximize',  # We want to maximize the accuracy
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)  # Number of trials for optimization

# Print the best hyperparameters found
print(f"Best Hyperparameters: {study.best_params}")
print(f"Best Validation Accuracy: {study.best_value:.4f}")

# Now, train the final Random Forest model on the full training set
# with the best hyperparameters found by Optuna
best_rf = RandomForestClassifier(**study.best_params, random_state=42)
best_rf.fit(X_train, y_train)

# Predict on the test set, which played no part in the hyper-parameter search
y_pred = best_rf.predict(X_test)

# Evaluate the final model
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")


[I 2025-02-27 16:28:12,388] A new study created in memory with name: no-name-4181f78b-b7af-4be9-9f88-94b86d4d874b
[I 2025-02-27 16:28:34,062] Trial 0 finished with value: 0.70175 and parameters: {'n_estimators': 163, 'max_depth': 32, 'min_samples_split': 4, 'min_samples_leaf': 18, 'max_features': None}. Best is trial 0 with value: 0.70175.
[I 2025-02-27 16:28:38,946] Trial 1 finished with value: 0.69925 and parameters: {'n_estimators': 231, 'max_depth': 33, 'min_samples_split': 4, 'min_samples_leaf': 6, 'max_features': 'log2'}. Best is trial 0 with value: 0.70175.
[I 2025-02-27 16:28:42,729] Trial 2 finished with value: 0.70025 and parameters: {'n_estimators': 189, 'max_depth': 10, 'min_samples_split': 18, 'min_samples_leaf': 9, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.70175.
[I 2025-02-27 16:28:51,822] Trial 3 finished with value: 0.701 and parameters: {'n_estimators': 86, 'max_depth': 30, 'min_samples_split': 18, 'min_samples_leaf': 8, 'max_features': None}. Best is tri

Best Hyperparameters: {'n_estimators': 56, 'max_depth': 12, 'min_samples_split': 9, 'min_samples_leaf': 11, 'max_features': None}
Best Accuracy: 0.7023
Test Accuracy: 0.7023
