<p style="font-weight:bold; letter-spacing: 2px; color:#F5F5DC; font-size:140%; text-align:left; max-width: 1050px; padding: 10px; border-bottom: 3px solid #D2B48C">Preprocessing for Sleep Disorder Prediction</p>

In [None]:
# Label-encode the categorical columns that are treated as binary.
# One encoder per column is kept in `le_dict` (keyed by column name) so the
# mapping learned on the training split can be reused or inverted later.
binary_cols = ['Gender', 'BMI Category']
le_dict = {col: LabelEncoder() for col in binary_cols}
for binary_col, encoder in le_dict.items():
    # Learn the label mapping on the training split only ...
    X_train[binary_col] = encoder.fit_transform(X_train[binary_col])
    # ... and apply that same mapping to the test split.
    X_test[binary_col] = encoder.transform(X_test[binary_col])

In [None]:
# One-hot encode the multi-category columns.
multi_cat_cols = ['Blood Pressure Category', 'Occupation']

# The training split defines the dummy layout; the first level of each column
# is dropped there to act as the baseline category.
X_train = pd.get_dummies(X_train, columns=multi_cat_cols, drop_first=True)

# Encode the test split WITHOUT drop_first. Dropping the first level
# independently on the test split removes a *different* level whenever the
# training baseline is absent from the test data, which would silently turn
# those rows into the baseline category after alignment.
X_test = pd.get_dummies(X_test, columns=multi_cat_cols)

# Align the test split to the training layout: columns missing from the test
# split are added as all-zero, columns unknown to the training split
# (including the training baseline level) are discarded, and the column order
# follows X_train.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Standardise the numerical columns.
# A dedicated StandardScaler per column is stored in `ss_dict` (keyed by
# column name) so individual columns can be transformed or inverted later.
scale_cols = ['Age', 'Sleep Duration', 'Quality of Sleep', 'Physical Activity Level', 'Stress Level', 'Heart Rate', 'Daily Steps']
ss_dict = {col: StandardScaler() for col in scale_cols}
for scale_col, scaler in ss_dict.items():
    # Double brackets keep the selection 2-D, as the scaler expects.
    train_column = X_train[[scale_col]]
    test_column = X_test[[scale_col]]
    # Statistics come from the training split only; the test split reuses them.
    X_train[scale_col] = scaler.fit_transform(train_column)
    X_test[scale_col] = scaler.transform(test_column)


In [None]:
class CustomSMOTENC(BaseEstimator, TransformerMixin):
    """Wrapper exposing a SMOTENC sampler through a fit/transform interface.

    Parameters
    ----------
    categorical_features : optional
        Forwarded unchanged to ``SMOTENC(categorical_features=...)``.
    sampling_strategy : default 'auto'
        Forwarded unchanged to ``SMOTENC(sampling_strategy=...)``.
    random_state : optional
        Forwarded unchanged to ``SMOTENC(random_state=...)``.

    Attributes
    ----------
    smote_nc : SMOTENC
        The wrapped sampler. Its parameters are re-synchronised with this
        wrapper's attributes before every use, so changes made through
        ``set_params`` are honoured.

    Notes
    -----
    Unlike a regular transformer, ``transform`` needs the target ``y`` and
    returns a ``(X_resampled, y_resampled)`` tuple.
    """

    def __init__(self, categorical_features=None, sampling_strategy='auto', random_state=None):
        self.categorical_features = categorical_features
        self.sampling_strategy = sampling_strategy
        self.random_state = random_state
        self.smote_nc = SMOTENC(
            categorical_features=categorical_features,
            sampling_strategy=sampling_strategy,
            random_state=random_state
        )

    def _sync_sampler(self):
        """Copy the wrapper's current parameters onto the wrapped sampler and return it."""
        # Without this, set_params() on the wrapper would update only the
        # wrapper's attributes while the sampler kept the constructor values.
        self.smote_nc.set_params(
            categorical_features=self.categorical_features,
            sampling_strategy=self.sampling_strategy,
            random_state=self.random_state,
        )
        return self.smote_nc

    def fit(self, X, y=None):
        """Call ``fit`` on the wrapped sampler with ``X`` and ``y``; return ``self``."""
        self._sync_sampler().fit(X, y)
        return self

    def transform(self, X, y=None):
        """Resample ``X`` and ``y`` with the wrapped sampler.

        Returns
        -------
        tuple
            ``(X_resampled, y_resampled)`` as produced by ``fit_resample``.

        Raises
        ------
        ValueError
            If ``y`` is None; resampling cannot be done without the target.
        """
        if y is None:
            raise ValueError(
                "CustomSMOTENC.transform requires y: SMOTE-NC resamples X and y together."
            )
        # fit_resample re-fits the sampler on the data it is given.
        X_resampled, y_resampled = self._sync_sampler().fit_resample(X, y)
        return X_resampled, y_resampled


# Names of the columns handed to SMOTE-NC as categorical features,
# grouped by the original column they appear to derive from.
categorical_features = [
    # Gender
    'Gender_Female',
    'Gender_Male',
    # Occupation
    'Occupation_Accountant',
    'Occupation_Doctor',
    'Occupation_Engineer',
    'Occupation_Lawyer',
    'Occupation_Nurse',
    'Occupation_Salesperson',
    'Occupation_Teacher',
    'Occupation_Unknown',
    # BMI category
    'BMI Category_Normal',
    'BMI Category_Overweight',
    # Blood pressure category
    'Blood Pressure Category_Elevated',
    'Blood Pressure Category_High Blood Pressure Stage 1',
    'Blood Pressure Category_High Blood Pressure Stage 2',
    'Blood Pressure Category_Normal',
]


# Build the SMOTE-NC wrapper, fit it on the transformed training features and
# resample them together with the training labels.
smote = CustomSMOTENC(categorical_features=categorical_features, random_state=42)
# fit() returns the wrapper itself, so the resampling call can be chained.
X_resampled, y_resampled = smote.fit(transformed_X_train_df, y_train).transform(
    transformed_X_train_df, y_train
)

# Print the class distribution before and after SMOTE-NC
for heading, labels in (
    ("Class distribution before SMOTE-NC:", y_train),
    ("\nClass distribution after SMOTE-NC:", y_resampled),
):
    print(heading)
    print(pd.Series(labels).value_counts())


# Update pipeline
# NOTE(review): `column_trans` and `model_mlr` are not defined in this cell;
# they must already exist in the kernel when it runs - confirm the cell order.
pipeline_mlr = settings.create_pipeline(["preprocess", column_trans], ["smote", smote], ["multinomial_logistic_regression", model_mlr])

# Check that train dataset has been smote augmented
# Fit/transform the training features once and reuse the resulting frame for
# both the fit and the resampling call instead of recomputing it each time.
X_train_transformed_df = settings.convert_transformed_features_to_df(
    column_trans, column_trans.fit_transform(X_train)
)
smote.fit(X_train_transformed_df, y_train)
X_train_smote_df, y_train_smote_df = smote.transform(X_train_transformed_df, y_train)

# Print the class distribution before and after SMOTE-NC
print("Class distribution before SMOTE-NC:")
print(pd.Series(y_train).value_counts())
print("\nClass distribution after SMOTE-NC:")
# Report the labels produced by the resampling call above.
print(pd.Series(y_train_smote_df).value_counts())

In [None]:
# Create a column transformer for encoding
# handle_unknown='use_encoded_value' must be paired with an explicit
# `unknown_value`; without it scikit-learn rejects the encoder when it is
# fitted. -1 cannot collide with a learned code, since those start at 0.
# NOTE(review): CategoricalNB expects non-negative category codes, so a
# category that is truly unseen at prediction time will still surface as an
# error in the model step - confirm whether unseen categories can occur and,
# if so, choose a dedicated strategy for them.
column_trans = ColumnTransformer(
    transformers=[
        (
            'encoder',
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
            ALL_FEATURES_ENCODE,
        )
    ],
    # Columns not listed in ALL_FEATURES_ENCODE are forwarded unchanged.
    remainder='passthrough')

# create model
model_nb = CategoricalNB()

# create sklearn pipeline
pipeline_nb = settings.create_pipeline(["preprocess", column_trans], ["categorical_naive_bayes", model_nb])

# create cross validation object for cross_val_score
# 5 stratified folds repeated 3 times -> 15 scores; seeded for reproducibility.
cv_nb = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

# run training cross validation on pipeline
cv_scores = cross_val_score(pipeline_nb, X_train, y_train, scoring='accuracy', cv=cv_nb)