In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import VotingRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn import ensemble
from sklearn import svm
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from catboost import CatBoostRegressor, CatBoostClassifier, Pool
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier

import optuna

import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Load the raw table and keep only rows whose total cholesterol is not 187.
# NOTE(review): 187 is treated as a special value throughout this notebook
# (presumably a placeholder / imputed reading) — confirm against the data source.
data = pd.read_csv('data.csv')
data = data[data['Cholesterol Total (mg/dL)'] != 187]

# The regression target is taken before the column is removed from the features.
target = data['Cholesterol Total (mg/dL)']

# Build the feature frame without mutating the filtered slice in place:
# drop the identifier, the target and the birthplace column, then encode
# "Jenis Kelamin" as 1 for 'M' and 0 for every other value.
data = (
    data
    .drop(columns=['Responden', 'Cholesterol Total (mg/dL)', 'Tempat lahir'])
    .assign(**{"Jenis Kelamin": lambda frame: frame["Jenis Kelamin"].eq('M').astype('int64')})
)

In [22]:
def perform_cross_validation(model, X, y, num_folds):
    """Evaluate ``model`` with shuffled, stratified k-fold CV and print the RMSE.

    For every fold the model is re-fitted on the training part and scored on
    the held-out part. The list of per-fold RMSE values is printed first,
    followed by their mean.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. The same instance
        is re-fitted on every fold.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target values, positionally aligned with ``X``.
    num_folds : int
        Number of folds passed to the splitter.

    Returns
    -------
    None
        Results are only printed.
    """
    # NOTE(review): StratifiedKFold stratifies on ``y`` itself, so the raw
    # target values act as class labels for the split — confirm this is
    # intended for a regression target (plain KFold is the usual choice).
    kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

    rmse_scores = []
    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Store plain Python floats so the printed list stays readable
        # regardless of how NumPy renders its scalar types.
        rmse_scores.append(float(np.sqrt(mean_squared_error(y_test, y_pred))))

    print(rmse_scores)
    print(f'RMSE = {np.mean(rmse_scores)}')

In [10]:
# Candidate models, all scored with the same cross-validation routine.
# Stochastic learners get a fixed seed so that re-running this cell
# reproduces the printed scores.
models = [
    XGBRegressor(random_state=42),
    CatBoostRegressor(verbose=False, random_seed=42),
    LGBMRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
    LinearRegression(),
    Ridge(),
    Lasso(),
    # NOTE(review): LogisticRegression is a classifier, so it is fitted on the
    # target values as class labels; kept here as in the original comparison —
    # confirm it belongs in a regression benchmark.
    LogisticRegression(),
]

# Print each model's class name followed by its cross-validated RMSE (5 folds).
for model in models:
    print(model.__class__.__name__)
    perform_cross_validation(model, data, target, 5)

XGBRegressor
RMSE = 32.55845011260976
CatBoostRegressor
RMSE = 32.21672503030186
LGBMRegressor
RMSE = 32.53727556023499
RandomForestRegressor
RMSE = 31.52978966740082
LinearRegression
RMSE = 32.2936294706871
Ridge
RMSE = 32.28843252235978
Lasso
RMSE = 32.26098747866248
LogisticRegression
RMSE = 39.77495361746239


In [20]:
# Score a random forest on a reduced feature set (5 folds). The forest is
# seeded so that the printed RMSE can be reproduced on re-run.
perform_cross_validation(
    RandomForestRegressor(random_state=42),
    data.drop(
        columns=[
            'IMT (kg/m2)',
            'Tinggi badan (cm)',
            'Lingkar perut (cm)',
            'Masa Kerja',
            'Tekanan darah  (S)',
            'Tekanan darah  (D)',
        ]
    ),
    target,
    5,
)

RMSE = 32.09047706266671


In [27]:
class DSCModel:
    """Two-stage model: a classifier flags rows whose target equals
    ``SPECIAL_VALUE``; a regressor predicts the target for all other rows.

    The classifier is trained on every column of ``X``. The regressor is
    trained only on rows whose target differs from ``SPECIAL_VALUE`` and
    without the columns listed in ``REGRESSOR_EXCLUDED_COLUMNS``.
    """

    # Target value handled by the classifier stage instead of the regressor.
    SPECIAL_VALUE = 187

    # Columns seen by the classifier only; removed before the regressor stage.
    REGRESSOR_EXCLUDED_COLUMNS = [
        'IMT (kg/m2)',
        'Tinggi badan (cm)',
        'Lingkar perut (cm)',
        'Masa Kerja',
        'Tekanan darah  (S)',
        'Tekanan darah  (D)',
    ]

    def __init__(self):
        # Fixed seeds keep repeated fits on the same data reproducible.
        self.__classifier = XGBClassifier(random_state=42)
        self.__regressor = RandomForestRegressor(random_state=42)

    def _regressor_features(self, X):
        """Return a new frame without the classifier-only columns."""
        return X.drop(columns=self.REGRESSOR_EXCLUDED_COLUMNS)

    def fit(self, X, y):
        """Fit both stages.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix containing every column in
            ``REGRESSOR_EXCLUDED_COLUMNS``. It is not modified.
        y : pandas.Series
            Target values sharing ``X``'s index.

        Returns
        -------
        DSCModel
            ``self``, so calls can be chained.

        Raises
        ------
        KeyError
            If ``X`` lacks one of the ``REGRESSOR_EXCLUDED_COLUMNS``.
        """
        is_special = (y == self.SPECIAL_VALUE).astype(int)
        self.__classifier.fit(X, is_special)

        # Drop the columns on a copy: the caller's frame must stay intact so
        # that fitting twice on the same data does not fail.
        keep = is_special == 0
        self.__regressor.fit(self._regressor_features(X)[keep], y[keep])
        return self

    def predict(self, X):
        """Predict the target for every row of ``X``.

        Rows flagged by the classifier receive ``SPECIAL_VALUE``; the remaining
        rows receive the regressor's prediction rounded to the nearest integer.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix with the same columns used in ``fit``. It is not
            modified.

        Returns
        -------
        list
            One prediction per row, in the row order of ``X``.
        """
        is_special = np.asarray(self.__classifier.predict(X)).astype(bool)
        remaining_rows = self._regressor_features(X[~is_special])

        # Skip the regressor when the classifier flagged every row; calling
        # it with zero samples would fail.
        if len(remaining_rows):
            regressed = np.round(self.__regressor.predict(remaining_rows))
        else:
            regressed = []

        # Interleave both prediction streams back into the original row order.
        regressed_iter = iter(regressed)
        return [
            self.SPECIAL_VALUE if flagged else next(regressed_iter)
            for flagged in is_special
        ]

In [29]:
# Reload the table from disk, this time keeping every row.
data = pd.read_csv('data.csv')

# Take the regression target before its column is removed from the features.
target = data['Cholesterol Total (mg/dL)']

# Features: everything except the identifier, the target and the birthplace.
data = data.drop(columns=['Responden', 'Cholesterol Total (mg/dL)', 'Tempat lahir'])

# Encode "Jenis Kelamin" as 1 for 'M' and 0 for any other value.
data["Jenis Kelamin"] = data["Jenis Kelamin"].map(lambda sex: int(sex == 'M'))

In [30]:
# Cross-validate a fresh DSCModel on the current features and target (5 folds).
perform_cross_validation(model=DSCModel(), X=data, y=target, num_folds=5)

[22.61224077432002, 21.426462227397447, 20.60937332032967, 15.738653814116283, 20.87801201665978]
RMSE = 20.252948430564636
