In [8]:
from imblearn.over_sampling import SMOTE
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)


def load_fold_data(sf, nfs):
    """
    Memuat data lipatan (fold) yang ditentukan untuk prediksi stroke.
    Args:
        SF (int): Indeks lipatan yang akan dimuat dari data stroke.
        NFS (int): Indeks lipatan yang akan dimuat dari data non-stroke.
    Returns:
        tuple: Mengembalikan dua elemen:
            - training_data: Data training yang akan digunakan untuk melatih model.
            - testing_data: Data testing yang akan digunakan untuk menguji model.
    """
    # Load fold dari stroke data
    stroke_fold = pd.read_csv(f'StrokeFold/stroke_fold_{sf}.csv')
    remaining_stroke = [pd.read_csv(f'StrokeFold/stroke_fold_{i}.csv') for i in range(1, 6) if i != sf]
    remaining_stroke = pd.concat(remaining_stroke)

    # Load fold dari no stroke data
    no_stroke_fold = pd.read_csv(f'NonStrokeFold/no_stroke_fold_{nfs}.csv')

    # Ambil 100 data di no_stroke_fold sebagai testing
    testing_no_stroke = no_stroke_fold.sample(n=100, random_state=42)

    # Sisanya sebagai training
    training_no_stroke = no_stroke_fold.drop(testing_no_stroke.index)

    # Gabungkan data training stroke dan non-stroke
    training_data = pd.concat([remaining_stroke, training_no_stroke])

    # Gabungkan data testing stroke dan non-stroke
    testing_data = pd.concat([stroke_fold, testing_no_stroke])
    
    # Inisialisasi SMOTE
    smote = SMOTE(sampling_strategy=0.7, random_state=42)
    
    # Terapkan SMOTE pada data training
    X_train_smote, y_train_smote = smote.fit_resample(training_data.drop(columns=['stroke']), training_data['stroke'])

    # Gabungkan kembali data yang telah di-resample
    training_data = pd.concat([X_train_smote, y_train_smote], axis=1)

    return training_data, testing_data


In [4]:
import pandas as pd

def load_fold_data_ns(sf, nfs):
    """
    Memuat data lipatan (fold) yang ditentukan untuk prediksi stroke.
    Args:
        SF (int): Indeks lipatan yang akan dimuat dari data stroke.
        NFS (int): Indeks lipatan yang akan dimuat dari data non-stroke.
    Returns:
        tuple: Mengembalikan dua elemen:
            - training_data: Data training yang akan digunakan untuk melatih model.
            - testing_data: Data testing yang akan digunakan untuk menguji model.
    """
    # Load fold dari stroke data
    stroke_fold = pd.read_csv(f'StrokeFold/stroke_fold_{sf}.csv')
    remaining_stroke = [pd.read_csv(f'StrokeFold/stroke_fold_{i}.csv') for i in range(1, 6) if i != sf]
    remaining_stroke = pd.concat(remaining_stroke)

    # Load fold dari no stroke data
    no_stroke_fold = pd.read_csv(f'NonStrokeFold/no_stroke_fold_{nfs}.csv')

    # Ambil 100 data di no_stroke_fold sebagai testing
    testing_no_stroke = no_stroke_fold.sample(n=100, random_state=42)

    # Sisanya sebagai training
    training_no_stroke = no_stroke_fold.drop(testing_no_stroke.index)
    
    #mengambil fold dari data non stroke yang tersisa
    remaining_no_stroke = [pd.read_csv(f'NonStrokeFold/no_stroke_fold_{i}.csv') for i in range(1, 6) if i != nfs]
    remaining_no_stroke = pd.concat(remaining_no_stroke)

    # Gabungkan data training stroke dan non-stroke
    training_data = pd.concat([remaining_stroke, training_no_stroke, remaining_no_stroke])

    # Gabungkan data testing stroke dan non-stroke
    testing_data = pd.concat([stroke_fold, testing_no_stroke])

    return training_data, testing_data


In [5]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

def cross_validation(model, func):
    results = []
# Loop through all combinations of folds dengan progress bar
    for sf in tqdm(range(1, 6), desc="Stroke Folds"):
        for nfs in tqdm(range(1, 6), desc="Non-Stroke Folds", leave=False):
            train_data, test_data = func(sf, nfs)

            # Pisahkan fitur dan target untuk data train dan test
            X_train = train_data.drop(columns=['stroke'])
            y_train = train_data['stroke']
            X_test = test_data.drop(columns=['stroke'])
            y_test = test_data['stroke']

            # Latih model
            model.fit(X_train, y_train)

            # Prediksi pada data test
            y_pred = model.predict(X_test)

            # Evaluasi model
            accuracy = accuracy_score(y_test, y_pred)
            report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

            # Simpan hasil dalam dataframe
            results.append({
                'Stroke Fold': sf,
                'Non-Stroke Fold': nfs,
                'Accuracy': accuracy,
                'Precision (Weighted Avg)': report['weighted avg']['precision'],
                'Recall (Weighted Avg)': report['weighted avg']['recall'],
                'F1-Score (Weighted Avg)': report['weighted avg']['f1-score'],
                'Precision (Macro Avg)': report['macro avg']['precision'],
                'Recall (Macro Avg)': report['macro avg']['recall'],
                'F1-Score (Macro Avg)': report['macro avg']['f1-score']
            })
    # Buat DataFrame dari hasil
    result_df = pd.DataFrame(results)
    return result_df

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Inisialisasi model Logistic Regression dengan max_iter yang lebih tinggi
logistic_model = LogisticRegression(random_state=42, max_iter=1000)

# Scale the data
scaler = StandardScaler()

def cross_validation_scaled(model, func):
	results = []
	for sf in tqdm(range(1, 6), desc="Stroke Folds"):
		for nfs in tqdm(range(1, 6), desc="Non-Stroke Folds", leave=False):
			train_data, test_data = func(sf, nfs)

			# Pisahkan fitur dan target untuk data train dan test
			X_train = train_data.drop(columns=['stroke'])
			y_train = train_data['stroke']
			X_test = test_data.drop(columns=['stroke'])
			y_test = test_data['stroke']

			# Scale the data
			X_train = scaler.fit_transform(X_train)
			X_test = scaler.transform(X_test)

			# Latih model
			model.fit(X_train, y_train)

			# Prediksi pada data test
			y_pred = model.predict(X_test)

			# Evaluasi model
			accuracy = accuracy_score(y_test, y_pred)
			report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

			# Simpan hasil dalam dataframe
			results.append({
				'Stroke Fold': sf,
				'Non-Stroke Fold': nfs,
				'Accuracy': accuracy,
				'Precision (Weighted Avg)': report['weighted avg']['precision'],
				'Recall (Weighted Avg)': report['weighted avg']['recall'],
				'F1-Score (Weighted Avg)': report['weighted avg']['f1-score'],
				'Precision (Macro Avg)': report['macro avg']['precision'],
				'Recall (Macro Avg)': report['macro avg']['recall'],
				'F1-Score (Macro Avg)': report['macro avg']['f1-score']
			})
	# Buat DataFrame dari hasil
	result_df = pd.DataFrame(results)
	return result_df

results = cross_validation_scaled(logistic_model, load_fold_data)

print(results)
print(results.describe())



[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A

[A[A


[A[A[A


[A[A[A


[A[A[A

[A[A


[A[A[A


[A[A[A


[A[A[A

[A[A


[A[A[A


[A[A[A


[A[A[A

[A[A


[A[A[A


[A[A[A


[A[A[A

Stroke Folds: 100%|██████████| 5/5 [00:01<00:00,  4.53it/s]

    Stroke Fold  Non-Stroke Fold  Accuracy  Precision (Weighted Avg)  Recall (Weighted Avg)  F1-Score (Weighted Avg)  Precision (Macro Avg)  Recall (Macro Avg)  F1-Score (Macro Avg)
0             1                1  0.740000                  0.744213               0.740000                 0.741807               0.709103            0.715000              0.711695
1             1                2  0.746667                  0.744281               0.746667                 0.745336               0.714461            0.710000              0.712063
2             1                3  0.686667                  0.695056               0.686667                 0.690110               0.653110            0.660000              0.655678
3             1                4  0.746667                  0.744281               0.746667                 0.745336               0.714461            0.710000              0.712063
4             1                5  0.773333                  0.768153               0.77333




In [11]:
logistic_model = LogisticRegression(random_state=42, max_iter=1000)
results = cross_validation_scaled(logistic_model, load_fold_data_ns)
print(results)
print(results.describe())

Stroke Folds: 100%|██████████| 5/5 [00:01<00:00,  3.97it/s]

    Stroke Fold  Non-Stroke Fold  Accuracy  Precision (Weighted Avg)  Recall (Weighted Avg)  F1-Score (Weighted Avg)  Precision (Macro Avg)  Recall (Macro Avg)  F1-Score (Macro Avg)
0             1                1  0.666667                  0.444444               0.666667                 0.533333               0.333333                 0.5              0.400000
1             1                2  0.666667                  0.444444               0.666667                 0.533333               0.333333                 0.5              0.400000
2             1                3  0.666667                  0.444444               0.666667                 0.533333               0.333333                 0.5              0.400000
3             1                4  0.666667                  0.444444               0.666667                 0.533333               0.333333                 0.5              0.400000
4             1                5  0.666667                  0.444444               0.66666




In [28]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer

results = []

for i in range(1, 6):
    for j in range(1, 6):
        # Definisikan ruang pencarian untuk hyperparameter
        param_space = {
            'C': Real(1e-6, 1e+6, prior='log-uniform'),
            'solver': ['newton-cg', 'lbfgs', 'liblinear']
        }

        # Inisialisasi model RandomForestClassifier
        logistic_model = LogisticRegression(random_state=42, max_iter=1000)

        # Inisialisasi BayesSearchCV
        opt = BayesSearchCV(
            logistic_model,
            param_space,
            n_iter=32,
            cv=3,
            n_jobs=-1,
            random_state=42
        )

        # Latih model dengan optimasi Bayesian
        train_data, test_data = load_fold_data(i, j)
        X_train = train_data.drop(columns=['stroke'])
        y_train = train_data['stroke']
        X_test = test_data.drop(columns=['stroke'])
        y_test = test_data['stroke']
        opt.fit(X_train, y_train)

        # Simpan hasil terbaik
        best_params = opt.best_params_
        best_score = opt.best_score_

        # Prediksi pada data test
        y_pred_opt = opt.predict(X_test)

        # Evaluasi model
        accuracy_opt = accuracy_score(y_test, y_pred_opt)
        report_opt = classification_report(y_test, y_pred_opt, target_names=['No Stroke', 'Stroke'], output_dict=True)

        # Simpan hasil dalam dataframe
        results.append({
            'Stroke Fold': i,
            'Non-Stroke Fold': j,
            'Best Parameters': best_params,
            'Best Score': best_score,
            'Accuracy': accuracy_opt,
            'Precision': report_opt['weighted avg']['precision'],
            'Recall': report_opt['weighted avg']['recall'],
            'F1-Score': report_opt['weighted avg']['f1-score']
        })

# Buat DataFrame dari hasil
result_best_parameters = pd.DataFrame(results)

In [29]:
print(result_best_parameters)

    Stroke Fold  Non-Stroke Fold                                    Best Parameters  Best Score  Accuracy  Precision    Recall  F1-Score
0             1                1   {'C': 11185.625288472094, 'solver': 'liblinear'}    0.764465  0.740000   0.744213  0.740000  0.741807
1             1                2    {'C': 0.0004309780808920444, 'solver': 'lbfgs'}    0.756507  0.760000   0.755853  0.760000  0.757353
2             1                3   {'C': 1.6363876753596596, 'solver': 'liblinear'}    0.746432  0.686667   0.695056  0.686667  0.690110
3             1                4    {'C': 3.317697704417197, 'solver': 'liblinear'}    0.756507  0.746667   0.744281  0.746667  0.745336
4             1                5    {'C': 3.317697704417197, 'solver': 'liblinear'}    0.762385  0.773333   0.768153  0.773333  0.769400
5             2                1   {'C': 25.406936492978463, 'solver': 'liblinear'}    0.762790  0.760000   0.760000  0.760000  0.760000
6             2                2   {'C': 

In [41]:
model_best = LogisticRegression(C=best_params['C'], random_state=42, solver=best_params['solver'], max_iter=1000)

cross_validation_scaled(model_best, load_fold_data)

print(results)
print(results.describe())

Stroke Folds: 100%|██████████| 5/5 [00:01<00:00,  4.42it/s]


    Stroke Fold  Non-Stroke Fold  Accuracy  Precision (Weighted Avg)  Recall (Weighted Avg)  F1-Score (Weighted Avg)  Precision (Macro Avg)  Recall (Macro Avg)  F1-Score (Macro Avg)
0             1                1  0.740000                  0.744213               0.740000                 0.741807               0.709103            0.715000              0.711695
1             1                2  0.746667                  0.744281               0.746667                 0.745336               0.714461            0.710000              0.712063
2             1                3  0.686667                  0.695056               0.686667                 0.690110               0.653110            0.660000              0.655678
3             1                4  0.746667                  0.744281               0.746667                 0.745336               0.714461            0.710000              0.712063
4             1                5  0.773333                  0.768153               0.77333