In [105]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

## Turkey RF

In [121]:
# Function to load and preprocess data
def load_and_preprocess(data_path, features_path=None):
    data = pd.read_csv(data_path)
    if features_path:
        selected_features = pd.read_csv(features_path)["Name"].astype(str).tolist()
        X = data[selected_features].apply(pd.to_numeric, errors='coerce').dropna(axis=1)
    else:
        X = data.drop(columns=['class']).apply(pd.to_numeric, errors='coerce')
    y = data["class"]
    return X, y

# Function to perform k-fold cross-validation
def k_fold_train_evaluate_rf(X, y, n_splits=5):
    # Encode target labels
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Hyperparameter grid
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }

    # Stratified K-Fold cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_accuracies = []

    for fold, (train_idx, test_idx) in enumerate(skf.split(X_scaled, y_encoded), 1):
        # Split into training and test sets for this fold
        X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
        y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]

        # Perform GridSearchCV for hyperparameter tuning
        grid_search = GridSearchCV(
            estimator=RandomForestClassifier(random_state=42),
            param_grid=param_grid,
            scoring='accuracy',
            cv=5,
            n_jobs=-1,
            verbose=0
        )
        grid_search.fit(X_train, y_train)

        # Best model from GridSearchCV
        best_rf_model = grid_search.best_estimator_
        y_pred = best_rf_model.predict(X_test)

        # Calculate accuracy for this fold
        fold_accuracy = accuracy_score(y_test, y_pred)
        fold_accuracies.append(fold_accuracy)

        # Print results for this fold
        print(f"\nFold {fold} Results:")
        print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
        print("Confusion Matrix:")
        print(confusion_matrix(y_test, y_pred))
        print(f"Fold {fold} Accuracy: {fold_accuracy:.2f}")

    # Overall results
    print("\nK-Fold Cross-Validation Results:")
    print(f"Mean Accuracy: {np.mean(fold_accuracies):.2f}")
    print(f"Standard Deviation of Accuracy: {np.std(fold_accuracies):.2f}")

# Baseline Dataset
print("\nBaseline Dataset Results:")
X, y = load_and_preprocess("baseline_turkey.csv")
k_fold_train_evaluate_rf(X, y)

# Best Wrapper Dataset
print("\nBest Wrapper Dataset Results:")
X, y = load_and_preprocess("baseline_turkey.csv", "best_first_wrapper_turkey.csv")
k_fold_train_evaluate_rf(X, y)

# Correlation-Based Dataset
print("\nCorrelation-Based Dataset Results:")
X, y = load_and_preprocess("baseline_turkey.csv", "best_first_cor_turkey.csv")
k_fold_train_evaluate_rf(X, y)


Baseline Dataset Results:

Fold 1 Results:
              precision    recall  f1-score   support

   alzheimer       0.33      0.33      0.33         9
     control       0.57      0.73      0.64        11
         mci       1.00      0.40      0.57         5

    accuracy                           0.52        25
   macro avg       0.63      0.49      0.51        25
weighted avg       0.57      0.52      0.52        25

Confusion Matrix:
[[3 6 0]
 [3 8 0]
 [3 0 2]]
Fold 1 Accuracy: 0.52

Fold 2 Results:
              precision    recall  f1-score   support

   alzheimer       0.29      0.20      0.24        10
     control       0.29      0.40      0.33        10
         mci       0.00      0.00      0.00         5

    accuracy                           0.24        25
   macro avg       0.19      0.20      0.19        25
weighted avg       0.23      0.24      0.23        25

Confusion Matrix:
[[2 6 2]
 [4 4 2]
 [1 4 0]]
Fold 2 Accuracy: 0.24

Fold 3 Results:
              precision 

## Hangzhou RF

In [130]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Function to load and preprocess data
def load_and_preprocess(data_path, features_path=None):
    data = pd.read_csv(data_path)
    if features_path:
        selected_features = pd.read_csv(features_path)["Name"].astype(str).tolist()
        X = data[selected_features].apply(pd.to_numeric, errors='coerce').dropna(axis=1)
    else:
        X = data.drop(columns=['class']).apply(pd.to_numeric, errors='coerce')
    y = data["class"]
    return X, y

# Function to perform k-fold cross-validation
def k_fold_train_evaluate_rf(X, y, n_splits=5):
    # Encode target labels
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Hyperparameter grid
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }

    # Stratified K-Fold cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_accuracies = []

    for fold, (train_idx, test_idx) in enumerate(skf.split(X_scaled, y_encoded), 1):
        # Split into training and test sets for this fold
        X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
        y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]

        # Perform GridSearchCV for hyperparameter tuning
        grid_search = GridSearchCV(
            estimator=RandomForestClassifier(random_state=42),
            param_grid=param_grid,
            scoring='accuracy',
            cv=5,
            n_jobs=-1,
            verbose=0
        )
        grid_search.fit(X_train, y_train)

        # Best model from GridSearchCV
        best_rf_model = grid_search.best_estimator_
        y_pred = best_rf_model.predict(X_test)

        # Calculate accuracy for this fold
        fold_accuracy = accuracy_score(y_test, y_pred)
        fold_accuracies.append(fold_accuracy)

        # Print results for this fold
        print(f"\nFold {fold} Results:")
        print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
        print("Confusion Matrix:")
        print(confusion_matrix(y_test, y_pred))
        print(f"Fold {fold} Accuracy: {fold_accuracy:.2f}")

    # Overall results
    print("\nK-Fold Cross-Validation Results:")
    print(f"Mean Accuracy: {np.mean(fold_accuracies):.2f}")
    print(f"Standard Deviation of Accuracy: {np.std(fold_accuracies):.2f}")

# Baseline Dataset
print("\nBaseline Dataset Results:")
X, y = load_and_preprocess("baseline_china.csv")
k_fold_train_evaluate_rf(X, y)

# Best Wrapper Dataset
print("\nBest Wrapper Dataset Results:")
X, y = load_and_preprocess("baseline_china.csv", "updated_best_first_wrapper_china.csv")
k_fold_train_evaluate_rf(X, y)

# Correlation-Based Dataset
print("\nCorrelation-Based Dataset Results:")
X, y = load_and_preprocess("baseline_china.csv", "updated_best_first_cor_china.csv")
k_fold_train_evaluate_rf(X, y)


Baseline Dataset Results:

Fold 1 Results:
              precision    recall  f1-score   support

   alzheimer       0.36      0.67      0.47         6
     control       0.00      0.00      0.00         6
         mci       0.43      0.43      0.43         7

    accuracy                           0.37        19
   macro avg       0.26      0.37      0.30        19
weighted avg       0.27      0.37      0.31        19

Confusion Matrix:
[[4 0 2]
 [4 0 2]
 [3 1 3]]
Fold 1 Accuracy: 0.37

Fold 2 Results:
              precision    recall  f1-score   support

   alzheimer       0.36      0.83      0.50         6
     control       1.00      0.17      0.29         6
         mci       0.75      0.43      0.55         7

    accuracy                           0.47        19
   macro avg       0.70      0.48      0.44        19
weighted avg       0.70      0.47      0.45        19

Confusion Matrix:
[[5 0 1]
 [5 1 0]
 [4 0 3]]
Fold 2 Accuracy: 0.47

Fold 3 Results:
              precision 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Fold 5 Results:
              precision    recall  f1-score   support

   alzheimer       0.50      0.86      0.63         7
     control       0.50      0.40      0.44         5
         mci       0.50      0.17      0.25         6

    accuracy                           0.50        18
   macro avg       0.50      0.47      0.44        18
weighted avg       0.50      0.50      0.45        18

Confusion Matrix:
[[6 1 0]
 [2 2 1]
 [4 1 1]]
Fold 5 Accuracy: 0.50

K-Fold Cross-Validation Results:
Mean Accuracy: 0.42
Standard Deviation of Accuracy: 0.09

Best Wrapper Dataset Results:

Fold 1 Results:
              precision    recall  f1-score   support

   alzheimer       0.62      0.83      0.71         6
     control       0.60      0.50      0.55         6
         mci       0.67      0.57      0.62         7

    accuracy                           0.63        19
   macro avg       0.63      0.63      0.63        19
weighted avg       0.63      0.63      0.62        19

Confusion Matr

## Turkey - SVM, KNN, NB, LR

In [131]:
# load the datasets
baseline_data = pd.read_csv("baseline_turkey.csv")
cor_based_data = pd.read_csv("best_first_cor_turkey.csv")
cfs_data = pd.read_csv("best_first_wrapper_turkey.csv")

# prepare the baseline dataset
X_baseline = baseline_data.drop(columns=['class']) 
y_baseline = baseline_data['class']  

# prepare the correlation-based dataset
cor_features = cor_based_data['Name'].astype(str).tolist()  
cor_features = [col for col in cor_features if col in X_baseline.columns]  
X_cor = baseline_data[cor_features]  
y_cor = y_baseline  

# prepare the wrapper dataset
cfs_features = cfs_data['Name'].astype(str).tolist()  
cfs_features = [col for col in cfs_features if col in X_baseline.columns]  
X_cfs = baseline_data[cfs_features] 
y_cfs = y_baseline  

# convert all features to numeric
X_baseline = X_baseline.apply(pd.to_numeric, errors='coerce')  
X_cor = X_cor.apply(pd.to_numeric, errors='coerce')  
X_cfs = X_cfs.apply(pd.to_numeric, errors='coerce')  

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")  # Replace NaNs with column mean
X_baseline = pd.DataFrame(imputer.fit_transform(X_baseline), columns=X_baseline.columns)
X_cor = pd.DataFrame(imputer.fit_transform(X_cor), columns=X_cor.columns)
X_cfs = pd.DataFrame(imputer.fit_transform(X_cfs), columns=X_cfs.columns)

# align target labels
y_baseline = y_baseline.loc[X_baseline.index]
y_cor = y_cor.loc[X_cor.index]
y_cfs = y_cfs.loc[X_cfs.index]

# encode target labels
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
datasets = {
    "Baseline Dataset": (X_baseline, label_encoder.fit_transform(y_baseline)),
    "Correlation-Based Dataset": (X_cor, label_encoder.fit_transform(y_cor)),
    "Wrapper Dataset": (X_cfs, label_encoder.fit_transform(y_cfs)),
}


models = {
    "SVM": SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000)
}

# train and evaluate each model using 5-fold CV
results = {}

for dataset_name, (X, y) in datasets.items():
    print(f"\nTraining models on {dataset_name} with 5-fold cross-validation...\n")
    
    if len(X) == 0 or len(y) == 0:
        print(f"Skipping {dataset_name}: No valid samples after preprocessing.\n")
        continue

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    for model_name, model in models.items():
        try:
            accuracies = []
            
            # perform 5-fold cross-validation
            for train_index, test_index in skf.split(X, y):
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y[train_index], y[test_index]
                
                model.fit(X_train, y_train)
                
                y_pred = model.predict(X_test)
                
                accuracy = accuracy_score(y_test, y_pred)
                accuracies.append(accuracy)
            
            mean_accuracy = sum(accuracies) / len(accuracies)
            
            if dataset_name not in results:
                results[dataset_name] = {}
            results[dataset_name][model_name] = mean_accuracy
            
            print(f"Model: {model_name}")
            print(f"Mean Accuracy (5-fold CV): {mean_accuracy:.2f}")
        except Exception as e:
            print(f"Error training {model_name} on {dataset_name}: {e}")

# summary of results
print("\nSummary of Model Performance for Each Dataset (5-fold CV):")
for dataset_name, model_accuracies in results.items():
    print(f"\n{dataset_name}:")
    for model_name, accuracy in model_accuracies.items():
        print(f"{model_name}: {accuracy:.2f}")


Training models on Baseline Dataset with 5-fold cross-validation...

Model: SVM
Mean Accuracy (5-fold CV): 0.45
Model: KNN
Mean Accuracy (5-fold CV): 0.47
Model: Naive Bayes
Mean Accuracy (5-fold CV): 0.49
Model: Logistic Regression
Mean Accuracy (5-fold CV): 0.41

Training models on Correlation-Based Dataset with 5-fold cross-validation...

Model: SVM
Mean Accuracy (5-fold CV): 0.54
Model: KNN
Mean Accuracy (5-fold CV): 0.50
Model: Naive Bayes
Mean Accuracy (5-fold CV): 0.78
Model: Logistic Regression
Mean Accuracy (5-fold CV): 0.63

Training models on Wrapper Dataset with 5-fold cross-validation...

Model: SVM
Mean Accuracy (5-fold CV): 0.45
Model: KNN
Mean Accuracy (5-fold CV): 0.46
Model: Naive Bayes
Mean Accuracy (5-fold CV): 0.56
Model: Logistic Regression
Mean Accuracy (5-fold CV): 0.48

Summary of Model Performance for Each Dataset (5-fold CV):

Baseline Dataset:
SVM: 0.45
KNN: 0.47
Naive Bayes: 0.49
Logistic Regression: 0.41

Correlation-Based Dataset:
SVM: 0.54
KNN: 0.50
Nai

In [88]:
# Step 1: Load the datasets
baseline_data = pd.read_csv("baseline_turkey.csv")
cor_based_data = pd.read_csv("best_first_cor_turkey.csv")
cfs_data = pd.read_csv("best_first_wrapper_turkey.csv")

# Step 2: Prepare the Baseline Dataset
X_baseline = baseline_data.drop(columns=['class'])  # Features
y_baseline = baseline_data['class']  # Target labels

# Step 3: Prepare the Correlation-Based Dataset
cor_features = cor_based_data['Name'].astype(str).tolist()  # Feature names
cor_features = [col for col in cor_features if col in X_baseline.columns]  # Align features
X_cor = baseline_data[cor_features]  # Features
y_cor = y_baseline  # Target labels

# Step 4: Prepare the CFS Dataset
cfs_features = cfs_data['Name'].astype(str).tolist()  # Feature names
cfs_features = [col for col in cfs_features if col in X_baseline.columns]  # Align features
X_cfs = baseline_data[cfs_features]  # Features
y_cfs = y_baseline  # Target labels

# Step 5: Convert all features to numeric
X_baseline = X_baseline.apply(pd.to_numeric, errors='coerce')  # Convert to numeric
X_cor = X_cor.apply(pd.to_numeric, errors='coerce')  # Convert to numeric
X_cfs = X_cfs.apply(pd.to_numeric, errors='coerce')  # Convert to numeric

# Step 6: Handle missing values (impute instead of drop)
imputer = SimpleImputer(strategy="mean")  # Replace NaNs with column mean
X_baseline = pd.DataFrame(imputer.fit_transform(X_baseline), columns=X_baseline.columns)
X_cor = pd.DataFrame(imputer.fit_transform(X_cor), columns=X_cor.columns)
X_cfs = pd.DataFrame(imputer.fit_transform(X_cfs), columns=X_cfs.columns)

# Align target labels
y_baseline = y_baseline.loc[X_baseline.index]
y_cor = y_cor.loc[X_cor.index]
y_cfs = y_cfs.loc[X_cfs.index]

# Step 7: Encode target labels
label_encoder = LabelEncoder()
datasets = {
    "Baseline Dataset": (X_baseline, label_encoder.fit_transform(y_baseline)),
    "Correlation-Based Dataset": (X_cor, label_encoder.fit_transform(y_cor)),
    "CFS Dataset": (X_cfs, label_encoder.fit_transform(y_cfs)),
}

# Step 8: Initialize algorithms
models = {
    "SVM": SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000)
}

# Step 9: Train and evaluate each model on all datasets
results = {}

for dataset_name, (X, y) in datasets.items():
    print(f"\nTraining models on {dataset_name}...\n")
    
    if len(X) == 0 or len(y) == 0:
        print(f"Skipping {dataset_name}: No valid samples after preprocessing.\n")
        continue
    
    # Split the data into training and testing subsets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
    
    for model_name, model in models.items():
        try:
            # Train the model
            model.fit(X_train, y_train)
            
            # Make predictions
            y_pred = model.predict(X_test)
            
            # Evaluate the model
            accuracy = accuracy_score(y_test, y_pred)
            
            # Store the results
            if dataset_name not in results:
                results[dataset_name] = {}
            results[dataset_name][model_name] = accuracy
            
            # Generate target names dynamically
            target_names = [str(cls) for cls in sorted(set(y_train))]
            
            print(f"Model: {model_name}")
            print(f"Accuracy: {accuracy:.2f}")
            print("Classification Report:")
            print(classification_report(y_test, y_pred, target_names=target_names))
        except Exception as e:
            print(f"Error training {model_name} on {dataset_name}: {e}")

# Step 10: Display summary of results
print("\nSummary of Model Performance for Each Dataset:")
for dataset_name, model_accuracies in results.items():
    print(f"\n{dataset_name}:")
    for model_name, accuracy in model_accuracies.items():
        print(f"{model_name}: {accuracy:.2f}")


Training models on Baseline Dataset...

Model: SVM
Accuracy: 0.37
Classification Report:
              precision    recall  f1-score   support

           0       0.38      0.43      0.40        14
           1       0.38      0.50      0.43        16
           2       0.00      0.00      0.00         8

    accuracy                           0.37        38
   macro avg       0.25      0.31      0.28        38
weighted avg       0.30      0.37      0.33        38

Model: KNN
Accuracy: 0.34
Classification Report:
              precision    recall  f1-score   support

           0       0.33      0.43      0.38        14
           1       0.40      0.25      0.31        16
           2       0.30      0.38      0.33         8

    accuracy                           0.34        38
   macro avg       0.34      0.35      0.34        38
weighted avg       0.35      0.34      0.34        38

Model: Naive Bayes
Accuracy: 0.53
Classification Report:
              precision    recall  f1-scor

### China - SVM, KNN, NB, LR

In [132]:
# load the datasets
baseline_data = pd.read_csv("baseline_china.csv")
cor_based_data = pd.read_csv("updated_best_first_cor_china.csv")
cfs_data = pd.read_csv("updated_best_first_wrapper_china.csv")

# prepare the baseline dataset
X_baseline = baseline_data.drop(columns=['class']) 
y_baseline = baseline_data['class']  

# prepare the correlation-based dataset
cor_features = cor_based_data['Name'].astype(str).tolist()  
cor_features = [col for col in cor_features if col in X_baseline.columns]  
X_cor = baseline_data[cor_features]  
y_cor = y_baseline  

# prepare the wrapper dataset
cfs_features = cfs_data['Name'].astype(str).tolist()  
cfs_features = [col for col in cfs_features if col in X_baseline.columns]  
X_cfs = baseline_data[cfs_features] 
y_cfs = y_baseline  

# convert all features to numeric
X_baseline = X_baseline.apply(pd.to_numeric, errors='coerce')  
X_cor = X_cor.apply(pd.to_numeric, errors='coerce')  
X_cfs = X_cfs.apply(pd.to_numeric, errors='coerce')  

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")  # Replace NaNs with column mean
X_baseline = pd.DataFrame(imputer.fit_transform(X_baseline), columns=X_baseline.columns)
X_cor = pd.DataFrame(imputer.fit_transform(X_cor), columns=X_cor.columns)
X_cfs = pd.DataFrame(imputer.fit_transform(X_cfs), columns=X_cfs.columns)

# align target labels
y_baseline = y_baseline.loc[X_baseline.index]
y_cor = y_cor.loc[X_cor.index]
y_cfs = y_cfs.loc[X_cfs.index]

# encode target labels
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
datasets = {
    "Baseline Dataset": (X_baseline, label_encoder.fit_transform(y_baseline)),
    "Correlation-Based Dataset": (X_cor, label_encoder.fit_transform(y_cor)),
    "Wrapper Dataset": (X_cfs, label_encoder.fit_transform(y_cfs)),
}


models = {
    "SVM": SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000)
}

# train and evaluate each model using 5-fold CV
results = {}

for dataset_name, (X, y) in datasets.items():
    print(f"\nTraining models on {dataset_name} with 5-fold cross-validation...\n")
    
    if len(X) == 0 or len(y) == 0:
        print(f"Skipping {dataset_name}: No valid samples after preprocessing.\n")
        continue

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    for model_name, model in models.items():
        try:
            accuracies = []
            
            # perform 5-fold cross-validation
            for train_index, test_index in skf.split(X, y):
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y[train_index], y[test_index]
                
                model.fit(X_train, y_train)
                
                y_pred = model.predict(X_test)
                
                accuracy = accuracy_score(y_test, y_pred)
                accuracies.append(accuracy)
            
            mean_accuracy = sum(accuracies) / len(accuracies)
            
            if dataset_name not in results:
                results[dataset_name] = {}
            results[dataset_name][model_name] = mean_accuracy
            
            print(f"Model: {model_name}")
            print(f"Mean Accuracy (5-fold CV): {mean_accuracy:.2f}")
        except Exception as e:
            print(f"Error training {model_name} on {dataset_name}: {e}")

# summary of results
print("\nSummary of Model Performance for Each Dataset (5-fold CV):")
for dataset_name, model_accuracies in results.items():
    print(f"\n{dataset_name}:")
    for model_name, accuracy in model_accuracies.items():
        print(f"{model_name}: {accuracy:.2f}")


Training models on Baseline Dataset with 5-fold cross-validation...

Model: SVM
Mean Accuracy (5-fold CV): 0.27
Model: KNN
Mean Accuracy (5-fold CV): 0.27
Model: Naive Bayes
Mean Accuracy (5-fold CV): 0.29
Model: Logistic Regression
Mean Accuracy (5-fold CV): 0.31

Training models on Correlation-Based Dataset with 5-fold cross-validation...

Model: SVM
Mean Accuracy (5-fold CV): 0.60
Model: KNN
Mean Accuracy (5-fold CV): 0.51
Model: Naive Bayes
Mean Accuracy (5-fold CV): 0.61
Model: Logistic Regression
Mean Accuracy (5-fold CV): 0.57

Training models on Wrapper Dataset with 5-fold cross-validation...

Model: SVM
Mean Accuracy (5-fold CV): 0.57
Model: KNN
Mean Accuracy (5-fold CV): 0.61
Model: Naive Bayes
Mean Accuracy (5-fold CV): 0.63
Model: Logistic Regression
Mean Accuracy (5-fold CV): 0.62

Summary of Model Performance for Each Dataset (5-fold CV):

Baseline Dataset:
SVM: 0.27
KNN: 0.27
Naive Bayes: 0.29
Logistic Regression: 0.31

Correlation-Based Dataset:
SVM: 0.60
KNN: 0.51
Nai

In [101]:
# Step 1: Prepare the models with hyperparameter tuning
tuned_models = {
    "SVM": GridSearchCV(
        SVC(probability=True, random_state=42),
        param_grid={'C': [0.1, 1, 10], 'gamma': ['scale', 'auto'], 'kernel': ['linear', 'rbf']},
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
        verbose=1,
    ),
    "KNN": GridSearchCV(
        KNeighborsClassifier(),
        param_grid={'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']},
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
        verbose=1,
    ),
    "Naive Bayes": GaussianNB(),  # No hyperparameters to tune
    "Logistic Regression": GridSearchCV(
        LogisticRegression(max_iter=1000),
        param_grid={'C': [0.1, 1, 10]},
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
        verbose=1,
    ),
}

# Step 2: Train and evaluate models on datasets
improved_results = {}

for dataset_name, (X, y) in datasets.items():
    print(f"\nTraining models on {dataset_name}...\n")
    
    if len(X) == 0 or len(y) == 0:
        print(f"Skipping {dataset_name}: No valid samples after preprocessing.\n")
        continue
    
    # Address class imbalance
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    
    # Standardize features
    scaler = StandardScaler()
    X_resampled = scaler.fit_transform(X_resampled)
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42, stratify=y_resampled)
    
    for model_name, model in tuned_models.items():
        try:
            print(f"\nTuning {model_name}...")
            
            # Train the model (with hyperparameter tuning if applicable)
            model.fit(X_train, y_train)
            best_model = model.best_estimator_ if isinstance(model, GridSearchCV) else model
            
            # Evaluate the model
            y_pred = best_model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            
            # Store the results
            if dataset_name not in improved_results:
                improved_results[dataset_name] = {}
            improved_results[dataset_name][model_name] = accuracy
            
            print(f"\n{dataset_name} - {model_name} Tuned Accuracy: {accuracy:.2f}")
            print("Classification Report:")
            print(classification_report(y_test, y_pred))
        except Exception as e:
            print(f"Error tuning {model_name} on {dataset_name}: {e}")

# Step 3: Display summary of improved results
print("\nSummary of Improved Model Performance for Each Dataset:")
for dataset_name, model_accuracies in improved_results.items():
    print(f"\n{dataset_name}:")
    for model_name, accuracy in model_accuracies.items():
        print(f"{model_name}: {accuracy:.2f}")


Training models on Baseline Dataset...


Tuning SVM...
Fitting 5 folds for each of 12 candidates, totalling 60 fits

Baseline Dataset - SVM Tuned Accuracy: 0.47
Classification Report:
              precision    recall  f1-score   support

           0       0.38      1.00      0.56        10
           1       1.00      0.40      0.57        10
           2       0.00      0.00      0.00        10

    accuracy                           0.47        30
   macro avg       0.46      0.47      0.38        30
weighted avg       0.46      0.47      0.38        30


Tuning KNN...
Fitting 5 folds for each of 6 candidates, totalling 30 fits

Baseline Dataset - KNN Tuned Accuracy: 0.33
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.33      1.00      0.50        10
           2       0.00      0.00      0.00        10

    accuracy                           0.33        30
   macro avg       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Baseline Dataset - Logistic Regression Tuned Accuracy: 0.47
Classification Report:
              precision    recall  f1-score   support

           0       0.36      0.50      0.42        10
           1       0.67      0.60      0.63        10
           2       0.43      0.30      0.35        10

    accuracy                           0.47        30
   macro avg       0.48      0.47      0.47        30
weighted avg       0.48      0.47      0.47        30


Training models on Correlation-Based Dataset...


Tuning SVM...
Fitting 5 folds for each of 12 candidates, totalling 60 fits

Correlation-Based Dataset - SVM Tuned Accuracy: 0.43
Classification Report:
              precision    recall  f1-score   support

           0       0.30      0.30      0.30        10
           1       0.62      0.50      0.56        10
           2       0.42      0.50      0.45        10

    accuracy                           0.43        30
   macro avg       0.45      0.43      0.44        30
weight