In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, learning_curve, LeaveOneOut, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
# DATASET 1: HEART DISEASE

In [None]:
# Read the heart-disease file: it has no header row, and "?" entries are
# parsed as missing values.
df = pd.read_csv('processed.cleveland.data', header=None, na_values="?")

# Names for the 14 columns; the last one is the label.
columns = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal",
    "target",
]
df.columns = columns

# Remove incomplete rows, then binarise the label:
# any value above 0 becomes 1 (disease present), everything else 0 (absent).
df = df.dropna()
df = df.assign(target=(df["target"] > 0).astype("int64"))

# Split into label vector and feature table.
y = df["target"]
X = df.drop(columns="target")

print(df.head())

# Standardise the features. NOTE: X is rebound from a DataFrame to the
# scaled NumPy array, and the scaler is fitted on all rows.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Hyper-parameter search space for each model, keyed by display name.
param_grids = {}
param_grids["SVM"] = {
    "C": [0.1, 1, 10, 100],
    "gamma": [0.01, 0.1, 1, 'scale'],
    "kernel": ["rbf", "linear"],
}
param_grids["Random Forest"] = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
}
param_grids["KNN"] = {
    "n_neighbors": [3, 5, 7, 9, 11],
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan"],
}
param_grids["Logistic Regression"] = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l2"],
    "solver": ["lbfgs"],
}

# Base estimators handed to the grid search. The keys must match param_grids;
# insertion order is the order in which the models are tuned and reported.
classifiers = {}
classifiers["SVM"] = SVC(kernel='rbf', C=1, gamma='scale')
classifiers["Random Forest"] = RandomForestClassifier(n_estimators=100, random_state=42)
classifiers["KNN"] = KNeighborsClassifier(n_neighbors=5)
classifiers["Logistic Regression"] = LogisticRegression(max_iter=1000, random_state=42)

# Tuned estimator per model name, filled in by the loop below.
best_models = {}

# Grid search (5 folds, accuracy) for every model on the heart-disease data.
for clf_name, clf in classifiers.items():
    print(f"\n--- Tuning Hyper-Parameters for {clf_name} ---")
    grid_search = GridSearchCV(
        estimator=clf,
        param_grid=param_grids[clf_name],
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X, y)
    best_models[clf_name] = grid_search.best_estimator_
    print(f"Best Parameters for {clf_name}: {grid_search.best_params_}")
    print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")


In [None]:
# evaluating classifiers on different training and test sizes

def evaluate_with_partitions(X, y, models, n_repeats=3, random_state=None):
    """Fit and score every model on repeated random train/test partitions.

    Three partitions are evaluated, labelled "train%/test%": 20/80, 50/50 and
    80/20. For each partition the data is split ``n_repeats`` times
    (stratified on ``y``) and every model is fitted on the training part and
    scored for accuracy on both parts. One line is printed per fit.

    Parameters
    ----------
    X, y : feature matrix and label vector, passed to ``train_test_split``.
    models : mapping of display name -> scikit-learn estimator. Each estimator
        is cloned before fitting, so the objects in ``models`` are not
        modified.
    n_repeats : number of random splits drawn per partition.
    random_state : optional int. When given, repeat ``i`` uses seed
        ``random_state + i`` so the run is reproducible; ``None`` keeps the
        splits random.

    Returns
    -------
    list of dict
        One record per (partition, repeat, model) with keys ``split``,
        ``repeat``, ``model``, ``train_accuracy`` and ``test_accuracy``.
    """
    # Label is "train/test"; the value is the fraction passed as test_size,
    # so a 20/80 train/test partition needs test_size=0.8.
    splits = {
        "20/80": 0.8,
        "50/50": 0.5,
        "80/20": 0.2
    }

    records = []

    for split_name, test_size in splits.items():
        print(f"\n--- Train-Test Split: {split_name} ---")

        for repeat in range(n_repeats):
            seed = None if random_state is None else random_state + repeat
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, random_state=seed, stratify=y
            )

            for clf_name, clf in models.items():
                # Fit a fresh copy so the tuned estimator owned by the caller
                # keeps the state produced by the grid search.
                model = clone(clf)
                model.fit(X_train, y_train)

                train_acc = accuracy_score(y_train, model.predict(X_train))
                test_acc = accuracy_score(y_test, model.predict(X_test))

                print(f"{clf_name}: Train Accuracy = {train_acc:.4f}, Test Accuracy = {test_acc:.4f}")
                records.append({
                    "split": split_name,
                    "repeat": repeat,
                    "model": clf_name,
                    "train_accuracy": train_acc,
                    "test_accuracy": test_acc,
                })

    return records

# evaluate classifiers with tuned hyper-parameters
results = evaluate_with_partitions(X, y, best_models)

In [None]:
def plot_learning_curve(model, X, y, title):
    """Draw training and validation accuracy of ``model`` against training-set size.

    Accuracy is measured with 5-fold cross-validation at ten training-set
    fractions from 10% to 100%. Each curve is the mean over folds, with a
    shaded band of one standard deviation either side. ``title`` is appended
    to the figure title. The figure is shown; nothing is returned.
    """
    fractions = np.linspace(0.1, 1.0, 10)
    train_sizes, train_scores, val_scores = learning_curve(
        model, X, y, train_sizes=fractions, cv=5, scoring='accuracy', n_jobs=-1
    )

    # (mean, std, colour, legend label) for each curve; axis=1 averages over folds.
    curves = [
        (np.mean(train_scores, axis=1), np.std(train_scores, axis=1), "r", "Training Accuracy"),
        (np.mean(val_scores, axis=1), np.std(val_scores, axis=1), "g", "Validation Accuracy"),
    ]

    fig, ax = plt.subplots(figsize=(10, 6))
    for mean, _, colour, label in curves:
        ax.plot(train_sizes, mean, 'o-', color=colour, label=label)
    for mean, std, colour, _ in curves:
        ax.fill_between(train_sizes, mean - std, mean + std, alpha=0.1, color=colour)

    ax.set_title(f"Learning Curve for {title}")
    ax.set_xlabel("Training Size")
    ax.set_ylabel("Accuracy")
    ax.legend(loc="best")
    ax.grid()
    plt.show()

# one learning curve per tuned classifier (training and validation accuracy)
for clf_name, clf in best_models.items():
    plot_learning_curve(clf, X, y, title=clf_name)

In [None]:
def cross_validate_classifiers(X, y, classifiers, cv=5, dataset_name="Heart Disease"):
    """Cross-validate every classifier and report mean and spread of accuracy.

    Parameters
    ----------
    X, y : feature matrix and label vector, passed to ``cross_val_score``.
    classifiers : mapping of display name -> estimator.
    cv : cross-validation setting forwarded to ``cross_val_score``.
    dataset_name : name shown in the printed header.

    Returns
    -------
    dict
        Display name -> array of per-fold accuracy scores.
    """
    print(f"\n--- Cross-Validation Results for {dataset_name} Dataset ---")

    fold_scores = {}
    for clf_name, clf in classifiers.items():
        scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
        fold_scores[clf_name] = scores
        print(f"{clf_name}: Mean Accuracy = {scores.mean():.4f} (± {scores.std():.4f})")

    return fold_scores

# perform cross-validation on the tuned models (best_models), matching the
# lung-cancer and breast-cancer sections; the untuned `classifiers` dict was
# passed here before.
cv_scores = cross_validate_classifiers(X, y, best_models)

In [None]:
# DATASET 2: LUNG CANCER

In [None]:
# Read the lung-cancer file: no header row, "?" entries parsed as missing.
df = pd.read_csv('lung-cancer.data', header=None, na_values="?")

# The first column is used as the label; the other 56 get generic names.
columns = ['target', *(f'feature_{i}' for i in range(1, 57))]
df.columns = columns

# Keep only complete rows.
df = df.dropna()

# Split into label vector and feature table.
y = df['target']
X = df.drop(columns='target')

print(df.head())

# show the shapes of X and y
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Standardised copy of the features; X itself stays an unscaled DataFrame.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Largest k tried for KNN: one less than the number of rows, which is the
# training-set size under leave-one-out.
max_neighbors = len(X) - 1

# Hyper-parameter search space for each model, keyed by display name.
param_grids = {}
param_grids["SVM"] = {
    "C": [0.1, 1, 10, 100],
    "gamma": [0.01, 0.1, 1, 'scale'],
    "kernel": ["rbf", "linear"],
}
param_grids["Random Forest"] = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
}
param_grids["KNN"] = {
    "n_neighbors": list(range(1, max_neighbors + 1)),
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan"],
}
param_grids["Logistic Regression"] = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l2"],
    "solver": ["lbfgs"],
}

# Base estimators; keys must match param_grids, and insertion order is the
# order in which the models are tuned.
classifiers = {}
classifiers["SVM"] = SVC()
classifiers["Random Forest"] = RandomForestClassifier(random_state=42)
classifiers["KNN"] = KNeighborsClassifier()
classifiers["Logistic Regression"] = LogisticRegression(max_iter=1000, random_state=42)

# Leave-one-out is used because the dataset is small. SVM is the exception:
# it is tuned with shuffled, stratified 5-fold CV (author's note: it performs
# better that way).
loo = LeaveOneOut()
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

best_models = {}

for clf_name, clf in classifiers.items():
    print(f"\n--- Tuning Hyper-Parameters for {clf_name} ---")

    splitter = skf if clf_name == "SVM" else loo
    grid_search = GridSearchCV(
        estimator=clf,
        param_grid=param_grids[clf_name],
        scoring='accuracy',
        cv=splitter,
        n_jobs=-1,
        verbose=1,
    )

    grid_search.fit(X_scaled, y)
    best_models[clf_name] = grid_search.best_estimator_
    print(f"Best Parameters for {clf_name}: {grid_search.best_params_}")
    print(f"Best Accuracy: {grid_search.best_score_:.4f}")

In [None]:
# evaluating classifiers with different training and test sizes
def evaluate_with_partitions(X, y, models, n_repeats=3, random_state=None):
    """Fit and score every model on repeated random train/test partitions.

    Three partitions are evaluated, labelled "train%/test%": 20/80, 50/50 and
    80/20. For each partition the data is split ``n_repeats`` times
    (stratified on ``y``) and every model is fitted on the training part and
    scored for accuracy on both parts. One line is printed per fit.

    Parameters
    ----------
    X, y : feature matrix and label vector, passed to ``train_test_split``.
    models : mapping of display name -> scikit-learn estimator. Each estimator
        is cloned before use, so the objects in ``models`` (including their
        tuned hyper-parameters) are not modified.
    n_repeats : number of random splits drawn per partition.
    random_state : optional int. When given, repeat ``i`` uses seed
        ``random_state + i`` so the run is reproducible; ``None`` keeps the
        splits random.

    Returns
    -------
    list of dict
        One record per (partition, repeat, model) with keys ``split``,
        ``repeat``, ``model``, ``train_accuracy`` and ``test_accuracy``.
    """
    # Label is "train/test"; the value is the fraction passed as test_size,
    # so a 20/80 train/test partition needs test_size=0.8.
    splits = {
        "20/80": 0.8,
        "50/50": 0.5,
        "80/20": 0.2
    }

    records = []

    for split_name, test_size in splits.items():
        print(f"\n--- Train-Test Split: {split_name} ---")

        for repeat in range(n_repeats):
            seed = None if random_state is None else random_state + repeat
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, random_state=seed, stratify=y
            )
            n_train = len(X_train)

            for clf_name, clf in models.items():
                # Work on a copy: changing n_neighbors on the shared estimator
                # would overwrite the tuned value used by later cells.
                model = clone(clf)

                # A neighbours-based model cannot ask for more neighbours than
                # there are training rows; keep the tuned k unless the
                # training split is too small for it.
                tuned_k = model.get_params().get("n_neighbors")
                if tuned_k is not None and tuned_k > n_train:
                    model.set_params(n_neighbors=n_train)

                # Train the classifier
                model.fit(X_train, y_train)

                # Predict and calculate accuracy
                train_acc = accuracy_score(y_train, model.predict(X_train))
                test_acc = accuracy_score(y_test, model.predict(X_test))

                print(f"{clf_name}: Train Accuracy = {train_acc:.4f}, Test Accuracy = {test_acc:.4f}")
                records.append({
                    "split": split_name,
                    "repeat": repeat,
                    "model": clf_name,
                    "train_accuracy": train_acc,
                    "test_accuracy": test_acc,
                })

    return records

# assigned so the returned records are not echoed as cell output
partition_results = evaluate_with_partitions(X_scaled, y, best_models)

In [None]:
def plot_learning_curve(model, X, y, title):
    """Draw training and validation accuracy of ``model`` against training-set size.

    Accuracy is measured with the module-level leave-one-out splitter ``loo``
    at ten training-set fractions from 10% to 100%. Each curve is the mean
    over splits, with a shaded band of one standard deviation either side.
    ``title`` is appended to the figure title. The figure is shown; nothing
    is returned.
    """
    fractions = np.linspace(0.1, 1.0, 10)
    train_sizes, train_scores, val_scores = learning_curve(
        model, X, y, train_sizes=fractions, cv=loo, scoring='accuracy', n_jobs=-1
    )

    # (mean, std, colour, legend label) for each curve; axis=1 averages over splits.
    curves = [
        (np.mean(train_scores, axis=1), np.std(train_scores, axis=1), "r", "Training Accuracy"),
        (np.mean(val_scores, axis=1), np.std(val_scores, axis=1), "g", "Validation Accuracy"),
    ]

    fig, ax = plt.subplots(figsize=(10, 6))
    for mean, _, colour, label in curves:
        ax.plot(train_sizes, mean, 'o-', color=colour, label=label)
    for mean, std, colour, _ in curves:
        ax.fill_between(train_sizes, mean - std, mean + std, alpha=0.1, color=colour)

    ax.set_title(f"Learning Curve for {title}")
    ax.set_xlabel("Training Size")
    ax.set_ylabel("Accuracy")
    ax.legend(loc="best")
    ax.grid()
    plt.show()

# one learning curve per tuned classifier
for clf_name, clf in best_models.items():
    plot_learning_curve(clf, X_scaled, y, title=clf_name)

In [None]:
def cross_validate_classifiers(X, y, classifiers, cv=5):
    """Print mean and standard deviation of cross-validated accuracy per classifier.

    ``classifiers`` maps a display name to an estimator; ``cv`` is forwarded
    to ``cross_val_score``. Results are only printed, nothing is returned.
    """
    print("\n--- Cross-Validation Results for Lung Cancer Dataset ---")

    for clf_name in classifiers:
        estimator = classifiers[clf_name]
        scores = cross_val_score(estimator, X, y, scoring='accuracy', cv=cv)
        print(f"{clf_name}: Mean Accuracy = {scores.mean():.4f} (\u00b1 {scores.std():.4f})")

# cross-validate the tuned models on the scaled lung-cancer features
cross_validate_classifiers(X_scaled, y, classifiers=best_models)

In [None]:
# DATASET 3: BREAST CANCER

In [None]:
# Column layout: ID, Diagnosis, then the same ten measurements repeated for
# three statistics (Mean, StdErr, Worst), in that order.
base_measurements = (
    'Radius', 'Texture', 'Perimeter', 'Area', 'Smoothness',
    'Compactness', 'Concavity', 'Concave_Points', 'Symmetry', 'Fractal_Dimension',
)
column_names = ['ID', 'Diagnosis'] + [
    f'{measurement}_{statistic}'
    for statistic in ('Mean', 'StdErr', 'Worst')
    for measurement in base_measurements
]

df = pd.read_csv('wdbc.data', names=column_names)

print(df.head())

# Drop the ID column and convert 'Diagnosis' to binary
# (1 = Malignant 'M', 0 = anything else, i.e. Benign).
df = df.drop(columns='ID')
df = df.assign(Diagnosis=(df['Diagnosis'] == 'M').astype('int64'))

# Split into label vector and feature table.
y = df['Diagnosis']
X = df.drop(columns='Diagnosis')

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Standardised copy of the features; X itself stays an unscaled DataFrame.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Hyper-parameter search space for each model, keyed by display name.
param_grids = {}
param_grids["SVM"] = {
    "C": [0.1, 1, 10, 100],
    "gamma": [0.01, 0.1, 1, 'scale'],
    "kernel": ["rbf", "linear"],
}
param_grids["Random Forest"] = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
}
param_grids["KNN"] = {
    "n_neighbors": [3, 5, 7, 9, 11],
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan"],
}
param_grids["Logistic Regression"] = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l2"],
    "solver": ["lbfgs"],
}

# Base estimators to be tuned; keys must match param_grids.
classifiers = {}
classifiers["SVM"] = SVC()
classifiers["Random Forest"] = RandomForestClassifier(random_state=42)
classifiers["KNN"] = KNeighborsClassifier()
classifiers["Logistic Regression"] = LogisticRegression(max_iter=1000, random_state=42)

In [None]:
# Hyper-parameter tuning with shuffled, stratified 5-fold cross-validation
# (fixed seed so the folds are the same on every run).
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Tuned estimator per model name.
best_models = {}

for clf_name, clf in classifiers.items():
    print(f"\n--- Tuning Hyper-Parameters for {clf_name} ---")
    grid_search = GridSearchCV(
        estimator=clf,
        param_grid=param_grids[clf_name],
        scoring='accuracy',
        cv=skf,
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_scaled, y)
    best_models[clf_name] = grid_search.best_estimator_
    print(f"Best Parameters for {clf_name}: {grid_search.best_params_}")
    print(f"Best 5-Fold CV Accuracy: {grid_search.best_score_:.4f}")

In [None]:
def evaluate_with_partitions(X, y, models, n_repeats=3, random_state=None):
    """Fit and score every model on repeated random train/test partitions.

    Three partitions are evaluated, labelled "train%/test%": 20/80, 50/50 and
    80/20. For each partition the data is split ``n_repeats`` times
    (stratified on ``y``) and every model is fitted on the training part and
    scored for accuracy on both parts. One line is printed per fit.

    Parameters
    ----------
    X, y : feature matrix and label vector, passed to ``train_test_split``.
    models : mapping of display name -> scikit-learn estimator. Each estimator
        is cloned before fitting, so the objects in ``models`` are not
        modified.
    n_repeats : number of random splits drawn per partition.
    random_state : optional int. When given, repeat ``i`` uses seed
        ``random_state + i`` so the run is reproducible; ``None`` keeps the
        splits random.

    Returns
    -------
    list of dict
        One record per (partition, repeat, model) with keys ``split``,
        ``repeat``, ``model``, ``train_accuracy`` and ``test_accuracy``.
    """
    # Label is "train/test"; the value is the fraction passed as test_size,
    # so a 20/80 train/test partition needs test_size=0.8.
    splits = {
        "20/80": 0.8,
        "50/50": 0.5,
        "80/20": 0.2
    }

    records = []

    for split_name, test_size in splits.items():
        print(f"\n--- Train-Test Split: {split_name} ---")

        for repeat in range(n_repeats):
            seed = None if random_state is None else random_state + repeat
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, random_state=seed, stratify=y
            )

            for clf_name, clf in models.items():
                # Fit a fresh copy so the tuned estimator owned by the caller
                # keeps the state produced by the grid search.
                model = clone(clf)
                model.fit(X_train, y_train)

                # predict and calculate accuracy
                train_acc = accuracy_score(y_train, model.predict(X_train))
                test_acc = accuracy_score(y_test, model.predict(X_test))

                print(f"{clf_name}: Train Accuracy = {train_acc:.4f}, Test Accuracy = {test_acc:.4f}")
                records.append({
                    "split": split_name,
                    "repeat": repeat,
                    "model": clf_name,
                    "train_accuracy": train_acc,
                    "test_accuracy": test_acc,
                })

    return records

# evaluate classifiers with tuned hyper-parameters
# (assigned so the returned records are not echoed as cell output)
partition_results = evaluate_with_partitions(X_scaled, y, best_models)

In [None]:
def plot_learning_curve(model, X, y, title):
    """Draw training and validation accuracy of ``model`` against training-set size.

    Accuracy is measured with 5-fold cross-validation at ten training-set
    fractions from 10% to 100%. Each curve is the mean over folds, with a
    shaded band of one standard deviation either side. ``title`` is appended
    to the figure title. The figure is shown; nothing is returned.
    """
    fractions = np.linspace(0.1, 1.0, 10)
    train_sizes, train_scores, val_scores = learning_curve(
        model, X, y, train_sizes=fractions, cv=5, scoring='accuracy', n_jobs=-1
    )

    # (mean, std, colour, legend label) for each curve; axis=1 averages over folds.
    curves = [
        (np.mean(train_scores, axis=1), np.std(train_scores, axis=1), "r", "Training Accuracy"),
        (np.mean(val_scores, axis=1), np.std(val_scores, axis=1), "g", "Validation Accuracy"),
    ]

    fig, ax = plt.subplots(figsize=(10, 6))
    for mean, _, colour, label in curves:
        ax.plot(train_sizes, mean, 'o-', color=colour, label=label)
    for mean, std, colour, _ in curves:
        ax.fill_between(train_sizes, mean - std, mean + std, alpha=0.1, color=colour)

    ax.set_title(f"Learning Curve for {title}")
    ax.set_xlabel("Training Size")
    ax.set_ylabel("Accuracy")
    ax.legend(loc="best")
    ax.grid()
    plt.show()

# one learning curve per tuned classifier
for clf_name, clf in best_models.items():
    plot_learning_curve(clf, X_scaled, y, title=clf_name)


In [None]:
def cross_validate_classifiers(X, y, classifiers, cv=5):
    """Print mean and standard deviation of cross-validated accuracy per classifier.

    ``classifiers`` maps a display name to an estimator; ``cv`` is forwarded
    to ``cross_val_score``. Results are only printed, nothing is returned.
    """
    print("\n--- Cross-Validation Results for Breast Cancer Dataset ---")

    for clf_name in classifiers:
        estimator = classifiers[clf_name]
        scores = cross_val_score(estimator, X, y, scoring='accuracy', cv=cv)
        print(f"{clf_name}: Mean Accuracy = {scores.mean():.4f} (\u00b1 {scores.std():.4f})")

# cross-validate the tuned models on the scaled breast-cancer features
cross_validate_classifiers(X_scaled, y, classifiers=best_models)