In [1]:
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

## implementing K-Fold from scratch


In [5]:
def create_folds(X, y, k):
    """
    Create k folds for cross-validation
    
    Parameters:
    X: feature matrix
    y: target vector
    k: number of folds
    
    Returns:
    list of tuples (train_idx, val_idx) for each fold
    """
    n_samples = len(X)
    
    indices = np.arange(n_samples)
    np.random.shuffle(indices)
    fold_size = n_samples // k
    

    folds = []
    for i in range(k):

        start_idx = i * fold_size
        end_idx = start_idx + fold_size if i < k-1 else n_samples
        
        val_indices = indices[start_idx:end_idx]
        
        train_indices = np.concatenate([
            indices[:start_idx],
            indices[end_idx:]
        ])
        
        folds.append((train_indices, val_indices))
    
    return folds

In [6]:
def k_fold_cross_validation(model, X, y, k):
    """
    Implement k-fold cross-validation from scratch
    
    Parameters:
    model: sklearn model object
    X: feature matrix
    y: target vector
    k: number of folds
    
    Returns:
    tuple of (mean_accuracy, mean_f1, accuracies, f1_scores)
    """
    X = np.array(X)
    y = np.array(y)
    
    folds = create_folds(X, y, k)
    
    accuracies = []
    f1_scores = []
    
    # Perform k-fold cross-validation
    for fold_idx, (train_idx, val_idx) in enumerate(folds, 1):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        

        model.fit(X_train, y_train)
        

        y_pred = model.predict(X_val)
        

        accuracy = accuracy_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)
        
        accuracies.append(accuracy)
        f1_scores.append(f1)
        
        print(f"Fold {fold_idx}: Accuracy = {accuracy:.4f}, F1-Score = {f1:.4f}")
    
    mean_accuracy = np.mean(accuracies)
    mean_f1 = np.mean(f1_scores)
    
    return mean_accuracy, mean_f1, accuracies, f1_scores

## comparison with sklearn result


In [7]:
def compare_with_sklearn(model, X, y, k):
    """
    Compare custom implementation with sklearn's cross_val_score
    
    Parameters:
    model: sklearn model object
    X: feature matrix
    y: target vector
    k: number of folds
    
    Returns:
    tuple of (sklearn_accuracy, sklearn_f1)
    """
    # Calculate accuracy using sklearn
    sklearn_accuracy = cross_val_score(
        model, X, y, cv=k, scoring='accuracy'
    ).mean()
    
    # Calculate F1-score using sklearn
    sklearn_f1 = cross_val_score(
        model, X, y, cv=k, scoring='f1'
    ).mean()
    
    return sklearn_accuracy, sklearn_f1

## train and test


In [2]:
df = pd.read_csv('../dataset/full_processed_heart_disease_cleveland.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    float64
 11  ca        303 non-null    float64
 12  thal      303 non-null    float64
 13  target    303 non-null    int64  
dtypes: float64(13), int64(1)
memory usage: 33.3 KB


In [4]:
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [9]:
X = df.drop(['target'],axis=1)
y = df['target']

In [11]:
knn = KNeighborsClassifier(n_neighbors=5)
    
# Set number of folds
k = 5
    
print("Custom K-Fold Cross-Validation Results:")
print("-" * 40)
    
# Perform custom k-fold cross-validation
mean_acc, mean_f1, accuracies, f1_scores = k_fold_cross_validation(knn, X, y, k)

Custom K-Fold Cross-Validation Results:
----------------------------------------
Fold 1: Accuracy = 0.6333, F1-Score = 0.5926
Fold 2: Accuracy = 0.7000, F1-Score = 0.7097
Fold 3: Accuracy = 0.6833, F1-Score = 0.6545
Fold 4: Accuracy = 0.6500, F1-Score = 0.5532
Fold 5: Accuracy = 0.6667, F1-Score = 0.5714


In [13]:
# Compare with sklearn implementation
print("\nComparison with Sklearn Implementation:")
print("-" * 40)
sklearn_acc, sklearn_f1 = compare_with_sklearn(knn, X, y, k)
print(f"Sklearn Mean Accuracy: {sklearn_acc:.4f}")
print(f"Sklearn Mean F1-Score: {sklearn_f1:.4f}")


Comparison with Sklearn Implementation:
----------------------------------------
Sklearn Mean Accuracy: 0.6571
Sklearn Mean F1-Score: 0.6020


In [14]:
# Calculate differences
acc_diff = abs(mean_acc - sklearn_acc)
f1_diff = abs(mean_f1 - sklearn_f1)
print(f"\nDifference in Accuracy: {acc_diff:.4f}")
print(f"Difference in F1-Score: {f1_diff:.4f}")


Difference in Accuracy: 0.0096
Difference in F1-Score: 0.0142
