In [13]:
import numpy as np
import pandas as pd

# Data Preprocessing
def preprocess_data(train_path, test_path):
    # Read in the data
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # Combine train and test data for consistent preprocessing
    train['is_train'] = 1
    test['is_train'] = 0
    data = pd.concat([train, test], sort=False)

    # Drop unnecessary columns
    data = data.drop(columns=['Surname', 'CustomerId', 'id'])

    # Handle missing values for numerical features
    numerical_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
    data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].median())

    # Handle missing values for categorical features
    categorical_cols = ['Geography', 'Gender']
    for col in categorical_cols:
        data[col] = data[col].fillna(data[col].mode()[0])

    # One-hot encode categorical features using pd.get_dummies
    data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

    # Scale numerical features using z-score normalization
    for col in numerical_cols:
        mean = data[col].mean()
        std = data[col].std()
        data[col] = (data[col] - mean) / std

    # Split back into train and test sets
    train_data = data[data['is_train'] == 1].drop(columns=['is_train'])
    test_data = data[data['is_train'] == 0].drop(columns=['is_train', 'Exited'])

    # Separate features and target
    X_train = train_data.drop(columns=['Exited']).values
    y_train = train_data['Exited'].values
    X_test = test_data.values
    test_ids = test['id'].values

    return X_train, y_train, X_test, test_ids

# KNN Class Definition
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def compute_distance(self, x):
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(self.X_train - x), axis=1)
        else:
            raise ValueError(f"Unsupported distance metric: {self.distance_metric}")

    def predict_proba(self, X):
        probabilities = []
        for x in X:
            distances = self.compute_distance(x)
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            prob_1 = np.sum(k_nearest_labels) / self.k
            probabilities.append([1 - prob_1, prob_1])
        return np.array(probabilities)

    def predict(self, X):
        proba = self.predict_proba(X)
        return (proba[:, 1] >= 0.5).astype(int)

# Stratified K-Fold Cross-Validation
def stratified_k_fold_split(X, y, n_splits=5, random_state=42):
    np.random.seed(random_state)
    y = np.array(y)
    classes, y_indices = np.unique(y, return_inverse=True)
    n_classes = len(classes)
    class_counts = np.bincount(y_indices)
    folds = [[] for _ in range(n_splits)]
    for cls in range(n_classes):
        idxs = np.where(y_indices == cls)[0]
        np.random.shuffle(idxs)
        fold_sizes = (len(idxs) // n_splits) * np.ones(n_splits, dtype=int)
        fold_sizes[:len(idxs) % n_splits] += 1
        current = 0
        for fold_idx, fold_size in enumerate(fold_sizes):
            start, stop = current, current + fold_size
            folds[fold_idx].extend(idxs[start:stop])
            current = stop
    splits = []
    for fold_idx in range(n_splits):
        test_indices = np.array(folds[fold_idx])
        train_indices = np.array([i for i in range(len(X)) if i not in test_indices])
        splits.append((train_indices, test_indices))
    return splits

# ROC AUC Score Implementation
def roc_auc_score_manual(y_true, y_scores):
    # Ensure arrays are numpy arrays
    y_true = np.array(y_true)
    y_scores = np.array(y_scores)

    # Sort scores and corresponding true labels in descending order
    desc_order = np.argsort(-y_scores)
    y_true = y_true[desc_order]
    y_scores = y_scores[desc_order]

    # Compute true positive rates (tpr) and false positive rates (fpr)
    tps = np.cumsum(y_true)
    fps = np.cumsum(1 - y_true)
    tpr = tps / tps[-1]
    fpr = fps / fps[-1]

    # Add (0,0) point
    tpr = np.concatenate([[0], tpr])
    fpr = np.concatenate([[0], fpr])

    # Compute AUC using the trapezoidal rule
    auc = np.trapz(tpr, fpr)
    return auc

# Cross-Validation Function
def cross_validate(X, y, knn, n_splits=5):
    splits = stratified_k_fold_split(X, y, n_splits)
    auc_scores = []
    for fold_idx, (train_idx, val_idx) in enumerate(splits):
        X_train_cv, X_val_cv = X[train_idx], X[val_idx]
        y_train_cv, y_val_cv = y[train_idx], y[val_idx]
        knn.fit(X_train_cv, y_train_cv)
        y_val_proba = knn.predict_proba(X_val_cv)[:, 1]
        auc = roc_auc_score_manual(y_val_cv, y_val_proba)
        auc_scores.append(auc)
        print(f"Fold {fold_idx + 1} AUC: {auc:.4f}")
    mean_auc = np.mean(auc_scores)
    print(f"\nMean AUC: {mean_auc:.4f}")
    return mean_auc

# Load and preprocess data
X, y, X_test, test_ids = preprocess_data('train.csv', 'test.csv')

# Ensure data is numpy array
X = np.array(X, dtype=float)
y = np.array(y, dtype=int)
X_test = np.array(X_test, dtype=float)

# Hyperparameter Tuning
k_values = [24, 25, 26, 27, 28]
distance_metrics = ['euclidean', 'manhattan']
best_auc = 0
best_k = None
best_metric = None

for k in k_values:
    for metric in distance_metrics:
        print(f"\nEvaluating k={k}, metric={metric}")
        knn = KNN(k=k, distance_metric=metric)
        mean_auc = cross_validate(X, y, knn, n_splits=5)
        if mean_auc > best_auc:
            best_auc = mean_auc
            best_k = k
            best_metric = metric

print(f"\nBest hyperparameters: k={best_k}, metric={best_metric}, AUC={best_auc:.4f}")

# Train final model on full training data
knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X, y)

# Predict on test data
test_proba = knn.predict_proba(X_test)[:, 1]

# Prepare submission
submission = pd.DataFrame({
    'id': test_ids,
    'Exited': test_proba  # Use probabilities for better granularity
})
submission.to_csv('submissions.csv', index=False)
print("Submission file 'submissions.csv' created.")



Evaluating k=24, metric=euclidean
Fold 1 AUC: 0.9151
Fold 2 AUC: 0.9085
Fold 3 AUC: 0.9202
Fold 4 AUC: 0.9254
Fold 5 AUC: 0.9240

Mean AUC: 0.9186

Evaluating k=24, metric=manhattan
Fold 1 AUC: 0.9131
Fold 2 AUC: 0.9088
Fold 3 AUC: 0.9172
Fold 4 AUC: 0.9260
Fold 5 AUC: 0.9186

Mean AUC: 0.9167

Evaluating k=25, metric=euclidean
Fold 1 AUC: 0.9152
Fold 2 AUC: 0.9089
Fold 3 AUC: 0.9203
Fold 4 AUC: 0.9262
Fold 5 AUC: 0.9232

Mean AUC: 0.9188

Evaluating k=25, metric=manhattan
Fold 1 AUC: 0.9128
Fold 2 AUC: 0.9074
Fold 3 AUC: 0.9165
Fold 4 AUC: 0.9269
Fold 5 AUC: 0.9185

Mean AUC: 0.9164

Evaluating k=26, metric=euclidean
Fold 1 AUC: 0.9145
Fold 2 AUC: 0.9085
Fold 3 AUC: 0.9203
Fold 4 AUC: 0.9265
Fold 5 AUC: 0.9238

Mean AUC: 0.9187

Evaluating k=26, metric=manhattan
Fold 1 AUC: 0.9115
Fold 2 AUC: 0.9085
Fold 3 AUC: 0.9177
Fold 4 AUC: 0.9258
Fold 5 AUC: 0.9195

Mean AUC: 0.9166

Evaluating k=27, metric=euclidean
Fold 1 AUC: 0.9154
Fold 2 AUC: 0.9091
Fold 3 AUC: 0.9213
Fold 4 AUC: 0.9258
F