In [73]:
import numpy as np
import pandas as pd

In [83]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        y_preds = []
        X = np.array(X)
        for row in X:
            distances = self.compute_distance(row, self.X_train)
            inds = np.argsort(distances)[:self.k]
            k_nearest = self.y_train.iloc[inds].values
            churn = np.mean(k_nearest)
            y_preds.append(churn)

        return np.array(y_preds)

    def compute_distance(self, X1, X2):
        # euclidean
        if self.distance_metric == 'euclidean':
            return np.linalg.norm(X2 - X1, axis=1)

        # manhattan
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X2 - X1), axis=1)

In [78]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # extract only the relevant features
    X_raw = train_data.drop(columns=['id', 'CustomerId', 'Surname', 'Exited'])
    y = train_data['Exited']
    X_test_raw = test_data.drop(columns=['id', 'CustomerId', 'Surname'])

    # one hot encode geography and gender
    X_raw = pd.get_dummies(X_raw)
    X_test_raw = pd.get_dummies(X_test_raw)

    # realign
    X_test_raw = X_test_raw.reindex(columns=X_raw.columns, fill_value=0)

    # standardize using z-score normalization
    m = X_raw.mean(axis=0)
    sd = X_raw.std(axis=0)
    X_scaled = (X_raw - m) / sd
    X_test_scaled = (X_test_raw - m) / sd

    return X_scaled, y, X_test_scaled

In [76]:
# Define cross-validation
def cross_validate(X, y, knn, n_splits=5):
    unique_classes, class_counts = np.unique(y, return_counts=True)
    indices_per_class = [np.where(y == cls)[0] for cls in unique_classes]

    fold_indices = [[] for _ in range(n_splits)]

    for cls_indices in indices_per_class:
        np.random.shuffle(cls_indices)
        for i, index in enumerate(cls_indices):
            fold_indices[i % n_splits].append(index)

    scores = []
    for i in range(n_splits):
        val_indices = fold_indices[i]
        train_indices = np.hstack([fold_indices[j] for j in range(n_splits) if j != i])

        X_train, X_val = X.iloc[train_indices], X.iloc[val_indices]
        y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]

        knn.fit(X_train, y_train)
        y_val_pred = knn.predict(X_val)
        score = compute_roc_auc(y_val, y_val_pred)
        scores.append(score)

    return scores

def compute_roc_auc(y_true, y_pred):
    sorted_indices = np.argsort(y_pred)[::-1]
    sorted_y_true = y_true.iloc[sorted_indices]

    tpr = np.cumsum(sorted_y_true) / sorted_y_true.sum()
    fpr = np.cumsum(1 - sorted_y_true) / (len(sorted_y_true) - sorted_y_true.sum())

    auc = np.trapz(tpr, fpr)
    return auc

In [84]:
# Load and preprocess data
X, y, X_test = preprocess_data('/content/train.csv', '/content/test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
print('Testing default model...')
cv_scores = cross_validate(X, y, knn)
print(f'Default cross-validation scores: {np.mean(cv_scores)}\n')

# hyperparamters tuning
best_k = None
best_metric = None
best_score = 0
for k in [5, 15, 25, 35, 45, 55, 65, 75, 85, 95, 500]:
    for metric in ['euclidean', 'manhattan']:
      print(f'Evaluating k={k} and metric={metric}...')
      knn = KNN(k=k, distance_metric=metric)
      cv_score = np.mean(cross_validate(X, y, knn))
      print(f'Cross-validation score for k={k} and metric={metric}: {cv_score:.6f}\n')

      if cv_score > best_score:
          best_score = cv_score
          best_k = k
          best_metric = metric

print(f'Best k: {best_k} and metric: {best_metric} with score: {best_score}')

# train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=45, distance_metric='euclidean')
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
ans = pd.DataFrame({'id': pd.read_csv('/content/test.csv')['id'], 'Exited': test_predictions}).to_csv('/content/submissions.csv', index=False)

Testing default model...
Default cross-validation scores: 0.8555934286468776

Evaluating k=5 and metric=euclidean...
Cross-validation score for k=5 and metric=euclidean: 0.857772

Evaluating k=5 and metric=manhattan...
Cross-validation score for k=5 and metric=manhattan: 0.869665

Evaluating k=15 and metric=euclidean...
Cross-validation score for k=15 and metric=euclidean: 0.898402

Evaluating k=15 and metric=manhattan...
Cross-validation score for k=15 and metric=manhattan: 0.897487

Evaluating k=25 and metric=euclidean...
Cross-validation score for k=25 and metric=euclidean: 0.902697

Evaluating k=25 and metric=manhattan...
Cross-validation score for k=25 and metric=manhattan: 0.904907

Evaluating k=35 and metric=euclidean...
Cross-validation score for k=35 and metric=euclidean: 0.903270

Evaluating k=35 and metric=manhattan...
Cross-validation score for k=35 and metric=manhattan: 0.904102

Evaluating k=45 and metric=euclidean...
Cross-validation score for k=45 and metric=euclidean: 