In [18]:
import numpy as np
import pandas as pd

In [19]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        predictions = [self._predict(x) for x in X]
        return np.array(predictions)

    def _predict(self, x):
        # Compute distances between x and all training samples
        distances = [self.compute_distance(x, x_train) for x_train in self.X_train]
        # Get the k nearest samples
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        # Majority vote: most common label among k nearest neighbors
        most_common = self._majority_vote(k_nearest_labels)
        return most_common

    def compute_distance(self, X1, X2):
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2))
        else:
            raise ValueError("Unknown distance metric")

    def _majority_vote(self, neighbors):
        # Find the unique classes and their counts
        unique, counts = np.unique(neighbors, return_counts=True)
        # Return the class with the highest count
        return unique[np.argmax(counts)]

In [20]:
# Define data preprocessing function
from google.colab import drive
drive.mount('/content/drive')

def preprocess_data(train_path, test_path):
    train_data = pd.read_csv('/content/drive/My Drive/train.csv')[:10000]
    test_data = pd.read_csv('/content/drive/My Drive/test.csv')

    # Convert categorical variables to numerical (e.g., Geography, Gender)
    train_data['Geography'] = train_data['Geography'].map({'France': 0, 'Germany': 1, 'Spain': 2})
    test_data['Geography'] = test_data['Geography'].map({'France': 0, 'Germany': 1, 'Spain': 2})

    train_data['Gender'] = train_data['Gender'].map({'Male': 0, 'Female': 1})
    test_data['Gender'] = test_data['Gender'].map({'Male': 0, 'Female': 1})

    # Exclude non-relevant columns such as CustomerId and Surname
    X_train = train_data.drop(columns=['CustomerId', 'Surname', 'Exited'])
    y_train = train_data['Exited'].values
    X_test = test_data.drop(columns=['CustomerId', 'Surname'])

    # Scale numerical features using min-max scaling
    for col in X_train.columns:
        if X_train[col].dtype != 'object':
            min_val = X_train[col].min()
            max_val = X_train[col].max()
            X_train[col] = (X_train[col] - min_val) / (max_val - min_val)
            X_test[col] = (X_test[col] - min_val) / (max_val - min_val)

    return X_train.values, y_train, X_test.values

# Load and preprocess data
X, y, X_test = preprocess_data('train_data', 'test_data')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    n_samples = len(X)
    fold_size = n_samples // n_splits
    auc_scores = []

    for fold in range(n_splits):
        start = fold * fold_size
        end = start + fold_size if fold != n_splits - 1 else n_samples

        X_train_fold = np.concatenate((X[:start], X[end:]), axis=0)
        y_train_fold = np.concatenate((y[:start], y[end:]), axis=0)
        X_val_fold = X[start:end]
        y_val_fold = y[start:end]

        knn.fit(X_train_fold, y_train_fold)
        y_pred = knn.predict(X_val_fold)

        # Calculate ROC AUC manually
        auc_score = roc_auc(y_val_fold, y_pred)
        auc_scores.append(auc_score)

    return np.mean(auc_scores)

def roc_auc(y_true, y_pred):
    # Sort predictions and true values together
    sorted_indices = np.argsort(y_pred)
    sorted_y_true = y_true[sorted_indices]

    # Calculate true positive rate (TPR) and false positive rate (FPR)
    positives = np.sum(y_true == 1)
    negatives = np.sum(y_true == 0)

    tpr = np.cumsum(sorted_y_true == 1) / positives
    fpr = np.cumsum(sorted_y_true == 0) / negatives

    # Compute AUC using the trapezoidal rule
    return np.trapz(tpr, fpr)

In [23]:
# Load and preprocess data
X, y, X_test = preprocess_data('/content/drive/My Drive/train.csv'[:10000], '/content/drive/My Drive/test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# TODO: hyperparamters tuning
# Hyperparameter tuning function
def hyperparameter_tuning(X, y, k_values, distance_metrics):
    best_k = None
    best_distance_metric = None
    best_score = -1  # Initialize to a low value

    # Iterate over all possible combinations of k and distance metrics
    for k in k_values:
        for distance_metric in distance_metrics:
            knn = KNN(k=k, distance_metric=distance_metric)
            cv_score = cross_validate(X, y, knn)
            print(f'k={k}, distance_metric={distance_metric}, ROC AUC={cv_score}')

            if cv_score > best_score:
                best_score = cv_score
                best_k = k
                best_distance_metric = distance_metric

    return best_k, best_distance_metric, best_score

# Define the range of hyperparameters to test
k_values = [3, 5, 7, 9]  # Test different k values
distance_metrics = ['euclidean', 'manhattan']  # Test different distance metrics

# Perform hyperparameter tuning
best_k, best_distance_metric, best_score = hyperparameter_tuning(X, y, k_values, distance_metrics)
print(f'Best parameters: k={best_k}, distance_metric={best_distance_metric}, Best ROC AUC={best_score}')

# Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric=best_distance_metric)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('/content/drive/My Drive/test.csv')['id'], 'Exited': test_predictions}).to_csv('submission.csv', index=False)

# Download the submission file
from google.colab import files
files.download('submission.csv')

Cross-validation scores: 0.2742392498634296
k=3, distance_metric=euclidean, ROC AUC=0.27376134660152374
k=3, distance_metric=manhattan, ROC AUC=0.273118428430282
k=5, distance_metric=euclidean, ROC AUC=0.2742392498634296
k=5, distance_metric=manhattan, ROC AUC=0.2686750957337013
k=7, distance_metric=euclidean, ROC AUC=0.2824857789857056
k=7, distance_metric=manhattan, ROC AUC=0.2701292548331019
k=9, distance_metric=euclidean, ROC AUC=0.2824311735787534
k=9, distance_metric=manhattan, ROC AUC=0.27584742941011486
Best parameters: k=7, distance_metric=euclidean, Best ROC AUC=0.2824857789857056


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>