In [1]:
import numpy as np
import pandas as pd

In [2]:
class KNN:
    def __init__(self, k=3, distance_metric='euclidean', batch_size=100):
        self.k = k
        self.distance_metric = distance_metric
        self.batch_size = batch_size

    def fit(self, X, y):
        print(f"Training KNN with k={self.k} on training data...")
        self.X_train = X
        self.y_train = y
        print(f"Training completed.")

    def predict(self, X):
        print(f"Making predictions for {len(X)} test points...")
        predictions = []
        num_test = X.shape[0]

        for i in range(0, num_test, self.batch_size):
            X_batch = X[i:i + self.batch_size]
            distances = self.compute_distance(X_batch)

            for dist in distances:
                k_indices = np.argsort(dist)[:self.k]
                k_nearest_labels = self.y_train[k_indices].astype(int)
                majority_vote = np.argmax(np.bincount(k_nearest_labels))
                predictions.append(majority_vote)

        print(f"Predictions completed.")
        return predictions

    def compute_distance(self, X_test):
        if self.distance_metric == 'euclidean':
            distances = np.sqrt(np.sum((self.X_train - X_test[:, np.newaxis]) ** 2, axis=2))
            return distances
        else:
            raise ValueError("Unsupported distance metric")

In [3]:
def preprocess_data(train_path, test_path):
    print("Loading and preprocessing data...")
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    train_data = train_data.drop(['CustomerId', 'Surname'], axis=1)
    test_data = test_data.drop(['CustomerId', 'Surname'], axis=1)

    train_data['Geography'] = train_data['Geography'].astype('category').cat.codes
    train_data['Gender'] = train_data['Gender'].astype('category').cat.codes
    test_data['Geography'] = test_data['Geography'].astype('category').cat.codes
    test_data['Gender'] = test_data['Gender'].astype('category').cat.codes

    X_train = train_data.drop('Exited', axis=1).values
    y_train = train_data['Exited'].values
    X_test = test_data.values

    mean = np.mean(X_train, axis=0)
    std = np.std(X_train, axis=0)

    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std

    print("Data preprocessing completed.")
    return X_train, y_train, X_test

In [4]:
def cross_validate(X, y, knn, n_splits=5):
    print("Starting cross-validation...")
    n = len(X)
    fold_size = n // n_splits
    indices = np.arange(n)
    np.random.shuffle(indices)

    accuracy_scores = []

    for fold in range(n_splits):
        print(f"Processing fold {fold + 1} of {n_splits}...")
        val_start = fold * fold_size
        val_end = val_start + fold_size if fold != n_splits - 1 else n
        val_indices = indices[val_start:val_end]
        train_indices = np.concatenate([indices[:val_start], indices[val_end:]])

        X_train, y_train = X[train_indices], y[train_indices]
        X_val, y_val = X[val_indices], y[val_indices]

        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_val)

        accuracy = np.mean(y_pred == y_val)
        accuracy_scores.append(accuracy)

        print(f"Fold {fold + 1} accuracy: {accuracy:.4f}")

    print("Cross-validation completed.")
    return np.mean(accuracy_scores)

In [5]:
X, y, X_test = preprocess_data('/content/train.csv', '/content/test.csv')

knn = KNN(k=5, distance_metric='euclidean')

cv_scores = cross_validate(X, y, knn)
print("Cross-validation scores:", cv_scores)

best_k = 5
best_score = 0
for k in range(1, 20):
    print(f"Evaluating k={k}...")
    knn = KNN(k=k, distance_metric='euclidean')
    cv_score = cross_validate(X, y, knn)
    print(f"k = {k}, Cross-validation accuracy = {cv_score:.4f}")

    if cv_score > best_score:
        best_score = cv_score
        best_k = k

print(f"Best k found: {best_k} with cross-validation accuracy: {best_score:.4f}")

print(f"Training final model with k={best_k} on full dataset...")
knn = KNN(k=best_k, distance_metric='euclidean')
knn.fit(X, y)
test_predictions = knn.predict(X_test)

pd.DataFrame({'id': pd.read_csv('/content/test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)
print("Predictions saved successfully.")

Loading data...
Loading and preprocessing data...
Data preprocessing completed.
Starting cross-validation with k=5...
Starting cross-validation...
Processing fold 1 of 5...
Training KNN with k=5 on training data...
Training completed.
Making predictions for 3000 test points...
Predictions completed.
Fold 1 accuracy: 0.8810
Processing fold 2 of 5...
Training KNN with k=5 on training data...
Training completed.
Making predictions for 3000 test points...
Predictions completed.
Fold 2 accuracy: 0.8680
Processing fold 3 of 5...
Training KNN with k=5 on training data...
Training completed.
Making predictions for 3000 test points...
Predictions completed.
Fold 3 accuracy: 0.8677
Processing fold 4 of 5...
Training KNN with k=5 on training data...
Training completed.
Making predictions for 3000 test points...
Predictions completed.
Fold 4 accuracy: 0.8590
Processing fold 5 of 5...
Training KNN with k=5 on training data...
Training completed.
Making predictions for 3000 test points...
Prediction