In [37]:
import numpy as np
import pandas as pd

In [38]:
# Define the KNN class
class KNN:
    """Brute-force k-nearest-neighbours classifier operating on NumPy arrays.

    Distances from every query row to every training row are computed with
    broadcasting, one batch of query rows at a time, so peak memory is roughly
    ``batch_size * n_train * n_features`` floats.

    Parameters
    ----------
    k : int
        Number of neighbours that vote. Capped at the training-set size at
        prediction time.
    distance_metric : {'euclidean', 'manhattan'}
        Metric used to rank neighbours. Checked when distances are computed.

    Attributes set by ``fit``
    -------------------------
    X_train : ndarray of shape (n_train, n_features)
    y_train : ndarray of shape (n_train,), integer labels
    classes_ : ndarray, sorted distinct labels; column ``j`` of the probability
        matrix refers to ``classes_[j]``.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        # Accept NumPy integers too (e.g. a k taken from np.arange).
        if not isinstance(k, (int, np.integer)) or k <= 0:
            raise ValueError("k must be a positive integer")
        self.k = int(k)
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """Store the training data.

        Parameters
        ----------
        X : array-like of shape (n_train, n_features)
        y : array-like of shape (n_train,), values castable to int

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from ``y``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y).astype(int).ravel()
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if X.shape[0] == 0:
            raise ValueError("Cannot fit on an empty training set")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        self.X_train = X
        self.y_train = y
        # Encode labels as 0..n_classes-1 so vote counting works even when the
        # raw labels are not contiguous (e.g. {0, 2}).
        self.classes_, self._y_encoded = np.unique(y, return_inverse=True)

    def predict(self, X, batch_size=1000):
        """Return the majority label among the k nearest neighbours of each row.

        Ties go to the smallest label (first maximum of ``argmax``).
        """
        proba = self._predict_proba(X, batch_size)
        return self.classes_[proba.argmax(axis=1)]

    def predict_proba(self, X, batch_size=1000):
        """Return an (n_samples, n_classes) array of neighbour vote fractions."""
        return self._predict_proba(X, batch_size)

    def _pairwise_distances(self, batch):
        """Return the (len(batch), n_train) distance matrix for one batch."""
        diff = batch[:, np.newaxis, :] - self.X_train[np.newaxis, :, :]
        if self.distance_metric == 'euclidean':
            return np.sqrt((diff ** 2).sum(axis=2))
        if self.distance_metric == 'manhattan':
            return np.abs(diff).sum(axis=2)
        raise ValueError("Invalid distance metric")

    def _predict_proba(self, X, batch_size=1000):
        X = np.asarray(X, dtype=float)
        if X.ndim != 2 or X.shape[1] != self.X_train.shape[1]:
            raise ValueError("X must be 2-D with the same number of features as the training data")
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        n_samples = X.shape[0]
        n_train = self.X_train.shape[0]
        n_classes = len(self.classes_)
        y_pred_proba = np.zeros((n_samples, n_classes))

        k = min(self.k, n_train)
        # argpartition requires kth < n_train. Using kth == k keeps the usual
        # selection, and falling back to n_train - 1 covers k == n_train, where
        # every training row is a neighbour.
        kth = min(k, n_train - 1)

        for start in range(0, n_samples, batch_size):
            batch = X[start:start + batch_size]
            n_batch = batch.shape[0]
            distances = self._pairwise_distances(batch)

            k_indices = np.argpartition(distances, kth, axis=1)[:, :k]
            neighbour_codes = self._y_encoded[k_indices]

            # Count votes for the whole batch at once: shift row r's codes by
            # r * n_classes so a single bincount yields a flat
            # (n_batch * n_classes) histogram.
            offsets = np.arange(n_batch)[:, np.newaxis] * n_classes
            votes = np.bincount((neighbour_codes + offsets).ravel(),
                                minlength=n_batch * n_classes)
            y_pred_proba[start:start + n_batch] = votes.reshape(n_batch, n_classes) / k

        return y_pred_proba

In [39]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    """Load the train/test CSV files and turn them into scaled feature frames.

    Steps:
      1. Read both CSVs. Training rows with any missing value are dropped.
         Test rows are never dropped, so the returned test frame stays
         row-aligned with the test file.
      2. Encode 'Geography' and 'Gender' as integer codes using the categories
         seen in the training data for both frames, so a given string maps to
         the same code in train and test. Test values that are missing or were
         never seen in training get code -1.
      3. Split off the 'Exited' target and drop the 'id' / 'Surname' columns.
      4. Fill remaining numeric gaps in the test features with the training
         medians, then min-max scale both frames via ``feature_scaling``.

    Parameters
    ----------
    train_path, test_path : path or buffer accepted by ``pd.read_csv``

    Returns
    -------
    (X_train, y_train, X_test)
        Scaled training features, integer training target, scaled test features.
    """
    train_data = pd.read_csv(train_path).dropna()
    test_data = pd.read_csv(test_path)

    # Encode categorical variables with one shared vocabulary (from train).
    categorical_columns = ['Geography', 'Gender']
    for column in categorical_columns:
        categories = pd.Categorical(train_data[column]).categories
        train_data[column] = pd.Categorical(train_data[column], categories=categories).codes
        test_data[column] = pd.Categorical(test_data[column], categories=categories).codes

    # Separate features and target
    X_train = train_data.drop(['Exited', 'id', 'Surname'], axis=1)
    y_train = train_data['Exited'].astype(int)
    X_test = test_data.drop(['id', 'Surname'], axis=1)

    # Impute instead of dropping: every test row needs a prediction.
    X_test = X_test.fillna(X_train.median(numeric_only=True))

    # Apply feature scaling
    X_train, X_test = feature_scaling(X_train, X_test)

    return X_train, y_train, X_test

In [40]:
def feature_scaling(X_train, X_test):
    """Min-max scale both inputs using statistics from ``X_train`` only.

    Each training column is mapped onto [0, 1]. The same shift and scale are
    applied to ``X_test``, so test values outside the training range fall
    outside [0, 1]. A column that is constant in ``X_train`` has zero range;
    its divisor is replaced by 1 so the column becomes 0 instead of NaN/inf.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame with matching columns

    Returns
    -------
    (X_train_scaled, X_test_scaled)
    """
    min_vals = X_train.min()
    max_vals = X_train.max()
    span = max_vals - min_vals

    # Guard against 0/0 for constant features.
    if np.ndim(span) == 0:
        # ndarray input: .min()/.max() reduce to a single scalar
        span = span if span != 0 else 1
    else:
        span = span.where(span != 0, 1)

    X_train_scaled = (X_train - min_vals) / span
    X_test_scaled = (X_test - min_vals) / span
    return X_train_scaled, X_test_scaled

In [41]:
def train_test_split(X, y, test_size=0.2, random_state=None):
    """Randomly split ``X`` and ``y`` into train and test portions.

    Parameters
    ----------
    X, y : pandas objects or array-likes of equal length
        pandas inputs are indexed positionally with ``.iloc``; anything else is
        converted with ``np.asarray``.
    test_size : float in [0, 1]
        Fraction of rows placed in the test portion (rounded down).
    random_state : int or None
        Seed for the shuffle. A private ``RandomState`` is used, so the global
        NumPy RNG is left untouched while producing the same permutation that
        ``np.random.seed(random_state)`` would. With ``None`` the global RNG is
        used.

    Returns
    -------
    (X_train, X_test, y_train, y_test)

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``test_size`` is outside [0, 1].
    """
    n = len(X)
    if len(y) != n:
        raise ValueError("X and y must have the same number of rows")
    if not 0 <= test_size <= 1:
        raise ValueError("test_size must be between 0 and 1")

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    n_test = int(n * test_size)
    indices = rng.permutation(n)
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]

    def _take(data, idx):
        # Positional selection for both pandas objects and plain arrays.
        return data.iloc[idx] if hasattr(data, 'iloc') else np.asarray(data)[idx]

    X_train, X_test = _take(X, train_indices), _take(X, test_indices)
    y_train, y_test = _take(y, train_indices), _take(y, test_indices)

    return X_train, X_test, y_train, y_test

In [42]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5, random_state=42):
    """Estimate accuracy with shuffled k-fold cross-validation.

    Rows are shuffled once and cut into ``n_splits`` consecutive folds of
    ``len(X) // n_splits`` rows; the last fold also takes the remainder. For
    every fold the model is refitted on the other folds and scored on the
    held-out one. ``knn`` is left fitted on the final fold's training rows.

    Parameters
    ----------
    X, y : pandas objects or array-likes of equal length
    knn : object with ``fit(X, y)`` and ``predict(X)``
    n_splits : int, between 2 and ``len(X)``
    random_state : int or None
        Seed for the shuffle (default 42). A private ``RandomState`` is used so
        the global NumPy RNG is not reseeded. ``None`` gives a different
        shuffle on each call.

    Returns
    -------
    (mean_accuracy, std_accuracy) across folds

    Raises
    ------
    ValueError
        On mismatched lengths or an ``n_splits`` outside [2, len(X)].
    """
    X_arr = np.asarray(X)
    y_arr = np.asarray(y)
    n = len(X_arr)

    if len(y_arr) != n:
        raise ValueError("X and y must have the same number of rows")
    if not isinstance(n_splits, (int, np.integer)) or n_splits < 2 or n_splits > n:
        raise ValueError("n_splits must be an integer between 2 and the number of rows")

    indices = np.random.RandomState(random_state).permutation(n)
    fold_size = n // n_splits
    scores = []

    for i in range(n_splits):
        start = i * fold_size
        # The final fold absorbs the rows left over by integer division.
        end = (i + 1) * fold_size if i < n_splits - 1 else n
        test_indices = indices[start:end]
        train_indices = np.concatenate([indices[:start], indices[end:]])

        knn.fit(X_arr[train_indices], y_arr[train_indices])
        y_pred = knn.predict(X_arr[test_indices])
        scores.append(np.mean(y_pred == y_arr[test_indices]))

    return np.mean(scores), np.std(scores)

In [43]:
def evaluate_knn_with_different_k(X_train, y_train, X_val, y_val, k_values, distance_metrics):
    """Grid-search ``k`` and the distance metric on a held-out validation set.

    A fresh ``KNN`` is fitted on the training data for every (metric, k) pair
    and its validation accuracy is printed. The first pair reaching the highest
    accuracy wins.

    Parameters
    ----------
    X_train, y_train : training features / labels passed to ``KNN.fit``
    X_val, y_val : validation features / labels
    k_values : iterable of int
    distance_metrics : iterable of str

    Returns
    -------
    (best_k, best_metric)

    Raises
    ------
    ValueError
        If either ``k_values`` or ``distance_metrics`` is empty.
    """
    k_values = list(k_values)
    distance_metrics = list(distance_metrics)
    if not k_values or not distance_metrics:
        raise ValueError("k_values and distance_metrics must both be non-empty")

    # Start below any reachable accuracy so the first candidate is always
    # recorded, even when it scores 0.0. Otherwise the unusable (0, '') could
    # be returned.
    best_accuracy = -1.0
    best_k = None
    best_metric = None

    for metric in distance_metrics:
        for k in k_values:
            knn = KNN(k=k, distance_metric=metric)
            knn.fit(X_train, y_train)
            val_predictions = knn.predict(X_val)
            accuracy = np.mean(val_predictions == y_val)
            print(f"Validation Accuracy for k={k}, metric={metric}: {accuracy:.4f}")

            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_k = k
                best_metric = metric

    print(f"\nBest k: {best_k}, Best metric: {best_metric} with accuracy: {best_accuracy:.4f}")
    return best_k, best_metric

In [44]:
# Load and preprocess data: returns scaled training features, the integer
# 'Exited' target, and scaled test features.
X_train, y_train, X_test = preprocess_data('train.csv', 'test.csv')

# Split the training data into train and validation sets (20% held out, seeded
# for reproducibility). X_train / y_train are rebound to the training portion,
# so from here on they no longer contain the validation rows.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [45]:
# Convert to numpy arrays for faster computation.
# np.asarray is used rather than `.values` so the cell can be re-run safely:
# once these names hold ndarrays, `.values` would raise AttributeError, whereas
# np.asarray returns an existing ndarray unchanged.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)
X_val = np.asarray(X_val)
y_val = np.asarray(y_val)
X_test = np.asarray(X_test)

In [46]:
# Hyperparameter search: score every (distance metric, k) combination on the
# validation split; the per-combination accuracies are printed by the helper.
k_values = [12, 13, 14, 15, 16, 17]
distance_metrics = ['euclidean', 'manhattan']
best_k, best_metric = evaluate_knn_with_different_k(X_train, y_train, X_val, y_val, k_values, distance_metrics)

# Train with best parameters (on the training portion only at this point)
knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X_train, y_train)

Validation Accuracy for k=12, metric=euclidean: 0.8637
Validation Accuracy for k=13, metric=euclidean: 0.8677
Validation Accuracy for k=14, metric=euclidean: 0.8647
Validation Accuracy for k=15, metric=euclidean: 0.8637
Validation Accuracy for k=16, metric=euclidean: 0.8630
Validation Accuracy for k=17, metric=euclidean: 0.8637
Validation Accuracy for k=12, metric=manhattan: 0.8630
Validation Accuracy for k=13, metric=manhattan: 0.8643
Validation Accuracy for k=14, metric=manhattan: 0.8660
Validation Accuracy for k=15, metric=manhattan: 0.8647
Validation Accuracy for k=16, metric=manhattan: 0.8607
Validation Accuracy for k=17, metric=manhattan: 0.8643

Best k: 13, Best metric: euclidean with accuracy: 0.8677


In [47]:
# Perform cross-validation on the training portion. cross_validate refits
# `knn` on each fold, so the model is refitted below before it is used.
cv_score, cv_std = cross_validate(pd.DataFrame(X_train), pd.Series(y_train), knn)
print(f"Cross-validation score: {cv_score:.4f} (+/- {cv_std:.4f})")

# Train on full dataset (train + validation rows) and make predictions on test set
full_X_train = np.vstack((X_train, X_val))
full_y_train = np.hstack((y_train, y_val))
knn.fit(full_X_train, full_y_train)

# predict_proba already processes the rows in batches of `batch_size` to save
# memory, so no outer loop or pre-sized buffer (which assumed exactly two
# classes) is needed.
batch_size = 1000
test_predictions_proba = knn.predict_proba(X_test, batch_size=batch_size)

Cross-validation score: 0.8692 (+/- 0.0068)


In [48]:
# Save test predictions (probabilities for the positive class).
# Column 1 of the probability matrix is written out as 'Exited'.
# NOTE(review): the ids are re-read from test.csv, so this assumes the
# prediction rows line up one-to-one with the rows of that file (i.e. no test
# rows were removed during preprocessing) — confirm before submitting.
submission = pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions_proba[:, 1]})
submission.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")

Submission file created: submission.csv
