In [27]:
import numpy as np
import pandas as pd

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # TODO: Implement the fit method
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        # TODO: Implement the predict method
        predictions = [self._predict_single(x) for x in X]
        return np.array(predictions)

    def _predict_single(self, x):
        # Predict the class of a single sample.
        distances = self.compute_distance(self.X_train, x)
        k_nearest_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = self.y_train[k_nearest_indices]
        return np.mean(k_nearest_labels == 1)

    def compute_distance(self, X1, X2):
        # TODO: Implement distance computation based on self.distance_metric
        # Hint: Use numpy operations for efficient computation
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2), axis=1)
        else:
            raise ValueError("Unsupported distance metric")

In [29]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # TODO: Implement data preprocessing
    # Handle categorical variables, scale features, etc.

    # Drop unnecessary columns
    train_data = train_data.drop(columns=['CustomerId', 'Surname'])
    test_data = test_data.drop(columns=['CustomerId', 'Surname'])

    # Encode categorical variables
    # Gender encoding
    train_data['Gender'] = (train_data['Gender'] == 'Male').astype(int)
    test_data['Gender'] = (test_data['Gender'] == 'Male').astype(int)

    # One-hot encoding for 'Geography'
    train_data = pd.get_dummies(train_data, columns=['Geography'], drop_first=True)
    test_data = pd.get_dummies(test_data, columns=['Geography'], drop_first=True)

    # Ensure test set has the same columns as the train set
    missing_cols = set(train_data.columns) - set(test_data.columns)
    for col in missing_cols:
        test_data[col] = 0
    test_data = test_data[train_data.columns.drop('Exited')]

    # Extract labels from train data
    y = train_data['Exited']
    X = train_data.drop(columns=['Exited'])
    X_test = test_data  # Now X_test is properly defined

    # Scale features manually (mean normalization)
    X_train_mean = X.mean()
    X_train_std = X.std()

    X = (X - X_train_mean) / X_train_std
    X_test = (X_test - X_train_mean) / X_train_std  # Use train mean and std for test data

    return X.values, y.values, X_test.values

In [30]:
import numpy as np

# Define cross-validation function
def cross_validate(X, y, knn, n_splits=10):
    # TODO: Implement cross-validation
    # Compute ROC AUC scores
    fold_size = len(X) // n_splits
    roc_auc_scores = []

    for i in range(n_splits):
        # Split the data into training and validation sets
        X_val = X[i * fold_size:(i + 1) * fold_size]
        y_val = y[i * fold_size:(i + 1) * fold_size]

        X_train = np.concatenate([X[:i * fold_size], X[(i + 1) * fold_size:]])
        y_train = np.concatenate([y[:i * fold_size], y[(i + 1) * fold_size:]])

        # Fit the KNN model and make predictions
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_val)

        # Calculate ROC-AUC score and store it
        roc_auc = compute_roc_auc(y_val, y_pred)
        roc_auc_scores.append(roc_auc)

    return np.mean(roc_auc_scores)

def compute_roc_auc(y_true, y_pred):
    sorted_indices = np.argsort(y_pred)[::-1]  # Sort by predicted probabilities
    y_true = y_true[sorted_indices]
    cumulative_positive = np.cumsum(y_true)
    total_positive = np.sum(y_true)
    total_negative = len(y_true) - total_positive

    tpr = cumulative_positive / total_positive  # True Positive Rate
    fpr = (np.arange(len(y_true)) + 1 - cumulative_positive) / total_negative  # False Positive Rate

    return np.trapz(tpr, fpr)  # Area under the ROC curve

In [31]:
# Load and preprocess data
X, y, X_test = preprocess_data('/content/drive/MyDrive/CS506/Assignment5/train.csv', '/content/drive/MyDrive/CS506/Assignment5/test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# TODO: hyperparamters tuning
best_k = 1
best_score = 0
best_metric = 'euclidean'

for metric in ['euclidean', 'manhattan']:
    for k in range(1, 21, 2):
        knn = KNN(k=k, distance_metric=metric)
        score = cross_validate(X, y, knn)
        print(f"ROC-AUC Score with k={k}, metric={metric}: {score:.4f}")

        if score > best_score:
            best_k = k
            best_metric = metric
            best_score = score

print(f"Best k: {best_k}, Best metric: {best_metric} with ROC-AUC Score: {best_score:.4f}")

# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('/content/drive/MyDrive/CS506/Assignment5/test.csv')['id'], 'Exited': test_predictions}).to_csv('/content/drive/MyDrive/CS506/Assignment5/submissions.csv', index=False)

Cross-validation scores: 0.8671167290574422
ROC-AUC Score with k=1, metric=euclidean: 0.7516
ROC-AUC Score with k=3, metric=euclidean: 0.8417
ROC-AUC Score with k=5, metric=euclidean: 0.8671
ROC-AUC Score with k=7, metric=euclidean: 0.8811
ROC-AUC Score with k=9, metric=euclidean: 0.8880
ROC-AUC Score with k=11, metric=euclidean: 0.8901
ROC-AUC Score with k=13, metric=euclidean: 0.8942
ROC-AUC Score with k=15, metric=euclidean: 0.8946
ROC-AUC Score with k=17, metric=euclidean: 0.8969
ROC-AUC Score with k=19, metric=euclidean: 0.8986
ROC-AUC Score with k=1, metric=manhattan: 0.7558
ROC-AUC Score with k=3, metric=manhattan: 0.8459
ROC-AUC Score with k=5, metric=manhattan: 0.8695
ROC-AUC Score with k=7, metric=manhattan: 0.8810
ROC-AUC Score with k=9, metric=manhattan: 0.8885
ROC-AUC Score with k=11, metric=manhattan: 0.8930
ROC-AUC Score with k=13, metric=manhattan: 0.8961
ROC-AUC Score with k=15, metric=manhattan: 0.8983
ROC-AUC Score with k=17, metric=manhattan: 0.8997
ROC-AUC Score wi