In [15]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold


In [16]:
# Define the KNN class
class KNN:
    """Brute-force k-nearest-neighbours scorer.

    For every query row, ``predict`` returns the mean of the labels attached
    to the ``k`` closest training rows. When the labels are 0/1 this mean is
    the fraction of positive neighbours, which can be used as a score.

    Parameters
    ----------
    k : int, default 3
        Number of neighbours averaged for each prediction.
    distance_metric : {'euclidean', 'manhattan'}, default 'euclidean'
        Distance used to rank training rows. Any other value makes
        ``compute_distance`` raise ``ValueError``.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    @staticmethod
    def _as_float_array(data):
        """Return ``data`` as a NumPy array with a floating dtype."""
        arr = np.asarray(data)
        # Keep an existing float dtype (e.g. float32) to avoid a needless copy;
        # anything else (ints, bools, nested lists) is promoted to float.
        if not np.issubdtype(arr.dtype, np.floating):
            arr = arr.astype(float)
        return arr

    def fit(self, X, y):
        """Memorise the training set.

        ``X`` and ``y`` may be any array-like (lists, NumPy arrays, pandas
        objects). They are converted to NumPy arrays so that ``predict`` always
        indexes labels by position, never by a pandas index label.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of rows.
        """
        features = self._as_float_array(X)
        labels = np.asarray(y)
        if features.shape[0] != labels.shape[0]:
            raise ValueError(
                "X and y must contain the same number of samples "
                f"(got {features.shape[0]} and {labels.shape[0]})"
            )
        self.X_train = features
        self.y_train = labels

    def predict(self, X):
        """Return one score per row of ``X`` as a 1-D NumPy array.

        Each score is the mean label of the ``k`` nearest training rows under
        the configured distance metric.
        """
        queries = self._as_float_array(X)
        y_pred = []
        for x_test in queries:
            distances = self.compute_distance(self.X_train, x_test)
            # Positions of the k smallest distances.
            k_indices = distances.argsort()[:self.k]
            y_pred.append(np.mean(self.y_train[k_indices]))
        return np.array(y_pred)

    def compute_distance(self, X_train, x_test):
        """Distance from ``x_test`` to every row of ``X_train``.

        Raises
        ------
        ValueError
            If ``self.distance_metric`` is neither 'euclidean' nor 'manhattan'.
        """
        if self.distance_metric == 'euclidean':
            distances = np.sqrt(np.sum((X_train - x_test) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            distances = np.sum(np.abs(X_train - x_test), axis=1)
        else:
            raise ValueError("Unsupported distance metric")
        return distances

In [17]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    """Load the train/test CSV files and return scaled feature matrices.

    The training file must contain the columns ``id``, ``CustomerId``,
    ``Surname``, ``Gender``, ``Geography`` and ``Exited``; the test file must
    contain the same columns except ``Exited``.

    Steps
    -----
    1. Fill missing numeric values with the *training* column means.
       Missing values in non-numeric columns are not imputed.
    2. Drop the identifier columns (the test ``id`` values are returned
       separately).
    3. Label-encode ``Gender`` and one-hot encode ``Geography`` (first level
       dropped), using the levels seen in the training data for both files.
    4. Standardise the features with a scaler fitted on the training data.

    Parameters
    ----------
    train_path, test_path : path-like
        Locations of the training and test CSV files.

    Returns
    -------
    X : numpy.ndarray
        Scaled training features.
    y : numpy.ndarray
        Values of the ``Exited`` column.
    X_test : numpy.ndarray
        Scaled test features, columns in the same order as ``X``.
    test_ids : numpy.ndarray
        The ``id`` column of the test file.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Impute numeric gaps. `numeric_only=True` is required because the string
    # columns are still present here (pandas >= 2.0 raises TypeError otherwise).
    # The test set is filled with training means so that both files go through
    # the same transformation, like the scaler below.
    if train_data.isnull().values.any() or test_data.isnull().values.any():
        numeric_means = train_data.mean(numeric_only=True)
        train_data = train_data.fillna(numeric_means)
        test_data = test_data.fillna(numeric_means.drop(labels=['Exited'], errors='ignore'))

    # Keep the test ids for the submission file before they are dropped.
    test_ids = test_data['id'].values

    # Identifier columns carry no predictive signal for the model.
    train_data = train_data.drop(columns=['id', 'CustomerId', 'Surname'])
    test_data = test_data.drop(columns=['CustomerId', 'Surname'])

    # Label-encode Gender with an encoder fitted on the training data.
    le = LabelEncoder()
    train_data['Gender'] = le.fit_transform(train_data['Gender'])
    test_data['Gender'] = le.transform(test_data['Gender'])

    # One-hot encode Geography. Both frames share one categorical dtype built
    # from the training levels, so `drop_first=True` drops the same level in
    # each frame even if the test file lacks one of the training categories.
    geography_levels = sorted(train_data['Geography'].dropna().unique())
    geography_dtype = pd.CategoricalDtype(categories=geography_levels)
    train_data['Geography'] = train_data['Geography'].astype(geography_dtype)
    test_data['Geography'] = test_data['Geography'].astype(geography_dtype)
    train_data = pd.get_dummies(train_data, columns=['Geography'], drop_first=True)
    test_data = pd.get_dummies(test_data, columns=['Geography'], drop_first=True)

    # Align the test columns (names and order) with the training features;
    # this also discards the remaining test `id` column.
    feature_columns = train_data.columns.drop('Exited')
    test_data = test_data.reindex(columns=feature_columns, fill_value=0)

    # Separate features and target variable.
    X = train_data[feature_columns].to_numpy(dtype=float)
    y = train_data['Exited'].values
    X_test = test_data.to_numpy(dtype=float)

    # Standardise using statistics estimated on the training data only.
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    X_test = scaler.transform(X_test)

    return X, y, X_test, test_ids


In [18]:
# Define cross-validation function
def cross_validate(X, y, k, distance_metric, n_splits=5):
    """Mean ROC AUC of a KNN model over stratified folds.

    Parameters
    ----------
    X, y : numpy.ndarray
        Feature matrix and labels; both are indexed with the fold index arrays.
    k : int
        Number of neighbours passed to ``KNN``.
    distance_metric : str
        Distance metric passed to ``KNN``.
    n_splits : int, default 5
        Number of folds used by ``StratifiedKFold``.

    Returns
    -------
    float
        Average of the per-fold ROC AUC scores.
    """

    def _fold_auc(fit_rows, holdout_rows):
        # Train on one part of the data, score the held-out part.
        model = KNN(k=k, distance_metric=distance_metric)
        model.fit(X[fit_rows], y[fit_rows])
        return roc_auc_score(y[holdout_rows], model.predict(X[holdout_rows]))

    splitter = StratifiedKFold(n_splits=n_splits)
    fold_aucs = [
        _fold_auc(fit_rows, holdout_rows)
        for fit_rows, holdout_rows in splitter.split(X, y)
    ]
    return np.mean(fold_aucs)

In [19]:
# Read both CSV files and turn them into model-ready arrays
X, y, X_test, test_ids = preprocess_data('train.csv', 'test.csv')

# Grid search: every (metric, k) pair is scored with 5-fold cross-validation
# and the pair with the highest mean ROC AUC is remembered.
best_score, best_k, best_metric = 0, None, None

for metric in ('euclidean', 'manhattan'):
    for k in range(1, 21):
        score = cross_validate(X, y, k, metric, n_splits=5)
        if score > best_score:
            best_score, best_k, best_metric = score, k, metric
        print(f"k={k}, metric={metric}, ROC AUC={score:.4f}")

print(f"\nBest ROC AUC Score: {best_score:.4f}")
print(f"Optimal k: {best_k}")
print(f"Optimal Distance Metric: {best_metric}")

# Refit on all training rows with the winning configuration, then score the
# test set; the scores are clipped into [0, 1] before being written out.
knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X, y)
test_predictions = np.clip(knn.predict(X_test), 0, 1)

# Write the submission file: one row per test id with its predicted score
submission = pd.DataFrame(
    {
        'id': test_ids,
        'Exited': test_predictions,
    }
)
submission.to_csv('submissions.csv', index=False)


print("Submission file 'submissions.csv' has been created.")

k=1, metric=euclidean, ROC AUC=0.7532
k=2, metric=euclidean, ROC AUC=0.8153
k=3, metric=euclidean, ROC AUC=0.8446
k=4, metric=euclidean, ROC AUC=0.8618
k=5, metric=euclidean, ROC AUC=0.8717
k=6, metric=euclidean, ROC AUC=0.8771
k=7, metric=euclidean, ROC AUC=0.8815
k=8, metric=euclidean, ROC AUC=0.8855
k=9, metric=euclidean, ROC AUC=0.8872
k=10, metric=euclidean, ROC AUC=0.8905
k=11, metric=euclidean, ROC AUC=0.8920
k=12, metric=euclidean, ROC AUC=0.8948
k=13, metric=euclidean, ROC AUC=0.8968
k=14, metric=euclidean, ROC AUC=0.8970
k=15, metric=euclidean, ROC AUC=0.8969
k=16, metric=euclidean, ROC AUC=0.8981
k=17, metric=euclidean, ROC AUC=0.8984
k=18, metric=euclidean, ROC AUC=0.8994
k=19, metric=euclidean, ROC AUC=0.9006
k=20, metric=euclidean, ROC AUC=0.9007
k=1, metric=manhattan, ROC AUC=0.7508
k=2, metric=manhattan, ROC AUC=0.8172
k=3, metric=manhattan, ROC AUC=0.8471
k=4, metric=manhattan, ROC AUC=0.8636
k=5, metric=manhattan, ROC AUC=0.8736
k=6, metric=manhattan, ROC AUC=0.8789
k