In [7]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from google.colab import drive
from google.colab import files
# Mount Google Drive at /content/drive (Colab-only helper); the CSV paths used
# later in this notebook live under /content/drive/MyDrive.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# Define the KNN class
class KNN:
    """Brute-force k-nearest-neighbours classifier implemented with NumPy.

    Parameters
    ----------
    k : int, default 3
        Number of nearest training samples consulted for each query row.
        If ``k`` exceeds the number of training samples, all of them are used.
    distance_metric : {'euclidean', 'manhattan'}, default 'euclidean'
        Distance used to rank the training samples.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """Memorise the training set; no model is actually fitted.

        Parameters
        ----------
        X : array-like, one row per training sample.
        y : array-like, one label per row of ``X``.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        """
        # Coerce to ndarrays so later positional indexing (y[k_indices]) and
        # broadcasting (x - X_train) behave the same for lists, Series and
        # DataFrames as they do for arrays.
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
            )
        self.X_train = X
        self.y_train = y

    def _euclidean_distance(self, X1, X2):
        """Row-wise Euclidean distance between ``X1`` and ``X2`` (broadcast)."""
        return np.sqrt(np.sum((X1 - X2) ** 2, axis=1))

    def _manhattan_distance(self, X1, X2):
        """Row-wise Manhattan (L1) distance between ``X1`` and ``X2`` (broadcast)."""
        return np.sum(np.abs(X1 - X2), axis=1)

    def _compute_distance(self, x1, x2):
        """Dispatch to the configured metric.

        Raises
        ------
        ValueError
            If ``distance_metric`` is neither 'euclidean' nor 'manhattan'.
        """
        if self.distance_metric == 'euclidean':
            return self._euclidean_distance(x1, x2)
        elif self.distance_metric == 'manhattan':
            return self._manhattan_distance(x1, x2)
        else:
            raise ValueError(f"Unknown distance metric: {self.distance_metric}")

    def _k_nearest_labels(self, x):
        """Return the training labels of the ``k`` samples closest to ``x``.

        Shared by ``predict`` and ``predict_proba`` so both rank neighbours
        in exactly the same way.
        """
        if self.k < 1:
            # Without this guard an empty neighbour set yields an opaque
            # "argmax of an empty sequence" error or a silent NaN.
            raise ValueError(f"k must be a positive integer, got {self.k}")
        distances = self._compute_distance(x, self.X_train)
        k_indices = np.argsort(distances)[:self.k]
        return self.y_train[k_indices]

    def predict(self, X):
        """Predict a label for every row of ``X`` by majority vote.

        Ties are resolved in favour of the smallest label value. Labels may be
        of any sortable dtype (ints, floats, strings), not only non-negative
        integers.

        Returns
        -------
        numpy.ndarray with one predicted label per row of ``X``.
        """
        predictions = []
        for x in np.asarray(X):
            # np.unique returns sorted labels, so argmax on the counts picks
            # the smallest label among equally frequent ones.
            labels, counts = np.unique(self._k_nearest_labels(x), return_counts=True)
            predictions.append(labels[counts.argmax()])

        return np.array(predictions)

    def predict_proba(self, X):
        """Return the mean label of the ``k`` nearest neighbours per row of ``X``.

        For labels encoded as 0/1 this is the fraction of neighbours with
        label 1, i.e. an estimate of the positive-class probability (not the
        probability of whichever class is in the majority).

        Returns
        -------
        numpy.ndarray with one score per row of ``X``.
        """
        probabilities = [np.mean(self._k_nearest_labels(x)) for x in np.asarray(X)]
        return np.array(probabilities)

In [9]:
# Define data preprocessing function
def preprocess_data(train_path, test_path, drop_cols=('CustomerId', 'Surname')):
    """Load the train/test CSVs and turn them into scaled feature matrices.

    Steps: drop ``drop_cols``, impute missing values, one-hot encode
    'Geography' and 'Gender' (first level dropped), then standardise every
    feature column. All statistics (imputation values, encoder categories,
    scaler mean/variance) are learned from the training file only and then
    applied to the test file.

    Parameters
    ----------
    train_path : path to the training CSV; must contain an 'Exited' column.
    test_path : path to the test CSV.
    drop_cols : iterable of str, default ('CustomerId', 'Surname')
        Columns removed from both files before any processing.

    Returns
    -------
    X : numpy.ndarray of scaled training features.
    y : numpy.ndarray with the 'Exited' column of the training file.
    X_test : numpy.ndarray of scaled test features, columns ordered like ``X``.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # NOTE(review): any other identifier-like column present in both files
    # (the submission cell reads an 'id' column from the test CSV) is NOT
    # dropped by default and ends up as a scaled feature - confirm whether it
    # should be added to drop_cols.
    drop_cols = list(drop_cols)
    train_data = train_data.drop(columns=drop_cols)
    test_data = test_data.drop(columns=drop_cols)

    # Identify categorical and numerical columns
    categorical_cols = ['Geography', 'Gender']
    numerical_cols = [col for col in train_data.columns if col not in categorical_cols + ['Exited']]

    # Imputation values come from the training data only, so the test set is
    # treated exactly like unseen data (same policy as the encoder and scaler).
    numeric_fill = train_data[numerical_cols].median()
    categorical_fill = train_data[categorical_cols].mode().iloc[0]

    train_data[numerical_cols] = train_data[numerical_cols].fillna(numeric_fill)
    test_data[numerical_cols] = test_data[numerical_cols].fillna(numeric_fill)
    train_data[categorical_cols] = train_data[categorical_cols].fillna(categorical_fill)
    test_data[categorical_cols] = test_data[categorical_cols].fillna(categorical_fill)

    # One-hot encode categorical variables (categories learned from train)
    encoder = OneHotEncoder(drop='first', sparse_output=False)
    train_encoded = encoder.fit_transform(train_data[categorical_cols])
    test_encoded = encoder.transform(test_data[categorical_cols])

    # Reuse each frame's index so the concat below aligns row-for-row
    # regardless of how the index looks.
    encoded_names = encoder.get_feature_names_out()
    train_encoded_df = pd.DataFrame(train_encoded, columns=encoded_names, index=train_data.index)
    test_encoded_df = pd.DataFrame(test_encoded, columns=encoded_names, index=test_data.index)

    # Replace the original categorical columns with their encoded versions
    train_data = pd.concat([train_data.drop(columns=categorical_cols), train_encoded_df], axis=1)
    test_data = pd.concat([test_data.drop(columns=categorical_cols), test_encoded_df], axis=1)

    # Standardise every feature (numerical and one-hot columns alike).
    # Selecting the test columns by the training feature list guarantees that
    # X_test has the same column order as X even if the CSVs differ.
    feature_columns = train_data.drop(columns=['Exited']).columns
    scaler = StandardScaler()
    X = scaler.fit_transform(train_data[feature_columns])
    X_test = scaler.transform(test_data[feature_columns])

    y = train_data['Exited'].values

    return X, y, X_test


In [10]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    """Score ``knn`` with shuffled K-fold cross-validation using ROC AUC.

    The model is re-fitted on the training part of every fold (so it is left
    fitted on the last fold's training split) and scored on the held-out part
    via ``predict_proba``.

    Parameters
    ----------
    X, y : indexable arrays of features and labels.
    knn : model exposing ``fit(X, y)`` and ``predict_proba(X)``.
    n_splits : int, default 5
        Number of folds; the shuffle uses a fixed seed of 42.

    Returns
    -------
    (mean_score, fold_scores) : the average ROC AUC and the per-fold list.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    def _fold_auc(fit_idx, holdout_idx):
        # Fit on this fold's training rows, then score the held-out rows.
        knn.fit(X[fit_idx], y[fit_idx])
        holdout_scores = knn.predict_proba(X[holdout_idx])
        return roc_auc_score(y[holdout_idx], holdout_scores)

    fold_scores = [
        _fold_auc(fit_idx, holdout_idx)
        for fit_idx, holdout_idx in splitter.split(X)
    ]
    return np.mean(fold_scores), fold_scores

In [11]:
# Locations of the competition CSVs on the mounted Drive
train_path = '/content/drive/MyDrive/train.csv'
test_path = '/content/drive/MyDrive/test.csv'

# Hyperparameters declared once so the cross-validated model and the final
# model cannot silently diverge.
# NOTE(review): no tuning is performed in this notebook; these values are a
# fixed choice, not a searched optimum.
N_NEIGHBORS = 5
DISTANCE_METRIC = 'euclidean'

# Load and preprocess data
X, y, X_test = preprocess_data(train_path, test_path)

# Estimate generalisation performance with cross-validation
knn = KNN(k=N_NEIGHBORS, distance_metric=DISTANCE_METRIC)
mean_roc_auc, cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)
print("Mean ROC AUC:", mean_roc_auc)

# Re-create the model and train it on the full training set
knn = KNN(k=N_NEIGHBORS, distance_metric=DISTANCE_METRIC)
knn.fit(X, y)

# Score the test set (mean neighbour label per row)
test_predictions = knn.predict_proba(X_test)

# Only the 'id' column is needed for the submission, so avoid re-parsing the
# rest of the test file.
test_ids = pd.read_csv(test_path, usecols=['id'])['id']
submission = pd.DataFrame({'id': test_ids, 'Exited': test_predictions})
submission.to_csv('submissions.csv', index=False)
files.download('submissions.csv')
print("Test predictions saved to 'submissions.csv'")




Cross-validation scores: [0.8554862461198053, 0.878138723231228, 0.8755474268182397, 0.8557049070092548, 0.8843730556729739]
Mean ROC AUC: 0.8698500717703004


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Test predictions saved to 'submissions.csv'
