In [38]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score
from collections import Counter

# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """Store the training data."""
        self.X_train = np.array(X)
        self.y_train = np.array(y)

    def predict(self, X):
        """Predict the labels for the given input samples."""
        X = np.array(X)
        predictions = [self._predict_single(x) for x in X]
        return np.array(predictions)

    def predict_proba(self, X):
        """Predict the probabilities for each class."""
        X = np.array(X)
        prob_predictions = [self._predict_proba_single(x) for x in X]
        return np.array(prob_predictions)

    def _predict_single(self, x):
        """Predict the label for a single input sample."""
        distances = self._compute_distances(x)
        nearest_indices = np.argsort(distances)[:self.k]
        nearest_labels = self.y_train[nearest_indices]
        most_common = Counter(nearest_labels).most_common(1)
        return most_common[0][0]

    def _predict_proba_single(self, x):
        """Predict the probability for a single input sample."""
        distances = self._compute_distances(x)
        nearest_indices = np.argsort(distances)[:self.k]
        nearest_labels = self.y_train[nearest_indices]
        count = Counter(nearest_labels)
        # Return probabilities for each class
        prob_class_0 = count.get(0, 0) / self.k
        prob_class_1 = count.get(1, 0) / self.k
        return [prob_class_0, prob_class_1]

    def _compute_distances(self, x):
        """Compute the distances from a single point to all training data points."""
        if self.distance_metric == 'euclidean':
            return np.linalg.norm(self.X_train - x, axis=1)
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(self.X_train - x), axis=1)
        else:
            raise ValueError(f"Unsupported distance metric: {self.distance_metric}")

# Function to detect and cap outliers using z-score method (for NumPy arrays)
def detect_outliers_zscore(df, columns, threshold=3):
    """Cap outliers using z-score method for NumPy arrays."""
    df = df.copy()  # Ensure we don't modify the original array
    for col_idx in columns:
        mean = np.mean(df[:, col_idx])
        std_dev = np.std(df[:, col_idx])
        z_scores = (df[:, col_idx] - mean) / std_dev
        df[:, col_idx] = np.where(np.abs(z_scores) > threshold, 
                                  np.sign(z_scores) * (threshold * std_dev + mean), 
                                  df[:, col_idx])
    return df

# Preprocessing function
def preprocess_data(train_path, test_path):
    """Preprocess the data by loading, encoding categorical features, and handling missing values."""
    # Load datasets
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Separate features and target
    X = train_data.drop(['Exited', 'id', 'CustomerId', 'Surname'], axis=1, errors='ignore')
    y = train_data['Exited']  # Target variable
    X_test = test_data.drop(['id', 'CustomerId', 'Surname'], axis=1, errors='ignore')

    # One-hot encode character variables with fewer than 5 unique values
    cat_columns = X.select_dtypes(include=['object']).columns
    small_cat_columns = [col for col in cat_columns if X[col].nunique() < 5]

    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    X_encoded = encoder.fit_transform(X[small_cat_columns])
    X_test_encoded = encoder.transform(X_test[small_cat_columns])

    # Create DataFrames from encoded data
    X_encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(small_cat_columns))
    X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoder.get_feature_names_out(small_cat_columns))

    # Drop original categorical columns
    X = X.drop(small_cat_columns, axis=1)
    X_test = X_test.drop(small_cat_columns, axis=1)

    # Concatenate encoded columns with original data
    X = pd.concat([X, X_encoded_df], axis=1)
    X_test = pd.concat([X_test, X_test_encoded_df], axis=1)

    # Ensure both train and test have the same columns
    X_test = X_test.reindex(columns=X.columns, fill_value=0)

    # Handle missing values for numeric and categorical columns separately
    for col in X.columns:
        if X[col].dtype in ['int64', 'float64']:
            mean_value = X[col].mean()  # Compute mean from training set
            X[col].fillna(mean_value, inplace=True)
            X_test[col].fillna(mean_value, inplace=True)
        else:
            mode_value = X[col].mode()[0]  # Compute mode from training set
            X[col].fillna(mode_value, inplace=True)
            X_test[col].fillna(mode_value, inplace=True)

    return X, y, X_test

# Cross-validation function with scaling inside the folds
def cross_validate(X, y, knn, n_splits=5):
    """Perform cross-validation and compute ROC AUC scores."""
    # Define the numeric column indices (assumed to be known)
    numeric_columns = ['Balance', 'EstimatedSalary', 'Age', 'CreditScore', 'Tenure', 'NumOfProducts']
    numeric_indices = [X.columns.get_loc(col) for col in numeric_columns]

    # Use KFold for better handling of class imbalance
    kf = KFold(n_splits=n_splits, shuffle=True)
    roc_auc_scores = []

    for train_index, test_index in kf.split(X, y):
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]

        # Detect and cap outliers using z-score method on the training data
        X_train = detect_outliers_zscore(X_train.to_numpy(), numeric_indices)
        X_val = detect_outliers_zscore(X_val.to_numpy(), numeric_indices)

        # Scale numeric columns using StandardScaler (fit on training, apply to both)
        scaler = StandardScaler()
        X_train[:, numeric_indices] = scaler.fit_transform(X_train[:, numeric_indices])
        X_val[:, numeric_indices] = scaler.transform(X_val[:, numeric_indices])

        # Train the KNN model on the current fold
        knn.fit(X_train, y_train)
        
        # Predict probabilities for the positive class
        y_prob = knn.predict_proba(X_val)[:, 1]

        # Compute ROC AUC score for the current fold
        roc_auc = roc_auc_score(y_val, y_prob)
        roc_auc_scores.append(roc_auc)

    # Return the average ROC AUC score across all folds and individual scores
    mean_auc = np.mean(roc_auc_scores)
    return mean_auc, roc_auc_scores

# Step 1: Preprocess data (train and test)
X, y, X_test = preprocess_data('/Users/anneke/GitHub/abvo138-assignment-5/train.csv', '/Users/anneke/GitHub/abvo138-assignment-5/test.csv')

# Step 2: Hyperparameter tuning with cross-validation
best_k = None
best_score = 0

for k in range(1, 21):  # Range for 'k'
    knn = KNN(k=k, distance_metric='euclidean')
    score, _ = cross_validate(X, y, knn, n_splits=5)  # Cross-validation
    print(f"k={k}, ROC AUC={score:.4f}")

    if score > best_score:
        best_score = score
        best_k = k

print(f"Best k: {best_k}, Best ROC AUC: {best_score:.4f}")

# Step 3: Final training on full dataset
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # Fit the scaler on the entire training data

knn = KNN(k=best_k, distance_metric='euclidean')
knn.fit(X_scaled, y)  # Train the model on scaled training data

# Step 4: Apply the same scaler to the test set
X_test_scaled = scaler.transform(X_test)  # Apply the scaler to the test data

# Step 5: Predict on the test set
test_probabilities = knn.predict_proba(X_test_scaled)[:, 1]

# Step 6: Create submission file
test_data = pd.read_csv('/Users/anneke/GitHub/abvo138-assignment-5/test.csv')
test_ids = test_data['id']
submission = pd.DataFrame({'id': test_ids, 'Exited': test_probabilities})
submission.to_csv('/Users/anneke/GitHub/abvo138-assignment-5/submission.csv', index=False)


print("Submission file 'submission.csv' has been created!")




k=1, ROC AUC=0.7630
k=2, ROC AUC=0.8252
k=3, ROC AUC=0.8554
k=4, ROC AUC=0.8718
k=5, ROC AUC=0.8806
k=6, ROC AUC=0.8881
k=7, ROC AUC=0.8877
k=8, ROC AUC=0.8956
k=9, ROC AUC=0.8981
k=10, ROC AUC=0.9004
k=11, ROC AUC=0.9026
k=12, ROC AUC=0.9043
k=13, ROC AUC=0.9052
k=14, ROC AUC=0.9062
k=15, ROC AUC=0.9087
k=16, ROC AUC=0.9094
k=17, ROC AUC=0.9109
k=18, ROC AUC=0.9109
k=19, ROC AUC=0.9127
k=20, ROC AUC=0.9127
Best k: 19, Best ROC AUC: 0.9127
Submission file 'submission.csv' has been created!
