In [47]:
import numpy as np
import pandas as pd

class KNN:
    """k-nearest-neighbours classifier implemented with numpy.

    Parameters
    ----------
    k : int
        Number of nearest training samples consulted for each prediction.
    distance_metric : str
        'euclidean' or 'manhattan'. Any other value makes distance
        computation raise ``ValueError``.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """Store the training data (features as a float array, labels as given)."""
        self.X_train = np.array(X, dtype=float)
        self.y_train = np.array(y)

    def predict(self, X):
        """Return the majority label among the nearest neighbours of each row of X."""
        X = np.array(X, dtype=float)
        return np.array([self._predict_single(x) for x in X])

    def predict_proba(self, X):
        """Return an (n_samples, 2) array of [P(class 0), P(class 1)] per row of X."""
        X = np.array(X, dtype=float)
        prob_predictions = [self._predict_proba_single(x) for x in X]
        # reshape keeps the result two-dimensional even when X has no rows,
        # so callers can always slice a class column with [:, 1].
        return np.array(prob_predictions, dtype=float).reshape(-1, 2)

    def _nearest_labels(self, x):
        """Return the training labels of the (at most) k samples closest to x."""
        distances = self._compute_distances(x)
        nearest_indices = np.argsort(distances)[:self.k]
        return self.y_train[nearest_indices]

    def _predict_single(self, x):
        """Predict the label for a single input sample by majority vote."""
        labels, counts = np.unique(self._nearest_labels(x), return_counts=True)
        # np.unique returns labels sorted and argmax picks the first maximum,
        # so a tied vote resolves to the smallest label.
        return labels[np.argmax(counts)]

    def _predict_proba_single(self, x):
        """Return [P(class 0), P(class 1)] for a single input sample.

        Only the labels 0 and 1 are reported; any other label contributes to
        neither entry.
        """
        nearest_labels = self._nearest_labels(x)
        # Normalise by the number of neighbours actually found: when the
        # training set holds fewer than k samples, dividing by k would yield
        # probabilities that do not sum to one.
        n_neighbors = len(nearest_labels)
        if n_neighbors == 0:
            return [0.0, 0.0]
        prob_class_0 = np.sum(nearest_labels == 0) / n_neighbors
        prob_class_1 = np.sum(nearest_labels == 1) / n_neighbors
        return [prob_class_0, prob_class_1]

    def _compute_distances(self, x):
        """Compute the distance from a single point to every training sample.

        Raises
        ------
        ValueError
            If ``distance_metric`` is neither 'euclidean' nor 'manhattan'.
        """
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(self.X_train - x), axis=1)
        else:
            raise ValueError(f"Unsupported distance metric: {self.distance_metric}")

def detect_outliers_zscore(df, columns, threshold=3):
    """Cap outliers in the given columns using the z-score method.

    Values whose absolute z-score exceeds ``threshold`` are replaced by the
    nearest bound ``mean - threshold * std`` or ``mean + threshold * std``,
    where mean and std are computed per column from ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is copied, never modified.
    columns : iterable
        Column labels to cap.
    threshold : float
        Maximum allowed absolute z-score.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the selected columns capped.
    """
    df = df.copy()  # Ensure we don't modify the original DataFrame
    for col in columns:
        mean = df[col].mean()
        std_dev = df[col].std()
        z_scores = (df[col] - mean) / std_dev
        # The bound sits threshold standard deviations away from the mean on
        # the side of the outlier. The sign must multiply only the deviation
        # term, otherwise low outliers are mapped to -(mean + threshold*std).
        capped = mean + np.sign(z_scores) * threshold * std_dev
        df[col] = np.where(np.abs(z_scores) > threshold, capped, df[col])
    return df

def preprocess_data(train_path, test_path):
    """Load the train/test CSVs, one-hot encode small categoricals, impute missing values.

    Parameters
    ----------
    train_path, test_path
        Anything ``pandas.read_csv`` accepts. The training file must contain
        an 'Exited' column, which is used as the target.

    Returns
    -------
    X : pandas.DataFrame
        Training features with 'id', 'CustomerId' and 'Surname' dropped,
        categoricals encoded and missing values filled.
    y : pandas.Series
        The 'Exited' column of the training file.
    X_test : pandas.DataFrame
        Test features with exactly the same columns, in the same order, as ``X``.
    """
    # Load datasets
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Separate features and target
    X = train_data.drop(['Exited', 'id', 'CustomerId', 'Surname'], axis=1, errors='ignore')
    y = train_data['Exited']  # Target variable
    X_test = test_data.drop(['id', 'CustomerId', 'Surname'], axis=1, errors='ignore')

    # One-hot encode text columns with fewer than 5 unique values.
    # 'string' is listed next to 'object' so that columns stored with pandas'
    # dedicated string dtype are treated as categorical as well.
    cat_columns = X.select_dtypes(include=['object', 'string']).columns
    small_cat_columns = [col for col in cat_columns if X[col].nunique() < 5]

    X_encoded = pd.get_dummies(X[small_cat_columns], drop_first=False)
    X_test_encoded = pd.get_dummies(X_test[small_cat_columns], drop_first=False)

    # Give the test dummies the training dummies' columns; categories that
    # are absent from the test set become all-zero columns.
    X_encoded, X_test_encoded = X_encoded.align(X_test_encoded, join='left', axis=1, fill_value=0)

    # Replace the original categorical columns with their encoded versions
    X = pd.concat([X.drop(small_cat_columns, axis=1), X_encoded], axis=1)
    X_test = pd.concat([X_test.drop(small_cat_columns, axis=1), X_test_encoded], axis=1)

    # Ensure both train and test have the same columns
    X_test = X_test.reindex(columns=X.columns, fill_value=0)

    # Impute missing values with statistics taken from the training set only.
    for col in X.columns:
        col_dtype = X[col].dtype
        # Boolean columns are excluded from the numeric branch so they are
        # imputed with the mode rather than a fractional mean.
        is_numeric = (pd.api.types.is_numeric_dtype(col_dtype)
                      and not pd.api.types.is_bool_dtype(col_dtype))
        if is_numeric:
            fill_value = X[col].mean()
        else:
            fill_value = X[col].mode()[0]
        # Assign the result back instead of calling fillna(inplace=True) on
        # X[col]: that form operates on a temporary Series and leaves the
        # frame untouched when pandas copy-on-write is active.
        X[col] = X[col].fillna(fill_value)
        X_test[col] = X_test[col].fillna(fill_value)

    return X, y, X_test


def scale_data(train_data, test_data, numeric_indices):
    """Standardise selected columns using the training data's mean and std.

    Parameters
    ----------
    train_data, test_data : array-like, 2-D
        Feature matrices. They are copied; the caller's arrays are left
        untouched.
    numeric_indices : iterable of int
        Positions of the columns to standardise.

    Returns
    -------
    (train_scaled, test_scaled) : tuple of numpy.ndarray
        Copies of the inputs in which each selected column has had the
        training mean subtracted and been divided by the training standard
        deviation. A column that is constant in the training data has a zero
        standard deviation; it is only centred, not divided.
    """
    train_scaled = np.array(train_data)  # np.array copies by default
    test_scaled = np.array(test_data)

    # Integer/boolean arrays cannot hold the scaled values without truncation.
    if train_scaled.dtype.kind in 'iub':
        train_scaled = train_scaled.astype(float)
    if test_scaled.dtype.kind in 'iub':
        test_scaled = test_scaled.astype(float)

    for idx in numeric_indices:
        train_col = train_scaled[:, idx].astype(float)
        test_col = test_scaled[:, idx].astype(float)

        # Statistics come from the training column only
        mean = np.mean(train_col)
        std = np.std(train_col)
        if std == 0:
            std = 1.0  # avoid dividing by zero on constant columns

        train_scaled[:, idx] = (train_col - mean) / std
        test_scaled[:, idx] = (test_col - mean) / std

    return train_scaled, test_scaled


def manual_kfold_split(X, y, n_splits, random_state=None):
    """Shuffle sample positions and split them into ``n_splits`` folds.

    Parameters
    ----------
    X : sized
        Only ``len(X)`` is used.
    y : any
        Unused; kept so the call signature mirrors the usual (X, y) form.
    n_splits : int
        Number of folds, between 1 and ``len(X)``.
    random_state : int or None
        Seed for the shuffle. ``None`` draws from numpy's global random
        state, so results differ from call to call unless that is seeded.

    Returns
    -------
    list of (train_indices, test_indices)
        One pair of integer arrays per fold. Every sample appears in exactly
        one test fold; when ``len(X)`` is not divisible by ``n_splits`` the
        first folds receive one extra sample. ``train_indices`` is sorted.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1 or larger than ``len(X)``.
    """
    n_samples = len(X)
    if n_splits < 1 or n_splits > n_samples:
        raise ValueError(
            f"n_splits must be between 1 and the number of samples ({n_samples}), got {n_splits}"
        )

    if random_state is None:
        indices = np.random.permutation(n_samples)
    else:
        indices = np.random.default_rng(random_state).permutation(n_samples)

    folds = []
    # array_split spreads the remainder over the leading folds, so no sample
    # is left out of validation the way integer division of the fold size would.
    for test_indices in np.array_split(indices, n_splits):
        train_indices = np.setdiff1d(indices, test_indices)
        folds.append((train_indices, test_indices))

    return folds

def roc_auc_manual(y_true, y_prob):
    """Compute the area under the ROC curve for binary labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; 1 marks positives and 0 marks negatives.
    y_prob : array-like of float
        Scores for the positive class (higher means more likely positive).

    Returns
    -------
    float
        The ROC AUC, or ``nan`` when ``y_true`` lacks positives or negatives
        (the curve is undefined in that case).
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob, dtype=float)

    # Count the number of positive and negative cases
    P = np.sum(y_true == 1)
    N = np.sum(y_true == 0)
    if P == 0 or N == 0:
        return float('nan')

    # Sort by descending score
    order = np.argsort(-y_prob, kind='mergesort')
    y_sorted = y_true[order]
    p_sorted = y_prob[order]

    # Samples sharing a score cannot be separated by any threshold, so the
    # curve gets one point per distinct score (the last index of each run of
    # equal scores). Without this the result depends on the arbitrary order
    # of tied samples.
    cut = np.r_[np.flatnonzero(np.diff(p_sorted)), p_sorted.size - 1]
    tps = np.cumsum(y_sorted == 1)[cut]
    fps = np.cumsum(y_sorted == 0)[cut]

    # Prepend the origin so the first segment of the curve is included
    tpr = np.r_[0, tps] / P
    fpr = np.r_[0, fps] / N

    # Trapezoidal rule, written out so it does not depend on np.trapz
    auc = np.sum(np.diff(fpr) * (tpr[1:] + tpr[:-1]) / 2.0)
    return float(auc)


def cross_validate(X, y, knn, n_splits=5, numeric_columns=None):
    """Run K-fold cross-validation of ``knn`` and report ROC AUC per fold.

    For every fold the numeric columns are outlier-capped (training and
    validation parts separately), standardised with the training part's
    statistics, and the model is fitted and scored on the validation part.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; must contain every column in ``numeric_columns``.
    y : pandas.Series
        Target labels, positionally aligned with ``X``.
    knn : object
        Model exposing ``fit(X, y)`` and ``predict_proba(X)``; column 1 of the
        latter is used as the positive-class score. It is re-fitted on every fold.
    n_splits : int
        Number of folds.
    numeric_columns : list of str, optional
        Columns to cap and scale. Defaults to the columns
        'Balance', 'EstimatedSalary', 'Age', 'CreditScore', 'Tenure' and
        'NumOfProducts'.

    Returns
    -------
    (mean_auc, roc_auc_scores)
        The mean of the per-fold ROC AUC values and the list of those values.
    """
    if numeric_columns is None:
        numeric_columns = ['Balance', 'EstimatedSalary', 'Age', 'CreditScore', 'Tenure', 'NumOfProducts']
    numeric_columns = list(numeric_columns)
    numeric_indices = [X.columns.get_loc(col) for col in numeric_columns]

    # Manually split data into K folds
    folds = manual_kfold_split(X, y, n_splits)
    roc_auc_scores = []

    for train_index, test_index in folds:
        X_train, X_val = X.iloc[train_index].to_numpy(), X.iloc[test_index].to_numpy()
        y_train, y_val = y.iloc[train_index].to_numpy(), y.iloc[test_index].to_numpy()

        # Cap outliers with the z-score method.
        # NOTE(review): the validation part is capped with its own mean/std,
        # not the training part's -- confirm this is intended.
        X_train = detect_outliers_zscore(pd.DataFrame(X_train, columns=X.columns), numeric_columns).to_numpy()
        X_val = detect_outliers_zscore(pd.DataFrame(X_val, columns=X.columns), numeric_columns).to_numpy()

        # Scale numeric columns
        X_train, X_val = scale_data(X_train, X_val, numeric_indices)

        # Train the KNN model on the current fold
        knn.fit(X_train, y_train)

        # Predict probabilities for the positive class
        y_prob = knn.predict_proba(X_val)[:, 1]

        # Compute ROC AUC score for the current fold using the manual implementation
        roc_auc_scores.append(roc_auc_manual(y_val, y_prob))

    # Return the average ROC AUC score across all folds and individual scores
    mean_auc = np.mean(roc_auc_scores)
    return mean_auc, roc_auc_scores

# Testing Model - 5 Fold CV, then 10 Fold CV
# The two runs differ only in the number of folds and the output file name,
# so they share one loop body instead of two copy-pasted pipelines.
# Adjust DATA_DIR when running on another machine.
DATA_DIR = '/Users/anneke/GitHub/abvo138-assignment-5'
TRAIN_PATH = f'{DATA_DIR}/train.csv'
TEST_PATH = f'{DATA_DIR}/test.csv'

for n_splits, submission_name in [(5, 'submission.csv'), (10, '10submission.csv')]:
    # Step 1: Reload and preprocess the data so each run starts from fresh frames
    X, y, X_test = preprocess_data(TRAIN_PATH, TEST_PATH)
    numeric_columns = ['Balance', 'EstimatedSalary', 'Age', 'CreditScore', 'Tenure', 'NumOfProducts']
    numeric_indices = [X.columns.get_loc(col) for col in numeric_columns]

    # Step 2: Hyperparameter tuning with cross-validation
    best_k = None
    best_score = float('-inf')  # any real score, including 0, beats the initial value

    for k in range(1, 21):  # Range for 'k'
        knn = KNN(k=k, distance_metric='euclidean')
        score, _ = cross_validate(X, y, knn, n_splits=n_splits)
        print(f"k={k}, ROC AUC={score:.4f}")

        if score > best_score:
            best_score = score
            best_k = k

    print(f"Best k: {best_k}, Best ROC AUC: {best_score:.4f}")

    # Step 3: Final training on full dataset (Scaling without StandardScaler)
    # NOTE(review): cross_validate caps outliers before scaling, but this final
    # fit only scales -- confirm the difference is intended.
    X_scaled, X_test_scaled = scale_data(X.to_numpy(), X_test.to_numpy(), numeric_indices)

    # Step 4: Train the final KNN model with best 'k' value
    knn = KNN(k=best_k, distance_metric='euclidean')
    knn.fit(X_scaled, y.to_numpy())  # Train the model on scaled training data

    # Step 5: Predict probabilities for the test set
    test_probabilities = knn.predict_proba(X_test_scaled)[:, 1]

    # Step 6: Create the submission file
    test_data = pd.read_csv(TEST_PATH)
    test_ids = test_data['id']
    submission = pd.DataFrame({'id': test_ids, 'Exited': test_probabilities})
    submission.to_csv(f'{DATA_DIR}/{submission_name}', index=False)

    print(f"Submission file '{submission_name}' has been created!")

k=1, ROC AUC=0.7592
k=2, ROC AUC=0.8311
k=3, ROC AUC=0.8540
k=4, ROC AUC=0.8682
k=5, ROC AUC=0.8796
k=6, ROC AUC=0.8856
k=7, ROC AUC=0.8930
k=8, ROC AUC=0.8938
k=9, ROC AUC=0.8993
k=10, ROC AUC=0.8993
k=11, ROC AUC=0.9011
k=12, ROC AUC=0.9040
k=13, ROC AUC=0.9057
k=14, ROC AUC=0.9066
k=15, ROC AUC=0.9092
k=16, ROC AUC=0.9094
k=17, ROC AUC=0.9102
k=18, ROC AUC=0.9111
k=19, ROC AUC=0.9117
k=20, ROC AUC=0.9145
Best k: 20, Best ROC AUC: 0.9145
Submission file 'submission.csv' has been created!
k=1, ROC AUC=0.7625
k=2, ROC AUC=0.8355
k=3, ROC AUC=0.8573
k=4, ROC AUC=0.8700
k=5, ROC AUC=0.8804
k=6, ROC AUC=0.8907
k=7, ROC AUC=0.8934
k=8, ROC AUC=0.8936
k=9, ROC AUC=0.8979
k=10, ROC AUC=0.9016
k=11, ROC AUC=0.9037
k=12, ROC AUC=0.9051
k=13, ROC AUC=0.9044
k=14, ROC AUC=0.9090
k=15, ROC AUC=0.9087


In [46]:
# Testing Model - Try 20 K
# Adjust DATA_DIR when running on another machine.
DATA_DIR = '/Users/anneke/GitHub/abvo138-assignment-5'
TRAIN_PATH = f'{DATA_DIR}/train.csv'
TEST_PATH = f'{DATA_DIR}/test.csv'

# Step 1: Load and preprocess the data
X, y, X_test = preprocess_data(TRAIN_PATH, TEST_PATH)
numeric_columns = ['Balance', 'EstimatedSalary', 'Age', 'CreditScore', 'Tenure', 'NumOfProducts']
numeric_indices = [X.columns.get_loc(col) for col in numeric_columns]

# Step 2: No tuning in this cell -- k is fixed by hand
best_k = 20

# Step 3: Final training on full dataset (Scaling without StandardScaler)
# Apply scaling using the custom scaling function
X_scaled, X_test_scaled = scale_data(X.to_numpy(), X_test.to_numpy(), numeric_indices)

# Step 4: Train the final KNN model with the chosen 'k' value
# (uses best_k so the setting above is the single place to change it)
knn = KNN(k=best_k, distance_metric='euclidean')
knn.fit(X_scaled, y.to_numpy())  # Train the model on scaled training data

# Step 5: Predict probabilities for the test set
test_probabilities = knn.predict_proba(X_test_scaled)[:, 1]

# Step 6: Create the submission file
test_data = pd.read_csv(TEST_PATH)
test_ids = test_data['id']
submission = pd.DataFrame({'id': test_ids, 'Exited': test_probabilities})
submission.to_csv(f'{DATA_DIR}/20ksubmission.csv', index=False)

print("Submission file '20ksubmission.csv' has been created!")

Submission file '20ksubmission.csv' has been created!
