In [2]:
import numpy as np
import pandas as pd
from collections import Counter

In [None]:
import

In [3]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import time
import matplotlib.pyplot as plt

In [11]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """Store the training data."""
        self.X_train = np.array(X)
        self.y_train = np.array(y)

    def predict(self, X):
        """Predict the labels for the given input samples."""
        X = np.array(X)
        predictions = [self._predict_single(x) for x in X]
        return np.array(predictions)

    def predict_proba(self, X):
        """Predict the probabilities for each class."""
        X = np.array(X)
        prob_predictions = [self._predict_proba_single(x) for x in X]
        return np.array(prob_predictions)

    def _predict_single(self, x):
        """Predict the label for a single input sample."""
        distances = self._compute_distances(x)
        nearest_indices = np.argsort(distances)[:self.k]
        nearest_labels = self.y_train[nearest_indices]
        most_common = Counter(nearest_labels).most_common(1)
        return most_common[0][0]

    def _predict_proba_single(self, x):
        """Predict the probability for a single input sample."""
        distances = self._compute_distances(x)
        nearest_indices = np.argsort(distances)[:self.k]
        nearest_labels = self.y_train[nearest_indices]
        count = Counter(nearest_labels)
        # Return probabilities for each class
        return [count.get(0, 0) / self.k, count.get(1, 0) / self.k]

    def _compute_distances(self, x):
        """Compute the distances from a single point to all training data points."""
        if self.distance_metric == 'euclidean':
            return np.linalg.norm(self.X_train - x, axis=1)
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(self.X_train - x), axis=1)
        else:
            raise ValueError(f"Unsupported distance metric: {self.distance_metric}")


In [13]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    # Load datasets
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Separate features and target
    X = train_data.drop(['Exited', 'id', 'CustomerID', 'Surname'], axis=1, errors='ignore')
    y = train_data['Exited']  # Target variable
    X_test = test_data.drop(['id', 'CustomerID'], axis=1, errors='ignore')

    # One-hot encode character variables with fewer than 5 unique values
    cat_columns = X.select_dtypes(include=['object']).columns
    small_cat_columns = [col for col in cat_columns if X[col].nunique() < 5]

    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    X_encoded = encoder.fit_transform(X[small_cat_columns])
    X_test_encoded = encoder.transform(X_test[small_cat_columns])

    # Create DataFrames from encoded data
    X_encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(small_cat_columns))
    X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoder.get_feature_names_out(small_cat_columns))

    # Drop original categorical columns
    X = X.drop(small_cat_columns, axis=1)
    X_test = X_test.drop(small_cat_columns, axis=1)

    # Concatenate encoded columns with original data
    X = pd.concat([X, X_encoded_df], axis=1)
    X_test = pd.concat([X_test, X_test_encoded_df], axis=1)

    # **Fix the KeyError here**: Make sure both X and X_test have the same columns
    X_test = X_test.reindex(columns=X.columns, fill_value=0)

    # Scale numeric columns
    numeric_columns = ['Balance', 'EstimatedSalary', 'Age', 'CreditScore']
    scaler = StandardScaler()
    X[numeric_columns] = scaler.fit_transform(X[numeric_columns])
    X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

    # Handle missing values for numeric and categorical columns separately
    for col in X.columns:
        if X[col].dtype in ['int64', 'float64']:
            X[col].fillna(X[col].mean(), inplace=True)
        else:
            X[col].fillna(X[col].mode()[0], inplace=True)  # Fill with the mode (most common value)

    for col in X_test.columns:
        if X_test[col].dtype in ['int64', 'float64']:
            X_test[col].fillna(X_test[col].mean(), inplace=True)
        else:
            X_test[col].fillna(X_test[col].mode()[0], inplace=True)

    return X, y, X_test

# Usage example:
X, y, X_test = preprocess_data('train.csv', 'test.csv')




In [37]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    # Compute ROC AUC scores
    # Ensure input is in NumPy format
    X = X.to_numpy() if isinstance(X, pd.DataFrame) else X
    y = y.to_numpy() if isinstance(y, pd.Series) else y
    
    """Perform cross-validation and compute ROC AUC scores."""
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    roc_auc_scores = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Train the KNN model on the current fold
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)

        # Compute ROC AUC score for the current fold
        roc_auc = roc_auc_score(y_test, y_pred)
        roc_auc_scores.append(roc_auc)

    # Return the average ROC AUC score across all folds
    return np.mean(roc_auc_scores)          

In [15]:
# Manual Cross-Validation with .iloc fix for pandas DataFrame
def manual_cross_validate(X, y, knn, n_splits=5):
    """Manually perform cross-validation and compute ROC AUC scores."""
    n_samples = len(X)
    fold_size = n_samples // n_splits
    indices = np.arange(n_samples)
    np.random.shuffle(indices)

    roc_auc_scores = []

    for fold in range(n_splits):
        start = fold * fold_size
        end = (fold + 1) * fold_size if fold != n_splits - 1 else n_samples
        test_indices = indices[start:end]
        train_indices = np.concatenate((indices[:start], indices[end:]))

        # Use .iloc to access rows by positional indices
        X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
        y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

        # Train KNN on the training fold
        knn.fit(X_train, y_train)

        # Get probabilities for ROC calculation
        y_proba = knn.predict_proba(X_test)[:, 1]  # Get the probability for class 1

        # Manually compute ROC AUC for the current fold
        roc_auc = compute_manual_roc_auc(y_test, y_proba)
        roc_auc_scores.append(roc_auc)

    return np.mean(roc_auc_scores)


# Manual ROC and AUC Computation
def compute_manual_roc_auc(y_true, y_scores):
    """Manually compute ROC AUC score."""
    thresholds = np.linspace(0, 1, 100)  # 100 thresholds from 0 to 1
    tpr_list = []  # True Positive Rate
    fpr_list = []  # False Positive Rate

    for threshold in thresholds:
        # Apply the threshold to make binary predictions
        y_pred = (y_scores >= threshold).astype(int)

        # Calculate TPR and FPR
        tp = np.sum((y_pred == 1) & (y_true == 1))  # True Positives
        fp = np.sum((y_pred == 1) & (y_true == 0))  # False Positives
        fn = np.sum((y_pred == 0) & (y_true == 1))  # False Negatives
        tn = np.sum((y_pred == 0) & (y_true == 0))  # True Negatives

        tpr = tp / (tp + fn) if (tp + fn) > 0 else 0  # Sensitivity / Recall
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0  # Fall-out / 1-Specificity

        tpr_list.append(tpr)
        fpr_list.append(fpr)

    # Compute AUC using the trapezoidal rule
    auc = np.trapz(tpr_list, fpr_list)
    return auc

In [16]:
# Load and preprocess data
X, y, X_test = preprocess_data('/Users/anneke/GitHub/abvo138-assignment-5/train.csv', 
                               '/Users/anneke/GitHub/abvo138-assignment-5/test.csv')


# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_score = manual_cross_validate(X, y, knn)
print("Cross-validation ROC AUC score:", cv_score)

# TODO: hyperparamters tuning
# Hyperparameter tuning: Find the best k
best_k = None
best_score = 0

for k in range(1, 21):
    knn = KNN(k=k, distance_metric='euclidean')
    score = manual_cross_validate(X, y, knn)
    print(f"k={k}, ROC AUC={score:.4f}")

    if score > best_score:
        best_score = score
        best_k = k

print(f"Best k: {best_k}, Best ROC AUC: {best_score:.4f}")

# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric='euclidean')
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('/path/of/test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)



Cross-validation ROC AUC score: -0.5109773284662806
k=1, ROC AUC=-0.5076
k=2, ROC AUC=-0.5229
k=3, ROC AUC=-0.5184
k=4, ROC AUC=-0.5115
k=5, ROC AUC=-0.5070
k=6, ROC AUC=-0.5096
k=7, ROC AUC=-0.5078
k=8, ROC AUC=-0.5088
k=9, ROC AUC=-0.5102
k=10, ROC AUC=-0.5065
k=11, ROC AUC=-0.5146
k=12, ROC AUC=-0.5066
k=13, ROC AUC=-0.5103
k=14, ROC AUC=-0.5092
k=15, ROC AUC=-0.5110
k=16, ROC AUC=-0.5081
k=17, ROC AUC=-0.5022
k=18, ROC AUC=-0.5060
k=19, ROC AUC=-0.5048
k=20, ROC AUC=-0.5031
Best k: None, Best ROC AUC: 0.0000


FileNotFoundError: [Errno 2] No such file or directory: '/path/of/test.csv'

In [52]:
#######################################################################################################################
# LAB 5 CODE
#######################################################################################################################

import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.metrics import make_scorer, roc_auc_score

# Load train and test CSV files
train_data = pd.read_csv('/Users/anneke/GitHub/abvo138-assignment-5/train.csv')
test_data = pd.read_csv('/Users/anneke/GitHub/abvo138-assignment-5/test.csv')

# Drop id column from the training dataset
train_data = train_data.drop(['CustomerId', 'id'], axis=1)

# Separate features (X) and target (y) from training data
X_train = train_data.drop('Exited', axis=1)
y_train = train_data['Exited']


# Identify numerical and categorical columns
numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Create a Pipeline object that applies the standard scaler to the numerical columns
# and the one hot encoder to the categorical columns; then applies the knn classifier (k=5).
# The model should match the given output of this cell.
# model = TODO
# Define the transformations for numerical and categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Create a pipeline that applies the preprocessor and then the KNN classifier
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors= 5))
])

# Train the model on the training data
model.fit(X_train, y_train)

# Define the hyperparameter range for n_neighbors
n_neighbors_values = range(1, 21)  # Testing for neighbors from 1 to 20

# Store results
n_neighbors_list = []
auc_scores = []
times_per_sample = []

# Perform cross-validation over different values of n_neighbors
for n_neighbors in n_neighbors_values:
    # Same as previous code block.
    # model = TODO
    model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=n_neighbors))])

    # Measure the time taken for cross-validation
    start_time = time.time()

    # Perform cross-validation and calculate mean validation AUC score.
    # Perform 5-fold cross-validation using the cross_val_score function.
    # mean_validation_score = TODO
    # Perform 5-fold cross-validation using cross_val_score and calculate mean AUC score
    auc_scorer = make_scorer(roc_auc_score, needs_proba=True)
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring=auc_scorer)
    mean_validation_score = np.mean(cv_scores)

    # Calculate elapsed time and seconds per sample
    elapsed_time = time.time() - start_time
    seconds_per_sample = elapsed_time / len(X_train)

    # Store results for plotting
    n_neighbors_list.append(n_neighbors)
    auc_scores.append(mean_validation_score)
    times_per_sample.append(seconds_per_sample)

    # Print out the validation accuracy, the value of n_neighbors, and the time per sample
    print(f'Validation Accuracy: {mean_validation_score:.4f} with n_neighbors={n_neighbors}')
    print(f'Time taken: {elapsed_time:.2f} seconds, Seconds per sample: {seconds_per_sample:.6f} seconds')

# Plotting results
fig, axs = plt.subplots(1, 2, figsize=(12, 6))

# Left plot: AUC score vs number of neighbors
axs[0].plot(n_neighbors_list, auc_scores, marker='o', linestyle='-', color='b')
axs[0].set_xlabel('Number of Neighbors')
axs[0].set_ylabel('Validation ROC-AUC Score')
axs[0].set_title('Validation ROC-AUC Score vs. Number of Neighbors')

# Right plot: Time per sample vs number of neighbors
axs[1].plot(n_neighbors_list, times_per_sample, color='r')
axs[1].set_xlabel('Number of Neighbors')
axs[1].set_ylabel('Time per Sample (seconds)')
axs[1].set_title('Time per Sample vs. Number of Neighbors')

plt.tight_layout()
plt.show()

# Output the best hyperparameter
best_n_neighbors = n_neighbors_list[np.argmax(auc_scores)]
best_score = np.max(auc_scores)
print(f'Best n_neighbors: {best_n_neighbors} with Validation Accuracy: {best_score:.4f}')


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/pipeline.py", line 423, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/pipeline.py", line 357, in _fit
    self._validate_steps()
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/pipeline.py", line 241, in _validate_steps
    raise TypeError(
TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' '<function preprocess_data at 0x7f7d7cd961f0>' (type <class 'function'>) doesn't


Validation Accuracy: 0.7527 with n_neighbors=1
Time taken: 22.22 seconds, Seconds per sample: 0.001481 seconds
