In [39]:
import numpy as np
import pandas as pd

In [40]:
import numpy as np

class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # Store the training data as NumPy arrays
        self.X_train = np.array(X)
        self.y_train = np.array(y, dtype=int)  # Ensure y_train is integer type

    def predict(self, X, return_probabilities=True):
        # Predict the label or probability for each example in the test set
        predictions = [self._predict(x, return_probabilities) for x in np.array(X)]
        return np.array(predictions)

    def _predict(self, x, return_probabilities):
        # Compute the distances between x and all examples in the training set
        distances = self.compute_distances(x, self.X_train)
        
        # Sort by distance and return the indices of the k nearest neighbors
        k_indices = np.argsort(distances)[:self.k]
        
        # Get the labels of the k nearest samples
        k_nearest_labels = self.y_train[k_indices]
        
        if return_probabilities:
            # Return the proportion of neighbors that are class 1 as the probability
            return np.mean(k_nearest_labels)
        else:
            # Return the most common class among the k neighbors
            return np.bincount(k_nearest_labels).argmax()

    def compute_distances(self, x, X_train):
        """ Efficient vectorized distance computation. """
        if self.distance_metric == 'euclidean':
            # Vectorized Euclidean distance computation
            return np.sqrt(np.sum((X_train - x) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            # Vectorized Manhattan distance computation
            return np.sum(np.abs(X_train - x), axis=1)


In [41]:
import pandas as pd
import numpy as np

def z_score_scaler(X, mean=None, std=None):
    """ Scale features using z-score normalization. """
    if mean is None:
        mean = np.mean(X, axis=0)
    if std is None:
        std = np.std(X, axis=0)
    return (X - mean) / std, mean, std

def one_hot_encode(df, column):
    """ One-hot encode a single categorical column. """
    dummies = pd.get_dummies(df[column], prefix=column, drop_first=True)
    df = pd.concat([df, dummies], axis=1)
    df.drop(column, axis=1, inplace=True)
    return df

def preprocess_data(train_path, test_path):
    """ Preprocess the train and test datasets according to the task requirements. """
    
    # Load the datasets
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    
    # Drop irrelevant columns (CustomerId, Surname, id)
    train_data = train_data.drop(columns=["CustomerId", "Surname", "id"])
    test_data = test_data.drop(columns=["CustomerId", "Surname", "id"])
    
    # Handle missing values for numerical columns (fill with median)
    # Selecting only numerical columns for missing value imputation
    numeric_columns_train = train_data.select_dtypes(include=[np.number]).columns.drop("Exited")
    numeric_columns_test = test_data.select_dtypes(include=[np.number]).columns
    
    # Fill missing values with median for numeric columns
    train_data[numeric_columns_train] = train_data[numeric_columns_train].fillna(train_data[numeric_columns_train].median())
    test_data[numeric_columns_test] = test_data[numeric_columns_test].fillna(test_data[numeric_columns_test].median())
    
    # One-hot encode categorical variables (Geography, Gender) for both train and test data
    train_data = one_hot_encode(train_data, "Geography")
    train_data = one_hot_encode(train_data, "Gender")
    
    test_data = one_hot_encode(test_data, "Geography")
    test_data = one_hot_encode(test_data, "Gender")
    
    # Align test_data with train_data columns by using column names (excluding 'Exited')
    # We don't want to use row indices for reindexing columns, so we use train_data's columns except for 'Exited'.
    test_data = test_data.reindex(columns=train_data.drop(columns=["Exited"]).columns, fill_value=0)
    
    # Separate features (X) and target (y) for training data
    X_train = train_data.drop(columns=["Exited"])  # Training features
    y_train = train_data["Exited"]
    
    # Scale the features using z-score normalization (fit on train, transform on both train and test)
    X_train_scaled, mean_train, std_train = z_score_scaler(X_train)
    X_test_scaled, _, _ = z_score_scaler(test_data, mean_train, std_train)
    
    return X_train_scaled, y_train, X_test_scaled


In [42]:
import numpy as np

def accuracy_score_manual(y_true, y_pred):
    """ Manually calculate accuracy. """
    return np.sum(y_true == y_pred) / len(y_true)

def precision_score_manual(y_true, y_pred):
    """ Manually calculate precision. """
    true_positives = np.sum((y_pred == 1) & (y_true == 1))
    predicted_positives = np.sum(y_pred == 1)
    if predicted_positives == 0:
        return 0
    return true_positives / predicted_positives

def recall_score_manual(y_true, y_pred):
    """ Manually calculate recall. """
    true_positives = np.sum((y_pred == 1) & (y_true == 1))
    actual_positives = np.sum(y_true == 1)
    if actual_positives == 0:
        return 0
    return true_positives / actual_positives

def f1_score_manual(y_true, y_pred):
    """ Manually calculate F1 score. """
    precision = precision_score_manual(y_true, y_pred)
    recall = recall_score_manual(y_true, y_pred)
    if (precision + recall) == 0:
        return 0
    return 2 * (precision * recall) / (precision + recall)

def roc_auc_score_manual(y_true, y_pred):
    """ Manually calculate ROC AUC score. """
    # For simplicity, we'll approximate ROC AUC using a basic method:
    # Here, we treat it as the proportion of correct rankings of positive vs. negative instances
    pos = y_true == 1
    neg = y_true == 0
    correct_rankings = 0
    total_pairs = np.sum(pos) * np.sum(neg)
    
    if total_pairs == 0:
        return 0.5  # No positive or no negative examples

    for i in range(len(y_true)):
        if pos[i]:
            correct_rankings += np.sum(y_pred[i] > y_pred[neg])

    return correct_rankings / total_pairs

def cross_validate(X, y, knn, n_splits=5):
    # Shuffle and split the data into k folds
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    fold_size = X.shape[0] // n_splits
    folds = [indices[i * fold_size: (i + 1) * fold_size] for i in range(n_splits)]
    
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    roc_auc_scores = []
    
    # Cross-validation loop
    for i in range(n_splits):
        # Create training and validation sets
        val_indices = folds[i]
        train_indices = np.concatenate([folds[j] for j in range(n_splits) if j != i])
        
        # Use .iloc[] to access rows by index for DataFrames
        X_train, X_val = X.iloc[train_indices], X.iloc[val_indices]
        y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]
        
        # Train the KNN model on the training set
        knn.fit(X_train, y_train)
        
        # Predict on the validation set
        y_pred = knn.predict(X_val)
        
        # Calculate evaluation metrics manually
        accuracy_scores.append(accuracy_score_manual(y_val.values, y_pred))
        precision_scores.append(precision_score_manual(y_val.values, y_pred))
        recall_scores.append(recall_score_manual(y_val.values, y_pred))
        f1_scores.append(f1_score_manual(y_val.values, y_pred))
        roc_auc_scores.append(roc_auc_score_manual(y_val.values, y_pred))
    
    # Return the average of each metric across the folds
    return {
        'accuracy': np.mean(accuracy_scores),
        'precision': np.mean(precision_scores),
        'recall': np.mean(recall_scores),
        'f1': np.mean(f1_scores),
        'roc_auc': np.mean(roc_auc_scores)
    }


In [43]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Hyperparameter tuning (finding the best k and distance metric)
best_k = None
best_distance_metric = None
best_cv_score = -1

# Define possible values for hyperparameters
k_values = [3, 5, 7, 9, 11]
distance_metrics = ['euclidean', 'manhattan']

# Try each combination of k and distance metric
for k in k_values:
    for distance_metric in distance_metrics:
        knn = KNN(k=k, distance_metric=distance_metric)
        cv_scores = cross_validate(X, y, knn)
        print(f"CV Scores for k={k}, distance_metric={distance_metric}: {cv_scores}")
        
        # Check if this is the best model so far
        if cv_scores['accuracy'] > best_cv_score:
            best_cv_score = cv_scores['accuracy']
            best_k = k
            best_distance_metric = distance_metric

print(f"Best hyperparameters: k={best_k}, distance_metric={best_distance_metric}")

# Train the final model with the best hyperparameters
knn = KNN(k=best_k, distance_metric=best_distance_metric)
knn.fit(X, y)

# Predict on the test set
test_predictions = knn.predict(X_test)

# Save the test predictions
submission = pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions})
submission.to_csv('submissions.csv', index=False)

print("Submission file saved as 'submissions.csv'.")


CV Scores for k=3, distance_metric=euclidean: {'accuracy': np.float64(0.7056000000000001), 'precision': np.float64(0.8496913972242599), 'recall': np.float64(0.3297503706522909), 'f1': np.float64(0.4747523389600114), 'roc_auc': np.float64(0.743217109751015)}
CV Scores for k=3, distance_metric=manhattan: {'accuracy': np.float64(0.701), 'precision': np.float64(0.8440713038774126), 'recall': np.float64(0.31810782987367847), 'f1': np.float64(0.4619691200943059), 'roc_auc': np.float64(0.7495758127917269)}
CV Scores for k=5, distance_metric=euclidean: {'accuracy': np.float64(0.6288666666666666), 'precision': np.float64(0.8860359489403414), 'recall': np.float64(0.22536023671600036), 'f1': np.float64(0.359143864430796), 'roc_auc': np.float64(0.804181393663096)}
CV Scores for k=5, distance_metric=manhattan: {'accuracy': np.float64(0.6267333333333334), 'precision': np.float64(0.9015719188325313), 'recall': np.float64(0.2157985241114663), 'f1': np.float64(0.3478601284990314), 'roc_auc': np.float64