In [118]:
import numpy as np
import pandas as pd

In [119]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """
        Store the training data for KNN.
        
        Parameters:
        X (numpy array): Features of the training data
        y (numpy array): Target values of the training data (churn or not)
        """
        # Store training data
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """
        Predict the class for each data point in X.
        
        Parameters:
        X (numpy array): Features of the test data
        
        Returns:
        numpy array: Predicted classes for the test data
        """
        predictions = []
        
        for x in X:
            # Compute distances between x and all points in the training data
            distances = [self.compute_distance(x, x_train) for x_train in self.X_train]
            
            # Get the indices of the k nearest neighbors
            k_nearest_indices = np.argsort(distances)[:self.k]
            
            # Find the labels of the k nearest neighbors
            k_nearest_labels = [self.y_train[i] for i in k_nearest_indices]
            
            # Perform majority vote for classification
            prediction = max(set(k_nearest_labels), key=k_nearest_labels.count)
            predictions.append(prediction)
        
        return np.array(predictions)
    
    def predict_proba(self, X):
        """
        Predict the probability of each class for each data point in X.
        
        Parameters:
        X (numpy array): Features of the test data
        
        Returns:
        numpy array: Probabilities for class 1 (churn) for the test data
        """
        probabilities = []
        
        for x in X:
            # Compute distances between x and all points in the training data
            distances = [self.compute_distance(x, x_train) for x_train in self.X_train]
            
            # Get the indices of the k nearest neighbors
            k_nearest_indices = np.argsort(distances)[:self.k]
            
            # Find the labels of the k nearest neighbors
            k_nearest_labels = [self.y_train[i] for i in k_nearest_indices]
            
            # Calculate the probability of belonging to class 1 (churn)
            prob_class_1 = np.sum(k_nearest_labels) / self.k  # Since labels are binary (0 or 1)
            probabilities.append(prob_class_1)
        
        return np.array(probabilities)

    def compute_distance(self, X1, X2):
        """
        Compute the distance between two points based on the specified metric.
        
        Parameters:
        X1, X2 (numpy arrays): Points between which the distance is computed
        
        Returns:
        float: The computed distance
        """
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2))
        elif self.distance_metric == 'cosine':  # Added cosine similarity
            dot_product = np.dot(X1, X2)
            norm_X1 = np.linalg.norm(X1)
            norm_X2 = np.linalg.norm(X2)
            
            return 1 - (dot_product / (norm_X1 * norm_X2))  # Convert similarity to distance
        
        else:
            raise ValueError("Unsupported distance metric. Use 'euclidean', 'manhattan', or 'cosine'.")


In [120]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    # Load the data
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    
    # Handle missing values
    # Fill missing numerical values with the mean
    numerical_columns = train_data.select_dtypes(include=[np.number]).columns
    
    # Exclude 'Exited' from numerical columns for test data
    numerical_columns_test = [col for col in numerical_columns if col != 'Exited']  
    
    train_data[numerical_columns] = train_data[numerical_columns].fillna(train_data[numerical_columns].mean())
    test_data[numerical_columns_test] = test_data[numerical_columns_test].fillna(test_data[numerical_columns_test].mean())

    # Handle categorical variables
    # Convert categorical variables (Geography, Gender) into dummy variables (one-hot encoding)
    categorical_columns = ['Geography', 'Gender']
    train_data = pd.get_dummies(train_data, columns=categorical_columns, drop_first=True)
    test_data = pd.get_dummies(test_data, columns=categorical_columns, drop_first=True)

    # Ensure the same columns exist in train and test data after encoding (handle missing categories)
    train_columns = set(train_data.columns)
    test_columns = set(test_data.columns)
    missing_cols_in_test = train_columns - test_columns
    missing_cols_in_train = test_columns - train_columns
    
    for col in missing_cols_in_test:
        test_data[col] = 0
    
    for col in missing_cols_in_train:
        train_data[col] = 0
    
    # Scale numerical features
    features_to_scale = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
    for feature in features_to_scale:
        train_mean = train_data[feature].mean()
        train_std = train_data[feature].std()
        
        train_data[feature] = (train_data[feature] - train_mean) / train_std
        test_data[feature] = (test_data[feature] - train_mean) / train_std  # Use train mean and std for scaling test set

    # Separate features and target variable
    X_train = train_data.drop(columns=['CustomerId', 'Surname', 'Exited'])
    y_train = train_data['Exited']
    
    # Test data might not have 'Exited', so exclude it from the test set handling
    X_test = test_data.drop(columns=['CustomerId', 'Surname'], errors='ignore')
    
    return X_train, y_train, X_test, None


In [121]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    """
    Perform cross-validation on the provided KNN model.
    
    Parameters:
    X (numpy array): Features of the dataset
    y (numpy array): Target labels of the dataset
    knn (KNN object): The KNN model to validate
    n_splits (int): Number of cross-validation splits (default is 5)
    
    Returns:
    numpy array: Array of ROC AUC scores for each split
    """
    # Shuffle the data randomly before splitting
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    # Use .iloc to access rows by integer location
    X, y = X.iloc[indices], y.iloc[indices] # Use .iloc for integer-based indexing
    
    # Split the data into `n_splits` folds
    fold_size = X.shape[0] // n_splits
    roc_auc_scores = []

    for i in range(n_splits):
        # Define start and end of the fold
        start = i * fold_size
        end = (i + 1) * fold_size if i != n_splits - 1 else X.shape[0]
        
        # Create training and validation sets
        X_val, y_val = X.iloc[start:end], y.iloc[start:end] # Use .iloc for integer-based indexing
        X_train = pd.concat([X.iloc[:start], X.iloc[end:]], axis=0) # Use .iloc for integer-based indexing and pd.concat
        y_train = pd.concat([y.iloc[:start], y.iloc[end:]], axis=0) # Use .iloc for integer-based indexing and pd.concat
        
        # Train the model on the training set
        knn.fit(X_train.values, y_train.values) # Convert to NumPy arrays
        
        # Predict probabilities for the validation set
        y_pred = knn.predict(X_val.values) # Convert to NumPy array
        
        # Compute the ROC AUC score
        roc_auc = roc_auc_score(y_val.values, y_pred)  # Convert to NumPy array
        roc_auc_scores.append(roc_auc)

    return np.array(roc_auc_scores)

def roc_auc_score(y_true, y_pred):
    """
    Compute the ROC AUC score.
    
    Parameters:
    y_true (numpy array): True binary labels
    y_pred (numpy array): Predicted binary labels or probabilities
    
    Returns:
    float: ROC AUC score
    """
    # Sort by predicted probabilities
    sorted_indices = np.argsort(y_pred)
    y_true_sorted = y_true[sorted_indices]
    
    # Compute true positive rate (TPR) and false positive rate (FPR)
    P = np.sum(y_true)  # Positive samples
    N = len(y_true) - P  # Negative samples
    tpr = np.cumsum(y_true_sorted) / P  # True positive rate
    fpr = np.cumsum(1 - y_true_sorted) / N  # False positive rate
    
    # Compute ROC AUC using trapezoidal rule
    auc = np.trapz(tpr, fpr)
    return auc


In [122]:
# Hyperparameter tuning function
def hyperparameter_tuning(X, y, k_values, distance_metrics, n_splits=5):
    """
    Perform hyperparameter tuning to find the best k and distance metric.
    
    Parameters:
    X (numpy array): Features of the dataset
    y (numpy array): Target labels of the dataset
    k_values (list): List of k values to try
    distance_metrics (list): List of distance metrics to try
    n_splits (int): Number of cross-validation splits
    
    Returns:
    dict: Dictionary containing the best k, distance metric, and corresponding ROC AUC score
    """
    best_score = -1
    best_k = None
    best_distance_metric = None
    
    for k in k_values:
        for metric in distance_metrics:
            # Create the KNN model with current hyperparameters
            knn = KNN(k=k, distance_metric=metric)
            
            # Perform cross-validation
            cv_scores = cross_validate(X, y, knn, n_splits=n_splits)
            mean_cv_score = np.mean(cv_scores)
            
            print(f"k={k}, metric={metric}, ROC AUC: {mean_cv_score}")
            
            # Keep track of the best hyperparameters based on ROC AUC score
            if mean_cv_score > best_score:
                best_score = mean_cv_score
                best_k = k
                best_distance_metric = metric
    
    return {
        "best_k": best_k,
        "best_distance_metric": best_distance_metric,
        "best_score": best_score
    }

# Load and preprocess data
X, y, X_test, _ = preprocess_data('train.csv', 'test.csv')

# List of k values and distance metrics to explore
k_values = [5, 7, 9, 13]
distance_metrics = ['euclidean', 'manhattan', 'cosine']

# Hyperparameter tuning
best_params = hyperparameter_tuning(X, y, k_values, distance_metrics)
print(f"Best parameters: k={best_params['best_k']}, distance_metric={best_params['best_distance_metric']}, ROC AUC={best_params['best_score']}")

# Create KNN model with optimal hyperparameters
knn = KNN(k=best_params['best_k'], distance_metric=best_params['best_distance_metric'])

# Perform cross-validation (optional, already done during tuning)
cv_scores = cross_validate(X, y, knn)
print("Cross-validation scores:", cv_scores)

# Train on full dataset with optimal hyperparameters
knn.fit(X.values, y.values)  # Convert to NumPy arrays

# Make predictions on the test set
test_predictions = knn.predict(X_test.values)  # Convert to NumPy array

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['CustomerId'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

k=5, metric=euclidean, ROC AUC: 0.4783215531073274
k=5, metric=manhattan, ROC AUC: 0.4631602593362307
k=5, metric=cosine, ROC AUC: 0.22542334240343473
k=7, metric=euclidean, ROC AUC: 0.499042168568114
k=7, metric=manhattan, ROC AUC: 0.47297118997315335
k=7, metric=cosine, ROC AUC: 0.2185755374193577
k=9, metric=euclidean, ROC AUC: 0.49225382862815314
k=9, metric=manhattan, ROC AUC: 0.48999203244049755
k=9, metric=cosine, ROC AUC: 0.21782646777265388
k=13, metric=euclidean, ROC AUC: 0.4992664858410921
k=13, metric=manhattan, ROC AUC: 0.49922940058316134
k=13, metric=cosine, ROC AUC: 0.22445252329455964
Best parameters: k=13, distance_metric=euclidean, ROC AUC=0.4992664858410921
Cross-validation scores: [0.50367596 0.50462345 0.49159629 0.50654061 0.49544699]


ValueError: operands could not be broadcast together with shapes (13,) (12,) 