In [6]:
import numpy as np
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')
BASE_PATH = "/content/drive/My Drive/Fall 2024/CS506/HW5/"


Mounted at /content/drive


In [7]:

class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = X.astype(np.float64)
        self.y_train = y.astype(np.float64)

    def predict_proba(self, X, batch_size=1000):
        X = X.astype(np.float64)
        n_samples = X.shape[0]
        probabilities = np.zeros(n_samples)

        for i in range(0, n_samples, batch_size):
            end = min(i + batch_size, n_samples)
            batch = X[i:end]
            batch_distances = self.compute_distances(batch)
            k_indices = np.argpartition(batch_distances, self.k, axis=1)[:, :self.k]
            k_nearest_labels = self.y_train[k_indices]
            probabilities[i:end] = np.mean(k_nearest_labels, axis=1)

        return probabilities

    def compute_distances(self, X):
        if self.distance_metric == 'euclidean':
            return np.sqrt(((self.X_train[:, np.newaxis] - X) ** 2).sum(axis=2)).T
        elif self.distance_metric == 'manhattan':
            return np.abs(self.X_train[:, np.newaxis] - X).sum(axis=2).T
        else:
            raise ValueError("Unsupported distance metric.")



In [8]:
# Define data preprocessing function

def preprocess_data(train_data, test_data):
    y_train = train_data['Exited'].values
    X_train = train_data.drop(columns=['Exited', 'id', 'CustomerId', 'Surname'])
    test_data = test_data.drop(columns=['id', 'CustomerId', 'Surname'])

    combined_data = pd.concat([X_train, test_data])

    for column in combined_data.select_dtypes(include=['float64', 'int64']).columns:
        combined_data[column] = combined_data[column].fillna(combined_data[column].median())

    combined_data = pd.get_dummies(combined_data, columns=['Geography', 'Gender'], drop_first=True)

    scaler = lambda x: (x - x.mean()) / x.std()
    numerical_features = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary']
    combined_data[numerical_features] = combined_data[numerical_features].apply(scaler)

    X_train = combined_data.iloc[:len(train_data)].values
    X_test = combined_data.iloc[len(train_data):].values

    return X_train.astype(np.float64), y_train.astype(np.float64), X_test.astype(np.float64)


In [9]:
# Define cross-validation function

def cross_validate(X, y, knn, n_splits=5):
    fold_size = min(1000, len(X) // n_splits)  # Use at most 1000 samples per fold
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    auc_scores = []

    for i in range(n_splits):
        start = i * fold_size
        end = (i + 1) * fold_size
        val_indices = indices[start:end]
        train_indices = np.concatenate([indices[:start], indices[end:]])

        X_train, X_val = X[train_indices], X[val_indices]
        y_train, y_val = y[train_indices], y[val_indices]

        knn.fit(X_train, y_train)
        predictions = knn.predict_proba(X_val)
        auc_score = calculate_auc(y_val, predictions)
        auc_scores.append(auc_score)

    return np.mean(auc_scores)


In [10]:
def calculate_auc(y_true, y_pred):
    positive_scores = y_pred[y_true == 1]
    negative_scores = y_pred[y_true == 0]

    auc = 0
    for pos_score in positive_scores:
        auc += (negative_scores < pos_score).mean()

    return auc / len(positive_scores)

# Load the datasets
train_data = pd.read_csv(BASE_PATH + 'train.csv')
test_data = pd.read_csv(BASE_PATH + 'test.csv')

# Preprocess the data
X, y, X_test = preprocess_data(train_data, test_data)

print(f"X_train shape: {X.shape}, X_train type: {type(X)}")
print(f"y_train shape: {y.shape}, y_train type: {type(y)}")
print(f"X_test shape: {X_test.shape}, X_test type: {type(X_test)}")

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)
print("Cross-validation AUC scores:", cv_scores)

# Train on full dataset and make predictions on test set
knn.fit(X, y)
test_probabilities = knn.predict_proba(X_test)

# Save test predictions to CSV
submission = pd.DataFrame({'id': test_data['id'], 'Exited': test_probabilities})
submission.to_csv(BASE_PATH + 'submissions.csv', index=False)

X_train shape: (15000, 11), X_train type: <class 'numpy.ndarray'>
y_train shape: (15000,), y_train type: <class 'numpy.ndarray'>
X_test shape: (10000, 11), X_test type: <class 'numpy.ndarray'>
Cross-validation AUC scores: 0.8099790059436623
