In [53]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from scipy.spatial import cKDTree
from google.colab import drive
drive.mount('/content/drive')


BASE_PATH = "/content/drive/My Drive/Fall 2024/CS506/HW5/"
TRAIN_PATH = BASE_PATH + "train.csv"
TEST_PATH = BASE_PATH + "test.csv"
SUBMISSION_PATH = BASE_PATH + "submissions.csv"

Mounted at /content/drive


In [54]:
class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        self.tree = cKDTree(X)

    def predict(self, X):
        distances, indices = self.tree.query(X, k=self.k)
        nearest_neighbors = self.y_train[indices]
        predictions = np.mean(nearest_neighbors, axis=1)
        return predictions

In [55]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Identify common columns
    common_columns = list(set(train_data.columns) & set(test_data.columns))
    common_columns.remove('id')  # Remove 'id' if it's in common columns

    # Select only common columns for both datasets
    train_data = train_data[common_columns + ['Exited']]
    test_data = test_data[common_columns]

    # Combine train and test for preprocessing
    all_data = pd.concat([train_data, test_data], axis=0)

    # Remove non-numeric columns
    numeric_columns = all_data.select_dtypes(include=[np.number]).columns.tolist()
    categorical_columns = ['Geography', 'Gender']

    # Keep only numeric and specified categorical columns
    columns_to_keep = numeric_columns + categorical_columns
    all_data = all_data[columns_to_keep]

    # Handle categorical variables
    all_data = pd.get_dummies(all_data, columns=categorical_columns, drop_first=True)

    # Scale numerical features
    scaler = StandardScaler()
    all_data[numeric_columns] = scaler.fit_transform(all_data[numeric_columns])

    # Split back into train and test
    train_data = all_data[:len(train_data)]
    test_data = all_data[len(train_data):]

    # Prepare X and y for train data
    X = train_data.drop(['Exited'], axis=1)
    y = train_data['Exited'].astype(int)  # Ensure 'Exited' is integer

    # Prepare X_test
    X_test = test_data.drop(['Exited'], axis=1) if 'Exited' in test_data.columns else test_data

    print(f"Features in training data: {X.columns.tolist()}")
    print(f"Features in test data: {X_test.columns.tolist()}")

    return X.values, y.values, X_test.values


In [56]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = []

    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_val)
        score = roc_auc_score(y_val, y_pred)
        scores.append(score)

    return scores

In [57]:
# Load and preprocess data
print("Loading and preprocessing data...")
X, y, X_test = preprocess_data(TRAIN_PATH, TEST_PATH)

# Create and evaluate model
knn = KNN(k=5)

# Perform cross-validation
print("Performing cross-validation...")
cv_scores = cross_validate(X, y, knn)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", np.mean(cv_scores))

# Hyperparameter tuning
print("Starting hyperparameter tuning...")
k_values = [3, 5, 7, 9, 11]
best_score = 0
best_k = 0

# Use a subset of data for faster tuning
X_subset, _, y_subset, _ = train_test_split(X, y, test_size=0.7, random_state=42)

for k in k_values:
    knn = KNN(k=k)
    scores = cross_validate(X_subset, y_subset, knn)
    mean_score = np.mean(scores)
    print(f"k={k}, score={mean_score}")
    if mean_score > best_score:
        best_score = mean_score
        best_k = k

print(f"Best parameters: k={best_k}")
print(f"Best tuning score: {best_score}")

# Print shapes before making predictions
print(f"Shape of X (training data): {X.shape}")
print(f"Shape of X_test (test data): {X_test.shape}")

# Train on full dataset with optimal hyperparameters and make predictions on test set
print("Training final model...")
knn = KNN(k=best_k)
knn.fit(X, y)
print("Making predictions...")
test_predictions = knn.predict(X_test)

# Save test predictions
test_data = pd.read_csv(TEST_PATH)
submission = pd.DataFrame({'id': test_data['id'], 'Exited': test_predictions})
submission.to_csv(SUBMISSION_PATH, index=False)

print(f"Submission saved to {SUBMISSION_PATH}")

Loading and preprocessing data...
Features in training data: ['CreditScore', 'NumOfProducts', 'HasCrCard', 'Age', 'Balance', 'IsActiveMember', 'EstimatedSalary', 'Tenure', 'CustomerId', 'Geography_Germany', 'Geography_Spain', 'Gender_Male']
Features in test data: ['CreditScore', 'NumOfProducts', 'HasCrCard', 'Age', 'Balance', 'IsActiveMember', 'EstimatedSalary', 'Tenure', 'CustomerId', 'Geography_Germany', 'Geography_Spain', 'Gender_Male']
Performing cross-validation...
Cross-validation scores: [0.8644563431112448, 0.8853093656817566, 0.8805896235980359, 0.8781554033727946, 0.8908951384784907]
Mean CV score: 0.8798811748484645
Starting hyperparameter tuning...
k=3, score=0.8493372122825731
k=5, score=0.8691206069159589
k=7, score=0.8847654636290867
k=9, score=0.888071205282241
k=11, score=0.8946973131537501
Best parameters: k=11
Best tuning score: 0.8946973131537501
Shape of X (training data): (15000, 12)
Shape of X_test (test data): (10000, 12)
Training final model...
Making predictio