In [3]:
import numpy as np
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import numpy as np
from collections import Counter

class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        return np.array([self._predict(x) for x in X])

    def _predict(self, x):
        distances = self.compute_distance(x)
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = self.y_train[k_indices]
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

    def compute_distance(self, x):
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((self.X_train - x)**2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(self.X_train - x), axis=1)
        else:
            raise ValueError("Unsupported distance metric")

    def predict_proba(self, X):
        probas = []
        for x in X:
            distances = self.compute_distance(x)
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            class_counts = Counter(k_nearest_labels)
            total = sum(class_counts.values())
            proba = {class_label: count / total for class_label, count in class_counts.items()}
            probas.append([proba.get(0, 0), proba.get(1, 0)])  # Assuming binary classification (0 and 1)
        return np.array(probas)

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

def preprocess_data(train_path, test_path):
    # Load the data
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Combine train and test for preprocessing
    all_data = pd.concat([train_data, test_data], axis=0, sort=False)

    # Handle missing values
    numeric_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
    categorical_features = ['Geography', 'Gender']

    numeric_imputer = SimpleImputer(strategy='median')
    all_data[numeric_features] = numeric_imputer.fit_transform(all_data[numeric_features])

    # Handle categorical variables
    all_data = pd.get_dummies(all_data, columns=categorical_features, drop_first=True)

    # Scale numeric features
    scaler = StandardScaler()
    all_data[numeric_features] = scaler.fit_transform(all_data[numeric_features])

    # Split back into train and test
    train_preprocessed = all_data[:len(train_data)]
    test_preprocessed = all_data[len(train_data):]

    # Prepare features and target for train data
    X_train = train_preprocessed.drop(['Exited', 'id', 'CustomerId', 'Surname'], axis=1)
    y_train = train_preprocessed['Exited']

    # Prepare features for test data
    X_test = test_preprocessed.drop(['id', 'CustomerId', 'Surname'], axis=1)
    if 'Exited' in X_test.columns:
        X_test = X_test.drop('Exited', axis=1)

    return X_train.values, y_train.values, X_test.values

# Usage example:
# X_train, y_train, X_test = preprocess_data('train.csv', 'test.csv')

In [6]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

def cross_validate(X, y, knn, n_splits=5):
    # Initialize stratified k-fold
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Initialize list to store AUC scores
    auc_scores = []

    # Perform cross-validation
    for fold, (train_index, val_index) in enumerate(skf.split(X, y), 1):
        # Split data
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        # Fit the model
        knn.fit(X_train, y_train)

        # Predict probabilities
        y_pred_proba = knn.predict_proba(X_val)[:, 1]

        # Compute AUC score
        auc = roc_auc_score(y_val, y_pred_proba)
        auc_scores.append(auc)

        print(f"Fold {fold} AUC: {auc:.4f}")

    # Compute and print mean AUC
    mean_auc = np.mean(auc_scores)
    std_auc = np.std(auc_scores)
    print(f"\nMean AUC: {mean_auc:.4f} (+/- {std_auc:.4f})")

    return auc_scores, mean_auc

# Usage example:
# knn = KNN(k=5, distance_metric='euclidean')
# auc_scores, mean_auc = cross_validate(X_train, y_train, knn, n_splits=5)

In [9]:
import numpy as np
from sklearn.model_selection import train_test_split

# Load and preprocess data
X, y, X_test = preprocess_data('/content/drive/MyDrive/Fall\'24/CS506/Assignment5/train.csv',
                               '/content/drive/MyDrive/Fall\'24/CS506/Assignment5/test.csv')

# Ensure X, y, and X_test are numpy arrays
X = np.array(X, dtype=float)
y = np.array(y, dtype=int)
X_test = np.array(X_test, dtype=float)

# Hyperparameter tuning
k_values = [22, 23, 24, 25, 26]
distance_metrics = ['euclidean', 'manhattan']
best_k = None
best_metric = None
best_auc = 0

# Split the data for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

for k in k_values:
    for metric in distance_metrics:
        knn = KNN(k=k, distance_metric=metric)
        auc_scores, mean_auc = cross_validate(X_train, y_train, knn)

        print(f"k={k}, metric={metric}: Mean AUC = {mean_auc:.4f}")

        if mean_auc > best_auc:
            best_auc = mean_auc
            best_k = k
            best_metric = metric

print(f"\nBest hyperparameters: k={best_k}, metric={best_metric}")

# Train on full dataset with optimal hyperparameters
knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X, y)

# Make predictions on test set
test_predictions_proba = knn.predict_proba(X_test)[:, 1]  # Probability of class 1 (churn)

# Save test predictions
output_df = pd.DataFrame({
    'id': pd.read_csv('/content/drive/MyDrive/Fall\'24/CS506/Assignment5/test.csv')['id'],
    'Exited': test_predictions_proba
})
output_path = '/content/drive/MyDrive/Fall\'24/CS506/Assignment5/submission_11.csv'
output_df.to_csv(output_path, index=False)

print(f"Predictions saved to {output_path}")

Fold 1 AUC: 0.9098
Fold 2 AUC: 0.9096
Fold 3 AUC: 0.9226
Fold 4 AUC: 0.9199
Fold 5 AUC: 0.9091

Mean AUC: 0.9142 (+/- 0.0058)
k=22, metric=euclidean: Mean AUC = 0.9142
Fold 1 AUC: 0.9044
Fold 2 AUC: 0.9114
Fold 3 AUC: 0.9202
Fold 4 AUC: 0.9176
Fold 5 AUC: 0.9106

Mean AUC: 0.9129 (+/- 0.0056)
k=22, metric=manhattan: Mean AUC = 0.9129
Fold 1 AUC: 0.9094
Fold 2 AUC: 0.9098
Fold 3 AUC: 0.9229
Fold 4 AUC: 0.9200
Fold 5 AUC: 0.9092

Mean AUC: 0.9142 (+/- 0.0060)
k=23, metric=euclidean: Mean AUC = 0.9142
Fold 1 AUC: 0.9063
Fold 2 AUC: 0.9114
Fold 3 AUC: 0.9202
Fold 4 AUC: 0.9173
Fold 5 AUC: 0.9119

Mean AUC: 0.9134 (+/- 0.0048)
k=23, metric=manhattan: Mean AUC = 0.9134
Fold 1 AUC: 0.9103
Fold 2 AUC: 0.9098
Fold 3 AUC: 0.9236
Fold 4 AUC: 0.9197
Fold 5 AUC: 0.9105

Mean AUC: 0.9148 (+/- 0.0057)
k=24, metric=euclidean: Mean AUC = 0.9148
Fold 1 AUC: 0.9068
Fold 2 AUC: 0.9125
Fold 3 AUC: 0.9195
Fold 4 AUC: 0.9177
Fold 5 AUC: 0.9131

Mean AUC: 0.9139 (+/- 0.0044)
k=24, metric=manhattan: Mean AUC =