In [8]:
import numpy as np
import pandas as pd

In [9]:
# Define the KNN class
class KNN:
    """Brute-force k-nearest-neighbours classifier.

    Every query is compared against all stored training rows, so prediction
    cost grows with ``n_train * n_query``.

    Parameters
    ----------
    k : int
        Number of neighbours consulted per query row.
    distance_metric : {'euclidean', 'manhattan', 'minkowski'}
        Distance used to rank training rows.
    p : float
        Exponent of the Minkowski distance (only used by 'minkowski').
    """

    def __init__(self, k=5, distance_metric='euclidean', p=2):
        self.k = k
        self.distance_metric = distance_metric
        self.p = p

    def fit(self, X, y):
        """Store the training data.

        X and y are converted to NumPy arrays so that DataFrame/Series inputs
        are indexed positionally everywhere else in the class.
        """
        self.X_train = np.asarray(X, dtype=float)
        self.y_train = np.asarray(y)
        self.classes_ = np.unique(self.y_train)

    def predict(self, X):
        """Return the majority label among the k nearest neighbours of each row of X."""
        # np.asarray makes the loop iterate over rows; iterating a DataFrame
        # directly would yield its column labels instead.
        queries = np.asarray(X, dtype=float)
        return np.array([
            self.get_majority_class(self._k_nearest_for(query))
            for query in queries
        ])

    def compute_distances(self, x):
        """Return the distance from ``x`` to every training row.

        Raises
        ------
        ValueError
            If ``distance_metric`` is not one of the supported names.
        """
        deltas = self.X_train - x
        if self.distance_metric == 'euclidean':
            return np.linalg.norm(deltas, axis=1)
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(deltas), axis=1)
        elif self.distance_metric == 'minkowski':
            return np.sum(np.abs(deltas) ** self.p, axis=1) ** (1 / self.p)
        else:
            raise ValueError(f"Unknown distance metric: {self.distance_metric}")

    def get_k_nearest(self, distances):
        """Return the indices of the ``k`` smallest entries of ``distances``."""
        return np.argsort(distances)[:self.k]

    def get_majority_class(self, k_nearest):
        """Return the most frequent training label among the given indices.

        Works for any label dtype (ints, strings, ...). Ties resolve to the
        smallest label, because np.unique returns labels sorted and argmax
        picks the first maximum.
        """
        labels, counts = np.unique(self.y_train[k_nearest], return_counts=True)
        return labels[np.argmax(counts)]

    def predict_proba(self, X):
        """Return an ``(n_queries, 2)`` array of ``[P(label != 1), P(label == 1)]``.

        The second column is the fraction of the k nearest neighbours whose
        label equals 1; the first column is its complement.
        """
        queries = np.asarray(X, dtype=float)
        probas = []
        for query in queries:
            neighbour_labels = self.y_train[self._k_nearest_for(query)]
            class1_prob = np.mean(neighbour_labels == 1)
            probas.append([1 - class1_prob, class1_prob])
        return np.array(probas)

    def _k_nearest_for(self, x):
        """Return the training indices of the k rows closest to a single query ``x``."""
        return self.get_k_nearest(self.compute_distances(x))

In [10]:
def preprocess_data(train_file, test_file):
    """Load the train/test CSVs and build the model feature matrices.

    Parameters
    ----------
    train_file, test_file
        Anything ``pd.read_csv`` accepts (path or file-like object). The
        training file must also contain the ``Exited`` column.

    Returns
    -------
    X : pd.DataFrame
        Training features (``id``, ``CustomerId``, ``Surname`` and ``Exited`` dropped).
    y : pd.Series
        The ``Exited`` column of the training file.
    X_test : pd.DataFrame
        Test features with the same columns as ``X``.
    """
    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)

    # Integer (label) encoding for 'Geography' and 'Gender'.
    # The category -> code mapping is learned from the training data and reused
    # for the test data, so a given string always maps to the same integer even
    # if the test file is missing some categories. Values that never occur in
    # the training data are encoded as -1.
    categorical_cols = ['Geography', 'Gender']
    for col in categorical_cols:
        categories = train_df[col].astype('category').cat.categories
        train_df[col] = pd.Categorical(train_df[col], categories=categories).codes
        test_df[col] = pd.Categorical(test_df[col], categories=categories).codes

    # Ratio feature; the +1 in the denominator avoids dividing by an age of 0.
    train_df['Credit_Age'] = train_df['CreditScore'] / (train_df['Age'] + 1)
    test_df['Credit_Age'] = test_df['CreditScore'] / (test_df['Age'] + 1)

    X = train_df.drop(columns=['id', 'CustomerId', 'Surname', 'Exited'])
    y = train_df['Exited']
    X_test = test_df.drop(columns=['id', 'CustomerId', 'Surname'])

    return X, y, X_test

def custom_kfold(X, y, n_splits=3, random_state=None):
    """Split row indices into ``n_splits`` shuffled train/validation folds.

    Parameters
    ----------
    X : sized
        Only its length is used, to know how many rows to split.
    y : any
        Unused; kept so the signature mirrors the usual ``(X, y)`` convention.
    n_splits : int
        Number of folds. Must satisfy ``2 <= n_splits <= len(X)``.
    random_state : int or None
        Seed for the shuffle. ``None`` (default) shuffles with NumPy's global
        random state, so ``np.random.seed`` still controls the result.

    Returns
    -------
    list of (train_indices, val_indices)
        Validation folds are disjoint and together cover every row. The first
        ``len(X) % n_splits`` folds hold one extra row.

    Raises
    ------
    ValueError
        If ``n_splits`` is outside ``[2, len(X)]``.
    """
    n_samples = len(X)
    if not 2 <= n_splits <= n_samples:
        raise ValueError(
            f"n_splits must be between 2 and the number of samples ({n_samples}), got {n_splits}"
        )

    indices = np.arange(n_samples)
    if random_state is None:
        np.random.shuffle(indices)
    else:
        indices = np.random.RandomState(random_state).permutation(n_samples)

    fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int)
    fold_sizes[:n_samples % n_splits] += 1
    boundaries = np.concatenate(([0], np.cumsum(fold_sizes)))

    folds = []
    for start, stop in zip(boundaries[:-1], boundaries[1:]):
        val_indices = indices[start:stop]
        train_indices = np.concatenate([indices[:start], indices[stop:]])
        folds.append((train_indices, val_indices))

    return folds

In [11]:
def cross_validate(X, y, knn, n_splits=3):
    """Score ``knn`` with k-fold cross-validation and return the per-fold AUCs.

    For every fold produced by ``custom_kfold`` the model is refit on the
    training part and the class-1 probabilities of the validation part are
    scored with ``custom_roc_auc_score``. Folds whose validation labels contain
    a single class are reported and left out of the returned list.
    """
    # Plain arrays so the fold indices are applied positionally.
    features = np.array(X)
    targets = np.array(y)

    auc_scores = []
    for train_idx, val_idx in custom_kfold(features, targets, n_splits):
        knn.fit(features[train_idx], targets[train_idx])

        val_targets = targets[val_idx]
        val_scores = knn.predict_proba(features[val_idx])[:, 1]

        if len(np.unique(val_targets)) <= 1:
            print("Only one class present in y_true. Skipping this fold.")
            continue

        auc_scores.append(custom_roc_auc_score(val_targets, val_scores))

    return auc_scores

def custom_roc_auc_score(y_true, y_score):
    """Area under the ROC curve for binary labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels. Entries equal to 1 are positives; every other
        value is counted as a negative.
    y_score : array-like
        Scores, higher meaning "more likely positive". Tied scores are
        collapsed into a single ROC point.

    Returns
    -------
    float
        The trapezoidal area under the ROC curve, or ``nan`` when it is
        undefined (empty input, no positives, or no negatives).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_score`` have different lengths.
    """
    pos_label = 1

    # Convert up front so lists work and pandas objects are indexed by
    # position rather than by their index labels.
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score, dtype=float)
    if y_true.shape[0] != y_score.shape[0]:
        raise ValueError("y_true and y_score must have the same length")
    if y_true.shape[0] == 0:
        return np.nan

    desc_score_indices = np.argsort(y_score)[::-1]
    sorted_scores = y_score[desc_score_indices]
    is_positive = y_true[desc_score_indices] == pos_label

    # Last index of each run of equal scores: one ROC point per distinct score.
    distinct_value_indices = np.where(np.diff(sorted_scores))[0]
    threshold_idxs = np.r_[distinct_value_indices, is_positive.size - 1]

    tps = np.cumsum(is_positive)[threshold_idxs]
    # Everything ranked at or above the threshold that is not a true positive.
    fps = 1 + threshold_idxs - tps

    # Start the curve at (0, 0).
    tps = np.r_[0, tps]
    fps = np.r_[0, fps]

    if tps[-1] == 0 or fps[-1] == 0:
        # Only one class present: TPR or FPR would be 0/0.
        return np.nan

    fpr = fps / fps[-1]
    tpr = tps / tps[-1]

    # Trapezoidal rule written out by hand, so the result does not depend on
    # whether the installed NumPy spells it np.trapz or np.trapezoid.
    return np.sum(np.diff(fpr) * (tpr[1:] + tpr[:-1]) / 2.0)

def standardize_data(X, mean=None, std_dev=None):
    """Scale each column to zero mean and unit (population) standard deviation.

    Parameters
    ----------
    X : np.ndarray or pd.DataFrame
        Data to scale; the return value has the same type.
    mean, std_dev : np.ndarray or pd.Series, optional
        Per-column statistics to scale with. When omitted they are computed
        from ``X`` itself. Pass the statistics of the training data here to
        put a test set on the same scale as the training set.

    Returns
    -------
    Same type as ``X``
        ``(X - mean) / std_dev``. Columns whose standard deviation is 0 are
        only centred, so constant columns become 0 instead of NaN/inf.
    """
    if mean is None:
        mean = np.mean(X, axis=0)
    if std_dev is None:
        std_dev = np.std(X, axis=0)

    # Adding the boolean mask turns every 0 into 1 and leaves other entries
    # unchanged; this works identically for ndarrays and pandas Series.
    safe_std = std_dev + (std_dev == 0)
    return (X - mean) / safe_std

def add_polynomial_features(X, degree=2):
    """Append element-wise powers of ``X`` as extra columns.

    The result is ``[X, X**2, ..., X**degree]`` joined along the column axis.
    No cross terms between columns are generated. With ``degree < 2`` only
    the original columns are returned.
    """
    higher_powers = [X ** exponent for exponent in range(2, degree + 1)]
    return np.concatenate([X] + higher_powers, axis=1)

def apply_pca(X, n_components=24, fit_on=None):
    """Project ``X`` onto the leading principal components.

    Parameters
    ----------
    X : np.ndarray or pd.DataFrame
        Rows to project.
    n_components : int
        Number of components to keep. If it exceeds the number of columns,
        all components are kept.
    fit_on : np.ndarray or pd.DataFrame, optional
        Data the principal axes (and the centring mean) are learned from.
        Defaults to ``X`` itself. Pass the training matrix here when
        projecting a test set, so both end up in the same coordinate system;
        fitting separately on each set gives bases that are not comparable.

    Returns
    -------
    Same container type as ``X``
        ``X`` centred with the reference mean and multiplied by the selected
        eigenvectors, ordered by decreasing eigenvalue.
    """
    reference = X if fit_on is None else fit_on

    # Plain arrays keep the arithmetic positional even for pandas inputs.
    reference_mean = np.asarray(np.mean(reference, axis=0))
    centered_reference = np.asarray(reference) - reference_mean

    covariance_matrix = np.cov(centered_reference.T)
    # eigh: the covariance matrix is symmetric; eigenvalues come back ascending.
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
    sorted_indices = np.argsort(eigenvalues)[::-1]
    top_eigenvectors = eigenvectors[:, sorted_indices[:n_components]]

    return (X - reference_mean) @ top_eigenvectors

def smote_enn(X, y, k_neighbors=5, enn_neighbors=3, random_state=None):
    """Oversample the minority class with SMOTE, then clean with ENN.

    SMOTE step: for every minority row one synthetic row is created by
    interpolating, at a random position, between that row and one of its
    ``k_neighbors`` nearest minority neighbours.

    ENN step: a row (original or synthetic) is kept only when all of its
    ``enn_neighbors`` nearest neighbours carry the same label as the row.

    Both steps compare every row against every other row, so the cost grows
    quadratically with the number of rows.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
    y : array-like of shape (n_samples,)
    k_neighbors : int
        Neighbourhood size for SMOTE; capped at ``n_minority - 1``. With fewer
        than two minority rows no synthetic rows are produced.
    enn_neighbors : int
        Neighbourhood size for ENN; capped at ``n_rows - 1``.
    random_state : int or None
        Seed for the random choices. ``None`` (default) uses NumPy's global
        random state.

    Returns
    -------
    (X_resampled, y_resampled) : tuple of np.ndarray

    Raises
    ------
    ValueError
        If ``X`` and ``y`` have different numbers of rows.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)
    if X.shape[0] != y.shape[0]:
        raise ValueError("X and y must have the same number of rows")

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    def nearest_other_rows(data, row_idx, n_neighbors):
        """Indices of the n_neighbors rows closest to data[row_idx], excluding itself."""
        distances = np.linalg.norm(data - data[row_idx], axis=1)
        # Exclude the row by index rather than assuming it sorts first:
        # exact duplicates would otherwise be able to push it out of slot 0.
        distances[row_idx] = np.inf
        return np.argpartition(distances, n_neighbors - 1)[:n_neighbors]

    def smote(X_minority):
        """Create one interpolated synthetic row per minority row."""
        n_samples, n_features = X_minority.shape
        k = min(k_neighbors, n_samples - 1)
        if k < 1:
            return np.empty((0, n_features))

        synthetic_samples = np.empty_like(X_minority)
        for i in range(n_samples):
            candidates = nearest_other_rows(X_minority, i, k)
            neighbor_sample = X_minority[candidates[rng.randint(k)]]
            diff = neighbor_sample - X_minority[i]
            synthetic_samples[i] = X_minority[i] + diff * rng.rand()
        return synthetic_samples

    def enn_keep_mask(X_all, y_all):
        """Boolean mask of rows whose neighbours all share the row's own label."""
        n_rows = X_all.shape[0]
        keep = np.ones(n_rows, dtype=bool)
        k = min(enn_neighbors, n_rows - 1)
        if k < 1:
            return keep

        for i in range(n_rows):
            neighbors_indices = nearest_other_rows(X_all, i, k)
            # Compare against the row's own label: a row surrounded by a
            # unanimous neighbourhood of the *other* class must be removed.
            keep[i] = np.all(y_all[neighbors_indices] == y_all[i])
        return keep

    labels, counts = np.unique(y, return_counts=True)
    minority_class_label = labels[np.argmin(counts)]

    synthetic_samples_minority_class = smote(X[y == minority_class_label])

    X_resampled = np.vstack((X, synthetic_samples_minority_class))
    y_resampled = np.hstack(
        (y, np.full(len(synthetic_samples_minority_class), minority_class_label))
    )

    mask_to_keep = enn_keep_mask(X_resampled, y_resampled)
    return X_resampled[mask_to_keep], y_resampled[mask_to_keep]

In [13]:
# Load and preprocess data
SEED = 42
N_COMPONENTS = 24
# custom_kfold and smote_enn draw from NumPy's global random state; seeding it
# makes a Restart & Run All reproduce the same folds and synthetic samples.
np.random.seed(SEED)

X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Step 1: Standardize the data
# The test set is scaled with the *training* mean/std. Scaling each set with
# its own statistics would put train and test on different scales, and the
# KNN distances between them would no longer be meaningful.
X_scaled = standardize_data(X)
train_mean = np.mean(X, axis=0)
train_std = np.std(X, axis=0)
X_test_scaled = (X_test - train_mean) / train_std

# Step 2: Add Polynomial Features (optional)
X_poly = add_polynomial_features(X_scaled, degree=2)
X_test_poly = add_polynomial_features(X_test_scaled, degree=2)

# Step 3: Apply SMOTEENN for class balancing (optional, for training data only)
# NOTE: smote_enn compares every row against every other row, so this step is
# slow on large training sets.
X_balanced, y_balanced = smote_enn(X_poly, y)

# Step 4: PCA with more components
# The principal axes are fitted once, on the balanced training data, and the
# same centring + rotation is applied to the test data. Fitting a second PCA
# on the test set would yield a different basis, so test rows could not be
# compared with training rows.
pca_mean = np.mean(X_balanced, axis=0)
pca_eigenvalues, pca_eigenvectors = np.linalg.eigh(np.cov((X_balanced - pca_mean).T))
pca_basis = pca_eigenvectors[:, np.argsort(pca_eigenvalues)[::-1][:N_COMPONENTS]]
X_reduced = (X_balanced - pca_mean) @ pca_basis
X_test_reduced = (X_test_poly - pca_mean) @ pca_basis

# Step 5: Use a more complex model (KNN in this case since we can't use other models without sklearn imports)
knn_model = KNN(k=5, distance_metric='minkowski', p=1.5)

# Step 6: Cross-validate the model
cv_auc_scores = cross_validate(X_reduced, y_balanced, knn_model, n_splits=5)
print(f"Cross-Validation AUC Scores: {cv_auc_scores}")
print(f"Mean Cross-Validation AUC Score: {np.mean(cv_auc_scores)}")

# Step 7: Make predictions on the test set using the best model (as probabilities)
knn_model.fit(X_reduced, y_balanced)

# Use predict_proba to get probabilities instead of binary predictions
test_probabilities = knn_model.predict_proba(X_test_reduced)[:, 1]  # Select probability of class 1 (Exited)

# Save test predictions (only the id column is needed from the raw test file)
test_ids = pd.read_csv('test.csv', usecols=['id'])['id']
pd.DataFrame({'id': test_ids, 'Exited': test_probabilities}).to_csv('submissions15.csv', index=False)

KeyboardInterrupt: 