In [25]:
import pandas as pd
import numpy as np

def preprocess_data(train_path, test_path):
    """Load the train and test CSVs and turn them into model-ready features.

    Both files are concatenated so that one-hot encoding and scaling are
    applied identically to each, then split apart again using the ``Exited``
    column: rows that have a value are returned as training rows, rows where
    it is missing are returned as test rows.

    Args:
        train_path: Path (or file-like object) of the CSV that contains the
            ``Exited`` label column.
        test_path: Path (or file-like object) of the CSV without labels.

    Returns:
        A tuple ``(X, y, X_test)``: the training features, the training
        labels and the test features. ``X`` and ``X_test`` share the same
        columns.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Concatenate so both sets get exactly the same encoding and scaling.
    all_data = pd.concat([train_data, test_data], axis=0)

    # Identifier columns are dropped; they are not used as features.
    all_data = all_data.drop(['id', 'Surname', 'CustomerId'], axis=1)

    # One-hot encode the categorical columns (first level dropped).
    all_data = pd.get_dummies(all_data, columns=['Geography', 'Gender'], drop_first=True)

    # Engineered ratio features; the +1 keeps the denominators away from zero.
    all_data['BalanceByEstimatedSalary'] = all_data['Balance'] / (all_data['EstimatedSalary'] + 1)
    all_data['ProductsPerTenure'] = all_data['NumOfProducts'] / (all_data['Tenure'] + 1)

    # Standardise the numerical features (zero mean, unit standard deviation).
    numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary', 'BalanceByEstimatedSalary', 'ProductsPerTenure']
    for feature in numerical_features:
        column = all_data[feature]
        spread = column.std()
        # A constant column has std 0 (a single row gives NaN); dividing by
        # it would fill the feature with NaN, so only centre it in that case.
        if not spread > 0:
            spread = 1.0
        all_data[feature] = (column - column.mean()) / spread

    # Split back into train and test based on whether the label is present.
    has_label = all_data['Exited'].notna()
    train_rows = all_data[has_label]
    X = train_rows.drop('Exited', axis=1)
    y = train_rows['Exited']
    X_test = all_data[~has_label].drop('Exited', axis=1)

    return X, y, X_test

def feature_selection(X, y, k):
    """Keep the ``k`` columns of ``X`` most correlated with ``y``.

    Columns are ranked by the absolute value of their Pearson correlation
    with ``y``; ties are broken by column name in descending order.

    Args:
        X: DataFrame of candidate features.
        y: Target values, one per row of ``X``.
        k: Number of columns to keep.

    Returns:
        A tuple ``(X[selected_features], selected_features)`` where
        ``selected_features`` is the list of chosen column names, strongest
        correlation first.
    """
    target = np.asarray(y, dtype=np.float64)
    correlations = []
    for column in X.columns:
        values = np.asarray(X[column], dtype=np.float64)
        # A constant column has zero variance, so its correlation is 0/0;
        # silence the NumPy warning and handle the NaN explicitly below.
        with np.errstate(divide='ignore', invalid='ignore'):
            corr = np.corrcoef(values, target)[0, 1]
        # NaN in the sort keys makes tuple comparison inconsistent and can
        # push an uninformative column into the top-k, so rank it as 0.
        strength = abs(corr) if np.isfinite(corr) else 0.0
        correlations.append((strength, column))

    correlations.sort(reverse=True)
    selected_features = [name for _, name in correlations[:k]]

    return X[selected_features], selected_features

class KNN:
    """Distance-weighted k-nearest-neighbours classifier (Euclidean metric).

    Each of the ``k`` closest training rows votes for its label with weight
    ``1 / (distance + 1e-5)``; the label with the largest total weight wins.
    Labels must be non-negative integers (float values such as ``0.0`` and
    ``1.0`` are accepted) because the votes are tallied with ``np.bincount``.
    """

    def __init__(self, k=3):
        """Store the number of neighbours ``k`` used for voting."""
        self.k = k

    def fit(self, X, y):
        """Memorise the training data.

        Args:
            X: Training features (DataFrame or 2-D array-like).
            y: Training labels (Series or 1-D array-like), one per row of X.

        Raises:
            ValueError: If the labels are not non-negative integers or their
                count does not match the number of rows in ``X``.
        """
        self.X_train = X
        self.y_train = y

        # Convert once here instead of on every query: the float64 copy of
        # the training matrix does not depend on the row being predicted.
        self._train_matrix = np.asarray(X, dtype=np.float64)

        labels = np.asarray(y, dtype=np.float64)
        is_valid = (
            np.all(np.isfinite(labels))
            and np.all(labels >= 0)
            and np.all(labels == np.floor(labels))
        )
        if not is_valid:
            raise ValueError("KNN labels must be non-negative integers")
        if labels.shape[0] != self._train_matrix.shape[0]:
            raise ValueError("X and y must contain the same number of rows")
        # np.bincount needs integer input; make the cast explicit.
        self._train_labels = labels.astype(np.intp)

    def predict(self, X):
        """Return an array with the predicted label for every row of ``X``."""
        queries = np.asarray(X, dtype=np.float64)
        return np.array([self._predict(row) for row in queries])

    def _predict(self, x):
        """Predict the label of a single feature vector ``x``."""
        distances = self.compute_distances(x)
        k_indices = np.argsort(distances)[:self.k]

        # The 1e-5 avoids a division by zero when a neighbour is at distance 0.
        weights = 1 / (distances[k_indices] + 1e-5)
        weighted_votes = np.bincount(self._train_labels[k_indices], weights=weights)
        return weighted_votes.argmax()

    def compute_distances(self, x):
        """Return the Euclidean distance from ``x`` to every training row."""
        query = np.asarray(x, dtype=np.float64)
        differences = self._train_matrix - query
        return np.sqrt(np.sum(differences ** 2, axis=1))

def accuracy_score(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Both inputs are converted to arrays first, so the comparison is always
    positional; pandas Series with different indexes are compared element by
    element rather than by label.

    Args:
        y_true: True labels (Series, array or list).
        y_pred: Predicted labels with the same shape as ``y_true``.

    Returns:
        The mean of the element-wise equality, between 0 and 1.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    true_values = np.asarray(y_true)
    pred_values = np.asarray(y_pred)
    if true_values.shape != pred_values.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got "
            f"{true_values.shape} and {pred_values.shape}"
        )
    return np.mean(true_values == pred_values)

def cross_validate(X, y, knn, n_splits=5, random_state=None):
    """Estimate a model's accuracy with shuffled k-fold cross-validation.

    The rows are shuffled once and cut into ``n_splits`` consecutive folds of
    ``len(X) // n_splits`` rows; the last fold also takes any remainder. Each
    fold is used once for validation while the model is fitted on the rest.

    Args:
        X: DataFrame of features.
        y: Series of labels aligned positionally with ``X``.
        knn: Model exposing ``fit(X, y)`` and ``predict(X)``.
        n_splits: Number of folds.
        random_state: Optional seed for a reproducible shuffle. When ``None``
            the global NumPy random state is used.

    Returns:
        The mean validation accuracy over all folds.

    Raises:
        ValueError: If ``n_splits`` is smaller than 2 or larger than the
            number of rows, or if ``X`` and ``y`` differ in length.
    """
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError("X and y must contain the same number of rows")
    if n_splits < 2:
        raise ValueError("n_splits must be at least 2")
    if n_splits > n_samples:
        # Otherwise fold_size would be 0 and the folds would be empty.
        raise ValueError("n_splits cannot exceed the number of rows")

    if random_state is None:
        indices = np.arange(n_samples)
        np.random.shuffle(indices)
    else:
        indices = np.random.default_rng(random_state).permutation(n_samples)

    fold_size = n_samples // n_splits
    scores = []

    for i in range(n_splits):
        # The last fold runs to the end so that no row is left out.
        start = i * fold_size
        end = start + fold_size if i < n_splits - 1 else n_samples
        val_indices = indices[start:end]
        train_indices = np.concatenate([indices[:start], indices[end:]])

        X_train, X_val = X.iloc[train_indices], X.iloc[val_indices]
        y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]

        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_val)

        scores.append(accuracy_score(y_val, y_pred))

    return np.mean(scores)

def select_best_model(X, y, k_neighbors, n_features):
    """Grid-search the neighbour count and the number of selected features.

    Every combination is scored with ``cross_validate`` and its mean accuracy
    is printed. The first combination with the strictly highest score wins.

    Args:
        X: DataFrame of candidate features.
        y: Labels aligned with ``X``.
        k_neighbors: Candidate values for the number of neighbours.
        n_features: Candidate values for the number of features to keep.

    Returns:
        A tuple ``(best_k, best_n_features, best_score)``. All three are 0
        when no combination scores above 0.
    """
    best_score = 0
    best_k = 0
    best_n_features = 0

    # The selected columns depend only on n, not on k, so compute each
    # selection once and reuse it for every neighbour count.
    selected_by_n = {}

    for k in k_neighbors:
        for n in n_features:
            if n not in selected_by_n:
                selected_by_n[n] = feature_selection(X, y, n)[0]
            X_selected = selected_by_n[n]

            knn = KNN(k=k)
            score = cross_validate(X_selected, y, knn)
            print(f"k={k}, n_features={n}, Mean Accuracy={score}")

            if score > best_score:
                best_score = score
                best_k = k
                best_n_features = n

    return best_k, best_n_features, best_score

if __name__ == "__main__":
    # Load both CSVs and build the feature matrices.
    X, y, X_test = preprocess_data('train.csv', 'test.csv')

    # Search grid: a representative subset of the values that were tried.
    k_values = [3, 7, 9, 11]
    n_features = [5, 7, 9]
    best_k, best_n_features, best_score = select_best_model(
        X, y, k_values, n_features
    )
    print(f"Best k: {best_k}")
    print(f"Best number of features: {best_n_features}")
    print(f"Best Accuracy score: {best_score}")

    # Refit on the full training set using the winning configuration.
    X_selected, selected_features = feature_selection(X, y, best_n_features)
    final_knn = KNN(k=best_k)
    final_knn.fit(X_selected, y)

    # Predict on the test rows, restricted to the same selected columns.
    test_predictions = final_knn.predict(X_test[selected_features])

    # The ids were dropped during preprocessing, so read them back from the
    # raw test file to build the submission.
    test_ids = pd.read_csv('test.csv')['id']
    submission = pd.DataFrame({'id': test_ids, 'Exited': test_predictions})
    submission.to_csv('submissions.csv', index=False)

    print("Predictions saved to 'submissions.csv'")
    print("Selected features:", selected_features)

k=3, n_features=5, Mean Accuracy=0.8726666666666667
k=3, n_features=7, Mean Accuracy=0.8688666666666667
k=3, n_features=9, Mean Accuracy=0.8689333333333333
k=7, n_features=5, Mean Accuracy=0.8794666666666666
k=7, n_features=7, Mean Accuracy=0.8788666666666666
k=7, n_features=9, Mean Accuracy=0.8851333333333334
k=9, n_features=5, Mean Accuracy=0.881
k=9, n_features=7, Mean Accuracy=0.8792666666666665
k=9, n_features=9, Mean Accuracy=0.8854666666666666
k=11, n_features=5, Mean Accuracy=0.8832666666666666
k=11, n_features=7, Mean Accuracy=0.8833333333333334
k=11, n_features=9, Mean Accuracy=0.8888
Best k: 11
Best number of features: 9
Best Accuracy score: 0.8888
Predictions saved to 'submissions.csv'
Selected features: ['Age', 'NumOfProducts', 'Geography_Germany', 'IsActiveMember', 'Gender_Male', 'Balance', 'ProductsPerTenure', 'Geography_Spain', 'CreditScore']
