# Santander Customer Transaction Prediction

## Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import time
%matplotlib inline

## Load Data

In [None]:
# Load the training set, using the first CSV column as the row index.
# low_memory=False makes pandas parse the file in a single pass instead of
# in chunks, which avoids mixed-dtype inference across chunks.
data = pd.read_csv(
    "train.csv",
    index_col=0,
    low_memory=False,
)
data  # final expression of the cell: the notebook displays the DataFrame

# split features from classes

In [None]:
# Separate the label column ("target") from the feature columns.
y = data["target"].to_numpy()    # labels as a NumPy array
X = data.drop(columns="target")  # remaining columns stay a DataFrame

# Sanity checks: number of dimensions, then container type, of each part.
print(X.ndim, y.ndim, sep="\n")
print(f"X:{type(X)}")
print(f"y:{type(y)}")

# normalize data

In [None]:

# Rescale every feature column with a min-max scaler (default output
# range is [0, 1]).
# NOTE(review): the scaler is fit on the full dataset, so the later
# cross-validation folds are scaled with statistics that include their own
# test rows. Consider fitting the scaler per fold (e.g. in a Pipeline) if
# that leakage matters for the reported metrics.
scaler = preprocessing.MinMaxScaler()

# fit_transform returns a bare ndarray, so the DataFrame is rebuilt here.
# Pass BOTH the original column names and the original row index: without
# index=... the frame silently gets a fresh 0..n-1 index and its rows no
# longer line up with `data` by label.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

print(X)

# PCA

In [None]:
# Reduce dimensionality with PCA, keeping only as many leading components
# as are needed to reach the requested share of explained variance.
variance_threshold = 0.5  # fraction of total variance to retain

# A float n_components strictly between 0 and 1 is treated by scikit-learn
# as a cumulative explained-variance target, not as a component count.
pca = PCA(n_components=variance_threshold)

print(f"Before PCA: {X.shape}")
X = pca.fit_transform(X)  # X is replaced by the projected data
print(f"After PCA: {X.shape}")

# KNN

## find best k

In [None]:
# Exhaustively try k = 1..20 for KNN and rank the candidates by their
# cross-validated precision.
param_grid = dict(n_neighbors=range(1, 21))  # Testing k from 1 to 20

grid_search = GridSearchCV(
    KNeighborsClassifier(),
    param_grid,
    scoring="precision",
    n_jobs=-1,  # run the candidate fits on all available CPU cores
)
grid_search.fit(X, y)

# Keep the winning neighbour count; the KNN evaluation cell reuses it.
best_k = grid_search.best_params_["n_neighbors"]

print(f"Best k: {best_k}")
print(f"Best Precision Score: {grid_search.best_score_}")


## use best k with KNN, find performance metrics

In [None]:

# Estimate precision, recall and F1 for KNN (using the selected k) with
# 5-fold cross-validation on shuffled, seeded splits.
kfold = KFold(n_splits=5, shuffle=True, random_state=2)
knn = KNeighborsClassifier(n_neighbors=best_k)

# Per-fold results; one entry is appended to each list per fold.
precision_scores, recall_scores, f1_scores = [], [], []

for train_idx, test_idx in kfold.split(X):
    # Slice features and labels with the same positional indices.
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # fit() returns the estimator itself, so training and prediction chain.
    y_pred = knn.fit(X_train, y_train).predict(X_test)

    precision = precision_score(y_test, y_pred)
    precision_scores.append(precision)

    recall = recall_score(y_test, y_pred)
    recall_scores.append(recall)

    f1 = f1_score(y_test, y_pred)
    f1_scores.append(f1)

# Report the mean of each metric across the folds.
print(f"Precision: {np.mean(precision_scores):.4f}")
print(f"Recall: {np.mean(recall_scores):.4f}")
print(f"F1 Score: {np.mean(f1_scores):.4f}")

# MLP

In [None]:
# Evaluate each MLP architecture / activation combination with 5-fold
# cross-validation and print the mean precision, recall and F1 score.

# The splitter does not depend on the loop variables: with an integer
# random_state every split() call yields the same folds, so it is built
# once here instead of once per activation.
kfold = KFold(n_splits=5, shuffle=True, random_state=2)

for hidden_layer_sizes in [
    (1000, 1000, 1000, 1000, 1000),
]:
    for activation in ["relu", "tanh"]:
        mlp = MLPClassifier(
            hidden_layer_sizes=hidden_layer_sizes,
            activation=activation,
            max_iter=1000,
            solver="adam",
            # Seed weight initialisation and batch sampling so runs are
            # repeatable and both activations are compared from the same
            # starting conditions (the folds are seeded the same way).
            random_state=2,
        )
        precision_scores = []
        recall_scores = []
        f1_scores = []

        for train_idx, test_idx in kfold.split(X):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            # warm_start is left at its default (False), so each fit()
            # call trains from scratch rather than continuing from the
            # previous fold's weights.
            mlp.fit(X_train, y_train)

            y_pred = mlp.predict(X_test)

            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)

            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

        # Output the results (means across the five folds)
        print(f"Hidden Layer Sizes: {hidden_layer_sizes}")
        print(f"Activation Function: {activation}")
        print(f"Precision: {np.mean(precision_scores):.4f}")
        print(f"Recall: {np.mean(recall_scores):.4f}")
        print(f"F1 Score: {np.mean(f1_scores):.4f}")
