# Santander Customer Transaction Prediction

## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.neighbors import KNeighborsClassifier
import sklearn.tree
import matplotlib.pyplot as plt
import graphviz
from io import StringIO
import time
%matplotlib inline

## Load Data

In [2]:
data = pd.read_csv("train.csv", index_col=0, low_memory=False)
data

Unnamed: 0_level_0,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
ID_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
train_0,0,8.9255,-6.7863,11.9081,5.0930,11.4607,-9.2834,5.1187,18.6266,-4.9200,...,4.4354,3.9642,3.1364,1.6910,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
train_1,0,11.5006,-4.1473,13.8588,5.3890,12.3622,7.0433,5.6208,16.5338,3.1468,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.3560,1.9518
train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.9250,-5.8609,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
train_199995,0,11.4880,-0.4956,8.2622,3.5142,10.3404,11.6081,5.6709,15.1516,-0.6209,...,6.1415,13.2305,3.9901,0.9388,18.0249,-1.7939,2.1661,8.5326,16.6660,-17.8661
train_199996,0,4.9149,-2.4484,16.7052,6.6345,8.3096,-10.5628,5.8802,21.5940,-3.6797,...,4.9611,4.6549,0.6998,1.8341,22.2717,1.7337,-2.1651,6.7419,15.9054,0.3388
train_199997,0,11.2232,-5.0518,10.5127,5.6456,9.3410,-5.4086,4.5555,21.5571,0.1202,...,4.0651,5.4414,3.1032,4.8793,23.5311,-1.5736,1.2832,8.7155,13.8329,4.1995
train_199998,0,9.7148,-8.6098,13.6104,5.7930,12.5173,0.5339,6.0479,17.0152,-2.1926,...,2.6840,8.6587,2.7337,11.1178,20.4158,-0.0786,6.7980,10.0342,15.5289,-13.9001


# split features from classes

In [3]:
X = data.drop("target", axis=1)
y = data["target"].to_numpy()
print(X.ndim)
print(y.ndim)
print(f"X:{type(X)}")
print(f"y:{type(y)}")

2
1
X:<class 'pandas.core.frame.DataFrame'>
y:<class 'numpy.ndarray'>


# normalize data

In [4]:

scaler = preprocessing.MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

print(X)

           var_0     var_1     var_2     var_3     var_4     var_5     var_6  \
0       0.427853  0.324824  0.568059  0.388041  0.550670  0.467321  0.454298   
1       0.557212  0.428639  0.681235  0.410417  0.628408  0.795072  0.536604   
2       0.411969  0.483777  0.578061  0.599690  0.474941  0.471329  0.753295   
3       0.535099  0.507140  0.396562  0.546993  0.647586  0.616822  0.572995   
4       0.473637  0.533434  0.624133  0.504796  0.621079  0.702836  0.589011   
...          ...       ...       ...       ...       ...       ...       ...   
199995  0.556579  0.572293  0.356529  0.268693  0.454064  0.886709  0.544817   
199996  0.226382  0.495472  0.846379  0.504570  0.278944  0.441637  0.579126   
199997  0.543277  0.393057  0.487100  0.429814  0.367884  0.545106  0.361976   
199998  0.467503  0.253090  0.666823  0.440957  0.641783  0.664399  0.606616   
199999  0.525846  0.367145  0.580254  0.610273  0.559035  0.660683  0.481378   

           var_7     var_8     var_9  .

# PCA

In [5]:
print(f"Before PCA: {X.shape}")

variance_threshold = 0.5

pca = PCA(n_components=variance_threshold)
X = pca.fit_transform(X)
print(f"After PCA: {X.shape}")

Before PCA: (200000, 200)
After PCA: (200000, 87)


# KNN

## find best k

In [None]:
param_grid = {'n_neighbors': range(1, 21)}  # Testing k from 1 to 20

grid_search = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid,
                           scoring='precision', n_jobs=-1)

grid_search.fit(X, y)
best_k = grid_search.best_params_['n_neighbors']

print(f"Best k: {best_k}")
print(f"Best Precision Score: {grid_search.best_score_}")


## use best k with KNN, find performance metrics

In [None]:

knn = KNeighborsClassifier(n_neighbors=best_k)

kfold = KFold(n_splits=5, shuffle=True, random_state=2)
precision_scores = []
recall_scores = []
f1_scores = []

for train_idx, test_idx in kfold.split(X):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    knn.fit(X_train, y_train)

    y_pred = knn.predict(X_test)

    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

print(f"Precision: {np.mean(precision_scores):.4f}")
print(f"Recall: {np.mean(recall_scores):.4f}")
print(f"F1 Score: {np.mean(f1_scores):.4f}")

# MLP

In [None]:
for hidden_layer_sizes in [
    (1000, 1000, 1000, 1000, 1000),
]:
    for activation in ["relu", "tanh"]:
        mlp = MLPClassifier(
            hidden_layer_sizes=hidden_layer_sizes,
            activation=activation,
            max_iter=1000,
            solver="adam",
        )
        kfold = KFold(n_splits=5, shuffle=True, random_state=2)
        precision_scores = []
        recall_scores = []
        f1_scores = []

        for train_idx, test_idx in kfold.split(X):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            mlp.fit(X_train, y_train)

            y_pred = mlp.predict(X_test)

            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)

            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

        # Output the results
        print(f"Hidden Layer Sizes: {hidden_layer_sizes}")
        print(f"Activation Function: {activation}")
        print(f"Precision: {np.mean(precision_scores):.4f}")
        print(f"Recall: {np.mean(recall_scores):.4f}")
        print(f"F1 Score: {np.mean(f1_scores):.4f}")


## Decision Tree

In [14]:
criterions = ["gini", "entropy", "log_loss"]
for criterion in criterions:
    decision_tree_clf = sklearn.tree.DecisionTreeClassifier(criterion=criterion, random_state=2, max_depth=25) 
    kfold = KFold(n_splits=5, shuffle=True, random_state=2)
    precision_scores = []
    recall_scores = []
    f1_scores = []

    for train_idx, test_idx in kfold.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        decision_tree_clf.fit(X_train, y_train)

        y_pred = decision_tree_clf.predict(X_test)

        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    print(f"Criterion: {criterion}")
    print(f"Precision: {np.mean(precision_scores):.4f}")
    print(f"Recall: {np.mean(recall_scores):.4f}")
    print(f"F1 Score: {np.mean(f1_scores):.4f}")

Criterion: gini
Precision: 0.2691
Recall: 0.2112
F1 Score: 0.2366
Criterion: entropy
Precision: 0.2189
Recall: 0.2274
F1 Score: 0.2231
Criterion: log_loss
Precision: 0.2189
Recall: 0.2274
F1 Score: 0.2231
