In [126]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA


In [4]:
df = pd.read_csv("payment_customer_data.csv")
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,8250.0,4124.5,2381.714,0.0,2062.25,4124.5,6186.75,8249.0
id,8250.0,57821730.0,1822724.0,54982353.0,54990497.0,58989050.0,58996550.0,59006240.0
OVD_t1,8250.0,0.2490909,1.250197,0.0,0.0,0.0,0.0,34.0
OVD_t2,8250.0,0.1271515,0.8600464,0.0,0.0,0.0,0.0,34.0
OVD_t3,8250.0,0.3692121,2.90032,0.0,0.0,0.0,0.0,35.0
OVD_sum,8250.0,187.6817,1804.233,0.0,0.0,0.0,0.0,31500.0
pay_normal,8250.0,14.52667,12.05363,0.0,4.0,11.0,25.0,36.0
prod_code,8250.0,8.232,3.533055,0.0,6.0,10.0,10.0,27.0
prod_limit,2132.0,85789.7,74345.83,1.1,37400.0,68200.0,112200.0,660000.0
new_balance,8250.0,105404.2,1887704.0,-40303.2,0.0,0.0,24948.0,163212000.0


In [5]:
def preprocess(df):
    df.drop_duplicates()
    df.dropna(inplace=True)
    
    df['report_date'] = pd.to_datetime(df['report_date'],format='mixed')
    df['update_date'] = pd.to_datetime(df['update_date'],format='mixed')
    df['report_update_diff'] = (df['update_date'] - df['report_date']).dt.days
    
    df.drop('update_date', axis=1, inplace=True)
    df.drop('report_date', axis=1, inplace=True)

    return df

In [6]:
df = preprocess(df)
X = df.drop(['label'], axis=1)
y = df['label']

In [8]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.3, random_state=42)

print("X_train", X_train.shape, "y_train", y_train.shape)
print("X_val",X_val.shape, "y_val", y_val.shape)
print("X_train_val",X_train_val.shape, "y_train_val", y_train_val.shape)
print("X_test",X_test.shape, "y_test",y_test.shape)
    

X_train (830, 23) y_train (830,)
X_val (357, 23) y_val (357,)
X_train_val (1187, 23) y_train_val (1187,)
X_test (510, 23) y_test (510,)


# A single layer model
Perceptron is a single layer neural network, meaning it's a linear model.

In [95]:
C_range = np.logspace(np.log10(1e-5), np.log10(1e5), num=3)
penalties={'elasticnet':'saga', 
           'l1':'liblinear', 
           'l2':'lbfgs'}

In [115]:
def sp(C_range, penalties):
    best_p = None
    best_acc = 0
    bestC = 0
    best_penalty = 'none'

    for penalty in penalties.keys():
        for C in C_range:
            p = Perceptron(max_iter=100, alpha=C, penalty=penalty)
            p.fit(X_train, y_train)
            acc = p.score(X_val, y_val)
            print(f"penalty: {penalty}, C: {C}, validation accuracy: {acc}")
            if acc > best_acc:
                best_acc = acc
                best_p = p
                best_C = C
                best_penalty = penalty
    print(f"\nbest C: {best_C}, best penalty: {best_penalty}, best validation accuracy:{best_acc}\n")

    y_predict = p.predict(X_test)
    test_acc = accuracy_score(y_test, y_predict)
    print(f"Test accuracy:{test_acc}\n")


    

In [116]:
sp(C_range, penalties)

penalty: elasticnet, C: 1e-05, validation accuracy: 0.8571428571428571
penalty: elasticnet, C: 1.0, validation accuracy: 0.8571428571428571
penalty: elasticnet, C: 100000.0, validation accuracy: 0.8571428571428571
penalty: l1, C: 1e-05, validation accuracy: 0.8571428571428571
penalty: l1, C: 1.0, validation accuracy: 0.8571428571428571
penalty: l1, C: 100000.0, validation accuracy: 0.8571428571428571
penalty: l2, C: 1e-05, validation accuracy: 0.8571428571428571
penalty: l2, C: 1.0, validation accuracy: 0.8571428571428571
penalty: l2, C: 100000.0, validation accuracy: 0.8571428571428571

best C: 1e-05, best penalty: elasticnet, best validation accuracy:0.8571428571428571

Test accuracy:0.8784313725490196



# Multilayer perceptron

In [86]:
activations = ['relu', 'logistic','tanh']
hidden_layer_sizes = ([len(X),len(X)],[len(X),len(X),len(X)])

In [112]:
def mlps(hidden_layer_sizes, activations):
    best_mlp = None
    best_sizes = [0,0,0]
    best_act = ""
    best_accuracy = 0
    
    for sizes in hidden_layer_sizes:
        for activation in activations:
            mlp = MLPClassifier(max_iter=100, solver='sgd', random_state=1,hidden_layer_sizes=sizes, activation=activation)
            mlp.fit(X_train, y_train)
            y_predict = mlp.predict(X_val)
            accuracy = accuracy_score(y_val, y_predict)
            print(f"Activation: {activation}, Hidden layers: {sizes}, Validation accuracy: {accuracy}\n")
            if (accuracy > best_accuracy):
                best_accuracy = accuracy
                best_mlp = mlp
                best_sizes = sizes
                best_act = activation

    y_test_predict = best_mlp.predict(X_test)
    print(f"best activation: {best_act},  best hidden layers sizes: {best_sizes}, best test accuracy:{accuracy_score(y_test, y_test_predict)}\n")


In [113]:
mlps(hidden_layer_sizes, activations)

Activation: relu, Hidden layers: [1697, 1697], Validation accuracy: 0.8571428571428571

Activation: logistic, Hidden layers: [1697, 1697], Validation accuracy: 0.8571428571428571

Activation: tanh, Hidden layers: [1697, 1697], Validation accuracy: 0.8571428571428571

Activation: relu, Hidden layers: [1697, 1697, 1697], Validation accuracy: 0.8543417366946778

Activation: logistic, Hidden layers: [1697, 1697, 1697], Validation accuracy: 0.8571428571428571

Activation: tanh, Hidden layers: [1697, 1697, 1697], Validation accuracy: 0.8571428571428571

best activation: relu,  best hidden layers sizes: [1697, 1697], best test accuracy:0.8784313725490196

