# Using SVM to Predict Who is a Credit Risk

# 1. Necessary Imports

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

# 2. Import Data, Merge into a New Dataset

In [2]:
def import_data():
    payment_data = pd.read_csv("payment_data.csv")
    customer_data = pd.read_csv("customer_data.csv")
    df = pd.merge(customer_data,payment_data)
    
    return df
# df.shape = (8250, 24)

# 3. Preprocess

In [3]:
from sklearn.preprocessing import StandardScaler

def preprocess(df):
    df.drop_duplicates()
    df.dropna(inplace=True)
    # Encode categorical features
#     df = pd.get_dummies(df, columns=['prod_code'])

#     # Impute missing values
#     df['prod_limit'].fillna(df['prod_limit'].mean(), inplace=True)
    df['fea_2'].fillna(df['fea_2'].median(), inplace=True)
    df['highest_balance'].fillna(df['highest_balance'].median(), inplace=True)
#     df['update_date'] = pd.to_datetime(df['update_date'], format='%d/%m/%Y')
#     df['report_date'] = pd.to_datetime(df['report_date'], format='%d/%m/%Y')
#     df['update_date'] = df['update_date'].apply(lambda x: pd.Timestamp(x).timestamp())
#     df['report_date'] = df['report_date'].apply(lambda x: pd.Timestamp(x).timestamp())

    # Scale numerical features
    scaler = StandardScaler()
#     num_cols = ['fea_1', 'fea_2', 'fea_3', 'fea_4', 'fea_5', 'fea_6', 'fea_7', 'fea_8', 'fea_9', 'fea_10', 'fea_11', 'OVD_t1', 'OVD_t2', 'OVD_t3', 'OVD_sum', 'pay_normal', 'prod_limit', 'new_balance', 'highest_balance', 'update_date', 'report_date']
    num_cols = ['fea_1', 'fea_2', 'fea_3', 'fea_4', 'fea_5', 'fea_6', 'fea_7', 'fea_8', 'fea_9', 'fea_10', 'fea_11', 'OVD_t1', 'OVD_t2', 'OVD_t3', 'OVD_sum', 'pay_normal', 'new_balance', 'highest_balance']
    df[num_cols] = scaler.fit_transform(df[num_cols])
    df = df[['label', 'fea_1', 'fea_2', 'fea_3', 'fea_4', 'fea_5', 'fea_6', 'fea_7', 'fea_8', 'fea_9', 'fea_10', 'fea_11', 'OVD_t1', 'OVD_t2', 'OVD_t3', 'OVD_sum', 'pay_normal', 'new_balance', 'highest_balance']]
    
    return df

# 4. Distribution of the Classes

In [4]:
# df.shape = (1697, 25)
# df['label'].value_counts()
def plot():
    low_credit_df = df[df['label'] == 0][0:200]
    high_credit_df = df[df['label'] == 1][0:200]
    axes = low_credit_df.plot(kind = 'scatter', x = 'OVD_t1', y = 'highest_balance', color = 'blue', label = 'low-risk')
    high_credit_df.plot(kind = 'scatter', x = 'OVD_t1', y = 'highest_balance', color = 'red', label = 'high-risk', ax = axes)

# 5. Remove Unwanted Columns

In [5]:
# Did not end up removing any columns.
def get_X_y(df):
    feature_df = df

    # Independent variables (features)
    X = np.asarray(feature_df)

    # Dependent variables
    y = np.asarray(df['label'])
    
    return X, y

# 6. Divide into training and test data

In [6]:
# feature_df (1697) -> Train (no. rows) / Test (no. rows) ## 80/20 split
# Train(X, y) ## X is a 2D array and y is a 1D array
# Test(X, y)
from sklearn.model_selection import train_test_split

def split():
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
    return X_train, X_test, y_train, y_test

# 7. Evaluation Results

In [47]:
from sklearn import svm
from sklearn.metrics import classification_report

def linear_results(X_test, X_train, y_test, y_train):
    # C is the regularization parameter
    linear_classifier = svm.SVC(kernel = 'linear', gamma = 'auto', C = 3)
    linear_classifier.fit(X_train, y_train)
    y_predict = linear_classifier.predict(X_test)
    print(classification_report(y_test, y_predict, zero_division = 0))
    
def poly_results(X_test, X_train, y_test, y_train):
    degrees = 10
    for i in range(3, degrees):
        poly_classifier = svm.SVC(kernel = 'poly', gamma = 'auto', degree = i,  C = 3)
        poly_classifier.fit(X_train, y_train)
        y_predict = poly_classifier.predict(X_test)
#         print(y_predict)
        print("Degree:", i)
        print(classification_report(y_test, y_predict, zero_division = 0))
    
def rbf_results(X_test, X_train, y_test, y_train):
    RBF_classifier = svm.SVC(kernel = 'rbf', gamma = 'auto', C = 3)
    RBF_classifier.fit(X_train, y_train)
    y_predict = RBF_classifier.predict(X_test)
    print(classification_report(y_test, y_predict, zero_division = 0))

In [8]:
## To-Do: 
# 1. Make plots for different combinations of regularization (C parameter) and transformations (Kernels)

In [9]:
df = import_data()
df = preprocess(df)
X, y = get_X_y(df)
X_train, X_test, y_train, y_test = split()

In [10]:
X_test.shape

(340, 19)

In [11]:
X_train.shape

(1357, 19)

In [12]:
y_train.shape

(1357,)

In [13]:
y_test.shape

(340,)

In [14]:
# print(y_test)
# y_test:
# [0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0
#  0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0
#  1 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
#  1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0
#  1 1 0 0 0 0 1]

In [31]:
# poly_results(X_test, X_train, y_test, y_train)

In [40]:
# rbf_results(X_test, X_train, y_test, y_train)

              precision    recall  f1-score   support

           0       0.99      1.00      1.00       299
           1       1.00      0.93      0.96        41

    accuracy                           0.99       340
   macro avg       1.00      0.96      0.98       340
weighted avg       0.99      0.99      0.99       340



In [42]:
# RBF Results

# C = 0.1
#               precision    recall  f1-score   support

#            0       0.88      1.00      0.94       299
#            1       0.00      0.00      0.00        41

#     accuracy                           0.88       340
#    macro avg       0.44      0.50      0.47       340
# weighted avg       0.77      0.88      0.82       340

# C = 1
#             precision    recall  f1-score   support

#            0       0.99      1.00      1.00       299
#            1       1.00      0.93      0.96        41

#     accuracy                           0.99       340
#    macro avg       1.00      0.96      0.98       340
# weighted avg       0.99      0.99      0.99       340

# C = 2
#               precision    recall  f1-score   support

#            0       0.99      1.00      1.00       299
#            1       1.00      0.93      0.96        41

#     accuracy                           0.99       340
#    macro avg       1.00      0.96      0.98       340
# weighted avg       0.99      0.99      0.99       340

# C = 3
#               precision    recall  f1-score   support

#            0       0.99      1.00      1.00       299
#            1       1.00      0.93      0.96        41

#     accuracy                           0.99       340
#    macro avg       1.00      0.96      0.98       340
# weighted avg       0.99      0.99      0.99       340


In [48]:
linear_results(X_test, X_train, y_test, y_train)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       299
           1       1.00      1.00      1.00        41

    accuracy                           1.00       340
   macro avg       1.00      1.00      1.00       340
weighted avg       1.00      1.00      1.00       340



In [None]:
# Linear Results:

# C = 0.1
#               precision    recall  f1-score   support

#            0       1.00      1.00      1.00       299
#            1       1.00      1.00      1.00        41

#     accuracy                           1.00       340
#    macro avg       1.00      1.00      1.00       340
# weighted avg       1.00      1.00      1.00       340

# C = 1
# precision    recall  f1-score   support

#            0       1.00      1.00      1.00       299
#            1       1.00      1.00      1.00        41

#     accuracy                           1.00       340
#    macro avg       1.00      1.00      1.00       340
# weighted avg       1.00      1.00      1.00       340

# C = 2
#               precision    recall  f1-score   support

#            0       1.00      1.00      1.00       299
#            1       1.00      1.00      1.00        41

#     accuracy                           1.00       340
#    macro avg       1.00      1.00      1.00       340
# weighted avg       1.00      1.00      1.00       340

# C = 3
#               precision    recall  f1-score   support

#            0       1.00      1.00      1.00       299
#            1       1.00      1.00      1.00        41

#     accuracy                           1.00       340
#    macro avg       1.00      1.00      1.00       340
# weighted avg       1.00      1.00      1.00       340

In [18]:
# Linear
# Poly
# RBF

# C = 0.1
#               precision    recall  f1-score   support

#            0       0.88      1.00      0.94       299
#            1       0.00      0.00      0.00        41

#     accuracy                           0.88       340
#    macro avg       0.44      0.50      0.47       340
# weighted avg       0.77      0.88      0.82       340

#               precision    recall  f1-score   support

#            0       0.00      0.00      0.00       299
#            1       0.12      1.00      0.22        41

#     accuracy                           0.12       340
#    macro avg       0.06      0.50      0.11       340
# weighted avg       0.01      0.12      0.03       340

#               precision    recall  f1-score   support

#            0       0.88      1.00      0.94       299
#            1       0.00      0.00      0.00        41

#     accuracy                           0.88       340
#    macro avg       0.44      0.50      0.47       340
# weighted avg       0.77      0.88      0.82       340

# C = 0.2
#                precision    recall  f1-score   support

#            0       0.88      1.00      0.94       299
#            1       0.00      0.00      0.00        41

#     accuracy                           0.88       340
#    macro avg       0.44      0.50      0.47       340
# weighted avg       0.77      0.88      0.82       340

#               precision    recall  f1-score   support

#            0       0.00      0.00      0.00       299
#            1       0.12      1.00      0.22        41

#     accuracy                           0.12       340
#    macro avg       0.06      0.50      0.11       340
# weighted avg       0.01      0.12      0.03       340

#               precision    recall  f1-score   support

#            0       0.88      1.00      0.94       299
#            1       0.00      0.00      0.00        41

#     accuracy                           0.88       340
#    macro avg       0.44      0.50      0.47       340
# weighted avg       0.77      0.88      0.82       340

# C = 0.3
#               precision    recall  f1-score   support

#            0       0.88      1.00      0.94       299
#            1       0.00      0.00      0.00        41

#     accuracy                           0.88       340
#    macro avg       0.44      0.50      0.47       340
# weighted avg       0.77      0.88      0.82       340

#               precision    recall  f1-score   support

#            0       0.00      0.00      0.00       299
#            1       0.12      1.00      0.22        41

#     accuracy                           0.12       340
#    macro avg       0.06      0.50      0.11       340
# weighted avg       0.01      0.12      0.03       340

#               precision    recall  f1-score   support

#            0       0.89      1.00      0.94       299
#            1       1.00      0.07      0.14        41

#     accuracy                           0.89       340
#    macro avg       0.94      0.54      0.54       340
# weighted avg       0.90      0.89      0.84       340

# C = 0.4
#               precision    recall  f1-score   support

#            0       0.88      1.00      0.94       299
#            1       0.00      0.00      0.00        41

#     accuracy                           0.88       340
#    macro avg       0.44      0.50      0.47       340
# weighted avg       0.77      0.88      0.82       340

#               precision    recall  f1-score   support

#            0       0.00      0.00      0.00       299
#            1       0.12      1.00      0.22        41

#     accuracy                           0.12       340
#    macro avg       0.06      0.50      0.11       340
# weighted avg       0.01      0.12      0.03       340

#               precision    recall  f1-score   support

#            0       0.92      1.00      0.96       299
#            1       1.00      0.37      0.54        41

#     accuracy                           0.92       340
#    macro avg       0.96      0.68      0.75       340
# weighted avg       0.93      0.92      0.91       340

In [None]:
# C = 0.1
# Degree: 3
#               precision    recall  f1-score   support

#            0       0.89      1.00      0.94       299
#            1       1.00      0.10      0.18        41

#     accuracy                           0.89       340
#    macro avg       0.94      0.55      0.56       340
# weighted avg       0.90      0.89      0.85       340

# Degree: 4
#               precision    recall  f1-score   support

#            0       0.89      1.00      0.94       299
#            1       0.80      0.10      0.17        41

#     accuracy                           0.89       340
#    macro avg       0.84      0.55      0.56       340
# weighted avg       0.88      0.89      0.85       340

# Degree: 5
#               precision    recall  f1-score   support

#            0       0.89      0.99      0.94       299
#            1       0.67      0.10      0.17        41

#     accuracy                           0.89       340
#    macro avg       0.78      0.55      0.55       340
# weighted avg       0.86      0.89      0.85       340

# Degree: 6
#               precision    recall  f1-score   support

#            0       0.89      0.99      0.94       299
#            1       0.60      0.07      0.13        41

#     accuracy                           0.88       340
#    macro avg       0.74      0.53      0.53       340
# weighted avg       0.85      0.88      0.84       340

# Degree: 7
#               precision    recall  f1-score   support

#            0       0.89      0.99      0.94       299
#            1       0.50      0.07      0.13        41

#     accuracy                           0.88       340
#    macro avg       0.69      0.53      0.53       340
# weighted avg       0.84      0.88      0.84       340

# Degree: 8
#               precision    recall  f1-score   support

#            0       0.89      0.99      0.93       299
#            1       0.43      0.07      0.12        41

#     accuracy                           0.88       340
#    macro avg       0.66      0.53      0.53       340
# weighted avg       0.83      0.88      0.84       340

# Degree: 9
#               precision    recall  f1-score   support

#            0       0.89      0.98      0.93       299
#            1       0.38      0.07      0.12        41

#     accuracy                           0.87       340
#    macro avg       0.63      0.53      0.53       340
# weighted avg       0.82      0.87      0.83       340

# C = 1 
# Degree: 3
#               precision    recall  f1-score   support

#            0       0.93      0.99      0.96       299
#            1       0.90      0.44      0.59        41

#     accuracy                           0.93       340
#    macro avg       0.91      0.72      0.77       340
# weighted avg       0.92      0.93      0.92       340

# Degree: 4
#               precision    recall  f1-score   support

#            0       0.89      0.99      0.94       299
#            1       0.60      0.15      0.24        41

#     accuracy                           0.89       340
#    macro avg       0.75      0.57      0.59       340
# weighted avg       0.86      0.89      0.85       340

# Degree: 5
#               precision    recall  f1-score   support

#            0       0.90      0.99      0.94       299
#            1       0.64      0.17      0.27        41

#     accuracy                           0.89       340
#    macro avg       0.77      0.58      0.60       340
# weighted avg       0.87      0.89      0.86       340

# Degree: 6
#               precision    recall  f1-score   support

#            0       0.89      0.99      0.94       299
#            1       0.60      0.15      0.24        41

#     accuracy                           0.89       340
#    macro avg       0.75      0.57      0.59       340
# weighted avg       0.86      0.89      0.85       340

# Degree: 7
#               precision    recall  f1-score   support

#            0       0.89      0.98      0.94       299
#            1       0.55      0.15      0.23        41

#     accuracy                           0.88       340
#    macro avg       0.72      0.56      0.58       340
# weighted avg       0.85      0.88      0.85       340

# Degree: 8
#               precision    recall  f1-score   support

#            0       0.89      0.98      0.93       299
#            1       0.50      0.12      0.20        41

#     accuracy                           0.88       340
#    macro avg       0.70      0.55      0.57       340
# weighted avg       0.84      0.88      0.85       340

# Degree: 9
#               precision    recall  f1-score   support

#            0       0.89      0.98      0.93       299
#            1       0.50      0.12      0.20        41

#     accuracy                           0.88       340
#    macro avg       0.70      0.55      0.57       340
# weighted avg       0.84      0.88      0.85       340

# C = 2
# Degree: 3
#               precision    recall  f1-score   support

#            0       0.97      1.00      0.99       299
#            1       1.00      0.80      0.89        41

#     accuracy                           0.98       340
#    macro avg       0.99      0.90      0.94       340
# weighted avg       0.98      0.98      0.98       340

# Degree: 4
#               precision    recall  f1-score   support

#            0       0.90      0.99      0.94       299
#            1       0.71      0.24      0.36        41

#     accuracy                           0.90       340
#    macro avg       0.81      0.62      0.65       340
# weighted avg       0.88      0.90      0.87       340

# Degree: 5
#               precision    recall  f1-score   support

#            0       0.90      0.99      0.94       299
#            1       0.64      0.17      0.27        41

#     accuracy                           0.89       340
#    macro avg       0.77      0.58      0.60       340
# weighted avg       0.87      0.89      0.86       340

# Degree: 6
#               precision    recall  f1-score   support

#            0       0.90      0.98      0.94       299
#            1       0.58      0.17      0.26        41

#     accuracy                           0.89       340
#    macro avg       0.74      0.58      0.60       340
# weighted avg       0.86      0.89      0.86       340

# Degree: 7
#               precision    recall  f1-score   support

#            0       0.90      0.98      0.94       299
#            1       0.54      0.17      0.26        41

#     accuracy                           0.88       340
#    macro avg       0.72      0.58      0.60       340
# weighted avg       0.85      0.88      0.85       340

# Degree: 8
#               precision    recall  f1-score   support

#            0       0.90      0.98      0.94       299
#            1       0.54      0.17      0.26        41

#     accuracy                           0.88       340
#    macro avg       0.72      0.58      0.60       340
# weighted avg       0.85      0.88      0.85       340

# Degree: 9
#               precision    recall  f1-score   support

#            0       0.90      0.98      0.94       299
#            1       0.54      0.17      0.26        41

#     accuracy                           0.88       340
#    macro avg       0.72      0.58      0.60       340
# weighted avg       0.85      0.88      0.85       340

# C = 3
# Degree: 3
#               precision    recall  f1-score   support

#            0       0.99      1.00      0.99       299
#            1       0.97      0.90      0.94        41

#     accuracy                           0.99       340
#    macro avg       0.98      0.95      0.96       340
# weighted avg       0.99      0.99      0.99       340

# Degree: 4
#               precision    recall  f1-score   support

#            0       0.92      0.99      0.95       299
#            1       0.78      0.34      0.47        41

#     accuracy                           0.91       340
#    macro avg       0.85      0.66      0.71       340
# weighted avg       0.90      0.91      0.89       340

# Degree: 5
#               precision    recall  f1-score   support

#            0       0.91      0.98      0.94       299
#            1       0.69      0.27      0.39        41

#     accuracy                           0.90       340
#    macro avg       0.80      0.63      0.66       340
# weighted avg       0.88      0.90      0.88       340

# Degree: 6
#               precision    recall  f1-score   support

#            0       0.90      0.98      0.94       299
#            1       0.58      0.17      0.26        41

#     accuracy                           0.89       340
#    macro avg       0.74      0.58      0.60       340
# weighted avg       0.86      0.89      0.86       340

# Degree: 7
#               precision    recall  f1-score   support

#            0       0.90      0.98      0.94       299
#            1       0.54      0.17      0.26        41

#     accuracy                           0.88       340
#    macro avg       0.72      0.58      0.60       340
# weighted avg       0.85      0.88      0.85       340

# Degree: 8
#               precision    recall  f1-score   support

#            0       0.90      0.98      0.94       299
#            1       0.54      0.17      0.26        41

#     accuracy                           0.88       340
#    macro avg       0.72      0.58      0.60       340
# weighted avg       0.85      0.88      0.85       340

# Degree: 9
#               precision    recall  f1-score   support

#            0       0.90      0.98      0.93       299
#            1       0.50      0.17      0.25        41

#     accuracy                           0.88       340
#    macro avg       0.70      0.57      0.59       340
# weighted avg       0.85      0.88      0.85       340