# Using SVM to Predict Who is a Credit Risk

# 1. Necessary Imports

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

# 2. Import Data, Merge into a New Dataset

In [2]:
def import_data():
    payment_data = pd.read_csv("payment_data.csv")
    customer_data = pd.read_csv("customer_data.csv")
    df = pd.merge(customer_data,payment_data)
    
    return df
# df.shape = (8250, 24)

# 3. Preprocess

In [3]:
from sklearn.preprocessing import StandardScaler

def preprocess(df):
    df.drop_duplicates()
    df.dropna(inplace=True)
    # Encode categorical features
    df = pd.get_dummies(df, columns=['prod_code'])

    # Impute missing values
    df['prod_limit'].fillna(df['prod_limit'].mean(), inplace=True)
    df['fea_2'].fillna(df['fea_2'].median(), inplace=True)
    df['highest_balance'].fillna(df['highest_balance'].median(), inplace=True)
    df['update_date'] = pd.to_datetime(df['update_date'], format='%d/%m/%Y')
    df['report_date'] = pd.to_datetime(df['report_date'], format='%d/%m/%Y')
    df['update_date'] = df['update_date'].apply(lambda x: pd.Timestamp(x).timestamp())
    df['report_date'] = df['report_date'].apply(lambda x: pd.Timestamp(x).timestamp())

    # Scale numerical features
    scaler = StandardScaler()
    num_cols = ['fea_1', 'fea_2', 'fea_3', 'fea_4', 'fea_5', 'fea_6', 'fea_7', 'fea_8', 'fea_9', 'fea_10', 'fea_11', 'OVD_t1', 'OVD_t2', 'OVD_t3', 'OVD_sum', 'pay_normal', 'prod_limit', 'new_balance', 'highest_balance', 'update_date', 'report_date']
    df[num_cols] = scaler.fit_transform(df[num_cols])
    
    return df

# 4. Distribution of the Classes

In [4]:
# df.shape = (1697, 25)
# df['label'].value_counts()
def plot():
    low_credit_df = df[df['label'] == 0][0:200]
    high_credit_df = df[df['label'] == 1][0:200]
    axes = low_credit_df.plot(kind = 'scatter', x = 'OVD_t1', y = 'highest_balance', color = 'blue', label = 'low-risk')
    high_credit_df.plot(kind = 'scatter', x = 'OVD_t1', y = 'highest_balance', color = 'red', label = 'high-risk', ax = axes)

# 5. Remove Unwanted Columns

In [5]:
# Did not end up removing any columns.
def get_X_y(df):
    feature_df = df

    # Independent variables (features)
    X = np.asarray(feature_df)

    # Dependent variables
    y = np.asarray(df['label'])
    
    return X, y

# 6. Divide into training and test data

In [6]:
# feature_df (1697) -> Train (no. rows) / Test (no. rows) ## 80/20 split
# Train(X, y) ## X is a 2D array and y is a 1D array
# Test(X, y)
from sklearn.model_selection import train_test_split

def split():
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
    return X_train, X_test, y_train, y_test

# 7. Classifying using SVM Kernels

In [7]:
from sklearn import svm
# auto means 1/n_features
def linear_class(X_test, X_train, y_train):
    # C is the regularization parameter
    linear_classifier = svm.SVC(kernel = 'linear', gamma = 'auto', C = 0.1)
    linear_classifier.fit(X_train, y_train)
    y_predict = linear_classifier.predict(X_test)
    
    return y_predict

def poly_class(X_test, X_train, y_train):
    poly_classifier = svm.SVC(kernel = 'poly', gamma = 'auto', degree = 3,  C = 2)
    poly_classifier.fit(X_train, y_train)
    y_predict = poly_classifier.predict(X_test)
    
    return y_predict

def rbf_class(X_test, X_train, y_train):
    RBF_classifier = svm.SVC(kernel = 'rbf', gamma = 'auto', C = 2)
    RBF_classifier.fit(X_train, y_train)
    y_predict = RBF_classifier.predict(X_test)
    
    return y_predict
    
    

# 8. Evaluation Results

In [8]:
from sklearn.metrics import classification_report

def linear_results(X_test, X_train, y_train):
    y_predict = linear_class(X_test, X_train, y_train)
    print(classification_report(y_test, y_predict, zero_division = 0))
    
def poly_results(X_test, X_train, y_train):
    y_predict = poly_class(X_test, X_train, y_train)
    print(classification_report(y_test, y_predict, zero_division = 0))
    
def rbf_results(X_test, X_train, y_train):
    y_predict = rbf_class(X_test, X_train, y_train)
    print(classification_report(y_test, y_predict, zero_division = 0))

In [9]:
## To-Do: 
# 1. Make plots for different combinations of regularization (C parameter) and transformations (Kernels)

In [10]:
df = import_data()
df = preprocess(df)
X, y = get_X_y(df)
X_train, X_test, y_train, y_test = split()

In [11]:
poly_results(X_test, X_train, y_train)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       299
           1       0.12      1.00      0.22        41

    accuracy                           0.12       340
   macro avg       0.06      0.50      0.11       340
weighted avg       0.01      0.12      0.03       340



In [None]:
linear_results(X_test, X_train, y_train)

In [None]:
rbf_results(X_test, X_train, y_train)

In [None]:
# C = 0.1
#               precision    recall  f1-score   support

#            0       0.88      1.00      0.94       299
#            1       0.00      0.00      0.00        41

#     accuracy                           0.88       340
#    macro avg       0.44      0.50      0.47       340
# weighted avg       0.77      0.88      0.82       340

#               precision    recall  f1-score   support

#            0       0.00      0.00      0.00       299
#            1       0.12      1.00      0.22        41

#     accuracy                           0.12       340
#    macro avg       0.06      0.50      0.11       340
# weighted avg       0.01      0.12      0.03       340

#               precision    recall  f1-score   support

#            0       0.88      1.00      0.94       299
#            1       0.00      0.00      0.00        41

#     accuracy                           0.88       340
#    macro avg       0.44      0.50      0.47       340
# weighted avg       0.77      0.88      0.82       340

# C = 0.2
#                precision    recall  f1-score   support

#            0       0.88      1.00      0.94       299
#            1       0.00      0.00      0.00        41

#     accuracy                           0.88       340
#    macro avg       0.44      0.50      0.47       340
# weighted avg       0.77      0.88      0.82       340

#               precision    recall  f1-score   support

#            0       0.00      0.00      0.00       299
#            1       0.12      1.00      0.22        41

#     accuracy                           0.12       340
#    macro avg       0.06      0.50      0.11       340
# weighted avg       0.01      0.12      0.03       340

#               precision    recall  f1-score   support

#            0       0.88      1.00      0.94       299
#            1       0.00      0.00      0.00        41

#     accuracy                           0.88       340
#    macro avg       0.44      0.50      0.47       340
# weighted avg       0.77      0.88      0.82       340

# C = 0.3
#               precision    recall  f1-score   support

#            0       0.88      1.00      0.94       299
#            1       0.00      0.00      0.00        41

#     accuracy                           0.88       340
#    macro avg       0.44      0.50      0.47       340
# weighted avg       0.77      0.88      0.82       340

#               precision    recall  f1-score   support

#            0       0.00      0.00      0.00       299
#            1       0.12      1.00      0.22        41

#     accuracy                           0.12       340
#    macro avg       0.06      0.50      0.11       340
# weighted avg       0.01      0.12      0.03       340

#               precision    recall  f1-score   support

#            0       0.89      1.00      0.94       299
#            1       1.00      0.07      0.14        41

#     accuracy                           0.89       340
#    macro avg       0.94      0.54      0.54       340
# weighted avg       0.90      0.89      0.84       340

# C = 0.4
#               precision    recall  f1-score   support

#            0       0.88      1.00      0.94       299
#            1       0.00      0.00      0.00        41

#     accuracy                           0.88       340
#    macro avg       0.44      0.50      0.47       340
# weighted avg       0.77      0.88      0.82       340

#               precision    recall  f1-score   support

#            0       0.00      0.00      0.00       299
#            1       0.12      1.00      0.22        41

#     accuracy                           0.12       340
#    macro avg       0.06      0.50      0.11       340
# weighted avg       0.01      0.12      0.03       340

#               precision    recall  f1-score   support

#            0       0.92      1.00      0.96       299
#            1       1.00      0.37      0.54        41

#     accuracy                           0.92       340
#    macro avg       0.96      0.68      0.75       340
# weighted avg       0.93      0.92      0.91       340