# Using SVM to Predict Who is a Credit Risk

# 1. Necessary Imports

In [2]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

# 2. Import Data, Merge into a New Dataset

In [3]:
def import_data():
    payment_data = pd.read_csv("payment_data.csv")
    customer_data = pd.read_csv("customer_data.csv")
    df = pd.merge(customer_data,payment_data)
    
    return df
# df.shape = (8250, 24)

# 3. Preprocess

In [4]:
from sklearn.preprocessing import StandardScaler

def preprocess(df):
    df.drop_duplicates()
    df.dropna(inplace=True)
    # Encode categorical features
    df = pd.get_dummies(df, columns=['prod_code'])

    # Impute missing values
    df['prod_limit'].fillna(df['prod_limit'].mean(), inplace=True)
    df['fea_2'].fillna(df['fea_2'].median(), inplace=True)
    df['highest_balance'].fillna(df['highest_balance'].median(), inplace=True)
    df['update_date'] = pd.to_datetime(df['update_date'], format='%d/%m/%Y')
    df['report_date'] = pd.to_datetime(df['report_date'], format='%d/%m/%Y')
    df['update_date'] = df['update_date'].apply(lambda x: pd.Timestamp(x).timestamp())
    df['report_date'] = df['report_date'].apply(lambda x: pd.Timestamp(x).timestamp())

    # Scale numerical features
    scaler = StandardScaler()
    num_cols = ['fea_1', 'fea_2', 'fea_3', 'fea_4', 'fea_5', 'fea_6', 'fea_7', 'fea_8', 'fea_9', 'fea_10', 'fea_11', 'OVD_t1', 'OVD_t2', 'OVD_t3', 'OVD_sum', 'pay_normal', 'prod_limit', 'new_balance', 'highest_balance', 'update_date', 'report_date']
    df[num_cols] = scaler.fit_transform(df[num_cols])
    
    return df

# 4. Distribution of the Classes

In [5]:
# df.shape = (1697, 25)
# df['label'].value_counts()
def plot():
    low_credit_df = df[df['label'] == 0][0:200]
    high_credit_df = df[df['label'] == 1][0:200]
    axes = low_credit_df.plot(kind = 'scatter', x = 'OVD_t1', y = 'highest_balance', color = 'blue', label = 'low-risk')
    high_credit_df.plot(kind = 'scatter', x = 'OVD_t1', y = 'highest_balance', color = 'red', label = 'high-risk', ax = axes)

# 5. Identify Unwanted Rows

In [6]:
# Converting the date columns to numeric values, only converts the non-missing data. Then sets the data type of the columns to
# int instead of object.
# df = df[pd.to_numeric(df['update_date'], errors = 'coerce').notnull()]
# df = df[pd.to_numeric(df['report_date'], errors = 'coerce').notnull()]
# df['update_date'] = df['update_date'].astype('int')
# df['report_date'] = df['report_date'].astype('int')

# 6. Remove Unwanted Columns

In [7]:
# Removing the update_date and report_date columns from dataset.

def get_X_y(df):
    feature_df = df

    # Independent variables (features)
    X = np.asarray(feature_df)

    # Dependent variable
    y = np.asarray(df['label'])
    
    return X, y

#feature_df.dtypes

In [8]:
# print(X[0:5])
# print(y[0:5])

# 7. Divide into training and test data

In [19]:
# feature_df (1697) -> Train (no. rows) / Test (no. rows) ## 80/20 split
# Train(X, y) ## X is a 2D array and y is a 1D array
# Test(X, y)
from sklearn.model_selection import train_test_split

def split():
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
    return X_train, X_test, y_train, y_test

In [10]:
# Verify splits
# X_train.shape ## (1357, 25)
# y_train.shape ## (1357,)
# X_test.shape ## (340, 25)
# y_test.shape ## (340,)

In [11]:
from sklearn import svm
# auto means 1/n_features
def classify(X_test, X, y):
    linear_classifier = svm.SVC(kernel = 'linear', gamma = 'auto', C = 0.1)
    linear_classifier.fit(X, y)
    y_linear_predict = linear_classifier.predict(X_test)
    poly_classifier = svm.SVC(kernel = 'poly', gamma = 'auto', C = 0.1)
    poly_classifier.fit(X, y)
    y_poly_predict = poly_classifier.predict(X_test)
    RBF_classifier = svm.SVC(kernel = 'rbf', gamma = 'auto', C = 0.1)
    RBF_classifier.fit(X, y)
    y_RBF_predict = RBF_classifier.predict(X_test)
    
    return y_linear_predict, y_poly_predict, y_RBF_predict

# Evaluation Results

In [12]:
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report

def results(X_test, X, y):
    y_linear_predict, y_poly_predict, y_RBF_predict = classify(X_test, X, y)
    print(classification_report(y_test, y_linear_predict, zero_division = 0))
    print(classification_report(y_test, y_poly_predict, zero_division = 0))
    print(classification_report(y_test, y_RBF_predict, zero_division = 0))

In [13]:
## To-Do: Add regularization (lambda)

In [14]:
df = import_data()
df = preprocess(df)

In [15]:
X, y = get_X_y(df)

In [20]:
X_train, X_test, y_train, y_test = split()

In [21]:
results(X_test, X, y)

              precision    recall  f1-score   support

           0       0.88      1.00      0.94       299
           1       0.00      0.00      0.00        41

    accuracy                           0.88       340
   macro avg       0.44      0.50      0.47       340
weighted avg       0.77      0.88      0.82       340

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       299
           1       0.12      1.00      0.22        41

    accuracy                           0.12       340
   macro avg       0.06      0.50      0.11       340
weighted avg       0.01      0.12      0.03       340

              precision    recall  f1-score   support

           0       0.88      1.00      0.94       299
           1       0.00      0.00      0.00        41

    accuracy                           0.88       340
   macro avg       0.44      0.50      0.47       340
weighted avg       0.77      0.88      0.82       340

