### Data Gathering and Import

First, we use pandas to read the dataset into a data frame.

We'll also display the first few rows so we get an idea of what the data looks like.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the financial dataset; the relative path is resolved against the
# kernel's working directory.
DATA_PATH = 'financial_data.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first ten rows to get a feel for the columns and their values.
df.head(n=10)

Unnamed: 0,Age,Gender,Education Level,Marital Status,Income,Credit Score,Loan Amount,Loan Purpose,Employment Status,Years at Current Job,Payment History,Debt-to-Income Ratio,Assets Value,Number of Dependents,City,State,Country,Previous Defaults,Marital Status Change,Risk Rating
0,49,Male,PhD,Divorced,72799.0,688.0,45713.0,Business,Unemployed,19,Poor,0.154313,120228.0,0.0,Port Elizabeth,AS,Cyprus,2.0,2,Low
1,57,Female,Bachelor's,Widowed,,690.0,33835.0,Auto,Employed,6,Fair,0.14892,55849.0,0.0,North Catherine,OH,Turkmenistan,3.0,2,Medium
2,21,Non-binary,Master's,Single,55687.0,600.0,36623.0,Home,Employed,8,Fair,0.362398,180700.0,3.0,South Scott,OK,Luxembourg,3.0,2,Medium
3,59,Male,Bachelor's,Single,26508.0,622.0,26541.0,Personal,Unemployed,2,Excellent,0.454964,157319.0,3.0,Robinhaven,PR,Uganda,4.0,2,Medium
4,25,Non-binary,Bachelor's,Widowed,49427.0,766.0,36528.0,Personal,Unemployed,10,Fair,0.143242,287140.0,,New Heather,IL,Namibia,3.0,1,Low
5,30,Non-binary,PhD,Divorced,,717.0,15613.0,Business,Unemployed,5,Fair,0.295984,,4.0,Brianland,TN,Iceland,3.0,1,Medium
6,31,Non-binary,Master's,Widowed,45280.0,672.0,6553.0,Personal,Self-employed,1,Good,0.37889,,,West Lindaview,MD,Bouvet Island (Bouvetoya),0.0,1,Low
7,18,Male,Bachelor's,Widowed,93678.0,,,Business,Unemployed,10,Poor,0.396636,246597.0,1.0,Melissahaven,MA,Honduras,1.0,1,Low
8,32,Non-binary,Bachelor's,Widowed,20205.0,710.0,,Auto,Unemployed,4,Fair,0.335965,227599.0,0.0,North Beverly,DC,Pitcairn Islands,4.0,2,Low
9,55,Male,Bachelor's,Married,32190.0,600.0,29918.0,Personal,Self-employed,5,Excellent,0.484333,130507.0,4.0,Davidstad,VT,Thailand,,2,Low


### Data Cleanup and Normalization

##### Data Cleanup
Many rows contain missing (null/NaN) values. We will:

- Locate the missing values
- Replace missing values with the median for that specific column

In [4]:
# Count the null/NaN entries in every column to see where imputation is needed.
# (isna is pandas' alias for isnull.)
missing_vals = df.isna().sum()
print(missing_vals)

Age                         0
Gender                      0
Education Level             0
Marital Status              0
Income                   2250
Credit Score             2250
Loan Amount              2250
Loan Purpose                0
Employment Status           0
Years at Current Job        0
Payment History             0
Debt-to-Income Ratio        0
Assets Value             2250
Number of Dependents     2250
City                        0
State                       0
Country                     0
Previous Defaults        2250
Marital Status Change       0
Risk Rating                 0
dtype: int64


In [5]:
# Columns that the per-column null count reported as incomplete.
cols_missing_vals = [
    'Income',
    'Credit Score',
    'Loan Amount',
    'Assets Value',
    'Number of Dependents',
    'Previous Defaults',
]

# One median per incomplete column (NaN entries are skipped by median()).
medians = df[cols_missing_vals].median()

# Filling with a Series matches on column name, so only the columns listed in
# `medians` are touched; every other column is left as it was.
# NOTE(review): the medians are computed over the whole dataset, i.e. before
# the train/test split performed later in the notebook, so test rows influence
# the imputed values. Confirm this is acceptable for the evaluation.
df = df.fillna(value=medians)

In [6]:
# Re-count the nulls after the median fill to confirm nothing is left missing.
missing_vals_per_col = df.isna().sum()

# Collapse the per-column counts into a single dataset-wide total.
total_missing_vals = missing_vals_per_col.sum()

print(missing_vals_per_col)
print("\nTotal missing values in the entire dataset: ", total_missing_vals)

Age                      0
Gender                   0
Education Level          0
Marital Status           0
Income                   0
Credit Score             0
Loan Amount              0
Loan Purpose             0
Employment Status        0
Years at Current Job     0
Payment History          0
Debt-to-Income Ratio     0
Assets Value             0
Number of Dependents     0
City                     0
State                    0
Country                  0
Previous Defaults        0
Marital Status Change    0
Risk Rating              0
dtype: int64

Total missing values in the entire dataset:  0


In [7]:
# Remove the location columns: City/State/Country have too many distinct
# categories to be useful here.
# errors='ignore' keeps this cell safe to re-run: without it, a second
# execution raises KeyError because the columns have already been dropped.
LOCATION_COLUMNS = ['City', 'State', 'Country']
df = df.drop(columns=LOCATION_COLUMNS, errors='ignore')

In [8]:
# Display the cleaned data frame: missing numeric values have been replaced by
# column medians and the City/State/Country columns have been dropped.
# Jupyter shows only the first and last rows of a long frame.
df

Unnamed: 0,Age,Gender,Education Level,Marital Status,Income,Credit Score,Loan Amount,Loan Purpose,Employment Status,Years at Current Job,Payment History,Debt-to-Income Ratio,Assets Value,Number of Dependents,Previous Defaults,Marital Status Change,Risk Rating
0,49,Male,PhD,Divorced,72799.0,688.0,45713.0,Business,Unemployed,19,Poor,0.154313,120228.0,0.0,2.0,2,Low
1,57,Female,Bachelor's,Widowed,69773.0,690.0,33835.0,Auto,Employed,6,Fair,0.148920,55849.0,0.0,3.0,2,Medium
2,21,Non-binary,Master's,Single,55687.0,600.0,36623.0,Home,Employed,8,Fair,0.362398,180700.0,3.0,3.0,2,Medium
3,59,Male,Bachelor's,Single,26508.0,622.0,26541.0,Personal,Unemployed,2,Excellent,0.454964,157319.0,3.0,4.0,2,Medium
4,25,Non-binary,Bachelor's,Widowed,49427.0,766.0,36528.0,Personal,Unemployed,10,Fair,0.143242,287140.0,2.0,3.0,1,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,23,Non-binary,Bachelor's,Widowed,48088.0,609.0,26187.0,Home,Self-employed,2,Fair,0.317633,159362.0,4.0,2.0,0,Low
14996,56,Male,PhD,Single,107193.0,700.0,35111.0,Auto,Self-employed,10,Fair,0.155126,79102.0,2.0,0.0,0,Medium
14997,29,Non-binary,PhD,Married,46250.0,642.0,44369.0,Home,Unemployed,19,Excellent,0.593999,196930.0,4.0,2.0,1,High
14998,53,Non-binary,PhD,Divorced,40180.0,638.0,32752.0,Home,Self-employed,12,Excellent,0.478035,276060.0,2.0,0.0,2,High


### Machine Learning Setup

Importing the necessary libraries and splitting the data into training (70%) and test (30%) sets:

In [9]:
from sklearn.model_selection import train_test_split

# Select the target and the features by column name instead of by position.
# The positional form (column 16 / columns 0-15) only works while the frame has
# exactly this column layout; adding, removing or reordering a column upstream
# would silently pick the wrong target. On the current frame both forms select
# the same data.
TARGET_COLUMN = 'Risk Rating'
Y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=100
)

### Data Transformation and Conversion

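Using scikit-learn pipelines: numerical columns are KNN-imputed and standardized, while categorical columns are imputed with their most frequent value and one-hot encoded. A `ColumnTransformer` routes each column to the matching pipeline.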

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numeric features: fill any remaining gaps from the 6 nearest neighbours,
# then standardize to zero mean / unit variance.
NumericalProcessing = Pipeline(
    [
        ("knn_imputer", KNNImputer(n_neighbors=6)),
        ("standard_scalar", StandardScaler()),
    ]
)

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' encodes a category that was not seen during
# fit as an all-zero row instead of raising at transform/score time.
CategoricalProcessing = Pipeline(
    [
        ("simple_imputer", SimpleImputer(strategy='most_frequent')),
        ("one_hot_encoder", OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route columns by dtype. Columns whose dtype matches neither selection are not
# passed to either pipeline.
numerical_values = X.select_dtypes(include=['float64', 'int64']).columns
categorical_values = X.select_dtypes(include=['object']).columns

# Apply each sub-pipeline to its own group of columns and concatenate the results.
ColumnTransformation = ColumnTransformer(
    [
        ('number_columns', NumericalProcessing, numerical_values),
        ('category_columns', CategoricalProcessing, categorical_values),
    ]
)

### Machine Learning Algorithms

In [11]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.svm import SVC

##### Random Forests

In [12]:
# Random forest on top of the shared preprocessing and a 32-component PCA.
rf_steps = [
    ('transform_columns', ColumnTransformation),
    ('pca', PCA(n_components=32)),
    ('rf', RandomForestClassifier(random_state=100)),
]
random_forests_pipe = Pipeline(rf_steps)
random_forests_pipe.fit(X_train, Y_train)

# Score on the held-out data first, then on the training data, as percentages.
# NOTE(review): the recorded output shows 100% training vs ~60% test accuracy,
# which points to heavy overfitting; consider limiting tree depth.
rf_test_pct = random_forests_pipe.score(X_test, Y_test) * 100
rf_train_pct = random_forests_pipe.score(X_train, Y_train) * 100
print("Random Forests Accuracy(test data):", rf_test_pct, "%")
print("Random Forests Accuracy(training data):", rf_train_pct, "%")

Random Forests Accuracy(test data): 59.91111111111111 %
Random Forests Accuracy(training data): 100.0 %


##### K-Nearest Neighbours

In [13]:
# 5-nearest-neighbours classifier on top of the shared preprocessing and PCA.
knn_steps = [
    ('transform_columns', ColumnTransformation),
    ('pca', PCA(n_components=32)),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
]
KNN_pipe = Pipeline(knn_steps)
KNN_pipe.fit(X_train, Y_train)

# Score on the held-out data first, then on the training data, as percentages.
knn_test_pct = KNN_pipe.score(X_test, Y_test) * 100
knn_train_pct = KNN_pipe.score(X_train, Y_train) * 100
print("K-Nearest Neighbours Accuracy(test data):", knn_test_pct, "%")
print("K-Nearest Neighbours Accuracy(training data):", knn_train_pct, "%")

K-Nearest Neighbours Accuracy(test data): 53.08888888888889 %
K-Nearest Neighbours Accuracy(training data): 66.27619047619048 %


##### Gradient Boosting

In [14]:
# Gradient-boosted trees on top of the shared preprocessing and PCA.
# random_state is fixed (matching the seed used for the split and the random
# forest) so that re-running the notebook reproduces the reported accuracies.
gradient_boosting_pipe = Pipeline(
    [
        ('transform_columns', ColumnTransformation),
        ('pca', PCA(n_components=32)),
        ('gb', GradientBoostingClassifier(random_state=100)),
    ]
)

gradient_boosting_pipe.fit(X_train, Y_train)

# Score on the held-out data first, then on the training data, as percentages.
print("Gradient Boosting Accuracy(test data):", gradient_boosting_pipe.score(X_test, Y_test) * 100, "%")
print("Gradient Boosting Accuracy(training data):", gradient_boosting_pipe.score(X_train, Y_train) * 100, "%")

Gradient Boosting Accuracy(test data): 60.199999999999996 %
Gradient Boosting Accuracy(training data): 61.64761904761905 %


##### Naive Bayes

In [15]:
# Gaussian naive Bayes on top of the shared preprocessing and PCA.
nb_steps = [
    ('transform_columns', ColumnTransformation),
    ('pca', PCA(n_components=32)),
    ('nb', GaussianNB()),
]
naive_bayes_pipe = Pipeline(nb_steps)
naive_bayes_pipe.fit(X_train, Y_train)

# Score on the held-out data first, then on the training data, as percentages.
nb_test_pct = naive_bayes_pipe.score(X_test, Y_test) * 100
nb_train_pct = naive_bayes_pipe.score(X_train, Y_train) * 100
print("Naive Bayes Accuracy(test data):", nb_test_pct, "%")
print("Naive Bayes Accuracy(training data): ", nb_train_pct, "%")

Naive Bayes Accuracy(test data): 60.46666666666667 %
Naive Bayes Accuracy(training data):  59.8 %


##### Logistic Regression

In [16]:
# Logistic regression on top of the shared preprocessing and PCA.
logreg_steps = [
    ('transform_columns', ColumnTransformation),
    ('pca', PCA(n_components=32)),
    ('logReg', linear_model.LogisticRegression()),
]
logistic_regression_pipe = Pipeline(logreg_steps)
logistic_regression_pipe.fit(X_train, Y_train)

# Score on the held-out data first, then on the training data, as percentages.
# NOTE(review): the recorded test accuracy (60.47%) is identical to the Naive
# Bayes and SVC cells; possibly all three predict a single class for every
# row. Confirm with a confusion matrix.
logreg_test_pct = logistic_regression_pipe.score(X_test, Y_test) * 100
logreg_train_pct = logistic_regression_pipe.score(X_train, Y_train) * 100
print("Logistic Regression Accuracy(test data):", logreg_test_pct, "%")
print("Logistic Regression Accuracy(training data):", logreg_train_pct, "%")

Logistic Regression Accuracy(test data): 60.46666666666667 %
Logistic Regression Accuracy(training data): 59.8 %


##### SVC

In [17]:
# Support-vector classifier (default settings) on top of the shared
# preprocessing and PCA.
svc_steps = [
    ('transform_columns', ColumnTransformation),
    ('pca', PCA(n_components=32)),
    ('svc', SVC()),
]
svc_pipe = Pipeline(svc_steps)
svc_pipe.fit(X_train, Y_train)

# Score on the held-out data first, then on the training data, as percentages.
svc_test_pct = svc_pipe.score(X_test, Y_test) * 100
svc_train_pct = svc_pipe.score(X_train, Y_train) * 100
print("SVC Accuracy(test data):", svc_test_pct, "%")
print("SVC Accuracy(training data):", svc_train_pct, "%")

SVC Accuracy(test data): 60.46666666666667 %
SVC Accuracy(training data): 59.80952380952381 %
