In this post I will share an example that uses Principal Component Analysis (PCA) as a dimensionality-reduction tool to prepare the data for logistic regression prediction in Python.

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as ex
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [11]:
# Load processed.csv, drop every column whose name starts with "Unnamed",
# drop the customerid / systemloanid columns, and keep only rows that have
# no missing values.
df = (
    pd.read_csv('/content/processed.csv')
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
    .drop(columns=['customerid', 'systemloanid'])
    .dropna()
)
df.head()  # preview the first five rows

Unnamed: 0,bank_branch_clients,loannumber,loanamount,totaldue,termdays,good_bad_flag,referred,realage,TipoInteresAhora,bank_account_Current,...,employment_status_client_Self-Employed,employment_status_client_Student,employment_status_client_Unemployed,employment_status_client_Unknown,number_of_Loans,accumlated_loan_given,accumlated_timediff,time-to-payoff,accumulated_money_won,is_late_for_firstpay
0,0.0,2.0,10000.0,13000.0,30.0,1.0,1.0,48.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,10000.0,3667000000000.0,1300528000000000.0,1500.0,0.0
1,0.0,2.0,10000.0,13000.0,30.0,0.0,0.0,36.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,10000.0,3668000000000.0,2840792000000000.0,3000.0,1.0
2,0.0,4.0,10000.0,13000.0,30.0,1.0,0.0,35.0,1.0,0.0,...,0.0,0.0,0.0,1.0,3.0,30000.0,10835000000000.0,5520142000000000.0,6000.0,1.0
3,0.0,2.0,10000.0,11500.0,15.0,1.0,0.0,30.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,10000.0,3668000000000.0,2873995000000000.0,1500.0,1.0
4,0.0,6.0,20000.0,24500.0,30.0,1.0,0.0,31.0,0.75,0.0,...,0.0,0.0,0.0,0.0,5.0,70000.0,18098000000000.0,1.179669e+16,18000.0,0.0


In [12]:
# Target: the good_bad_flag column.
y = df['good_bad_flag']
# Features: every column except the target, standardized with StandardScaler
# in a single step so that x is only ever bound to the scaled array.
x = StandardScaler().fit_transform(df.loc[:, df.columns != 'good_bad_flag'])

We will start with only the first 2 principal components, and then explore using 3 and 4 principal components.

In [13]:
# Project the standardized features onto the first two principal components.
pca = PCA(n_components=2)
PC = pca.fit_transform(x)
principalDF = pd.DataFrame(data=PC, columns=['pc1', 'pc2'])

# pd.concat(axis=1) aligns rows by index label. principalDF carries a fresh
# 0..n-1 index, whereas df still has the labels that survived dropna(), so the
# target's index is reset first to pair each row with its components by
# position (otherwise dropped rows would cause misalignment and NaN rows).
target = df[['good_bad_flag']].reset_index(drop=True)
finalDf = pd.concat([principalDF, target], axis=1)
finalDf.head()

Unnamed: 0,pc1,pc2,good_bad_flag
0,-2.753636,-0.502676,1.0
1,-2.582214,0.82247,0.0
2,-1.874872,-0.211891,1.0
3,-2.708986,0.696773,1.0
4,0.221011,-1.223886,1.0


To assess how much weight each feature will carry in later predictions, we can construct a loadings table. The loadings show how much each of our original features has contributed to each of the “new features” — the principal components.

In [14]:
# Loadings: each component's weight vector scaled by the square root of that
# component's explained variance (one row per original feature).
PCloadings = pca.components_.T * np.sqrt(pca.explained_variance_)

# Feature names in the same order as the columns fed to the PCA. The target is
# excluded by name (same mask used to build x) rather than by a hard-coded
# position, so a change in column order cannot mislabel the loadings.
components = df.columns[df.columns != 'good_bad_flag'].tolist()

loadingdf = pd.DataFrame(PCloadings, columns=['PC1', 'PC2'])
loadingdf["variable"] = components
loadingdf


Unnamed: 0,PC1,PC2,variable
0,0.023311,0.321851,bank_branch_clients
1,0.950287,-0.131185,loannumber
2,0.899165,-0.264556,loanamount
3,0.890737,-0.268732,totaldue
4,0.528805,-0.174681,termdays
5,-0.290131,-0.073155,referred
6,-0.000513,-0.016127,realage
7,-0.901864,0.245038,TipoInteresAhora
8,-0.018982,0.098876,bank_account_Current
9,0.64744,0.447532,bank_account_Other


Now we can plot the loadings and see which of them have high weightings in both principal component 1 and 2:

In [15]:
# Scatter each variable's PC1 loading against its PC2 loading, labelling every
# point with the variable name just below the marker.
fig = ex.scatter(
    x=loadingdf['PC1'],
    y=loadingdf['PC2'],
    text=loadingdf['variable'],
)
fig.update_layout(title_text='loadings plot', height=600, width=500)
fig.update_traces(textposition='bottom center')

# Reference axes through the origin: first the vertical line (x = 0), then the
# horizontal line (y = 0), each spanning -1 to 1.
for axis_line in (
    dict(x0=0, y0=-1, x1=0, y1=1),
    dict(x0=-1, y0=0, x1=1, y1=0),
):
    fig.add_shape(
        type="line",
        line=dict(color="RoyalBlue", width=3),
        **axis_line,
    )
fig.show()

It is clear that `is_late_for_firstpay` and `exceeds_loan_term_days` are two heavily weighted features.