<a href="https://colab.research.google.com/github/brandon-setegn/fta-724-machine-learning-ai/blob/master/Module_2_Improvements.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Improvements on Module ML Credit Scoring
What effect does lowering the number of predictor variables have on the models?

In [18]:
#let's import the libraries we are going to need
import os, csv
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import statsmodels.api as sm
#%matplotlib notebook
%matplotlib inline
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas deprecation/FutureWarnings
# raised by later cells — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [19]:
# Colab-specific setup: attach Google Drive to the runtime's filesystem.
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the project folder so that the relative
# CSV filename used when loading the data resolves.
# NOTE(review): this is a user-specific absolute path — other users must adjust it.
os.chdir('/content/drive/MyDrive/Development/machine_learning/fta724')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
# Load the credit data set and normalise every column header to lower case
# so later cells can refer to columns with a single consistent spelling.
df = (
    pd.read_csv('module_2_lending_club_credit.csv', index_col=False)
    .rename(columns=str.lower)
)

In [21]:
# Preview the loaded frame; the rendered output abbreviates the middle rows.
df

Unnamed: 0,yob,nkid,dep,phon,sinc,aes,dainc,res,dhval,dmort,doutm,doutl,douthp,doutcc,bad
0,19,4,0,1,0,R,0,O,14464,4,0,0,0,0,0
1,41,2,0,1,0,P,36000,O,0,0,280,664,0,80,0
2,66,0,0,1,0,N,30000,N,0,0,0,0,0,0,0
3,51,2,0,1,0,P,464,O,24928,8464,584,320,0,60,0
4,65,0,0,1,0,P,15000,P,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1220,52,4,1,1,0,V,37764,U,0,0,0,340,0,0,0
1221,56,3,0,1,1200,P,31500,O,6928,46464,784,256,0,0,1
1222,60,0,0,1,0,E,46800,O,11392,928,1144,680,0,108,0
1223,20,0,0,1,0,R,0,N,0,0,0,0,0,0,1


A brief description of the variables in this dataset is as follows:

> **Bad:** Good/bad indicator: 1 = Bad, 0 = Good.

> **yob:** Year of birth (If unknown the year will be 99).

> **nkid:** Number of children.

> **dep:** Number of other dependents.

> **phon:** Is there a home phone (1=yes, 0 = no)

> **sinc:** Spouse's income.

> **aes:** Applicant's employment status: V = Government, W = housewife, M = military, P = private sector, B = public sector, R = retired, E = self employed, T = student, U = unemployed, N = others, Z  = no response.

> **dainc:** Applicant's income.

> **res:** Residential status: O = Owner, F = tenant furnished, U = Tenant Unfurnished, P = With parents, N = Other, Z = No response.

> **dhval:** Value of Home: 0 = no response or not owner, 000001 = zero value, blank = no response.

> **dmort:** Mortgage balance outstanding: 0 = no response or not owner, 000001 = zero balance, blank = no response.

> **doutm:** Outgoings on mortgage or rent.

> **doutl:** Outgoings on Loans.

> **douthp:** Outgoings on Hire Purchase.

> **doutcc:** Outgoings on credit card.

In [22]:
# Total dependents per applicant: children (nkid) plus other dependents (dep).
df['ndep_kids'] = df['nkid'].add(df['dep'])

In [23]:
# Impute applicant income: rows where dainc is 0 are replaced with the mean of
# the non-zero incomes.
#
# The column is loaded as integers while the mean is fractional. Writing a float
# into an integer column through .loc relies on an implicit upcast that recent
# pandas versions deprecate (the warning is hidden here because warnings are
# globally filtered), so cast to float explicitly before assigning.
df['dainc'] = df['dainc'].astype(float)

has_income = df['dainc'] != 0
mean_income = df.loc[has_income, 'dainc'].mean()

df.loc[~has_income, 'dainc'] = mean_income

# Sanity-check the distribution after imputation (min should no longer be 0).
df['dainc'].describe()

count     1225.000000
mean     25538.919529
std      12703.432640
min        464.000000
25%      17100.000000
50%      25538.919529
75%      30600.000000
max      64800.000000
Name: dainc, dtype: float64

In [24]:
# Collapse the employment-status codes (aes) into one binary indicator:
# 1 when the code is V, M, P or B, otherwise 0.
# NOTE(review): 'E' (self employed) is not counted as employed — confirm this is intended.
df = df.assign(
    is_employed=lambda frame: frame['aes'].isin(['V', 'M', 'P', 'B']).astype(int)
)
df.head()

Unnamed: 0,yob,nkid,dep,phon,sinc,aes,dainc,res,dhval,dmort,doutm,doutl,douthp,doutcc,bad,ndep_kids,is_employed
0,19,4,0,1,0,R,25538.919529,O,14464,4,0,0,0,0,0,4,0
1,41,2,0,1,0,P,36000.0,O,0,0,280,664,0,80,0,2,1
2,66,0,0,1,0,N,30000.0,N,0,0,0,0,0,0,0,0,0
3,51,2,0,1,0,P,464.0,O,24928,8464,584,320,0,60,0,2,1
4,65,0,0,1,0,P,15000.0,P,0,0,0,0,0,0,0,0,1


In [25]:
# Collapse residential status (res) into one binary indicator:
# 1 when the applicant is an owner (code 'O'), otherwise 0.
df['is_home_owner'] = df['res'].eq('O').astype(int)
df.head()

Unnamed: 0,yob,nkid,dep,phon,sinc,aes,dainc,res,dhval,dmort,doutm,doutl,douthp,doutcc,bad,ndep_kids,is_employed,is_home_owner
0,19,4,0,1,0,R,25538.919529,O,14464,4,0,0,0,0,0,4,0,1
1,41,2,0,1,0,P,36000.0,O,0,0,280,664,0,80,0,2,1,1
2,66,0,0,1,0,N,30000.0,N,0,0,0,0,0,0,0,0,0,0
3,51,2,0,1,0,P,464.0,O,24928,8464,584,320,0,60,0,2,1,1
4,65,0,0,1,0,P,15000.0,P,0,0,0,0,0,0,0,0,1,0


In [26]:
# Check the balance of the new indicator (its mean is the share of owners).
df['is_home_owner'].describe()

count    1225.000000
mean        0.509388
std         0.500116
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: is_home_owner, dtype: float64

In [27]:
# List the current column names to confirm the engineered features were added.
df.columns

Index(['yob', 'nkid', 'dep', 'phon', 'sinc', 'aes', 'dainc', 'res', 'dhval',
       'dmort', 'doutm', 'doutl', 'douthp', 'doutcc', 'bad', 'ndep_kids',
       'is_employed', 'is_home_owner'],
      dtype='object')

In [28]:
# Remove predictors that are no longer needed: nkid/dep, aes and res were
# summarised above as ndep_kids, is_employed and is_home_owner; yob and phon
# are removed outright.
#
# errors='ignore' keeps the cell safe to re-run: without it a second execution
# raises KeyError because the columns have already been dropped.
df = df.drop(
    columns=['yob', 'nkid', 'dep', 'phon', 'aes', 'res'],
    errors='ignore',
)

In [30]:
# Summary statistics for the remaining columns, including the engineered indicators.
df.describe()

Unnamed: 0,sinc,dainc,dhval,dmort,doutm,doutl,douthp,doutcc,bad,ndep_kids,is_employed,is_home_owner
count,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0,1225.0
mean,1990.084898,25538.919529,15693.857959,11225.691429,342.004898,121.926531,28.721633,39.595102,0.263673,0.662041,0.665306,0.509388
std,4802.341425,12703.43264,20736.331833,18889.207107,427.993865,839.639588,119.324084,168.697101,0.440804,1.056448,0.472076,0.500116
min,0.0,464.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,17100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,25538.919529,0.0,0.0,256.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
75%,1040.0,30600.0,28928.0,20000.0,528.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
max,50000.0,64800.0,64928.0,64000.0,3800.0,28000.0,1600.0,2800.0,1.0,5.0,1.0,1.0
