In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.utils import check_random_state

In [2]:
data = pd.read_csv('cleaned_dataset.csv')
data.head()

Unnamed: 0,Age,Gender,Years at Address,Employment Status,Country,Current Debt,Postcode,Income,Housing,CCJs,Loan Amount,Outcome
0,19,M,2,Unemployed,UK,0.0,TA3 7SH,45500.0,Rent,1,13234.0,Paid
1,66,F,13,Unemployed,UK,1080.0,WV6 8SU,21000.0,Own,0,5561.0,Paid
2,48,F,4,Self Employed,UK,3690.0,BT15 5HG,47500.0,Rent,1,28288.0,Paid
3,67,F,42,Self Employed,UK,6560.0,GU10 3NH,36000.0,Mortgage,0,30199.0,Paid
4,70,M,8,Self Employed,UK,9100.0,GL6 6UB,50500.0,Own,0,35078.0,Paid


In [3]:
X = data.drop(['Postcode', 'Outcome'], axis=1)

In [4]:
y = data['Outcome']

In [9]:
features = pd.get_dummies(X)
features

Unnamed: 0,Age,Years at Address,Current Debt,Income,CCJs,Loan Amount,Gender_F,Gender_M,Employment Status_Employed,Employment Status_Retired,Employment Status_Self Employed,Employment Status_Unemployed,Country_France,Country_Germany,Country_Spain,Country_UK,Housing_Mortgage,Housing_Own,Housing_Rent
0,19,2,0.0,45500.0,1,13234.0,0,1,0,0,0,1,0,0,0,1,0,0,1
1,66,13,1080.0,21000.0,0,5561.0,1,0,0,0,0,1,0,0,0,1,0,1,0
2,48,4,3690.0,47500.0,1,28288.0,1,0,0,0,1,0,0,0,0,1,0,0,1
3,67,42,6560.0,36000.0,0,30199.0,1,0,0,0,1,0,0,0,0,1,1,0,0
4,70,8,9100.0,50500.0,0,35078.0,0,1,0,0,1,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,60,4,2900.0,22500.0,1,15725.0,1,0,0,0,1,0,0,0,0,1,0,1,0
1996,24,7,5130.0,38000.0,0,35911.0,1,0,0,0,1,0,0,0,0,1,0,1,0
1997,34,5,6530.0,41500.0,2,6874.0,0,1,0,0,0,1,0,0,0,1,0,1,0
1998,52,2,930.0,32500.0,0,5412.0,1,0,1,0,0,0,0,0,0,1,0,1,0


In [11]:
le = LabelEncoder().fit(y)
labels = le.transform(y)
labels

array([1, 1, 1, ..., 1, 1, 0])

In [12]:
np.unique(labels, return_counts=True)

(array([0, 1]), array([ 882, 1118], dtype=int64))

In [15]:
data['Outcome'].value_counts()

Paid         1118
Defaulted     882
Name: Outcome, dtype: int64

In [29]:
random_state = check_random_state(0)
features = np.array(features)
permutation = random_state.permutation(features.shape[0])
features = features[permutation]
labels = labels[permutation]
features = features.reshape((features.shape[0], -1))

In [75]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.1, random_state=1234)

In [76]:
X_train.shape

(1800, 19)

In [77]:
X_test.shape

(200, 19)

In [78]:
y_train.shape

(1800,)

In [79]:
y_test.shape

(200,)

In [80]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [81]:
cfl = LogisticRegression(C=50.0/X_train.shape[0], penalty='l1', solver='saga', tol=0.1)
cfl.fit(X_train, y_train)

LogisticRegression(C=0.027777777777777776, penalty='l1', solver='saga', tol=0.1)

In [82]:
sparsity = np.mean(cfl.coef_ == 0) * 100
score = cfl.score(X_test, y_test)

In [83]:
print("Sparsity with L1 penalty: %.2f%%" % sparsity)
print("Test score with L1 penalty: %.4f" % score)

Sparsity with L1 penalty: 26.32%
Test score with L1 penalty: 0.7450


In [84]:
preds = cfl.predict(X_test)
accuracy_score(y_test, preds)

0.745

In [43]:
data

Unnamed: 0,Age,Gender,Years at Address,Employment Status,Country,Current Debt,Postcode,Income,Housing,CCJs,Loan Amount,Outcome
0,19,M,2,Unemployed,UK,0.0,TA3 7SH,45500.0,Rent,1,13234.0,Paid
1,66,F,13,Unemployed,UK,1080.0,WV6 8SU,21000.0,Own,0,5561.0,Paid
2,48,F,4,Self Employed,UK,3690.0,BT15 5HG,47500.0,Rent,1,28288.0,Paid
3,67,F,42,Self Employed,UK,6560.0,GU10 3NH,36000.0,Mortgage,0,30199.0,Paid
4,70,M,8,Self Employed,UK,9100.0,GL6 6UB,50500.0,Own,0,35078.0,Paid
...,...,...,...,...,...,...,...,...,...,...,...,...
1995,60,F,4,Self Employed,UK,2900.0,PR4 4RF,22500.0,Own,1,15725.0,Paid
1996,24,F,7,Self Employed,UK,5130.0,PR7 5PA,38000.0,Own,0,35911.0,Paid
1997,34,M,5,Unemployed,UK,6530.0,RG18 9PH,41500.0,Own,2,6874.0,Paid
1998,52,F,2,Employed,UK,930.0,RG40 2JX,32500.0,Own,0,5412.0,Paid
