In [1]:
# Importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Importing the dataset
credit_data = pd.read_csv("credit_risk.csv")


In [2]:
credit_data.head()

Unnamed: 0,over_draft,credit_usage,credit_history,purpose,current_balance,Average_Credit_Balance,employment,location,personal_status,other_parties,...,property_magnitude,cc_age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,critical/other existing credit,radio/tv,1169,no known savings,>=7,4,male single,none,...,real estate,67,none,own,2,skilled,1,yes,yes,good
1,0<=X<200,48,existing paid,radio/tv,5951,<100,1<=X<4,2,female div/dep/mar,none,...,real estate,22,none,own,1,skilled,1,none,yes,bad
2,no checking,12,critical/other existing credit,education,2096,<100,4<=X<7,2,male single,none,...,real estate,49,none,own,1,unskilled resident,2,none,yes,good
3,<0,42,existing paid,furniture/equipment,7882,<100,4<=X<7,2,male single,guarantor,...,life insurance,45,none,for free,1,skilled,2,none,yes,good
4,<0,24,delayed previously,new car,4870,<100,1<=X<4,3,male single,none,...,no known property,53,none,for free,2,skilled,2,none,yes,bad


In [3]:
# Understanding the values the 'class' column (our target column in this analysis) can take
credit_data['class'].unique()


array(['good', 'bad'], dtype=object)

In [4]:
# Selecting predictors as all columns except the 'class' column
X = credit_data.columns.drop("class")
# Setting the target as the 'class' column
y = credit_data['class']
# Encoding all the features/predictor variables using the get_dummies method()
credit_data_encoded = pd.get_dummies(credit_data[X])
# Checking the shape of the input data
credit_data_encoded.shape


(1000, 61)

In [5]:
# Importing the required module
from sklearn.model_selection import train_test_split
#splitting data into train and test datasets in 85:15 ratio
X_train,X_test,y_train,y_test = train_test_split(credit_data_encoded, y,test_size=0.15,random_state=100)
# Checking the shapes of the resulting datasets
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)


Shape of X_train: (850, 61)
Shape of y_train: (850,)
Shape of X_test: (150, 61)
Shape of y_test: (150,)


In [6]:
# Importing the required class.
from sklearn.linear_model import LogisticRegression
# Instantiating the required algorithm for model building.
model = LogisticRegression()
# Building the model based on the training data.
model.fit(X_train,y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [7]:
# Getting the accuracy on training data
train_accuracy = model.score(X_train,y_train)
print("Train accuracy = ", train_accuracy)
# Getting the accuracy on test data
test_accuracy = model.score(X_test,y_test)
print("Test accuracy = ", test_accuracy)


Train accuracy =  0.7752941176470588
Test accuracy =  0.74


In [8]:
# Predicting targets based on the model built
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)
# Importing the required function
from sklearn.metrics import confusion_matrix
# Creating a confusion matrix on the training data
train_conf_matrix = confusion_matrix(y_train,train_predictions)
# Converting the train_conf_matrix into a DataFrame for better readability
pd.DataFrame(train_conf_matrix,columns=model.classes_,index=model.classes_)


Unnamed: 0,bad,good
bad,125,132
good,59,534


The rows of a Confusion Matrix represent the actual target values and the columns represent the predicted target values.

In the above matrix for training data, we can observe that the model predicted -

140 actually 'bad' credit risks as 'bad' 

117 actually 'bad' credit risks as 'good' 

60 actually 'good' credit risks as 'bad' 

533 actually 'good' credit risks as 'good'

In [9]:
# Confusion matrix for the test data
test_conf_matrix = confusion_matrix(y_test,test_predictions)
pd.DataFrame(test_conf_matrix,columns=model.classes_,index=model.classes_)

Unnamed: 0,bad,good
bad,19,24
good,15,92


In [10]:
# Calculating train accuracy from confusion matrix
train_correct_predictions = train_conf_matrix[0][0]+train_conf_matrix[1][1]
train_total_predictions = train_conf_matrix.sum()
train_accuracy = train_correct_predictions/train_total_predictions

print(train_accuracy)

0.7752941176470588


In [11]:
# Calculating test accuracy from confusion matrix
test_correct_predictions = test_conf_matrix[0][0]+test_conf_matrix[1][1]
total_predictions = test_conf_matrix.sum()
test_accuracy = test_correct_predictions/total_predictions

print(test_accuracy)

0.74


In [12]:
# Importing the required function
from sklearn.metrics import classification_report

# Generating the report and printing the same
print(classification_report(y_test,test_predictions))

              precision    recall  f1-score   support

         bad       0.56      0.44      0.49        43
        good       0.79      0.86      0.83       107

    accuracy                           0.74       150
   macro avg       0.68      0.65      0.66       150
weighted avg       0.73      0.74      0.73       150

