In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# Load the engineered vehicle-loan dataset, using UNIQUEID as the row index.
# A raw string keeps the Windows path byte-for-byte identical while avoiding the
# invalid '\D' escape sequence the non-raw literal contained (a SyntaxWarning on
# recent Python versions).
# NOTE(review): this is an absolute, machine-specific path - consider pointing it at a
# configurable data directory so the notebook can be re-run elsewhere.
loan_df = pd.read_csv(
    r'C:\Users\Daniel-Panovest\Downloads\python ultra\files\data\vehicle_loans_feat.csv',
    index_col='UNIQUEID',
)

In [3]:
# Print a structural summary of the loan dataframe (index range, column names,
# non-null counts and dtypes) to check how the data looks at this stage of the work.
loan_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 233154 entries, 420825 to 630213
Data columns (total 33 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   DISBURSED_AMOUNT                     233154 non-null  float64
 1   ASSET_COST                           233154 non-null  float64
 2   LTV                                  233154 non-null  float64
 3   MANUFACTURER_ID                      233154 non-null  int64  
 4   EMPLOYMENT_TYPE                      233154 non-null  object 
 5   STATE_ID                             233154 non-null  int64  
 6   AADHAR_FLAG                          233154 non-null  int64  
 7   PAN_FLAG                             233154 non-null  int64  
 8   VOTERID_FLAG                         233154 non-null  int64  
 9   DRIVING_FLAG                         233154 non-null  int64  
 10  PASSPORT_FLAG                        233154 non-null  int64  
 11  PERFORM_

In [4]:
# Columns that hold categorical information; they are cast to the pandas "category"
# dtype so that downstream encoding treats them as categories rather than numbers/text.
cat_cols = [
    'MANUFACTURER_ID',
    'STATE_ID',
    'DISBURSAL_MONTH',
    'DISBURSED_CAT',
    'PERFORM_CNS_SCORE_DESCRIPTION',
    'EMPLOYMENT_TYPE',
]

# Convert the selected columns in one go and write them back onto the dataframe.
categorical_part = loan_df[cat_cols].astype('category')
loan_df[cat_cols] = categorical_part

# Display the resulting dtypes to confirm the cast took effect.
loan_df[cat_cols].dtypes

MANUFACTURER_ID                  category
STATE_ID                         category
DISBURSAL_MONTH                  category
DISBURSED_CAT                    category
PERFORM_CNS_SCORE_DESCRIPTION    category
EMPLOYMENT_TYPE                  category
dtype: object

In [5]:
# Restrict the model to a handful of columns (plus the LOAN_DEFAULT target) to keep
# the predictive model simple.
cols = [
    'STATE_ID',
    'LTV',
    'DISBURSED_CAT',
    'PERFORM_CNS_SCORE',
    'DISBURSAL_MONTH',
    'LOAN_DEFAULT',
]

# One-hot encode the categorical columns of that subset. drop_first=True omits the
# first level of each category so the dummy columns are not perfectly correlated.
loan_df_subset = pd.get_dummies(
    loan_df[cols],
    prefix_sep='_',
    drop_first=True,
)

# Summarise the encoded frame to verify the new dummy columns.
loan_df_subset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 233154 entries, 420825 to 630213
Data columns (total 40 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   LTV                     233154 non-null  float64
 1   PERFORM_CNS_SCORE       233154 non-null  float64
 2   LOAN_DEFAULT            233154 non-null  int64  
 3   STATE_ID_2              233154 non-null  uint8  
 4   STATE_ID_3              233154 non-null  uint8  
 5   STATE_ID_4              233154 non-null  uint8  
 6   STATE_ID_5              233154 non-null  uint8  
 7   STATE_ID_6              233154 non-null  uint8  
 8   STATE_ID_7              233154 non-null  uint8  
 9   STATE_ID_8              233154 non-null  uint8  
 10  STATE_ID_9              233154 non-null  uint8  
 11  STATE_ID_10             233154 non-null  uint8  
 12  STATE_ID_11             233154 non-null  uint8  
 13  STATE_ID_12             233154 non-null  uint8  
 14  STATE_ID_13    

In [6]:
# Separate the target (LOAN_DEFAULT) from the feature columns.
y = loan_df_subset['LOAN_DEFAULT']
x = loan_df_subset.drop(columns=['LOAN_DEFAULT'])

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Binary classifier: a logistic regression with default settings.
log_res = LogisticRegression()

In [7]:
# Compare the share of each LOAN_DEFAULT class in the train and test targets;
# normalize=True reports proportions instead of raw counts. The '\n' item combined
# with sep='\n' leaves blank lines between the two summaries.
print(
    y_train.value_counts(normalize=True),
    '\n',
    y_test.value_counts(normalize=True),
    sep='\n',
)

0    0.783099
1    0.216901
Name: LOAN_DEFAULT, dtype: float64


0    0.782248
1    0.217752
Name: LOAN_DEFAULT, dtype: float64


In [8]:
# Fit the logistic-regression classifier on the training split.
# With the estimator created as LogisticRegression() (default settings), this run stops
# at the solver's iteration limit and emits a convergence warning (see the output below);
# the model is rebuilt with a higher max_iter and refit in the following cell.
log_res.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [9]:
# The previous fit hit the solver's iteration limit (convergence warning), so the
# classifier is recreated with a larger iteration budget and trained again.
log_res = LogisticRegression(max_iter=200)
log_res.fit(X=x_train, y=y_train)

LogisticRegression(max_iter=200)

In [10]:
# score() predicts on the test features and returns the mean accuracy over ALL test
# rows (defaults and non-defaults alike) - it is not the share of default cases caught,
# so the message is worded as overall accuracy.
# NOTE(review): the test target is roughly 78% non-default (see the value_counts output
# above), so an accuracy near that figure is close to the majority-class baseline;
# confirm with a confusion matrix / recall before drawing conclusions.
test_accuracy = log_res.score(x_test, y_test)

# '.2%' renders the fraction as a percentage with two decimals (the earlier '0%' spec
# set no precision and therefore printed six decimals).
print(f'the binary classifier model classified {test_accuracy:.2%} of the test loans correctly (overall accuracy)')

the binary classifier model predicted 78.229075% loan default cases correctly
