In [49]:
#importing and pre-processing data
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 



In [50]:
# Read the data into a Pandas DataFrame
loan_info_df = pd.read_json('Resources/loan_approval_dataset.json')
loan_info_df.set_index('Id', inplace=True)
loan_info_df.drop_duplicates()
loan_info_df.head()


Unnamed: 0_level_0,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0
3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0
4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1
5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1


In [51]:
loan_info_df.tail()

Unnamed: 0_level_0,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
251996,8154883,43,13,single,rented,no,Surgeon,Kolkata,West_Bengal,6,11,0
251997,2843572,26,10,single,rented,no,Army_officer,Rewa,Madhya_Pradesh,6,11,0
251998,4522448,46,7,single,rented,no,Design_Engineer,Kalyan-Dombivli,Maharashtra,7,12,0
251999,6507128,45,0,single,rented,no,Graphic_Designer,Pondicherry,Puducherry,0,10,0
252000,9070230,70,17,single,rented,no,Statistician,Avadi,Tamil_Nadu,7,11,0


In [52]:
loan_info_df.describe()
loan_info_df.info()
loan_info_df.columns

<class 'pandas.core.frame.DataFrame'>
Int64Index: 252000 entries, 1 to 252000
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Income             252000 non-null  int64 
 1   Age                252000 non-null  int64 
 2   Experience         252000 non-null  int64 
 3   Married/Single     252000 non-null  object
 4   House_Ownership    252000 non-null  object
 5   Car_Ownership      252000 non-null  object
 6   Profession         252000 non-null  object
 7   CITY               252000 non-null  object
 8   STATE              252000 non-null  object
 9   CURRENT_JOB_YRS    252000 non-null  int64 
 10  CURRENT_HOUSE_YRS  252000 non-null  int64 
 11  Risk_Flag          252000 non-null  int64 
dtypes: int64(6), object(6)
memory usage: 25.0+ MB


Index(['Income', 'Age', 'Experience', 'Married/Single', 'House_Ownership',
       'Car_Ownership', 'Profession', 'CITY', 'STATE', 'CURRENT_JOB_YRS',
       'CURRENT_HOUSE_YRS', 'Risk_Flag'],
      dtype='object')

In [53]:
loan_info_df.isna().sum()

Income               0
Age                  0
Experience           0
Married/Single       0
House_Ownership      0
Car_Ownership        0
Profession           0
CITY                 0
STATE                0
CURRENT_JOB_YRS      0
CURRENT_HOUSE_YRS    0
Risk_Flag            0
dtype: int64

In [59]:
unique_state_counts = loan_info_df['STATE'].value_counts()

for state, count in unique_state_counts.items():
    print(f" {state}, {count}")

 Uttar_Pradesh, 28400
 Maharashtra, 25562
 Andhra_Pradesh, 25297
 West_Bengal, 23483
 Bihar, 19780
 Tamil_Nadu, 16537
 Madhya_Pradesh, 14122
 Karnataka, 11855
 Gujarat, 11408
 Rajasthan, 9174
 Jharkhand, 8965
 Haryana, 7890
 Telangana, 7524
 Assam, 7062
 Kerala, 5805
 Delhi, 5490
 Punjab, 4720
 Odisha, 4658
 Chhattisgarh, 3834
 Uttarakhand, 1874
 Jammu_and_Kashmir, 1780
 Puducherry, 1433
 Mizoram, 849
 Manipur, 849
 Himachal_Pradesh, 833
 Tripura, 809
 Uttar_Pradesh[5], 743
 Chandigarh, 656
 Sikkim, 608


In [60]:
#
loan_info_df['Profession'].value_counts()

Physician                     5957
Statistician                  5806
Web_designer                  5397
Psychologist                  5390
Computer_hardware_engineer    5372
Drafter                       5359
Magistrate                    5357
Fashion_Designer              5304
Air_traffic_controller        5281
Comedian                      5259
Industrial_Engineer           5250
Mechanical_engineer           5217
Chemical_engineer             5205
Technical_writer              5195
Hotel_Manager                 5178
Financial_Analyst             5167
Graphic_Designer              5166
Flight_attendant              5128
Biomedical_Engineer           5127
Secretary                     5061
Software_Developer            5053
Petroleum_Engineer            5041
Police_officer                5035
Computer_operator             4990
Politician                    4944
Microbiologist                4881
Technician                    4864
Artist                        4861
Lawyer              

In [61]:
marital_status = loan_info_df['Married/Single'].value_counts()

for status, count in marital_status.items():
    print(f" {status}, {count}")

 single, 226272
 married, 25728


In [62]:
risk_status = loan_info_df['Risk_Flag'].value_counts()

for risks, count in risk_status.items():
    print(f" {risks}, {count}")

 0, 221004
 1, 30996


In [54]:
loan_info_df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Income,252000.0,,,,4997116.665325,2878311.013611,10310.0,2503015.0,5000694.5,7477502.0,9999938.0
Age,252000.0,,,,49.954071,17.063855,21.0,35.0,50.0,65.0,79.0
Experience,252000.0,,,,10.084437,6.00259,0.0,5.0,10.0,15.0,20.0
Married/Single,252000.0,2.0,single,226272.0,,,,,,,
House_Ownership,252000.0,3.0,rented,231898.0,,,,,,,
Car_Ownership,252000.0,2.0,no,176000.0,,,,,,,
Profession,252000.0,51.0,Physician,5957.0,,,,,,,
CITY,252000.0,317.0,Vijayanagaram,1259.0,,,,,,,
STATE,252000.0,29.0,Uttar_Pradesh,28400.0,,,,,,,
CURRENT_JOB_YRS,252000.0,,,,6.333877,3.647053,0.0,3.0,6.0,9.0,14.0


In [55]:
loan_info_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Income,252000.0,4997117.0,2878311.0,10310.0,2503015.0,5000694.5,7477502.0,9999938.0
Age,252000.0,49.95407,17.06385,21.0,35.0,50.0,65.0,79.0
Experience,252000.0,10.08444,6.00259,0.0,5.0,10.0,15.0,20.0
CURRENT_JOB_YRS,252000.0,6.333877,3.647053,0.0,3.0,6.0,9.0,14.0
CURRENT_HOUSE_YRS,252000.0,11.99779,1.399037,10.0,11.0,12.0,13.0,14.0
Risk_Flag,252000.0,0.123,0.3284379,0.0,0.0,0.0,0.0,1.0


In [31]:
# Export DataFrame as CSV files.
loan_info_df.to_csv("Resources/loan_dataset.csv", index=False)

### Step 2: Create the labels set (`y`)  from the “Risk_Flag” column, and then create the features (`X`) DataFrame from the remaining columns.

In [33]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = loan_info_df['Risk_Flag']

# Separate the X variable, the features
X = loan_info_df.drop(columns=['Risk_Flag'])

In [34]:
# Review the y variable Series
y.head()

Id
1    0
2    0
3    0
4    1
5    1
Name: Risk_Flag, dtype: int64

In [35]:
# Review the X variable DataFrame
X.head()

Unnamed: 0_level_0,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13
2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13
3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10
4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12
5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14


### 3. Encode the categorical variables from the features data using `get_dummies`.

In [41]:
# Encode the categorical variables using get_dummies
X = pd.get_dummies(X)

# Review the features data
X.head()

Unnamed: 0_level_0,Income,Age,Experience,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Married/Single_married,Married/Single_single,House_Ownership_norent_noown,House_Ownership_owned,House_Ownership_rented,...,STATE_Punjab,STATE_Rajasthan,STATE_Sikkim,STATE_Tamil_Nadu,STATE_Telangana,STATE_Tripura,STATE_Uttar_Pradesh,STATE_Uttar_Pradesh[5],STATE_Uttarakhand,STATE_West_Bengal
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1303834,23,3,3,13,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,7574516,40,10,9,13,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,3991815,66,4,4,10,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,6256451,41,2,2,12,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,5768871,47,11,3,14,0,1,0,0,1,...,0,0,0,1,0,0,0,0,0,0


### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [42]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [43]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_regression_model = LogisticRegression(solver='lbfgs', random_state=42)

# Fit the model using training data
lr_model = logistic_regression_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [44]:
# Make a prediction using the testing data
prediction_test = logistic_regression_model.predict(X_test)
pd.DataFrame({'Predictions': prediction_test, 'Actual': y_test})

Unnamed: 0_level_0,Predictions,Actual
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
75256,0,0
192436,0,0
154840,0,0
59775,0,0
63789,0,0
...,...,...
8108,0,1
79395,0,0
3539,0,1
220664,0,0


### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [45]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, prediction_test)

array([[55208,     0],
       [ 7792,     0]])

In [63]:
# Print the classification report for the model
target_names = ['Non Risky Application', 'Flagged Risky Application']
print(classification_report(y_test, prediction_test, target_names=target_names))

                           precision    recall  f1-score   support

    Non Risky Application       0.88      1.00      0.93     55208
Flagged Risky Application       0.00      0.00      0.00      7792

                 accuracy                           0.88     63000
                macro avg       0.44      0.50      0.47     63000
             weighted avg       0.77      0.88      0.82     63000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (non-risk loan) and `1` (high-risk loan) labels?

**Answer:** The logistic regression model was 95% accurate at predicting the healthy vs high-risk loan labels