In [107]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf



In [108]:
# Read the JSON file into a Pandas DataFrame
df = pd.read_json('Loan_approval_dataset.json')

df.head()

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,1,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
1,2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0
2,3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0
3,4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1
4,5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1


In [109]:
df.describe()


Unnamed: 0,Id,Income,Age,Experience,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
count,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0
mean,126000.5,4997117.0,49.954071,10.084437,6.333877,11.997794,0.123
std,72746.278255,2878311.0,17.063855,6.00259,3.647053,1.399037,0.328438
min,1.0,10310.0,21.0,0.0,0.0,10.0,0.0
25%,63000.75,2503015.0,35.0,5.0,3.0,11.0,0.0
50%,126000.5,5000694.0,50.0,10.0,6.0,12.0,0.0
75%,189000.25,7477502.0,65.0,15.0,9.0,13.0,0.0
max,252000.0,9999938.0,79.0,20.0,14.0,14.0,1.0


In [110]:
df.dtypes

Id                    int64
Income                int64
Age                   int64
Experience            int64
Married/Single       object
House_Ownership      object
Car_Ownership        object
Profession           object
CITY                 object
STATE                object
CURRENT_JOB_YRS       int64
CURRENT_HOUSE_YRS     int64
Risk_Flag             int64
dtype: object

In [111]:
unique_states_count = len(df["STATE"].unique())
print(unique_states_count)

29


In [112]:
state_app_counts = df['STATE'].value_counts()
state_app_counts

STATE
Uttar_Pradesh        28400
Maharashtra          25562
Andhra_Pradesh       25297
West_Bengal          23483
Bihar                19780
Tamil_Nadu           16537
Madhya_Pradesh       14122
Karnataka            11855
Gujarat              11408
Rajasthan             9174
Jharkhand             8965
Haryana               7890
Telangana             7524
Assam                 7062
Kerala                5805
Delhi                 5490
Punjab                4720
Odisha                4658
Chhattisgarh          3834
Uttarakhand           1874
Jammu_and_Kashmir     1780
Puducherry            1433
Mizoram                849
Manipur                849
Himachal_Pradesh       833
Tripura                809
Uttar_Pradesh[5]       743
Chandigarh             656
Sikkim                 608
Name: count, dtype: int64

In [113]:
# place some states (<2000 applications) into an 'OTHER' column to reduce dimensionality
states_to_replace = state_app_counts[state_app_counts < 2000].index.tolist()

for state in states_to_replace:
    df['STATE'] = df['STATE'].replace(state, "Other")

df['STATE'].value_counts()

STATE
Uttar_Pradesh     28400
Maharashtra       25562
Andhra_Pradesh    25297
West_Bengal       23483
Bihar             19780
Tamil_Nadu        16537
Madhya_Pradesh    14122
Karnataka         11855
Gujarat           11408
Other             10434
Rajasthan          9174
Jharkhand          8965
Haryana            7890
Telangana          7524
Assam              7062
Kerala             5805
Delhi              5490
Punjab             4720
Odisha             4658
Chhattisgarh       3834
Name: count, dtype: int64

In [114]:
unique_cities_count = len(df["CITY"].unique())
print(unique_cities_count)

317


In [115]:
# Drop the cities column - see if using just the states as geographic area works: to reduce dimensionality
df = df.drop("CITY", axis=1)

In [116]:
df['Profession'].value_counts()

Profession
Physician                     5957
Statistician                  5806
Web_designer                  5397
Psychologist                  5390
Computer_hardware_engineer    5372
Drafter                       5359
Magistrate                    5357
Fashion_Designer              5304
Air_traffic_controller        5281
Comedian                      5259
Industrial_Engineer           5250
Mechanical_engineer           5217
Chemical_engineer             5205
Technical_writer              5195
Hotel_Manager                 5178
Financial_Analyst             5167
Graphic_Designer              5166
Flight_attendant              5128
Biomedical_Engineer           5127
Secretary                     5061
Software_Developer            5053
Petroleum_Engineer            5041
Police_officer                5035
Computer_operator             4990
Politician                    4944
Microbiologist                4881
Technician                    4864
Artist                        4861
Lawyer   

In [117]:
# reindex the data frame to set the Id as the index
df.set_index('Id', inplace=True)

In [118]:
# loop through values in Married/Single column to make a new column with 0 for married, 1 for single
df = pd.get_dummies(df, columns=['Married/Single'],dtype= int)

df.head()

Unnamed: 0_level_0,Income,Age,Experience,House_Ownership,Car_Ownership,Profession,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag,Married/Single_married,Married/Single_single
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1303834,23,3,rented,no,Mechanical_engineer,Madhya_Pradesh,3,13,0,0,1
2,7574516,40,10,rented,no,Software_Developer,Maharashtra,9,13,0,0,1
3,3991815,66,4,rented,no,Technical_writer,Kerala,4,10,0,1,0
4,6256451,41,2,rented,yes,Software_Developer,Odisha,2,12,1,0,1
5,5768871,47,11,rented,no,Civil_servant,Tamil_Nadu,3,14,1,0,1


In [119]:
# Encode Ownership columns with get_dummies

df= pd.get_dummies(df, columns=['Car_Ownership'],dtype= int)
df=pd.get_dummies(df,columns=['House_Ownership'],dtype= int)
df.head()

Unnamed: 0_level_0,Income,Age,Experience,Profession,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag,Married/Single_married,Married/Single_single,Car_Ownership_no,Car_Ownership_yes,House_Ownership_norent_noown,House_Ownership_owned,House_Ownership_rented
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,1303834,23,3,Mechanical_engineer,Madhya_Pradesh,3,13,0,0,1,1,0,0,0,1
2,7574516,40,10,Software_Developer,Maharashtra,9,13,0,0,1,1,0,0,0,1
3,3991815,66,4,Technical_writer,Kerala,4,10,0,1,0,1,0,0,0,1
4,6256451,41,2,Software_Developer,Odisha,2,12,1,0,1,0,1,0,0,1
5,5768871,47,11,Civil_servant,Tamil_Nadu,3,14,1,0,1,1,0,0,0,1


In [120]:
# drop the married column  and the Car_Ownership no column to reduce the dimensionality, but retain binary nature of this information
df = df.drop("Married/Single_married", axis=1)
df = df.drop("Car_Ownership_no", axis=1)
df.head()

Unnamed: 0_level_0,Income,Age,Experience,Profession,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag,Married/Single_single,Car_Ownership_yes,House_Ownership_norent_noown,House_Ownership_owned,House_Ownership_rented
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,1303834,23,3,Mechanical_engineer,Madhya_Pradesh,3,13,0,1,0,0,0,1
2,7574516,40,10,Software_Developer,Maharashtra,9,13,0,1,0,0,0,1
3,3991815,66,4,Technical_writer,Kerala,4,10,0,0,0,0,0,1
4,6256451,41,2,Software_Developer,Odisha,2,12,1,1,1,0,0,1
5,5768871,47,11,Civil_servant,Tamil_Nadu,3,14,1,1,0,0,0,1


In [121]:
# Encode the profession column with get_dummies
df= pd.get_dummies(df, columns=['Profession'],dtype= int)

In [123]:
# Encode the State column with git_dummies
df=pd.get_dummies(df, columns=['STATE'], dtype=int)
df.head()

Unnamed: 0_level_0,Income,Age,Experience,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag,Married/Single_single,Car_Ownership_yes,House_Ownership_norent_noown,House_Ownership_owned,...,STATE_Madhya_Pradesh,STATE_Maharashtra,STATE_Odisha,STATE_Other,STATE_Punjab,STATE_Rajasthan,STATE_Tamil_Nadu,STATE_Telangana,STATE_Uttar_Pradesh,STATE_West_Bengal
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1303834,23,3,3,13,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,7574516,40,10,9,13,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,3991815,66,4,4,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6256451,41,2,2,12,1,1,1,0,0,...,0,0,1,0,0,0,0,0,0,0
5,5768871,47,11,3,14,1,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [124]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = df["Risk_Flag"]

# Separate the X variable, the features
X = df.drop("Risk_Flag", axis=1)

In [127]:
df.head()

Unnamed: 0_level_0,Income,Age,Experience,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag,Married/Single_single,Car_Ownership_yes,House_Ownership_norent_noown,House_Ownership_owned,...,STATE_Madhya_Pradesh,STATE_Maharashtra,STATE_Odisha,STATE_Other,STATE_Punjab,STATE_Rajasthan,STATE_Tamil_Nadu,STATE_Telangana,STATE_Uttar_Pradesh,STATE_West_Bengal
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1303834,23,3,3,13,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,7574516,40,10,9,13,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,3991815,66,4,4,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6256451,41,2,2,12,1,1,1,0,0,...,0,0,1,0,0,0,0,0,0,0
5,5768871,47,11,3,14,1,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [125]:
# Split the data using train_test_split
# Assign a random_state of 5 to the function
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=5, stratify=y)

In [126]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [128]:
df.to_csv("Loan_approval_for_training.csv")

In [129]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
inputs = 82
layer1_units = 150
layer2_units = 90
layer3_units = 30
layer4_units = 16
layer1_activation = "relu"
layer2_activation = "relu"
layer3_activation = "tanh"
layer4_activation = "tanh"

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units = layer1_units, activation = layer1_activation, input_dim = inputs))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units = layer2_units, activation = layer2_activation))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units = layer3_units, activation = layer2_activation))

#Fourth hidden layer
nn.add(tf.keras.layers.Dense(units = layer4_units, activation = layer4_activation))

# Output layer
nn.add(tf.keras.layers.Dense(units = 1, activation = "sigmoid"))

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [130]:
# Compile the model
nn.compile(loss = "binary_crossentropy", optimizer = "adam", metrics=["accuracy"])

In [133]:
import math
batch_size=32
num_batches= len(X_train_scaled)/batch_size
num_batches= math.ceil(num_batches)
num_batches

5907

In [134]:
from tensorflow import keras.callbacks

# Define the filepath for saving the model weights
filepath = "/content/drive/MyDrive/Colab Notebooks/Model_Weights/model_weights_{epoch:02d}.h5"

# Create a ModelCheckpoint callback to save weights every five epochs
checkpoint_callback = ModelCheckpoint(filepath, save_weights_only=True, save_freq=5*num_batches, verbose=3)

ModuleNotFoundError: No module named 'tf'

In [132]:
# Train the model
nn_fit = nn.fit(X_train_scaled, y_train, epochs=25)

Epoch 1/25


ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "dense" is incompatible with the layer: expected axis -1 of input shape to have value 82, but received input with shape (None, 81)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(None, 81), dtype=float32)
  • training=True
  • mask=None