In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

# Read the raw loan-application table from disk and preview its first rows.
TRAIN_DATA_PATH = './train_dataset.csv.xls'
trainingDf = pd.read_csv(TRAIN_DATA_PATH)

first_rows = trainingDf.head()
print(first_rows)



    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0   

In [2]:
# Report which columns of the raw frame contain at least one missing value.
print("============COLUMNS WITH EMPTY VALUE=============")
column_has_missing = trainingDf.isna().any()
print(list(trainingDf.columns[column_has_missing]))

['Gender', 'Married', 'Dependents', 'Self_Employed', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']


In [3]:
# Clean the raw frame in one non-mutating chain so the cell can be re-run safely.
#
# Steps:
#   1. Drop the Loan_ID identifier column (errors='ignore' keeps the cell
#      idempotent when Loan_ID has already been removed by an earlier run).
#   2. Fill missing categorical values with the placeholder label 'unknown' and
#      missing Dependents with the string '0', so Dependents stays a
#      single-typed column instead of mixing int 0 with string labels.
#   3. Drop rows that lack LoanAmount, Loan_Amount_Term or Credit_History.
#   4. Map the open-ended Dependents label '3+' to 4 and cast the column to int.
#      The result is assigned back instead of calling
#      Series.replace(inplace=True) on a column selection, which does not
#      update the frame under pandas Copy-on-Write.
trainingDf = (
    trainingDf
    .drop(columns=['Loan_ID'], errors='ignore')
    .fillna({
        'Gender': 'unknown',
        'Married': 'unknown',
        'Self_Employed': 'unknown',
        'Dependents': '0',
    })
    .dropna(subset=['LoanAmount', 'Loan_Amount_Term', 'Credit_History'])
    .assign(
        Dependents=lambda frame: frame['Dependents'].replace({'3+': '4'}).astype(int)
    )
)

In [4]:
# Sanity check after cleaning: list columns that still hold missing values
# and show how many rows are left.
print("============COLUMNS WITH EMPTY VALUE=============")
columns_still_missing = [
    column_name
    for column_name, has_missing in trainingDf.isna().any().items()
    if has_missing
]
print(columns_still_missing)
print("Remaining rows after dropping data:", trainingDf.shape[0])


[]
Remaining rows after dropping data: 529


In [5]:
# Print the distinct labels found in each categorical feature.
print("============UNIQUE VALUE FOR CATEGORICAL DATA=============")
for column_name in ('Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area'):
    print(column_name + ": ", trainingDf[column_name].unique())
print('\n')

Gender:  ['Male' 'Female' 'unknown']
Married:  ['Yes' 'No' 'unknown']
Education:  ['Graduate' 'Not Graduate']
Self_Employed:  ['No' 'Yes' 'unknown']
Property_Area:  ['Rural' 'Urban' 'Semiurban']




In [6]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split


# Hold out 20% of the cleaned rows for evaluation; the fixed random_state keeps
# the split reproducible across runs.
train, test = train_test_split(trainingDf, test_size=0.2, random_state=23)
print(train.head())

# Separate the features from the Loan_Status target. DataFrame.drop already
# returns a new frame, so no explicit copy is needed.
X_train, y_train = train.drop(columns=['Loan_Status']), train['Loan_Status']
X_test, y_test = test.drop(columns=['Loan_Status']), test['Loan_Status']

Categorical_columns = ["Gender", "Married", "Education", "Self_Employed", "Property_Area"]
Numerical_columns = ["Dependents", "ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term", "Credit_History"]

Categorical_X_train = X_train[Categorical_columns]
Categorical_X_test = X_test[Categorical_columns]

Numerical_X_train = X_train[Numerical_columns]
Numerical_X_test = X_test[Numerical_columns]

# One-hot encode the categorical features, fitting on the training split only.
# handle_unknown='ignore' encodes a category that appears only in the test
# split as an all-zero group instead of raising, so the cell does not depend on
# a lucky split for rare labels.
encoder = preprocessing.OneHotEncoder(handle_unknown='ignore')
encoder.fit(Categorical_X_train)
Categorical_X_train_encoded = encoder.transform(Categorical_X_train).toarray()
Categorical_X_test_encoded = encoder.transform(Categorical_X_test).toarray()

print(Numerical_X_train.head())

# Rescale the numerical features with the ranges learned from the training
# split. NOTE: despite its name, standard_Scaler is a MinMaxScaler; the name is
# kept so later cells that reference it keep working.
standard_Scaler = preprocessing.MinMaxScaler()
standard_Scaler.fit(Numerical_X_train)
Numerical_X_train_encoded = standard_Scaler.transform(Numerical_X_train)
Numerical_X_test_encoded = standard_Scaler.transform(Numerical_X_test)

# Encode the target labels as integers using the classes seen in training.
y_encoder = preprocessing.LabelEncoder()
y_encoder.fit(y_train)
Y_train_encoded = y_encoder.transform(y_train)
Y_test_encoded = y_encoder.transform(y_test)

# Final design matrices: one-hot columns first, scaled numerical columns after.
Combined_X_train_encoded = np.concatenate((Categorical_X_train_encoded, Numerical_X_train_encoded), axis=1)
Combined_X_test_encoded = np.concatenate((Categorical_X_test_encoded, Numerical_X_test_encoded), axis=1)


     Gender Married Dependents     Education Self_Employed  ApplicantIncome  \
330    Male      No          1      Graduate            No             4384   
410  Female      No          1  Not Graduate           Yes             3867   
217    Male     Yes          0      Graduate            No             3727   
143    Male     Yes          0      Graduate            No             2698   
525    Male     Yes          2      Graduate           Yes            17500   

     CoapplicantIncome  LoanAmount  Loan_Amount_Term  Credit_History  \
330             1793.0       117.0             360.0             1.0   
410                0.0        62.0             360.0             1.0   
217             1775.0       131.0             360.0             1.0   
143             2034.0       122.0             360.0             1.0   
525                0.0       400.0             360.0             1.0   

    Property_Area Loan_Status  
330         Urban           Y  
410     Semiurban           

In [7]:
#Sample use case of encoded data

from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Fit an RBF-kernel SVM on the combined encoded features, then print a
# classification report for the training split and for the held-out split.
print("============ RBF Kernal =============")
clf = SVC(kernel='rbf')
clf.fit(Combined_X_train_encoded, Y_train_encoded)

for split_heading, split_features, split_labels in (
    ("Training: ", Combined_X_train_encoded, Y_train_encoded),
    ("Testing: ", Combined_X_test_encoded, Y_test_encoded),
):
    print(split_heading)
    print(classification_report(split_labels, clf.predict(split_features)))

Training: 
              precision    recall  f1-score   support

           0       0.88      0.40      0.55       121
           1       0.80      0.98      0.88       302

    accuracy                           0.81       423
   macro avg       0.84      0.69      0.72       423
weighted avg       0.82      0.81      0.79       423

Testing: 
              precision    recall  f1-score   support

           0       1.00      0.55      0.71        42
           1       0.77      1.00      0.87        64

    accuracy                           0.82       106
   macro avg       0.89      0.77      0.79       106
weighted avg       0.86      0.82      0.81       106

