In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load loan_prediction datasets
import pandas as pd

X_train = pd.read_csv('loan_prediction/X_train.csv')
Y_train = pd.read_csv('loan_prediction/Y_train.csv')

X_test = pd.read_csv('loan_prediction/X_test.csv')
Y_test = pd.read_csv('loan_prediction/Y_test.csv')

In [3]:
# Let's try to create a Logistic Regression model on our dataset
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
log = LogisticRegression()
log.fit(X_train, Y_train)

log_y = log.predict(X_test)
accuracy_score(Y_test, log_y)

ValueError: could not convert string to float: 'Semiurban'

In [4]:
# In order to create a model with categorical features, 
# we must first encode them

### Using LabelEncoder

In [6]:
# Current categorical features
X_train.select_dtypes(['object']).head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Property_Area
0,LP001032,Male,No,0,Graduate,No,Urban
1,LP001824,Male,Yes,1,Graduate,No,Semiurban
2,LP002928,Male,Yes,0,Graduate,No,Semiurban
3,LP001814,Male,Yes,2,Graduate,No,Urban
4,LP002244,Male,Yes,0,Graduate,No,Urban


In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for col in X_test.columns.values:
    if X_test[col].dtypes == 'object': # Check if feature is categorical
        data = X_train[col].append(X_test[col])
        le.fit(data.values)
        X_train[col] = le.transform(X_train[col])
        X_test[col] = le.transform(X_test[col])

In [8]:
# After encoding categorical features
X_train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,13,1,0,0,0,0,4950,0.0,125,360,1,2
1,193,1,1,1,0,0,2882,1843.0,123,480,1,1
2,461,1,1,0,0,0,3000,3416.0,56,180,1,1
3,191,1,1,2,0,0,9703,0.0,112,360,1,2
4,300,1,1,0,0,0,2333,2417.0,136,360,1,2


In [9]:
# Modeling with continuous & encoded categorical features
from sklearn.preprocessing import scale

# First standardize features
X_train_scale = scale(X_train)
X_test_scale = scale(X_test)

In [14]:
# Now fit the Logistic Regression model from above
log.fit(X_train_scale, Y_train['Target'])
log_y = log.predict(X_test_scale)
accuracy_score(Y_test, log_y)

0.73958333333333337

In [15]:
# Logistic Regression was not improved by addition of categorical features
# What about K-Nearest Neighbors model? 
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scale, Y_train['Target'])

knn_y = knn.predict(X_test_scale)
accuracy_score(Y_test, knn_y)

0.76041666666666663