In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Exploratory Data Analysis (EDA)

In [None]:
# Load the loan data and print a structural and statistical overview.
loans = pd.read_csv('loan_data.csv')
loans.info()
# describe() is not the last expression of the cell, so it must be displayed
# explicitly or its summary table is silently discarded.
display(loans.describe())

# FICO distribution split by the 'credit.policy' flag.
# A fresh figure is opened first so these histograms do not share axes with
# any other plot drawn in the same cell.
plt.figure(figsize=(10, 6))
loans[loans['credit.policy'] == 1]['fico'].hist(bins=40, color='blue', alpha=0.6, label='Credit Policy = 1')
loans[loans['credit.policy'] == 0]['fico'].hist(bins=40, color='red', alpha=0.6, label='Credit Policy = 0')
plt.legend()
plt.xlabel('FICO')

# FICO distribution split by the 'not.fully.paid' flag, again on its own figure.
plt.figure(figsize=(10, 6))
loans[loans['not.fully.paid'] == 1]['fico'].hist(bins=30, color='blue', alpha=0.5, label='Not fully paid = 1')
loans[loans['not.fully.paid'] == 0]['fico'].hist(bins=30, color='red', alpha=0.5, label='Not fully paid =0')
plt.legend()
plt.xlabel('FICO')

# Number of loans per purpose, with bars split by the 'not.fully.paid' flag.
plt.figure(figsize=(10, 6))
sns.countplot(x='purpose', data=loans, hue='not.fully.paid', palette='Set1')
plt.tight_layout()

# Relationship between FICO score and interest rate: first overall, then as
# regression fits coloured by 'credit.policy' with one panel per
# 'not.fully.paid' value. Both seaborn calls create their own figures.
sns.jointplot(x='fico', y='int.rate', data=loans)
sns.lmplot(x='fico', y='int.rate', data=loans, hue='credit.policy', col='not.fully.paid', palette='viridis')

## Categorical Features

Notice that the **purpose** column is categorical.

That means we need to transform it into dummy variables so sklearn will be able to understand it. Let's do this in one clean step using `pd.get_dummies`.

Let's show you a way of dealing with these columns that can be expanded to multiple categorical features if necessary.

In [None]:
# Columns to encode as dummy variables; add more names here to encode
# additional categorical features in the same call.
cat_feats = ['purpose']

# Replace every column listed above with indicator columns. drop_first=True
# omits the first level of each feature, leaving k-1 indicators for k levels.
final_data = pd.get_dummies(
    data=loans,
    columns=cat_feats,
    drop_first=True,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# One seed for the split and for both estimators, so that re-running the
# notebook reproduces the same metrics. The value matches the seed the
# train/test split already used.
RANDOM_STATE = 101


def _print_metrics(y_true, y_pred):
    """Print the classification report, a blank separator, and the confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by a fitted classifier.
    """
    print(classification_report(y_true, y_pred))
    print('\n')
    print(confusion_matrix(y_true, y_pred))


# Features are every column except the target 'not.fully.paid'.
X = final_data.drop('not.fully.paid', axis=1)
y = final_data['not.fully.paid']

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

# --- Single decision tree ---------------------------------------------------
# NOTE(review): with min_samples_leaf=8 a node needs at least 16 samples to be
# split, so min_samples_split=5 looks redundant; kept as-is to preserve the
# original configuration.
dtree = DecisionTreeClassifier(
    min_samples_split=5, min_samples_leaf=8, random_state=RANDOM_STATE
)
dtree.fit(X_train, y_train)
pred = dtree.predict(X_test)
_print_metrics(y_test, pred)

# --- Random forest ----------------------------------------------------------
rfc = RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE)
rfc.fit(X_train, y_train)
predictions = rfc.predict(X_test)
_print_metrics(y_test, predictions)