In [128]:
#Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix 

In [129]:
dataset = pd.read_csv('loan_data_set.csv')

In [None]:
print(dataset['Loan_Status'].value_counts())
dataset.describe()

In [None]:
sns.countplot(x = 'Loan_Status', data=dataset, palette='hls')
plt.show()

In [None]:
print(dataset.columns[dataset.isnull().any()].tolist())
missing_values = dataset.isnull()
missing_values

In [None]:
sns.heatmap(data = missing_values, yticklabels=False, cbar=False, cmap='viridis') #Heatmap of missing data values

In [None]:
sns.countplot(x='Loan_Status', data=dataset, hue='Education') #comparing those who had the loan and those who didint based on some features

In [127]:
import pandas as pd
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier


dataset_1 = pd.read_csv('loan_data_set.csv')
dataset_1 = dataset_1.drop('Loan_ID', axis=1)


X = dataset_1.drop('Loan_Status', axis=1)
y = dataset_1['Loan_Status']


x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=7)


numeric_features = dataset_1.select_dtypes(include=['int64', 'float64']).columns
categorical_features = dataset_1.select_dtypes(include=['object']).drop(['Loan_Status'], axis=1).columns


numeric_features = ['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term']
numeric_features = dataset_1.select_dtypes(include=['int64', 'float64']).columns
numeric_features_steps = [('imputer', SimpleImputer(strategy='median')),('scaler', MinMaxScaler())]
numeric_transformer = Pipeline(steps=numeric_features_steps)



categorical_features = ['Gender','Married','Dependents','Education','Self_Employed','Property_Area','Loan_Status']
categorical_features = dataset_1.select_dtypes(include=['object']).drop(['Loan_Status'], axis=1).columns
categorical_features_steps = [('imputer', SimpleImputer(strategy='constant', fill_value='missing')),('onehot', OneHotEncoder())]
categorical_transformer = Pipeline(steps=categorical_features_steps)



preprocessor = ColumnTransformer(
    remainder = 'passthrough',
    transformers=[
    ('numeric', numeric_transformer, numeric_features),
    ('categorical', categorical_transformer, categorical_features)
])

classifiers = {
    'K-Nearnest Neighbour': KNeighborsClassifier(9),
    'Logistic Regression(solver=liblinear)': LogisticRegression(solver='liblinear'),
    'Support Vector Machine(gamma=auto, kernel=rbf)': SVC(gamma='auto', kernel='rbf'),
    'Support Vector Machine(kernel="rbf", C=0.025, probability=True)': SVC(gamma='auto', kernel="rbf", C=0.025, probability=True),
    'Nu Support Vector Machine(probability=True)': NuSVC(gamma='auto', probability=True),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'Random Forest Classifier': RandomForestClassifier(n_estimators=100),
    'AdaBoost Classifier': AdaBoostClassifier(),
    'Gradient Boosting Classifier': GradientBoostingClassifier()
}


# for name, classifier in classifiers.items():
#     pipe = Pipeline(steps=[('preprocessor', preprocessor),('classifier', classifier)])
#     pipe.fit(x_train, y_train)
#     y_pred = pipe.predict(x_test)
#     print("Classifier: ", name)
#     print("Accuracy: %.4f" % pipe.score(x_test, y_test))


pred_models = []

for name, classifier in classifiers.items():
    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', classifier)])
    
    pipe.fit(x_train, y_train)
    pred_models.append(pipe)
    
y_pred = pred_models[1].predict(x_test)
# print("Accuracy: %.4f" % pipe.score(x_test, y_test))



dataset_2 = pd.read_csv('loan_data_set.csv')
d = preprocessor.fit_transform(dataset_2)
d = pd.DataFrame(data=d[1:,1:],index=d[1:,0],columns=d[0,1:])
# print(d)


s = d.iloc[:, -2:-1]
s = s.iloc[613:]

print(s)


Empty DataFrame
Columns: [LP001002]
Index: []


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier

var_mod = ['Gender','Married','Dependents','Education','Self_Employed','Property_Area','Loan_Status']
le = LabelEncoder()
for i in var_mod:
    dataset[i] = le.fit_transform(dataset[i].astype(str))

X = pd.DataFrame(dataset.iloc[:, 1:-1])
y = pd.DataFrame(dataset.iloc[:,-1]).values.ravel()


imputer = SimpleImputer(strategy="mean")
imputer = imputer.fit(X)
X = imputer.transform(X)


x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=7)

In [None]:
logistic_reg_model = LogisticRegression(solver='liblinear')
logistic_reg_model.fit(x_train, y_train)
y_pred = logistic_reg_model.predict(x_test)
# y_single = logistic_reg_model.predict(x_test[0].reshape(1, -1))

In [None]:
cnf_matrix = confusion_matrix(y_test, y_pred)

In [None]:
class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

In [None]:
print("Accuracy:",accuracy_score(y_test, y_pred))
print("Precision:",precision_score(y_test, y_pred))
print("Recall:",recall_score(y_test, y_pred))

In [None]:
model_decision_tree = DecisionTreeClassifier()
model_decision_tree.fit(x_train,y_train)
predictions = model_decision_tree.predict(x_test)
print(accuracy_score(y_test, predictions))

In [None]:
model = RandomForestClassifier(n_estimators=100)
model.fit(x_train,y_train)
predictions = model.predict(x_test)
print(accuracy_score(y_test, predictions))

In [None]:
model = KNeighborsClassifier(n_neighbors=9)
model.fit(x_train,y_train)
predictions = model.predict(x_test)
print(accuracy_score(y_test, predictions))

In [None]:
model = SVC(gamma='scale', kernel='rbf')
model.fit(x_train,y_train)
predictions = model.predict(x_test)
print(accuracy_score(y_test, predictions))

In [None]:
dataset_test = dataset

features = ['LP001486', 'Male','Yes',1,'Not Graduate','No',4583,1508,128,360,1,'Rural','N']

new_customer = pd.DataFrame({
     'Loan_ID': [features[0]],
     'Gender': [features[1]],
     'Married': [features[2]],
     'Dependents': [features[3]],
     'Education': [features[4]],
     'Self_Employed': [features[5]],
     'ApplicantIncome': [features[6]],
     'CoapplicantIncome': [features[7]],
     'LoanAmount': [features[8]],
     'Loan_Amount_Term': [features[9]],
     'Credit_History': [features[10]],
     'Property_Area':[features[11]],
     'Loan_Status': [features[12]],
})

dataset_test = dataset_test.append(new_customer)

var_mod = ['Gender','Married','Dependents','Education','Self_Employed','Property_Area','Loan_Status']
le = LabelEncoder()
for i in var_mod:
    dataset_test[i] = le.fit_transform(dataset_test[i].astype(str))

    
user = dataset_test[-1:]
user = pd.DataFrame(user.iloc[:, 1:-1])
user.values

y_single = model_decision_tree.predict(user.values)
print(y_single[0])

In [None]:
def clean_input(features = ['LP001486', 'Male', 'Yes', 1, 'Not Graduate', 'No', 4583, 1508, 128, 360, 1, 'Rural', 'N']):
    dataset_test = pd.read_csv('loan_data_set.csv')


    # create a new DataFrame from user input
    new_customer = pd.DataFrame({
        'Loan_ID': [features[0]],
        'Gender': [features[1]],
        'Married': [features[2]],
        'Dependents': [features[3]],
        'Education': [features[4]],
        'Self_Employed': [features[5]],
        'ApplicantIncome': [features[6]],
        'CoapplicantIncome': [features[7]],
        'LoanAmount': [features[8]],
        'Loan_Amount_Term': [features[9]],
        'Credit_History': [features[10]],
        'Property_Area': [features[11]],
        'Loan_Status': [features[12]],
    })

    dataset_test = dataset_test.append(new_customer)


    cus = preprocessor.fit_transform(dataset_test)
    
    print(cus)


feature = clean_input()
feature
y_pred = pred_models[1].predict(feature)
# y = pred_models[1].predict(feature)
# y[0]