# This is a classification problem.
# CatBoost Algorithm will be used.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
from sklearn.model_selection import train_test_split, cross_val_score
from catboost import CatBoostClassifier

In [2]:
# Load datasets. "train.csv" is the training data. "test.csv" will be used for predictions.
train_raw = pd.read_csv("train.csv") # Training data
pred_raw = pd.read_csv("test.csv") # Prediction data

In [3]:
print("column names of train set")
print(list(train_raw))
print()
print("column names of prediction set")
print(list(pred_raw))

column names of train set
['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status']

column names of prediction set
['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area']


The first column "Loan_ID" will not be used in training.

In [4]:
train_raw[:10]

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [5]:
pred_raw[:10]

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban
5,LP001054,Male,Yes,0,Not Graduate,Yes,2165,3422,152.0,360.0,1.0,Urban
6,LP001055,Female,No,1,Not Graduate,No,2226,0,59.0,360.0,1.0,Semiurban
7,LP001056,Male,Yes,2,Not Graduate,No,3881,0,147.0,360.0,0.0,Rural
8,LP001059,Male,Yes,2,Graduate,,13633,0,280.0,240.0,1.0,Urban
9,LP001067,Male,No,0,Not Graduate,No,2400,2400,123.0,360.0,1.0,Semiurban


In [6]:
train_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
Loan_ID              614 non-null object
Gender               601 non-null object
Married              611 non-null object
Dependents           599 non-null object
Education            614 non-null object
Self_Employed        582 non-null object
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           592 non-null float64
Loan_Amount_Term     600 non-null float64
Credit_History       564 non-null float64
Property_Area        614 non-null object
Loan_Status          614 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 62.4+ KB


Some features have numeric values and some features have object values.

In [7]:
# Replace object values with numeric values.
# Replace np.nan with -999.
pd.DataFrame.replace(train_raw["Gender"], to_replace = [np.nan, "Male", "Female"], value=[-999, 1, 2], inplace=True)
pd.DataFrame.replace(train_raw["Married"], to_replace = [np.nan, "Yes", "No"], value=[-999, 1, 0], inplace=True)
pd.DataFrame.replace(train_raw["Dependents"], to_replace = [np.nan, "3+"], value=[-999, 3], inplace=True)
pd.DataFrame.replace(train_raw["Education"], to_replace = ["Graduate", "Not Graduate"], value=[1, 0], inplace=True)
pd.DataFrame.replace(train_raw["Self_Employed"], to_replace = [np.nan, "Yes", "No"], value=[-999, 1, 0], inplace=True)
pd.DataFrame.replace(train_raw["LoanAmount"], to_replace = [np.nan], value=[-999], inplace=True)
pd.DataFrame.replace(train_raw["Loan_Amount_Term"], to_replace = [np.nan], value=[-999], inplace=True)
pd.DataFrame.replace(train_raw["Credit_History"], to_replace = [np.nan], value=[-999], inplace=True)
pd.DataFrame.replace(train_raw["Property_Area"], to_replace = ["Rural", "Semiurban", "Urban"], value=[1, 2, 3], inplace=True)
pd.DataFrame.replace(train_raw["Loan_Status"], to_replace = ["Y", "N"], value=[1,0], inplace=True)

pd.DataFrame.replace(pred_raw["Gender"], to_replace = [np.nan, "Male", "Female"], value=[-999, 1, 2], inplace=True)
pd.DataFrame.replace(pred_raw["Married"], to_replace = [np.nan, "Yes", "No"], value=[-999, 1, 0], inplace=True)
pd.DataFrame.replace(pred_raw["Dependents"], to_replace = [np.nan, "3+"], value=[-999, 3], inplace=True)
pd.DataFrame.replace(pred_raw["Education"], to_replace = ["Graduate", "Not Graduate"], value=[1, 0], inplace=True)
pd.DataFrame.replace(pred_raw["Self_Employed"], to_replace = [np.nan, "Yes", "No"], value=[-999, 1, 0], inplace=True)
pd.DataFrame.replace(pred_raw["LoanAmount"], to_replace = [np.nan], value=[-999], inplace=True)
pd.DataFrame.replace(pred_raw["Loan_Amount_Term"], to_replace = [np.nan], value=[-999], inplace=True)
pd.DataFrame.replace(pred_raw["Credit_History"], to_replace = [np.nan], value=[-999], inplace=True)
pd.DataFrame.replace(pred_raw["Property_Area"], to_replace = ["Rural", "Semiurban", "Urban"], value=[1, 2, 3], inplace=True)

In [8]:
# Generate training datasets
train_array = train_raw.values
X = train_array[:, 1:-1].astype(float)
y = train_array[:, -1].astype(float)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1234)

# Generate the set X_pred for prediction
pred_array = pred_raw.values
X_pred = pred_array[:, 1:12].astype(float)
# Array of Load_ID
X_pred_id = pred_array[:, 0]

In [9]:
# Train CatBoost classification algorithm

parameters = {"iterations":200, 
              "learning_rate": 0.02, 
              "depth": 6, 
              "l2_leaf_reg":3}

cb = CatBoostClassifier(**parameters)
cb.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x7fbf5e48feb8>

In [10]:
# Compare the accuracy scores of training and testing sets.
print("Accuracy score (training data): {0:.3f}".format(cb.score(X_train, y_train)))
print("Accuracy score (testing data): {0:.3f}".format(cb.score(X_test, y_test)))
print()
print("Cross validation score: {0:.3f}".format(round(np.mean(cross_val_score(cb, X_test, y_test, scoring="accuracy", cv=10)), 4)))

Accuracy score (training data): 0.846
Accuracy score (testing data): 0.831

Cross validation score: 0.803


In [11]:
# Make predictions on prediction data
y_pred = cb.predict(X_pred)

In [12]:
print(y_pred)

[ 1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  0.  1.  1.  0.  1.  1.  1.  1.  0.  1.  1.  0.  0.  1.  0.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  0.  1.  0.  1.  0.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  0.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.  0.  0.  1.  1.  1.  0.  0.  1.
  0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.  1.  0.  1.
  1.  1.  1.  0.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  1.  0.
  1.  1.  1.  0.  0.  1.  0.  1.  1.  1.  1.  0.  0.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.  0.  1.  1.  0.  1.
  0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  0.  0.  1.  1.  1.
  1.  0.  0.  1.  1.  0.  1.  0.  1.  0.  1.  0.  1

In [14]:
#Join "Loan_ID" to the prediction result
y_pred = y_pred.reshape((1, X_pred.shape[0]))[0]
Sample_Submission_df = pd.DataFrame({"Loan_ID": X_pred_id, "Loan_Status": y_pred})
pd.DataFrame.replace(Sample_Submission_df, to_replace=[1.0,0.0],value=["Y", "N"], inplace=True)
Sample_Submission = Sample_Submission_df.set_index(["Loan_ID"])
print(Sample_Submission_df.groupby("Loan_Status").count())

             Loan_ID
Loan_Status         
N                 62
Y                305


In [15]:
# Output prediction to csv file
Sample_Submission.to_csv("Sample_Submission.csv")