diff --git a/exercises/exercise 1 - loan prediction problem/solutions/answers_frank_van_bakel.ipynb b/exercises/exercise 1 - loan prediction problem/solutions/answers_frank_van_bakel.ipynb
new file mode 100644
index 0000000..f451571
--- /dev/null
+++ b/exercises/exercise 1 - loan prediction problem/solutions/answers_frank_van_bakel.ipynb
@@ -0,0 +1,3572 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "%matplotlib inline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Read the training set and the test set from the shared data folder.\n",
+ "train_df, test_df = (\n",
+ "    pd.read_csv(csv_path)\n",
+ "    for csv_path in ('../data/train.csv', '../data/test.csv')\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Explore the data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Loan_ID \n",
+ " Gender \n",
+ " Married \n",
+ " Dependents \n",
+ " Education \n",
+ " Self_Employed \n",
+ " ApplicantIncome \n",
+ " CoapplicantIncome \n",
+ " LoanAmount \n",
+ " Loan_Amount_Term \n",
+ " Credit_History \n",
+ " Property_Area \n",
+ " Loan_Status \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " LP001002 \n",
+ " Male \n",
+ " No \n",
+ " 0 \n",
+ " Graduate \n",
+ " No \n",
+ " 5849 \n",
+ " 0.0 \n",
+ " NaN \n",
+ " 360.0 \n",
+ " 1.0 \n",
+ " Urban \n",
+ " Y \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " LP001003 \n",
+ " Male \n",
+ " Yes \n",
+ " 1 \n",
+ " Graduate \n",
+ " No \n",
+ " 4583 \n",
+ " 1508.0 \n",
+ " 128.0 \n",
+ " 360.0 \n",
+ " 1.0 \n",
+ " Rural \n",
+ " N \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " LP001005 \n",
+ " Male \n",
+ " Yes \n",
+ " 0 \n",
+ " Graduate \n",
+ " Yes \n",
+ " 3000 \n",
+ " 0.0 \n",
+ " 66.0 \n",
+ " 360.0 \n",
+ " 1.0 \n",
+ " Urban \n",
+ " Y \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " LP001006 \n",
+ " Male \n",
+ " Yes \n",
+ " 0 \n",
+ " Not Graduate \n",
+ " No \n",
+ " 2583 \n",
+ " 2358.0 \n",
+ " 120.0 \n",
+ " 360.0 \n",
+ " 1.0 \n",
+ " Urban \n",
+ " Y \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " LP001008 \n",
+ " Male \n",
+ " No \n",
+ " 0 \n",
+ " Graduate \n",
+ " No \n",
+ " 6000 \n",
+ " 0.0 \n",
+ " 141.0 \n",
+ " 360.0 \n",
+ " 1.0 \n",
+ " Urban \n",
+ " Y \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Loan_ID Gender Married Dependents Education Self_Employed \\\n",
+ "0 LP001002 Male No 0 Graduate No \n",
+ "1 LP001003 Male Yes 1 Graduate No \n",
+ "2 LP001005 Male Yes 0 Graduate Yes \n",
+ "3 LP001006 Male Yes 0 Not Graduate No \n",
+ "4 LP001008 Male No 0 Graduate No \n",
+ "\n",
+ " ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term \\\n",
+ "0 5849 0.0 NaN 360.0 \n",
+ "1 4583 1508.0 128.0 360.0 \n",
+ "2 3000 0.0 66.0 360.0 \n",
+ "3 2583 2358.0 120.0 360.0 \n",
+ "4 6000 0.0 141.0 360.0 \n",
+ "\n",
+ " Credit_History Property_Area Loan_Status \n",
+ "0 1.0 Urban Y \n",
+ "1 1.0 Rural N \n",
+ "2 1.0 Urban Y \n",
+ "3 1.0 Urban Y \n",
+ "4 1.0 Urban Y "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Preview the first five training rows.\n",
+ "train_df.head(n=5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count \n",
+ " mean \n",
+ " std \n",
+ " min \n",
+ " 25% \n",
+ " 50% \n",
+ " 75% \n",
+ " max \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " ApplicantIncome \n",
+ " 614.0 \n",
+ " 5403.459283 \n",
+ " 6109.041673 \n",
+ " 150.0 \n",
+ " 2877.5 \n",
+ " 3812.5 \n",
+ " 5795.00 \n",
+ " 81000.0 \n",
+ " \n",
+ " \n",
+ " CoapplicantIncome \n",
+ " 614.0 \n",
+ " 1621.245798 \n",
+ " 2926.248369 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 1188.5 \n",
+ " 2297.25 \n",
+ " 41667.0 \n",
+ " \n",
+ " \n",
+ " LoanAmount \n",
+ " 592.0 \n",
+ " 146.412162 \n",
+ " 85.587325 \n",
+ " 9.0 \n",
+ " 100.0 \n",
+ " 128.0 \n",
+ " 168.00 \n",
+ " 700.0 \n",
+ " \n",
+ " \n",
+ " Loan_Amount_Term \n",
+ " 600.0 \n",
+ " 342.000000 \n",
+ " 65.120410 \n",
+ " 12.0 \n",
+ " 360.0 \n",
+ " 360.0 \n",
+ " 360.00 \n",
+ " 480.0 \n",
+ " \n",
+ " \n",
+ " Credit_History \n",
+ " 564.0 \n",
+ " 0.842199 \n",
+ " 0.364878 \n",
+ " 0.0 \n",
+ " 1.0 \n",
+ " 1.0 \n",
+ " 1.00 \n",
+ " 1.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " count mean std min 25% 50% \\\n",
+ "ApplicantIncome 614.0 5403.459283 6109.041673 150.0 2877.5 3812.5 \n",
+ "CoapplicantIncome 614.0 1621.245798 2926.248369 0.0 0.0 1188.5 \n",
+ "LoanAmount 592.0 146.412162 85.587325 9.0 100.0 128.0 \n",
+ "Loan_Amount_Term 600.0 342.000000 65.120410 12.0 360.0 360.0 \n",
+ "Credit_History 564.0 0.842199 0.364878 0.0 1.0 1.0 \n",
+ "\n",
+ " 75% max \n",
+ "ApplicantIncome 5795.00 81000.0 \n",
+ "CoapplicantIncome 2297.25 41667.0 \n",
+ "LoanAmount 168.00 700.0 \n",
+ "Loan_Amount_Term 360.00 480.0 \n",
+ "Credit_History 1.00 1.0 "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Summary statistics of the numeric columns, one row per column.\n",
+ "train_df.describe().T"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Findings:**\n",
+ "- Credit_History is categorical data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[,\n",
+ " ],\n",
+ " [,\n",
+ " ],\n",
+ " [,\n",
+ " ]],\n",
+ " dtype=object)"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# One histogram per numeric column; the trailing ';' keeps the array-of-Axes\n",
+ "# repr out of the cell output so only the figure is shown.\n",
+ "train_df.hist(figsize=(10,10));"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Findings:**\n",
+ "- Credit_History is categorical data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Box plot of every numeric column on one shared axis; the trailing ';'\n",
+ "# suppresses the Axes repr so only the figure is shown.\n",
+ "train_df.boxplot(figsize=(10,6));"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "ApplicantIncome and CoapplicantIncome have huge outliers. At this scale we cannot see what is going on for LoanAmount and Loan_Amount_Term, so let's make a separate boxplot for these two."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Separate box plot for the two loan columns, which are unreadable on the\n",
+ "# income scale; the trailing ';' suppresses the Axes repr.\n",
+ "train_df.boxplot(column=['LoanAmount','Loan_Amount_Term'],figsize=(10,6));"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Findings:**\n",
+ "- Loan_Amount_Term has one most common value\n",
+ "- Both features have outliers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 614 entries, 0 to 613\n",
+ "Data columns (total 13 columns):\n",
+ "Loan_ID 614 non-null object\n",
+ "Gender 601 non-null object\n",
+ "Married 611 non-null object\n",
+ "Dependents 599 non-null object\n",
+ "Education 614 non-null object\n",
+ "Self_Employed 582 non-null object\n",
+ "ApplicantIncome 614 non-null int64\n",
+ "CoapplicantIncome 614 non-null float64\n",
+ "LoanAmount 592 non-null float64\n",
+ "Loan_Amount_Term 600 non-null float64\n",
+ "Credit_History 564 non-null float64\n",
+ "Property_Area 614 non-null object\n",
+ "Loan_Status 614 non-null object\n",
+ "dtypes: float64(4), int64(1), object(8)\n",
+ "memory usage: 62.4+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "train_df.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Loan_ID \n",
+ " Gender \n",
+ " Married \n",
+ " Dependents \n",
+ " Education \n",
+ " Self_Employed \n",
+ " ApplicantIncome \n",
+ " CoapplicantIncome \n",
+ " LoanAmount \n",
+ " Loan_Amount_Term \n",
+ " Credit_History \n",
+ " Property_Area \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " LP001015 \n",
+ " Male \n",
+ " Yes \n",
+ " 0 \n",
+ " Graduate \n",
+ " No \n",
+ " 5720 \n",
+ " 0 \n",
+ " 110.0 \n",
+ " 360.0 \n",
+ " 1.0 \n",
+ " Urban \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " LP001022 \n",
+ " Male \n",
+ " Yes \n",
+ " 1 \n",
+ " Graduate \n",
+ " No \n",
+ " 3076 \n",
+ " 1500 \n",
+ " 126.0 \n",
+ " 360.0 \n",
+ " 1.0 \n",
+ " Urban \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " LP001031 \n",
+ " Male \n",
+ " Yes \n",
+ " 2 \n",
+ " Graduate \n",
+ " No \n",
+ " 5000 \n",
+ " 1800 \n",
+ " 208.0 \n",
+ " 360.0 \n",
+ " 1.0 \n",
+ " Urban \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " LP001035 \n",
+ " Male \n",
+ " Yes \n",
+ " 2 \n",
+ " Graduate \n",
+ " No \n",
+ " 2340 \n",
+ " 2546 \n",
+ " 100.0 \n",
+ " 360.0 \n",
+ " NaN \n",
+ " Urban \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " LP001051 \n",
+ " Male \n",
+ " No \n",
+ " 0 \n",
+ " Not Graduate \n",
+ " No \n",
+ " 3276 \n",
+ " 0 \n",
+ " 78.0 \n",
+ " 360.0 \n",
+ " 1.0 \n",
+ " Urban \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Loan_ID Gender Married Dependents Education Self_Employed \\\n",
+ "0 LP001015 Male Yes 0 Graduate No \n",
+ "1 LP001022 Male Yes 1 Graduate No \n",
+ "2 LP001031 Male Yes 2 Graduate No \n",
+ "3 LP001035 Male Yes 2 Graduate No \n",
+ "4 LP001051 Male No 0 Not Graduate No \n",
+ "\n",
+ " ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term \\\n",
+ "0 5720 0 110.0 360.0 \n",
+ "1 3076 1500 126.0 360.0 \n",
+ "2 5000 1800 208.0 360.0 \n",
+ "3 2340 2546 100.0 360.0 \n",
+ "4 3276 0 78.0 360.0 \n",
+ "\n",
+ " Credit_History Property_Area \n",
+ "0 1.0 Urban \n",
+ "1 1.0 Urban \n",
+ "2 1.0 Urban \n",
+ "3 NaN Urban \n",
+ "4 1.0 Urban "
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Preview the first five test rows.\n",
+ "test_df.head(n=5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The test data does not include the target value, and there is no separate file with the Loan_Status of the test data."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# What features are categorical?\n",
+ "Get the features with a low number of unique values; these are probably categorical data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Column \n",
+ " Nr of unique values \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " Gender \n",
+ " 3 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " Married \n",
+ " 3 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " Dependents \n",
+ " 5 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " Education \n",
+ " 2 \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " Self_Employed \n",
+ " 3 \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " Loan_Amount_Term \n",
+ " 11 \n",
+ " \n",
+ " \n",
+ " 10 \n",
+ " Credit_History \n",
+ " 3 \n",
+ " \n",
+ " \n",
+ " 11 \n",
+ " Property_Area \n",
+ " 3 \n",
+ " \n",
+ " \n",
+ " 12 \n",
+ " Loan_Status \n",
+ " 2 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Column Nr of unique values\n",
+ "1 Gender 3\n",
+ "2 Married 3\n",
+ "3 Dependents 5\n",
+ "4 Education 2\n",
+ "5 Self_Employed 3\n",
+ "9 Loan_Amount_Term 11\n",
+ "10 Credit_History 3\n",
+ "11 Property_Area 3\n",
+ "12 Loan_Status 2"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Number of distinct values per column. dropna=False counts NaN as a value\n",
+ "# of its own, which matches len(Series.unique()).\n",
+ "unique_values_df = train_df.nunique(dropna=False).reset_index()\n",
+ "unique_values_df.columns=['Column','Nr of unique values']\n",
+ "\n",
+ "# Columns with fewer than 100 distinct values are candidate categorical features.\n",
+ "unique_values_df[unique_values_df['Nr of unique values']<100]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Unique values for feature: Gender\n",
+ "Male 489\n",
+ "Female 112\n",
+ "NaN 13\n",
+ "Name: Gender, dtype: int64\n",
+ "\n",
+ "\n",
+ "Unique values for feature: Married\n",
+ "Yes 398\n",
+ "No 213\n",
+ "NaN 3\n",
+ "Name: Married, dtype: int64\n",
+ "\n",
+ "\n",
+ "Unique values for feature: Dependents\n",
+ "0 345\n",
+ "1 102\n",
+ "2 101\n",
+ "3+ 51\n",
+ "NaN 15\n",
+ "Name: Dependents, dtype: int64\n",
+ "\n",
+ "\n",
+ "Unique values for feature: Education\n",
+ "Graduate 480\n",
+ "Not Graduate 134\n",
+ "Name: Education, dtype: int64\n",
+ "\n",
+ "\n",
+ "Unique values for feature: Self_Employed\n",
+ "No 500\n",
+ "Yes 82\n",
+ "NaN 32\n",
+ "Name: Self_Employed, dtype: int64\n",
+ "\n",
+ "\n",
+ "Unique values for feature: Loan_Amount_Term\n",
+ " 360.0 512\n",
+ " 180.0 44\n",
+ " 480.0 15\n",
+ "NaN 14\n",
+ " 300.0 13\n",
+ " 84.0 4\n",
+ " 240.0 4\n",
+ " 120.0 3\n",
+ " 36.0 2\n",
+ " 60.0 2\n",
+ " 12.0 1\n",
+ "Name: Loan_Amount_Term, dtype: int64\n",
+ "\n",
+ "\n",
+ "Unique values for feature: Credit_History\n",
+ " 1.0 475\n",
+ " 0.0 89\n",
+ "NaN 50\n",
+ "Name: Credit_History, dtype: int64\n",
+ "\n",
+ "\n",
+ "Unique values for feature: Property_Area\n",
+ "Semiurban 233\n",
+ "Urban 202\n",
+ "Rural 179\n",
+ "Name: Property_Area, dtype: int64\n",
+ "\n",
+ "\n",
+ "Unique values for feature: Loan_Status\n",
+ "Y 422\n",
+ "N 192\n",
+ "Name: Loan_Status, dtype: int64\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Show the value distribution (NaN included) of every low-cardinality column.\n",
+ "is_low_cardinality = unique_values_df['Nr of unique values'] < 100\n",
+ "for feature in unique_values_df.loc[is_low_cardinality, 'Column']:\n",
+ "    feature_counts = train_df[feature].value_counts(dropna=False)\n",
+ "    print ('Unique values for feature: ', feature)\n",
+ "    print (feature_counts)\n",
+ "    print ('\\n')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## What if we drop all Na values?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.2182410423452769"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Fraction of training rows that contain at least one NaN.\n",
+ "data = train_df.dropna()\n",
+ "n_train_rows = len(train_df)\n",
+ "(n_train_rows - len(data)) / n_train_rows"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "21.8 percent of all data would be dropped. That is a lot, so dropping all the rows with NaN values would have a significant impact."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.2125340599455041"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Fraction of test rows that contain at least one NaN.\n",
+ "data = test_df.dropna()\n",
+ "n_test_rows = len(test_df)\n",
+ "(n_test_rows - len(data)) / n_test_rows"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A similar fraction of rows contains NaN values in the test data."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Missing values\n",
+ "Above already indicates that there are missing values. Which features have missing values and what fraction is missing?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " feature \n",
+ " missing_fraction_train \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " Loan_ID \n",
+ " 0.000000 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " Gender \n",
+ " 0.021173 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " Married \n",
+ " 0.004886 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " Dependents \n",
+ " 0.024430 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " Education \n",
+ " 0.000000 \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " Self_Employed \n",
+ " 0.052117 \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " ApplicantIncome \n",
+ " 0.000000 \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " CoapplicantIncome \n",
+ " 0.000000 \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " LoanAmount \n",
+ " 0.035831 \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " Loan_Amount_Term \n",
+ " 0.022801 \n",
+ " \n",
+ " \n",
+ " 10 \n",
+ " Credit_History \n",
+ " 0.081433 \n",
+ " \n",
+ " \n",
+ " 11 \n",
+ " Property_Area \n",
+ " 0.000000 \n",
+ " \n",
+ " \n",
+ " 12 \n",
+ " Loan_Status \n",
+ " 0.000000 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " feature missing_fraction_train\n",
+ "0 Loan_ID 0.000000\n",
+ "1 Gender 0.021173\n",
+ "2 Married 0.004886\n",
+ "3 Dependents 0.024430\n",
+ "4 Education 0.000000\n",
+ "5 Self_Employed 0.052117\n",
+ "6 ApplicantIncome 0.000000\n",
+ "7 CoapplicantIncome 0.000000\n",
+ "8 LoanAmount 0.035831\n",
+ "9 Loan_Amount_Term 0.022801\n",
+ "10 Credit_History 0.081433\n",
+ "11 Property_Area 0.000000\n",
+ "12 Loan_Status 0.000000"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Fraction of missing values per training column, as a two-column table.\n",
+ "missing_series = train_df.isna().sum().div(len(train_df))\n",
+ "missing_stats = (\n",
+ "    missing_series\n",
+ "    .rename_axis('feature')\n",
+ "    .reset_index(name='missing_fraction_train')\n",
+ ")\n",
+ "missing_stats"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Is the test data similar? Does the test data have the same missing values?\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " feature \n",
+ " missing_fraction_train \n",
+ " missing_fraction_test \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " Loan_ID \n",
+ " 0.000000 \n",
+ " 0.000000 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " Gender \n",
+ " 0.021173 \n",
+ " 0.029973 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " Married \n",
+ " 0.004886 \n",
+ " 0.000000 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " Dependents \n",
+ " 0.024430 \n",
+ " 0.027248 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " Education \n",
+ " 0.000000 \n",
+ " 0.000000 \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " Self_Employed \n",
+ " 0.052117 \n",
+ " 0.062670 \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " ApplicantIncome \n",
+ " 0.000000 \n",
+ " 0.000000 \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " CoapplicantIncome \n",
+ " 0.000000 \n",
+ " 0.000000 \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " LoanAmount \n",
+ " 0.035831 \n",
+ " 0.013624 \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " Loan_Amount_Term \n",
+ " 0.022801 \n",
+ " 0.016349 \n",
+ " \n",
+ " \n",
+ " 10 \n",
+ " Credit_History \n",
+ " 0.081433 \n",
+ " 0.079019 \n",
+ " \n",
+ " \n",
+ " 11 \n",
+ " Property_Area \n",
+ " 0.000000 \n",
+ " 0.000000 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " feature missing_fraction_train missing_fraction_test\n",
+ "0 Loan_ID 0.000000 0.000000\n",
+ "1 Gender 0.021173 0.029973\n",
+ "2 Married 0.004886 0.000000\n",
+ "3 Dependents 0.024430 0.027248\n",
+ "4 Education 0.000000 0.000000\n",
+ "5 Self_Employed 0.052117 0.062670\n",
+ "6 ApplicantIncome 0.000000 0.000000\n",
+ "7 CoapplicantIncome 0.000000 0.000000\n",
+ "8 LoanAmount 0.035831 0.013624\n",
+ "9 Loan_Amount_Term 0.022801 0.016349\n",
+ "10 Credit_History 0.081433 0.079019\n",
+ "11 Property_Area 0.000000 0.000000"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Same statistic for the test set, joined to the training fractions per feature.\n",
+ "# Loan_Status only exists in the training data, so the inner join drops it.\n",
+ "missing_series = test_df.isna().sum().div(len(test_df))\n",
+ "missing_stats_test = (\n",
+ "    missing_series\n",
+ "    .rename_axis('feature')\n",
+ "    .reset_index(name='missing_fraction_test')\n",
+ ")\n",
+ "missing_stats_combined = pd.merge(missing_stats, missing_stats_test, on='feature')\n",
+ "missing_stats_combined"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Findings:**\n",
+ "- None of the columns has more than 60% missing values.\n",
+ "- Married has no missing values in the test data, but is missing values in the train_data\n",
+ "- Credit History has the largest number of missing values."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## What to do with the missing values?\n",
+ "\n",
+ "Missing values per column, what to do?\n",
+ "Categorical data with missing values:\n",
+ "- Gender: string data\n",
+ "- Married: This is not missing in the test data\n",
+ "- Dependents: string data (values 0, 1, 2 and 3+)\n",
+ "- Self_Employed: string data\n",
+ "- Credit_History: large amount of data is missing\n",
+ "\n",
+ "Options for categorical data:\n",
+ "1. Drop the NaN rows\n",
+ "2. Ignore the NaN category\n",
+ "3. Make a separate category for the NaN data\n",
+ "4. Fill with a learning or other mechanism\n",
+ "\n",
+ "Number data with NaN values:\n",
+ "- LoanAmount: float data\n",
+ "- Loan_Amount_Term: float data\n",
+ "\n",
+ "Options:\n",
+ "1. Drop NaN\n",
+ "2. Replace with average\n",
+ "3. Replace with median value\n",
+ "4. Fill with a learning or other mechanism\n",
+ "\n",
+ "**Choose** option 3 because there are outliers for these fields.\n",
+ "\n",
+ "With the code below experiments can be done on the different options and the difference in outcome."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "No replacements are done for category NaN values, NaN will not get a dummy column\n"
+ ]
+ }
+ ],
+ "source": [
+ "def replace_cat_nan_values(df):\n",
+ "    \"\"\"Give missing categorical values an explicit placeholder, in place.\n",
+ "\n",
+ "    Gender and Self_Employed get the label 'missing'; Dependents and\n",
+ "    Credit_History get -1. Married is not filled, so rows where it (or any\n",
+ "    other column) is still NaN are removed by the final dropna.\n",
+ "    \"\"\"\n",
+ "    placeholders = (\n",
+ "        ('Gender', 'missing'),\n",
+ "        ('Dependents', -1),\n",
+ "        ('Self_Employed', 'missing'),\n",
+ "        ('Credit_History', -1),\n",
+ "    )\n",
+ "    # Assign the filled column back instead of calling fillna(inplace=True) on\n",
+ "    # a selected column: that form is chained assignment and is not guaranteed\n",
+ "    # to update df itself.\n",
+ "    for column, placeholder in placeholders:\n",
+ "        df[column] = df[column].fillna(placeholder)\n",
+ "    # NOTE(review): this drops every row that still has a NaN, including rows\n",
+ "    # missing LoanAmount / Loan_Amount_Term, before the numeric imputation\n",
+ "    # below can fill them - confirm that this is intended.\n",
+ "    df.dropna(inplace=True)\n",
+ "\n",
+ "def replace_nan_values_floats_median(df):\n",
+ "    \"\"\"Fill missing Loan_Amount_Term and LoanAmount with the column median, in place.\"\"\"\n",
+ "    for column in ('Loan_Amount_Term', 'LoanAmount'):\n",
+ "        df[column] = df[column].fillna(df[column].median())\n",
+ "\n",
+ "def replace_nan_values_floats_mean(df):\n",
+ "    \"\"\"Fill missing Loan_Amount_Term and LoanAmount with the column mean, in place.\"\"\"\n",
+ "    for column in ('Loan_Amount_Term', 'LoanAmount'):\n",
+ "        df[column] = df[column].fillna(df[column].mean())\n",
+ "\n",
+ "# Strategy for NaN in the categorical columns:\n",
+ "#   1 = drop the rows, 2 = leave NaN untouched, 3 = explicit placeholder category.\n",
+ "category_nan_option = 2\n",
+ "categorical_nan_columns = ['Gender','Married','Dependents','Self_Employed','Credit_History']\n",
+ "if category_nan_option == 1:\n",
+ "    train_df.dropna(subset=categorical_nan_columns,inplace=True)\n",
+ "    test_df.dropna(subset=categorical_nan_columns,inplace=True)\n",
+ "elif category_nan_option == 2:\n",
+ "    print ('No replacements are done for category NaN values, NaN will not get a dummy column')\n",
+ "elif category_nan_option == 3:\n",
+ "    replace_cat_nan_values(train_df)\n",
+ "    replace_cat_nan_values(test_df)\n",
+ "else:\n",
+ "    print ('Not implemented, defaults to option 2')\n",
+ "\n",
+ "# Strategy for NaN in the numeric loan columns:\n",
+ "#   1 = drop the rows, 2 = fill with the mean, 3 = fill with the median.\n",
+ "# The branches below test float_nan_option; testing category_nan_option here\n",
+ "# would silently apply the mean whenever the categorical option is 2.\n",
+ "float_nan_option = 3\n",
+ "float_nan_columns = ['LoanAmount','Loan_Amount_Term']\n",
+ "if float_nan_option == 1:\n",
+ "    train_df.dropna(subset=float_nan_columns,inplace=True)\n",
+ "    test_df.dropna(subset=float_nan_columns,inplace=True)\n",
+ "elif float_nan_option == 2:\n",
+ "    replace_nan_values_floats_mean(train_df)\n",
+ "    replace_nan_values_floats_mean(test_df)\n",
+ "elif float_nan_option == 3:\n",
+ "    replace_nan_values_floats_median(train_df)\n",
+ "    replace_nan_values_floats_median(test_df)\n",
+ "else:\n",
+ "    # The fallback is the median fill, which is option 3 in the list above.\n",
+ "    print ('Not implemented, defaults to option 3')\n",
+ "    replace_nan_values_floats_median(train_df)\n",
+ "    replace_nan_values_floats_median(test_df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ApplicantIncome \n",
+ " CoapplicantIncome \n",
+ " LoanAmount \n",
+ " Loan_Amount_Term \n",
+ " Credit_History \n",
+ " \n",
+ " \n",
+ " Loan_Status \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " N \n",
+ " 5446.078125 \n",
+ " 1877.807292 \n",
+ " 150.945488 \n",
+ " 344.000000 \n",
+ " 0.541899 \n",
+ " \n",
+ " \n",
+ " Y \n",
+ " 5384.068720 \n",
+ " 1504.516398 \n",
+ " 144.349606 \n",
+ " 341.090047 \n",
+ " 0.981818 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term \\\n",
+ "Loan_Status \n",
+ "N 5446.078125 1877.807292 150.945488 344.000000 \n",
+ "Y 5384.068720 1504.516398 144.349606 341.090047 \n",
+ "\n",
+ " Credit_History \n",
+ "Loan_Status \n",
+ "N 0.541899 \n",
+ "Y 0.981818 "
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Mean of every numeric column per loan outcome. numeric_only=True states\n",
+ "# explicitly that the string columns are left out of the average.\n",
+ "train_df.groupby('Loan_Status').mean(numeric_only=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count \n",
+ " mean \n",
+ " std \n",
+ " min \n",
+ " 25% \n",
+ " 50% \n",
+ " 75% \n",
+ " max \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Empty DataFrame\n",
+ "Columns: [count, mean, std, min, 25%, 50%, 75%, max]\n",
+ "Index: []"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "def find_features_with_no_variation(df):\n",
+ "    \"\"\"Return the describe() rows of the columns whose minimum equals their maximum.\"\"\"\n",
+ "    summary = df.describe().T\n",
+ "    is_constant = summary['min'].eq(summary['max'])\n",
+ "    return summary.loc[is_constant]\n",
+ "\n",
+ "find_features_with_no_variation(train_df)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Below only works for numeric features. For these numeric features, no features with a high correlation are found."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[]"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "def find_features_with_high_correlation(df,treshhold=0.95):\n",
+ "    \"\"\"Return the columns that correlate strongly with an earlier column.\n",
+ "\n",
+ "    Only numeric and boolean columns are considered. A column is reported when\n",
+ "    its absolute correlation with any column to its left exceeds\n",
+ "    `treshhold`, so each highly correlated pair is reported once.\n",
+ "    \"\"\"\n",
+ "    # Select the numeric columns explicitly; corr() on a frame that still\n",
+ "    # holds string columns raises in recent pandas versions.\n",
+ "    numeric_df = df.select_dtypes(include=['number', 'bool'])\n",
+ "    corr_matrix = numeric_df.corr().abs()\n",
+ "\n",
+ "    # Keep the strict upper triangle only (k=1 excludes the diagonal). The\n",
+ "    # builtin bool replaces the np.bool alias, which newer NumPy no longer has.\n",
+ "    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)\n",
+ "    upper = corr_matrix.where(upper_mask)\n",
+ "\n",
+ "    # Names of the columns with at least one correlation above the threshold.\n",
+ "    return [column for column in upper.columns if (upper[column] > treshhold).any()]\n",
+ "\n",
+ "find_features_with_high_correlation(train_df)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Does having an education affect yearly income? \n",
+ "Yes."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ApplicantIncome \n",
+ " \n",
+ " \n",
+ " Education \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " Graduate \n",
+ " 5857.433333 \n",
+ " \n",
+ " \n",
+ " Not Graduate \n",
+ " 3777.283582 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ApplicantIncome\n",
+ "Education \n",
+ "Graduate 5857.433333\n",
+ "Not Graduate 3777.283582"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Mean applicant income per education level. The value column is selected\n",
+ "# with a list (tuple-style selection is not supported by newer pandas), and\n",
+ "# the grouping key is not selected again as a value column.\n",
+ "train_df.groupby('Education')[['ApplicantIncome']].mean()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Does yearly income affect the requested loan amount?\n",
+ "No, there is not a strong relation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Applicant income against requested loan amount, restricted to incomes\n",
+ "# below 10000 so the bulk of the data is not squeezed by the outliers.\n",
+ "income_below_10k = train_df[train_df['ApplicantIncome']<10000]\n",
+ "plt.scatter(income_below_10k['ApplicantIncome'],income_below_10k['LoanAmount'])\n",
+ "plt.xlabel('ApplicantIncome')\n",
+ "plt.ylabel('LoanAmount')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ApplicantIncome \n",
+ " LoanAmount \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " ApplicantIncome \n",
+ " 1.000000 \n",
+ " 0.565620 \n",
+ " \n",
+ " \n",
+ " CoapplicantIncome \n",
+ " 0.116605 \n",
+ " 0.187828 \n",
+ " \n",
+ " \n",
+ " LoanAmount \n",
+ " 0.565620 \n",
+ " 1.000000 \n",
+ " \n",
+ " \n",
+ " Loan_Amount_Term \n",
+ " 0.045242 \n",
+ " 0.038801 \n",
+ " \n",
+ " \n",
+ " Credit_History \n",
+ " 0.014715 \n",
+ " 0.008301 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ApplicantIncome LoanAmount\n",
+ "ApplicantIncome 1.000000 0.565620\n",
+ "CoapplicantIncome 0.116605 0.187828\n",
+ "LoanAmount 0.565620 1.000000\n",
+ "Loan_Amount_Term 0.045242 0.038801\n",
+ "Credit_History 0.014715 0.008301"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Absolute correlations between the numeric columns. The numeric columns are\n",
+ "# selected explicitly because corr() on a frame that still holds string\n",
+ "# columns raises in recent pandas versions.\n",
+ "corr_matrix = train_df.select_dtypes(include=['number', 'bool']).corr().abs()\n",
+ "corr_matrix[['ApplicantIncome','LoanAmount']]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Does an education in combination with the requested loan amount affect the approval of a loan?\n",
+ "No, see below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " LoanAmount \n",
+ " \n",
+ " \n",
+ " Education \n",
+ " Loan_Status \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " Graduate \n",
+ " N \n",
+ " 160.746236 \n",
+ " \n",
+ " \n",
+ " Y \n",
+ " 150.969734 \n",
+ " \n",
+ " \n",
+ " Not Graduate \n",
+ " N \n",
+ " 124.558862 \n",
+ " \n",
+ " \n",
+ " Y \n",
+ " 116.900297 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " LoanAmount\n",
+ "Education Loan_Status \n",
+ "Graduate N 160.746236\n",
+ " Y 150.969734\n",
+ "Not Graduate N 124.558862\n",
+ " Y 116.900297"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Mean loan amount per education level and loan outcome. Only the value\n",
+ "# column is selected; the grouping keys already form the index.\n",
+ "train_df.groupby(['Education','Loan_Status'])[['LoanAmount']].mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " LoanAmount \n",
+ " \n",
+ " \n",
+ " Education \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " Graduate \n",
+ " 153.821213 \n",
+ " \n",
+ " \n",
+ " Not Graduate \n",
+ " 119.872277 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " LoanAmount\n",
+ "Education \n",
+ "Graduate 153.821213\n",
+ "Not Graduate 119.872277"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Mean loan amount per education level. Only the value column is selected;\n",
+ "# the grouping key already forms the index.\n",
+ "train_df.groupby(['Education'])[['LoanAmount']].mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " LoanAmount \n",
+ " \n",
+ " \n",
+ " Loan_Status \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " N \n",
+ " 150.945488 \n",
+ " \n",
+ " \n",
+ " Y \n",
+ " 144.349606 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " LoanAmount\n",
+ "Loan_Status \n",
+ "N 150.945488\n",
+ "Y 144.349606"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Mean loan amount per loan outcome. Only the value column is selected;\n",
+ "# the grouping key already forms the index.\n",
+ "train_df.groupby(['Loan_Status'])[['LoanAmount']].mean()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "***Findings***\n",
+ "- Graduates tend to request higher loans in general\n",
+ "- In general, a higher LoanAmount is more likely to be denied, but the difference is small\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Categorical data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Column \n",
+ " Nr of unique values \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " Gender \n",
+ " 3 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " Married \n",
+ " 3 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " Dependents \n",
+ " 5 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " Education \n",
+ " 2 \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " Self_Employed \n",
+ " 3 \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " Loan_Amount_Term \n",
+ " 11 \n",
+ " \n",
+ " \n",
+ " 10 \n",
+ " Credit_History \n",
+ " 3 \n",
+ " \n",
+ " \n",
+ " 11 \n",
+ " Property_Area \n",
+ " 3 \n",
+ " \n",
+ " \n",
+ " 12 \n",
+ " Loan_Status \n",
+ " 2 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Column Nr of unique values\n",
+ "1 Gender 3\n",
+ "2 Married 3\n",
+ "3 Dependents 5\n",
+ "4 Education 2\n",
+ "5 Self_Employed 3\n",
+ "9 Loan_Amount_Term 11\n",
+ "10 Credit_History 3\n",
+ "11 Property_Area 3\n",
+ "12 Loan_Status 2"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# One (column name, number of distinct values) record per column.\n",
+ "# As with .unique(), NaN counts as a value of its own.\n",
+ "unique_values_df = pd.DataFrame(\n",
+ "    [(col, len(train_df[col].unique())) for col in train_df.columns],\n",
+ "    columns=['Column','Nr of unique values'],\n",
+ ")\n",
+ "\n",
+ "# Show only the columns with fewer than 100 distinct values\n",
+ "unique_values_df[unique_values_df['Nr of unique values']<100]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For categorical data it is best to convert the values to dummy (one-hot) variables. \n",
+ "\n",
+ "***Notes:***\n",
+ "- Notice that this makes a huge difference for the Credit_History in particular. If this attribute is not converted, the results are very different."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cat_vars=['Gender','Married','Dependents','Education','Self_Employed','Property_Area']\n",
+ "\n",
+ "# --- training data ---\n",
+ "# The string-valued categoricals are one-hot encoded in a single call\n",
+ "cat_df = pd.get_dummies(train_df[cat_vars])\n",
+ "one_hot_features = cat_df.columns\n",
+ "\n",
+ "# Credit_History holds floats, so it is encoded on its own with an explicit prefix.\n",
+ "# Dummies, original columns and Credit_History dummies are then placed side by side.\n",
+ "train_data_with_cat = pd.concat(\n",
+ "    [\n",
+ "        cat_df,\n",
+ "        train_df,\n",
+ "        pd.get_dummies(train_df[\"Credit_History\"], prefix=\"Credit_History\", prefix_sep=\"_\"),\n",
+ "    ],\n",
+ "    axis=1,\n",
+ ")\n",
+ "\n",
+ "# --- test data: same encoding ---\n",
+ "cat_df = pd.get_dummies(test_df[cat_vars])\n",
+ "test_data_with_cat = pd.concat(\n",
+ "    [\n",
+ "        cat_df,\n",
+ "        test_df,\n",
+ "        pd.get_dummies(test_df[\"Credit_History\"], prefix=\"Credit_History\", prefix_sep=\"_\"),\n",
+ "    ],\n",
+ "    axis=1,\n",
+ ")\n",
+ "\n",
+ "# From here on the raw Credit_History column is listed with the other raw categoricals\n",
+ "cat_vars.append('Credit_History')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['ApplicantIncome', 'CoapplicantIncome', 'Credit_History_0.0',\n",
+ " 'Credit_History_1.0', 'Dependents_0', 'Dependents_1', 'Dependents_2',\n",
+ " 'Dependents_3+', 'Education_Graduate', 'Education_Not Graduate',\n",
+ " 'Gender_Female', 'Gender_Male', 'LoanAmount', 'Loan_Amount_Term',\n",
+ " 'Married_No', 'Married_Yes', 'Property_Area_Rural',\n",
+ " 'Property_Area_Semiurban', 'Property_Area_Urban', 'Self_Employed_No',\n",
+ " 'Self_Employed_Yes'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Target column\n",
+ "y_cols = ['Loan_Status']\n",
+ "# 'Loan_ID' has no prediction value\n",
+ "do_not_use = ['Loan_ID']\n",
+ "\n",
+ "# Everything that is not the target, a raw (un-encoded) categorical or an identifier is a feature\n",
+ "feature_cols = train_data_with_cat.columns.difference(y_cols + cat_vars + do_not_use)\n",
+ "feature_cols"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Feature selection"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of features before selection: 21\n",
+ "[False False True True False True True False True False True True\n",
+ " False False True False True True False False False]\n",
+ "[12 11 1 1 5 1 1 7 1 8 1 1 9 10 1 2 1 1 6 3 4]\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.feature_selection import RFE\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "\n",
+ "# Recursive feature elimination driven by a logistic regression\n",
+ "logreg = LogisticRegression()\n",
+ "\n",
+ "print ('Number of features before selection:', len(feature_cols))\n",
+ "# None leaves the number of kept features to RFE's default (half of them, see the output below)\n",
+ "n_features_to_select = None\n",
+ "# Passed by keyword: recent scikit-learn releases no longer accept this argument positionally\n",
+ "rfe = RFE(logreg, n_features_to_select=n_features_to_select)\n",
+ "rfe = rfe.fit(train_data_with_cat[feature_cols], train_data_with_cat['Loan_Status'] )\n",
+ "# support_: boolean mask of the kept features; ranking_: 1 = kept, higher = eliminated earlier\n",
+ "print(rfe.support_)\n",
+ "print(rfe.ranking_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Credit_History_0.0',\n",
+ " 'Credit_History_1.0',\n",
+ " 'Dependents_1',\n",
+ " 'Dependents_2',\n",
+ " 'Education_Graduate',\n",
+ " 'Gender_Female',\n",
+ " 'Gender_Male',\n",
+ " 'Married_No',\n",
+ " 'Property_Area_Rural',\n",
+ " 'Property_Area_Semiurban']"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Keep the names of the features that RFE marked as selected (boolean mask on the column index)\n",
+ "selected_cols = list(feature_cols[rfe.support_])\n",
+ "selected_cols"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# try with manual selected columns\n",
+ "#selected_cols = ['Credit_History_0.0','Credit_History_1.0','Dependents_1','Education_Graduate','Married_No','Property_Area_Semiurban']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The test data does not have the Y column.\n",
+ "Since the y values are not known, the test file is not suitable for validation. So a part of the train data is used for testing."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn import model_selection\n",
+ "\n",
+ "# Hold out 30% of the labelled rows for testing; the fixed seed keeps the split reproducible\n",
+ "X = train_data_with_cat[selected_cols]\n",
+ "Y = train_data_with_cat['Loan_Status']\n",
+ "X_train, X_test, Y_train, Y_test = model_selection.train_test_split(\n",
+ "    X, Y, test_size=0.3, random_state=0\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
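+ "Try different models. For each model the average accuracy over 10-fold cross-validation is calculated. Then each model is fitted on the training split and its accuracy on the held-out test split is reported."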
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "LR: 0.797121 (0.058265)\n",
+ "LDA: 0.801772 (0.052655)\n",
+ "KNN: 0.776190 (0.061810)\n",
+ "CART: 0.725028 (0.071601)\n",
+ "NB: 0.766888 (0.081246)\n",
+ "SVM: 0.801772 (0.052655)\n",
+ "RFC: 0.750554 (0.057116)\n",
+ "Fitting:LR\n",
+ "Accuracy of LR on test set: 0.83\n",
+ "Fitting:LDA\n",
+ "Accuracy of LDA on test set: 0.83\n",
+ "Fitting:KNN\n",
+ "Accuracy of KNN on test set: 0.82\n",
+ "Fitting:CART\n",
+ "Accuracy of CART on test set: 0.79\n",
+ "Fitting:NB\n",
+ "Accuracy of NB on test set: 0.80\n",
+ "Fitting:SVM\n",
+ "Accuracy of SVM on test set: 0.83\n",
+ "Fitting:RFC\n",
+ "Accuracy of RFC on test set: 0.80\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.metrics import classification_report\n",
+ "from sklearn.metrics import confusion_matrix\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.tree import DecisionTreeClassifier\n",
+ "from sklearn.neighbors import KNeighborsClassifier\n",
+ "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
+ "from sklearn.naive_bayes import GaussianNB\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.svm import SVC\n",
+ "\n",
+ "def apply_classification_models(X_train,Y_train,X_test,result,Y_test_labels=None):\n",
+ "    \"\"\"Cross-validate a fixed set of classifiers, then fit each one and score it on the test split.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    X_train, Y_train : features and labels of the training split.\n",
+ "    X_test : features of the hold-out split.\n",
+ "    result : DataFrame that receives one 'Predicted_by_<name>' column per model\n",
+ "        (modified in place).\n",
+ "    Y_test_labels : true labels of the hold-out split. When omitted, the notebook-level\n",
+ "        variable Y_test is used, so existing four-argument calls keep working.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    list of (name, model) tuples; every model has been fitted on X_train/Y_train.\n",
+ "    \"\"\"\n",
+ "    if Y_test_labels is None:\n",
+ "        # Backward compatibility: fall back to the global created by train_test_split\n",
+ "        Y_test_labels = Y_test\n",
+ "\n",
+ "    models = [\n",
+ "        ('LR', LogisticRegression()),\n",
+ "        ('LDA', LinearDiscriminantAnalysis()),\n",
+ "        ('KNN', KNeighborsClassifier()),\n",
+ "        # Tree-based models are randomised; a fixed seed makes repeated runs comparable\n",
+ "        ('CART', DecisionTreeClassifier(random_state=0)),\n",
+ "        ('NB', GaussianNB()),\n",
+ "        ('SVM', SVC()),\n",
+ "        ('RFC', RandomForestClassifier(random_state=0)),\n",
+ "    ]\n",
+ "\n",
+ "    # evaluate each model in turn: mean and std of the accuracy over 10 folds of the training split\n",
+ "    kfold = model_selection.KFold(n_splits=10)\n",
+ "    for name, model in models:\n",
+ "        cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')\n",
+ "        msg = \"%s: %f (%f)\" % (name, cv_results.mean(), cv_results.std())\n",
+ "        print(msg)\n",
+ "\n",
+ "    # now fit each model on the train data and predict on the test data\n",
+ "    for name, model in models:\n",
+ "        print ('Fitting:' + name)\n",
+ "        model.fit(X_train,Y_train)\n",
+ "        y_pred = model.predict(X_test)\n",
+ "\n",
+ "        # Score the predictions already made instead of predicting a second time\n",
+ "        print('Accuracy of {} on test set: {:.2f}'.format(name,accuracy_score(Y_test_labels, y_pred)))\n",
+ "        result['Predicted_by_'+name] = y_pred\n",
+ "\n",
+ "    return models\n",
+ "\n",
+ "result = pd.DataFrame()\n",
+ "models = apply_classification_models(X_train,Y_train,X_test,result,Y_test)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Result of each predictor on the test data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Predicted_by_LR \n",
+ " Predicted_by_LDA \n",
+ " Predicted_by_KNN \n",
+ " Predicted_by_CART \n",
+ " Predicted_by_NB \n",
+ " Predicted_by_SVM \n",
+ " Predicted_by_RFC \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Predicted_by_LR Predicted_by_LDA Predicted_by_KNN Predicted_by_CART \\\n",
+ "0 Y Y Y Y \n",
+ "1 Y Y Y Y \n",
+ "2 Y Y Y Y \n",
+ "3 Y Y Y Y \n",
+ "4 Y Y Y Y \n",
+ "\n",
+ " Predicted_by_NB Predicted_by_SVM Predicted_by_RFC \n",
+ "0 Y Y Y \n",
+ "1 Y Y Y \n",
+ "2 Y Y Y \n",
+ "3 Y Y Y \n",
+ "4 Y Y Y "
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "result.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# But what if we had chosen different ways to clean the missing values?\n",
+ "\n",
+ "So let's repeat the code from above and experiment with different cleaning methods. Experimenting with the code below, it turns out that ignoring the NaN values (option 2) and replacing the floats with median or mean values gives the best results. \n",
+ "\n",
+ "In fact, the best results are achieved by selecting only 'Credit_History_0.0'. This means that not having a credit history is the best indicator of whether a loan will be granted. In practice that would not be really helpful.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "No replacements are done for category NaN values, NaN will not get a dummy column\n"
+ ]
+ }
+ ],
+ "source": [
+ "train_df = pd.read_csv('../data/train.csv')\n",
+ "test_df = pd.read_csv('../data/test.csv')\n",
+ "\n",
+ "# NaN handling for the categorical columns:\n",
+ "#   1 = drop the rows, 2 = keep NaN (no dummy column is created for it),\n",
+ "#   3 = replace_cat_nan_values (helper defined earlier in the notebook)\n",
+ "category_nan_option = 2\n",
+ "if category_nan_option == 1:\n",
+ "    train_df.dropna(subset=['Gender','Married','Dependents','Self_Employed','Credit_History'],inplace=True)\n",
+ "    test_df.dropna(subset=['Gender','Married','Dependents','Self_Employed','Credit_History'],inplace=True)\n",
+ "elif category_nan_option == 2:\n",
+ "    print ('No replacements are done for category NaN values, NaN will not get a dummy column')\n",
+ "elif category_nan_option == 3:\n",
+ "    replace_cat_nan_values(train_df)\n",
+ "    replace_cat_nan_values(test_df)\n",
+ "else:\n",
+ "    print ('Not implemented, defaults to option 2')\n",
+ "\n",
+ "# NaN handling for the numeric loan columns (helper names suggest mean / median fill):\n",
+ "#   1 = drop the rows, 2 = replace_nan_values_floats_mean, 3 = replace_nan_values_floats_median\n",
+ "# The branches test float_nan_option; testing category_nan_option here would ignore this setting.\n",
+ "float_nan_option = 3\n",
+ "if float_nan_option == 1:\n",
+ "    train_df.dropna(subset=['LoanAmount','Loan_Amount_Term'],inplace=True)\n",
+ "    test_df.dropna(subset=['LoanAmount','Loan_Amount_Term'],inplace=True)\n",
+ "elif float_nan_option == 2:\n",
+ "    replace_nan_values_floats_mean(train_df)\n",
+ "    replace_nan_values_floats_mean(test_df)\n",
+ "elif float_nan_option == 3:\n",
+ "    replace_nan_values_floats_median(train_df)\n",
+ "    replace_nan_values_floats_median(test_df)\n",
+ "else:\n",
+ "    # The fallback is the median replacement, i.e. option 3\n",
+ "    print ('Not implemented, defaults to option 3')\n",
+ "    replace_nan_values_floats_median(train_df)\n",
+ "    replace_nan_values_floats_median(test_df)\n",
+ "\n",
+ "cat_vars=['Gender','Married','Dependents','Education','Self_Employed','Property_Area']\n",
+ "\n",
+ "# One-hot encode the string-valued categoricals of the training data\n",
+ "cat_df = pd.get_dummies(train_df[cat_vars])\n",
+ "one_hot_features = cat_df.columns\n",
+ "\n",
+ "# Add one hot encoded data to original data\n",
+ "train_data_with_cat = pd.concat([cat_df, train_df], axis = 1)\n",
+ "# For Credit_History it does not work in one go because the values are float\n",
+ "train_data_with_cat = pd.concat([train_data_with_cat, pd.get_dummies(train_data_with_cat[\"Credit_History\"], prefix=\"Credit_History\", prefix_sep=\"_\")], axis=1)\n",
+ "\n",
+ "# apply the same on test\n",
+ "cat_df = pd.get_dummies(test_df[cat_vars])\n",
+ "\n",
+ "# Add one hot encoded data to original data\n",
+ "test_data_with_cat = pd.concat([cat_df, test_df], axis = 1)\n",
+ "test_data_with_cat = pd.concat([test_data_with_cat, pd.get_dummies(test_data_with_cat[\"Credit_History\"], prefix=\"Credit_History\", prefix_sep=\"_\")], axis=1)\n",
+ "\n",
+ "# The raw Credit_History column is excluded from the features together with the other raw categoricals\n",
+ "cat_vars.append('Credit_History')\n",
+ "y_cols = ['Loan_Status']\n",
+ "\n",
+ "# 'Loan_ID' has no prediction value\n",
+ "do_not_use = ['Loan_ID']\n",
+ "feature_cols = train_data_with_cat.columns.difference(set(y_cols + cat_vars + do_not_use))\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Feature selection\n",
+ "\n",
+ "***Notes***\n",
+ "- Experimenting with the code below it turns out that the 'Credit_History_0.0' is the most defining feature. \n",
+ "- Setting ```n_features_to_select``` to 1 causes 'Credit_History_0.0' to be the selected feature and with this one feature selected the results are the same!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of features before selection: 21\n",
+ "[False False True True False True True False True False True True\n",
+ " False False True False True True False False False]\n",
+ "[12 11 1 1 5 1 1 7 1 8 1 1 9 10 1 2 1 1 6 3 4]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Recursive feature elimination driven by a logistic regression, on the re-cleaned data\n",
+ "logreg = LogisticRegression()\n",
+ "\n",
+ "print ('Number of features before selection:', len(feature_cols))\n",
+ "# None leaves the number of kept features to RFE's default; set to 1 to keep a single feature\n",
+ "n_features_to_select = None\n",
+ "# Passed by keyword: recent scikit-learn releases no longer accept this argument positionally\n",
+ "rfe = RFE(logreg, n_features_to_select=n_features_to_select)\n",
+ "rfe = rfe.fit(train_data_with_cat[feature_cols], train_data_with_cat['Loan_Status'] )\n",
+ "# support_: boolean mask of the kept features; ranking_: 1 = kept, higher = eliminated earlier\n",
+ "print(rfe.support_)\n",
+ "print(rfe.ranking_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Credit_History_0.0',\n",
+ " 'Credit_History_1.0',\n",
+ " 'Dependents_1',\n",
+ " 'Dependents_2',\n",
+ " 'Education_Graduate',\n",
+ " 'Gender_Female',\n",
+ " 'Gender_Male',\n",
+ " 'Married_No',\n",
+ " 'Property_Area_Rural',\n",
+ " 'Property_Area_Semiurban']"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Keep the names of the features that RFE marked as selected (boolean mask on the column index)\n",
+ "selected_cols = list(feature_cols[rfe.support_])\n",
+ "selected_cols"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# try with manual selected columns\n",
+ "# try with manual selected columns\n",
+ "manual_select_columns = False\n",
+ "if manual_select_columns:\n",
+ "    # Override the RFE selection; the name must match the variable the next cell reads\n",
+ "    selected_cols = ['Property_Area_Semiurban']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "LR: 0.797121 (0.058265)\n",
+ "LDA: 0.801772 (0.052655)\n",
+ "KNN: 0.776190 (0.061810)\n",
+ "CART: 0.727353 (0.071051)\n",
+ "NB: 0.766888 (0.081246)\n",
+ "SVM: 0.801772 (0.052655)\n",
+ "RFC: 0.755260 (0.058977)\n",
+ "Fitting:LR\n",
+ "Accuracy of LR on test set: 0.83\n",
+ "Fitting:LDA\n",
+ "Accuracy of LDA on test set: 0.83\n",
+ "Fitting:KNN\n",
+ "Accuracy of KNN on test set: 0.82\n",
+ "Fitting:CART\n",
+ "Accuracy of CART on test set: 0.79\n",
+ "Fitting:NB\n",
+ "Accuracy of NB on test set: 0.80\n",
+ "Fitting:SVM\n",
+ "Accuracy of SVM on test set: 0.83\n",
+ "Fitting:RFC\n",
+ "Accuracy of RFC on test set: 0.80\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Repeat the 70/30 split and the model comparison on the re-cleaned data\n",
+ "X = train_data_with_cat[selected_cols]\n",
+ "Y = train_data_with_cat['Loan_Status']\n",
+ "X_train, X_test, Y_train, Y_test = model_selection.train_test_split(\n",
+ "    X, Y, test_size=0.3, random_state=0\n",
+ ")\n",
+ "\n",
+ "# `result` receives one prediction column per model\n",
+ "result = pd.DataFrame()\n",
+ "models = apply_classification_models(X_train,Y_train,X_test,result)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# What is the relation between 'Loan_Status' and 'Credit_History'?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " Credit_History \n",
+ " 0.0 \n",
+ " 1.0 \n",
+ " nan \n",
+ " \n",
+ " \n",
+ " Loan_Status \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " N \n",
+ " 0.921348 \n",
+ " 0.204211 \n",
+ " 0.26 \n",
+ " \n",
+ " \n",
+ " Y \n",
+ " 0.078652 \n",
+ " 0.795789 \n",
+ " 0.74 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Credit_History 0.0 1.0 nan\n",
+ "Loan_Status \n",
+ "N 0.921348 0.204211 0.26\n",
+ "Y 0.078652 0.795789 0.74"
+ ]
+ },
+ "execution_count": 40,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Share of denied (N) / approved (Y) loans within each Credit_History value: every column sums to 1.\n",
+ "# astype(str) turns missing values into the label 'nan' so they get a column of their own.\n",
+ "pd.crosstab(\n",
+ "    index=train_df['Loan_Status'],\n",
+ "    columns=train_df['Credit_History'].astype(str),\n",
+ "    normalize='columns',\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Y 0.687296\n",
+ "N 0.312704\n",
+ "Name: Loan_Status, dtype: float64"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_df['Loan_Status'].value_counts(normalize=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# So how useful is the logistic regression model?\n",
+ "The logistic regression model above reaches an accuracy of 0.83. That seems to be the best, but how useful and good is it really?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "13"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# merge the original fields back to the test\n",
+ "# Re-attach every column that was not part of the model input to the test rows (joined on the row index)\n",
+ "cols = train_data_with_cat.columns.difference(X_test.columns)\n",
+ "X_test_all = X_test.merge(train_data_with_cat[cols], left_index=True, right_index=True)\n",
+ "\n",
+ "# Number of test rows whose Credit_History is missing\n",
+ "X_test_all['Credit_History'].isna().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Predicted_by_LR \n",
+ " Predicted_by_LDA \n",
+ " Predicted_by_KNN \n",
+ " Predicted_by_CART \n",
+ " Predicted_by_NB \n",
+ " Predicted_by_SVM \n",
+ " Predicted_by_RFC \n",
+ " index \n",
+ " Credit_History_0.0 \n",
+ " Credit_History_1.0 \n",
+ " ... \n",
+ " Loan_Amount_Term \n",
+ " Loan_ID \n",
+ " Loan_Status \n",
+ " Married \n",
+ " Married_Yes \n",
+ " Property_Area \n",
+ " Property_Area_Urban \n",
+ " Self_Employed \n",
+ " Self_Employed_No \n",
+ " Self_Employed_Yes \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " 454 \n",
+ " 0 \n",
+ " 1 \n",
+ " ... \n",
+ " 360.0 \n",
+ " LP002453 \n",
+ " Y \n",
+ " No \n",
+ " 0 \n",
+ " Semiurban \n",
+ " 0 \n",
+ " Yes \n",
+ " 0 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " 52 \n",
+ " 0 \n",
+ " 1 \n",
+ " ... \n",
+ " 360.0 \n",
+ " LP001164 \n",
+ " N \n",
+ " No \n",
+ " 0 \n",
+ " Semiurban \n",
+ " 0 \n",
+ " No \n",
+ " 1 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " 536 \n",
+ " 0 \n",
+ " 1 \n",
+ " ... \n",
+ " 360.0 \n",
+ " LP002734 \n",
+ " Y \n",
+ " Yes \n",
+ " 1 \n",
+ " Urban \n",
+ " 1 \n",
+ " No \n",
+ " 1 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " 469 \n",
+ " 0 \n",
+ " 1 \n",
+ " ... \n",
+ " 360.0 \n",
+ " LP002505 \n",
+ " N \n",
+ " Yes \n",
+ " 1 \n",
+ " Urban \n",
+ " 1 \n",
+ " No \n",
+ " 1 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " 55 \n",
+ " 0 \n",
+ " 1 \n",
+ " ... \n",
+ " 360.0 \n",
+ " LP001194 \n",
+ " Y \n",
+ " Yes \n",
+ " 1 \n",
+ " Semiurban \n",
+ " 0 \n",
+ " No \n",
+ " 1 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
5 rows × 38 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Predicted_by_LR Predicted_by_LDA Predicted_by_KNN Predicted_by_CART \\\n",
+ "0 Y Y Y Y \n",
+ "1 Y Y Y Y \n",
+ "2 Y Y Y Y \n",
+ "3 Y Y Y Y \n",
+ "4 Y Y Y Y \n",
+ "\n",
+ " Predicted_by_NB Predicted_by_SVM Predicted_by_RFC index \\\n",
+ "0 Y Y Y 454 \n",
+ "1 Y Y Y 52 \n",
+ "2 Y Y Y 536 \n",
+ "3 Y Y Y 469 \n",
+ "4 Y Y Y 55 \n",
+ "\n",
+ " Credit_History_0.0 Credit_History_1.0 ... \\\n",
+ "0 0 1 ... \n",
+ "1 0 1 ... \n",
+ "2 0 1 ... \n",
+ "3 0 1 ... \n",
+ "4 0 1 ... \n",
+ "\n",
+ " Loan_Amount_Term Loan_ID Loan_Status Married Married_Yes \\\n",
+ "0 360.0 LP002453 Y No 0 \n",
+ "1 360.0 LP001164 N No 0 \n",
+ "2 360.0 LP002734 Y Yes 1 \n",
+ "3 360.0 LP002505 N Yes 1 \n",
+ "4 360.0 LP001194 Y Yes 1 \n",
+ "\n",
+ " Property_Area Property_Area_Urban Self_Employed Self_Employed_No \\\n",
+ "0 Semiurban 0 Yes 0 \n",
+ "1 Semiurban 0 No 1 \n",
+ "2 Urban 1 No 1 \n",
+ "3 Urban 1 No 1 \n",
+ "4 Semiurban 0 No 1 \n",
+ "\n",
+ " Self_Employed_Yes \n",
+ "0 1 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ "[5 rows x 38 columns]"
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# merge back to the result\n",
+ "# merge back to the result\n",
+ "# `result` is numbered 0..n-1, so the index of X_test_all is reset to line the rows up by position;\n",
+ "# the original row label survives in the 'index' column.\n",
+ "# NOTE: this cell overwrites `result`, so run it only once after the models have been applied.\n",
+ "result = result.merge(X_test_all.reset_index(), left_index=True, right_index=True)\n",
+ "result.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Credit_History \n",
+ " Credit_History_0.0 \n",
+ " Predicted_by_LR \n",
+ " Predicted_by_RFC \n",
+ " Loan_Status \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 16 \n",
+ " NaN \n",
+ " 0 \n",
+ " Y \n",
+ " Y \n",
+ " N \n",
+ " \n",
+ " \n",
+ " 18 \n",
+ " NaN \n",
+ " 0 \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " \n",
+ " \n",
+ " 24 \n",
+ " NaN \n",
+ " 0 \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " \n",
+ " \n",
+ " 55 \n",
+ " NaN \n",
+ " 0 \n",
+ " Y \n",
+ " N \n",
+ " Y \n",
+ " \n",
+ " \n",
+ " 66 \n",
+ " NaN \n",
+ " 0 \n",
+ " Y \n",
+ " Y \n",
+ " Y \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Credit_History Credit_History_0.0 Predicted_by_LR Predicted_by_RFC \\\n",
+ "16 NaN 0 Y Y \n",
+ "18 NaN 0 Y Y \n",
+ "24 NaN 0 Y Y \n",
+ "55 NaN 0 Y N \n",
+ "66 NaN 0 Y Y \n",
+ "\n",
+ " Loan_Status \n",
+ "16 N \n",
+ "18 Y \n",
+ "24 Y \n",
+ "55 Y \n",
+ "66 Y "
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# First test rows without a known credit history: predictions next to the true Loan_Status\n",
+ "result.loc[\n",
+ "    result['Credit_History'].isnull(),\n",
+ "    ['Credit_History','Credit_History_0.0','Predicted_by_LR','Predicted_by_RFC','Loan_Status'],\n",
+ "].head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEUCAYAAAAlXv26AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAGbVJREFUeJzt3XuUVeWd5vHvA4gIKAiUxFho0UgrEQRJaYOAl1ajRoM6iooyEMUQI47jtcVhZqDXkhVcEm/jLUyDoEHRMdiirTaKGo0GsLgYlYvgvdRIqQEBJQH5zR9nQ8rKoS7nnOJUbZ7PWrXq7He/e78/Ti2e2vWefVFEYGZm6dWi2AWYmVnjctCbmaWcg97MLOUc9GZmKeegNzNLOQe9mVnKOejNzFLOQW9mlnIOejOzlHPQm5mlXKtiFwDQpUuXKCsrK3YZZmbNyuLFiz+PiJK6+jWJoC8rK6OioqLYZZiZNSuSPqhPP0/dmJmlnIPezCzlHPRmZinXJObozaz52rJlC5WVlWzevLnYpaRWmzZtKC0tZY899shpewe9meWlsrKSvffem7KyMiQVu5zUiQi++OILKisr6d69e0778NSNmeVl8+bNdO7c2SHfSCTRuXPnvP5ictCbWd4c8o0r3/fXQW9mlnKeozery8QOu3i89bt2vEbQsmVL+vTpw9atW+nVqxczZ86kbdu2Oe3rxRdfZMqUKTz55JPMnTuX5cuXM27cuKx9161bx4MPPshll13WoDEmTpxI+/btufbaa7OuP+6445gyZQrl5eUNrr+uMaq/V927d+eBBx6gY8eOOY+TjY/ozazg9tprL5YtW8abb75J69atuffee7+zPiLYtm1bg/c7dOjQnYY8ZIL+7rvvbvB+i6n6e9WpUyfuuuuugo/hoDezRjVkyBDWrFnD+++/T69evbjsssvo378/H330EfPmzWPgwIH079+fYcOGsXHjRgCeeeYZDj30UAYPHsycOXN27GvGjBlcfvnlAHz22WecddZZ9O3bl759+/Lqq68ybtw43nnnHfr168d1110HwM0338yRRx7J4YcfzoQJE3bsa9KkSRxyyCGceOKJrFq1qs5/x29+8xuOPvpoevfuzaJFi9i2bRs9e/akqqoKgG3btnHwwQfz+eef5/xeDRw4kI8//jjn7XfGQW9mjWbr1q08/fTT9OnTB4BVq1YxcuRIli5dSrt27bjxxht57rnnWLJkCeXl5dxyyy1s3ryZn/3sZzzxxBO8/PLL/OlPf8q67yuuuIJjjz2W119/nSVLlnDYYYcxefJkevTowbJly7j55puZN28eq1evZtGiRSxbtozFixfz0ksvsXjxYmbPns3SpUuZM2cOr732Wp3/lk2bNvHqq69y9913c/HFF9OiRQtGjBjBrFmzAHjuuefo27cvXbp0yem9+vbbb5k/fz5Dhw7NafvaOOjNrOC++eYb+vXrR3l5OQceeCCjR48G4KCDDmLAgAEALFiwgOXLlzNo0CD69evHzJkz+eCDD1i5ciXdu3enZ8+eSGLEiBFZx3j++ef5xS9+AWTmuTt0+PvPUubNm8e8efM44ogj6N+/PytXrmT16tW8/PLLnHXWWbRt25Z99tmnXuE6fPhwAI455hi++uor1q1bx8UXX8z9998PwPTp07noootyfq86d+7Ml19+yUknndTgfdTFH8aaWcFtn3euqV27djteRwQnnXQSDz300Hf6LFu2rGCna0YEN9xwAz//+c+/037bbbc1eIya/SXRrVs3unbtyvPPP8/ChQt3HN03xPb3av369Zx++uncddddXHHFFQ3eT218RG9mRTFgwABeeeUV1qxZA8DXX3/N22+/zaGHHsp7773HO++8A/B3vwi2O+GEE7jnnnuAzLTHV199xd57782GDRt29Dn55JOZPn36jrn/jz/+mLVr13LMMcfw2GOP8c0337BhwwaeeOKJ
Out9+OGHAfj9739Phw4ddvwFcckllzBixAjOPfdcWrZsmeO7AR06dOCOO+5gypQpbNmyJef9ZOOgN7OiKCkpYcaMGQwfPpzDDz+cAQMGsHLlStq0acPUqVM57bTTGDx4MAcddFDW7W+//XZeeOEF+vTpww9/+EPeeustOnfuzKBBg+jduzfXXXcdP/rRj7jgggsYOHAgffr04ZxzzmHDhg3079+f8847j379+nH22WczZMiQOuvdd999Ofroo7n00kuZNm3ajvahQ4eycePGek3b3HjjjZSWlu74qumII46gb9++zJ49u859NYQioqA7zEV5eXn4wSPWZPk8+lqtWLGCXr16FbuMoqmoqOCqq67i5ZdfbtRxsr3PkhZHRJ0n99d5RC9puqS1kt7Msu5aSSGpS7IsSXdIWiPpj5L6N+DfYWbWrEyePJmzzz6bX/7yl8UupVb1mbqZAZxSs1FSN+Ak4MNqzacCPZOvMcA9+ZdoZrZrjB07ln79+n3n67777ttp/3HjxvHBBx8wePDgHW2TJk36u31MmjRpV5S/U3WedRMRL0kqy7LqVuBfgMertZ0B3B+Z+aAFkjpK2j8iPi1EsWZmjakQV6WOHz+e8ePHF6Cawsnpw1hJQ4GPI+L1GqsOAD6qtlyZtGXbxxhJFZIqtl9ZZmZmhdfgoJfUFhgP/O9sq7O0Zf20NyKmRkR5RJSXlJQ0tAwzM6unXC6Y6gF0B15PLiAoBZZIOorMEXy3an1LgU/yLdLMzHLX4KCPiDeA/bYvS3ofKI+IzyXNBS6XNBv4J2C95+fNrFDKxv1HQff3/uTT6uwjiauvvppf/epXAEyZMoWNGzcyceLEgtbSmOpzeuVDwB+AQyRVShpdS/engHeBNcD/BRp2U2gzsyZmzz33ZM6cOXndlbLY6gz6iBgeEftHxB4RURoR02qsL4uIz5PXERFjI6JHRPSJCF8FZWbNWqtWrRgzZgy33nprsUvJmW+BYGZWh7FjxzJr1izWr29eVy1v56A3M6vDPvvsw8iRI7njjjuKXUpOHPRmZvVw5ZVXMm3aNDZt2lTsUhrMQW9mVg+dOnXi3HPP/c6dK5sLP3jEzJqN+pwO2ZiuueYa7rzzzqLWkAsHvZlZLbY/tASga9eufP3110WsJjeeujEzSzkHvZlZyjnozcxSzkFvZpZyDnozs5Rz0JuZpZxPrzSz5mNihwLvr/Z710QEQ4YMYfz48Zx66qkAPPLII0yfPp1nnnmmsLU0Ige9mdlOSOLee+9l2LBhHH/88Xz77beMHz++WYU8OOjNzGrVu3dvfvKTn3DTTTexadMmRo4cSY8ePYpdVoM46M3M6jBhwgT69+9P69atqahofo/ZcNCbmdWhXbt2nHfeebRv354999yz2OU0mM+6MTOrhxYtWtCiRfOMzPo8M3a6pLWS3qzWdrOklZL+KOkxSR2rrbtB0hpJqySd3FiFm5lZ/dRn6mYGcCdwf7W2Z4EbImKrpJuAG4DrJf0AOB84DPg+8Jykf4yIbwtbtpntluo4HdKyqzPoI+IlSWU12uZVW1wAnJO8PgOYHRF/Ad6TtAY4CvhDQao1MyuSiRMnFruEnBViwuli4Onk9QHAR9XWVSZtZmZWJHkFvaTxwFZg1vamLN1iJ9uOkVQhqaKqqiqfMszMrBY5B72kUcDpwIURsT3MK4Fu1bqVAp9k2z4ipkZEeUSUl5SU5FqGmTUBf4sAawz5vr85Bb2kU4DrgaERUf25WnOB8yXtKak70BNYlFeFZtaktWnThi+++MJh30gigi+++II2bdrkvI86P4yV9BBwHNBFUiUwgcxZNnsCz0oCWBARl0bEW5IeAZaTmdIZ6zNuzNKttLSUyspKPAXbeNq0aUNpaWnO29fnrJvhWZqn1dJ/EjAp54rMrFnZY4896N69e7HLsFo0z8u8zMys3hz0ZmYp56A3M0s5B72ZWco56M3MUs5Bb2aWcg56M7OUc9CbmaWcg97MLOUc
9GZmKeegNzNLOQe9mVnKOejNzFLOQW9mlnIOejOzlHPQm5mlnIPezCzlHPRmZinnoDczS7k6g17SdElrJb1Zra2TpGclrU6+75u0S9IdktZI+qOk/o1ZvJmZ1a0+R/QzgFNqtI0D5kdET2B+sgxwKtAz+RoD3FOYMs3MLFd1Bn1EvAR8WaP5DGBm8nomcGa19vsjYwHQUdL+hSrWzMwaLtc5+q4R8SlA8n2/pP0A4KNq/SqTNjMzK5JCfxirLG2RtaM0RlKFpIqqqqoCl2FmZtvlGvSfbZ+SSb6vTdorgW7V+pUCn2TbQURMjYjyiCgvKSnJsQwzM6tLrkE/FxiVvB4FPF6tfWRy9s0AYP32KR4zMyuOVnV1kPQQcBzQRVIlMAGYDDwiaTTwITAs6f4U8GNgDfA1cFEj1GxmZg1QZ9BHxPCdrDohS98AxuZblJmZFY6vjDUzSzkHvZlZyjnozcxSzkFvZpZyDnozs5Rz0JuZpZyD3sws5Rz0ZmYp56A3M0s5B72ZWco56M3MUs5Bb2aWcg56M7OUc9CbmaWcg97MLOUc9GZmKeegNzNLOQe9mVnKOejNzFIur6CXdJWktyS9KekhSW0kdZe0UNJqSQ9Lal2oYs3MrOFyDnpJBwBXAOUR0RtoCZwP3ATcGhE9gT8DowtRqJmZ5SbfqZtWwF6SWgFtgU+BfwYeTdbPBM7McwwzM8tDzkEfER8DU4APyQT8emAxsC4itibdKoED8i3SzMxyl8/Uzb7AGUB34PtAO+DULF1jJ9uPkVQhqaKqqirXMszMrA75TN2cCLwXEVURsQWYAxwNdEymcgBKgU+ybRwRUyOiPCLKS0pK8ijDzMxqk0/QfwgMkNRWkoATgOXAC8A5SZ9RwOP5lWhmZvnIZ45+IZkPXZcAbyT7mgpcD1wtaQ3QGZhWgDrNzCxHrerusnMRMQGYUKP5XeCofPZrZmaF4ytjzcxSzkFvZpZyDnozs5Rz0JuZpZyD3sws5Rz0ZmYp56A3M0s5B72ZWco56M3MUs5Bb2aWcg56M7OUc9CbmaWcg97MLOUc9GZmKeegNzNLOQe9mVnKOejNzFLOQW9mlnIOejOzlMsr6CV1lPSopJWSVkgaKKmTpGclrU6+71uoYs3MrOHyPaK/HXgmIg4F+gIrgHHA/IjoCcxPls3MrEhyDnpJ+wDHANMAIuKvEbEOOAOYmXSbCZyZb5FmZpa7fI7o/wGoAu6TtFTSv0lqB3SNiE8Bku/7FaBOMzPLUT5B3wroD9wTEUcAm2jANI2kMZIqJFVUVVXlUYaZmdUmn6CvBCojYmGy/CiZ4P9M0v4Ayfe12TaOiKkRUR4R5SUlJXmUYWZmtck56CPiT8BHkg5Jmk4AlgNzgVFJ2yjg8bwqNDOzvLTKc/v/BsyS1Bp4F7iIzC+PRySNBj4EhuU5hpmZ5SGvoI+IZUB5llUn5LNfMzMrHF8Za2aWcg56M7OUc9CbmaWcg97MLOUc9GZmKeegNzNLOQe9mVnKOejNzFLOQW9mlnIOejOzlHPQm5mlnIPezCzlHPRmZinnoDczSzkHvZlZyjnozcxSzkFvZpZyDnozs5Rz0JuZpVzeQS+ppaSlkp5MlrtLWihptaSHkweHm5lZkRTiiP6/AyuqLd8E3BoRPYE/A6MLMIaZmeUor6CXVAqcBvxbsizgn4FHky4zgTPzGcPMzPKT7xH9bcC/ANuS5c7AuojYmixXAgdk21DSGEkVkiqqqqryLMPMzHYm56CXdDqwNiIWV2/O0jWybR8RUyOiPCLKS0pKci3DzMzq0CqPbQcBQyX9GGgD7EPmCL+jpFbJUX0p8En+ZZqZWa5yPqKPiBsiojQiyoDzgecj4kLgBeCcpNso4PG8qzQzs5w1xnn01wNXS1pDZs5+WiOMYWZm9ZTP1M0OEfEi8GLy+l3gqELs18zM8ucrY83MUs5Bb2aWcg56M7OUc9CbmaWcg97MLOUc9GZmKeegNzNLOQe9mVnK
OejNzFLOQW9mlnIOejOzlHPQm5mlnIPezCzlHPRmZinnoDczSzkHvZlZyjnozcxSzkFvZpZyOT9KUFI34H7ge8A2YGpE3C6pE/AwUAa8D5wbEX/Ov9TCKRv3H7t0vPcnn7ZLxzMzqy6fI/qtwDUR0QsYAIyV9ANgHDA/InoC85NlMzMrkpyDPiI+jYglyesNwArgAOAMYGbSbSZwZr5FmplZ7goyRy+pDDgCWAh0jYhPIfPLANivEGOYmVlu8g56Se2B3wJXRsRXDdhujKQKSRVVVVX5lmFmZjuRV9BL2oNMyM+KiDlJ82eS9k/W7w+szbZtREyNiPKIKC8pKcmnDDMzq0XOQS9JwDRgRUTcUm3VXGBU8noU8Hju5ZmZWb5yPr0SGAT8V+ANScuStv8BTAYekTQa+BAYll+JZmaWj5yDPiJ+D2gnq0/Idb9mZlZYvjLWzCzlHPRmZinnoDczSzkHvZlZyjnozcxSLp/TK83MmraJHXbxeOt37Xj15CN6M7OUc9CbmaWcg97MLOUc9GZmKeegNzNLOQe9mVnKOejNzFLOQW9mlnIOejOzlHPQm5mlnIPezCzlHPRmZinnoDczS7lGC3pJp0haJWmNpHGNNY6ZmdWuUYJeUkvgLuBU4AfAcEk/aIyxzMysdo11RH8UsCYi3o2IvwKzgTMaaSwzM6tFYwX9AcBH1ZYrkzYzM9vFGusJU8rSFt/pII0BxiSLGyWtaqRaik430QX4vNh1WM527c/vX7P997Ecpf1nd1B9OjVW0FcC3aotlwKfVO8QEVOBqY00fpMiqSIiyotdh+XGP7/myz+7jMaaunkN6Cmpu6TWwPnA3EYay8zMatEoR/QRsVXS5cB/Ai2B6RHxVmOMZWZmtWusqRsi4ingqcbafzOzW0xRpZh/fs2Xf3aAIqLuXmZm1mz5FghmZinnoDczSzkHvZlZyjnoG5GkTpL2LXYdZrZ7c9AXmKQDJc2WVAUsBF6TtDZpKytudVZfkrpK6i/pCEldi12P1Z+k/yJptaT1kr6StEHSV8Wuq5h81k2BSfoDcBvwaER8m7S1BIYBV0bEgGLWZ7WT1A+4F+gAfJw0lwLrgMsiYkmxarP6kbQG+ElErCh2LU2Fg77AJK2OiJ4NXWdNg6RlwM8jYmGN9gHAryOib3Eqs/qS9EpEDCp2HU1Jo10wtRtbLOluYCZ/u4NnN2AUsLRoVVl9tasZ8gARsUBSu2IUZA1WIelh4N+Bv2xvjIg5xSupuHxEX2DJvX1Gk7n//gFk7uT5EfAEMC0i/lLL5lZkku4AegD3891f1COB9yLi8mLVZvUj6b4szRERF+/yYpoIB71ZDZJO5bu/qCuBucltPcyaHQf9LiTp9Ih4sth1mKWZpDZk/qo+DGizvX13PqL36ZW71pHFLsBylzwsx5q+B4DvAScDvyNz1tSGolZUZP4wthFIOpS//ekfZB66MjciJhS1MMuXH/3UPBwcEcMknRERMyU9SOaW6bstH9EXmKTryTwMXcAiMg9hEfCQpHHFrM3y9tdiF2D1siX5vk5SbzLXRJQVr5zi8xx9gUl6GzgsIrbUaG8NvOXz6JsvSR9GxIHFrsNqJ+kS4LdAH2AG0B74XxHx62LWVUyeuim8bcD3gQ9qtO+frLMmTNIfd7YK8K0QmocHgLPJHMXPTNp265+dg77wrgTmS1rN387DPhA4GPA52E1fVzIf4v25RruAV3d9OZaDx4H1wGKqXTC1O3PQF1hEPCPpH4Gj+O552K9tv/eNNWlPAu0jYlnNFZJe3PXlWA5KI+KUYhfRlHiO3sxSRdJU4P9ExBvFrqWpcNCbWapIWk5mqvQ9MlM3InMLhMOLWlgROejNLFUkHZStPSJqniCx23DQm5mlnC+YMjNLOQe9mVnKOejNzFLOQW9NlqTvJQ9Vf0fScklPJdco5LKvn0q6M3l9qaSR1dq/X8e2L0oqr7ZcJunN5HV58rCSnW1bJumCXGo2KxRfMGVN
kiQBjwEzI+L8pK0fmStX306WW+ZyEVpE3Ftt8afAm2TuMNpgEVEBVNTSpQy4AHiwvvuU1CoituZSj1k2PqK3pup4YEv1UE6uVm0p6YXk1rNvAEgaIWmRpGWSfi2pZdJ+kaS3Jf0O2PGwaEkTJV0r6RygHJiVbLtXQ4uUdJykJ5PXxyb7WSZpqaS9gcnAkKTtKkltJN0n6Y2kz/HJtj+V9P8kPQHMk/SApDOqjTNL0tAGv4tm+Ijemq7eZO5Vks1RQO+IeE9SL+A8YFBEbEkezH6hpGeBfwV+SOa+Jy9Q4+HsEfGopMuBa5Mj89rMkvRN8ro12W9Qdy0wNiJekdQe2AyMS/Z/OoCka5Kx+yTPLZhXbTpqIHB4RHwp6VjgKuBxSR2Ao8k8YN6swXxEb83Rooh4L3l9Apkwf03SsmT5H4B/Al6MiKqI+CvwcJ5jXhgR/SKiH/DjnfR5BbhF0hVAx51Mvwwmc3dFImIlmbucbg/6ZyPiy2Td74CDJe0HDAd+6+kcy5WD3pqqt8gEeDabqr0WmXn8fsnXIRExMVm3S68GjIjJwCXAXsCC5Ii9ptqeUrWpxvIDwIXARcB9BSnSdksOemuqngf2lPSz7Q2SjgSOrdFvPnBOcuSLpE7JJfALgeMkdZa0BzBsJ+NsAPYuRMGSekTEGxFxE5kPaA/Nsv+XyIQ3yZTNgcCqnexyBpnbXhMRbxWiRts9eY7emqSICElnAbclj2DcDLwP/HuNfssl/U8yc90tyDxGbmxELJA0EfgD8CmwBGiZZagZwL3J/PvAiPgmS5/6ujL5cPVbYDnwNJm5/K2SXk/GujsZ7w1gK/DTiPhL5iSjv3sPPpO0oua/2ayhfK8bsyZKUlsyZxb1j4j1xa7Hmi9P3Zg1QZJOBFaSua+6Q97y4iN6s4Skx4DuNZqvj4j/LEY9ZoXioDczSzlP3ZiZpZyD3sws5Rz0ZmYp56A3M0s5B72ZWcr9f9mVPYSY/alAAAAAAElFTkSuQmCC\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "pd.crosstab(result['Credit_History'].astype(str),result['Predicted_by_LR']).plot(kind='bar')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " Predicted_by_LR \n",
+ " N \n",
+ " Y \n",
+ " \n",
+ " \n",
+ " Credit_History \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0.0 \n",
+ " 23 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 1.0 \n",
+ " 0 \n",
+ " 149 \n",
+ " \n",
+ " \n",
+ " nan \n",
+ " 0 \n",
+ " 13 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Predicted_by_LR N Y\n",
+ "Credit_History \n",
+ "0.0 23 0\n",
+ "1.0 0 149\n",
+ "nan 0 13"
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.crosstab(result['Credit_History'].astype(str),result['Predicted_by_LR'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " Predicted_by_RFC \n",
+ " N \n",
+ " Y \n",
+ " \n",
+ " \n",
+ " Credit_History \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0.0 \n",
+ " 23 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 1.0 \n",
+ " 6 \n",
+ " 143 \n",
+ " \n",
+ " \n",
+ " nan \n",
+ " 1 \n",
+ " 12 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Predicted_by_RFC N Y\n",
+ "Credit_History \n",
+ "0.0 23 0\n",
+ "1.0 6 143\n",
+ "nan 1 12"
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.crosstab(result['Credit_History'].astype(str),result['Predicted_by_RFC'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " Credit_History_0.0 \n",
+ " 0 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " Credit_History \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0.0 \n",
+ " 0 \n",
+ " 23 \n",
+ " \n",
+ " \n",
+ " 1.0 \n",
+ " 149 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " nan \n",
+ " 13 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Credit_History_0.0 0 1\n",
+ "Credit_History \n",
+ "0.0 0 23\n",
+ "1.0 149 0\n",
+ "nan 13 0"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.crosstab(result['Credit_History'].astype(str),result['Credit_History_0.0'].astype(str))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "It seems that the Logistic Regression model has learned to always grant the request if there was credit history, and always deny it if there was no history. That is not helpful, because with this method no new people would ever get a loan! The method that was used to deal with the NaN values causes the loan to be granted in every case where Credit_History did not have a value."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Evaluations per model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " Model ACC\n",
+ "0 LR 0.827027\n",
+ "1 LDA 0.827027\n",
+ "2 SVM 0.827027\n",
+ "3 KNN 0.821622\n",
+ "4 NB 0.800000\n",
+ "5 RFC 0.800000\n",
+ "6 CART 0.794595\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " Model Precision\n",
+ "0 LR 0.913043\n",
+ "1 LDA 0.913043\n",
+ "2 SVM 0.913043\n",
+ "3 KNN 0.821429\n",
+ "4 RFC 0.733333\n",
+ "5 NB 0.705882\n",
+ "6 CART 0.675676\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " Model Recall\n",
+ "0 CART 0.490196\n",
+ "1 NB 0.470588\n",
+ "2 KNN 0.450980\n",
+ "3 RFC 0.431373\n",
+ "4 LR 0.411765\n",
+ "5 LDA 0.411765\n",
+ "6 SVM 0.411765\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " Model F1\n",
+ "0 KNN 0.582278\n",
+ "1 CART 0.568182\n",
+ "2 LR 0.567568\n",
+ "3 LDA 0.567568\n",
+ "4 SVM 0.567568\n",
+ "5 NB 0.564706\n",
+ "6 RFC 0.543210\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAA6UAAAGQCAYAAABWCcZxAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAH1VJREFUeJzt3Xm0ZGdZL+DfS0JABpnSTJmFgERA0CY4Mgi6kgsSLiAkIhKWGFAiXhA1DBcwXq6ICg5EJSzxIi5IEAEbjAYnBFQwDQY0hEiIgTQB6YQwB5LAe/+o3VA5nNOnkpzTX1L9PGvVWrX3/vZXb9Xu6rN/9X21q7o7AAAAMMKNRhcAAADA3ksoBQAAYBihFAAAgGGEUgAAAIYRSgEAABhGKAUAAGAYoRSAPaqqDq2qrqp9R9dyXVXVhVX10NF1LIuq+n9V9X9G1wHAniWUAiyRKSRdXlVfmLvdedp2alWdV1Vfq6rj93BNmxLcqupBVbVjg/p6e1U9eSP6WqP/rqq7blb/AHBDJZQCLJ8f7e5bzN0unta/P8nPJnnfwNrYSyzDSDgAe4ZQCrCX6O5Tuvvvknx5vbZVdauq+pOq2llVH62q51XVjaZtx1fVu6rqN6vqsqr6r6o6eo1+XpPk4CRvmUZtf2lu8+Or6mNVdUlVPXdunxtV1UlV9ZGqurSqXl9Vt12l75sn+askd54fFd7d/lV106r602n9Z6rqrKq6Q1W9KMkPJnn51M/L13g+T5hej0vna562HVlV/zL1+4mqenlV7Tdte8fU7P1T/4+rqttU1Vun1/iy6f6BuzkmB1XVG6f2l+6qcXq+z5vq+tR03G41bds1VfpJVXXR9DhPrar7VdUHplpfPvcYx1fVP1XV71XVZ6vqQ1X1kLntT6qqc6vq81V1QVU9ZW7bg6pqR1X9clV9MskfT+sfXlVnT4/1z1V177l97ltV75v6Oz3JTdd6/gAsL6EUgNX8XpJbJfm2JA9M8pNJnjS3/f5Jzkuyf5KXJPmjqqqVnXT3E5J8LN8YvX3J3OYfSHL3JA9J8vyquse0/ulJHjk97p2TXJbklFX6/mKSo5NcvGJUeHf7P3F6XgcluV2Spya5vLufm+SdSU6c+jlx5eNV1RFJ/iDJE6Z+b5dkPkR+Nckzptfke6fn9bNTrQ+Y2nzn1P/pmf0N/uMkh2QW3C9PslYY3ifJW5N8NMmhSQ5Ictq0+fjp9uDMjtctVunn/kkOT/K4JL+d5LlJHprkO5I8tqoeuKLtBdPzeEGSN859KPCpJA9P8q2Z/Xt4WVV919y+d0xy2+k5nTBte1WSp0yv1yuSbKuqm0yB/c1JXjPt82dJHr3a8wdguQmlAMvnzdOo1Geq6s3XdOcpAD0uybO7+/PdfWGS38osjO3y0e5+ZXd/Ncmrk9wpyR2u4UP9Sndf3t3vz2xq8XdO65+S5LndvaO7v5LkhUkecw2mg+5u/yszC0d37e6vdvd7u/tzC/b7mCRv7e53TP3+7yRf27Vx6uvd3X3V9Jq9IrNgvKruvrS7/7y7v9Tdn0/yot20PzKzIPyL3f3F7v5yd79r2vb4JC/t7gu6+wtJnp3k2BWv169O+7wtyReTvK67P9XdH88sjN93ru2nkvx2d185hefzkjxsqvkvu/sjPfOPSd6W2QjzLl9L8oLu/kp3X57kp5O8orvfM73er07ylSTfM91uPPdYb0hy1lqvFwDLy/c9AJbPI7v7b6/D/vsn2S+zUbldPprZ6Nwun9x1p7u/NA2S3uIaPs4n5+5/aW7/Q5K8qaq+Nrf9q5mF3o8v0O/u9n9NZqOkp1XVrZP8aWYB9soF+r1zkot2LXT3F6vq0l3LVXW3JC9NsjXJzTL7G/vetTqrqpsleVmSo5LcZlp9y6raZwr78w7K7IOAq9aoa+Wx2jdX/5Dgv+fuX77K8vyx+3h394r+dl0s6+jMRk/vltkH2zdL
8u9zbXd29/z08EOSPLGqfm5u3X5Tf73GYwGwlzFSCsBKl2Q2onjI3LqDs1ggXE2v3+RqLkpydHffeu5202lUb5G+19x/GpH7le4+Isn3ZTYV9ScXrPMTmYXDJF8Plbeb2/4HST6U5PDu/tYkz0nyTVOa5/xCZtOX7z+13zXFd7V9Lkpy8BqjxRfnm4/VVbl68LwmDlgxFfvgJBdX1U2S/HmS30xyh+6+dZIzVtS78jW8KMmLVhyLm3X36zJ7PVd7LAD2MkIpwF6iqvarqptmFiJuPF3055v+DkyjdK9P8qKqumVVHZLkmZmNKl4b/53Zdx0X9YfTYx8y1b2lqo7ZTd+323Vhn/X2r6oHV9W9pinKn8ssfH91rq/d1fmGJA+vqh+Yvg95cq7+d/SWU59fqKpvT/Izq9T6bSvaX57kM9N3Nl+wm8f+18xC3Iur6ubTsfv+advrkjyjqg6rqlsk+b9JTl9jVHURt0/y9Kq6cVX9WJJ7ZBY+90tykyQ7k1w1jZr+yDp9vTLJU6vq/jVz86p6WFXdMsm/ZBaen15V+1bVozKbpgzAXkYoBdh7vC2zEPR9SU6d7j9gjbY/l9l3Dy9I8q4kr83sgjXXxq8led70HddnLdD+d5JsS/K2qvp8kndndvGdb9LdH8oslF0w9X/ndfa/Y2bh8nNJzk3yj/lG2P6dzL57ellV/e4qj3VOkqdl9lp8IrMLKM3/Ruqzkvx4ks9nFsZOX9HFC5O8eqrzsZldcOhbMhuZfneSv17rBZk+KPjRJHfN7MJROzL73m8yOy6vSfKOJP+V2dWVf26Vbhb1nswuinRJZt9zfcz0/dfPZ3YRqddn9tx/PLPXeU3dvT2z75W+fNrn/MwuypTuviLJo6bly6bn88brUDcAN1B19a9yAAB7q6o6PsmTu/sHRtcCwN7DSCkAAADDCKUAAAAMY/ouAAAAwxgpBQAAYBihFAAAgGGEUgAAAIYRSgEAABhGKAUAAGAYoRQAAIBhhFIAAACGEUoBAAAYRigFAABgGKEUAACAYYRSAAAAhhFKAQAAGEYoBQAAYBihFAAAgGGEUgAAAIYRSgEAABhGKAUAAGAYoRQAAIBhhFIAAACGEUoBAAAYRigFAABgGKEUAACAYYRSAAAAhhFKAQAAGEYoBQAAYBihFAAAgGGEUgAAAIYRSgEAABhGKAUAAGCYfUc98P7779+HHnroqIcHAABgE733ve+9pLu3rNduWCg99NBDs3379lEPDwAAwCaqqo8u0m6h6btVdVRVnVdV51fVSWu0eWxVfbCqzqmq116TYgEAANg7rTtSWlX7JDklyQ8n2ZHkrKra1t0fnGtzeJJnJ/n+7r6sqm6/WQUDAACwPBYZKT0yyfndfUF3X5HktCTHrGjz00lO6e7LkqS7P7WxZQIAALCMFgmlByS5aG55x7Ru3t2S3K2q/qmq3l1VR21UgQAAACyvRS50VKus61X6OTzJg5IcmOSdVXXP7v7M1TqqOiHJCUly8MEHX+NiAQAAWC6LjJTuSHLQ3PKBSS5epc1fdPeV3f1fSc7LLKReTXef2t1bu3vrli3rXhkYAACAJbdIKD0ryeFVdVhV7Zfk2CTbVrR5c5IHJ0lV7Z/ZdN4LNrJQAAAAls+6obS7r0pyYpIzk5yb5PXdfU5VnVxVj5ianZnk0qr6YJJ/SPKL3X3pZhUNAADAcqjulV8P3TO2bt3a27dvH/LYAAAAbK6qem93b12v3SLTdwEAAGBTCKUAAAAMI5QCAAAwjFAKAADAMPuOLuD67NCT/nJ0CXutC1/8sNElAAAAe4CRUgAAAIYRSgEAABhGKAUAAGAYoRQAAIBhhFIAAACGEUoBAAAYRigFAABgGL9Tyl7Jb9COtZm/Q+vYjrXZvzHs+I7j2C43vw8OjGSkFAAAgGGMlAIAsGmMgo9lFJwbAiOlAAAADCOUAgAAMIxQCgAAwDBCKQAAAMMIpQAAAAwjlAIAADCMUAoAAMAwQikA
AADDCKUAAAAMI5QCAAAwzL6jCwAAAG6YDj3pL0eXsNe68MUPG13ChjFSCgAAwDBCKQAAAMMIpQAAAAwjlAIAADCMUAoAAMAwQikAAADDCKUAAAAMI5QCAAAwjFAKAADAMEIpAAAAwwilAAAADCOUAgAAMIxQCgAAwDBCKQAAAMMIpQAAAAwjlAIAADCMUAoAAMAwQikAAADDCKUAAAAMs1Aoraqjquq8qjq/qk5aZfvxVbWzqs6ebk/e+FIBAABYNvuu16Cq9klySpIfTrIjyVlVta27P7ii6endfeIm1AgAAMCSWmSk9Mgk53f3Bd19RZLTkhyzuWUBAACwN1gklB6Q5KK55R3TupUeXVUfqKo3VNVBq3VUVSdU1faq2r5z585rUS4AAADLZJFQWqus6xXLb0lyaHffO8nfJnn1ah1196ndvbW7t27ZsuWaVQoAAMDSWSSU7kgyP/J5YJKL5xt096Xd/ZVp8ZVJvntjygMAAGCZLRJKz0pyeFUdVlX7JTk2ybb5BlV1p7nFRyQ5d+NKBAAAYFmte/Xd7r6qqk5McmaSfZK8qrvPqaqTk2zv7m1Jnl5Vj0hyVZJPJzl+E2sGAABgSawbSpOku89IcsaKdc+fu//sJM/e2NIAAABYdotM3wUAAIBNIZQCAAAwjFAKAADAMEIpAAAAwwilAAAADCOUAgAAMIxQCgAAwDBCKQAAAMMIpQAAAAwjlAIAADCMUAoAAMAwQikAAADDCKUAAAAMI5QCAAAwjFAKAADAMEIpAAAAwwilAAAADCOUAgAAMIxQCgAAwDBCKQAAAMMIpQAAAAwjlAIAADCMUAoAAMAwQikAAADDCKUAAAAMI5QCAAAwjFAKAADAMEIpAAAAwwilAAAADCOUAgAAMIxQCgAAwDBCKQAAAMMIpQAAAAwjlAIAADCMUAoAAMAwQikAAADDCKUAAAAMI5QCAAAwjFAKAADAMEIpAAAAwwilAAAADCOUAgAAMIxQCgAAwDBCKQAAAMMsFEqr6qiqOq+qzq+qk3bT7jFV1VW1deNKBAAAYFmtG0qrap8kpyQ5OskRSY6rqiNWaXfLJE9P8p6NLhIAAIDltMhI6ZFJzu/uC7r7iiSnJTlmlXa/muQlSb68gfUBAACwxBYJpQckuWhuece07uuq6r5JDurut+6uo6o6oaq2V9X2nTt3XuNiAQAAWC6LhNJaZV1/fWPVjZK8LMkvrNdRd5/a3Vu7e+uWLVsWrxIAAICltEgo3ZHkoLnlA5NcPLd8yyT3TPL2qrowyfck2eZiRwAAAKxnkVB6VpLDq+qwqtovybFJtu3a2N2f7e79u/vQ7j40ybuTPKK7t29KxQAAACyNdUNpd1+V5MQkZyY5N8nru/ucqjq5qh6x2QUCAACwvPZdpFF3n5HkjBXrnr9G2wdd97IAAADYGywyfRcAAAA2hVAKAADAMEIpAAAAwwilAAAADCOUAgAAMIxQCgAAwDBCKQAAAMMIpQAAAAwjlAIAADCMUAoAAMAwQikAAADDCKUAAAAMI5QCAAAwjFAKAADAMEIpAAAAwwilAAAADCOUAgAAMIxQCgAAwDBCKQAAAMMIpQAAAAwjlAIAADCMUAoAAMAwQikAAADDCKUAAAAMI5QCAAAwjFAKAADAMEIpAAAAwwilAAAADCOUAgAAMIxQCgAAwDBCKQAAAMMIpQAAAAwjlAIAADCMUAoAAMAwQikAAADDCKUAAAAMI5QCAAAwjFAKAADAMEIpAAAAwwilAAAADCOUAgAAMIxQCgAAwDBCKQAAAMMIpQAAAAwjlAIAADDMQqG0qo6qqvOq6vyqOmmV7U+tqn+vqrOr6l1VdcTGlwoAAMCyWTeUVtU+SU5JcnSSI5Ict0rofG1336u775PkJUleuuGVAgAAsHQWGSk9Msn53X1Bd1+R5LQkx8w36O7PzS3ePElvXIkAAAAsq30XaHNAkovmlnckuf/KRlX1tCTPTLJfkh9araOqOiHJCUly
8MEHX9NaAQAAWDKLjJTWKuu+aSS0u0/p7rsk+eUkz1uto+4+tbu3dvfWLVu2XLNKAQAAWDqLhNIdSQ6aWz4wycW7aX9akkdel6IAAADYOywSSs9KcnhVHVZV+yU5Nsm2+QZVdfjc4sOSfHjjSgQAAGBZrfud0u6+qqpOTHJmkn2SvKq7z6mqk5Ns7+5tSU6sqocmuTLJZUmeuJlFAwAAsBwWudBRuvuMJGesWPf8ufs/v8F1AQAAsBdYZPouAAAAbAqhFAAAgGGEUgAAAIYRSgEAABhGKAUAAGAYoRQAAIBhhFIAAACGEUoBAAAYRigFAABgGKEUAACAYYRSAAAAhhFKAQAAGEYoBQAAYBihFAAAgGGEUgAAAIYRSgEAABhGKAUAAGAYoRQAAIBhhFIAAACGEUoBAAAYRigFAABgGKEUAACAYYRSAAAAhhFKAQAAGEYoBQAAYBihFAAAgGGEUgAAAIYRSgEAABhGKAUAAGAYoRQAAIBhhFIAAACGEUoBAAAYRigFAABgGKEUAACAYYRSAAAAhhFKAQAAGEYoBQAAYBihFAAAgGGEUgAAAIYRSgEAABhGKAUAAGAYoRQAAIBhhFIAAACGEUoBAAAYZqFQWlVHVdV5VXV+VZ20yvZnVtUHq+oDVfV3VXXIxpcKAADAslk3lFbVPklOSXJ0kiOSHFdVR6xo9m9Jtnb3vZO8IclLNrpQAAAAls8iI6VHJjm/uy/o7iuSnJbkmPkG3f0P3f2lafHdSQ7c2DIBAABYRouE0gOSXDS3vGNat5afSvJXq22oqhOqantVbd+5c+fiVQIAALCUFgmltcq6XrVh1U8k2ZrkN1bb3t2ndvfW7t66ZcuWxasEAABgKe27QJsdSQ6aWz4wycUrG1XVQ5M8N8kDu/srG1MeAAAAy2yRkdKzkhxeVYdV1X5Jjk2ybb5BVd03ySuSPKK7P7XxZQIAALCM1g2l3X1VkhOTnJnk3CSv7+5zqurkqnrE1Ow3ktwiyZ9V1dlVtW2N7gAAAODrFpm+m+4+I8kZK9Y9f+7+Qze4LgAAAPYCi0zfBQAAgE0hlAIAADCMUAoAAMAwQikAAADDCKUAAAAMI5QCAAAwjFAKAADAMEIpAAAAwwilAAAADCOUAgAAMIxQCgAAwDBCKQAAAMMIpQAAAAwjlAIAADCMUAoAAMAwQikAAADDCKUAAAAMI5QCAAAwjFAKAADAMEIpAAAAwwilAAAADCOUAgAAMIxQCgAAwDBCKQAAAMMIpQAAAAwjlAIAADCMUAoAAMAwQikAAADDCKUAAAAMI5QCAAAwjFAKAADAMEIpAAAAwwilAAAADCOUAgAAMIxQCgAAwDBCKQAAAMMIpQAAAAwjlAIAADCMUAoAAMAwQikAAADDCKUAAAAMI5QCAAAwjFAKAADAMEIpAAAAwywUSqvqqKo6r6rOr6qTVtn+gKp6X1VdVVWP2fgyAQAAWEbrhtKq2ifJKUmOTnJEkuOq6ogVzT6W5Pgkr93oAgEAAFhe+y7Q5sgk53f3BUlSVaclOSbJB3c16O4Lp21f24QaAQAAWFKLTN89IMlFc8s7pnXXWFWdUFXbq2r7zp07r00XAAAALJFFQmmtsq6vzYN196ndvbW7t27ZsuXadAEAAMASWSSU7khy0NzygUku3pxyAAAA2JssEkrPSnJ4VR1WVfslOTbJts0tCwAAgL3BuqG0u69KcmKSM5Ocm+T13X1OVZ1cVY9Ikqq6X1XtSPJjSV5RVedsZtEAAAAsh0WuvpvuPiPJGSvWPX/u/lmZTesFAACAhS0yfRcAAAA2hVAKAADAMEIpAAAAwwilAAAADCOUAgAAMIxQCgAAwDBCKQAAAMMIpQAAAAwjlAIAADCMUAoAAMAwQikAAADDCKUAAAAMI5QCAAAwjFAKAADAMEIpAAAAwwilAAAADCOUAgAAMIxQCgAAwDBCKQAAAMMIpQAAAAwjlAIAADCMUAoAAMAwQikAAADDCKUAAAAM
I5QCAAAwjFAKAADAMEIpAAAAwwilAAAADCOUAgAAMIxQCgAAwDBCKQAAAMMIpQAAAAwjlAIAADCMUAoAAMAwQikAAADDCKUAAAAMI5QCAAAwjFAKAADAMEIpAAAAwwilAAAADCOUAgAAMIxQCgAAwDBCKQAAAMMIpQAAAAwjlAIAADDMQqG0qo6qqvOq6vyqOmmV7TepqtOn7e+pqkM3ulAAAACWz7qhtKr2SXJKkqOTHJHkuKo6YkWzn0pyWXffNcnLkvz6RhcKAADA8llkpPTIJOd39wXdfUWS05Ics6LNMUlePd1/Q5KHVFVtXJkAAAAso+ru3TeoekySo7r7ydPyE5Lcv7tPnGvzH1ObHdPyR6Y2l6zo64QkJ0yLd09y3kY9Eb7J/kkuWbcVN1SO7/JybJeXY7vcHN/l5dguN8d3cx3S3VvWa7TvAh2tNuK5Msku0ibdfWqSUxd4TK6jqtre3VtH18HmcHyXl2O7vBzb5eb4Li/Hdrk5vtcPi0zf3ZHkoLnlA5NcvFabqto3ya2SfHojCgQAAGB5LRJKz0pyeFUdVlX7JTk2ybYVbbYleeJ0/zFJ/r7XmxcMAADAXm/d6bvdfVVVnZjkzCT7JHlVd59TVScn2d7d25L8UZLXVNX5mY2QHruZRbMQ06SXm+O7vBzb5eXYLjfHd3k5tsvN8b0eWPdCRwAAALBZFpm+CwAAAJtCKAUAAGAYofQGqKq+MHf/f1TVh6vq4Kp6YVV9qapuv0bbrqrfmlt+VlW9cI8Vzpqq6o5VdVpVfaSqPlhVZ1TV3aZtz6iqL1fVrebaP6iqPltV/1ZVH6qq35zWP6mqzp5uV1TVv0/3XzzqubG2+ffn3LoXVtXHp+P2wao6bkRtXDMLHMsPV9Ubq+qIFW22VNWVVfWUPVct11RVPbeqzqmqD0zH86+q6tdWtLlPVZ073b+wqt65YvvZ0++6cz21u/OkFe/nD1XVH1SV8+jruar66q73XlW9papuPa0/tKounztnOnu6oGuq6uiq2l5V586fY7G5vJluwKrqIUl+L8lR3f2xafUlSX5hjV2+kuRRVbX/nqiPxVRVJXlTkrd39126+4gkz0lyh6nJcZldBft/rtj1nd193yT3TfLwqvr+7v7j7r5Pd98ns59uevC0fNKeeTZskJdNx/CYJK+oqhuPLohr7WXTe/DwJKcn+fuqmv8R8R9L8u7M3udcD1XV9yZ5eJLv6u57J3lokhcnedyKpscmee3c8i2ratfP5d1jT9TKdbbeedKu/5uPSHKvJA/cY5VxbV0+/R98z8wuxvq0uW0f2XXONN2uqKp7Jnl5kp/o7nskuWeSCwbUvdcRSm+gquoHk7wyycO6+yNzm16V5HFVddtVdrsqsyuMPWMPlMjiHpzkyu7+w10ruvvs7n5nVd0lyS2SPC9rnLR29+VJzk5ywJ4olj2nuz+c5EtJbjO6Fq677j49yduS/Pjc6uMy+yDxwKryHr5+ulOSS7r7K0nS3Zd09z8m+UxV3X+u3WOTnDa3/Pp8I7gel+R1e6JYrpNFz5P2S3LTJJdtekVspH/J+udKv5TkRd39oWT2KyTd/fubXhlC6Q3UTZL8RZJH7nrTzPlCZsH059fY95Qkj5+fCspw90zy3jW27TqReWeSu89Pzd6lqm6T5PAk79i0Chmiqr4ryYe7+1Oja2HDvC/JtyfJNIp2x+7+11w9wHD98rYkB1XVf1bV71fVrtGx12X6Cbyq+p4kl04fJO3yhiSPmu7/aJK37KmCuU52d570jKo6O8knkvxnd5+9Z0vj2qqqfZI8JMm2udV3mZu6e8q0bnfnZGwiofSG6cok/5zkp9bY/rtJnlhV37pyQ3d/LsmfJHn65pXHBjo2yWnd/bUkb8xsqt8uP1hVH0jyySRv7e5PjiiQTfGMqjovyXuSvHBwLWysmrt/bGZhNJmNsJnCez3U3V9I8t1JTkiyM8npVXV8ZsfsMdP3Co/NN4+EfjrJZVV1
bJJzM5v1wPXcOudJu6bv3j7Jzadjy/Xbt0wfJFya5LZJ/mZu2/z03aetvjt7ilB6w/S1zKYJ3a+qnrNyY3d/JrPvtfzsGvv/dmaB9uabViHXxDmZnfBcTVXdO7MR0L+pqgszO+mZP2l95/T9pnsl+Zmqus8eqJU942XdfffMRs7+pKpuOrogNsx9Mwsoyez9fPz0/t6W5Dur6vBRhbG27v5qd7+9u1+Q5MQkj+7ui5JcmNn3Ch+db3zAMO/0zEbeTN29YdnteVJ3X5nkr5M8YE8WxbVy+fRBwiGZTbteL3yuek7G5hNKb6C6+0uZXXjh8VW12ojpS5M8Jcm+q+z76cz+eK410sqe9fdJblJVP71rRVXdL8nvJHlhdx863e6c5ICqOmR+5+7+zyS/luSX92TRbL7ufmOS7UmeOLoWrruqenSSH0nyuqq6e5Kbd/cBu97jmb2Pjbxcz1TV3Vd8WHCfJB+d7r8uycsyG3HZscrub0rykiRnbm6VbKT1zpOmCxR+X5KPrLad65/u/mxmo9/PWufigb+R5Dn1jV9AuFFVPXNP1Li3E0pvwKb/NI9K8ryqOmbFtksy+2N4kzV2/60krsJ7PdDdndmVdX+4Zj8Jc05mUzYflNkxnPemrH7S+odJHlBVh21iqWy8m1XVjrnban/4Tk7yTD89cL231rF8xq6fhEnyE0l+qLt3ZjZKuvL9/ecxhff66BZJXj39RNMHMrvy6gunbX+W5Dty9QscfV13f767f727r9gjlbKRVjtP2vWd0v/I7EN/F8C5Aenuf0vy/uzmw7/u/kCS/5XZh4fnZnas77RnKty71ex8GAAAAPY8n7wDAAAwjFAKAADAMEIpAAAAwwilAAAADCOUAgAAMIxQCgAAwDBCKQAAAMP8f/KRgY3EeWOwAAAAAElFTkSuQmCC\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from sklearn.metrics import confusion_matrix\n",
+ "from sklearn.metrics import classification_report\n",
+ "from sklearn.metrics import roc_auc_score\n",
+ "from sklearn.metrics import roc_curve\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "from sklearn.metrics import precision_score\n",
+ "from sklearn.metrics import recall_score\n",
+ "\n",
+ "def get_classification_statistics(y_true,y_pred,labels,pos_label=None):\n",
+ "    \"\"\"Return (accuracy, precision, recall, f1) for a set of predictions.\n",
+ "\n",
+ "    Precision and recall are computed for `pos_label`. When `pos_label` is\n",
+ "    None, the first entry of `labels` is used as the positive class.\n",
+ "    \"\"\"\n",
+ "    if pos_label is None:\n",
+ "        pos_label = labels[0]\n",
+ "    # Pass the resolved pos_label on, so an explicit argument is honoured.\n",
+ "    precision = precision_score(y_true, y_pred, pos_label=pos_label, labels=labels)\n",
+ "    recall = recall_score(y_true, y_pred, pos_label=pos_label, labels=labels)\n",
+ "    # F1 is the harmonic mean of precision and recall; report 0.0 instead of\n",
+ "    # dividing by zero when both are 0.\n",
+ "    denominator = precision + recall\n",
+ "    f1 = 2 * (precision * recall) / denominator if denominator else 0.0\n",
+ "    acc = accuracy_score(y_true, y_pred)\n",
+ "    return acc, precision, recall, f1\n",
+ "\n",
+ "def plot_dict(info_dict,columns,title):\n",
+ "    \"\"\"Print the scores in info_dict as a ranked table and draw them as a bar chart.\n",
+ "\n",
+ "    info_dict maps a name to a score, columns holds the two headers of the\n",
+ "    printed table, and title is placed above the figure.\n",
+ "    \"\"\"\n",
+ "    # Highest score first; the same order drives both the table and the bars.\n",
+ "    ranked = sorted(([key, score] for key, score in info_dict.items()),\n",
+ "                    key=lambda pair: pair[1], reverse=True)\n",
+ "    bar_labels = [pair[0] for pair in ranked]\n",
+ "    bar_heights = [pair[1] for pair in ranked]\n",
+ "\n",
+ "    print( pd.DataFrame(ranked,columns=columns))\n",
+ "\n",
+ "    figure = plt.figure(figsize = (16, 6))\n",
+ "    figure.suptitle(title)\n",
+ "    axes = figure.add_subplot(111)\n",
+ "    axes.bar(range(len(bar_heights)), bar_heights, align='center')\n",
+ "    axes.set_xticks(range(len(bar_labels)))\n",
+ "    axes.set_xticklabels(bar_labels)\n",
+ "    plt.show()\n",
+ "\n",
+ "def plot_model_compare_classification(model_names,y_col,results,labels):\n",
+ "    \"\"\"Score every model's predictions and plot one comparison chart per metric.\n",
+ "\n",
+ "    For each name in model_names, the column 'Predicted_by_' + name of results\n",
+ "    is compared against the column y_col of results.\n",
+ "    \"\"\"\n",
+ "    # (table header, figure title) per metric, in the order in which\n",
+ "    # get_classification_statistics returns its values.\n",
+ "    metric_specs = [\n",
+ "        ('ACC', 'Accuracy on the test data compared'),\n",
+ "        ('Precision', 'Precision on the test data compared'),\n",
+ "        ('Recall', 'Recall on the test data compared'),\n",
+ "        ('F1', 'F1 on the test data compared'),\n",
+ "    ]\n",
+ "    scores_per_metric = [{} for _spec in metric_specs]\n",
+ "\n",
+ "    for model_name in model_names:\n",
+ "        statistics = get_classification_statistics(results[y_col],results['Predicted_by_'+model_name],labels)\n",
+ "        for metric_scores, value in zip(scores_per_metric, statistics):\n",
+ "            metric_scores[model_name] = value\n",
+ "\n",
+ "    for (header, figure_title), metric_scores in zip(metric_specs, scores_per_metric):\n",
+ "        plot_dict(metric_scores,['Model',header],figure_title)\n",
+ " \n",
+ "# models holds (name, estimator) pairs; only the names are needed here.\n",
+ "model_names = [name for name, _model in models]\n",
+ "# Class labels; get_classification_statistics reports precision/recall for labels[0] by default.\n",
+ "labels = ['N','Y']\n",
+ "plot_model_compare_classification(model_names,'Loan_Status',result,labels)\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Confusion matrix "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " Predicted_by_LR \n",
+ " N \n",
+ " Y \n",
+ " \n",
+ " \n",
+ " Loan_Status \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " N \n",
+ " 21 \n",
+ " 30 \n",
+ " \n",
+ " \n",
+ " Y \n",
+ " 2 \n",
+ " 132 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Predicted_by_LR N Y\n",
+ "Loan_Status \n",
+ "N 21 30\n",
+ "Y 2 132"
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.crosstab(result['Loan_Status'],result['Predicted_by_LR'])\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Precision, recall, F-measure and support"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " N 0.91 0.41 0.57 51\n",
+ " Y 0.81 0.99 0.89 134\n",
+ "\n",
+ "avg / total 0.84 0.83 0.80 185\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(classification_report(result['Loan_Status'],result['Predicted_by_LR']))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}