# Functions

In [1]:
def fill_missing_values(df, strategy="drop"):
    """Handle missing values in a dataframe.

    Despite the function's name, the default behaviour does not impute
    anything: it removes every row that contains at least one missing value.
    Imputation is available as an opt-in via ``strategy="fill"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is never modified in place.
    strategy : {"drop", "fill"}, default "drop"
        * ``"drop"`` - return ``df.dropna()`` (rows with any missing value
          are removed; the original index labels are kept, so the result
          may have gaps in its index).
        * ``"fill"`` - keep every row; numeric columns are filled with the
          column median, all other columns with the most frequent value.
          Columns that contain no observed value at all are left untouched.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with missing values dropped or filled.

    Raises
    ------
    ValueError
        If ``strategy`` is neither ``"drop"`` nor ``"fill"``.
    """
    if strategy == "drop":
        return df.dropna()
    if strategy != "fill":
        raise ValueError(
            "strategy must be 'drop' or 'fill', got {!r}".format(strategy)
        )

    filled = df.copy()
    numeric_columns = set(filled.select_dtypes(include="number").columns)
    for column in filled.columns:
        observed = filled[column].dropna()
        # Nothing to do when the column is complete, and nothing to learn a
        # replacement from when it is entirely missing.
        if len(observed) == len(filled) or observed.empty:
            continue
        if column in numeric_columns:
            replacement = observed.median()
        else:
            # mode() can return several values on ties; take the first one.
            replacement = observed.mode().iloc[0]
        filled[column] = filled[column].fillna(replacement)
    return filled

# Numeric codes for the two-valued feature columns. Both encoders below are
# built from this single table so the train and test encodings cannot drift.
_BINARY_FEATURE_CODES = {
    'Gender': {'Male': 0, 'Female': 1},
    'Education': {'Graduate': 0, 'Not Graduate': 1},
    'Married': {'Yes': 0, 'No': 1},
    'Self_Employed': {'Yes': 0, 'No': 1},
}

# Numeric codes for the target column. Note the direction: 'Y' is 0, 'N' is 1.
_LOAN_STATUS_CODES = {
    'Loan_Status': {'Y': 0, 'N': 1},
}


def replace_binary_labels_with_numeric(df):
    """Replace binary categories with 0 and 1, including the target column.

    Encodes ``Gender``, ``Education``, ``Married``, ``Self_Employed`` and
    ``Loan_Status`` using the code tables defined above. Values that are not
    listed in a column's table are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the labelled columns. It is not modified in place.

    Returns
    -------
    pandas.DataFrame
        A new frame with the listed labels replaced by 0/1.
    """
    codes = dict(_BINARY_FEATURE_CODES)
    codes.update(_LOAN_STATUS_CODES)
    return df.replace(codes)


def replace_binary_labels_with_numeric_no_loan(df):
    """Replace binary categories with 0 and 1, leaving ``Loan_Status`` alone.

    Same feature encoding as ``replace_binary_labels_with_numeric`` but
    without the ``Loan_Status`` mapping; intended for data that carries no
    target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the labelled feature columns. It is not modified in place.

    Returns
    -------
    pandas.DataFrame
        A new frame with the listed feature labels replaced by 0/1.
    """
    # Pass a shallow copy so the shared table itself is never handed out.
    return df.replace(dict(_BINARY_FEATURE_CODES))

def create_dummies(df, columns):
    """Replace categorical columns with dummy (indicator) columns.

    For every name in ``columns`` the column is expanded with
    ``pd.get_dummies(..., drop_first=True)``, so the first category level gets
    no indicator column of its own. The indicator columns are appended after
    the remaining columns, in the order given by ``columns``, and the source
    columns are removed.

    The indicator columns are named after the bare category values (no column
    prefix), so two encoded columns that share a category value will produce
    duplicate column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified in place.
    columns : iterable of str
        Names of the columns to expand.

    Returns
    -------
    pandas.DataFrame
        A new frame with the listed columns replaced by their dummy columns.
    """
    columns = list(columns)
    # Build every dummy frame first and concatenate once, instead of copying
    # the whole frame with pd.concat on each loop iteration.
    dummy_frames = [
        pd.get_dummies(df[column], drop_first=True) for column in columns
    ]
    remaining = df.drop(columns, axis=1)
    return pd.concat([remaining] + dummy_frames, axis=1)

# Predict Loan status

In [2]:
import pandas as pd
import seaborn as sb
import numpy as np
from sklearn.linear_model import LogisticRegression

## Load data

In [3]:
# Read the training and test sets. The paths are relative, so they resolve
# against the notebook's working directory.
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

## Explore data

In [4]:
# Preview the first rows of the training data (includes the Loan_Status target).
train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
# Preview the first rows of the test data (no Loan_Status column here).
test.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


## Clean the data

In [6]:
# NOTE(review): fill_missing_values() removes rows via df.dropna() rather than
# imputing. Applied to `test`, this means every test row with a missing value
# is discarded and never receives a prediction further down.
train = fill_missing_values(train)
test = fill_missing_values(test)

In [7]:
# Encode the two-valued text columns as 0/1. The test set goes through the
# "_no_loan" variant, which leaves out the Loan_Status mapping.
train = replace_binary_labels_with_numeric(train)
test = replace_binary_labels_with_numeric_no_loan(test)

In [8]:
# Expand the multi-valued categorical columns into dummy columns.
# NOTE(review): train and test are encoded independently; presumably a category
# missing from one of them would yield different dummy columns in each frame.
# Confirm before using the dummy columns as model features.
train = create_dummies(train, ['Property_Area', 'Dependents'])
test = create_dummies(test, ['Property_Area', 'Dependents'])

## Feature selection

In [9]:
# Select the model inputs (X) and the target (Y).
# X is a 2-D frame holding the chosen feature columns.
X = train[['Gender','ApplicantIncome','Credit_History','LoanAmount','Education']]
# Y is selected as a 1-D Series. Selecting it with double brackets gives a
# one-column DataFrame, which makes model.fit() emit the column_or_1d warning.
Y = train['Loan_Status']

## Create model

In [10]:
# Train a logistic-regression classifier with default hyperparameters.
# fit() is left as the last expression so the cell displays the fitted estimator.
model = LogisticRegression()
model.fit(X, Y)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [14]:
# Predict on the same five feature columns the model was trained on and store
# the result on the test frame. Predictions follow the training encoding of
# Loan_Status: 0 means 'Y' and 1 means 'N'.
prediction = model.predict(test[['Gender','ApplicantIncome','Credit_History','LoanAmount','Education']])
test['Loan_Status'] = prediction

In [19]:
# Show the predicted labels. The index has gaps because rows with missing
# values were dropped from the test set during cleaning.
test['Loan_Status']

0      0
1      0
2      0
4      0
5      0
6      0
7      1
9      0
10     0
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     0
23     0
24     0
25     1
27     0
29     0
30     0
31     0
32     0
33     0
34     0
35     1
37     0
38     0
      ..
332    0
333    0
334    0
335    0
337    0
338    0
339    1
340    0
341    0
342    0
343    0
344    0
345    0
346    1
347    0
348    0
349    0
350    0
352    0
353    0
354    1
355    0
356    0
357    0
359    0
361    0
362    0
363    0
365    0
366    0
Name: Loan_Status, Length: 289, dtype: int64