# Functions

In [16]:
def fill_missing_values(df):
    """Fill in missing values based on column type.

    Numeric columns are filled with the column median; every other column
    is filled with its most frequent value (the first mode when several
    values tie). A column with no observed values at all is left untouched
    because there is nothing to derive a fill value from.

    Rows are never dropped: discarding incomplete rows would silently remove
    records (for a test set, records that still need a prediction).

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame with the same index and columns as ``df`` in which
        every fillable missing value has been replaced.
    """
    filled = df.copy()
    numeric_columns = set(filled.select_dtypes(include="number").columns)

    for column in filled.columns:
        observed = filled[column].dropna()
        # Nothing to do when the column is complete, and nothing to learn
        # from when it is entirely missing.
        if observed.empty or len(observed) == len(filled):
            continue

        if column in numeric_columns:
            fill_value = observed.median()
        else:
            fill_value = observed.mode().iloc[0]
        filled[column] = filled[column].fillna(fill_value)

    return filled

def replace_binary_labels_with_numeric(df):
    """Encode the two-valued category columns, target included, as 0/1.

    The first label of each pair below becomes 0 and the second becomes 1.
    The substitution is applied per column through ``DataFrame.replace`` and
    the result is returned as a new DataFrame.
    """
    yes_no = {'Yes': 0, 'No': 1}
    codes_by_column = {
        'Gender': {'Male': 0, 'Female': 1},
        'Education': {'Graduate': 0, 'Not Graduate': 1},
        'Married': dict(yes_no),
        'Self_Employed': dict(yes_no),
        'Loan_Status': {'Y': 0, 'N': 1},
    }
    return df.replace(codes_by_column)

def replace_binary_labels_with_numeric_no_loan(df):
    """Encode the two-valued feature columns as 0/1, leaving Loan_Status alone.

    The first label of each pair below becomes 0 and the second becomes 1.
    No mapping is supplied for ``Loan_Status``. The substitution is applied
    per column through ``DataFrame.replace`` and the result is returned as a
    new DataFrame.
    """
    yes_no = {'Yes': 0, 'No': 1}
    codes_by_column = {
        'Gender': {'Male': 0, 'Female': 1},
        'Education': {'Graduate': 0, 'Not Graduate': 1},
        'Married': dict(yes_no),
        'Self_Employed': dict(yes_no),
    }
    return df.replace(codes_by_column)

def create_dummies(df, columns):
    """Swap each listed categorical column for its indicator columns.

    For every name in ``columns`` the indicator columns are built with
    ``pd.get_dummies(..., drop_first=True)``, appended on the right of the
    frame, and the source column is then removed. Columns are processed in
    the order given; the frame passed in is not modified.
    """
    for name in columns:
        indicators = pd.get_dummies(df[name], drop_first=True)
        # Append first, then drop, so the remaining columns keep their order
        # and the new indicator columns land at the end.
        df = pd.concat([df, indicators], axis=1).drop(columns=name)
    return df

In [32]:
import pandas as pd
import seaborn as sb
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Load each CSV and run it through the same preprocessing chain:
# missing-value handling, binary label encoding, then dummy columns
# for the multi-valued categories.
train = (
    pd.read_csv("../data/train.csv")
    .pipe(fill_missing_values)
    .pipe(replace_binary_labels_with_numeric)
    .pipe(create_dummies, ['Property_Area', 'Dependents'])
)

# The test file goes through the encoder variant without a Loan_Status mapping.
test = (
    pd.read_csv("../data/test.csv")
    .pipe(fill_missing_values)
    .pipe(replace_binary_labels_with_numeric_no_loan)
    .pipe(create_dummies, ['Property_Area', 'Dependents'])
)

# Model

In [33]:
# Target vector: the Loan_Status column as a NumPy array
labels = np.array(train['Loan_Status'])

# Keep only the predictors: drop the target and the Loan_ID column
train = train.drop(columns = ['Loan_Status', 'Loan_ID'])

# Remember the feature names (in column order) for later use
feature_list = train.columns.tolist()

# Hand the feature matrix on as a plain NumPy array
train = np.array(train)

In [34]:
# Build a random forest of 1000 decision trees with a fixed seed and train it
# on the training data; fit() returns the estimator, so both steps are chained.
rf = RandomForestRegressor(random_state = 42, n_estimators = 1000).fit(train, labels)

# Predictions

In [38]:
# Align the test features with the columns the model was trained on.
# The dummy columns are built separately for train and test, so their set and
# order are not guaranteed to match; reindexing to feature_list restores the
# training order, drops columns the model never saw, and adds any training
# column missing from the test frame as all zeros.
test_features = test.drop('Loan_ID', axis = 1).reindex(columns = feature_list, fill_value = 0)

# The model was fitted on a bare NumPy array, so predict on one as well
# (passing a DataFrame here makes scikit-learn warn about feature names).
predictions = rf.predict(np.array(test_features))

In [None]:
# Calculate the absolute errors
# NOTE(review): `test_labels` is not assigned anywhere in the visible source,
# so this line raises NameError as written. The test data is encoded without a
# Loan_Status mapping, so it presumably has no labels -- confirm where the true
# values should come from (e.g. a hold-out split of the training data).
errors = abs(predictions - test_labels)