In [None]:
""" 
Importing the necessary libraries required for analysis
- Working directory is already set in the Jupyter Notebook
- The problem statement is to create a binary classification in Python to predict the Loan Status (Loan_Status)
of a particular individual. 

Methodology followed and quick inferences:
    - Missing values are handled as follows: LoanAmount (continuous) is imputed with its median, Gender with the
      majority class, and records with a missing Credit_History are dropped
    - We'll determine the nature of variables, continuous or categorical:
        Categorical Features:
            Gender
            Married
            Dependents
            Education
            Self_Employed
            Loan_Amount_Term
            Credit_History
            Property_Area
        Continuous Features:
            ApplicantIncome
            CoapplicantIncome
            LoanAmount

A quick look at the file and some EDA show that Loan_ID is an identifier field, whereas the
Loan_Status field is the label field (the dependent variable).

We create plots to determine the outlier features for this dataset. 

For this current problem, we are going to use four algorithms:

- Logistic Regression
- Support Vector Machine
- Random Forest Classifier
- Decision Tree Classifier

"""

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn.preprocessing import LabelEncoder
from sklearn import cross_validation, metrics
import sklearn.linear_model
import sklearn.tree
import sklearn.ensemble

In [None]:
# Define modelfit function
# ------------------------
def modelfit(model, dtrain, dtest, predictors, performCV=True, printFeatImp=True, n_cvfolds=10):
    """Fit ``model`` on the training frame and print a short performance report.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``.
    dtrain : pandas.DataFrame
        Training data. Must contain every column named in ``predictors`` plus
        the target column ``'Loan_Status'``.
    dtest : pandas.DataFrame
        Kept for interface compatibility; this function does not use it.
    predictors : list of str
        Names of the feature columns.
    performCV : bool, default True
        When true, also print ROC-AUC statistics from ``n_cvfolds``-fold
        cross-validation on ``dtrain``.
    printFeatImp : bool, default True
        When true, plot the model's feature importances as a bar chart.
        Estimators without ``feature_importances_`` are reported with a
        printed message instead of raising ``AttributeError``.
    n_cvfolds : int, default 10
        Number of cross-validation folds.

    Returns
    -------
    None
        Results are printed / plotted; the fitted ``model`` is mutated in place.
    """
    features = dtrain[predictors]
    target = dtrain['Loan_Status']

    # Fit on the full training frame, then score that same data (in-sample metrics).
    model.fit(features, target)
    train_predictions = model.predict(features)
    train_predprob = model.predict_proba(features)[:, 1]

    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(target.values, train_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(target, train_predprob))

    if performCV:
        # Imported here so the function does not depend on the removed
        # `sklearn.cross_validation` module (gone since scikit-learn 0.20).
        from sklearn.model_selection import cross_val_score

        cv_score = cross_val_score(model, features, target,
                                   cv=n_cvfolds, scoring='roc_auc')
        print("CV Score : Mean %.7g | Std %.7g | Min %.7g | Max %.7g" % (np.mean(cv_score),
                                                                         np.std(cv_score),
                                                                         np.min(cv_score),
                                                                         np.max(cv_score)))

    if printFeatImp:
        importances = getattr(model, 'feature_importances_', None)
        if importances is None:
            # e.g. linear models expose coefficients, not feature importances.
            print("Feature importances are not available for %s" % type(model).__name__)
        else:
            feat_imp = pd.Series(importances, predictors).sort_values(ascending=False)
            feat_imp.plot(kind='bar', title='Feature Importances')
            plt.ylabel('Feature Importance Score')

In [None]:
# Load datasets
# -------------
train = pd.read_csv('train_1.csv')
#test = pd.read_csv('test_data.csv')


# Basic data exploration and plots
# --------------------------------
# Only the last expression of a cell is rendered automatically, so every
# intermediate summary is printed explicitly instead of being discarded.
print(train.head(5))
print(train.describe())
print(train['Education'].value_counts())

cols = ['ApplicantIncome', 'LoanAmount', 'Loan_Amount_Term']  # numeric features
for c in cols:
    train.hist(column=c, bins=50)
    train.boxplot(column=c, by='Gender')

print(pd.crosstab(train['Education'], train['Gender'], margins=True, normalize='columns'))
print(pd.crosstab(train['Credit_History'], train['Property_Area'], margins=True, normalize='columns'))

# Draw the scatter on its own figure; a bare plt.scatter() would paint onto the
# axes of the last boxplot created in the loop above.
fig, ax = plt.subplots()
ax.scatter(train['LoanAmount'], train['Credit_History'])
ax.set(title='Credit_History vs. LoanAmount', xlabel='LoanAmount', ylabel='Credit_History')
plt.show()

In [None]:
# Handling missing data and imputation
# ------------------------------------
# NOTE: everything between the triple quotes below is a string literal, not
# executed code. It holds a disabled preprocessing draft: Imputer-based
# mean / most-frequent imputation followed by label and one-hot encoding.
# NOTE(review): the draft slices the DataFrame as `train[:, 5:8]` and calls
# `labelencoder_X(...)` directly, and `y` is only defined in a commented line;
# presumably it was never run in this form — confirm before re-enabling.

"""X = train.iloc[:,:-1].values
#y = train.iloc[:, -1].values
from sklearn.preprocessing import Imputer

imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
imputer.fit(train[:, 5:8])
train[:, 5:8] = imputer.transform(train[:, 5:8])
imputer_2 = Imputer(missing_values = 'NaN', strategy = 'most_frequent', axis =0)
imputer_2.fit(train[:, 1:5])
train[:, 1:5] = imputer_2.transform(train[:, 1:5])
imputer_2.fit(train[:, 8:10])
train[:, 8:10] = imputer_2.transform(train[:, 8:10])

from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()
train[:, 0] = labelencoder_X(train[:,0])

from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder(categorical_features = [1,2,3,4,8,9])

labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

"""

In [None]:
# Impute missing values
# ---------------------
# `train_mod` is always rebuilt from the raw `train` frame, so re-running this
# cell is idempotent and `train` itself is never modified.

# Exclude observations with missing 'Credit_History'. drop=True discards the old
# row labels instead of keeping them as a stray 'index' column; the fresh
# 0..n-1 index is what the positional CV fold indices below line up with.
train_mod = train.dropna(subset=['Credit_History']).reset_index(drop=True)

# Impute 'LoanAmount' with the median of the remaining rows
train_mod['LoanAmount'] = train_mod['LoanAmount'].fillna(train_mod['LoanAmount'].median())

# For now, impute 'Gender' simply with 'Male' (the majority)
train_mod['Gender'] = train_mod['Gender'].fillna('Male')

# Sanity check: the columns handled above must be complete from here on.
assert not train_mod[['Credit_History', 'LoanAmount', 'Gender']].isnull().any().any()

In [None]:
# Label encoding
# --------------
# One encoder instance is re-fitted for each column in turn; after the loop it
# holds the mapping of the last column processed ('Loan_Status').
number = LabelEncoder()

for column_name in ('Gender', 'Education', 'Loan_Status'):
    as_text = train_mod[column_name].astype(str)
    train_mod[column_name] = number.fit_transform(as_text)

In [None]:
# Model Building: Logistic Regression
# -----------------------------------
# Imported locally: `sklearn.cross_validation` was removed in scikit-learn 0.20,
# `sklearn.model_selection` is its replacement.
from sklearn.model_selection import KFold, cross_val_score

N_FOLDS = 10

# Create object of Logistic Regression
model = sklearn.linear_model.LogisticRegression()

# Select predictors ('Gender' is not used for this model)
predictors = ['Credit_History', 'Education', 'ApplicantIncome', 'LoanAmount']

# Converting predictors and outcome to numpy arrays
x_train = train_mod[predictors].values
y_train = train_mod['Loan_Status'].values

# Cross-validated ROC-AUC (scikit-learn picks the splitter for cv=N_FOLDS)
cv_score = cross_val_score(model, x_train, y_train, cv=N_FOLDS, scoring='roc_auc')

# Manual K-Fold loop (contiguous, unshuffled folds) for the misclassification rate.
# Fit and predict both use the NumPy arrays, indexed positionally, so the result
# does not depend on the DataFrame's index labels or on the removed `.ix` accessor.
cv = list(KFold(n_splits=N_FOLDS).split(x_train))

results = []
for traincv, testcv in cv:
    model.fit(x_train[traincv], y_train[traincv])
    predicted = model.predict(x_train[testcv])
    # Labels are 0/1, so the share of mismatches is the error rate of the fold.
    results.append(np.mean(predicted != y_train[testcv]))

print("\nCV Results: " + str(np.mean(100*np.array(results))) + "% wrong predictions")
print("\nCV Score: " + str(np.mean(cv_score)))

In [None]:
# Model Building: Decision Tree
# -----------------------------
# Select predictors
predictors = ['Credit_History', 'Education', 'Gender', 'ApplicantIncome', 'LoanAmount']

# Converting predictors and outcome to numpy array
x_train = train_mod[predictors].values
y_train = train_mod['Loan_Status'].values

# Create object of Decision Tree. The seed is fixed so the fitted tree is the
# same on every run; 42 matches the seed used for the SVC model.
model = sklearn.tree.DecisionTreeClassifier(random_state=42)

# Fit model
model.fit(x_train, y_train)


In [None]:
# Model Building: Random Forest
# -----------------------------
# Select all predictors
predictors = ['Credit_History', 'Education', 'Gender', 'ApplicantIncome', 'LoanAmount']

# Converting predictors and outcome to numpy array
x_train = train_mod[predictors].values
y_train = train_mod['Loan_Status'].values

# Create object of Random Forest. Without a fixed seed the forest, and with it
# the importance ranking printed below, changes on every run; 42 matches the
# seed used for the SVC model.
model = sklearn.ensemble.RandomForestClassifier(random_state=42)

# Fit model
model.fit(x_train, y_train)

# Feature importance, most important first
featimp = pd.Series(model.feature_importances_, index=predictors).sort_values(ascending=False)
print(featimp)

In [None]:
# Support Vector Machine
# ----------------------
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# This cell builds its own hold-out split under *_svc names, so it runs on a
# fresh kernel and does not overwrite the `x_train` / `y_train` arrays that the
# other model cells fit on.
svc_predictors = ['Credit_History', 'Education', 'Gender', 'ApplicantIncome', 'LoanAmount']

X_train_svc, X_test_svc, y_train_svc, y_test_svc = train_test_split(
    train_mod[svc_predictors].values,
    train_mod['Loan_Status'].values,
    test_size=0.25,
    random_state=42,
)

# NOTE(review): the features are not scaled; a linear-kernel SVC may converge
# slowly on raw income values — consider standardising them first.
classifier_SVC = SVC(kernel='linear', random_state=42)
classifier_SVC.fit(X_train_svc, y_train_svc)

# Evaluate on the held-out rows only.
y_pred_SVC = classifier_SVC.predict(X_test_svc)
cm_SVC = metrics.confusion_matrix(y_test_svc, y_pred_SVC)
accuracy_score_SVC = metrics.accuracy_score(y_test_svc, y_pred_SVC)
print("Support Vector Machine accuracy score", accuracy_score_SVC)
print("Support Vector Machine confusion matrix", cm_SVC)