In [2]:
import os
import pandas as pd
pd.set_option('display.max_colwidth', -1)
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split
import graphviz

%matplotlib inline

ModuleNotFoundError: No module named 'graphviz'

In [None]:
# Function for reading the data (hard-coded for now)
def read_data(filename):
    """Load a tabular file into a DataFrame, picking the reader from the extension.

    Parameters
    ----------
    filename : str or path-like
        Path to the file. '.csv' files are read with the first column as the
        index; '.xls' / '.xlsx' files are read with the second row of the
        sheet (header=1) as the column names. The extension check is
        case-insensitive.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If the extension is not one of the supported ones. (Previously this
        surfaced as an UnboundLocalError on the return statement.)
    """
    _, ext = os.path.splitext(filename)
    ext = ext.lower()

    if ext == '.csv':
        return pd.read_csv(filename, index_col=0)
    if ext in ('.xls', '.xlsx'):
        return pd.read_excel(filename, header=1)

    raise ValueError(
        "Unsupported file extension %r for %r; expected .csv, .xls or .xlsx"
        % (ext, filename)
    )

## Read Data

First, we will read the dataset and the data dictionary for the dataset.

In [None]:
# Load the credit dataset; read_data uses the first CSV column as the index.
df = read_data('data/credit-data.csv')

In [None]:
# Dimensions of the loaded dataset as (rows, columns).
df.shape

In [None]:
# Load the data dictionary; for .xls files read_data takes the column names
# from the second row of the sheet (header=1).
data_dict = read_data('data/data-dictionary.xls')

In [None]:
# Show the first 12 rows of the data dictionary.
data_dict.head(12)

## Exploratory Data Analysis

We will look at distributions of variables, correlations between them, and summarize data.

### The Distribution of the Outcome Variable

The Outcome Variable: __*SeriousDlqin2yrs*__

In [None]:
# Number of observations in each outcome class.
df['SeriousDlqin2yrs'].value_counts()

In [None]:
# Bar chart of the outcome class counts. `kind` is passed by keyword because
# current pandas rejects positional arguments to Series.plot().
df['SeriousDlqin2yrs'].value_counts().plot(kind='bar', rot=0)
plt.show()

There is an imbalance in the outcome variable. Specifically, there are far more people who experienced 90 days past due delinquency or worse.

### Missing Values

Now we should look at the number of missing values in each column.

In [None]:
def generate_nan_df(df):
    """Summarise missing values per column.

    Returns a DataFrame indexed by the column names of `df` with two columns:
    'NaN' (number of missing values) and 'Percent of NaN' (that count as a
    percentage of the row count, rounded to 2 decimals). Rows are sorted by
    the percentage, largest first.
    """
    missing_count = df.isna().sum()
    missing_share = (100 * missing_count / len(df.index)).round(2)

    summary = pd.DataFrame({'NaN': missing_count, 'Percent of NaN': missing_share})

    return summary.sort_values(by=['Percent of NaN'], ascending=False)

In [None]:
# Build the per-column missing-value summary and show its first 12 rows
# (columns with the highest share of NaN come first).
nan_df = generate_nan_df(df)
nan_df.head(12)

- We can see that there is a lot of missing data for __MonthlyIncome__ and __NumberOfDependents__.
- Therefore, we probably do not want to simply drop all missing data.

First, we will look at the distribution of __MonthlyIncome__ and __NumberOfDependents__.

In [None]:
def generate_boxplots(df, columns):
    """Draw one horizontal boxplot per requested column.

    Each column gets its own 15x5 figure. When the absolute kurtosis of a
    column (as computed by Series.kurt()) exceeds 3, its x axis is switched
    to a log scale. All figures are shown once the loop has finished.
    """
    for name in columns:
        series = df[name]
        _, axis = plt.subplots(figsize=(15, 5))
        sns.boxplot(x=series, ax=axis)

        # heavy-tailed column: |kurtosis| above 3 -> log-scaled x axis
        if abs(series.kurt()) > 3:
            axis.set_xscale('log')

    plt.show()

In [None]:
# Boxplots for the two columns discussed above as having many missing values.
generate_boxplots(df, ['MonthlyIncome', 'NumberOfDependents'])

As we can see from these boxplots, the distribution of __MonthlyIncome__ is very skewed.
- For __MonthlyIncome__, which is very *skewed*, we will use median to impute NaN.
- For __NumberOfDependents__, we will use mean to impute NaN.

In [None]:
def impute_missing_data(df, columns):
    """Fill NaN values in the given columns with a rounded central estimate.

    For each column the estimate is the median when the absolute kurtosis
    (Series.kurt()) exceeds 3, otherwise the mean; the estimate is rounded to
    the nearest integer before filling.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is not modified; the work is done on a copy.
    columns : iterable of str
        Names of the columns to impute.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with the NaN values of `columns` filled.
    """
    imputed = df.copy()

    for column in columns:
        values = imputed[column]
        if abs(values.kurt()) > 3:
            centre = values.median()
        else:
            centre = values.mean()

        # A column with no observed values has no median/mean, and round(nan)
        # would raise; there is nothing to impute from, so leave it as is.
        if pd.isna(centre):
            continue

        imputed[column] = values.fillna(round(centre))

    return imputed

In [None]:
# Impute the two columns with missing values; the function chooses median or
# mean per column from its kurtosis.
df = impute_missing_data(df, ['MonthlyIncome', 'NumberOfDependents'])

### Data Types

Since we finally have a complete dataset, we will check whether the dtype of each column is correct based on the data dictionary we have.

In [None]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

For most of the columns, the dtype looks right. However, __NumberOfDependents__ should be an integer; we will change it.

In [None]:
# Cast to int; this relies on the NaN values having been imputed by the
# earlier impute_missing_data call.
df['NumberOfDependents'] = df['NumberOfDependents'].astype(int)

### Correlations between Variables

Next, we will examine correlations between variables to detect any patterns.

In [None]:
def generate_corr_heatmap(df):
    """Plot the pairwise correlation matrix of `df` as an annotated heatmap.

    Only the lower triangle is drawn (the upper triangle and the diagonal are
    masked). Cells are annotated with the coefficient to two decimals and
    coloured with a diverging palette. The figure is shown; nothing is
    returned.
    """
    # compute correlation
    corr = df.corr()

    # Mask the upper triangle, diagonal included. The built-in `bool` is used
    # because the `np.bool` alias no longer exists in current NumPy releases.
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # create figure and plot on its axes explicitly
    fig, ax = plt.subplots(figsize=(15, 5))

    # Generate a diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(corr, mask=mask, annot=True, fmt='.2f', cmap=cmap,
                linewidths=.5, ax=ax)

    plt.show()

In [None]:
# Correlation heatmap over all columns of the imputed dataset.
generate_corr_heatmap(df)

From this heatmap, we can see that the following variables:
- __NumberOfTime30-59DaysPastDueNotWorse__,
- __NumberOfTimes90DaysLate__,
- __NumberOfTime60-89DaysPastDueNotWorse__

have extremely high correlations amongst one another (0.98 - 0.99).

We will drop the following variables:
- __NumberOfTime30-59DaysPastDueNotWorse__,
- __NumberOfTime60-89DaysPastDueNotWorse__.

In [None]:
def drop_variables(df, columns):
    """Return a new DataFrame without the given columns.

    `columns` is a list of column labels; the frame passed in is left as is.
    """
    return df.drop(columns=columns)

In [None]:
# The two past-due counters listed above for removal because of their very
# high mutual correlation.
unnecessary_vars = ['NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfTime60-89DaysPastDueNotWorse']
df = drop_variables(df, unnecessary_vars)

In [None]:
# Re-draw the heatmap to check the correlations after dropping the two columns.
generate_corr_heatmap(df)

### Finding Outliers

Next, we will try to find outliers.

In [None]:
def find_iqr_outliers(df, column, weight=1.5):
    """Locate the IQR outliers of one column.

    A value is an outlier when it lies below Q1 - weight * IQR or above
    Q3 + weight * IQR, where Q1/Q3 are the 25th/75th percentiles of the
    non-missing values and IQR = Q3 - Q1.

    Parameters
    ----------
    df : pandas.DataFrame
    column : str
        Name of the column to inspect.
    weight : float, default 1.5
        Multiplier applied to the IQR to set the fences.

    Returns
    -------
    tuple of numpy.ndarray
        The result of np.where on the outlier mask: a one-element tuple with
        the positional (0-based) row numbers of the outliers.
    """
    data = df[column]

    # nanpercentile skips missing values. Plain np.percentile yields NaN for
    # both quartiles as soon as one NaN is present, which makes every
    # comparison below False and silently reports no outliers.
    quantile_25, quantile_75 = np.nanpercentile(data, [25, 75])
    iqr_weight = (quantile_75 - quantile_25) * weight
    lowest = quantile_25 - iqr_weight
    highest = quantile_75 + iqr_weight

    # NaN compares False on both sides, so missing values are never flagged.
    return np.where((data < lowest) | (data > highest))

In [None]:
def visualize_outliers(df, columns):
    """Plot a 25-bin histogram of the IQR outliers of each given column.

    One 15x5 figure is created per column, with the column name as the
    x-axis label. The outlier rows come from find_iqr_outliers with its
    default weight. All figures are shown after the loop.
    """
    for name in columns:
        _figure, _axis = plt.subplots(figsize=(15, 5))
        outlier_rows = df.iloc[find_iqr_outliers(df, name)]
        outlier_rows[name].hist(bins=25)
        plt.xlabel(name)

    plt.show()

We will look at the following variables (selected based on the correlation with the outcome variable) and check if they have any outliers.
- __age__
- __NumberOfTimes90DaysLate__

In [None]:
# Histograms of the IQR outliers for the two columns named above.
visualize_outliers(df, ['age', 'NumberOfTimes90DaysLate'])

## Generate Features/Predictors



In [None]:
# Bar chart of how often each dependents count occurs. `kind` is passed by
# keyword because current pandas rejects positional arguments to Series.plot().
df['NumberOfDependents'].value_counts().plot(kind='bar', rot=0)
plt.show()

In [None]:
# Re-check the column dtypes before deriving features.
df.dtypes

In [None]:
def get_age_category(age):
    """Map an age to an ordinal bracket code.

    Brackets (based on Consumer Financial Protection Bureau bins):
    1 for under 30, 2 for 30 up to and including 44, 3 for above 44 up to and
    including 64, 4 for everything else.
    """
    if age < 30:
        return 1
    if age <= 44:
        return 2
    if age <= 64:
        return 3
    return 4

In [None]:
def get_dependent_dummy(num_of_dependent):
    """Collapse a dependents count into a binary indicator.

    Returns 0 when the count is zero and 1 when it is one or more.

    Raises
    ------
    ValueError
        For any other input (negative, NaN, or strictly between 0 and 1).
        Such values previously fell through both branches and failed with an
        UnboundLocalError on the return statement.
    """
    if num_of_dependent == 0:
        return 0
    if num_of_dependent >= 1:
        return 1

    raise ValueError(
        "Number of dependents must be 0 or at least 1, got %r" % (num_of_dependent,)
    )

In [None]:
def discretize_age(df):
    """Replace the numeric 'age' column with an ordinal 'age_cat' column.

    Every age is mapped through get_age_category. The frame passed in is not
    modified; a new frame with 'age_cat' appended and 'age' removed is
    returned.
    """
    age_cat = df['age'].apply(get_age_category)

    # `columns=` is required here: a drop given only `labels=` targets the row
    # index and raises KeyError unless a row happens to be labelled 'age'.
    return df.assign(age_cat=age_cat).drop(columns=['age'])

In [None]:
def discretize_dependent(df):
    """Replace 'NumberOfDependents' with a binary 'dep_cat' column.

    Every count is mapped through get_dependent_dummy. The frame passed in is
    not modified; a new frame with 'dep_cat' appended and
    'NumberOfDependents' removed is returned.
    """
    dep_cat = df['NumberOfDependents'].apply(get_dependent_dummy)

    # `columns=` is required here: a drop given only `labels=` targets the row
    # index and raises KeyError unless a row carries that label.
    return df.assign(dep_cat=dep_cat).drop(columns=['NumberOfDependents'])

In [None]:
# Bin age first, then collapse the dependents count into a 0/1 indicator;
# the second call works on the output of the first.
test_df = discretize_age(df)
test_df = discretize_dependent(test_df)

In [None]:
# Preview the frame with the derived category columns.
test_df.head()

In [None]:
def generate_dummy(df, variable):
    """One-hot encode one column of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    variable : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame in which `variable` is replaced by one indicator column
        per distinct value, named '<variable>_<value>'.
    """
    # get_dummies(columns=[...]) already removes the encoded column, so no
    # follow-up drop is needed. The previous hard-coded drop of 'dep_cat'
    # had no axis and raised KeyError.
    return pd.get_dummies(df, columns=[variable])

In [None]:
# One-hot encode the dependents indicator. The input is `test_df`, the frame
# produced by the discretisation cell above: it is the only frame that is
# passed through discretize_dependent and therefore the only one that
# carries 'dep_cat'.
df = generate_dummy(test_df, 'dep_cat')
df.head()

## Build Classifier

We will build a classifier with __*DecisionTreeClassifier*__.

1. Split the dataset into y_df (Outcome set), X_df (Feature set).

In [None]:
# Outcome as a one-column frame; every remaining column is a feature.
y_df = df.loc[:, ['SeriousDlqin2yrs']]
X_df = df.drop(columns=['SeriousDlqin2yrs'])

2. Split the outcome set and feature set into training and test sets.


In [None]:
# Hold out 20% of the rows as the test set; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=11)

3. Train the Decision Tree.

In [None]:
# Decision tree that splits on entropy, with a fixed random_state, fitted on
# the training split.
dt_loan = DecisionTreeClassifier(criterion='entropy', random_state=156)
dt_loan.fit(X_train, y_train)

4. Visualize the trained Decision Tree.

In [None]:
# Write the fitted tree to 'tree.dot' in Graphviz DOT format.
# class_names must be listed in ascending order of the class labels (0, then 1).
# NOTE(review): this assumes SeriousDlqin2yrs == 1 marks a delinquency, so
# 'Not Delinquent' (label 0) comes first — confirm against the data dictionary.
export_graphviz(
    dt_loan,
    out_file="tree.dot",
    class_names=['Not Delinquent', 'Delinquent'],
    feature_names=X_train.columns.tolist(),
    impurity=True,
    filled=True,
)



In [None]:
# Show the first 14 rows of the data dictionary.
data_dict.head(14)

In [None]:
# Outcome class counts (same tabulation as in the EDA section).
df.SeriousDlqin2yrs.value_counts()

In [None]:
# Age histogram (20 bins), one panel per outcome class.
# NOTE(review): reads the raw 'age' column, so `df` must still contain it at
# this point — confirm this cell is not meant to run before the
# feature-generation section.
g = sns.FacetGrid(df, col='SeriousDlqin2yrs')
g.map(plt.hist, 'age', bins=20)

In [None]:
# Histogram of the dependents count (20 bins), one panel per outcome class.
# NOTE(review): reads the raw 'NumberOfDependents' column, so `df` must still
# contain it at this point — confirm the intended position of this cell.
g = sns.FacetGrid(df, col='SeriousDlqin2yrs')
g.map(plt.hist, 'NumberOfDependents', bins=20)

In [None]:
# Distinct values present in NumberOfDependents.
df.NumberOfDependents.unique()

In [None]:
# NOTE(review): `estimate` is only assigned as a local variable inside
# impute_missing_data; no top-level assignment is visible in this notebook,
# so this cell presumably raises NameError on a fresh kernel — confirm and
# remove.
estimate