In [1]:
%matplotlib inline


# Classifier comparison


A comparison of a several classifiers in scikit-learn on synthetic datasets.
The point of this example is to illustrate the nature of decision boundaries
of different classifiers.
This should be taken with a grain of salt, as the intuition conveyed by
these examples does not necessarily carry over to real datasets.

Particularly in high-dimensional spaces, data can more easily be separated
linearly and the simplicity of classifiers such as naive Bayes and linear SVMs
might lead to better generalization than is achieved by other classifiers.

The plots show training points in solid colors and testing points
semi-transparent. The lower right shows the classification accuracy on the test
set.



In [2]:
print(__doc__)

import pandas as pd
import os
# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


Automatically created module for IPython interactive environment


In [3]:
loanData = pd.read_csv("DataSets/clean_loan_data_2017.csv", low_memory=False)
loanData.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,11875,11875,11875,36 months,11.44%,391.26,B,B4,...,,,Cash,N,,,,,,
1,,,1500,1500,1500,36 months,5.32%,45.18,A,A1,...,,,Cash,N,,,,,,
2,,,35000,35000,35000,60 months,25.49%,1037.38,E,E4,...,,,Cash,N,,,,,,
3,,,12000,12000,12000,36 months,6.99%,370.48,A,A2,...,,,Cash,N,,,,,,
4,,,14000,14000,14000,36 months,8.24%,440.27,B,B1,...,,,Cash,N,,,,,,


In [4]:
h = .02  # step size in the mesh

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]


In [5]:
data = loanData.drop(["funded_amnt_inv","out_prncp", "last_pymnt_amnt","dti", "inq_last_6mths", "open_acc", "pub_rec", "revol_bal", "out_prncp_inv", 
"total_rec_prncp", "total_rec_late_fee", "recoveries", "collection_recovery_fee", "collections_12_mths_ex_med",
"policy_code", "tot_coll_amt", "open_acc_6m", "open_act_il", "open_il_12m", "open_il_24m", "mths_since_rcnt_il",
"total_bal_il", "il_util", "open_rv_12m", "open_rv_24m", "all_util", "total_rev_hi_lim", "total_cu_tl",
"inq_last_12m", "chargeoff_within_12_mths", "mo_sin_old_il_acct", "mo_sin_old_rev_tl_op", "mo_sin_rcnt_rev_tl_op", 
"mo_sin_rcnt_tl", "mort_acc", "mths_since_recent_bc", "mths_since_recent_inq", "num_actv_bc_tl", "num_actv_rev_tl", 
"num_bc_sats", "num_bc_tl", "num_il_tl", "num_op_rev_tl", "num_rev_accts", "num_rev_tl_bal_gt_0", "num_sats", "percent_bc_gt_75", 
"pub_rec_bankruptcies", "tax_liens", "total_bc_limit", "total_il_high_credit_limit",
"orig_projected_additional_accrued_interest","last_pymnt_d","next_pymnt_d",
"grade", "sub_grade", "emp_title", "emp_length", "home_ownership", "verification_status", "issue_d", "loan_status", "pymnt_plan",
"url", "desc", "purpose", "title", "zip_code", "addr_state", "earliest_cr_line", "mths_since_last_delinq", 
"mths_since_last_record", "revol_util", "initial_list_status", "last_pymnt_d", "next_pymnt_d", "last_credit_pull_d", 
"mths_since_last_major_derog", "application_type", "annual_inc_joint", "dti_joint", "verification_status_joint", 
"bc_open_to_buy", "bc_util", "mths_since_recent_bc_dlq", "mths_since_recent_revol_delinq", "num_tl_120dpd_2m",
"revol_bal_joint", "sec_app_earliest_cr_line", "sec_app_inq_last_6mths", "sec_app_mort_acc", "sec_app_open_acc", 
"sec_app_revol_util", "sec_app_open_act_il", "sec_app_num_rev_accts", "sec_app_chargeoff_within_12_mths", 
"sec_app_collections_12_mths_ex_med", "sec_app_mths_since_last_major_derog", "hardship_flag", "hardship_type", 
"hardship_reason", "hardship_status", "deferral_term", "hardship_amount", "hardship_start_date", "hardship_end_date",
"payment_plan_start_date", "hardship_length", "hardship_dpd", "hardship_loan_status", "hardship_payoff_balance_amount", 
"hardship_last_payment_amount", "disbursement_method", "debt_settlement_flag", "debt_settlement_flag_date", "settlement_status",
"settlement_date", "settlement_amount", "settlement_percentage", "settlement_term","int_rate","term","id","member_id",
"acc_now_delinq","delinq_amnt","num_tl_30dpd","num_tl_90g_dpd_24m","delinq_2yrs","num_accts_ever_120_pd","inq_fi",
"num_tl_op_past_12m","pct_tl_nvr_dlq","acc_open_past_24mths","total_acc","tot_cur_bal","max_bal_bc",
"avg_cur_bal","total_bal_ex_mort","tot_hi_cred_lim","funded_amnt","loan_amnt","installment","total_rec_int",
"total_pymnt_inv","total_pymnt","annual_inc"],axis=1)

feature_names = data.columns


data.isnull().any()

out_prncp          False
last_pymnt_amnt    False
dtype: bool

In [6]:
X = data

target = loanData["loan_status"]
target_names = ["current", "charged off","fully paid","in grace period","late (16-30 days)", "late (31-120 days)"]
y = target



NameError: name 'make_classification' is not defined

In [17]:
i = 1
# iterate over datasets
for ds_cnt, ds in enumerate[loanData.loan_status[X:]]:
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.4, random_state=42)

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    if ds_cnt == 0:
        ax.set_title("Input data")
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
               edgecolors='k')
    # Plot the testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,
               edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
                   edgecolors='k')
        # Plot the testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   edgecolors='k', alpha=0.6)

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        if ds_cnt == 0:
            ax.set_title(name)
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1

plt.tight_layout()
plt.show()

TypeError: cannot do slice indexing on <class 'pandas.core.indexes.range.RangeIndex'> with these indexers [       out_prncp  last_pymnt_amnt
0        5444.16       391.260000
1         653.97        45.176391
2           0.00     35565.100000
3           0.00      8001.300000
4        6254.62       440.270000
5        4962.71      1039.930000
6        3976.13       290.470000
7           0.00      8895.230000
8        4632.17       335.690000
9           0.00      4005.060000
10          0.00     14437.860000
11          0.00       153.940000
12       6977.45       481.840000
13       3681.80       272.430000
14          0.00      4856.790000
15          0.00      5011.030000
16          0.00     27594.820000
17          0.00     18373.210000
18      14424.76       486.260000
19       2233.78       157.240000
20          0.00     13243.180000
21          0.00      1105.380000
22       4467.55       314.480000
23       9264.35       671.380000
24      20310.10       644.140000
25       2819.10       205.900000
26       4360.90       301.150000
27          0.00      2709.470000
28      22350.67       716.510000
29          0.00      3821.950000
...          ...              ...
43015       0.00       242.830000
43016    2469.16       128.020000
43017    9259.27       480.080000
43018   18265.69       927.280000
43019    4865.48       261.640000
43020    5925.96       307.250000
43021       0.00       371.260000
43022   15030.36       934.430000
43023    9394.19       498.150000
43024    6224.73       326.970000
43025       0.00      8683.790000
43026    5178.54       289.470000
43027    4320.96       224.040000
43028   12177.07       618.190000
43029       0.00      1353.200000
43030       0.00     15109.840000
43031    5309.95       309.310000
43032   16888.61      1011.270000
43033       0.00     23671.760000
43034       0.00      1260.820000
43035       0.00       653.930000
43036    9649.87       525.560000
43037   25790.37      1489.070000
43038    5231.58       297.310000
43039   12170.04       349.650000
43040    5342.14       284.850000
43041       0.00      3832.740000
43042    3095.00       161.180000
43043       0.00         2.890000
43044   27261.47       973.740000

[43045 rows x 2 columns]] of <class 'pandas.core.frame.DataFrame'>

In [None]:
print(__doc__)


# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

h = .02  # step size in the mesh

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable
            ]

figure = plt.figure(figsize=(27, 9))
i = 1
# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.4, random_state=42)

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    if ds_cnt == 0:
        ax.set_title("Input data")
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
               edgecolors='k')
    # Plot the testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,
               edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
                   edgecolors='k')
        # Plot the testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   edgecolors='k', alpha=0.6)

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        if ds_cnt == 0:
            ax.set_title(name)
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1

plt.tight_layout()
plt.show()