# Classification through random forests
Builds on yesterday's assignment.

# Copied from yesterday's assignment
I originally modified my own function, but for some reason that I can't figure out, that broke everything.  In the end, I copied this directly from the lecture so that it all worked.

In [7]:
import pandas as pd
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200

# Submission template first, then the feature tables for both splits.
sample_submission = pd.read_csv('../kaggle_data/sample_submission.csv')
X_test = pd.read_csv('../kaggle_data/test_features.csv')
X_train = pd.read_csv('../kaggle_data/train_features.csv')

# The target is the 'charged_off' column of the training labels file.
y_train = pd.read_csv('../kaggle_data/train_labels.csv')['charged_off']

# Display the shapes as a sanity check that rows and columns line up.
tuple(table.shape for table in (X_train, X_test, y_train))

((37745, 103), (9437, 103), (37745,))

In [8]:
def wrangle(X, reference_date=None):
    """Return a cleaned, model-ready copy of a raw loan feature table.

    The input frame is not modified. The steps are:

    * drop ``id``, ``member_id``, ``url``, ``desc``, ``title`` and ``grade``;
    * map ``sub_grade`` strings "A1".."G5" to floats 1.1..7.5;
    * parse the percent strings in ``int_rate`` and ``revol_util`` to floats;
    * replace ``earliest_cr_line`` with the whole number of days between that
      date and ``reference_date``;
    * add boolean ``emp_title_teacher`` / ``emp_title_manager`` /
      ``emp_title_owner`` flags, then drop ``emp_title`` and ``zip_code``;
    * replace every column in the ``many_nulls`` list with an "is missing"
      boolean flag;
    * fill any remaining nulls with that column's mean.

    Note that the imputation mean is computed from the frame passed in, so
    calling this separately on train and test uses different fill values.

    Args:
        X: DataFrame holding every column referenced above and in the
            ``many_nulls`` list; a missing column raises ``KeyError``.
        reference_date: The date credit-line age is measured against. Accepts
            anything ``pd.Timestamp`` accepts. Defaults to
            ``pd.Timestamp.today()``, which matches the previous behaviour but
            makes the feature depend on the day the code is run; pass a fixed
            date for reproducible output.

    Returns:
        A new DataFrame with the transformations applied.
    """
    X = X.copy()

    # Resolve the anchor date once so every row is measured against the same
    # instant.
    if reference_date is None:
        reference_date = pd.Timestamp.today()
    else:
        reference_date = pd.Timestamp(reference_date)

    # Drop some columns (reasons as given by the original author):
    #   id                      - random
    #   member_id, url, desc    - all null
    #   title                   - duplicative of purpose
    #   grade                   - duplicative of sub_grade
    X = X.drop(columns=['id', 'member_id', 'url', 'desc', 'title', 'grade'])

    # Transform sub_grade from "A1" - "G5" to 1.1 - 7.5
    def wrangle_sub_grade(x):
        # ord('A') is 65, so subtracting 64 maps 'A' -> 1 ... 'G' -> 7.
        first_digit = ord(x[0]) - 64
        second_digit = int(x[1])
        return first_digit + second_digit/10

    X['sub_grade'] = X['sub_grade'].apply(wrangle_sub_grade)

    # Convert percentages from strings to floats
    for pct_col in ('int_rate', 'revol_util'):
        X[pct_col] = X[pct_col].str.strip('%').astype(float)

    # Transform earliest_cr_line to an integer: how many days it's been open.
    # The deprecated infer_datetime_format flag is omitted; it only ever
    # affected parsing speed.
    opened = pd.to_datetime(X['earliest_cr_line'])
    X['earliest_cr_line'] = (reference_date - opened).dt.days

    # Create features for three employee titles: teacher, manager, owner
    X['emp_title'] = X['emp_title'].str.lower()
    X['emp_title_teacher'] = X['emp_title'].str.contains('teacher', na=False)
    X['emp_title_manager'] = X['emp_title'].str.contains('manager', na=False)
    X['emp_title_owner']   = X['emp_title'].str.contains('owner', na=False)

    # Drop categoricals with high cardinality
    X = X.drop(columns=['emp_title', 'zip_code'])

    # Transform features with many nulls to binary flags
    many_nulls = ['sec_app_mths_since_last_major_derog',
                  'sec_app_revol_util',
                  'sec_app_earliest_cr_line',
                  'sec_app_mort_acc',
                  'dti_joint',
                  'sec_app_collections_12_mths_ex_med',
                  'sec_app_chargeoff_within_12_mths',
                  'sec_app_num_rev_accts',
                  'sec_app_open_act_il',
                  'sec_app_open_acc',
                  'revol_bal_joint',
                  'annual_inc_joint',
                  'sec_app_inq_last_6mths',
                  'mths_since_last_record',
                  'mths_since_recent_bc_dlq',
                  'mths_since_last_major_derog',
                  'mths_since_recent_revol_delinq',
                  'mths_since_last_delinq',
                  'il_util',
                  'emp_length',
                  'mths_since_recent_inq',
                  'mo_sin_old_il_acct',
                  'mths_since_rcnt_il',
                  'num_tl_120dpd_2m',
                  'bc_util',
                  'percent_bc_gt_75',
                  'bc_open_to_buy',
                  'mths_since_recent_bc']

    # Assign column by column so each one takes the boolean dtype.
    for col in many_nulls:
        X[col] = X[col].isnull()

    # For features with few nulls, do mean imputation
    for col in X:
        if X[col].isnull().any():
            X[col] = X[col].fillna(X[col].mean())

    # Return the wrangled dataframe
    return X


# Run both splits through the same cleaning function so their columns match,
# then display the resulting shapes.
X_train, X_test = wrangle(X_train), wrangle(X_test)
X_train.shape, X_test.shape

((37745, 98), (9437, 98))

In [19]:
%%time
import category_encoders as ce
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

# Baseline model: ordinal-encode the remaining categoricals, then fit a single
# shallow decision tree with balanced class weights.
pipe = make_pipeline(
    ce.OrdinalEncoder(),
    DecisionTreeClassifier(class_weight='balanced', max_depth=5),
)

# Five-fold cross-validated ROC AUC, printed one fold per line.
tree_cv_auc = cross_val_score(pipe, X_train, y_train, scoring='roc_auc', cv=5)
for fold_auc in tree_cv_auc:
    print(f'{fold_auc:0.4f}')

0.7030
0.7005
0.6933
0.6888
0.6892
CPU times: user 6.94 s, sys: 382 ms, total: 7.33 s
Wall time: 5.06 s


## Lose the tree for the forest

In [24]:
%%time
from sklearn.ensemble import RandomForestClassifier

# Same encoding step as the single-tree pipeline; the estimator is now a
# random forest. `pipe` is reused by the cells that follow.
pipe = make_pipeline(
    ce.OrdinalEncoder(),
    RandomForestClassifier(
        class_weight='balanced',
        max_depth=5,             # was the best for single trees
        min_samples_leaf=0.005,
        n_estimators=100,        # default
        n_jobs=-1,
        oob_score=True,          # keep out-of-bag estimates for later inspection
    ),
)

# Five-fold cross-validated ROC AUC, printed one fold per line.
forest_cv_auc = cross_val_score(pipe, X_train, y_train, scoring='roc_auc', cv=5)
for fold_auc in forest_cv_auc:
    print(f'{fold_auc:0.4f}')

0.7222
0.7206
0.7154
0.7207
0.7289
CPU times: user 11 s, sys: 918 ms, total: 12 s
Wall time: 18.5 s


## Make and export predictions

In [31]:
# Refit the forest pipeline on the full training set; cross_val_score only
# fitted clones, so `pipe` itself is still unfitted at this point.
pipe.fit(X_train, y_train)

# predict_proba returns one column per class. Slice out column 1 (the
# positive, charged-off class) in a single vectorised step instead of looping
# over rows in Python.
probs = pipe.predict_proba(X_test)[:, 1]

In [18]:
# Export to CSV, starting from a copy of the sample submission so its other
# columns are kept.
actual_submission = sample_submission.copy()
# Bracket assignment always writes a column. Attribute assignment would only
# set a plain Python attribute if 'charged_off' were not already a column,
# and the CSV would then silently lack the predictions.
actual_submission['charged_off'] = probs
actual_submission.to_csv('DMA4.csv', index=False)

## Play around with OOB
So what was that OOB score?

In [33]:

# Out-of-bag score stored on the fitted forest step of the pipeline.
# NOTE(review): for a classifier this is presumably mean accuracy rather than
# ROC AUC -- confirm against the scikit-learn docs.
pipe.named_steps['randomforestclassifier'].oob_score_

0.632640084779441

Oh, right: for a classifier, `oob_score_` is accuracy, not ROC AUC.  So it looks really different from the `roc_auc` scores that we'd gotten from cross-validation.  Alright, then, let's get this score the hard way.

In [35]:
# Pull the out-of-bag decision function from the fitted forest step: one row
# per training example and one column per class.
# NOTE(review): presumably each row is estimated only from trees that did not
# see that example, and columns follow the forest's classes_ order -- confirm.
oob_probs = pipe.named_steps['randomforestclassifier'].oob_decision_function_
oob_probs

array([[0.526113  , 0.473887  ],
       [0.36979378, 0.63020622],
       [0.7112949 , 0.2887051 ],
       ...,
       [0.73515697, 0.26484303],
       [0.68454795, 0.31545205],
       [0.5404652 , 0.4595348 ]])

I'm not sure which of the two columns of class probabilities is the right one (they should follow the order of the forest's `classes_`), so let's start with `oob_probs[:,0]`.

In [40]:
from sklearn.metrics import roc_auc_score
# Score the first probability column against the true labels.
# NOTE(review): column 0 is presumably the negative class, in which case this
# yields 1 - AUC rather than the AUC itself.
roc_auc_score(y_train, oob_probs[:,0])

0.2814166271050939

Nope.  How about the other?

In [41]:
# Score the second probability column against the true labels.
# NOTE(review): column 1 is presumably the positive (charged-off) class, which
# would make this the out-of-bag ROC AUC.
roc_auc_score(y_train, oob_probs[:,1])

0.7185833728949061

This one finally matches the CV score.  Success!

# Permutation importance

In [46]:
import eli5
from eli5.sklearn import PermutationImportance

# PermutationImportance can't deal with pipelines, so we'll do our 
# encoding separately.
# Encode the features up front: the original author notes that
# PermutationImportance can't deal with pipelines.
encoder = ce.OrdinalEncoder()
X_transformed = encoder.fit_transform(X_train)

# Stand-alone forest with the same hyperparameters as the pipeline's forest.
forest = RandomForestClassifier(
    class_weight='balanced',
    max_depth=5,             # was the best for single trees
    min_samples_leaf=0.005,
    n_estimators=100,        # default
    n_jobs=-1,
    oob_score=True,
)

# Fit on the encoded training data; as the last expression, this also
# displays the fitted estimator.
forest.fit(X_transformed, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=5, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=0.005,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=True, random_state=None,
            verbose=0, warm_start=False)

In [50]:
# Measure how much ROC AUC drops when each feature is shuffled, using the
# already-fitted forest (cv='prefit'). With n_iter=1 each feature is shuffled
# once, which is why the reported spread is +/- 0.0000.
perm = PermutationImportance(
    estimator=forest,
    cv='prefit',
    n_iter=1,
    scoring='roc_auc',
)
perm.fit(X_transformed, y_train)

# Show every feature (top=None), labelled with the encoded column names.
eli5.show_weights(
    perm,
    feature_names=list(X_transformed.columns),
    top=None,
)

Weight,Feature
0.0251  ± 0.0000,sub_grade
0.0193  ± 0.0000,int_rate
0.0064  ± 0.0000,installment
0.0050  ± 0.0000,loan_amnt
0.0047  ± 0.0000,term
0.0046  ± 0.0000,funded_amnt
0.0031  ± 0.0000,home_ownership
0.0029  ± 0.0000,dti
0.0021  ± 0.0000,num_il_tl
0.0020  ± 0.0000,tot_cur_bal
