### Reproducing Ryan Herr's lecture NoteBook and submitting results to Kaggle

In [1]:
import pandas as pd
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200

X_train = pd.read_csv('train_features.csv')
X_test = pd.read_csv('test_features.csv')
y_train = pd.read_csv('train_labels.csv')['charged_off']
sample_submission = pd.read_csv('sample_submission.csv')

X_train.shape, X_test.shape, y_train.shape

((37745, 103), (9437, 103), (37745,))

In [2]:
def wrangle(X):
    X = X.copy()
    
    # Drop some columns
    X = X.drop(columns='id')  # id is random
    X = X.drop(columns=['member_id', 'url', 'desc'])  # All null
    X = X.drop(columns='title')  # Duplicative of purpose
    X = X.drop(columns='grade')  # Duplicative of sub_grade
    
    # Transform sub_grade from "A1" - "G5" to 1.1 - 7.5
    def wrangle_sub_grade(x):
        first_digit = ord(x[0]) - 64
        second_digit = int(x[1])
        return first_digit + second_digit/10
    
    X['sub_grade'] = X['sub_grade'].apply(wrangle_sub_grade)

    # Convert percentages from strings to floats
    X['int_rate'] = X['int_rate'].str.strip('%').astype(float)
    X['revol_util'] = X['revol_util'].str.strip('%').astype(float)
        
    # Transform earliest_cr_line to an integer: how many days it's been open
    X['earliest_cr_line'] = pd.to_datetime(X['earliest_cr_line'], infer_datetime_format=True)
    X['earliest_cr_line'] = pd.Timestamp.today() - X['earliest_cr_line']
    X['earliest_cr_line'] = X['earliest_cr_line'].dt.days
    
    # Create features for three employee titles: teacher, manager, owner
    X['emp_title'] = X['emp_title'].str.lower()
    X['emp_title_teacher'] = X['emp_title'].str.contains('teacher', na=False)
    X['emp_title_manager'] = X['emp_title'].str.contains('manager', na=False)
    X['emp_title_owner']   = X['emp_title'].str.contains('owner', na=False)
    
    # Drop categoricals with high cardinality
    X = X.drop(columns=['emp_title', 'zip_code'])
    
    # Transform features with many nulls to binary flags
    many_nulls = ['sec_app_mths_since_last_major_derog',
                  'sec_app_revol_util',
                  'sec_app_earliest_cr_line',
                  'sec_app_mort_acc',
                  'dti_joint',
                  'sec_app_collections_12_mths_ex_med',
                  'sec_app_chargeoff_within_12_mths',
                  'sec_app_num_rev_accts',
                  'sec_app_open_act_il',
                  'sec_app_open_acc',
                  'revol_bal_joint',
                  'annual_inc_joint',
                  'sec_app_inq_last_6mths',
                  'mths_since_last_record',
                  'mths_since_recent_bc_dlq',
                  'mths_since_last_major_derog',
                  'mths_since_recent_revol_delinq',
                  'mths_since_last_delinq',
                  'il_util',
                  'emp_length',
                  'mths_since_recent_inq',
                  'mo_sin_old_il_acct',
                  'mths_since_rcnt_il',
                  'num_tl_120dpd_2m',
                  'bc_util',
                  'percent_bc_gt_75',
                  'bc_open_to_buy',
                  'mths_since_recent_bc']

    for col in many_nulls:
        X[col] = X[col].isnull()
    
    # For features with few nulls, do mean imputation
    for col in X:
        if X[col].isnull().sum() > 0:
            X[col] = X[col].fillna(X[col].mean())
    
    # Return the wrangled dataframe
    return X


X_train = wrangle(X_train)
X_test  = wrangle(X_test)
X_train.shape, X_test.shape

((37745, 98), (9437, 98))

In [3]:
# Decision Tree

%%time
import category_encoders as ce
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

pipe = make_pipeline(
    ce.OrdinalEncoder(),
    DecisionTreeClassifier(max_depth=5, class_weight='balanced')
)

for x in cross_val_score(pipe, X_train, y_train, cv=5, scoring='roc_auc'):
    print(f'{x:0.4}')

0.703
0.7004
0.6933
0.6888
0.6892
CPU times: user 6.36 s, sys: 1.31 s, total: 7.67 s
Wall time: 11.1 s


In [5]:
# fit to the training set and create a submission CSV:

pipe.fit(X_train, y_train)

# x[1] Because charged_off is 1 and that's what we are predicting
probabilities = [x[1] for x in pipe.predict_proba(X_test)]

In [6]:
probabilities[:5]

[0.54397045773717,
 0.4024876970761504,
 0.54397045773717,
 0.4024876970761504,
 0.395612592094086]

In [7]:
submission_2 = sample_submission.copy()
submission_2.charged_off = probabilities
submission_2.to_csv('Submission_2.csv', index=False)

In [9]:
y_preds = pipe.predict_proba(X_test)
y_preds[:5]

array([[0.45602954, 0.54397046],
       [0.5975123 , 0.4024877 ],
       [0.45602954, 0.54397046],
       [0.5975123 , 0.4024877 ],
       [0.60438741, 0.39561259]])

In [17]:
import numpy as np

y_preds_max = [np.max(x) for x in y_preds] 
y_preds_max[:5]

[0.54397045773717,
 0.5975123029238496,
 0.54397045773717,
 0.5975123029238496,
 0.604387407905914]

In [19]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

pipe = make_pipeline(
    ce.OrdinalEncoder(),
    RandomForestClassifier(
        n_estimators=100,
        max_depth=5,
        min_samples_leaf=0.005,
        oob_score=True,
        class_weight='balanced',
        n_jobs=-1
    )
)

for x in cross_val_score(pipe, X_train, y_train, cv=5, scoring='roc_auc'):
    print(f'{x:0.4f}')

0.7195
0.7194
0.7125
0.7202
0.7301


In [20]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('ordinalencoder', OrdinalEncoder(cols=['term', 'home_ownership', 'purpose', 'addr_state', 'initial_list_status', 'application_type', 'disbursement_method'],
        drop_invariant=False, handle_unknown='impute', impute_missing=True,
        mapping=[{'col': 'term', 'mapping': [(' 36 months',...mators=100, n_jobs=-1, oob_score=True, random_state=None,
            verbose=0, warm_start=False))])

In [21]:
probabilities = [x[1] for x in pipe.predict_proba(X_test)]

In [22]:
probabilities[:5]

[0.5445188152973004,
 0.538669647976989,
 0.48696618189697405,
 0.3958403717153855,
 0.38662389217413845]

In [23]:
submission_3 = sample_submission.copy()
submission_3.charged_off = probabilities
submission_3.to_csv('Submission_3.csv', index=False)
