# Import

In [1]:
!pip install category_encoders

Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/f7/d3/82a4b85a87ece114f6d0139d643580c726efa45fa4db3b81aed38c0156c5/category_encoders-1.3.0-py2.py3-none-any.whl (61kB)
[K    100% |████████████████████████████████| 61kB 14.4MB/s ta 0:00:01
Installing collected packages: category-encoders
Successfully installed category-encoders-1.3.0
[33mYou are using pip version 10.0.1, however version 19.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [7]:
!pip install eli5

Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/ee/2b/246db9e1c2d6f38e999daf0c4d5e54f36fbd0b937ffb13a34d32c2139403/eli5-0.8.2-py2.py3-none-any.whl (98kB)
[K    100% |████████████████████████████████| 102kB 19.4MB/s a 0:00:01
Collecting tabulate>=0.7.7 (from eli5)
[?25l  Downloading https://files.pythonhosted.org/packages/c2/fd/202954b3f0eb896c53b7b6f07390851b1fd2ca84aa95880d7ae4f434c4ac/tabulate-0.8.3.tar.gz (46kB)
[K    100% |████████████████████████████████| 51kB 25.3MB/s ta 0:00:01
Collecting graphviz (from eli5)
  Downloading https://files.pythonhosted.org/packages/1f/e2/ef2581b5b86625657afd32030f90cf2717456c1d2b711ba074bf007c0f1a/graphviz-0.10.1-py2.py3-none-any.whl
Building wheels for collected packages: tabulate
  Running setup.py bdist_wheel for tabulate ... [?25ldone
[?25h  Stored in directory: /home/ec2-user/.cache/pip/wheels/2b/67/89/414471314a2d15de625d184d8be6d38a03ae1e983dbda91e84
Successfully built tabulate
Installing collected packages: 

In [2]:
%%time
import pandas as pd
%matplotlib inline
from ipywidgets import interact
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200

X_train = pd.read_csv('data/train_features.csv')
X_test = pd.read_csv('data/test_features.csv')
y_train = pd.read_csv('data/train_labels.csv')['charged_off']
sample_submission = pd.read_csv('data/sample_submission.csv')

X_train.shape, X_test.shape, y_train.shape

  from numpy.core.umath_tests import inner1d


CPU times: user 17.3 s, sys: 1.93 s, total: 19.2 s
Wall time: 18.9 s


In [3]:
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, train_size=40000, 
    random_state=42, stratify=y_train)

X_train.shape, X_val.shape, y_train.shape, y_val.shape



((40000, 103), (1269457, 103), (40000,), (1269457,))

# Data Wrangling

In [4]:
%%time

def wrangle(X):
    X = X.copy()
    
    # Drop some columns
    X = X.drop(columns='id')  # id is random
    X = X.drop(columns=['member_id', 'url', 'desc'])  # All null
    X = X.drop(columns='title')  # Duplicative of purpose
    X = X.drop(columns='grade')  # Duplicative of sub_grade
    
    # Transform sub_grade from "A1" - "G5" to 1.1 - 7.5
    def wrangle_sub_grade(x):
        first_digit = ord(x[0]) - 64
        second_digit = int(x[1])
        return first_digit + second_digit/10
    
    X['sub_grade'] = X['sub_grade'].apply(wrangle_sub_grade)

    # Convert percentages from strings to floats
    X['int_rate'] = X['int_rate'].str.strip('%').astype(float)
    X['revol_util'] = X['revol_util'].str.strip('%').astype(float)
        
    # Transform earliest_cr_line to an integer: how many days it's been open
    X['earliest_cr_line'] = pd.to_datetime(X['earliest_cr_line'], infer_datetime_format=True)
    X['earliest_cr_line'] = pd.Timestamp.today() - X['earliest_cr_line']
    X['earliest_cr_line'] = X['earliest_cr_line'].dt.days
    
    # Create features for three employee titles: teacher, manager, owner
    X['emp_title'] = X['emp_title'].str.lower()
    X['emp_title_teacher'] = X['emp_title'].str.contains('teacher', na=False)
    X['emp_title_manager'] = X['emp_title'].str.contains('manager', na=False)
    X['emp_title_owner']   = X['emp_title'].str.contains('owner', na=False)
    
    # Drop categoricals with high cardinality
    X = X.drop(columns=['emp_title', 'zip_code'])
    
    # Transform features with many nulls to binary flags
    many_nulls = ['sec_app_mths_since_last_major_derog',
                  'sec_app_revol_util',
                  'sec_app_earliest_cr_line',
                  'sec_app_mort_acc',
                  'dti_joint',
                  'sec_app_collections_12_mths_ex_med',
                  'sec_app_chargeoff_within_12_mths',
                  'sec_app_num_rev_accts',
                  'sec_app_open_act_il',
                  'sec_app_open_acc',
                  'revol_bal_joint',
                  'annual_inc_joint',
                  'sec_app_inq_last_6mths',
                  'mths_since_last_record',
                  'mths_since_recent_bc_dlq',
                  'mths_since_last_major_derog',
                  'mths_since_recent_revol_delinq',
                  'mths_since_last_delinq',
                  'il_util',
                  'emp_length',
                  'mths_since_recent_inq',
                  'mo_sin_old_il_acct',
                  'mths_since_rcnt_il',
                  'num_tl_120dpd_2m',
                  'bc_util',
                  'percent_bc_gt_75',
                  'bc_open_to_buy',
                  'mths_since_recent_bc']

    for col in many_nulls:
        X[col] = X[col].isnull()
    
    # For features with few nulls, do mean imputation
    for col in X:
        if X[col].isnull().sum() > 0:
            X[col] = X[col].fillna(X[col].mean())
    
    # Return the wrangled dataframe
    return X


X_train = wrangle(X_train)
X_val   = wrangle(X_val)
X_test  = wrangle(X_test)
X_train.shape, X_val.shape, X_test.shape

CPU times: user 1min 10s, sys: 11.3 s, total: 1min 22s
Wall time: 1min 22s


In [4]:
null_counts = X_train.isnull().sum()
all(null_counts == 0)
cardinality = X_train.select_dtypes(exclude='number').nunique()
all(cardinality <= 50)

False

# RandomForestClassifier Model

In [11]:
import eli5
from eli5.sklearn import PermutationImportance
import category_encoders as ce
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

encoder = ce.OrdinalEncoder()
X_train_transformed = encoder.fit_transform(X_train)

model = RandomForestClassifier(
    n_estimators=100, 
    class_weight='balanced', 
    min_samples_leaf=0.005, 
    n_jobs=-1)

model.fit(X_train_transformed, y_train)
permuter = PermutationImportance(model, scoring='roc_auc', n_iter=1, cv='prefit')
permuter.fit(X_train_transformed, y_train)

eli5.show_weights(permuter, top=None, feature_names=X_train_transformed.columns.tolist())

subset = X_train.columns[permuter.feature_importances_ > 0]

pipe = make_pipeline(
    ce.OrdinalEncoder(), 
    RandomForestClassifier(
        n_estimators=100, 
        class_weight='balanced', 
        min_samples_leaf=0.005, 
        n_jobs=-1)
)

cross_val_score(pipe, X_train[subset], y_train, cv=5, scoring='roc_auc', verbose=10)

pipe.fit(X_train, y_train)
roc_auc_score(y_val, pipe.predict_proba(X_val)[:, 1])

submission = sample_submission.copy() 
submission['charged_off'] = pipe.predict_proba(X_test)[:, 1] 
submission.to_csv('Model_RandomForrestClassifier_1.csv', index=False)

[CV]  ................................................................
[CV] ....................... , score=0.7209488082159045, total=   2.2s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.2s remaining:    0.0s


[CV] ....................... , score=0.7179909081557903, total=   2.2s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.3s remaining:    0.0s


[CV] ....................... , score=0.7068262082439076, total=   2.2s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    6.5s remaining:    0.0s


[CV] ....................... , score=0.7299839282729704, total=   2.2s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.7s remaining:    0.0s


[CV] ....................... , score=0.7199312704688474, total=   2.1s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   10.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   10.8s finished
