In [1]:
import pandas as pd
import numpy as np
import eli5
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import category_encoders as ce
pd.options.display.max_columns=(999)
pd.options.display.max_rows=(999)

In [2]:
train = pd.read_csv('train_features.csv')
train_labels = pd.read_csv('train_labels.csv')
test = pd.read_csv('test_features.csv')
sample_submission = pd.read_csv('sample_submission.csv')
print(train.shape, train_labels.shape, test.shape)

(1309457, 103) (1309457, 2) (26724, 103)


In [12]:
X_train = train
y_train = train_labels['charged_off']
X_test = test

In [13]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

def wrangle(X):
    
    X = X.copy()
    
    def remove_months(string):
        return int(string.strip(' months'))

    def remove_percent(string):
        if string == str:
            return float(string.strip(' %'))
        elif string == float:
            return float(string)
        elif string == int:
            return int(string)

    def remove_years(string):
        if "< 1 year" in string:
            return float(string.replace('< 1 year',0.5))
        elif "10+ years" in string:
            return float(string.replace('10+ years', 10.0))
        else:
            return float(string.strip(' years'))

    def remove_first4char(string):
        return int(string[4:])

    def wrangle_sub_grade(x):
        first_digit = ord(x[0]) - 64
        second_digit = int(x[1])
        return first_digit + second_digit/10
    
    # Drop some columns
    X = X.drop(columns='id')  # id is random
    X = X.drop(columns=['member_id', 'url', 'desc'])  # All null
    X = X.drop(columns='title')  # Duplicative of purpose
    X = X.drop(columns='grade')  # Duplicative of sub_grade
    
    # Transform sub_grade from "A1" - "G5" to 1.1 - 7.5
    def wrangle_sub_grade(x):
        first_digit = ord(x[0]) - 64
        second_digit = int(x[1])
        return first_digit + second_digit/10
    
    X['sub_grade'] = X['sub_grade'].apply(wrangle_sub_grade)

    # Convert percentages from strings to floats
    X['int_rate'] = X['int_rate'].str.strip('%').astype(float)
    X['revol_util'] = X['revol_util'].str.strip('%').astype(float)
        
    # Transform earliest_cr_line to an integer: how many days it's been open
    X['earliest_cr_line'] = pd.to_datetime(X['earliest_cr_line'], infer_datetime_format=True)
    X['earliest_cr_line'] = pd.Timestamp.today() - X['earliest_cr_line']
    X['earliest_cr_line'] = X['earliest_cr_line'].dt.days
    
    # Create features for three employee titles: teacher, manager, owner
    X['emp_title'] = X['emp_title'].str.lower()
    X['emp_title_teacher'] = X['emp_title'].str.contains('teacher', na=False)
    X['emp_title_manager'] = X['emp_title'].str.contains('manager', na=False)
    X['emp_title_owner']   = X['emp_title'].str.contains('owner', na=False)
    
    # Drop categoricals with high cardinality
    X = X.drop(columns=['emp_title', 'zip_code'])
    
    # Transform features with many nulls to binary flags
    many_nulls = ['sec_app_mths_since_last_major_derog',
                  'sec_app_revol_util',
                  'sec_app_earliest_cr_line',
                  'sec_app_mort_acc',
                  'dti_joint',
                  'sec_app_collections_12_mths_ex_med',
                  'sec_app_chargeoff_within_12_mths',
                  'sec_app_num_rev_accts',
                  'sec_app_open_act_il',
                  'sec_app_open_acc',
                  'revol_bal_joint',
                  'annual_inc_joint',
                  'sec_app_inq_last_6mths',
                  'mths_since_last_record',
                  'mths_since_recent_bc_dlq',
                  'mths_since_last_major_derog',
                  'mths_since_recent_revol_delinq',
                  'mths_since_last_delinq',
                  'il_util',
                  'emp_length',
                  'mths_since_recent_inq',
                  'mo_sin_old_il_acct',
                  'mths_since_rcnt_il',
                  'num_tl_120dpd_2m',
                  'bc_util',
                  'percent_bc_gt_75',
                  'bc_open_to_buy',
                  'mths_since_recent_bc']

    for col in many_nulls:
        X[col] = X[col].isnull()
    
    # For features with few nulls, do mean imputation
    for col in X:
        if X[col].isnull().sum() > 0:
            X[col] = X[col].fillna(X[col].mean())
    
    # Return the wrangled dataframe
    return X


X_train = wrangle(X_train)
X_test  = wrangle(X_test)
y_train = y_train
X_train.shape, X_test.shape

((1309457, 98), (26724, 98))

In [14]:
encoder = ce.BinaryEncoder()
from sklearn.model_selection import train_test_split

X_train = encoder.fit_transform(X_train)
X_test = encoder.fit_transform(X_test)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.23, stratify = y_train, random_state=420)

print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_val, label=y_val)
xgb_test = xgb.DMatrix(X_test)

print(xgb_test)

(1008281, 115) (301176, 115) (1008281,) (301176,)


  if getattr(data, 'base', None) is not None and \


<xgboost.core.DMatrix object at 0x12a587eb8>


In [16]:
params = {
        'booster' : 'dart',
        'objective' : 'binary:logistic',
        'rate_drop' : 0.1,
        'sample_type' : 'weighted',
        'learning_rate' : 0.1, 
        'min_child_weight' : 5,
        'max_depth' : 4,
        'gamma' : 0.2,
        'max_delta_step' : 1,
        'subsample' : 1,
        'colsample_bytree' : 0.4,
        'colsample_bylevel' : 1,
        'n_estimators' : 200,
        'alpha' : 0,
        'scale_pos_weight' : 1,
        'eval_metric' : "auc",
        'xgbclassifier__silent' : False,
        'n_jobs' : -1
}

num_rounds = 1000

evals = [(dtrain, 'train'), (dtest, 'validation')]

boost = xgb.train(params, dtrain, num_rounds, evals, early_stopping_rounds=50)
boost.save_model('DART_BABY_LETSGOOO.model')

[0]	train-auc:0.615469	validation-auc:0.5
Multiple eval metrics have been passed: 'validation-auc' will be used for early stopping.

Will train until validation-auc hasn't improved in 50 rounds.
[1]	train-auc:0.640683	validation-auc:0.615213
[2]	train-auc:0.66282	validation-auc:0.663398
[3]	train-auc:0.68663	validation-auc:0.687086
[4]	train-auc:0.688811	validation-auc:0.678506
[5]	train-auc:0.695062	validation-auc:0.695453
[6]	train-auc:0.696696	validation-auc:0.700926
[7]	train-auc:0.70507	validation-auc:0.705324
[8]	train-auc:0.705849	validation-auc:0.703998
[9]	train-auc:0.706026	validation-auc:0.706371
[10]	train-auc:0.706823	validation-auc:0.707792
[11]	train-auc:0.707745	validation-auc:0.705528
[12]	train-auc:0.708177	validation-auc:0.708624
[13]	train-auc:0.70797	validation-auc:0.709395
[14]	train-auc:0.709101	validation-auc:0.709669
[15]	train-auc:0.709144	validation-auc:0.709961
[16]	train-auc:0.709145	validation-auc:0.710321
[17]	train-auc:0.710317	validation-auc:0.710775
[1

[168]	train-auc:0.716601	validation-auc:0.715954
[169]	train-auc:0.716893	validation-auc:0.716194
[170]	train-auc:0.71662	validation-auc:0.716813
[171]	train-auc:0.716259	validation-auc:0.716657
[172]	train-auc:0.71732	validation-auc:0.715427
[173]	train-auc:0.716473	validation-auc:0.716768
[174]	train-auc:0.717086	validation-auc:0.715961
[175]	train-auc:0.717356	validation-auc:0.716584
[176]	train-auc:0.716878	validation-auc:0.71618
[177]	train-auc:0.716823	validation-auc:0.717109
[178]	train-auc:0.716919	validation-auc:0.716532
[179]	train-auc:0.716051	validation-auc:0.716196
[180]	train-auc:0.716693	validation-auc:0.717004
[181]	train-auc:0.717905	validation-auc:0.7166
[182]	train-auc:0.716594	validation-auc:0.716517
[183]	train-auc:0.717798	validation-auc:0.716322
[184]	train-auc:0.717274	validation-auc:0.716426
[185]	train-auc:0.717064	validation-auc:0.716776
[186]	train-auc:0.716515	validation-auc:0.716926
[187]	train-auc:0.716677	validation-auc:0.716934
[188]	train-auc:0.716985	

In [43]:
submission = sample_submission.copy()
preds = boost.predict(xgb_test, ntree_limit=boost.best_ntree_limit)
preds

array([0.11848793, 0.18083645, 0.14676853, ..., 0.11503875, 0.10629128,
       0.29001722], dtype=float32)