In [1]:
%%time

%matplotlib inline
from ipywidgets import interact
import pandas as pd
import numpy as np
import logging
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('dark_background')
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_colwidth', -1)
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import xgboost as xgb
import category_encoders as ce

def clean(dat: pd.DataFrame) -> tuple:
    '''Wrangle the raw loan features into a numeric-friendly frame.

    Refactor of Ryan H's lecture wrangle function into method chaining.

    Steps, all computed from the untouched input `dat`:
      * `sub_grade` "A1".."G5" becomes the ordinal 1.1 .. 7.5
      * `int_rate` / `revol_util` percent strings become floats, nulls filled
        with the column mean
      * `earliest_cr_line` becomes the number of days between that date and today
      * `emp_title_teacher` / `_manager` / `_owner` flags, matched on the
        lower-cased title
      * every column in `many_nulls` becomes an is-null boolean flag
      * every other numeric column gets its nulls filled with the column mean
      * the columns in the drop list, plus any numeric column whose mean
        exceeds 10**4 (unless it is in `many_nulls`), are dropped

    Parameters
    ----------
    dat : pd.DataFrame
        Raw features. Must contain `id`, `sub_grade`, `int_rate`, `revol_util`,
        `earliest_cr_line`, `emp_title`, every column of the drop list and
        every column of `many_nulls`; a missing column raises.

    Returns
    -------
    tuple
        `(cleaned, ids)`: the wrangled DataFrame and `dat.id.values`.
    '''
    todrop_ = ['id', # id is random
              'member_id', # all null
              'url', # all null
              'desc', # all null
              'title', # duplicate of purpose
              'grade', # duplicate of sub_grade
              'emp_title', # getting re-engineered, cardinality too high
              'zip_code' # cardinality too high
               # list `greater_than_10k` will be engineered into units of k
             ]

    many_nulls = ['sec_app_mths_since_last_major_derog',
                  'sec_app_revol_util',
                  'sec_app_earliest_cr_line',
                  'sec_app_mort_acc',
                  'dti_joint',
                  'sec_app_collections_12_mths_ex_med',
                  'sec_app_chargeoff_within_12_mths',
                  'sec_app_num_rev_accts',
                  'sec_app_open_act_il',
                  'sec_app_open_acc',
                  'revol_bal_joint',
                  'annual_inc_joint',
                  'sec_app_inq_last_6mths',
                  'mths_since_last_record',
                  'mths_since_recent_bc_dlq',
                  'mths_since_last_major_derog',
                  'mths_since_recent_revol_delinq',
                  'mths_since_last_delinq',
                  'il_util',
                  'emp_length',
                  'mths_since_recent_inq',
                  'mo_sin_old_il_acct',
                  'mths_since_rcnt_il',
                  'num_tl_120dpd_2m',
                  'bc_util',
                  'percent_bc_gt_75',
                  'bc_open_to_buy',
                  'mths_since_recent_bc']

    # numeric_only=True: mean() over string columns raises on pandas >= 2.0,
    # and .items() replaces the removed .iteritems().
    column_means = dat.mean(numeric_only=True)
    greater_than_10k = [name for name, avg in column_means.items()
                        if avg > 10**4 and name not in many_nulls]

    # `id` can land in both lists (its mean may exceed 10k); keep each label once.
    todrop = list(dict.fromkeys(
        [name for name in todrop_ if name not in many_nulls] + greater_than_10k))

    def wrangle_sub_grade(x):
        '''Transform sub_grade from "A1" - "G5" to 1.1 - 7.5'''
        first_digit = ord(x[0]) - 64
        second_digit = int(x[1])
        return first_digit + second_digit/10

    # Lower-case once so the title flags also match "Teacher", "MANAGER", ...
    lowered_title = dat.emp_title.str.lower()

    # Convert percentages from strings to floats. These columns are strings in
    # `dat`, so the numeric imputation below never sees them; impute them here.
    percents = {name: dat[name].str.strip('%').astype(float)
                for name in ['int_rate', 'revol_util']}

    # Numeric columns that get mean imputation. `many_nulls` columns are
    # excluded so their null flags are not overwritten by imputed values.
    few_nulls = (dat.select_dtypes(include='number')
                    .drop(columns=many_nulls, errors='ignore'))

    assigns = {
        # sub_grade to ordinal
        'sub_grade': dat.sub_grade.apply(wrangle_sub_grade),
        **{name: col.fillna(col.mean()) for name, col in percents.items()},
        # Transform earliest_cr_line to an integer: how many days it's been open
        'earliest_cr_line': (pd.Timestamp.today()
                             - pd.to_datetime(dat.earliest_cr_line)).dt.days,
        # Create features for three employee titles: teacher, manager, owner
        **{'emp_title_' + name: lowered_title.str.contains(name, na=False)
           for name in ['teacher', 'manager', 'owner']},
        # Transform features with many nulls to binary flags
        **{name: dat[name].isnull() for name in many_nulls},
        # For features with few nulls, do mean imputation
        **{name: few_nulls[name].fillna(few_nulls[name].mean())
           for name in few_nulls.columns},
    }

    cleaned = dat.assign(**assigns).drop(columns=todrop)
    return cleaned, dat.id.values
    
# df = clean(pd.read_csv('train_features.csv').sample(5000))[0]
# # inscrutably, it is completely necessary to do this here. it doesn't get caught in above code (it should). 
# df.revol_util = df.revol_util.fillna(df.revol_util.mean())

# assert df.isna().sum().sum()==0

# cats = list(df.select_dtypes(include=['object']).columns)
# nums = list(df.select_dtypes(exclude=['object']).columns)

def encode(encoder, trainpath = 'train_features.csv', testpath = 'test_features.csv'): 
    '''Clean both feature files and encode them with one shared encoder.

    Pass a fresh encoder instance from the ce library. The encoder is fitted on
    the cleaned training frame only and then applied to the cleaned test frame,
    so both frames share the same category mapping and output columns.

    Parameters
    ----------
    encoder : unfitted encoder exposing `fit_transform` and `transform`
    trainpath : path of the training-features CSV
    testpath : path of the test-features CSV

    Returns
    -------
    dict with keys 'train' (encoded training frame), 'test' (encoded test
    frame) and 'TEST_IDs' (the ids returned by `clean` for the test file).
    '''
    train_frame, _ = clean(pd.read_csv(trainpath))
    test_frame, test_ids = clean(pd.read_csv(testpath))

    X_train = encoder.fit_transform(train_frame)
    # transform, not fit_transform: re-fitting on the test frame would learn a
    # second, independent mapping that need not line up with the training one.
    X_test = encoder.transform(test_frame)
    return {'train': X_train, 'test': X_test, 'TEST_IDs': test_ids}

# Clean and binary-encode the train/test feature files; `dfs` has the keys
# 'train', 'test' and 'TEST_IDs'.
dfs = encode(ce.BinaryEncoder())

# Training target.
# NOTE(review): assumes the rows of train_labels.csv are in the same order as
# train_features.csv — confirm.
y_train = pd.read_csv('train_labels.csv')['charged_off']

# `.values` hands xgboost bare arrays without column names.
# NOTE(review): train and test therefore presumably line up by column position
# only — confirm both encoded frames have identical columns.
dtrain = xgb.DMatrix(dfs['train'].values, y_train.values)
dtest = xgb.DMatrix(dfs['test'].values)


# specify parameters via map
# NOTE(review): `silent` looks deprecated in newer xgboost releases in favour of
# `verbosity` — confirm against the installed version.
param = {'booster': 'dart',
         'max_depth': 5, 'learning_rate': 0.1,
         'objective': 'binary:logistic', 'silent': True,
         'sample_type': 'uniform',
         'normalize_type': 'tree',
         'rate_drop': 0.1,
         'skip_drop': 0.5}
num_round = 50
bst = xgb.train(param, dtrain, num_round)
# make prediction
# ntree_limit must not be 0
# NOTE(review): newer xgboost releases replace `ntree_limit` with
# `iteration_range` — confirm against the installed version.
preds = bst.predict(dtest, ntree_limit=num_round)

# Pair each prediction with the test id returned by encode().
preds_df = pd.DataFrame({'id': dfs['TEST_IDs'], 'charged_off': preds})

# Sanity check: the submission ids must be in the same order as the ids in the
# test file (re-read from disk here).
assert all([x==y for x,y in zip(pd.read_csv('test_features.csv').id, preds_df.id)]) #sample_submission.index

def write_submit(submit_df, name='submission.csv'): 
    ''''''
    submit_df.to_csv(name, index=False)
    
    !kaggle competitions submit -c ds1-tree-ensembles -f submission.csv -m "Basic XGBoost "
    return submit_df

# Save the predictions to submission.csv (the default name) and submit them to
# the Kaggle competition via the kaggle CLI.
write_submit(preds_df)


100%|████████████████████████████████████████| 157k/157k [00:00<00:00, 82.0kB/s]
Successfully submitted to DS1 Tree EnsemblesCPU times: user 1min 6s, sys: 898 ms, total: 1min 7s
Wall time: 1min 8s
