In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor


In [2]:
# Read the feature tables and the training labels from the working directory
train_features, train_labels, test_features = map(
    pd.read_csv,
    ('train_features.csv', 'train_labels.csv', 'test_features.csv'),
)


In [3]:
# Raise the display limit to 500 columns so wide frames are not truncated when shown
pd.set_option('display.max_columns', 500)

In [None]:
# Summary statistics of the training features (last expression -> displayed as the cell output)
train_features.describe()
# train_features.shape  # 37745, 103
# train_features.head()

In [None]:
# Number of missing values in each training column
train_features.isna().sum()

In [13]:
# Kaggle supplies separate train and test files, so no manual split is needed.
# Credit to DMA/LSDS01

# Target column; train_labels holds 'id' and 'charged_off'
y_train = train_labels['charged_off']

# Keep only columns without a single missing value (how='any' is the dropna default).
# Each frame is filtered on its own, so the surviving columns can differ between them.
X_train = train_features.dropna(axis=1)
# X_train = train_features.drop(columns=['id', 'member_id'])

X_test = test_features.dropna(axis=1)
# X_test = test_features.drop(columns=['pct_tl_nvr_dlq'])

In [14]:
# Compare (rows, columns) of the feature frames and the length of the label vector
X_train.shape, X_test.shape, y_train.shape

((37745, 66), (9437, 67), (37745,))

In [15]:
# List, in X_test's column order, every test column that is absent from X_train
train_column_names = X_train.columns
test_only_columns = [name for name in X_test.columns
                     if name not in train_column_names]
for name in test_only_columns:
    print(name, 'in X_test but not in X_train')

pct_tl_nvr_dlq in X_test but not in X_train


In [16]:
# Keep only the X_test columns that also exist in X_train (this removes
# 'pct_tl_nvr_dlq', the one extra column reported above). Deriving the list from
# X_train instead of hard-coding the name keeps the cell re-runnable: dropping a
# column that is already gone would raise a KeyError.
# Note: this does not add columns that X_train has and X_test lacks.
X_test = X_test[[name for name in X_test.columns if name in X_train.columns]]

In [None]:
# Model a Decision Tree regression - HT RH/LSDS

def regress_wave(max_depth):
    """Fit a depth-limited regression tree on the training data and plot its fit.

    Reads the notebook-level ``X_train`` and ``y_train``. Only numeric feature
    columns are used: fitting on object-dtype columns such as 'term' crashes.

    Parameters
    ----------
    max_depth : int or None
        Forwarded to ``DecisionTreeRegressor(max_depth=...)``. Must be a single
        value, not a range tuple.

    Prints the training R^2 and shows a plot; returns nothing.
    """
    X_numeric = X_train.select_dtypes(include='number')

    tree = DecisionTreeRegressor(max_depth=max_depth)
    tree.fit(X_numeric, y_train)
    print('Train R^2 score:', tree.score(X_numeric, y_train))
    # print('Test R^2 score:', tree.score(X_test, y_test))  # no y_test in the cells shown

    # X has many columns, so y cannot be plotted against "X" directly.
    # Order the rows by predicted value instead: the step line is the tree's
    # piecewise-constant output and the dots are the actual labels.
    y_fitted = tree.predict(X_numeric)
    order = np.argsort(y_fitted)
    rank = np.arange(len(order))
    plt.scatter(rank, np.asarray(y_train)[order], s=2)
    plt.step(rank, y_fitted[order])
    plt.xlabel('Training rows, ordered by predicted value')
    plt.ylabel('charged_off')
    plt.title(f'DecisionTreeRegressor(max_depth={max_depth})')
    plt.show()


# regress_wave(3)  # example call; pass one depth at a time rather than a tuple like (1, 8, 1)

In [None]:
# Draw from LSDS01 Unit 2 Tanzania waterpump project methods

# The tree is fitted on numeric columns only (object-dtype columns are excluded)
X_train_no_object_dtypes = X_train.select_dtypes(include='number')
# X_train_no_object_dtypes = X_train_no_object_dtypes.dropna()

X_test_no_object_dtypes = X_test.select_dtypes(include='number')

# Fixed seed so repeated runs build the same tree and report the same CV score
model = DecisionTreeClassifier(random_state=42)
# model.get_params().keys()

# Search over tree depth. An empty grid would evaluate only the default,
# unlimited-depth tree. The estimator is passed to GridSearchCV directly (not
# wrapped in a pipeline), so the key is the bare parameter name, without a
# 'decisiontreeclassifier__' prefix.
param_grid = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8]}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                           scoring='roc_auc', cv=3, n_jobs=2)  # can define specific args here

grid_search.fit(X_train_no_object_dtypes, y_train)

# For a Kaggle submission the test matrix must have the same number of rows,
# in the same order, as test_features, and the same columns, in the same order,
# as the matrix used for fitting.

# y_pred = grid_search.predict(X_test_no_object_dtypes)

In [None]:
# Check for nulls: total number of missing cells in X_train and in X_test

X_train.isnull().sum().sum(), X_test.isnull().sum().sum()

In [None]:
# Report the winning parameter combination and its mean cross-validated score

best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
print(f'Best parameters: {best_params}')
print(f'Best score: {best_cv_score:0.4f}')

In [None]:
# Write the decision tree's predicted probabilities in the Kaggle submission layout

sample_submission = pd.read_csv('sample_submission.csv')

# Column 1 of predict_proba is taken as the 'charged_off' score
dt_scores = grid_search.predict_proba(X_test_no_object_dtypes)[:, 1]

# assign() returns a new frame, leaving the sample untouched
submission = sample_submission.assign(charged_off=dt_scores)
submission.to_csv('submission-001.csv', index=False)

In [8]:
# Now try a 'first, fast' Random Forest/OOB model for roc auc on same kaggle/Lending Club data

import category_encoders as ce
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
# from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline  # sklearn appears to encourage use of pipelines


In [24]:
# Make pipeline: ordinal-encode the categorical columns, then fit a random forest

pipe = make_pipeline(ce.OrdinalEncoder(),
                     RandomForestClassifier(n_estimators=100,
                                            class_weight='balanced',
                                            min_samples_leaf=0.005,
                                            oob_score=True,    # needed for oob_decision_function_
                                            random_state=42,   # fixed seed: same forest, OOB AUC and predictions on every run
                                            n_jobs=-1))

# Optional use of cross_val_score

# cross_val_score(pipe, X_train, y_train, cv=5, scoring='roc_auc')

In [None]:
# Optional feature engineering
# "['member_id' 'url' 'desc'] not found in axis"

# X_train_minus_drops = X_train.drop(columns='id') -- did not significantly increase pred acc
# X_test_minus_drops = X_test.drop(columns='id')

# Try converting interest rate percentages from strings to floats.
# assign() builds new frames instead of writing into frames derived from the raw
# data, and the astype(str) round-trip keeps the cell re-runnable: once int_rate
# is numeric, a plain `.str.strip` would raise AttributeError.
X_train = X_train.assign(
    int_rate=X_train['int_rate'].astype(str).str.strip('%').astype(float))
X_test = X_test.assign(
    int_rate=X_test['int_rate'].astype(str).str.strip('%').astype(float))

# Try dropping features with high cardinality
# X_train = X_train.drop(columns='zip_code')
# X_test = X_test.drop(columns='zip_code')

In [None]:
# Print column dtypes and non-null counts for the current training frame
X_train.info()

In [25]:
# Fit pipeline, and compute predictive probabilities - all HT RH/LSDS

%time
pipe.fit(X_train, y_train)
# pipe.fit(X_train_minus_drops, y_train)
y_pred_proba = pipe.named_steps['randomforestclassifier'].oob_decision_function_[:, 1]
print('ROC AUC, Out-of-Bag estimate:', roc_auc_score(y_train, y_pred_proba))

Wall time: 0 ns
ROC AUC, Out-of-Bag estimate: 0.7268300409442843


In [None]:
# Write the random forest's predicted probabilities in the Kaggle submission layout

# Column 1 of predict_proba is taken as the 'charged_off' score
rf_scores = pipe.predict_proba(X_test)[:, 1]

# assign() returns a new frame, leaving sample_submission untouched
submission_02 = sample_submission.assign(charged_off=rf_scores)
submission_02.to_csv('submission-002.csv', index=False)

In [20]:
# New workflow for the random forest: start again from the frames as loaded.
# These are plain rebindings (no copy), so X_train/X_test refer to the same
# objects as train_features/test_features.

X_train, X_test = train_features, test_features
# X_train = train_features.drop(columns=['id', 'member_id'])
# X_test = test_features.dropna(axis='columns', how='any')

# Target column; train_labels holds 'id' and 'charged_off'
y_train = train_labels['charged_off']

In [21]:
# Confirm the (rows, columns) of the feature frames and the length of the label vector
X_train.shape, y_train.shape, X_test.shape

((37745, 103), (37745,), (9437, 103))

In [22]:
# Key difference in this workflow: nulls are not dropped up front;
# instead they are handled by the 'wrangle' function (credit: RH/LSDS)


def wrangle(X, reference_date=None):
    """Return a cleaned copy of a raw feature frame; the input is not modified.

    Steps, in order:
      * drop 'id', 'member_id', 'url', 'desc', 'title' and 'grade';
      * map 'sub_grade' ("A1" - "G5") to a float (1.1 - 7.5);
      * strip '%' from 'int_rate' and 'revol_util' and convert to float;
      * replace 'earliest_cr_line' with the number of days before
        ``reference_date``;
      * add boolean 'emp_title_teacher' / '_manager' / '_owner' flags, then
        drop 'emp_title' and 'zip_code';
      * replace each mostly-null column with a boolean "is missing" flag;
      * fill remaining nulls in numeric columns with that column's mean.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw features. Every column named above must be present, otherwise a
        KeyError is raised. 'sub_grade' values are indexed as strings, so
        missing values there are not supported.
    reference_date : datetime-like, optional
        Date the credit-line age is measured from. Defaults to the current
        time, as before; pass a fixed date to make the feature reproducible.

    Returns
    -------
    pandas.DataFrame
        The wrangled frame. Means used for imputation are computed from ``X``
        itself, so each frame passed in is imputed with its own means. Nulls
        in non-numeric columns are left as they are.
    """
    X = X.copy()

    # Drop some columns
    X = X.drop(columns=['id',                        # id is random
                        'member_id', 'url', 'desc',  # All null
                        'title',                     # Duplicative of purpose
                        'grade'])                    # Duplicative of sub_grade

    # Transform sub_grade from "A1" - "G5" to 1.1 - 7.5
    def wrangle_sub_grade(x):
        first_digit = ord(x[0]) - 64  # 'A' is code point 65, so A -> 1 ... G -> 7
        second_digit = int(x[1])
        return first_digit + second_digit/10

    X['sub_grade'] = X['sub_grade'].apply(wrangle_sub_grade)

    # Convert percentages from strings to floats
    X['int_rate'] = X['int_rate'].str.strip('%').astype(float)
    X['revol_util'] = X['revol_util'].str.strip('%').astype(float)

    # Transform earliest_cr_line to an integer: how many days it's been open.
    # The deprecated infer_datetime_format flag is not passed; it only affected
    # parsing speed.
    if reference_date is None:
        reference_date = pd.Timestamp.today()
    opened_on = pd.to_datetime(X['earliest_cr_line'])
    X['earliest_cr_line'] = (pd.Timestamp(reference_date) - opened_on).dt.days

    # Create features for three employee titles: teacher, manager, owner
    X['emp_title'] = X['emp_title'].str.lower()
    X['emp_title_teacher'] = X['emp_title'].str.contains('teacher', na=False)
    X['emp_title_manager'] = X['emp_title'].str.contains('manager', na=False)
    X['emp_title_owner']   = X['emp_title'].str.contains('owner', na=False)

    # Drop categoricals with high cardinality
    X = X.drop(columns=['emp_title', 'zip_code'])

    # Transform features with many nulls to binary flags
    many_nulls = ['sec_app_mths_since_last_major_derog',
                  'sec_app_revol_util',
                  'sec_app_earliest_cr_line',
                  'sec_app_mort_acc',
                  'dti_joint',
                  'sec_app_collections_12_mths_ex_med',
                  'sec_app_chargeoff_within_12_mths',
                  'sec_app_num_rev_accts',
                  'sec_app_open_act_il',
                  'sec_app_open_acc',
                  'revol_bal_joint',
                  'annual_inc_joint',
                  'sec_app_inq_last_6mths',
                  'mths_since_last_record',
                  'mths_since_recent_bc_dlq',
                  'mths_since_last_major_derog',
                  'mths_since_recent_revol_delinq',
                  'mths_since_last_delinq',
                  'il_util',
                  'emp_length',
                  'mths_since_recent_inq',
                  'mo_sin_old_il_acct',
                  'mths_since_rcnt_il',
                  'num_tl_120dpd_2m',
                  'bc_util',
                  'percent_bc_gt_75',
                  'bc_open_to_buy',
                  'mths_since_recent_bc']

    for col in many_nulls:
        X[col] = X[col].isnull()

    # For features with few nulls, do mean imputation.
    # Restricted to numeric columns: taking the mean of a string column raises
    # a TypeError, so a single null in a categorical would abort the function.
    for col in X.select_dtypes(include='number'):
        if X[col].isnull().any():
            X[col] = X[col].fillna(X[col].mean())

    # Return the wrangled dataframe
    return X


# Wrangle from the raw frames rather than from X_train / X_test themselves, so
# this cell can be re-run: wrangle() drops 'id', and feeding its own output
# back in would raise a KeyError.
X_train = wrangle(train_features)
X_test = wrangle(test_features)
X_train.shape, X_test.shape

((37745, 98), (9437, 98))

In [None]:
# Fit pipeline [see key previous workflow several lines above]

%time
pipe.fit(X_train, y_train)
y_pred_proba = pipe.named_steps['randomforestclassifier'].oob_decision_function_[:, 1]
print('ROC AUC, Out-of-Bag estimate:', roc_auc_score(y_train, y_pred_proba))

In [26]:
# Write the refitted random forest's predicted probabilities in the Kaggle submission layout

sample_submission = pd.read_csv('sample_submission.csv')

# Column 1 of predict_proba is taken as the 'charged_off' score
rf_wrangled_scores = pipe.predict_proba(X_test)[:, 1]

# assign() returns a new frame, leaving sample_submission untouched
submission_03 = sample_submission.assign(charged_off=rf_wrangled_scores)
submission_03.to_csv('submission-003.csv', index=False)

In [27]:
from xgboost import XGBClassifier

ModuleNotFoundError: No module named 'xgboost'