# Classification through random forests
Builds on yesterday's assignment.

# Imports and incantations

In [None]:
# Record which scikit-learn release this notebook was run against.
import sklearn
print(sklearn.__version__)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Never truncate columns when a wide DataFrame is displayed.
pd.options.display.max_columns = None

In [None]:
# Training split: features and labels live in separate CSV files.
X_train = pd.read_csv('../kaggle_data/train_features.csv')
# NOTE(review): y_train is kept as the whole frame read from the labels file;
# if that file carries more than the target column, select the target before
# fitting -- confirm against the CSV.
y_train = pd.read_csv('../kaggle_data/train_labels.csv')

# Test split, plus the template that defines the submission layout.
X_test = pd.read_csv('../kaggle_data/test_features.csv')
sample_submission = pd.read_csv('../kaggle_data/sample_submission.csv')

# Model 2: Cleaner data
I want to apply the same cleaning operations to train and test data, so I will write them into a function. I'll be careful about how that function depends on attributes that may be unique to the training or testing dataset, to make sure the two cleaned datasets have the same columns. This notebook contains the steps where I figure out what cleanup operations to perform, and then a single function that actually carries them all out.

In [None]:
def cleanup(X, reference_date=None):
    """Return a cleaned copy of a raw loan-features frame.

    The same steps run regardless of which split is passed in, so the
    cleaned train and test frames end up with the same columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw features. Must contain every column referenced below
        (``id``, ``sub_grade``, ``earliest_cr_line``, ``int_rate``, ...).
        The caller's frame is not modified.
    reference_date : datetime-like, optional
        Date from which the age of ``earliest_cr_line`` is measured.
        Defaults to the current timestamp, which matches the previous
        behaviour; pass a fixed date to make the feature reproducible
        from one run to the next.

    Returns
    -------
    pandas.DataFrame
        A new frame with unused columns dropped, ``sub_grade`` mapped to a
        float, ``earliest_cr_line`` expressed as a number of days,
        percentage strings cast to float, three boolean ``emp_title_*``
        flags added, and the sparse columns replaced by null indicators.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from ``X``.
    """
    # Start with a fresh copy of the df
    X = X.copy()

    # Drop several columns in a single pass
    X = X.drop(columns=[
        'id',                        # id is random
        'member_id', 'url', 'desc',  # All null
        'title',                     # Duplicative of purpose
        'grade',                     # Duplicative of sub_grade
    ])

    # Transform sub_grade from "A1" - "G5" to 1.1 - 7.5
    def wrangle_sub_grade(x):
        # ord('A') is 65, so subtracting 64 maps the letters A-G onto 1-7.
        first_digit = ord(x[0]) - 64
        second_digit = int(x[1])
        return first_digit + second_digit / 10

    X['sub_grade'] = X['sub_grade'].apply(wrangle_sub_grade)

    # Transform earliest_cr_line to an integer: how many days it's been open.
    # `infer_datetime_format` is deprecated in pandas 2.x, so it is no longer
    # passed; parsing results are unchanged.
    if reference_date is None:
        reference_date = pd.Timestamp.today()
    opened_on = pd.to_datetime(X['earliest_cr_line'])
    X['earliest_cr_line'] = (pd.Timestamp(reference_date) - opened_on).dt.days

    # Several columns contain percentages mis-parsed as strings.
    # Cast them to floats.
    for col in ('int_rate', 'revol_util'):
        X[col] = X[col].str.strip('%').astype(float)

    # Create features for three employee titles: teacher, manager, owner.
    # Missing titles count as "no match" (na=False).
    emp_title = X['emp_title'].str.lower()
    for keyword in ('teacher', 'manager', 'owner'):
        X[f'emp_title_{keyword}'] = emp_title.str.contains(keyword, na=False)

    # Drop categoricals with high cardinality
    X = X.drop(columns=['emp_title', 'zip_code'])

    # For these columns we only want to mark the presence of nulls, so each
    # one is replaced by a boolean "is missing" indicator.
    many_nulls = ['sec_app_mths_since_last_major_derog',
                  'sec_app_revol_util',
                  'sec_app_earliest_cr_line',
                  'sec_app_mort_acc',
                  'dti_joint',
                  'sec_app_collections_12_mths_ex_med',
                  'sec_app_chargeoff_within_12_mths',
                  'sec_app_num_rev_accts',
                  'sec_app_open_act_il',
                  'sec_app_open_acc',
                  'revol_bal_joint',
                  'annual_inc_joint',
                  'sec_app_inq_last_6mths',
                  'mths_since_last_record',
                  'mths_since_recent_bc_dlq',
                  'mths_since_last_major_derog',
                  'mths_since_recent_revol_delinq',
                  'mths_since_last_delinq',
                  'il_util',
                  'emp_length',
                  'mths_since_recent_inq',
                  'mo_sin_old_il_acct',
                  'mths_since_rcnt_il',
                  'num_tl_120dpd_2m',
                  'bc_util',
                  'percent_bc_gt_75',
                  'bc_open_to_buy',
                  'mths_since_recent_bc']

    for col in many_nulls:
        X[col] = X[col].isnull()

    return X

# Run both splits through the same cleaning function so their columns match.
# NOTE: this rebinds X_train / X_test to the cleaned frames, so re-running the
# cell without reloading the raw data fails (the dropped columns are gone).
X_train, X_test = cleanup(X_train), cleanup(X_test)

# Show both shapes side by side; the column counts should agree.
(X_train.shape, X_test.shape)

## Fit the model

In [None]:
from sklearn.ensemble import RandomForestClassifier
import category_encoders as ce
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

In [1]:
import category_encoders as ce

ImportError: Missing required dependencies ['numpy']

In [None]:
# Baseline model: ordinal-encode the remaining string columns, then fit a
# random forest. The forest is seeded so the cross-validated scores below are
# the same on every Restart & Run All.
pipe = make_pipeline(
    ce.OrdinalEncoder(),
    RandomForestClassifier(
        n_estimators=100,
        class_weight='balanced',   # weight classes inversely to their frequency
        min_samples_leaf=0.005,    # a float is read as a fraction of the samples
        oob_score=True,
        n_jobs=-1,
        random_state=42,           # fixed seed for reproducible results
    )
)

# 5-fold cross-validated ROC AUC; the returned array of fold scores is the
# cell's output.
# NOTE(review): y_train is passed exactly as loaded from the labels CSV --
# confirm it holds only the target column.
cross_val_score(pipe, X_train, y_train, cv=5, scoring='roc_auc', verbose=10)

In [None]:
# Grid-search the depth of a single decision tree using 5-fold CV scored by
# ROC AUC. No pipeline is built here: the estimator is a bare tree, so the
# features passed to `gs.fit` must already be in a form the tree accepts.
# The tree is seeded so repeated runs pick the same splits.
parameters = {'max_depth': [5, 6, 7]}
gs = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                  param_grid=parameters,
                  cv=5,
                  n_jobs=-1,
                  return_train_score=False,
                  scoring='roc_auc')

In [None]:
%%time
gs.fit(X2,y_train)
print(f'Best parameters: {gs.best_params_}')
print(f'Best score: {gs.best_score_:0.4f}')

## Make and export predictions

In [None]:
# X_test was already cleaned by `cleanup` earlier in the notebook, so it is
# reused directly (the previous `cleanup1(test_features)` referred to names
# that are not defined here). Keep only numeric columns.
# NOTE(review): these must be the same columns the grid search was fit on.
x_test = X_test.select_dtypes(include='number')

# Take the second column of predict_proba, i.e. the probability assigned to
# gs.classes_[1].
probs = gs.predict_proba(x_test)[:, 1]

# Export to CSV, using the sample submission as the template. Bracket
# assignment is used so the column is set explicitly rather than through
# attribute access.
actual_submission = sample_submission.copy()
actual_submission['charged_off'] = probs
actual_submission.to_csv('DMA3.csv', index=False)