In [25]:

%matplotlib inline
from ipywidgets import interact
import pandas as pd
import logging
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('dark_background')
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
import category_encoders as ce

In [22]:
# Load the raw training features from the CSV in the working directory.
df = pd.read_csv('train_features.csv')
# Print (rows, columns) so the size of the raw frame is visible in the cell output.
print(df.shape)
# Last expression of the cell: rendered as the summary-statistics table
# (describe() with default arguments).
df.describe()

(37745, 103)


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,installment,annual_inc,url,desc,dti,delinq_2yrs,...,revol_bal_joint,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog
count,37745.0,0.0,37745.0,37745.0,37745.0,37745.0,0.0,0.0,37653.0,37745.0,...,4738.0,4738.0,4738.0,4738.0,4639.0,4738.0,4738.0,4738.0,4738.0,1668.0
mean,33579.507193,,14913.304411,14913.304411,444.702933,79629.21,,,18.508652,0.236111,...,33654.305192,0.724567,1.736809,11.296117,55.261199,2.855424,12.897425,0.048333,0.07366,36.332134
std,13609.172414,,10153.364841,10153.364841,293.571158,80522.91,,,19.670753,0.74959,...,29585.734663,1.068437,1.867062,6.557512,26.96727,3.054911,8.51273,0.455754,0.330458,24.058143
min,10000.0,,1000.0,1000.0,30.12,0.0,,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,21824.0,,7000.0,7000.0,222.99,47000.0,,,10.32,0.0,...,13968.5,0.0,0.0,7.0,35.35,1.0,7.0,0.0,0.0,15.0
50%,33603.0,,12000.0,12000.0,361.5,67000.0,,,16.68,0.0,...,25657.0,0.0,1.0,10.0,57.3,2.0,11.0,0.0,0.0,34.0
75%,45357.0,,20000.0,20000.0,610.17,95000.0,,,23.93,0.0,...,43273.5,1.0,3.0,15.0,76.9,4.0,17.0,0.0,0.0,57.0
max,57181.0,,40000.0,40000.0,1587.23,9300000.0,,,999.0,19.0,...,290237.0,6.0,15.0,58.0,163.9,35.0,75.0,20.0,7.0,125.0


In [23]:

def clean(dat: pd.DataFrame, reference_date=None) -> pd.DataFrame:
    '''Return a cleaned copy of the raw loan features; ``dat`` itself is not modified.

    Method-chaining refactor of Ryan H's wrangle function from lecture.

    Transformations applied:
      * ``sub_grade``: "A1" - "G5" becomes 1.1 - 7.5.
      * ``int_rate`` and ``revol_util``: percent strings ("11.99%") become floats.
      * ``earliest_cr_line``: date string becomes the whole number of days
        between that date and ``reference_date``.
      * ``emp_title_teacher`` / ``emp_title_manager`` / ``emp_title_owner``:
        boolean flags, True when the lower-cased ``emp_title`` contains the
        word (missing titles give False).
      * The columns listed in ``todrop`` are removed.

    Parameters
    ----------
    dat : pd.DataFrame
        Raw features. Must contain every column in ``todrop`` plus
        ``sub_grade``, ``int_rate``, ``revol_util`` and ``earliest_cr_line``.
    reference_date : optional
        Anything ``pd.Timestamp`` accepts (timezone-naive). Date from which the
        age of ``earliest_cr_line`` is measured. Defaults to
        ``pd.Timestamp.today()``; pass a fixed date for reproducible features.

    Returns
    -------
    pd.DataFrame
        New frame with the transformed and added columns and without ``todrop``.

    Raises
    ------
    KeyError
        If a column named in ``todrop`` is missing from ``dat``.
    '''
    todrop = ['id', # id is random
              'member_id', # all null
              'url', # all null
              'desc', # all null
              'title', # duplicate of purpose
              'grade', # duplicate of sub_grade
              'emp_title', # re-engineered into emp_title_* flags, cardinality too high
              'zip_code' # cardinality too high
             ]

    def wrangle_sub_grade(x):
        '''Transform sub_grade from "A1" - "G5" to 1.1 - 7.5'''
        first_digit = ord(x[0]) - 64  # ord('A') == 65, so 'A' -> 1 ... 'G' -> 7
        second_digit = int(x[1])
        return first_digit + second_digit/10

    if reference_date is None:
        as_of = pd.Timestamp.today()
    else:
        as_of = pd.Timestamp(reference_date)

    # Lower-case BEFORE matching: the keyword flags below must be built from
    # this series, otherwise titles such as "Teacher" or "MANAGER" never match.
    emp_title_lower = dat.emp_title.str.lower()

    assigns = {
        # sub_grade to ordinal
        'sub_grade': dat.sub_grade.apply(wrangle_sub_grade),
        # Convert percentages from strings to floats
        **{name: dat[name].str.strip('%').astype(float)
           for name in ['int_rate', 'revol_util']},
        # Transform earliest_cr_line to an integer: how many days it's been open.
        # (infer_datetime_format is deprecated since pandas 2.0, so it is not passed.)
        'earliest_cr_line': (as_of - pd.to_datetime(dat.earliest_cr_line)).dt.days,
        # Create features for three employee titles: teacher, manager, owner
        **{'emp_title_' + name: emp_title_lower.str.contains(name, na=False)
           for name in ['teacher', 'manager', 'owner']},
    }

    return dat.assign(**assigns).drop(columns=todrop)
    
# Re-read the raw CSV (so this cell is idempotent even though it rebinds `df`)
# and clean a 2,000-row random subset for fast iteration.
# random_state pins the sample so Restart & Run All reproduces the same rows.
df = clean(pd.read_csv('train_features.csv').sample(2000, random_state=42))

df.head()

Unnamed: 0,loan_amnt,funded_amnt,term,int_rate,installment,sub_grade,emp_length,home_ownership,annual_inc,purpose,...,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,disbursement_method,emp_title_teacher,emp_title_manager,emp_title_owner
30040,12000,12000,60 months,11.99,266.88,2.5,10+ years,MORTGAGE,47700.0,home_improvement,...,,,,,,,Cash,False,False,False
9613,5000,5000,36 months,10.41,162.31,2.3,1 year,MORTGAGE,60000.0,debt_consolidation,...,,,,,,,Cash,False,False,False
27913,9500,9500,36 months,10.91,310.62,2.4,10+ years,MORTGAGE,120000.0,debt_consolidation,...,,,,,,,Cash,False,False,False
9959,15000,15000,36 months,15.04,520.28,3.4,10+ years,MORTGAGE,19000.0,debt_consolidation,...,64.9,2.0,9.0,0.0,0.0,,Cash,False,False,False
10598,40000,40000,60 months,11.55,880.71,2.4,10+ years,MORTGAGE,120000.0,small_business,...,,,,,,,Cash,False,False,False
