#### Module Imports

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

%reload_ext autoreload
%autoreload 
from HelperFunctions import missingValuesInfo

from sklearn.impute import SimpleImputer
from fancyimpute import KNN
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

Using TensorFlow backend.


#### Helper Function

In [2]:
def imputer_score(imputer, estimator, x_miss, y):
    """Score an imputation strategy by cross-validated mean squared error.

    The imputer is fitted on, and applied to, the whole of ``x_miss`` in a
    single ``fit_transform`` call. The imputed features are then passed to
    ``cross_val_score`` with ``estimator`` and ``y`` using 5 folds.

    Note: because imputation happens before the folds are formed, the
    imputer is fitted on rows that are later used for validation.

    Parameters
    ----------
    imputer : object providing ``fit_transform``
    estimator : model forwarded unchanged to ``cross_val_score``
    x_miss : feature data handed to the imputer
    y : target values forwarded to ``cross_val_score``

    Returns
    -------
    The mean of the fold scores with its sign flipped, so the
    ``neg_mean_squared_error`` scores are reported as a positive error.
    """
    filled_features = imputer.fit_transform(x_miss)
    fold_scores = cross_val_score(
        estimator,
        filled_features,
        y,
        scoring='neg_mean_squared_error',
        cv=5,
    )
    # Fold scores are negated errors; undo the negation for reporting.
    return -fold_scores.mean()

# Import Data

In [3]:
# Load the loan data; the 'id' column becomes the row index.
df = pd.read_csv(
    'lean_df_5.csv',
    index_col='id',
)

  mask |= (ar1 == a)


# Variable Type Definition

In [4]:
# One sub-frame per dtype family, selected from the loaded frame.
(df_number, df_object, df_category,
 df_boolean, df_datetime, df_timedelta) = (
    df.select_dtypes(include=dtype_family)
    for dtype_family in ('number', 'object', 'category',
                         'bool', 'datetime', 'timedelta')
)

# Column-name lists per variable type.
# NOTE: ordinal_var and continuous_var are both built from every numeric
# column, so the two lists are identical at this point.
nominal_var = df_object.columns.tolist()
ordinal_var = df_number.columns.tolist()
continuous_var = df_number.columns.tolist()
time_var = df_datetime.columns.tolist()

# Instantiate Imputers

In [5]:
# Column-statistic imputers: NaN entries are replaced by the column's
# mean, median, or most frequent value.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
imp_median = SimpleImputer(strategy='median', missing_values=np.nan)
imp_mode = SimpleImputer(strategy="most_frequent", missing_values=np.nan)

# Constant imputers: the string 'none', or the numeric sentinel -999
# (despite the `_n1` suffix, the fill value is -999).
imp_const = SimpleImputer(
    strategy="constant",
    fill_value='none',
    missing_values=np.nan,
)
imp_const_n1 = SimpleImputer(
    strategy="constant",
    fill_value=-999,
    missing_values=np.nan,
)

# fancyimpute KNN imputer configured with k=3.
imp_KNN = KNN(k=3)
# imp_MICE = IterativeImputer(random_state=0)


In [6]:
# Summarise missing values per column with the project helper, then show
# the resulting table.
missingdf = missingValuesInfo(df)
missingdf

Unnamed: 0,Total,Percent
mths_since_last_derog_record,1899411,84.11
mths_since_last_major_derog,1678024,74.31
mths_since_last_delinq,1157161,51.24
install_util,1067769,47.28
active_install_frac,909200,40.26
all_util,865694,38.33
inq_last_12m,865477,38.33
emp_length,146661,6.49
active_card_frac,75835,3.36
pct_acc_nvr_dlq,70317,3.11


# Impute variables where a missing value may mean 0 with zero.
# Impute non-existent observations with -999.

In [7]:
# Names of the columns reported in the missing-value summary (they form
# the index of `missingdf`).
missinglist = list(missingdf.index)

# Columns whose missing entries are to be replaced by 0.
# 'pub_rec_bankruptcies' used to appear twice in this list; the duplicate
# is removed. Filling a column is idempotent, so results are unchanged.
impute_with_zero_list = [
    'pub_rec_bankruptcies', 'tax_liens',
    'delinq_amnt', 'total_acc', 'derog_records',
    'delinq_2yrs', 'collections_12_mths_ex_med',
    'chargeoff_within_12_mths', 'dti',
]

# Every other column with missing values receives the -999 sentinel.
# Filtering `missinglist` (rather than taking a set difference) keeps the
# order of the summary table, so the list is the same on every run.
impute_with_neg999 = [
    column for column in missinglist
    if column not in impute_with_zero_list
]

In [8]:
# Replace the missing entries of each zero-fill column with 0, in place.
for column in impute_with_zero_list:
    df.loc[df[column].isna(), column] = 0

In [9]:
# Replace the missing entries of each remaining column with the -999
# sentinel, in place.
# NOTE(review): emp_length is in the missing-value summary and holds text
# such as '10+ years', so it presumably receives a numeric -999 here --
# confirm that mixing types in that column is intended.
for column in impute_with_neg999:
    df.loc[df[column].isna(), column] = -999

In [10]:
# Re-run the missing-value summary on the frame after both fill steps.
missingValuesInfo(df)

Unnamed: 0,Total,Percent


# Only look at completed loans

In [11]:
# Keep every row whose loan_status differs from 'Current'.
final_df = df.loc[df['loan_status'] != 'Current']

In [12]:
# Write final_df (the rows whose loan_status is not 'Current') to CSV.
final_df.to_csv('pre_downsample_df.csv')

In [13]:
# Count the rows per loan_status value in final_df.
final_df['loan_status'].value_counts()

Fully Paid    1078739
Default        267035
Name: loan_status, dtype: int64

# Take out the Current loans as the holdout dataset

In [14]:
# Rows whose loan_status is exactly 'Current' form the holdout frame.
holdout_df = df.loc[df['loan_status'] == 'Current']

In [15]:
# Preview the first rows of the holdout frame.
holdout_df.head()

Unnamed: 0_level_0,funded_amnt,term,int_rate,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,purpose,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,mths_since_last_delinq,mths_since_last_derog_record,derog_records,revol_util,total_acc,initial_list_status,collections_12_mths_ex_med,mths_since_last_major_derog,application_type,acc_now_delinq,collection_amt,install_util,all_util,rev_credit,inq_last_12m,chargeoff_within_12_mths,delinq_amnt,mths_since_RecentAcc_opened,all_accs_120days+_PastDue_ever,accs_90days+_PastDue_24m,accs_opened_past_12m,pct_acc_nvr_dlq,pub_rec_bankruptcies,tax_liens,total_credit,install_credit,fico,Outstanding_mortgage_debt,revol_frac,install_frac,mort_frac,card_frac,active_card_frac,active_revol_frac,active_install_frac,open_revol_frac,good_acc_frac,loan_duration,profit,RANDOM
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1
66310712,35000.0,60,14.85,C,C5,10+ years,MORTGAGE,110000.0,Source Verified,2015-12-01,Current,debt_consolidation,076xx,NJ,17.06,0.0,2008-09-01,-999.0,-999.0,0.0,11.6,17.0,w,0.0,-999.0,Individual,0.0,0.0,70.0,45.0,67300.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,100.0,0.0,0.0,381215.0,18000.0,787.0,249274.0,0.764706,0.117647,0.058824,0.769231,0.4,0.384615,0.5,0.769231,0.764706,38.6,31464.01,0.050002
68356421,22400.0,60,12.88,C,C2,6 years,MORTGAGE,95000.0,Not Verified,2015-12-01,Current,debt_consolidation,290xx,SC,22.98,0.0,1995-04-01,54.0,-999.0,0.0,32.7,35.0,w,0.0,54.0,Individual,0.0,0.0,67.0,54.0,39800.0,0.0,0.0,0.0,1.0,2.0,0.0,3.0,97.0,0.0,0.0,436841.0,191682.0,712.0,191862.0,0.428571,0.428571,0.142857,0.333333,0.6,0.533333,0.333333,0.666667,0.457143,38.6,19275.33,0.887323
68426545,16000.0,60,12.88,C,C2,1 year,MORTGAGE,70000.0,Not Verified,2015-12-01,Current,debt_consolidation,786xx,TX,26.4,0.0,1988-02-01,-999.0,-999.0,0.0,56.3,29.0,w,0.0,-999.0,Individual,0.0,0.0,74.0,64.0,51000.0,1.0,0.0,0.0,9.0,0.0,0.0,1.0,100.0,0.0,0.0,309638.0,45838.0,722.0,203429.0,0.724138,0.206897,0.068966,0.571429,0.333333,0.285714,0.333333,0.47619,0.448276,38.6,13768.04,0.491469
68506798,23000.0,60,8.49,B,B1,5 years,RENT,64000.0,Not Verified,2015-12-01,Current,credit_card,117xx,NY,18.28,0.0,2001-09-01,29.0,-999.0,0.0,52.7,33.0,w,0.0,-999.0,Individual,0.0,0.0,84.0,68.0,47300.0,1.0,0.0,0.0,8.0,0.0,0.0,2.0,87.9,0.0,0.0,93962.0,46662.0,702.0,0.0,0.575758,0.424242,0.0,0.421053,0.75,0.421053,0.5,0.736842,0.636364,38.6,17900.14,0.423722
68537655,16800.0,60,12.88,C,C2,10+ years,MORTGAGE,118000.0,Not Verified,2015-12-01,Current,debt_consolidation,636xx,MO,34.29,0.0,1997-06-01,35.0,57.0,1.0,24.1,46.0,w,0.0,67.0,Individual,0.0,0.0,67.0,53.0,32600.0,1.0,0.0,0.0,11.0,12.0,0.0,1.0,64.4,0.0,0.0,412771.0,115941.0,682.0,238985.0,0.608696,0.26087,0.086957,0.785714,0.136364,0.107143,0.333333,0.178571,0.23913,38.6,14456.69,0.459798


In [16]:
# Write the holdout frame (the 'Current' loans) to CSV.
holdout_df.to_csv('holdout_df.csv')

In [17]:
# df[continuous_var]=imp_mean.fit_transform(df[continuous_var])
# df[ordinal_var]=imp_mean.fit_transform(df[ordinal_var])
# df[nominal_var]=imp_const_n1.fit_transform(df[nominal_var])
# df[continuous_var]=imp_KNN.fit_transform(df[continuous_var])
# df[continuous_var]=imp_MICE.fit_transform(df[continuous_var])

In [18]:
# x=df_number.drop('SalePrice',axis=1)
# y=df_number.SalePrice
# regressor=RandomForestRegressor(n_estimators=100,random_state=0)
# classifier=RandomForestClassifier(n_estimators=100,random_state=0)
# imputer_score(imp_MICE,regressor,x,y)

In [19]:
# from sklearn.metrics import accuracy_score, log_loss
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC, LinearSVC, NuSVC
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# classifiers = [
#     KNeighborsClassifier(3),
#     SVC(kernel="rbf", C=0.025, probability=True),
#     NuSVC(probability=True),
#     DecisionTreeClassifier(),
#     RandomForestClassifier(),
#     AdaBoostClassifier(),
#     GradientBoostingClassifier()
#     ]
# for classifier in classifiers:
#     pipe = Pipeline(steps=[('preprocessor', preprocessor),
#                       ('classifier', classifier)])
#     pipe.fit(X_train, y_train)   
#     print(classifier)
#     print("model score: %.3f" % pipe.score(X_test, y_test))

In [20]:
# from sklearn.impute import MissingIndicator
# x=np.array([[np.nan, 1, 3],[4, 0, np.nan],[8, 1, 0]])
# indicator = MissingIndicator()
# indicator.fit_transform(x)  