In [3]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# from HelperFunctions import minibatch 
%reload_ext autoreload
%autoreload 2
from HelperFunctions import minibatch, dummify_columns, undummify, feature_standardize, label_encode_column, columns_of_type

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn import metrics
from sklearn.metrics import silhouette_score

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [None]:
# Estimator classes, grouped by the library they come from.
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import (
    RandomForestRegressor,
    RandomForestClassifier,
    GradientBoostingRegressor,
    AdaBoostRegressor,
)
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# One default-configured instance of each regressor.
randomForest = RandomForestRegressor()
gbm = GradientBoostingRegressor()
abr = AdaBoostRegressor()
xgb = XGBRegressor()
lgb = LGBMRegressor()

# Helper Function

In [None]:
def adjust_prob(unsampled_df,sampled_df,sampled_prob,sampled_frac=0.5): # sampled_df, sampled_prob can also be test_df, test_prob
    """Map probabilities predicted on re-sampled data back to the original class balance.

    The per-grade default rate is measured on ``unsampled_df`` and used as the
    prior for every row of ``sampled_df``. Each predicted probability is then
    corrected with the prior-shift formula

        beta = (rate / (1 - rate)) * ((1 - sampled_frac) / sampled_frac)
        p    = beta * ps / ((beta - 1) * ps + 1)

    which is algebraically the same as
    ``1/(1+(1/rate-1)/(1/sampled_frac-1)*(1/ps-1))``. The previous code used
    ``beta = sampled_frac / (1 - rate)``, which only agrees with this formula
    when ``rate == 0.5``; with it, an uninformative score of 0.5 was not mapped
    back to the grade's base rate.

    Parameters
    ----------
    unsampled_df : pandas.DataFrame
        Data before re-sampling; must contain ``'grade'`` and ``'loan_status'``.
        A row counts as a default when ``loan_status == 'Default'``.
    sampled_df : pandas.DataFrame
        Rows that were scored; must contain ``'grade'``. It is NOT modified
        (the earlier version reset its index and added a ``default_rate``
        column in place).
    sampled_prob : array-like, shape (n_rows, n_classes)
        Predicted probabilities; column 0 is used.
        NOTE(review): presumably column 0 is the 'Default' class - confirm
        against the fitted model's ``classes_``.
    sampled_frac : float, default 0.5
        Fraction of defaults in the re-sampled training data (0 < value < 1).

    Returns
    -------
    pandas.DataFrame
        Columns ``actual_prob`` (corrected) and ``downsampled_prob`` (input
        column 0), on a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``sampled_frac`` is not strictly between 0 and 1, or the number of
        probabilities differs from the number of rows in ``sampled_df``.
    """
    if not 0 < sampled_frac < 1:
        raise ValueError('sampled_frac must be strictly between 0 and 1')

    # find actual default rate for each grade in the un-sampled population
    grade_rate=unsampled_df.groupby('grade')['loan_status'].apply(
        lambda status:(status=='Default').sum()/status.count())

    # per-row prior: the default rate of the row's grade (NaN for unseen grades)
    default_rate=sampled_df['grade'].map(grade_rate.to_dict()).to_numpy(dtype=float)
    downsampled_prob=np.asarray(sampled_prob,dtype=float)[:,0]
    if len(default_rate)!=len(downsampled_prob):
        raise ValueError('sampled_df and sampled_prob must have the same number of rows')

    # vectorised prior-shift correction (replaces the row-by-row iterrows loop)
    beta=(default_rate/(1-default_rate))*((1-sampled_frac)/sampled_frac)
    actual_prob=beta*downsampled_prob/((beta-1)*downsampled_prob+1)

    return pd.DataFrame({'actual_prob':actual_prob,
                         'downsampled_prob':downsampled_prob})

In [4]:
# Read the down-sampled modelling frame (df) and the frame from before
# down-sampling (pre_df); both use the 'id' column as their index.
df, pre_df = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('down_sampled_df_v2.csv', 'pre_downsample_df.csv')
)

In [None]:
# Ordinal weight per loan grade, used later as the sample weight of the custom scorer.
# The previous mapping skipped grade 'F', so every F loan received a NaN weight.
# NOTE(review): assumes the intended scale is A=1 ... G=7 - confirm.
grade_weight_dict={'A':1,
                  'B':2,
                  'C':3,
                  'D':4,
                  'E':5,
                  'F':6,
                  'G':7}
df['weight']=df['grade'].map(grade_weight_dict)

# Fail loudly instead of silently carrying NaN weights into the scorer.
unmapped_grades=df.loc[df['weight'].isna(),'grade'].unique()
if len(unmapped_grades)>0:
    raise ValueError('no weight defined for grade(s): {}'.format(list(unmapped_grades)))

In [None]:
# Columns that must not reach the model.
# 'Unnamed: 0' is the serialised row counter visible in the data preview; it is
# listed here so it cannot leak into the features when present.
# NOTE(review): confirm 'Unnamed: 0' is not needed downstream.
drop_list=['sub_grade','issue_d','zip_code','RANDOM','Unnamed: 0']
# Re-bind instead of dropping in place and ignore already-missing columns, so
# re-running this cell does not raise KeyError.
df=df.drop(columns=drop_list,errors='ignore')

In [5]:
# Target: the loan status column.
y_train = df['loan_status']
# Features: every remaining column except the target and 'return_rate'.
x_train = df.drop(columns=['loan_status', 'return_rate'])

Unnamed: 0,id,funded_amnt,term,int_rate,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,...,mort_frac,card_frac,active_card_frac,active_revol_frac,active_install_frac,open_revol_frac,good_acc_frac,loan_duration,return_rate,RANDOM
0,65251140,25000.0,36,7.89,A,A5,1,MORTGAGE,94000.0,Source Verified,...,0.1,0.578947,0.363636,0.368421,-999.0,0.578947,0.5,34.5,0.040767,0.220221
1,62519711,14000.0,36,6.89,A,A3,3,RENT,45000.0,Not Verified,...,0.222222,0.857143,0.5,0.428571,-999.0,0.428571,0.333333,37.566667,0.033656,0.008344
2,50646937,5000.0,36,5.32,A,A1,5,MORTGAGE,80000.0,Not Verified,...,0.03125,0.65,0.538462,0.45,-999.0,0.75,0.65625,34.5,0.027468,0.412407
3,16442318,6500.0,36,7.69,A,A4,7,MORTGAGE,61000.0,Not Verified,...,0.133333,0.5,0.666667,0.5,-999.0,0.666667,0.6,36.533333,0.037775,0.562369
4,49924755,16000.0,36,5.32,A,A1,1,MORTGAGE,98000.0,Source Verified,...,0.119048,0.533333,0.1875,0.166667,-999.0,0.433333,0.333333,36.533333,0.025984,0.941376


In [None]:
# Split the feature columns by kind using the HelperFunctions.columns_of_type helper:
# 'string' columns go to cat_list, 'number' columns go to cont_list.
cat_list, cont_list = (
    columns_of_type(x_train, column_kind) for column_kind in ('string', 'number')
)

# Label Encode ALL Categoricals

In [None]:
# Encode the columns listed in cat_list with the label_encode_column helper from HelperFunctions.
# NOTE(review): presumably returns a new frame and leaves x_train untouched - confirm in HelperFunctions.
label_encoded_df=label_encode_column(x_train,cat_list)

# Standardize ALL Features

In [None]:
# Run feature_standardize on every column (axis=0); scaleType is forwarded to
# the helper through DataFrame.apply's keyword pass-through.
final_train_df = label_encoded_df.apply(feature_standardize, axis=0, scaleType='standardize')

# CUSTOM SCORING FUNCTION

In [None]:
# `sklearn.metric` does not exist and `sklearn.metrics.scorer` was a private
# module that has been removed; both names are public in `sklearn.metrics`.
from sklearn.metrics import accuracy_score, make_scorer

def grade_weighted_accuracy(y_true,y_pred,sample_weight=None):
    """Accuracy in which every observation counts proportionally to its weight.

    Parameters
    ----------
    y_true : array-like
        True labels. During cross-validation this is only the validation fold.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.
    sample_weight : array-like or pandas.Series, optional
        Per-observation weights. When both ``sample_weight`` and ``y_true``
        are pandas Series, the weights are selected by ``y_true``'s index so a
        full-length weight Series also works on a fold subset.
        NOTE(review): this relies on the index labels being unique - confirm
        for the down-sampled frame.

    Returns
    -------
    float
        Weighted fraction of correctly classified observations.
    """
    if isinstance(sample_weight,pd.Series) and isinstance(y_true,pd.Series):
        # align the weights with the rows actually being scored
        sample_weight=sample_weight.loc[y_true.index]
    # sample_weight has to be passed by keyword: a positional argument after
    # `normalize=True` is a SyntaxError.
    return accuracy_score(y_true,y_pred,normalize=True,sample_weight=sample_weight)

# `needs_proba=False` was the default and the argument is deprecated in recent
# scikit-learn releases, so it is no longer passed.
grade_weighted_scorer = make_scorer(grade_weighted_accuracy, sample_weight=df['weight'])

In [None]:
# Hyper-parameter grid for the random forest classifier.
grid_para_forest = {
    'n_estimators': range(20, 80, 7),
    'max_depth':[2,3],
    'max_features':[1, 2],
    'min_samples_leaf':[1,2],
    'min_samples_split':[2,3]
}
# RandomForestClassifier is imported by name; the `ensemble` module itself is
# never imported, so `ensemble.RandomForestClassifier` raised NameError.
grid_search_forest = GridSearchCV(RandomForestClassifier(class_weight='balanced'),
                                  grid_para_forest, cv=3,  n_jobs=-1,
                                  scoring=grade_weighted_scorer,return_train_score=False)
# Fit on the label-encoded, standardised frame: x_train still holds the raw
# string categoricals, and the following cell predicts on final_train_df, so
# training and prediction must use the same representation.
grid_search_forest.fit(final_train_df,y_train)
grid_search_forest.predict(final_train_df)





In [None]:
# Reference labels for evaluation.
y_true = y_train
# In-sample predictions of the refitted best estimator on the processed features.
y_pred = grid_search_forest.predict(final_train_df)

In [None]:
# Report the winning hyper-parameters and their mean cross-validated score.
print('best parameters:', grid_search_forest.best_params_)
print('best score:', grid_search_forest.best_score_)
# `confusion_matrix`, `x` and `y` were never defined in this notebook; use the
# already imported `metrics` module and the y_true / y_pred computed above.
metrics.confusion_matrix(y_true, y_pred)

In [None]:
# K-means clustering performed on categorical variables
# perform for train (non-test) set
# NOTE(review): `nominal_var_processed`, `x` and `y` are not defined in the
# visible cells of this notebook - confirm where they come from.
silhouette_threshold=0.2  # best silhouette below this -> treat the variable as a single cluster
nominal_var_cluster_num=[]
nominal_var_cluster_num_test=[]  # never filled in this cell; printed empty below
for cat_var in nominal_var_processed:
    # dummifying the nominal categorical variable
    dummified_column=pd.get_dummies(x.loc[:,cat_var],prefix=cat_var, prefix_sep='__')
    # `sort` must be the boolean False; the string 'False' is truthy and is
    # rejected by recent pandas versions.
    df_for_cluster=pd.concat([y,dummified_column],axis=1,sort=False)
    print(cat_var)
    # Finding the optimal number of clusters and storing into nominal_var_cluster_num
    kmax = df_for_cluster.shape[1]
    # dissimilarity would not be defined for a single cluster, thus, minimum number of clusters should be 2
    KNumberChoice=range(2, kmax)
    # Doing multiple trials for each category
    cluster_count=[]
    for trial_num in range(0,5):
        sil = []
        for k in KNumberChoice:
            kmeans = KMeans(n_clusters = k,random_state=trial_num,init='k-means++').fit(df_for_cluster)
            labels = kmeans.labels_
            sil.append(silhouette_score(df_for_cluster, labels, metric = 'euclidean'))
        # The threshold applies to the best silhouette SCORE. The previous code
        # compared np.argmax (an index) with 0.2, which only fell back to one
        # cluster when k=2 happened to be the best choice. An empty candidate
        # range (fewer than two dummy columns) also falls back to one cluster.
        if not sil or np.max(sil)<silhouette_threshold: # defining threshold for 1 cluster
            cluster_count.append(1)
        else:
            cluster_count.append(KNumberChoice[np.argmax(sil)])
    # keep the most frequent choice over the trials
    nominal_var_cluster_num.append(max(set(cluster_count), key=cluster_count.count))
print(nominal_var_cluster_num)
print(nominal_var_cluster_num_test)

In [None]:
# With the known cluster number for each categorical variable, perform the clustering
# perform for train (non-test) set
# NOTE(review): this cell overwrites columns of `x` in place, so running it a
# second time clusters the cluster labels themselves. `x_test` / `y_test` are
# not defined in the visible cells - confirm where they come from.
for ind,cat_var in enumerate(nominal_var_processed):
    dummified_column=pd.get_dummies(x.loc[:,cat_var],prefix=cat_var, prefix_sep='__')
    # sort=False keeps the row order of `x`; the labels are written back by
    # position below, so the rows must not be re-ordered. The string 'False'
    # used before is truthy and is rejected by recent pandas versions.
    df_for_cluster=pd.concat([y,dummified_column],axis=1,sort=False)
    # fixed seed so the cluster labels are reproducible across runs
    kmeans=KMeans(n_clusters=nominal_var_cluster_num[ind],random_state=0).fit(df_for_cluster)
    x.loc[:,cat_var]=kmeans.labels_

clustered_df=pd.concat([y,x],axis=1,sort=False)
clustered_df_test=pd.concat([y_test,x_test],axis=1,sort=False)
print(clustered_df.shape)
print(clustered_df_test.shape)

In [None]:
# Keep only the columns that hold at least one non-zero value.
nonzero_column_mask = clustered_df.ne(0).any(axis=0)
undummified_clustered_df = clustered_df.loc[:, nonzero_column_mask]
undummified_clustered_df.shape

In [None]:
# ---- Training (non-test) data only ----
# Flag the rows that belong to rare sub-categories: for every non-continuous
# column whose sum is below 2% of the row count, record which rows equal 1.
boolmatrix = []
low_count_cutoff = 0.020 * dummified_clustered_df.shape[0]
is_continuous = dummified_clustered_df.columns.isin(cont_var_for_tuning)
non_cont_columns = dummified_clustered_df.columns[~is_continuous]
for cat_feature in non_cont_columns:
    feature_values = dummified_clustered_df.loc[:, cat_feature]
    if not (feature_values.sum() < low_count_cutoff):
        continue  # frequent enough, nothing to flag
    print('yes')
    boolvec = (feature_values == 1)
    boolmatrix.append(list(boolvec))
   

In [None]:
# ---- Training (non-test) data only ----
# Drop every row flagged in boolmatrix, then drop the columns that became all-zero.
if not boolmatrix:
    # nothing was flagged: keep the frame as it is
    purged_dummified_clustered_df = dummified_clustered_df
else:
    # a row is removed when it is flagged by at least one rare sub-category
    reduction_bool_vec = np.array(boolmatrix).any(axis=0)
    temp_df = dummified_clustered_df[~reduction_bool_vec]
    still_populated = temp_df.ne(0).any(axis=0)
    purged_dummified_clustered_df = temp_df.loc[:, still_populated]
print(purged_dummified_clustered_df.shape)
print(dummified_clustered_df.shape)

In [None]:
# Removing from the test set the columns that were removed from the training set.
# The shared columns are kept in the training frame's column order: iterating a
# set gave a different column order from run to run. 'SalePrice' is excluded
# here because it is appended explicitly to the training frame below.
test_columns=set(dummified_clustered_df_test.columns)
cols_to_keep=[col for col in purged_dummified_clustered_df.columns
              if col in test_columns and col!='SalePrice']

purged_dummified_clustered_df_test=dummified_clustered_df_test.loc[:,cols_to_keep]
# Select from the PURGED training frame. The previous code selected from
# dummified_clustered_df, which silently restored the low-count observations
# removed in the cell above.
purged_dummified_clustered_df=purged_dummified_clustered_df.loc[:,cols_to_keep+['SalePrice']]

# purged_dummified_clustered_df.reset_index(inplace=True)
# purged_dummified_clustered_df.set_index('Id')
# purged_dummified_clustered_df_test.reset_index(inplace=True)
# purged_dummified_clustered_df_test.set_index('Id')
# purged_dummified_clustered_df

# purged_dummified_clustered_df.sample(20)

In [None]:
def undummify(dataframe):
    """Collapse ``<prefix>__<label>`` dummy columns back into one column per prefix.

    Every column whose name contains ``'__'`` is treated as a dummy column; the
    text after the last ``'__'`` must be an integer label. For each prefix the
    result holds ``sum(dummy_value * label) + 1`` per row, as before.

    Differences from the previous implementation:
    - columns are grouped by their exact prefix; the old substring test
      (``col in name``) also pulled in columns of any other variable whose
      name merely contained the prefix;
    - the input frame is only read, never written to (the old code assigned
      into a slice copy);
    - the rebuilt columns appear in first-seen prefix order rather than in
      set-iteration order.

    NOTE(review): the notebook also imports a function with this name from
    HelperFunctions; this definition shadows it once the cell is run.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing dummy columns and, optionally, other columns.

    Returns
    -------
    pandas.DataFrame
        The non-dummy columns followed by one rebuilt column per prefix.

    Raises
    ------
    ValueError
        If the text after the last ``'__'`` of a dummy column is not an integer.
    """
    separator='__'
    cat_col=[name for name in dataframe.columns
             if isinstance(name,str) and separator in name]

    # prefix -> [(column name, integer label), ...], in first-seen order
    members_by_prefix={}
    for name in cat_col:
        prefix,_,label_text=name.rpartition(separator)
        members_by_prefix.setdefault(prefix,[]).append((name,int(label_text)))

    cat_dict={}
    for prefix,members in members_by_prefix.items():
        member_cols=[name for name,_ in members]
        label_nums=np.array([label for _,label in members])
        # scale each dummy by its label, sum across the group, shift by one
        weighted=dataframe[member_cols].to_numpy()*label_nums
        cat_dict[prefix]=pd.Series(weighted.sum(axis=1)+1,index=dataframe.index)

    df1=dataframe.drop(cat_col,axis=1)
    df2=pd.DataFrame(cat_dict,index=dataframe.index)
    return pd.concat([df1,df2],axis=1)

In [11]:
import numpy as np
from kmodes.kmodes import KModes

# Synthetic categorical data: 100 observations x 10 features, each an integer code in [0, 20)
data = np.random.choice(20, size=(100, 10))

# k-modes with 4 clusters, 'Huang' initialisation, 5 restarts and a verbose progress log
km = KModes(
    n_clusters=4,
    init='Huang',
    n_init=5,
    verbose=1,
)

# Fit the model and keep the cluster label assigned to each observation
clusters = km.fit_predict(data)

# Print the cluster centroids
print(km.cluster_centroids_)

[autoreload of HelperFunctions failed: Traceback (most recent call last):
  File "/home/auscheng/anaconda3/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/home/auscheng/anaconda3/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 450, in superreload
    update_generic(old_obj, new_obj)
  File "/home/auscheng/anaconda3/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 387, in update_generic
    update(a, b)
  File "/home/auscheng/anaconda3/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 357, in update_class
    update_instances(old, new)
  File "/home/auscheng/anaconda3/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 317, in update_instances
    update_instances(old, new, obj, visited)
  File "/home/auscheng/anaconda3/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 317, in update_instances
    update_instan

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 17, cost: 803.0
Run 1, iteration: 2/100, moves: 10, cost: 799.0
Run 1, iteration: 3/100, moves: 4, cost: 799.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 19, cost: 794.0
Run 2, iteration: 2/100, moves: 3, cost: 794.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 21, cost: 806.0
Run 3, iteration: 2/100, moves: 7, cost: 806.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 21, cost: 801.0
Run 4, iteration: 2/100, moves: 5, cost: 798.0
Run 4, iteration: 3/100, moves: 1, cost: 797.0
Run 4, iteration: 4/100, moves: 0, cost: 797.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 20, cost: 796.0
Run 5, iteration: 2/100, moves

In [12]:
# Show the repr of the KModes estimator created and fitted in the cell above.
km

array([[13,  2,  5, 19,  1, 19,  7,  6,  0,  7],
       [ 0,  1,  0, 16, 19,  4, 10,  9,  5,  4],
       [14,  5, 19,  7,  5,  1,  1,  9,  4,  2],
       [14, 13, 14, 16, 14, 15,  2,  4, 15,  0],
       [ 6,  0, 19,  0,  8, 15, 12,  3,  3,  2],
       [ 1,  7, 18,  1, 17,  7,  8,  0, 11,  8],
       [15,  6, 15, 15,  2,  4,  1,  2,  3,  5],
       [ 6,  7,  4,  2,  5,  5,  7, 14,  5,  1],
       [13, 14,  2, 10, 17, 17, 18,  7,  9,  7],
       [ 7,  1, 14,  9, 14,  4, 18,  7,  0, 13],
       [16,  8,  3,  7, 14,  1,  8, 19,  2, 17],
       [13, 14,  2,  2, 10,  8, 10, 18,  6,  5],
       [ 3,  6, 17,  8, 12, 11,  3, 12,  4,  0],
       [13,  3, 11,  0, 12,  2,  8, 12, 18, 19],
       [ 4, 12,  3,  2,  4, 18,  8,  6,  6,  2],
       [14,  5, 16, 18,  0, 11, 14,  6, 17, 19],
       [ 7, 12, 13, 16, 19, 14, 11, 13, 17,  3],
       [ 2,  3,  8, 13,  9,  2,  7,  8, 19, 17],
       [19,  3, 17,  8, 17,  5,  6,  5,  7,  4],
       [17, 18, 17,  9,  6, 19, 11, 14, 16, 12],
       [ 3,  5, 15, 