In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides any deprecation warnings raised by the
# libraries used below.
warnings.filterwarnings("ignore")

# https://towardsdatascience.com/financial-data-analysis-bf4b5e78c45c
# https://towardsdatascience.com/financial-data-analysis-51e7275d0ae

In [None]:
# Load the five LoanStats CSV exports. skiprows=[0] discards the first physical
# line of each file, so the second line is used as the header row.
loan_stats_files = [
    'LoanStats_2017Q1.csv',
    'LoanStats_2017Q2.csv',
    'LoanStats_2017Q3.csv',
    'LoanStats_2014.csv',
    'LoanStats_2015.csv',
]
df1, df2, df3, df4, df5 = [pd.read_csv(path, skiprows=[0]) for path in loan_stats_files]

Compare the column names of the five source files side by side

In [None]:

# Stack the five column-name lists depth-wise. The stacked array has shape
# (1, n_columns, 5), so columns[0] is a table with one row per column position
# and one column per source file. This requires every file to have the same
# number of columns.
per_file_columns = tuple(list(frame.columns) for frame in (df1, df2, df3, df4, df5))
columns = np.dstack(per_file_columns)
coldf = pd.DataFrame(columns[0])

In [None]:
# Preview the first rows of the side-by-side column-name table.
coldf.head()

In [None]:
# Combine the five files into one frame. ignore_index=True gives the result a
# fresh 0..n-1 index; without it each file keeps the 0-based labels it was read
# with, so the combined index contains duplicate labels and label-based
# selection or alignment becomes ambiguous.
df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)
df.shape

In [None]:
# Print every column name of the combined frame as a plain list.
print(list(df.columns))

In [None]:
# Number of rows per loan_status value.
df.loan_status.value_counts()

In [None]:
# Keep only the rows whose loan_status is 'Fully Paid' or 'Charged Off'.
final_statuses = ['Fully Paid', 'Charged Off']
has_final_status = df['loan_status'].isin(final_statuses)
df = df.loc[has_final_status]
df.shape


In [None]:
# Drop sparsely populated columns: every column with at least 400000 missing
# values is removed from df.
# NOTE(review): this was described as a "90% missing" cut-off, but the
# threshold is an absolute count - confirm it matches the intended share of rows.
missing_df = (
    df.isnull()
      .sum(axis=0)
      .sort_values()
      .to_frame('missing_value')
      .reset_index()
)
too_sparse = missing_df.missing_value >= 400000
miss_4000 = missing_df.loc[too_sparse, 'index'].tolist()
print(len(miss_4000))
df.drop(miss_4000, axis=1, inplace=True)

In [None]:
# 53 columns were removed by the missing-value filter when this was run.
df.shape

In [None]:
# Identify constant-value columns.
def find_constant_features(dataFrame):
    """Return the names of columns that hold fewer than two distinct values.

    NaN counts as a distinct value here because ``Series.unique`` reports it,
    so an all-NaN column is constant while a column mixing one value with NaN
    is not.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    list
        Column labels, in frame order, with fewer than two unique values.
    """
    return [
        name
        for name in dataFrame.columns
        if dataFrame[name].unique().size < 2
    ]
# Detect the constant columns of the working frame and show their names.
const_features = find_constant_features(df)
print(const_features)

In [None]:
# Remove the constant columns from df in place.
df.drop(const_features, axis = 1, inplace = True)

In [None]:
# Shape after removing the constant columns.
df.shape

In [None]:
# Find duplicate features, i.e. columns holding exactly the same values.
def duplicate_columns(frame):
    """Return the names of columns whose values are repeated by a later column.

    Columns are compared only against other columns of the same dtype. A column
    is reported as soon as one later column in its dtype group has identical
    values, so dropping the returned names keeps the last copy of each set.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to scan.

    Returns
    -------
    list
        Names of the duplicated columns, collected across all dtype groups.

    Notes
    -----
    ``np.array_equal`` treats NaN as unequal to NaN, so two columns that
    contain NaN are not reported even if they match everywhere else.
    """
    groups = frame.columns.to_series().groupby(frame.dtypes).groups
    dups = []
    for _dtype, names in groups.items():
        same_dtype = frame[names]
        cs = same_dtype.columns
        # Materialise each column once instead of re-extracting it per comparison.
        arrays = [same_dtype.iloc[:, pos].values for pos in range(len(cs))]
        for i, ia in enumerate(arrays):
            for ja in arrays[i + 1:]:
                if np.array_equal(ia, ja):
                    dups.append(cs[i])
                    break
    # Return only after every dtype group has been scanned; returning inside
    # the loop would report duplicates for the first dtype group only.
    return dups
# Report the duplicated columns found in the working frame.
duplicate_cols = duplicate_columns(df)
print(duplicate_cols)

In [None]:
# NOTE: the drop below is commented out, so duplicated columns are only
# reported, not removed.
#  df.drop(duplicate_cols, axis = 1, inplace = True)
df.shape

In [None]:

# Drop duplicate rows from df in place.
df.drop_duplicates(inplace= True)


In [None]:
# Shape after removing duplicate rows.
df.shape

In [None]:
def plot_feature(col_name, isContinuous, data=None):
    """
    Visualize a variable with and without faceting on the loan status.

    Draws a 1x2 figure: the left panel shows the overall distribution of the
    variable, the right panel relates it to ``loan_status``.

    - col_name is the variable name in the dataframe
    - isContinuous is True if the variable is continuous, False otherwise
    - data is the dataframe to plot; when omitted the module-level ``df`` is used
    """
    frame = df if data is None else data
    f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12,3), dpi=90)

    # Plot without loan status
    if isContinuous:
        # NOTE(review): distplot is flagged as deprecated by recent seaborn
        # releases; it is kept so the histogram binning stays unchanged.
        sns.distplot(frame.loc[frame[col_name].notnull(), col_name], kde=False, ax=ax1)
    else:
        # Missing values are left out of the ordering: sorted() cannot compare
        # NaN with strings.
        levels = sorted(frame[col_name].dropna().unique())
        # Pass the column by keyword; positional data is not accepted by all
        # seaborn versions.
        sns.countplot(x=col_name, data=frame, order=levels, color='#5975A4', saturation=1, ax=ax1)
    ax1.set_xlabel(col_name)
    ax1.set_ylabel('Count')
    ax1.set_title(col_name)
    # Rotate the labels of this panel explicitly. plt.xticks() would act on the
    # current axes, which after plt.subplots is the right-hand panel.
    ax1.tick_params(axis='x', labelrotation=90)

    # Plot with loan status
    if isContinuous:
        sns.boxplot(x=col_name, y='loan_status', data=frame, ax=ax2)
        ax2.set_ylabel('')
        ax2.set_title(col_name + ' by Loan Status')
    else:
        # Share of each loan_status within every level of the variable.
        proportions = (
            frame.groupby(col_name)['loan_status']
            .value_counts(normalize=True)
            .to_frame('proportion')
            .reset_index()
        )
        sns.barplot(x = col_name, y = 'proportion', hue= "loan_status", data = proportions, saturation=1, ax=ax2)
        ax2.set_ylabel('Loan fraction')
        ax2.set_title('Loan status')
        ax2.tick_params(axis='x', labelrotation=90)
    ax2.set_xlabel(col_name)

    plt.tight_layout()


In [None]:
# Preview the first five rows of the first ten columns.
df.iloc[0:5, 0: 10]

In [None]:
# Count of distinct loan_amnt values (not displayed: only a cell's last
# expression is shown).
len(df.loan_amnt.value_counts())
# loan_amnt is plotted as a continuous variable.
plot_feature('loan_amnt', True)

In [None]:
# term is plotted as a categorical variable.
plot_feature('term', False)

In [None]:
# plot_feature('int_rate', True)

In [None]:
# plot_feature('grade', False)
# plot_feature('sub_grade', False)

In [None]:
# emp_title has a very large number of distinct values (128310 when this was
# run), which may not be strongly associated with the predicted loan_status,
# so it is queued for removal together with the id column.
len(df.emp_title.value_counts())
features_to_be_removed = ['emp_title', 'id']

In [None]:
df.iloc[0:5, 6: 20]
df.emp_length.value_counts()
# Normalise emp_length to an integer: missing values become 0 and, for text
# values, every run of non-digit characters is stripped before the cast.
# The result is assigned back to the column rather than using inplace=True on
# a column selection, because an in-place call on a selected column works on an
# intermediate object and is not guaranteed to update df.
# NOTE(review): with this pattern a value such as '< 1 year', if present,
# becomes 1 - confirm that is intended.
df['emp_length'] = (
    df['emp_length']
    .fillna(0)
    .replace(to_replace='[^0-9]+', value='', regex=True)
    .astype(int)
)

In [None]:
# emp_length (cast to int above) is plotted as a categorical variable.
plot_feature('emp_length', False)

In [None]:
# home_ownership is plotted as a categorical variable.
plot_feature('home_ownership', False)

In [None]:
# Number of rows per issue_d value.
df.issue_d.value_counts()

In [None]:
# Convert issue_d to its month part by stripping the "-<digits>" suffix.
# regex=True is required: pandas versions whose str.replace defaults to literal
# matching would otherwise look for the text r'-\d+' verbatim and leave every
# value unchanged.
df['issue_month'] = df['issue_d'].str.replace(r'-\d+', '', regex=True)
plot_feature('issue_month', False)

In [None]:
# This preview is not displayed because it is not the cell's last expression.
df.iloc[0:5, 6: 20]
# Queue the url, title and zip_code columns for removal.
features_to_be_removed.extend([ 'url' ,  'title' , 'zip_code'])

In [None]:
# Preview the first five rows of columns 6-19.
df.iloc[0:5, 6: 20]

In [None]:
# Queue the issue date, the months-since-last delinquency/record fields and
# the 6-month inquiry count for removal (each name listed once).
features_to_be_removed.extend(['issue_d', 'mths_since_last_delinq',
                               'mths_since_last_record', 'inq_last_6mths'])

In [None]:
# Queue the payment-total columns for removal.
# NOTE(review): presumably excluded because they are only known after the loan
# has been serviced - confirm.
features_to_be_removed.extend(['total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 
                               'total_rec_int', 'total_rec_late_fee'])

In [None]:
# Queue the recovery, last-payment, last-credit-pull and last-FICO columns.
features_to_be_removed.extend(['recoveries', 'collection_recovery_fee', 'last_pymnt_d', 
 'last_pymnt_amnt', 'last_credit_pull_d', 'last_fico_range_high', 
                               'last_fico_range_low', 'collections_12_mths_ex_med', 'mths_since_last_major_derog'])


In [None]:
# Queue the months-since-* account-age and recency columns.
features_to_be_removed.extend(['mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op', 
                               'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 
                               'mths_since_recent_bc', 'mths_since_recent_bc_dlq', 
                               'mths_since_recent_inq', 'mths_since_recent_revol_delinq', 
                               'num_accts_ever_120_pd'])

In [None]:

# Queue the num_* account-count columns.
features_to_be_removed.extend(['num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 
                               'num_bc_tl', 'num_il_tl', 'num_op_rev_tl', 
                               'num_rev_accts', 'num_rev_tl_bal_gt_0', 
                               'num_sats', 'num_tl_120dpd_2m'])

In [None]:

# Queue the remaining delinquency-count, percentage and credit-limit columns.
features_to_be_removed.extend(['num_tl_30dpd', 'num_tl_90g_dpd_24m', 'num_tl_op_past_12m', 
                               'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'tot_hi_cred_lim', 
                               'total_bal_ex_mort', 'total_bc_limit'])

In [None]:
features_to_be_removed.extend(['debt_settlement_flag', 'total_il_high_credit_limit'])

In [None]:
# Queue the balance, limit and utilisation columns.
features_to_be_removed.extend([ 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
    'chargeoff_within_12_mths', 'delinq_amnt'])

In [None]:
# Queue the funded-amount columns.
features_to_be_removed.extend(['funded_amnt', 'funded_amnt_inv'])

In [None]:
# Number of distinct names queued for removal (not displayed: only a cell's
# last expression is shown, and print() follows).
len(set(features_to_be_removed))
# Show the full removal list; it may contain repeated names.
print(features_to_be_removed)

In [None]:
# Build the modelling frame by dropping every queued column; set() collapses
# repeated names. drop() returns a new frame here, so df itself is unchanged.
columns_to_drop = list(set(features_to_be_removed))
df_selected = df.drop(columns_to_drop, axis=1)
df_selected.shape

In [None]:
# Per-column summary: number of missing values next to the column's dtype.
missing_counts = (
    df_selected.isnull()
    .sum(axis=0)
    .sort_values()
    .to_frame('missing_value')
    .reset_index()
)
feature_types = df_selected.dtypes.to_frame('feature_type').reset_index()
# Both tables carry the column name in 'index' after reset_index().
df_dtypes = pd.merge(missing_counts, feature_types, on='index', how='inner')

df_dtypes.sort_values(['missing_value', 'feature_type'])


In [None]:
# Feature encode to categorical value.
# Each listed column is replaced by integer codes, where code i refers to the
# i-th entry of the column's unique values as returned by np.unique (sorted).
# pd.Categorical is used because Series.astype does not accept a
# ``categories=`` keyword in current pandas.
for col in ['purpose', 'home_ownership', 'grade', 'sub_grade', 'addr_state']:
    df_selected[col] = pd.Categorical(
        df_selected[col], categories=np.unique(df_selected[col])
    ).codes



In [None]:
# Drop every row that still has a missing value, then save the result
# (without the index) so the modelling part can start from this file.
df_selected= df_selected.dropna()
df_selected.to_csv('df_selected.csv', index = False)

In [None]:
import seaborn as sns
import pandas as pd
from sklearn.utils import shuffle, class_weight
from sklearn import preprocessing,  tree, metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

In [None]:
# Reload the saved frame and drop any rows that contain missing values.
df_selected = pd.read_csv('df_selected.csv')
df_selected= df_selected.dropna()

In [None]:

df_selected.shape

In [None]:
# Correlate the numeric (and boolean) columns only. The frame still holds
# string columns, and DataFrame.corr() raises on non-numeric data in pandas
# versions that no longer drop such columns silently.
numeric_features = df_selected.select_dtypes(include=['number', 'bool'])
corr = numeric_features.corr()
plt.figure(figsize = (10, 8))
sns.heatmap(corr)
plt.show()

In [None]:
# Columns that took part in the correlation matrix.
corr.columns

In [None]:
# All columns of the modelling frame, for comparison.
df_selected.columns

In [None]:


# Names of the candidate predictor columns.
# NOTE(review): feature_cols is not referenced by any later cell shown here;
# X below is built from every column of df_selected except loan_status.
feature_cols=['loan_amnt', 'term', 'int_rate', 'installment', 'grade', 'sub_grade',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
        'purpose', 'addr_state', 'dti', 'delinq_2yrs',
       'earliest_cr_line', 'fico_range_low', 'fico_range_high', 'open_acc',
       'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'initial_list_status', 'application_type', 'acc_now_delinq',
       'acc_open_past_24mths', 'mort_acc', 'pub_rec_bankruptcies', 'tax_liens',
       'disbursement_method', 'issue_month']

In [None]:
# Separate the target from the predictors, then one-hot encode the predictors
# with pd.get_dummies.
target_name = "loan_status"
y = df_selected[target_name]
predictors = df_selected.drop(target_name, axis=1)
X = pd.get_dummies(predictors)


In [None]:
# Hold out 5% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test=train_test_split(X, y, 
                                                  test_size = 0.05,
                                                  random_state = 123)

In [None]:
# Random forest: 500 trees, entropy split criterion, depth capped at 5, fixed seed.
clf = RandomForestClassifier(n_estimators=500,criterion = 'entropy', max_depth=5,random_state=123)
clf.fit(X_train,y_train)


In [None]:
# Class predictions and per-class probabilities for the held-out rows.
y_pred = clf.predict(X_test)
pred_proba = clf.predict_proba(X_test)

In [None]:
# Accuracy of the predictions on the held-out rows.
print("Accuracy:", metrics.accuracy_score(y_test,y_pred))


In [None]:
from sklearn.metrics import confusion_matrix

# confusion_matrix puts true labels on rows and predictions on columns; the
# matrix is transposed for display so the x axis is the true class and the
# y axis the predicted class.
y_true = y_test.tolist()
mat = confusion_matrix(y_true, y_pred)
ax = sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
ax.set_xlabel('True data')
ax.set_ylabel('predicted values')

In [None]:
# StringIO comes from the standard library; sklearn.externals.six is not
# shipped by current scikit-learn releases.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# Export one tree of the forest (index 10) to DOT text held in memory.
dot_data = StringIO()
estimator = clf.estimators_[10]
export_graphviz(estimator, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True,
                # A plain list is accepted by every scikit-learn version.
                feature_names=list(X_train.columns),
                # Label the leaves with the real class labels, in the order the
                # classifier stores them, instead of hard-coded 'no'/'yes'.
                class_names=[str(label) for label in clf.classes_])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())


In [None]:
# Render the exported tree as an inline PNG.
Image(graph.create_png())

In [None]:
# Importance of every model input, labelled with its column name. The Series
# is named directly: assigning to ``.columns`` on a Series does not rename
# anything and the attribute is lost on the next operation.
feature_importances = pd.Series(clf.feature_importances_, index=X.columns, name='imp')

# Sort ascending and keep only the inputs with importance above 0.01.
feature_importances = feature_importances.sort_values()
feature_importances = feature_importances[feature_importances > 0.01]



In [None]:

# Horizontal bar chart of the retained feature importances.
feature_importances.plot(kind='barh', figsize = (10,10))
