In [None]:
import pandas as pd
import random
import numpy as np

#visual library
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import cufflinks as cf
cf.go_offline()

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [None]:
# importing Lending club data from csv file
loan_data = pd.read_csv("loan_data.csv",index_col=0)

## Feature Description

|    | LoanStatNew          | Description                                                                                                                                                                                              |
|---:|:---------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|  0 | addr_state           | The state provided by the borrower in the loan application                                                                                                                                               |
|  1 | annual_inc           | The self-reported annual income provided by the borrower during registration.                                                                                                                            |
|  2 | application_type     | Indicates whether the loan is an individual application or a joint application with two co-borrowers                                                                                                     |
|  3 | dti                  | A ratio calculated using the borrower’s total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower’s self-reported monthly income. |
|  4 | earliest_cr_line     | The month the borrower's earliest reported credit line was opened                                                                                                                                        |
|  5 | emp_length           | Employment length in years. Possible values are between 0 and 10 where 0 means less than one year and 10 means ten or more years.                                                                        |
|  6 | emp_title            | The job title supplied by the Borrower when applying for the loan.*                                                                                                                                      |
|  7 | fico_range_high      | The upper boundary range the borrower’s FICO at loan origination belongs to.                                                                                                                             |
|  8 | fico_range_low       | The lower boundary range the borrower’s FICO at loan origination belongs to.                                                                                                                             |
|  9 | funded_amnt          | The total amount committed to that loan at that point in time.                                                                                                                                           |
| 10 | funded_amnt_inv      | The total amount committed by investors for that loan at that point in time.                                                                                                                             |
| 11 | grade                | LC assigned loan grade                                                                                                                                                                                   |
| 12 | home_ownership       | The home ownership status provided by the borrower during registration or obtained from the credit report. Our values are: RENT, OWN, MORTGAGE, OTHER                                                    |
| 13 | id                   | A unique LC assigned ID for the loan listing.                                                                                                                                                            |
| 14 | initial_list_status  | The initial listing status of the loan. Possible values are – W, F                                                                                                                                       |
| 15 | installment          | The monthly payment owed by the borrower if the loan originates.                                                                                                                                         |
| 16 | int_rate             | Interest Rate on the loan                                                                                                                                                                                |
| 17 | issue_d              | The month which the loan was funded                                                                                                                                                                      |
| 18 | loan_amnt            | The listed amount of the loan applied for by the borrower. If at some point in time, the credit department reduces the loan amount, then it will be reflected in this value.                             |
| 19 | loan_status          | Current status of the loan                                                                                                                                                                               |
| 20 | mort_acc             | Number of mortgage accounts.                                                                                                                                                                             |
| 21 | open_acc             | The number of open credit lines in the borrower's credit file.                                                                                                                                           |
| 22 | pub_rec_bankruptcies | Number of public record bankruptcies                                                                                                                                                                     |
| 23 | purpose              | A category provided by the borrower for the loan request.                                                                                                                                                |
| 24 | revol_bal            | Total credit revolving balance                                                                                                                                                                           |
| 25 | revol_util           | Revolving line utilization rate, or the amount of credit the borrower is using relative to all available revolving credit.                                                                               |
| 26 | sub_grade            | LC assigned loan subgrade                                                                                                                                                                                |
| 27 | term                 | The number of payments on the loan. Values are in months and can be either 36 or 60.                                                                                                                     |
| 28 | title                | The loan title provided by the borrower                                                                                                                                                                  |
| 29 | total_acc            | The total number of credit lines currently in the borrower's credit file                                                                                                                                 |
| 30 | verification_status  | Indicates if income was verified by LC, not verified, or if the income source was verified                                                                                                               |
| 31 | zip_code             | The first 3 numbers of the zip code provided by the borrower in the loan application.                                                                                                                    |

In [None]:
loan_data.info(memory_usage = 'deep')

In [None]:
## peeking at df
loan_data.head()

In [None]:
### dowcasting to save memory
def convert_columns_to_catg(df, column_list):
    """Cast the listed columns of ``df`` to ``category`` dtype, in place.

    For each column one status line is printed showing its deep memory
    footprint (in MB) before and after the conversion.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the listed columns are overwritten.
    column_list : iterable of str
        Names of the columns to convert.
    """
    def _size_mb(name):
        # deep=True also counts the Python objects behind object columns
        return round(df[name].memory_usage(deep=True)*1e-6,2)

    for name in column_list:
        size_before = _size_mb(name)
        print("converting", name.ljust(30), "size: ", size_before, end="\t")
        df[name] = df[name].astype("category")
        print("->\t", _size_mb(name))

In [None]:
convert_columns_to_catg(loan_data, ['grade','sub_grade','emp_title','home_ownership','verification_status','loan_status',
                                   'purpose','title','zip_code','addr_state','initial_list_status','application_type','emp_length'])

In [None]:
import gc

def downcast_df_float_columns(df):
    """Downcast every ``float64`` column of ``df`` in place.

    Each column is passed through ``pd.to_numeric(..., downcast="float")`` and a
    status line with its deep memory usage (in MB) before and after is printed.
    A garbage-collection pass is run at the end in every case.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose float64 columns are overwritten with the downcast version.
    """
    float_cols = df.select_dtypes(include=["float64"]).columns.tolist()

    if not float_cols:
        print("no columns to downcast")
    else:
        # pad every name to a common width so the status lines line up
        pad_width = max(map(len, float_cols)) + 2
        print("downcasting float for:", float_cols, "\n")

        for name in float_cols:
            size_before = str(round(df[name].memory_usage(deep=True)*1e-6,2)).rjust(8)
            print("reduced memory usage for:  ", name.ljust(pad_width),
                  "from", size_before, "to", end=" ")
            df[name] = pd.to_numeric(df[name], downcast="float")
            size_after = str(round(df[name].memory_usage(deep=True)*1e-6,2)).rjust(8)
            print(size_after)

    gc.collect()

    print("done")

In [None]:
downcast_df_float_columns(loan_data)

In [None]:
loan_data.info(memory_usage = "deep")

In [None]:
loan_data.describe()

In [None]:
## checking for missing values
# function to show percentage of null values-
def null_values(df):
    """Summarise the missing values of ``df`` per column.

    Prints how many columns the frame has and how many of them contain at
    least one missing value.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        One row per column that has missing values, with the columns
        'Missing Values' (count) and '% of Total Values' (share of rows,
        rounded to one decimal), sorted by share in descending order.
    """
    missing_count = df.isnull().sum()
    missing_share = 100 * missing_count / len(df)

    summary = pd.concat([missing_count, missing_share], axis=1,
                        keys=['Missing Values', '% of Total Values'])
    # keep only the columns that actually have gaps (tested before rounding)
    has_missing = summary.iloc[:, 1] != 0
    summary = (summary[has_missing]
               .sort_values('% of Total Values', ascending=False)
               .round(1))

    print ("Dataframe has " + str(df.shape[1]) + " columns.\n"
           "There are " + str(summary.shape[0]) +
           " columns that have missing values.")
    return summary

In [None]:
# Show the per-column missing-value summary, then release the temporary table.
null_df = null_values(loan_data)
print(null_df)
del null_df
# gc.collect must be *called*; the bare name only evaluates the function
# object and frees nothing.
gc.collect()

import dtale
import dtale.app as eda
eda.show(loan_data)

In [None]:
gc.collect()

In [None]:
#Function that will return the value count and frequency of each observation within a feature
def value_count_norm_cal(df,feature):
    """Tabulate how often each value of ``df[feature]`` occurs.

    Returns
    -------
    pandas.DataFrame
        Indexed by the distinct values, with the columns 'Count' (absolute
        count) and 'Frequency (%)' (share of the counted rows, in percent).
    """
    column = df[feature]
    absolute = column.value_counts()
    relative_pct = column.value_counts(normalize=True) * 100
    # `keys` names the two resulting columns directly
    return pd.concat([absolute, relative_pct], axis=1,
                     keys=['Count', 'Frequency (%)'])

In [None]:
# Generating the binary target column: "Charged Off" and "Default" are mapped
# to "Default"; every other value is labelled "Fully Paid".
# NOTE(review): any other status present in the data (e.g. loans that are still
# running) would also end up as "Fully Paid" — confirm the dataset only holds
# finished loans.
loan_data["loan_status"] = loan_data["loan_status"].apply(lambda x: "Default" 
                                                          if x in ["Charged Off", "Default"]
                                                          else "Fully Paid")
gc.collect()

In [None]:
#function to explore features
def feature_exploration (df,feature,numeric):
    """Print a short textual profile of ``df[feature]``.

    Parameters
    ----------
    df : pandas.DataFrame
    feature : str
        Column to describe.
    numeric : bool
        If truthy, print ``describe()`` of the absolute values and the dtype.
        Otherwise print ``describe()``, the dtype and the count/frequency
        table produced by ``value_count_norm_cal``.
    """
    column = df[feature]
    divider = '*'*50

    if not numeric:
        print('Description:\n{}'.format(column.describe()))
        print(divider)
        print('Object type:\n{}'.format(column.dtype))
        print(divider)
        print('Value count:\n{}'.format(value_count_norm_cal(df,feature)))
        return

    # statistics are taken over absolute values, as in the rest of the notebook
    print('Description:\n{}'.format(np.abs(column).describe()))
    print(divider)
    print('Object type:{}'.format(column.dtype))
##default rates vs cat_cols
def Cat_col_vs_default_rate (df,col_name,feature,title):
    """Bar chart of the default rate, in percent, per level of ``col_name``.

    Parameters
    ----------
    df : pandas.DataFrame
    col_name : str
        Categorical column whose levels are compared.
    feature : str
        Column holding the loan outcome; the share of rows equal to
        'Default' within each level is plotted.
    title : str
        Figure title.

    Returns
    -------
    Whatever ``fig.show()`` returns.
    """
    shares = df.groupby(col_name, observed=True)[feature].value_counts(dropna=True, normalize=True)
    # Scale to percent so the numbers agree with the "(%)" axis label; the
    # shares coming out of value_counts(normalize=True) are 0-1 fractions.
    default_pct = (
        shares.loc[:, 'Default']
        .mul(100)
        .round(1)
        .rename('default_rate')
        .reset_index()
    )
    # The rates are already aggregated, so a bar chart (one bar per level) is
    # used rather than a histogram, which would label the axis "sum of ...".
    fig = px.bar(default_pct, x=col_name, y='default_rate', color=col_name,
                 title=title, labels={'default_rate': 'Default_rates(%)'})
    return fig.show()

#box plot function-

def box_plot (df,feature):
    """Draw a horizontal box plot of ``df[feature]`` on a wide, short figure.

    Parameters
    ----------
    df : pandas.DataFrame
    feature : str
        Numeric column to plot.

    Returns
    -------
    Whatever ``plt.show()`` returns.
    """
    fig, ax = plt.subplots(figsize=(15,3))
    # draw on the Axes created above instead of relying on the "current axes"
    sns.boxplot(x=df[feature], ax=ax)
    # the x axis carries the feature's values, not a count
    ax.set_xlabel(feature)
    ax.set_title('{} distribution(Boxplot)'.format(feature))
    return plt.show()

#pie chart function
def pie_plot (df,feature,title):
    """Show a pie chart of how the values of ``df[feature]`` are distributed.

    The slices are sized by the 'Count' column of ``value_count_norm_cal`` and
    named after its index; labels are drawn inside the slices.

    Returns
    -------
    Whatever ``fig.show()`` returns.
    """
    shares = value_count_norm_cal(df,feature)
    chart = px.pie(shares, names=shares.index, values='Count', title=title)
    chart.update_traces(hoverinfo='label+percent', textposition='inside')
    gc.collect()
    return chart.show()

#distribution plot function
def dist_plot(df,feature):
    """Plot a histogram of ``df[feature]`` above a box plot split by loan status.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``feature`` and a 'loan_status' column.
    feature : str
        Numeric column to plot.

    Returns
    -------
    Whatever ``plt.show()`` returns.
    """
    # The theme has to be set before the axes are created, otherwise the
    # figure built below is drawn with the previous style.
    sns.set_theme(style="darkgrid")
    fig, (hist_ax, box_ax) = plt.subplots(figsize=(11, 7), nrows=2, ncols=1,
                                          gridspec_kw={'height_ratios': [2,1]})

    # top panel: histogram
    sns.histplot(df[feature], kde=False, ax=hist_ax)
    hist_ax.set_xlabel(feature)
    hist_ax.set_ylabel('Count')
    hist_ax.set_title('{} distribution(Histogram)'.format(feature))

    # bottom panel: one box per loan status
    sns.boxplot(x=df[feature], y=df['loan_status'], ax=box_ax)
    box_ax.xaxis.grid(True)
    box_ax.set(ylabel="Loan_status")
    sns.despine(trim=True, left=True)
    return plt.show()
#Count_plot functions
def count_in_order_plot (df,feature,title,order) :
    """Show a count histogram of ``df[feature]`` with an explicit category order.

    ``order`` is passed to plotly as ``category_orders`` (a dict mapping the
    column name to the list of values in display order).
    """
    histogram_options = dict(x=feature, color=feature, title=title,
                             category_orders=order)
    chart = px.histogram(df, **histogram_options)
    return chart.show()
def count_plot (df,feature,title) :
    """Show a count histogram of the non-null values of ``df[feature]``."""
    # only rows where the feature is present are plotted
    present_values = df[feature].dropna()
    chart = px.histogram(present_values, x=feature, color=feature, title=title)
    return chart.show()

## Univariate analysis

### loan_status

In [None]:
feature_exploration (loan_data,'loan_status',numeric = False)

In [None]:
pie_plot (loan_data,'loan_status','loan_status_distribution')

In [None]:
count_plot (loan_data,"loan_status","loan_status_count") 

* The data set is imbalanced.


### ID

In [None]:
# checking whether all values are unique.
print(loan_data['id'].describe())
print('*'*50)
print('Unique variables:',loan_data['id'].nunique())

* They are all distinctive, yes. We exclude the ID since it is not useful for modelling, either as a categorical variable (there are too many distinct values) or a numerical variable (the magnitudes of the IDs vary greatly, probably without any importance).

In [None]:
#Creating empty drop list to collect drop features.

drop_feature = []
drop_feature.append("id")

### Loan_amnt , Funded ,Invested

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Side-by-side histograms of the requested, funded and investor-funded amounts.
loan_amount = loan_data["loan_amnt"].values
funded_amount = loan_data["funded_amnt"].values
investor_funds = loan_data["funded_amnt_inv"].values

# (subplot title, values) for each panel, left to right
amount_panels = (
    ("Loan_amoun_applide", loan_amount),
    ("loan_amount_funded", funded_amount),
    ("loan_amount_Invested", investor_funds),
)

fig = make_subplots(
    rows=1, cols=3,
    subplot_titles=tuple(panel_title for panel_title, panel_values in amount_panels))

for panel_col, (panel_title, panel_values) in enumerate(amount_panels, start=1):
    fig.add_trace(go.Histogram(x=panel_values), row=1, col=panel_col)

fig.update_layout(title_text="Loan_Distribution")

fig.show()


* As the distributions above show, all three amounts have similar distributions and values, so only loan_amount will be used.


In [None]:
# Adding to the drop list.
# The three amount columns are nearly identical; loan_amnt is the one kept
# (it is the column the later plots use), so the two funded amounts are dropped.
# NOTE(review): the original cell dropped 'loan_amnt' and kept 'funded_amnt' —
# confirm which column is meant to survive.
for redundant_col in ('funded_amnt', 'funded_amnt_inv'):
    # membership check keeps the list free of duplicates when the cell is re-run
    if redundant_col not in drop_feature:
        drop_feature.append(redundant_col)
drop_feature

###  loan term

In [None]:
feature_exploration (loan_data,'term',numeric = False)

In [None]:
#pipe plot of term distribution
pie_plot (loan_data,"term",title = "loan_term_distribution")

In [None]:
Cat_col_vs_default_rate (loan_data,'term','loan_status','loan Term vs Default rates')

* Most loans issued (76%) are 36 months long.
* 30% of 60-month loans default, compared with 15% of 36-month loans.
* Long-term loans carry a higher risk than short-term loans.

### Intrest rate

In [None]:
feature_exploration (loan_data,"int_rate",numeric = True)

In [None]:
 box_plot (loan_data,"int_rate")

In [None]:
dist_plot (loan_data,'int_rate')

* The interest rate ranges from 5% to 31%.
* As expected, the default rate tends to increase as the interest rate increases.
* The distribution is positively skewed.

### Loan Grade and Sub-grade

In [None]:
# loan grad and sub grade
print(feature_exploration (loan_data,"grade",numeric = False))

In [None]:
feature_exploration (loan_data,"sub_grade",numeric = False)

In [None]:
count_in_order_plot (loan_data,"sub_grade","Lona_grade_count",dict(sub_grade= loan_data["sub_grade"].unique().sort_values().to_list()))

In [None]:
pie_plot (loan_data,"sub_grade",title = "loan grade_distribution(%)")

In [None]:
Cat_col_vs_default_rate (loan_data,'sub_grade','loan_status','Loan sub Grade vs Default rates')

* Since loan grade and sub-grade imply each other, only loan grade will be used.
* Most of the loans issued belong to grade B or C, which together make up 57% of all loans.
* The default rate tends to worsen as the loan grade worsens.
* Loan grades E, F and G have default rates above 30%, but very few loans were issued in those grades.

### Installment

In [None]:
feature_exploration (loan_data,"installment",numeric = True)

In [None]:
box_plot(loan_data,'installment')

In [None]:
dist_plot (loan_data,"installment")

In [None]:
loan_data.groupby('loan_status')['installment'].describe()

* Installments range from $4 to $1,720.
* The distribution is positively skewed.
* Loans that charge off have installments that are $30 higher on average.

### Employment title

In [None]:
print(loan_data['emp_title'].describe())
print('*'*50)
print(loan_data['emp_title'].nunique()) 

In [None]:
drop_feature.append('emp_title')
drop_feature

* Since there are 378,358 unique values, this feature will be dropped.

### Employment Length

In [None]:
feature_exploration(loan_data,'emp_length', numeric = False)

In [None]:
# Normalise the two non-numeric labels so that every value starts with a plain
# number ('10+ years' -> '10 years', '< 1 year' -> '0 years').
# The result is assigned back to the frame: replace(..., inplace=True) on a
# selected column acts on a temporary object under pandas copy-on-write and
# would leave loan_data unchanged. emp_length was converted to category dtype
# earlier, so relabelling the categories is enough; labels that are not
# present are ignored, which keeps the cell safe to re-run.
loan_data['emp_length'] = loan_data['emp_length'].cat.rename_categories(
    {'10+ years': '10 years', '< 1 year': '0 years'})

def emp_length_to_int(s):
    """Turn an employment-length label such as ``'3 years'`` into ``np.int8``.

    The leading whitespace-separated token is converted to an integer.
    Null inputs (``None`` / NaN) are returned unchanged.
    """
    if not pd.isnull(s):
        leading_token = s.split()[0]
        return np.int8(leading_token)
    return s

In [None]:
loan_data['emp_length'] = loan_data['emp_length'].apply(emp_length_to_int)

In [None]:
# Order the bars numerically (0, 1, ..., 10). Python's sorted() compares the
# actual values; sort_values() on the unordered categorical would follow the
# category order instead, which is not guaranteed to be numeric here.
emp_length_order = sorted(int(years) for years in loan_data["emp_length"].dropna().unique())
count_in_order_plot (loan_data,"emp_length","Employment_length_count",dict(emp_length=emp_length_order))

In [None]:
pie_plot (loan_data,"emp_length",title = "emp_length_distribution(%)")

In [None]:
Cat_col_vs_default_rate (loan_data,'emp_length','loan_status','Employment_lenght vs Default_rates')

* Most of the loan applicants have 10+ years of experience.
* There is no significant difference in default rate across employment lengths, hence this feature will be dropped.

In [None]:
drop_feature.append('emp_length')
drop_feature

* Most of the loan applicants have 10 years of experience.
* There is no significant difference in default rate between the experience levels.

### fico_range_high , fico_range_low

In [None]:
print(loan_data[['fico_range_low','fico_range_high']].describe())
print('*'*50)
print(loan_data[['fico_range_low','fico_range_high']].corr())

In [None]:
#creating new variable Fico_average
loan_data['Fico_average'] = (loan_data['fico_range_low'] + loan_data['fico_range_high'])/2
drop_feature.extend(['fico_range_low','fico_range_high'])
print(drop_feature)
#creting new variable Fico_average
print(loan_data['Fico_average'].head())

In [None]:
feature_exploration(loan_data,'Fico_average',numeric=True)

In [None]:
box_plot(loan_data,"Fico_average")

In [None]:
dist_plot(loan_data,"Fico_average")

* Most applicants have a FICO score ranging from 660 to 700.
* As expected, applicants with a higher FICO score are more likely to pay back the loan.
* The distribution is positively skewed.

### home_ownership

In [None]:
feature_exploration (loan_data,'home_ownership',numeric=False)

In [None]:
# Binning the rare 'NONE' and 'ANY' levels into 'OTHER'.
# The result is assigned back to the frame: replace(..., inplace=True) on a
# selected column acts on a temporary object under pandas copy-on-write and
# would leave loan_data unchanged. The column is a categorical, so the values
# are replaced on a plain object Series and converted back to category.
loan_data['home_ownership'] = (
    loan_data['home_ownership']
    .astype(object)
    .replace(['NONE', 'ANY'], 'OTHER')
    .astype('category')
)
loan_data['home_ownership'].value_counts(dropna=False)

In [None]:
count_plot (loan_data,"home_ownership","Home_ownership")

In [None]:
pie_plot (loan_data,"home_ownership","Home_ownership(%)")

In [None]:
Cat_col_vs_default_rate (loan_data,'home_ownership','loan_status','Home_ownership vs Default rates')

* Most of the loan applicants have a mortgage on their home.
* Applicants with a mortgage also have a lower default rate compared to those who own or rent.
* "Other" makes up only 0.3% of all loan applicants.

### Annual_inc

In [None]:
feature_exploration (loan_data,'annual_inc',numeric=True)

In [None]:
box_plot(loan_data,'annual_inc')

In [None]:
annual_inc_less = np.log10(loan_data['annual_inc'][loan_data['annual_inc'] <= 300000]+1)
annual_inc = np.log10(loan_data["annual_inc"]+1)

fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Income < 300000", "Income"))

fig.add_trace(go.Histogram(x=annual_inc_less),
              row=1, col=1)

fig.add_trace(go.Histogram(x=annual_inc),
              row=1, col=2)


fig.update_layout(title_text="Income_distribution")

fig.show()


In [None]:
print('annual income greater than 1 million',loan_data.loc[loan_data.annual_inc >= 1000000, 'loan_status'].value_counts())
print('*'*50)
print('annual income greater than 300k',loan_data.loc[loan_data.annual_inc >= 300000, 'loan_status'].value_counts())

In [None]:
fig = px.box( x=np.log10(1+loan_data['annual_inc']),title = 'log_annual_inc')
fig.show()

### dti

In [None]:
feature_exploration(loan_data,"dti",numeric = True)

In [None]:
box_plot(loan_data,"dti")

In [None]:
dist_plot(loan_data,"dti")

### verification_status

In [None]:
feature_exploration (loan_data,'verification_status',numeric=False)

In [None]:
count_plot (loan_data,"verification_status","verification_status")

In [None]:
pie_plot (loan_data,"verification_status","verification_status(%)")

In [None]:
Cat_col_vs_default_rate (loan_data,'verification_status','loan_status','verification_status vs Default rates')

### Loan Purpose , Title

In [None]:
feature_exploration (loan_data,'purpose',numeric=False)

In [None]:
print('Description:\n{}'.format(loan_data["title"].describe()))
print('*'*50)
print('Value count:\n{}'.format(loan_data["title"].value_counts().head(10)))
print('*'*50)
print('df_length:\n{}'.format(len(loan_data['title'])))

In [None]:
count_plot (loan_data,'purpose',"Loan_purpose")

In [None]:
pie_plot (loan_data,"purpose","Loan purpose(%)")

In [None]:
Cat_col_vs_default_rate (loan_data,'purpose','loan_status','Purpose vs Default rates')

### addr_state,zip code

In [None]:
## Binning US states into regions for better understanding
west = ['CA', 'OR', 'UT','WA', 'CO', 'NV', 'AK', 'MT', 'HI', 'WY', 'ID']
south_west = ['AZ', 'TX', 'NM', 'OK']
south_east = ['GA', 'NC', 'VA', 'FL', 'KY', 'SC', 'LA', 'AL', 'WV', 'DC', 'AR', 'DE', 'MS', 'TN' ]
mid_west = ['IL', 'MO', 'MN', 'OH', 'WI', 'KS', 'MI', 'SD', 'IA', 'NE', 'IN', 'ND']
north_east = ['CT', 'NY', 'PA', 'NJ', 'RI','MA', 'MD', 'VT', 'NH', 'ME']

# Reverse lookup built once: state code -> region label.
_STATE_TO_REGION = {
    state_code: region_label
    for region_label, state_codes in (
        ('West', west),
        ('SouthWest', south_west),
        ('SouthEast', south_east),
        ('MidWest', mid_west),
        ('NorthEast', north_east),
    )
    for state_code in state_codes
}


def finding_regions(state):
    """Return the region label for a two-letter state code.

    Parameters
    ----------
    state : str
        State code as found in ``addr_state``.

    Returns
    -------
    str or None
        'West', 'SouthWest', 'SouthEast', 'MidWest' or 'NorthEast';
        ``None`` when the code is not in any of the region lists.
    """
    return _STATE_TO_REGION.get(state)


# No placeholder column is needed first: the assignment creates 'region'.
loan_data['region'] = loan_data['addr_state'].apply(finding_regions)

In [None]:
feature_exploration (loan_data,'region',numeric=False)

In [None]:
count_plot(loan_data,"region","US_state_regions")

In [None]:
pie_plot(loan_data,'region','US_state_regions(%)')

In [None]:
Cat_col_vs_default_rate (loan_data,'region','loan_status','US_state_regions vs Default rate(%)')

### open_acc

In [None]:
feature_exploration (loan_data,'open_acc',numeric=True)

In [None]:
box_plot(loan_data,'open_acc')

In [None]:
dist_plot(loan_data,"open_acc")

### revol_bal

In [None]:
feature_exploration(loan_data,"revol_bal",numeric = True)

In [None]:
box_plot(loan_data,'revol_bal')

In [None]:
dist_plot(loan_data,'revol_bal') 

### total_acc

In [None]:
feature_exploration(loan_data,'total_acc',numeric = True)

In [None]:
box_plot(loan_data,'total_acc')

In [None]:
dist_plot(loan_data,'total_acc')

### revol_util

In [None]:
feature_exploration(loan_data,'revol_util',numeric = True)

In [None]:
box_plot(loan_data,'revol_util')

In [None]:
dist_plot(loan_data,'revol_util')

### initial_list_status

In [None]:
feature_exploration(loan_data,'initial_list_status',numeric = False)

In [None]:
count_plot(loan_data,"initial_list_status","initial_list_status")

In [None]:
pie_plot (loan_data,"initial_list_status","Initial list status(%)")

In [None]:
Cat_col_vs_default_rate(loan_data,"initial_list_status",'loan_status',"Initial list status vs Default rate(%)")

### application_type

In [None]:
feature_exploration (loan_data,"application_type",numeric = False)

In [None]:
count_plot(loan_data,'application_type','Application type')

In [None]:
pie_plot(loan_data,'application_type','Application type(%)')

In [None]:
Cat_col_vs_default_rate (loan_data,'application_type','loan_status','Application type vs Default rate(%)')

### mort_acc


In [None]:
feature_exploration(loan_data,'mort_acc',numeric = True)

In [None]:
box_plot(loan_data,'mort_acc')

In [None]:
dist_plot(loan_data,'mort_acc')

### pub_rec_bankruptcies

In [None]:
feature_exploration(loan_data,'pub_rec_bankruptcies',numeric = False)

In [None]:
box_plot(loan_data,'pub_rec_bankruptcies')

In [None]:
dist_plot(loan_data,'pub_rec_bankruptcies')

## Bivariate analysis

In [None]:
# Converting the target column to int to check correlation:
# 0 = "Default", 1 = any other status.
loan_data["status_int"] = loan_data["loan_status"].apply(lambda x: 0 
                                                          if x =="Default"
                                                          else 1)
# Restrict to numeric columns explicitly: from pandas 2.0 on, DataFrame.corr()
# no longer drops non-numeric columns silently and raises on strings.
cor = loan_data.select_dtypes(include=np.number).corr().round(2)
# Upper triangle (diagonal included) is masked so each pair is shown once.
mask = np.triu(np.ones_like(cor, dtype=bool))

In [None]:
fig = px.imshow(cor.mask(mask),text_auto=True, 
                aspect="auto",zmin=-1,zmax=1)
fig.update_layout(title = "Pearson co-relation",width=1100,height=500,)
fig.show()

* FICO range is positively correlated with the target and int_rate is negatively correlated with the target.
* There is no other variable correlated with the target.
* loan_amnt, funded_amnt_inv and funded_amnt are highly correlated with each other.
* fico_range and revol_util are negatively correlated with int_rate.

In [None]:
loan_data['issue_d'].info()
dt_series = pd.to_datetime(loan_data['issue_d'])
loan_data['year'] = dt_series.dt.year
loan_data['month'] = dt_series.dt.month

In [None]:
#fig = px.bar(loan_data, x="year", y="loan_amnt", color="month",
             #text="nation",barmode="grouped")
#fig.show()
plt.figure(figsize=(24,12))
sns.barplot(data=loan_data,
            x="year", y="loan_amnt",dodge=False)


# loan issued by regions

In [None]:
#sorting data frame by issue date co

In [None]:
cat_coll = loan_data.drop(['issue_d','title','zip_code','earliest_cr_line','addr_state','grade','emp_title'],axis=1).select_dtypes(exclude=np.number).columns.to_list()

In [None]:
# Box plots of the loan amount across every categorical feature in cat_coll.
# Two panels per row; at least the original 5 rows, more if cat_coll grows
# (a fixed 5x2 grid raises once there are more than 10 features).
n_rows = max(5, (len(cat_coll) + 1) // 2)
fig = plt.figure(figsize=(15,30),dpi=180)
for n, cat_feat in enumerate(cat_coll) :
    ax = fig.add_subplot(n_rows, 2, n+1)
    sns.boxplot(ax=ax,x=loan_data[cat_feat],y=loan_data['loan_amnt'])
    ax.set_title(cat_feat + " vs loan_data")
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.set_ylabel('Loan')
# tight_layout only has something to arrange once the axes exist
fig.tight_layout()

In [None]:
# Box plots of log10(annual income + 1) across every categorical feature.
# The transform is computed here because the frame has no 'log_annual_inc'
# column; reading it raised a KeyError on a fresh run.
log_annual_inc = np.log10(loan_data['annual_inc'] + 1)
# Two panels per row; at least the original 5 rows, more if cat_coll grows.
n_rows = max(5, (len(cat_coll) + 1) // 2)
fig = plt.figure(figsize=(15,30),dpi=180)
for n, cat_feat in enumerate(cat_coll) :
    ax = fig.add_subplot(n_rows, 2, n+1)
    sns.boxplot(ax=ax,x=loan_data[cat_feat],y=log_annual_inc)
    ax.set_title(cat_feat + " vs Income")
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.set_ylabel('Income')
# tight_layout only has something to arrange once the axes exist
fig.tight_layout()

In [None]:
# Box plots of log10(dti + 1) across every categorical feature in cat_coll.
# The transform does not depend on the loop variable, so it is computed once
# (vectorised) instead of once per panel.
log_dti = np.log10(loan_data['dti'] + 1)
# Two panels per row; at least the original 5 rows, more if cat_coll grows.
n_rows = max(5, (len(cat_coll) + 1) // 2)
fig = plt.figure(figsize=(15,30),dpi=180)
for n, cat_feat in enumerate(cat_coll) :
    ax = fig.add_subplot(n_rows, 2, n+1)
    sns.boxplot(ax=ax,x=loan_data[cat_feat],y=log_dti)
    ax.set_title(cat_feat + " vs DTI")
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.set_ylabel('DTI')
# tight_layout only has something to arrange once the axes exist
fig.tight_layout()


In [None]:
# Box plots of the average FICO score across every categorical feature.
# Two panels per row; at least the original 5 rows, more if cat_coll grows
# (a fixed 5x2 grid raises once there are more than 10 features).
n_rows = max(5, (len(cat_coll) + 1) // 2)
fig = plt.figure(figsize=(15,30),dpi=180)
for n, cat_feat in enumerate(cat_coll) :
    ax = fig.add_subplot(n_rows, 2, n+1)
    sns.boxplot(ax=ax,x=loan_data[cat_feat],y=loan_data['Fico_average'])
    ax.set_title(cat_feat + " vs Fico")
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.set_ylabel('Fico')
# tight_layout only has something to arrange once the axes exist
fig.tight_layout()

In [None]:
loan_data.info()