# <center>[Imperial College Loan Default Dataset](https://www.kaggle.com/c/loan-default-prediction)</center><br><br>
### <center>by Hector Cadeaux</center><br><br>
This competition asks you to determine whether a loan will default, as well as the loss incurred if it does default. Unlike traditional finance-based approaches to this problem, where one distinguishes between good or bad counterparties in a binary way, we seek to anticipate and incorporate both the default and the severity of the losses that result. In doing so, we are building a bridge between traditional banking, where we are looking at reducing the consumption of economic capital, to an asset-management perspective, where we optimize on the risk to the financial investor.

This competition is sponsored by researchers at Imperial College London.<br><br>
## Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the training set. low_memory=False asks pandas to parse the file in a
# single pass rather than in chunks, which avoids mixed-type column inference.
loan = pd.read_csv('train_v2.csv',low_memory=False)


In [None]:
# Visualise the boolean missing-value mask of the whole training frame.
sns.heatmap(loan.isna())
plt.show()

In [None]:
# Summary statistics as produced by DataFrame.describe().
loan.describe()

In [None]:
# Same summary, transposed so that each described column becomes a row.
loan_profile= loan.describe().T

In [None]:
# Distribution of non-null counts: how many columns share each 'count' value.
loan_profile['count'].value_counts()

In [None]:
# Frequency of each distinct value of the target column `loss`.
loan.loss.value_counts()

In [None]:
# Display the raw training frame.
loan

In [None]:
# Load the test set with the same parser settings as the training set.
loan_t = pd.read_csv('test_v2.csv',low_memory=False)

In [None]:
# Column/dtype/memory overview of the training frame.
loan.info()

In [None]:
# Per-column dtypes of the training frame.
loan.dtypes

## Cleaning Train

In [None]:
# Names of the training columns whose dtype is `object`.
obj_col = [name for name, dtype in loan.dtypes.items() if dtype == 'object']
obj_col

In [None]:
def large_floats(s):
    """Convert a raw cell value to a float, tolerating malformed long numbers.

    The value is first passed to ``float``.  If that fails and the text form
    is longer than six characters, the value is approximated in scientific
    notation: the first eight characters supply the mantissa (scaled to one
    leading digit and rounded to seven decimals) and ``len(text) - 1`` is used
    as the decimal exponent.  This approximation assumes the text is a plain
    run of digits.

    Parameters
    ----------
    s : object
        Value taken from a DataFrame cell (string, number, ``None`` ...).

    Returns
    -------
    float
        The parsed or approximated value, or ``nan`` when the value cannot be
        interpreted as a number at all.  A float is always returned, so the
        result can be cast to ``float64`` without a second parsing step.
    """
    try:
        return float(s)
    except (TypeError, ValueError):
        # Fall through to the long-number approximation below.
        pass

    text = str(s)
    if len(text) <= 6:
        # Short and unparseable: there is nothing sensible to approximate.
        return float("nan")

    try:
        mantissa = round(float(text[:8]) / 10**7, 7)
        return float("{}e{}".format(mantissa, len(text) - 1))
    except ValueError:
        # The leading characters are not numeric either.
        return float("nan")
    

In [None]:
# Report every training column that contains missing values and collect their
# names in `nan_col_tr`.  The list is created once, before the loop, so that it
# accumulates all such columns instead of being reset on every iteration.
nan_col_tr = []
for col in loan.columns:
    nnas = loan[col].isna().sum()
    if nnas > 0:
        print("{} has {} missing values".format(col,nnas))
        nan_col_tr.append(col)

In [None]:
# Display only the object-dtype columns of the training frame.
loan[obj_col]

In [None]:
# Value frequencies of column f5.
loan['f5'].value_counts()

In [None]:
# Value frequencies of column f770.
loan['f770'].value_counts()

In [None]:
# Value frequencies of column f2.
loan['f2'].value_counts()

In [None]:
# Horizontal bar chart of the value frequencies of column f776.
loan['f776'].value_counts().plot.barh()

In [None]:
# Horizontal bar chart of the 40 most frequent values of column f9.
loan['f9'].value_counts().head(40).plot.barh()

In [None]:
# Vertical bar chart of the 40 most frequent values of column f137.
loan['f137'].value_counts().head(40).plot.bar()

In [None]:
# Value frequencies of column f7.
loan['f7'].value_counts()

In [None]:
# Impute missing values with the per-column median, for numeric columns only.
# In this notebook's cell order the object-dtype columns are still raw strings
# here (they are converted to float64 further down), and taking a median of
# string data raises TypeError on recent pandas versions.  Those columns are
# handled by the dedicated fill/convert cells that follow.
numeric_cols = loan.select_dtypes(include='number').columns
loan[numeric_cols] = loan[numeric_cols].fillna(loan[numeric_cols].median())

In [None]:
# Re-draw the missing-value mask of the training frame after imputation.
missing_mask = loan.isna()
sns.heatmap(missing_mask)
plt.show()

## Cleaning Test

In [None]:
# Display the test frame as loaded, before the test-side cleaning cells run.
loan_t

In [None]:
# Names of the *test* columns whose dtype is `object`.  The dtype is read from
# `loan_t` itself, so the list reflects how the test file was parsed rather
# than the state of the training frame.
obj_col_t = [col for col in loan_t.columns if loan_t[col].dtype == 'object']
obj_col_t

In [None]:
# Replace missing entries in each object-dtype training column with 0.
for name in obj_col:
    filled = loan[name].fillna(0)
    loan[name] = filled


In [None]:
# Parse every object-dtype training column with `large_floats`, cast the
# result to float64, and report the resulting dtype.
for name in obj_col:
    converted = loan[name].apply(large_floats).astype('float64')
    loan[name] = converted
    print("{} is now {}".format(name, converted.dtype))

In [None]:
# Missing-value mask restricted to the converted (formerly object) columns.
sns.heatmap(loan[obj_col].isna())
plt.show()

In [None]:
# Summary statistics of the converted columns, one row per column.
loan[obj_col].describe().T

In [None]:
# Summary statistics of the whole training frame, one row per column.
loan.describe().T

In [None]:
# Report every test column that contains missing values and collect their
# names in `nan_col_t`.  The list is created once, before the loop, so that it
# accumulates all such columns instead of being reset on every iteration.
nan_col_t = []
for col in loan_t.columns:
    nnas = loan_t[col].isna().sum()
    if nnas > 0:
        print("{} has {} missing values".format(col,nnas))
        nan_col_t.append(col)

In [None]:
# Replace missing entries in each listed test column with 0.
for name in obj_col_t:
    filled = loan_t[name].fillna(0)
    loan_t[name] = filled

In [None]:
# Parse every listed test column with `large_floats`, cast the result to
# float64, and report the resulting dtype.
for name in obj_col_t:
    converted = loan_t[name].apply(large_floats).astype('float64')
    loan_t[name] = converted
    print("{} is now {}".format(name, converted.dtype))

In [None]:
# Fill the remaining gaps in every test column with that column's own median
# (missing values are ignored when the median is computed).
for name in loan_t.columns:
    column = loan_t[name]
    loan_t[name] = column.fillna(column.median(skipna=True))

In [None]:
# Missing-value mask of the test frame after imputation.
sns.heatmap(loan_t.isna())

In [None]:
# Summary statistics of the converted test columns, one row per column.
loan_t[obj_col_t].describe().T

In [None]:
from sklearn.feature_selection import f_classif, SelectKBest
# Feature matrix: every training column except the row identifier and target.
X = loan.drop(columns=['id', 'loss'])
# Target vector: the `loss` column.
y = loan['loss']

In [None]:
# Summary statistics of the feature matrix, one row per feature.
X.describe().T

In [None]:
# Univariate feature scoring: keep the 10 best-scoring features.
# NOTE(review): f_classif scores features against y as class labels; confirm
# that is intended for `loss`, which is later fed to a regressor.
selector = SelectKBest(f_classif, k=10)
selector.fit(X, y)
Xdisc = selector.transform(X)
# Boolean mask over the columns of X marking the selected features.
print(selector.get_support())

In [None]:
# Pair each feature name with the score the selector assigned to it.
feature_names = loan.drop(columns=['id','loss']).columns
df1 = feature_names.to_frame(index=False, name='feature')
df2 = pd.Series(selector.scores_, name='importance').to_frame()

# Both frames carry the same default integer index, so a column-wise concat
# lines them up row by row.
importance = pd.concat([df1, df2], axis=1)

# Top 20 features by score.
importance.sort_values('importance', ascending=False).iloc[:20]

In [None]:
# Shape of the array returned by the selector.
Xdisc.shape

In [None]:
# Features ranked 21st to 40th by score.
importance.sort_values('importance', ascending=False)[20:40]

In [None]:
# Summary statistics of the score column.
importance.describe()

In [None]:
# Sum of all feature scores.
importance.importance.sum()

In [None]:
# Sum of the scores at or above the hard-coded threshold 4.068193.
# NOTE(review): the origin of this threshold is not shown here; confirm it.
importance.query('importance>=4.068193').importance.sum()

In [None]:
# Share of the total score held by features at or above 1.918834.
# NOTE(review): the origin of this threshold is not shown here; confirm it.
importance.query('importance>=	1.918834').importance.sum()/importance.importance.sum()

In [None]:
# Number of feature columns that still contain at least one missing value.
X.isna().any().sum()

In [None]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
# Split features and target into a training part and a validation part with a
# fixed seed.
# NOTE(review): train_size=.3 puts 30% of the rows in the training part and
# the remaining 70% in validation; confirm this was not meant as test_size=.3.
trainX, validX, trainy, validy = train_test_split(
    X,
    y,
    train_size=.3,
    random_state=15,
)

In [None]:
from sklearn.metrics import mean_squared_error
# Fit a histogram-based gradient boosting regressor on the training part.
# The estimator is seeded (same seed as the data split) because its automatic
# early stopping holds out an internal validation subset at random on large
# inputs, which otherwise makes repeated runs give different models.
hgbr = HistGradientBoostingRegressor(random_state=15)
hgbr.fit(trainX, trainy)

# In-sample error: predictions are scored on the rows the model was fit on.
# Arguments follow the documented (y_true, y_pred) order.
y_pred = hgbr.predict(trainX)
mse = mean_squared_error(trainy, y_pred)

In [None]:
# Mean squared error of the model on its own training rows.
mse

In [None]:
# Score the fitted model on the held-out validation part.
# Arguments follow the documented (y_true, y_pred) order.
valy_prd = hgbr.predict(validX)
msev = mean_squared_error(validy, valy_prd)


In [None]:
# Mean squared error of the model on the validation rows.
msev

In [None]:
# Summary statistics of the training target.
trainy.describe()

In [None]:
# Histogram (100 bins) of log1p(loss), restricted to rows where the
# transformed value is positive, i.e. rows with a non-zero loss.
np.log1p(trainy).to_frame().query('loss >0').plot.hist(bins=100)

In [None]:
# Display the training target series.
trainy