# Loan Repayment Prediction

In [1]:
#Importing Libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler,RobustScaler,LabelEncoder

  import pandas.util.testing as tm


In [6]:
#Load the Data & statistical analysis
def load_data(path='../static/assets/loan.csv'):
    """Read the loan CSV and print a quick statistical overview.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the path the notebook has
        always used, so existing ``load_data()`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        The frame exactly as returned by ``pd.read_csv``.

    Side effects
    ------------
    Prints the shape, the ``DataFrame.info()`` summary and the transposed
    ``describe()`` table to stdout.
    """
    train = pd.read_csv(path)
    print(train.shape)
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() only appended a stray "None" line.
    train.info()
    print(train.describe().T)
    return train

In [7]:
def feature_engg(train):
    """Impute missing values, label-encode text columns and log-scale income.

    Parameters
    ----------
    train : pandas.DataFrame
        Loan data. Must contain the columns ``emp_length``, ``annual_inc``,
        ``delinq_2yrs``, ``revol_util``, ``total_acc`` and
        ``longest_credit_length``.

    Returns
    -------
    pandas.DataFrame
        The same ``train`` object, modified in place.

    Side effects
    ------------
    Prints the per-column null counts before and after imputation and the
    list of categorical (object-dtype) columns.
    """
    print(train.isnull().sum())

    # Column -> statistic used to replace its missing values.
    fill_strategy = {
        'emp_length': 'mean',
        'annual_inc': 'median',
        'delinq_2yrs': 'mean',
        'revol_util': 'median',
        'total_acc': 'median',
        'longest_credit_length': 'median',
    }
    for column, strategy in fill_strategy.items():
        if strategy == 'mean':
            fill_value = train[column].mean(skipna=True)
        else:
            fill_value = train[column].median()
        # Assign the filled column back explicitly. Calling
        # fillna(inplace=True) on a column selection is chained assignment:
        # it warns on recent pandas and does not update the frame at all
        # when Copy-on-Write is enabled.
        train[column] = train[column].fillna(fill_value)
    print(train.isnull().sum())

    # Categorical features: every object-dtype column is label-encoded.
    cat_features = train.select_dtypes(include='object').columns
    print("Categorical Features:\n",cat_features,"\n No of categorical features:",len(cat_features))
    for feature in cat_features:
        # A fresh encoder per column; each column has its own label set.
        train[feature] = LabelEncoder().fit_transform(train[feature])

    # Log transformation to reduce the skew of annual income.
    # NOTE(review): np.log gives -inf for 0 and NaN for negative values;
    # confirm annual_inc is strictly positive in the source data.
    train['annual_inc'] = np.log(train['annual_inc'])
    return train

# Model Building

In [8]:
def construct_model(train):
    """Fit a logistic-regression classifier for ``bad_loan`` and report metrics.

    Parameters
    ----------
    train : pandas.DataFrame
        Fully numeric feature frame that includes the ``bad_loan`` target.

    Returns
    -------
    sklearn.linear_model.LogisticRegression
        The classifier fitted on the RobustScaler-transformed training split.

    Side effects
    ------------
    Prints the split shapes, hold-out accuracy, confusion matrix,
    classification report and the mean 10-fold cross-validation accuracy.

    Notes
    -----
    The returned model was trained on scaled features, but the fitted scaler
    is not returned. NOTE(review): anything that loads the saved model must
    apply an equivalent scaling to its inputs - confirm against the consumer.
    """
    # Split into train and test data
    x = train.drop('bad_loan', axis=1)
    y = train.bad_loan
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=0
    )
    print(x_train.shape,y_train.shape,x_test.shape,y_test.shape)

    # Feature scaling: fit on the training split only, reuse on the test split.
    rs = RobustScaler()
    x_train = rs.fit_transform(x_train)
    x_test = rs.transform(x_test)

    model = LogisticRegression(solver='liblinear')
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print("Accuracy:",accuracy_score(y_test,y_pred))
    print("Confusion Matrix:\n",confusion_matrix(y_test,y_pred))
    print("Classification Report:\n",classification_report(y_test,y_pred))

    # Cross validation. The scaler lives inside the pipeline so it is re-fit
    # on each fold's training part only; scaling the whole data set up front
    # would leak statistics of the held-out fold into training.
    # cross_val_score clones the pipeline, so the fitted `model` returned
    # below is left untouched.
    cv_pipeline = make_pipeline(RobustScaler(), model)
    cv_score = cross_val_score(cv_pipeline, x, y, cv=10)
    print("Accuracy with Cross Validation:",np.mean(cv_score))
    return model

In [5]:
# Read the raw loan data and preview its first five rows.
df = load_data()
df.head(n=5)

FileNotFoundError: [Errno 2] File ../templates/static/assets/loan.csv does not exist: '../templates/static/assets/loan.csv'

In [None]:
# Reload the raw data and run it straight through feature engineering.
df = feature_engg(load_data())

In [None]:
# To check feature importances: fit an extra-trees ensemble on all features.
x = df.drop('bad_loan', axis=1)
y = df['bad_loan']
# Seeded so the importance ranking is the same on every Restart & Run All
# (same seed value as the train/test split in construct_model).
# NOTE(review): bad_loan is treated as a class label by construct_model;
# an ExtraTreesClassifier may be the more natural estimator here - confirm.
model = ExtraTreesRegressor(random_state=0)
model.fit(x, y)
model.feature_importances_

In [None]:
# Ten most important features as a horizontal bar chart.
df_feat = pd.Series(data=model.feature_importances_, index=x.columns)
df_feat.nlargest(10).plot.barh()

In [None]:
# Train and evaluate the logistic-regression model on the engineered frame.
lr = construct_model(train=df)

In [None]:
import pickle

# Persist the fitted model. The context manager closes the file handle (and
# flushes the pickle to disk) even if dump() raises; the original left the
# handle open.
with open("loan_prediction.pkl", "wb") as file:
    pickle.dump(lr, file)