### Loan Prediction Model using Linear Regression

In [1]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                        summarize,poly)

In [61]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from statsmodels.stats.anova import anova_lm

In [50]:
Loan = pd.read_csv("traincsv.csv")
Loan

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


I'll use [ Gender, Married status, Education status] as a input variables, and use Loan_Status as targer variables.

In [53]:
loan = Loan[['Gender','Married','Education','Loan_Status']]
loan

Unnamed: 0,Gender,Married,Education,Loan_Status
0,Male,No,Graduate,Y
1,Male,Yes,Graduate,N
2,Male,Yes,Graduate,Y
3,Male,Yes,Not Graduate,Y
4,Male,No,Graduate,Y
...,...,...,...,...
609,Female,No,Graduate,Y
610,Male,Yes,Graduate,Y
611,Male,Yes,Graduate,Y
612,Male,Yes,Graduate,Y


In [54]:
location = loan[loan.isna().any(axis = 1)]
location

Unnamed: 0,Gender,Married,Education,Loan_Status
23,,Yes,Not Graduate,N
104,Male,,Graduate,Y
126,,Yes,Graduate,Y
171,,Yes,Graduate,Y
188,,Yes,Graduate,Y
228,Male,,Graduate,Y
314,,Yes,Graduate,N
334,,Yes,Graduate,Y
435,Female,,Graduate,Y
460,,Yes,Graduate,Y


In [60]:
#thought that NaN in categorical value have no need for regression.. so drop..
loan.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  loan.dropna(inplace=True)


In [57]:
category_map_gender = {'Male': 0, 'Female': 1}
category_map_married = {'Yes': 1, 'No': 0}
category_map_education = {'Graduate': 1, 'Not Graduate': 0}
category_map_loan = {'Y':1,'N':0}

loan['Gender'] = loan['Gender'].map(category_map_gender)
loan['Married'] = loan['Married'].map(category_map_married)
loan['Education'] = loan['Education'].map(category_map_education)
loan['Loan_Status'] = loan['Loan_Status'].map(category_map_loan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  loan['Gender'] = loan['Gender'].map(category_map_gender)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  loan['Married'] = loan['Married'].map(category_map_married)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  loan['Education'] = loan['Education'].map(category_map_education)
A value is trying to b

In [88]:
X_train = sm.add_constant(MS(['Gender','Married','Education']).fit_transform(loan))
y_train = loan['Loan_Status']
model = sm.OLS(y_train,X_train)
results = model.fit()

In [90]:
summarize(results)

Unnamed: 0,coef,std err,t,P>|t|
intercept,0.5512,0.051,10.771,0.0
Gender,0.0133,0.052,0.255,0.799
Married,0.0957,0.043,2.248,0.025
Education,0.092,0.045,2.024,0.043


##### check if there any interaction terms

In [62]:
vals = [VIF(X,i)
       for i in range (1, X.shape[1])]
vif = pd.DataFrame({'vif':vals}, index = X.columns[1:])
vif

Unnamed: 0,vif
Gender,1.160735
Married,1.158246
Education,1.00237


use the model in test dataset

In [94]:
Loan_test = pd.read_csv('test.csv')
test = Loan_test[['Gender','Married','Education']]
test

Unnamed: 0,Gender,Married,Education
0,Male,Yes,Graduate
1,Male,Yes,Graduate
2,Male,Yes,Graduate
3,Male,Yes,Graduate
4,Male,No,Not Graduate
...,...,...,...
362,Male,Yes,Not Graduate
363,Male,Yes,Graduate
364,Male,No,Graduate
365,Male,Yes,Graduate


In [95]:
category_map_gender = {'Male': 0, 'Female': 1}
category_map_married = {'Yes': 1, 'No': 0}
category_map_education = {'Graduate': 1, 'Not Graduate': 0}

test['Gender'] = test['Gender'].map(category_map_gender)
test['Married'] = test['Married'].map(category_map_married)
test['Education'] = test['Education'].map(category_map_education)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Gender'] = test['Gender'].map(category_map_gender)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Married'] = test['Married'].map(category_map_married)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Education'] = test['Education'].map(category_map_education)


In [100]:
X_test = sm.add_constant(MS(['Gender','Married','Education']).fit_transform(test))
predictions = results.predict(X_test)
predictions

0      0.738865
1      0.738865
2      0.738865
3      0.738865
4      0.551245
         ...   
362    0.646905
363    0.738865
364    0.643205
365    0.738865
366    0.643205
Length: 367, dtype: float64

In [108]:
binary_predicition = np.where(predictions >= 0.7, 'Yes', 'No')

In [110]:
test['predictions'] = binary_predicition
test_results = test
test_results

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['predictions'] = binary_predicition


Unnamed: 0,Gender,Married,Education,predictions
0,0.0,1,1,Yes
1,0.0,1,1,Yes
2,0.0,1,1,Yes
3,0.0,1,1,Yes
4,0.0,0,0,No
...,...,...,...,...
362,0.0,1,0,No
363,0.0,1,1,Yes
364,0.0,0,1,No
365,0.0,1,1,Yes
