## Frame the Problem:

Predict whether an applicant's loan application will be approved (a binary classification on `Loan_Status`).

##  Import libraries:

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


## Get the Data:

In [0]:
# List the working directory to see which files are currently present.
!ls -l

In [0]:

# Upload 'Loan_train.csv' from the local machine into the working directory.
# The upload is skipped when the file is already present, so the notebook can
# be re-run top to bottom without an interactive prompt.
import os

if os.path.exists('Loan_train.csv'):
  print('Loan_train.csv already present; skipping upload.')
else:
  # google.colab is imported lazily because it is only needed for the upload.
  from google.colab import files

  uploaded = files.upload()

  for fn, content in uploaded.items():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(content)))

In [0]:

data=pd.read_csv('Loan_train.csv')

In [0]:
data.head()

In [0]:
data.info()

# **Data Analysis:**

In [0]:
# Heat map of the pairwise correlations between the numeric features.
# The frame still contains string-valued columns at this point (Gender,
# Married, ...). Recent pandas versions raise on those instead of silently
# skipping them, so the numeric columns are selected explicitly.
corr = data.select_dtypes(include='number').corr()
sns.heatmap(corr, cmap='Accent')

## From the above plot we can see that:
1. Applicant income and loan amount are positively correlated.
2. Loan amount and co-applicant income are positively correlated.

In [0]:
# Joint distribution of applicant income against credit history.
sns.jointplot(data=data, x='Credit_History', y='ApplicantIncome')

# **Feature Engineering:**

In [0]:
#Married Column:
#Missing values
data.loc[data['Married'].isna(),'Married']='Married'
data['Married'].head()
data['Married'].value_counts()


In [0]:
#changing the values in the column from 'Yes/No' to 'Married/NotMarried' to avoid the confussion of having 'Yes' as a column name after one hot encoding.:
data.loc[data['Married']=='Yes', 'Married']='Married'
data.loc[data['Married']=='No', 'Married']='NotMarried'
data['Married'].value_counts()

In [0]:
# Gender column: fill missing entries with 'Male'.
data['Gender'] = data['Gender'].fillna('Male')
data['Gender'].value_counts()

In [0]:
#Dependents Column:
#Missing values
data.loc[(data['Married']=='Married') &( data['Dependents'].isnull()), 'Dependents']='One_Dependent'
data.loc[data['Dependents'].isnull(), 'Dependents']= 'No_Dependents'

In [0]:

#Changing the valuesin the Dependents column:
data.loc[data['Dependents']=='0','Dependents']='No_Dependents'

data.loc[data['Dependents']=='1','Dependents']='One_Dependent'
data.loc[data['Dependents']=='2','Dependents']='Two_Dependents'

data.loc[data['Dependents']=='3+','Dependents']='3+_Dependents'
data['Dependents'].head()

In [0]:
# Self_Employed column: missing values count as not self-employed, and the
# 'No'/'Yes' labels are recoded to descriptive names.
data['Self_Employed'] = (
    data['Self_Employed']
    .fillna('NotSelf_Employed')
    .replace({'No': 'NotSelf_Employed', 'Yes': 'Self_Employed'})
)
data['Self_Employed'].value_counts()

In [0]:
# Loan_Amount_Term column: fill missing terms with 360.0.
data['Loan_Amount_Term'] = data['Loan_Amount_Term'].fillna(360.0)
# Confirm that no missing values remain.
data['Loan_Amount_Term'].isnull().value_counts()

In [0]:
# LoanAmount column: impute NaNs with the mean loan amount, rounded down.
avg = np.floor(data['LoanAmount'].mean())
data['LoanAmount'] = data['LoanAmount'].fillna(avg)
# Confirm that no missing values remain.
data['LoanAmount'].isnull().value_counts()

In [0]:
# Credit_History column: fill missing values with 1.0.
data['Credit_History'] = data['Credit_History'].fillna(1.0)
# Confirm that no missing values remain.
data['Credit_History'].isnull().value_counts()


## One Hot Encoding:



> Since the categorical variables are not ordinal, we one-hot encode them.









In [0]:
# One-hot encode every categorical column, dropping the first level of each.
# The results are unpacked into one frame per source column.
categorical_columns = [
    'Gender',
    'Married',
    'Education',
    'Dependents',
    'Self_Employed',
    'Property_Area',
    'Loan_Status',
]
(Gender, Married, Education, Dependents,
 Self_Employed, Property_Area, Loan_Status) = [
    pd.get_dummies(data[column], drop_first=True)
    for column in categorical_columns
]




In [0]:
# Dropping the columns from the data set.
old_data=data.copy()
data.drop(['Loan_ID','Gender','Married','Dependents','Education','Self_Employed','Property_Area','Loan_Status'],axis=1,inplace=True)
data.head()

In [0]:
#Concatinating the One hot encoded columns to the dataset.
data=pd.concat([data,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status],axis=1)

In [0]:
data.head()

In [0]:
#cChecking wheather data has all the values and of the type numeric:
data.info()

In [0]:
corr= data.corr()
sns.heatmap(corr, cmap='coolwarm')

## Using random forest for feature importance:

In [0]:
from sklearn.ensemble import RandomForestClassifier
randclf=RandomForestClassifier()
X=data.iloc[:,0:14]
X.head()

In [0]:
y= data.Y
y.head()

In [0]:
randclf.fit(X,y)

In [0]:
# Creating a dictionary of features along with their co-efficients.
feat= {}
for feature,feature_coeff in zip(data.columns,randclf.feature_importances_):
  feat[feature]=feature_coeff

#Sorting them as per the importance.
import operator
sorted_fet= sorted(feat.items(),key=operator.itemgetter(1),reverse=True)
#sorted()
  
#print(randclf.feature_importances_)

In [0]:
sorted_fet

In [0]:
#saving data into data2.
#data2=data
#data=data2

In [0]:
data=data[['Credit_History','ApplicantIncome','LoanAmount','CoapplicantIncome','Loan_Amount_Term',]]

In [0]:
data=pd.concat([data,y],axis=1)



> Take the top 5 features ranked by the random forest and build a model on them using logistic regression.



In [0]:
# Preview the modelling frame (selected features plus the target).
data.head(n=5)

## Train-Test Split:

In [0]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(data.drop('Y',axis=1),
                                               data['Y'],
                                               test_size=0.3,
                                               random_state=100)

In [0]:
X_train.head()

In [0]:
from sklearn.linear_model import LogisticRegression
clf= LogisticRegression()
model=clf.fit(X_train,y_train)

In [0]:
model.coef_

In [0]:
predict=clf.predict(X_test)

In [0]:
predict[:5]

In [0]:
y_test[:5]

## Model evaluation:

In [0]:
# Confusion matrix.
from sklearn.metrics import confusion_matrix, classification_report

In [0]:
print(confusion_matrix(y_test,predict))

In [0]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
print("Precision:{}".format(precision_score(y_test,predict)))
print("Recall score:{}".format(recall_score(y_test,predict)))
print("Accuracy:{}".format(accuracy_score(y_test,predict)))
print("F1 score:{}".format(f1_score(y_test,predict)))
print("classification report:{}".format(classification_report(y_test,predict)))



## Prediction on Analytics Vidhya's test cases:

In [0]:

# Upload 'Loan_test.csv' (the file read in the next cell) from the local
# machine into the working directory. The upload is skipped when the file is
# already present, so the notebook can be re-run top to bottom without an
# interactive prompt.
import os

if os.path.exists('Loan_test.csv'):
  print('Loan_test.csv already present; skipping upload.')
else:
  # google.colab is imported lazily because it is only needed for the upload.
  from google.colab import files

  uploaded = files.upload()

  for fn, content in uploaded.items():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(content)))

In [0]:
prod=pd.read_csv('Loan_test.csv')

In [0]:
prod.head()

In [0]:
prod.info()

# Performing the same transformations on the test data as on the train data:

In [0]:
# Gender: impute NaNs with "Male".
prod['Gender'] = prod['Gender'].fillna('Male')
# Confirm that no missing values remain.
prod['Gender'].isnull().value_counts()

In [0]:
# Dependents column of the test data: recode the raw labels to the same
# descriptive names used for the training data.
dependents_labels = {
    '0': 'No_Dependents',
    '1': 'One_Dependent',
    '2': 'Two_Dependents',
    '3+': '3+_Dependents',
}
prod['Dependents'] = prod['Dependents'].replace(dependents_labels)

# Fill NaNs: a married applicant with no value is assigned one dependent,
# every remaining missing value means no dependents.
# No earlier cell recodes prod['Married'], so it presumably still holds the
# raw 'Yes'/'No' labels; comparing against 'Married' alone would then never
# match. Both spellings are accepted so the rule works either way.
is_married = prod['Married'].isin(['Yes', 'Married'])
prod.loc[is_married & prod['Dependents'].isnull(), 'Dependents'] = 'One_Dependent'
prod['Dependents'] = prod['Dependents'].fillna('No_Dependents')
# Confirm that no missing values remain.
prod['Dependents'].isnull().value_counts()

In [0]:
# Self_Employed column of the test data: recode 'No'/'Yes' to descriptive
# labels, then treat missing values as not self-employed.
prod['Self_Employed'] = (
    prod['Self_Employed']
    .replace({'No': 'NotSelf_Employed', 'Yes': 'Self_Employed'})
    .fillna('NotSelf_Employed')
)
# Confirm that no missing values remain.
prod['Self_Employed'].isnull().value_counts()


In [0]:
# LoanAmount: impute NaNs with the fill value learned from the TRAINING data,
# not with a statistic of the test data, so both sets are transformed the
# same way.
# old_data is the cleaned training frame saved before one-hot encoding. Its
# missing loan amounts were already filled with floor(mean), which leaves
# floor(mean) unchanged, so this reproduces the training fill value.
avg = np.floor(old_data['LoanAmount'].mean())
prod['LoanAmount'] = prod['LoanAmount'].fillna(avg)
# Confirm that no missing values remain.
prod['LoanAmount'].isnull().value_counts()

In [0]:
#Loan Amount terms:
prod['Loan_Amount_Term'].isnull().value_counts()
prod['Loan_Amount_Term'].value_counts()
prod.loc[prod['Loan_Amount_Term'].isnull(),'Loan_Amount_Term'] =360.0

In [0]:
# Credit_History: Missing values.
prod['Credit_History'].value_counts()
prod.loc[prod['Credit_History'].isnull(),'Credit_History']=1.0
prod['Credit_History'].isnull().value_counts()


## One Hot Encoding:

In [0]:
# One-hot encode the categorical columns of the test data, dropping the first
# level of each.
# Married is recoded to the training labels first. Encoding the raw 'Yes'/'No'
# values would produce a 'Yes' column, whereas the training data produces a
# 'NotMarried' column with the opposite meaning.
married_labels = prod['Married'].replace({'Yes': 'Married', 'No': 'NotMarried'})

Gender = pd.get_dummies(prod['Gender'], drop_first=True)
Married = pd.get_dummies(married_labels, drop_first=True)
Education = pd.get_dummies(prod['Education'], drop_first=True)
Dependents = pd.get_dummies(prod['Dependents'], drop_first=True)
Self_Employed = pd.get_dummies(prod['Self_Employed'], drop_first=True)
Property_Area = pd.get_dummies(prod['Property_Area'], drop_first=True)



In [0]:
#Dropping the columns:
old_prod=prod.copy()
prod.drop(['Loan_ID','Gender','Married','Dependents','Education','Self_Employed','Property_Area'],axis=1,inplace=True)
prod.head()

In [0]:
#Concatinating the one hot encoded columns to the production data.
prod = pd.concat([prod,Gender,Married,Dependents,Education,Self_Employed,Property_Area],axis=1)
prod.head()

In [0]:
#To make sure our production data has no missing values and all values are numeric:
prod.info()

In [0]:
#Assigning the production data to a variable.
prod1=prod

In [0]:
#Since we build our model using random forest feature importance inference.
# so we have to drop some of the features from the production data.
prod=prod[['Credit_History','ApplicantIncome','LoanAmount','CoapplicantIncome','Loan_Amount_Term',]]

In [0]:
prod.head()

# Predicting on the production data:

In [0]:
predict1= clf.predict(prod)

In [0]:
#Converting Loan Status cloumn values back into Y/N 
predict2=predict1.copy()

In [0]:
predict3=np.array([])
for i in range(0,len(predict2)):
  pred=np.where(predict2[i]==1,'Y','N')
  predict3=np.append(predict3,pred)

In [0]:
np.shape(predict3)

In [0]:
#to see the values in predict.
predict3[0:5]

In [0]:
len(old_prod['Loan_ID'])

In [0]:
#Creating a data frame with columns as Row_ID and predicted value.
pre={'Loan_ID':old_prod['Loan_ID'],
    'Loan_Status':predict3}

In [0]:
Result=pd.DataFrame(pre)

In [0]:
Result.head()

In [0]:
#Converting into a CSV file.
Result.to_csv('Result1.csv',index=False)

In [0]:
#Downloading CSV file to local drive.
from google.colab import files
files.download('Result1.csv')