# ML project 

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Goal of this notebook: predict the Loan_Status column.

# Read both CSV files: `dev` comes from train.csv, `test` from test.csv.
dev, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Print the (rows, columns) shape of the development set.
print(dev.shape)

(614, 13)


## 1. Data cleaning

In [4]:
# Count the missing values in every column of the development set
# (the actual imputation is done in the cells that follow).

dev.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [5]:
# Check the distinct values (NaN included) of four columns that contain nulls.
# A cell only renders its last expression, so the first three columns are printed
# explicitly; previously their results were computed and silently discarded.

for column in ['Self_Employed', 'Credit_History', 'Gender']:
    print(column, dev[column].unique())

dev['Dependents'].unique()

array(['0', '1', '2', '3+', nan], dtype=object)

In [6]:
# Keep the rows with an unknown credit history: NaN becomes its own 'missing'
# category instead of the rows being dropped or a value being guessed.

for frame in (dev, test):
    frame['Credit_History'] = frame['Credit_History'].fillna('missing')

In [7]:
# Unknown self-employment status becomes the explicit category 'missing'
# in both the development and the test set.
for frame in (dev, test):
    frame['Self_Employed'] = frame['Self_Employed'].fillna('missing')

In [8]:
# Unknown number of dependents becomes the explicit category 'missing'
# in both the development and the test set.
for frame in (dev, test):
    frame['Dependents'] = frame['Dependents'].fillna('missing')

In [9]:
# Unknown gender becomes the explicit category 'missing'
# in both the development and the test set.
for frame in (dev, test):
    frame['Gender'] = frame['Gender'].fillna('missing')

In [10]:
# Mean of Loan_Amount_Term in the development set (pandas skips NaN by default);
# used below as the fill value for the missing terms in dev.

mean_lat = dev['Loan_Amount_Term'].mean()


In [11]:
# Fill value for the test set's Loan_Amount_Term. It is computed from dev, not from
# test, so it equals mean_lat.
# NOTE(review): presumably intentional (no test-set statistics are used) — confirm.
mean_lat_test = dev['Loan_Amount_Term'].mean()

In [12]:
# Replace the NaN values in dev's Loan_Amount_Term with the mean computed above.

dev['Loan_Amount_Term'] = dev['Loan_Amount_Term'].fillna(mean_lat)

In [13]:
# Fill the missing Loan_Amount_Term values in the test set with mean_lat_test.
test['Loan_Amount_Term'] = test['Loan_Amount_Term'].fillna(mean_lat_test)

In [14]:
# Mean of LoanAmount in the development set (pandas skips NaN by default);
# used below as the fill value for the missing amounts in dev.

mean_la = dev['LoanAmount'].mean()

In [15]:
# Fill value for the test set's LoanAmount. It is computed from dev, not from test,
# so it equals mean_la.
# NOTE(review): presumably intentional (no test-set statistics are used) — confirm.
mean_la_test = dev['LoanAmount'].mean()

In [16]:
# Replace the NaN values in dev's LoanAmount with the mean computed above.

dev['LoanAmount'] = dev['LoanAmount'].fillna(mean_la)

In [17]:
# Fill the missing LoanAmount values in the test set with mean_la_test.
test['LoanAmount'] = test['LoanAmount'].fillna(mean_la_test)

In [18]:
# Share of each Credit_History value, including the new 'missing' category.
dev['Credit_History'].value_counts(normalize=True)

1.0        0.773616
0.0        0.144951
missing    0.081433
Name: Credit_History, dtype: float64

In [19]:
# Verify that Dependents no longer contains NaN (it should now show 'missing').
dev['Dependents'].unique()

array(['0', '1', '2', '3+', 'missing'], dtype=object)

In [20]:
# Relative frequency of each Married value; value_counts leaves NaN rows out by default.
dev['Married'].value_counts(normalize=True)

Yes    0.651391
No     0.348609
Name: Married, dtype: float64

In [21]:
# Distinct values of Property_Area.
dev['Property_Area'].unique()

array(['Urban', 'Rural', 'Semiurban'], dtype=object)

In [22]:
# Show the rows whose Married value is missing.

married_is_null = dev['Married'].isna()
dev[married_is_null]

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
104,LP001357,Male,,missing,Graduate,No,3816,754.0,160.0,360.0,1,Urban,Y
228,LP001760,Male,,missing,Graduate,No,4758,0.0,158.0,480.0,1,Semiurban,Y
435,LP002393,Female,,missing,Graduate,No,10047,0.0,146.412162,240.0,1,Semiurban,Y


In [23]:
# Fill the missing Married values with 'Yes', the most frequent value
# (about 65% of the non-null rows, see the value_counts output above).

dev['Married'] = dev['Married'].fillna('Yes')

In [24]:
# Apply the same 'Yes' fill to the missing Married values in the test set.
test['Married'] = test['Married'].fillna('Yes')

In [25]:
# Remove the Loan_ID column: it only identifies the row and is not used as a feature.

dev = dev.drop(columns=['Loan_ID'])

In [26]:
# Remove the Loan_ID column from the test set as well.
test = test.drop(columns=['Loan_ID'])

## 2. Data exploration & features

In [27]:
# Preview the first five rows of the cleaned development set.
dev.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1,Urban,Y


In [1]:
# Class balance of the target. The column is still called 'Loan_Status' here;
# the numeric 'Loan_status' column is only created further down, in section 3,
# so `dev.Loan_status` raised an AttributeError on a fresh run.
dev['Loan_Status'].value_counts()

In [None]:
# Histogram (50 bins) of every numeric column in dev, drawn on one large figure.
dev.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Correlation heatmap of the numeric columns.
# Only numeric columns are passed to corr(): dev still contains text columns here,
# and recent pandas versions raise an error when corr() meets non-numeric data.

plt.figure(figsize=(20,12))
sns.heatmap(dev.select_dtypes(include='number').corr(), annot=True, cmap='coolwarm')

In [None]:
# Stacked bar chart: number of rows per Education value, split by Loan_Status.
education_counts = dev.groupby(['Education', 'Loan_Status']).size().unstack()
education_counts.plot(kind='bar', stacked=True, figsize=(10, 7), rot=50)

In [None]:
# Stacked bar chart: number of rows per Property_Area value, split by Loan_Status.
area_counts = dev.groupby(['Property_Area', 'Loan_Status']).size().unstack()
area_counts.plot(kind='bar', stacked=True, figsize=(10, 7), rot=50)

In [None]:
# Stacked bar chart: number of rows per Self_Employed value, split by Loan_Status.
self_employed_counts = dev.groupby(['Self_Employed', 'Loan_Status']).size().unstack()
self_employed_counts.plot(kind='bar', stacked=True, figsize=(10, 7), rot=50)

In [None]:
# Stacked bar chart: number of rows per Credit_History value, split by Loan_Status.
credit_history_counts = dev.groupby(['Credit_History', 'Loan_Status']).size().unstack()
credit_history_counts.plot(kind='bar', stacked=True, figsize=(10, 7), rot=50)

## 3. Converting categorical data to numerical

In [None]:
# Encode the target as numbers: 'Y' -> 1, 'N' -> 0. The result is stored in a new
# column 'Loan_status' (lower-case 's'); the text column 'Loan_Status' is not
# modified by this cell.

dev['Loan_status'] = dev['Loan_Status'].map({'Y':1,'N':0})

In [None]:
# Rename the original text target column 'Loan_Status' to 'Loan_S'.

dev.rename(columns={'Loan_Status':'Loan_S'}, inplace= True)

In [None]:
# Drop 'Loan_S' (the original Y/N target); the numeric 'Loan_status' column stays.

dev.drop(columns= 'Loan_S', inplace=True)

In [None]:
# Create the StandardScaler used to standardise the numerical columns.

from sklearn.preprocessing import StandardScaler 
sc = StandardScaler()

In [None]:
# One-hot encode the categorical columns. The four numerical columns are removed
# first; the already numeric target 'Loan_status' is not a text column, so
# get_dummies passes it through unchanged.

numeric_columns = ['Loan_Amount_Term', 'LoanAmount', 'CoapplicantIncome', 'ApplicantIncome']
cat_dev = pd.get_dummies(dev.drop(columns=numeric_columns))

In [None]:
# Keep only the numerical feature columns by dropping the target and every
# categorical column.

non_numeric = ['Loan_status', 'Dependents', 'Gender', 'Married', 'Education',
               'Self_Employed', 'Property_Area', 'Credit_History']
num_dev = dev.drop(columns=non_numeric)

In [None]:
# Apply the same preparation to the test data.

cat_test = pd.get_dummies(test.drop(['Loan_Amount_Term','LoanAmount','CoapplicantIncome','ApplicantIncome'], axis=1))
# Give the test dummies exactly the dummy columns (and order) seen in dev: a category
# that never occurs in test becomes an all-zero column, and the target is left out.
cat_test = cat_test.reindex(
    columns=[col for col in cat_dev.columns if col != 'Loan_status'],
    fill_value=0,
)

num_test = test.drop(columns=['Dependents','Gender','Married','Education','Self_Employed','Property_Area','Credit_History'])

# Scale test with the statistics of the development data. Fitting the scaler on the
# test set itself (as was done before) puts test on a different scale than the data
# the model is trained on.
S_Test = pd.DataFrame(sc.fit(num_dev).transform(num_test), columns=num_test.columns)
S_Test = pd.concat([S_Test, cat_test], axis=1)

## 4. Scaling the numerical columns

In [None]:
# Standardise the numerical dev columns and wrap the result back into a DataFrame
# with the original column names.
# NOTE(review): the scaler is fitted on all of dev before the train/validation split
# made later, so validation rows influence the scaling — confirm this is acceptable.
S_dev = pd.DataFrame (sc.fit_transform(num_dev), columns=num_dev.columns)

In [None]:
# Join the scaled numerical columns and the dummy columns side by side.
# Rows are matched on the index — assumes dev still has its default 0..n-1 index
# (TODO confirm).

S_dev = pd.concat([S_dev,cat_dev], axis=1)

In [None]:
# List the column names of the combined frame.
S_dev.columns.tolist()

In [None]:
# Put the columns in a fixed order: scaled numerical features first, then the
# dummy columns, and the target 'Loan_status' last.
scaled_numeric = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
dummy_columns = [
    'Gender_Female', 'Gender_Male', 'Gender_missing',
    'Married_No', 'Married_Yes',
    'Dependents_0', 'Dependents_1', 'Dependents_2', 'Dependents_3+', 'Dependents_missing',
    'Education_Graduate', 'Education_Not Graduate',
    'Self_Employed_No', 'Self_Employed_Yes', 'Self_Employed_missing',
    'Credit_History_0.0', 'Credit_History_1.0', 'Credit_History_missing',
    'Property_Area_Rural', 'Property_Area_Semiurban', 'Property_Area_Urban',
]
S_dev = S_dev[scaled_numeric + dummy_columns + ['Loan_status']]

In [None]:
# Annotated correlation heatmap of all prepared columns, target included.
sns.heatmap(S_dev.corr(),annot=True,cmap='coolwarm')

## 5. Train, Valid and Test Split

In [None]:
# Split the prepared frame into the feature matrix X and the target vector y.
target = 'Loan_status'
X = S_dev.drop(columns=[target])
y = S_dev[target]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of dev as a validation split; random_state=42 makes the split repeatable.
X_train,X_val,y_train,y_val = train_test_split(X,y,train_size=0.8, random_state=42)

In [None]:
from sklearn.pipeline import make_pipeline

## 6. Creating a pipeline

In [None]:
from sklearn.metrics import precision_score

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

# Candidate models, all with default hyper-parameters. The two tree-based models are
# randomised, so they get a fixed seed (the same 42 used for the train/validation
# split); without it the comparison below changes on every run.
classifiers = [DecisionTreeClassifier(random_state=42),
               RandomForestClassifier(random_state=42),
               SVC(),
               KNeighborsClassifier(),
               GaussianNB(),
               LogisticRegression()]

# Fit every candidate on the training split and print its precision on the
# validation split, to see which models perform best.
for classifier in classifiers:
    pipe = make_pipeline(classifier)

    # fit the model on the training data
    pipe.fit(X_train, y_train)

    # predict on the validation data
    y_pred_val = pipe.predict(X_val)

    # evaluate on the validation data
    print(classifier, precision_score(y_val, y_pred_val))

In [None]:
# Top picks (models with the best precision score): SVC(), LogisticRegression(), RandomForestClassifier()

## 7. Predictions - SVC

In [None]:
from sklearn.svm import SVC

# Fit an RBF-kernel SVC with hand-picked C and gamma on the training split.
classifier = SVC(C= 0.25, gamma= 0.8, kernel= 'rbf')
classifier.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix, precision_score

# Predict on the validation split with the SVC fitted in the previous cell.
y_pred= classifier.predict(X_val)

# Rows are the true classes, columns the predicted classes (scikit-learn convention).
cm = confusion_matrix(y_val,y_pred)

print(cm)
print('\n Precision: ', precision_score(y_val,y_pred))

## 8. Grid Search - SVC

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the SVC: polynomial kernel only. The commented-out lines are
# alternative grids that are not part of the search.
# Note: the outer parentheses hold a single dict and no trailing comma, so
# `parameters` is that dict itself, not a tuple.
parameters = (#{'C':[0.5, 1.0, 18.5,25], 
              #'gamma':['auto']},
             {'C':[0.5, 1.0,17.0], 
              'degree': [3,10,40],
              'kernel':['poly'],
              'gamma':[0.1,0.6,0.7,1]})
             #{'C':[0.5, 1.0, 18.5,17.0, 20], 
              #'gamma':['scale']},)

# 5-fold cross-validated grid search that ranks the candidates by precision.
grid = GridSearchCV(SVC(), param_grid = parameters,cv=5, scoring='precision')


In [None]:
# Run the SVC grid search on the training split.
grid.fit(X_train,y_train)

In [None]:
# Score the best SVC found by the search on the validation split; the search was
# created with scoring='precision', so this value is a precision score.
grid.score(X_val,y_val)

In [None]:
# Hyper-parameters of the best SVC candidate.
grid.best_params_

## 9. Grid Search - LogReg

In [None]:
from sklearn.model_selection import GridSearchCV

# One sub-grid per solver; each sub-grid tries the same seeds and the same
# regularisation strengths C.
parameters = tuple(
    {'random_state': [7, 42, 70],
     'solver': [solver],
     'C': [10, 40, 200]}
    for solver in ('newton-cg', 'liblinear', 'lbfgs')
)

# 5-fold cross-validated grid search that ranks the candidates by precision.
grid = GridSearchCV(LogisticRegression(), param_grid=parameters, cv=5, scoring='precision')

In [None]:
# Run the logistic-regression grid search on the training split.
grid.fit(X_train,y_train)

In [None]:
# Score the best logistic regression found by the search on the validation split;
# the search was created with scoring='precision', so this value is a precision score.
grid.score(X_val,y_val)

In [None]:
# Hyper-parameters of the best logistic-regression candidate.

grid.best_params_

## 10. Grid Search - RandomForestClassifier

In [None]:
from sklearn.model_selection import GridSearchCV

# One sub-grid per split criterion; both try the same forest sizes, depths and
# minimum split sizes.
parameters = tuple(
    {'n_estimators': [10, 100, 200],
     'criterion': [criterion],
     'max_depth': [10, 50, 100],
     'min_samples_split': [2, 8, 10]}
    for criterion in ('gini', 'entropy')
)

# 5-fold cross-validated grid search that ranks the candidates by precision.
grid = GridSearchCV(RandomForestClassifier(), param_grid=parameters, cv=5, scoring='precision')


In [None]:
# Run the random-forest grid search on the training split.
grid.fit(X_train,y_train)

In [None]:
# Score the best random forest found by the search on the validation split;
# the search was created with scoring='precision', so this value is a precision score.
grid.score(X_val,y_val)

In [None]:
# Hyper-parameters of the best random-forest candidate.
grid.best_params_


## 11. Creating a model

In [None]:
# Take the best estimator of the most recent grid search as the final model.
# `grid` was last assigned to the RandomForestClassifier search, so this is the
# best random forest from that search.

model = grid.best_estimator_

In [None]:
# Save the fitted model to 'model_ML.joblib' in the working directory.
# `load` is imported but not used in this cell.
from joblib import dump, load
dump(model, 'model_ML.joblib') 

In [None]:
# Predictions on the training split itself (the data the model was fitted on).
y_pred_final = model.predict(X_train)

In [None]:
# Evaluate the model on the training split. These numbers describe how well the
# model fits data it has already seen, not how well it generalises.

from sklearn.metrics import (precision_score, 
classification_report, accuracy_score, confusion_matrix)


print ('Accuracy: ', accuracy_score (y_train,y_pred_final))
print ('Precision: ', precision_score (y_train,y_pred_final))
print ('\n Classification Report: \n', classification_report (y_train,y_pred_final))
print ('\n Confusion Matrix: \n', confusion_matrix (y_train,y_pred_final))

In [None]:
# Predictions of the final model on the validation split.
y_pred_val = model.predict(X_val)

In [None]:
# Evaluate the model on the validation split.
# The predictions are `y_pred_val` from the previous cell. The cell used to read
# `y_pred_test`, which is only created later (from S_Test) and therefore raised a
# NameError on a fresh run.

from sklearn.metrics import (precision_score,
classification_report, accuracy_score, confusion_matrix)

print ('Accuracy: ', accuracy_score (y_val,y_pred_val))
print ('Precision: ', precision_score (y_val,y_pred_val))
print ('\n Classification Report: \n', classification_report (y_val,y_pred_val))
print ('\n Confusion Matrix: \n', confusion_matrix (y_val,y_pred_val))

## 12. Preparing for competition

In [None]:
# Get the Loan_ID column for the submission. Loan_ID was dropped from `test`
# earlier, so the ids are read from test.csv again.

test_ID = pd.read_csv ('test.csv')['Loan_ID']

In [None]:
# Predict on the prepared test features.
# NOTE(review): S_Test must contain the same feature columns as X_train — confirm.

y_pred_test = model.predict(S_Test)

In [None]:
# Collect the predictions in a frame. The column is named 'Loan_Status' (capital S)
# because the next cell reads output['Loan_Status'] and the CSV is written before
# any rename; the old name 'Loan_status' made the next cell fail with a KeyError.
output = pd.DataFrame(data={'Loan_Status': y_pred_test})

In [None]:
# Convert the numeric predictions back to labels: 1 -> 'Y', 0 -> 'N'.
# Reads and overwrites the 'Loan_Status' column of `output`.

output['Loan_Status'] = output['Loan_Status'].map({1:'Y',0:'N'})

In [None]:
# Put the loan ids next to the predictions (column-wise; rows are matched on the index).

output_test = pd.concat([test_ID,output], axis=1)

In [None]:
# Use Loan_ID as the index, so the CSV written below starts with Loan_ID instead of
# an extra numeric index column.
output_test.set_index('Loan_ID', inplace=True)

In [None]:
# Write the submission file to the working directory.
output_test.to_csv("ML_project_op.csv")

In [None]:
# Rename a 'Loan_status' column (if present) to 'Loan_Status' in output_test.
# NOTE(review): this runs after to_csv in the previous cell, so it does not change
# the file that was already written.
output_test.rename(columns={'Loan_status':'Loan_Status'}, inplace= True)