<!-- 
1. We need two datasets. (The ML model will learn from these datasets)

2. We will append or attach the datasets.

3. Pre-process the raw data. (Label Encoding, Missing values, Drop features)

4. Split the data into Training & Testing data.
   -  ML model will be trained on Training data.
   -  ML model will be evaluated on Test data. (To check how well model is performing on a given dataset)

5. Feed training data to ML model (Support Vector Machine Learning Model - Supervised Model)
   - In supervised model there are two labels:
        - Loan Approved 
        - Loan Disapproved

6. Once the SVM model is trained, we will evaluate it against our Test data.
    This gives us the accuracy score of our model (how well our model performs).

7. When new data is fed to our trained SVM model, it will predict an outcome/label. -->


### Importing Dependencies

In [None]:

#Data Cleaning, Processing, & Visualization:
import numpy as np      #For Linear Algebra & Array Handling 
import pandas as pd     #For Data Processing  (Data Frames - Structured Tables)
import matplotlib.pyplot as plt #For Graphical visualization
import seaborn as sns   #For statistical data visualization

#Train & Test:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split #Split Train& Test Data

#ML Model:
from sklearn import svm #Support Vector Machine 
from sklearn.linear_model import LogisticRegression

### Data Collection

In [None]:
# Read the training data from CSV into a pandas DataFrame
dataset = pd.read_csv(filepath_or_buffer='dataset/dataset.csv')

In [None]:
# Inspect the type of the object returned by pd.read_csv (a pandas DataFrame).
# Left as a bare expression so the notebook displays the result.
type(dataset)

In [None]:
# Show the dataset dimensions as a (rows, columns) tuple
dataset.shape

In [None]:
# Preview the first 5 rows of the dataset
dataset.head(n=5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max).
# With default arguments describe() covers the numerical columns only;
# categorical columns are left out.
dataset.describe()

### Pre-processing & Cleaning

In [None]:
# Count the missing entries in every column and collect them in a
# one-column table whose column is labelled up front.
missing_data = dataset.isna().sum().to_frame(name='Missing Values in columns')

# Show the per-column counts
print(missing_data)

In [None]:
# Remove the Loan_ID column in place; it is not needed as a feature
dataset.drop(columns=['Loan_ID'], inplace=True)

# Preview the frame to confirm the column is gone
dataset.head()

### Visualization

In [None]:
# Education vs Loan Status:
# bar counts per education level, split by loan outcome, to see how
# likely a Graduate is to get a loan.
sns.countplot(data=dataset, x='Education', hue='Loan_Status')

In [None]:
# Married vs Loan Status:
# bar counts per marital status, split by loan outcome, to see how
# likely a married applicant is to get a loan.
sns.countplot(data=dataset, x='Married', hue='Loan_Status')

In [None]:
# Disabled cell: scatter plot of applicant income against loan amount.
# plt.figure(figsize=(18, 6))
# plt.title("Relation Between Applicant Income vs Loan Amount")

# plt.grid()
# plt.scatter(dataset['ApplicantIncome'], dataset['LoanAmount'], c='k', marker='x')
# plt.xlabel("Applicant Income")
# plt.ylabel("Loan Amount")
# plt.show()

In [None]:
# Label encoding: replace each category string with an integer code so the
# columns become numeric and can be consumed by the ML model.
# Values not listed in a mapping (including missing ones) become NaN.

# Gender -> 1 (Male) / 0 (Female)
dataset['Gender'] = dataset['Gender'].map({'Male': 1, 'Female': 0})

# Marital status -> 1 (Yes) / 0 (No)
dataset['Married'] = dataset['Married'].map({'Yes': 1, 'No': 0})

# Dependents -> 0, 1, 2, 3 ('3+' is collapsed to 3)
dataset['Dependents'] = dataset['Dependents'].map({'0': 0, '1': 1, '2': 2, '3+': 3})

# Education -> 1 (Graduate) / 0 (Not Graduate)
dataset['Education'] = dataset['Education'].map({'Graduate': 1, 'Not Graduate': 0})

# Self-employment -> 1 (Yes) / 0 (No)
dataset['Self_Employed'] = dataset['Self_Employed'].map({'Yes': 1, 'No': 0})

# Property area -> 0 (Rural) / 1 (Semiurban) / 2 (Urban)
dataset['Property_Area'] = dataset['Property_Area'].map({'Rural': 0, 'Semiurban': 1, 'Urban': 2})

# Loan status (the label) -> 1 (Y) / 0 (N)
dataset['Loan_Status'] = dataset['Loan_Status'].map({'Y': 1, 'N': 0})


In [None]:
# Preview the first 5 rows again to check the encoded columns
dataset.head(n=5)

### Filling Missing Data
Missing values can bias the results of machine learning models and/or reduce the accuracy of the model. <br>
If values are missing completely at random, the data sample is likely still representative of the population. <br> 
But if the values are missing systematically, the analysis may be biased.


In [None]:
# Number of missing values per feature
dataset.isna().sum()

In [None]:
from random import randint  # NOTE(review): not used in this cell; kept in case other cells rely on it

# Impute missing values column by column, assigning each result back to the
# DataFrame. Calling `dataset.<col>.fillna(..., inplace=True)` acts on a
# temporary Series: pandas flags it as chained assignment, and with
# Copy-on-Write enabled it leaves `dataset` unchanged.

# Gender: all missing entries receive the same single random draw, 0 or 1
# (the upper bound of np.random.randint is exclusive)
dataset['Gender'] = dataset['Gender'].fillna(np.random.randint(0, 2))

# Married: same approach, one random draw of 0 or 1
dataset['Married'] = dataset['Married'].fillna(np.random.randint(0, 2))

# Dependents: median of the observed values
dataset['Dependents'] = dataset['Dependents'].fillna(dataset['Dependents'].median())

# Self_Employed: one random draw of 0 or 1
dataset['Self_Employed'] = dataset['Self_Employed'].fillna(np.random.randint(0, 2))

# LoanAmount: median of the observed values
dataset['LoanAmount'] = dataset['LoanAmount'].fillna(dataset['LoanAmount'].median())

# Loan_Amount_Term: mean of the observed values
dataset['Loan_Amount_Term'] = dataset['Loan_Amount_Term'].fillna(dataset['Loan_Amount_Term'].mean())

# Credit_History: one random draw of 0 or 1
dataset['Credit_History'] = dataset['Credit_History'].fillna(np.random.randint(0, 2))


# Recheck that no missing values remain
dataset.isnull().sum()

### Separate Data & Label 

In [None]:
# Separate the (still un-split) data into features and label.

# X: independent variables, i.e. every column except Loan_Status
X = dataset.drop(columns=['Loan_Status'])

# Y: dependent variable, i.e. the Loan_Status column
Y = dataset.Loan_Status

In [None]:
# Print the feature table followed by the label column
print(X, Y, sep='\n')

### Splitting Train & Test Data

In [None]:
# Hold out 10% of the rows for evaluation (test_size=0.1); the other 90% are
# used for training.
#   X_train / Y_train : features and Loan_Status labels used to fit the model
#   X_test  / Y_test  : features and labels kept aside to evaluate the model
# stratify=Y keeps the proportion of 0/1 labels the same in both splits.
# random_state fixes the shuffle so the split is reproducible; any integer works.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=2,
    stratify=Y,
)

In [None]:
# Shapes of the full data (100%), the training part (90%) and the test part (10%)
print(*(frame.shape for frame in (X, X_train, X_test)))

### Model Training

In [None]:
# Support Vector Machine classifier with a linear kernel
# (Loan Approved vs Not Approved)
model_1 = svm.SVC(kernel='linear')

# Train on the training features and their labels
model_1.fit(X=X_train, y=Y_train)

### Model Evaluation

In [None]:
# Predict labels for the same rows the model was trained on
X_train_prediction = model_1.predict(X_train)

# accuracy_score expects (y_true, y_pred): true labels first, predictions
# second. Accuracy itself is unaffected by the order, but following the
# documented order keeps this correct if another metric is substituted.
X_train_accuracy = accuracy_score(Y_train, X_train_prediction)

# View score
print('Accuracy of Training Data is: ', X_train_accuracy)

In [None]:
# Predict labels for the held-out test rows
X_test_prediction = model_1.predict(X_test)

# accuracy_score expects (y_true, y_pred): true labels first, predictions
# second. Accuracy itself is unaffected by the order, but following the
# documented order keeps this correct if another metric is substituted.
X_test_accuracy = accuracy_score(Y_test, X_test_prediction)

# View score
print('Accuracy of Test Data is: ', X_test_accuracy)

### Model Prediction

In [None]:
# Feature values for one applicant
# (presumably in the same column order as X; verify against the dataset)
input_data = (0.0, 0.0, 0.0, 1, 0.0, 4230, 0.0, 112.0, 360.0, 1.0, 1)

# predict() needs a 2-D input, so turn the tuple into an array and
# reshape it into a single row
input_data_to_array = np.asarray(input_data)
input_data_reshape = input_data_to_array.reshape(1, -1)

prediction = model_1.predict(input_data_reshape)
print(prediction)


# Loan_Status was encoded as Y -> 1 and N -> 0
print('Loan is not Approved' if prediction[0] == 0 else 'Loan is Approved')

### Saving Model 

In [None]:
import pickle

# Serialize the trained model to disk. The `with` block closes the file even
# if pickle.dump raises, so the handle is never leaked.
with open('./Model/model.pkl', mode="wb") as pickle_out:
    pickle.dump(model_1, pickle_out)

### Loading Model

In [None]:
# Load the pickled model back from the path the "Saving Model" cell writes to
# ('./Model/model.pkl'); a differently-cased './model/' directory is a
# different location on case-sensitive filesystems.
# The `with` block makes sure the file handle is closed after reading.
# NOTE: only unpickle files you created yourself; pickle can execute
# arbitrary code from untrusted data.
with open('./Model/model.pkl', mode="rb") as pickle_in:
    load_model = pickle.load(pickle_in)

### Quick Test Model

In [None]:
# Smoke-test the reloaded model on one hand-written sample of 11 feature values
prediction = load_model.predict(
    [
        [0, 0, 3, 0, 1, 10000, 0, 500000.0, 180, 0, 0],
    ]
)
print(prediction)