# Code 8
- Random Forest
- Hyperparameter Optimization
- GridsearchCV
- Best Features

## 1/ Import Libraries

In [1]:
#CodeSection1
#from google.colab import drive
#drive.mount('/mntDrive') 

In [2]:
#CodeSection2
import pandas as pd
import numpy as np

## 2/ Import Data

In [3]:
#CodeSection3
train = pd.read_csv('C:/Users/admin/OneDrive/Documents/Data Sciesnce Repository/Machine_Leaning_Projects/Loan Prediction/input/train.csv')
test = pd.read_csv('C:/Users/admin/OneDrive/Documents/Data Sciesnce Repository/Machine_Leaning_Projects/Loan Prediction/input/train.csv')

## 3/ Skipping Few Steps to simplify

## 4/ Preprocessing Improvement

### 4.1/ Identify Numerical and Categorical Features

In [4]:
#CodeSection4
# Identify all Numerical and Categorical features
numerical_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']
categorical_features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']

### 4.2/ Outlier Strategy

In [5]:
#CodeSection5

# Write a loop to do the same
for num_var in numerical_features:
  Q1 = train[num_var].quantile(0.25)
  Q3 = train[num_var].quantile(0.75)

  IQR = Q3-Q1

  Lower_Whisker = Q1 - 1.5*IQR
  Upper_Whisker = Q3 + 1.5*IQR

  train[num_var] = train[num_var].apply(lambda x : Upper_Whisker if x >= Upper_Whisker else x)
  test[num_var] = test[num_var].apply(lambda x : Upper_Whisker if x >= Upper_Whisker else x)

### 4.3/ Import Libraries
- Missing Value Imputation - SimpleImputer
- Preprocessing - StandardScaler, OrdinalEncoder
- Pipeline - make_pipeline, make_column_transformer
- Model - Decision Tree

In [6]:
#CodeSection6

# Import SimpleImputer
from sklearn.impute import SimpleImputer

# Import StandardScaler
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# Make and Compose Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

# Import Random Forest
from sklearn.ensemble import RandomForestClassifier

### 4.4/ Build Pipeline

In [7]:
#CodeSection7
# Create Preprocessor Pipeline
preprocessor = make_column_transformer(
    
    (make_pipeline(
    SimpleImputer(strategy = 'median'), 
    StandardScaler()), numerical_features),
    
    (make_pipeline(
    SimpleImputer(strategy = 'most_frequent'),
    OrdinalEncoder(categories = 'auto')), categorical_features),
)

### 4.5/ Divide Data into X and y

In [8]:
#CodeSection8
X = train.drop(['Loan_Status','Loan_ID'], axis =  1)
y = train['Loan_Status']

### 4.6/ Create Train and Validation Data

In [9]:
#CodeSection9
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size = 0.2, random_state = 5) # also see stratify

## 5/ Build Model and Fit

In [10]:
#CodeSection10
# Create Parameter Grid
parameter = {'randomforestclassifier__n_estimators' : (23, 24, 25, 26, 27, 28),
             'randomforestclassifier__max_depth' : (2,3,4,5,6,7,8,9,10),
             'randomforestclassifier__criterion' : ('gini', 'entropy'),
             'randomforestclassifier__max_features' : ('auto', 'sqrt', 'log2')

}
#(23, 24, 25, 26, 27, 28)
#(2,3,4,5,6,7,8,9,10)
#('gini', 'entropy')
#('auto', 'sqrt', 'log2')

In [11]:
#CodeSection11
# Create Model Pipeline and Initiate Model
# Change max_depth to find which one gives the best accuracy
model = make_pipeline(preprocessor, RandomForestClassifier())

In [12]:
#CodeSection12
# Instead of Fit we so Grid Search (also Random Grid Search)
from sklearn.model_selection import GridSearchCV 
model_search = GridSearchCV(model, param_grid = parameter, verbose = True)

In [None]:
#CodeSection13
# Fit Gridsearch
model_search.fit(X_train,y_train)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


In [None]:
#CodeSection14
# Get best estimator from the Girdsearch
model_search.best_estimator_.named_steps.randomforestclassifier

## 6/ Check Best Features

In [None]:
#CodeSection15
# Get Feature Importance Score
feat_imp = model_search.best_estimator_.named_steps.randomforestclassifier.feature_importances_
feat_imp

In [None]:
#CodeSection16
# Convert to Series with Feature Names
imp_feat=pd.Series(feat_imp,index=X_train.columns.tolist())

In [None]:
#CodeSection17
# List Feature Importance
imp_feat.sort_values(ascending=False) # You also plot the same

## 6/ Check Accuracy of Model on Train Data

In [None]:
#CodeSection18
# Predict on Train Data
y_train_pred = model_search.predict(X_train)
y_val_pred = model_search.predict(X_val)

### We can use a Accuracy Function from Metrics
- Check Train Accuracy

In [None]:
#CodeSection19
# Import metrics library
from sklearn.metrics import accuracy_score

In [None]:
#CodeSection20
# Print Train Accuracy
print(f" Train Accuracy : {accuracy_score(y_train, y_train_pred):0.1%}")
print(f" Validation Accuracy : {accuracy_score(y_val, y_val_pred):0.1%}")

## 7/ Predict and Submission

### Predict on "Test Data"

In [None]:
#CodeSection21
# Get all the X Variables from the Test Dataset
X_test = test.drop(['Loan_ID'], axis =  1)

# Predict on X_test Data ("X_test_prep")
X_test_prep = model_search.predict(X_test)

### Create Submission File

In [None]:
#CodeSection22
submission = pd.DataFrame({
    'Loan_ID' : test['Loan_ID'],
    'Loan_Status' : X_test_prep
})

### Export Submission File

In [None]:
#CodeSection23
submission.to_csv('C:/Users/admin/OneDrive/Documents/Data Sciesnce Repository/Machine_Leaning_Projects/Loan Prediction/output/O8_RF_Best_Features.csv', index = False)

In [None]:
# LB Accuracy : 0.7777