## 1. Importing libraries

In [8]:
# Import the core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler  # feature scaling
from sklearn.model_selection import train_test_split  # train/test splitting
from sklearn.tree import DecisionTreeClassifier  # decision tree object
from sklearn.metrics import classification_report, confusion_matrix  # for checking test results

from sklearn.ensemble import RandomForestClassifier  # random forest object

import warnings
# Ignore every warning category for the rest of the session, so warnings
# raised by later cells are not shown.
warnings.filterwarnings('ignore')

In [9]:
# Read the data set from 'Fraud_check.csv' (path is relative to the working directory).
data_raw = pd.read_csv('Fraud_check.csv')
# Bare last expression so the notebook renders the loaded frame.
data_raw

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [10]:
# Build the classification target: replace the numeric 'Taxable.Income'
# column with a label, 'Risky' when income <= threshold and 'Good' otherwise.
RISKY_INCOME_THRESHOLD = 30000

# Work on a copy so data_raw keeps the original numeric incomes.
data = data_raw.copy()

# Assign the whole column in one vectorised step. Chained indexing such as
# data['Taxable.Income'][i] = ... writes through a temporary object, which
# raises SettingWithCopyWarning and does not update `data` at all when pandas
# Copy-on-Write is active; it also depends on the index being 0..n-1.
data['Taxable.Income'] = np.where(
    data_raw['Taxable.Income'] <= RISKY_INCOME_THRESHOLD, 'Risky', 'Good'
)
data

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,Good,50047,10,YES
1,YES,Divorced,Good,134075,18,YES
2,NO,Married,Good,160205,30,YES
3,YES,Single,Good,193264,15,YES
4,NO,Married,Good,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,Good,39492,7,YES
596,YES,Divorced,Good,55369,2,YES
597,NO,Divorced,Good,154058,0,YES
598,YES,Married,Good,180083,17,NO


In [11]:
# Target vector: the 'Good' / 'Risky' label column.
y = data['Taxable.Income']

# Integer codes used for the categorical predictor columns.
yes_no_codes = {'NO' : 0, 'YES' : 1}
marital_codes = {'Single' : 0, 'Married' : 1, 'Divorced' : 2}

# Predictors: every column except the target, with the three categorical
# columns replaced by their integer codes.
X = data.drop(columns='Taxable.Income')
for column, codes in (('Undergrad', yes_no_codes),
                      ('Marital.Status', marital_codes),
                      ('Urban', yes_no_codes)):
    X[column] = X[column].map(codes)
X

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban
0,0,0,50047,10,1
1,1,2,134075,18,1
2,0,1,160205,30,1
3,1,0,193264,15,1
4,0,1,27533,28,0
...,...,...,...,...,...
595,1,2,39492,7,1
596,1,2,55369,2,1
597,0,2,154058,0,1
598,1,1,180083,17,0


In [14]:
# Standardise every predictor column. The scaler is fitted on the full
# feature matrix X. StandardScaler is already imported in the first cell.
feature_scaler = StandardScaler()
scaled_X = feature_scaler.fit_transform(X)
scaled_X

array([[-1.040833  , -1.16079994, -1.17852072, -0.62914254,  0.99335541],
       [ 0.96076892,  1.27444469,  0.50850002,  0.27636996,  0.99335541],
       [-1.040833  ,  0.05682237,  1.03310911,  1.63463872,  0.99335541],
       ...,
       [-1.040833  ,  1.27444469,  0.90969648, -1.76103318,  0.99335541],
       [ 0.96076892,  0.05682237,  1.43219749,  0.1631809 , -1.00668904],
       [-1.040833  ,  1.27444469,  0.9915901 ,  0.04999184, -1.00668904]])

In [15]:
# Split the scaled features and labels into training and testing sets:
# 20% of the rows are held out, and random_state pins the shuffle so the
# split is repeatable. train_test_split is already imported in the first cell.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_X,
    y,
    test_size=0.2,
    random_state=40,
)

In [16]:
# Display the scaled training feature matrix.
X_train

array([[ 0.96076892,  1.27444469, -0.49419999, -1.64784411, -1.00668904],
       [-1.040833  ,  0.05682237, -0.44366647,  0.04999184,  0.99335541],
       [ 0.96076892,  0.05682237,  0.07702762,  1.06869341, -1.00668904],
       ...,
       [ 0.96076892, -1.16079994,  0.45184305, -1.42146599,  0.99335541],
       [ 0.96076892,  1.27444469, -1.64496706, -0.74233161, -1.00668904],
       [ 0.96076892,  1.27444469,  1.12939786,  0.38955903,  0.99335541]])

In [17]:
# Training data: shapes of the feature matrix and of the label vector
X_train.shape,y_train.shape

((480, 5), (480,))

In [18]:
# Test data: shapes of the feature matrix and of the label vector
X_test.shape,y_test.shape

((120, 5), (120,))

### 6. Model Training

In [19]:
# Ignore all warnings; the first cell of the notebook already installs this
# same filter, so this cell repeats it.
import warnings
warnings.filterwarnings('ignore')

In [20]:
# Fit a random forest of 100 trees, each limited to depth 5, on the training
# split. RandomForestClassifier is already imported in the first cell.
# random_state is fixed because the forest draws bootstrap samples and feature
# subsets at random: without a seed the fitted model, and every score computed
# from it, changes from one run of the notebook to the next.
rf_model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=40)
rf_model.fit(X_train, y_train)


RandomForestClassifier(max_depth=5)

### GridSearch CV

In [21]:
from sklearn.model_selection import GridSearchCV

In [22]:
# Hyper-parameter search with 5-fold cross-validation on the training split:
# every combination of tree depth and split criterion below is tried.
search_space = {
    'max_depth': [4,5,6,7,8],
    'criterion': ['gini','entropy'],
}
grid_search = GridSearchCV(estimator=rf_model, param_grid=search_space, cv=5)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(max_depth=5),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8]})

In [23]:
# Parameter combination with the best mean cross-validated score.
grid_search.best_params_

{'criterion': 'gini', 'max_depth': 4}

In [24]:
# Mean cross-validated score achieved by the best parameter combination.
grid_search.best_score_

0.8

### K-Fold CV

In [25]:
from sklearn.model_selection import cross_val_score,KFold  # KFold is imported but not used in this cell

# 5-fold cross-validation of the forest configuration. Note that the scores
# are computed on the full, unscaled X and y, not on the training split.
cv_scores = cross_val_score(rf_model, X, y, cv=5)
mean_accuracy = round(cv_scores.mean(),4)
std_deviation = round(cv_scores.std(),4)

print('5 Fold CV Scores : ',cv_scores)
print('Mean Accuracy    : ',mean_accuracy)
print('STD Deviation    : ',std_deviation)

5 Fold CV Scores :  [0.8        0.79166667 0.79166667 0.79166667 0.79166667]
Mean Accuracy    :  0.7933
STD Deviation    :  0.0033


### 7. Model Testing

#### Training data

In [26]:
# Predict labels for the rows the model was trained on (seen data).
y_train_pred = rf_model.predict(X=X_train)

#### Test data

In [27]:
# Predict labels for the held-out rows (unseen data).
y_test_pred = rf_model.predict(X=X_test)

### 8. Model Evaluation

In [28]:
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,classification_report

#### Training data

In [29]:
# Fraction of training rows whose predicted label equals the true label.
accuracy_score(y_true=y_train, y_pred=y_train_pred)

0.8041666666666667

In [30]:
# Confusion matrix on the training split (rows: true labels, columns: predicted labels).
confusion_matrix(y_true=y_train, y_pred=y_train_pred)

array([[384,   0],
       [ 94,   2]], dtype=int64)

In [31]:
# Per-class precision on the training split, averaged with class-support weights.
precision_score(y_true=y_train, y_pred=y_train_pred, average='weighted')

0.8426778242677825

In [32]:
# Per-class recall on the training split, averaged with class-support weights.
recall_score(y_true=y_train, y_pred=y_train_pred, average='weighted')

0.8041666666666667

In [33]:
# Per-class precision / recall / F1 summary for the training split.
train_report = classification_report(y_true=y_train, y_pred=y_train_pred)
print(train_report)

              precision    recall  f1-score   support

        Good       0.80      1.00      0.89       384
       Risky       1.00      0.02      0.04        96

    accuracy                           0.80       480
   macro avg       0.90      0.51      0.47       480
weighted avg       0.84      0.80      0.72       480



#### Test data

In [35]:
# Fraction of held-out rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.7666666666666667

In [36]:
# Confusion matrix on the held-out split (rows: true labels, columns: predicted labels).
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

array([[92,  0],
       [28,  0]], dtype=int64)

In [37]:
# Support-weighted precision on the held-out split.
# A class that is never predicted has an undefined precision (0 / 0).
# zero_division=0 states explicitly that such a class counts as 0, instead of
# relying on the default behaviour whose warning is silenced by the global
# warnings filter.
precision_score(y_test, y_test_pred, average='weighted', zero_division=0)

0.5877777777777777

In [38]:
# Per-class recall on the held-out split, averaged with class-support weights.
recall_score(y_true=y_test, y_pred=y_test_pred, average='weighted')

0.7666666666666667

### 9. Model Deployment

In [39]:
from pickle import dump

In [40]:
# Pickling/serialization: write the fitted model to 'rf_model.pkl'.
# The with-block closes the file handle as soon as the dump finishes, so the
# bytes are flushed to disk before the file is opened again for reading.
with open('rf_model.pkl', 'wb') as model_file:  # write bytes
    dump(rf_model, model_file)

In [41]:
from pickle import load

In [42]:
# Unpickling/deserialization: read the model back from 'rf_model.pkl'.
# The with-block closes the file handle once the model has been loaded.
# Only unpickle files from a trusted source: loading a pickle can execute
# arbitrary code.
with open('rf_model.pkl', 'rb') as model_file:  # read bytes
    rf_model_pickle = load(model_file)

In [43]:
# Predict the held-out rows with the model restored from the pickle file.
pickle_pred = rf_model_pickle.predict(X=X_test)

### ==================================================================================