### Random Forests - Fraud Data.

* Use Random Forest to build a model on the fraud data,
  treating those who have taxable_income <= 30000 as "Risky" and all others as "Good".


In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [32]:
# Load the fraud-check dataset and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will only
# run where this exact location exists — consider a configurable data directory.
fraud_csv_path = 'D:/DS_Assignments/15_Random_Forests/Fraud_check.csv'
fraud_data = pd.read_csv(fraud_csv_path)
fraud_data.head(20)


Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
5,NO,Divorced,33329,116382,0,NO
6,NO,Divorced,83357,80890,8,YES
7,YES,Single,62774,131253,3,YES
8,NO,Single,83519,102481,12,YES
9,YES,Divorced,98152,155482,4,YES


### Data understanding and Data Preparation

In [33]:
# Dimensions of the loaded frame as (rows, columns).
fraud_data.shape

(600, 6)

In [34]:
# Per-column count of missing (NaN) values.
fraud_data.isna().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

In [35]:
# No NaN values in data.

In [36]:
# Column dtypes, non-null counts and memory usage in a single summary.
fraud_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


In [37]:
# Summary statistics; with default arguments only the numeric columns are described.
fraud_data.describe()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience
count,600.0,600.0,600.0
mean,55208.375,108747.368333,15.558333
std,26204.827597,49850.075134,8.842147
min,10003.0,25779.0,0.0
25%,32871.5,66966.75,8.0
50%,55074.5,106493.5,15.0
75%,78611.75,150114.25,24.0
max,99619.0,199778.0,30.0


In [38]:
# Column dtypes: the object columns are the categorical ones that still need encoding.
fraud_data.dtypes

Undergrad          object
Marital.Status     object
Taxable.Income      int64
City.Population     int64
Work.Experience     int64
Urban              object
dtype: object

### Categorical variables need to be encoded; we use LabelEncoder

In [39]:
# LabelEncoder maps each distinct value of a column to an integer code.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target labels;
# OrdinalEncoder / OneHotEncoder are the feature-side equivalents — confirm that
# using it on feature columns is intentional.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()


In [40]:
# Replace the Undergrad strings with integer codes in place (NO -> 0, YES -> 1).
fraud_data['Undergrad'] = label_encoder.fit_transform(fraud_data['Undergrad'])

In [41]:
# Encode the remaining string columns in place with the shared encoder.
# Resulting codes: Marital.Status Divorced -> 0, Married -> 1, Single -> 2;
# Urban NO -> 0, YES -> 1.
# Each fit_transform call refits the encoder, so afterwards it only remembers
# the classes of the last column processed.
for column in ('Marital.Status', 'Urban'):
    fraud_data[column] = label_encoder.fit_transform(fraud_data[column])

In [42]:
# Preview after encoding: Undergrad, Marital.Status and Urban now hold integer codes.
fraud_data.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,0,2,68833,50047,10,1
1,1,0,33700,134075,18,1
2,0,1,36925,160205,30,1
3,1,2,50190,193264,15,1
4,0,1,81002,27533,28,0


### For Taxable.Income we will use pd.cut to turn it into a categorical variable.
### This is our target variable.

In [44]:
# Re-check for missing values before binning the target column.
fraud_data.isna().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

In [45]:
# Largest observed Taxable.Income; used as the upper edge of the income bins.
# Must run before the binning cell, which overwrites this column.
a = fraud_data['Taxable.Income'].max()

In [46]:
# Bin Taxable.Income into the two target classes required by the problem statement.
# pd.cut intervals are right-inclusive by default: (0, 30000] -> 0 and (30000, a] -> 1.
income_bin_edges = [0, 30000, a]
income_class_labels = [0, 1]
fraud_data['Taxable.Income'] = pd.cut(
    fraud_data['Taxable.Income'],
    bins=income_bin_edges,
    labels=income_class_labels,
)
#0: Risky & 1: Good as per problem statement

In [47]:
# Show the frame after binning Taxable.Income into 0/1 (the rendered output is truncated).
fraud_data

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,0,2,1,50047,10,1
1,1,0,1,134075,18,1
2,0,1,1,160205,30,1
3,1,2,1,193264,15,1
4,0,1,1,27533,28,0
...,...,...,...,...,...,...
595,1,0,1,39492,7,1
596,1,0,1,55369,2,1
597,0,0,1,154058,0,1
598,1,1,1,180083,17,0


In [49]:
# pd.cut yields NaN for values that fall outside the bins, so confirm none were introduced.
fraud_data.isna().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

In [50]:
# First five rows of the prepared frame.
fraud_data.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,0,2,1,50047,10,1
1,1,0,1,134075,18,1
2,0,1,1,160205,30,1
3,1,2,1,193264,15,1
4,0,1,1,27533,28,0


In [51]:
# Dtypes after preparation: Taxable.Income is now a category (the result of pd.cut),
# while every other column is int64.
fraud_data.dtypes

Undergrad             int64
Marital.Status        int64
Taxable.Income     category
City.Population       int64
Work.Experience       int64
Urban                 int64
dtype: object

In [52]:
### For model building we need to separate X and Y.

In [62]:
# Separate the binned target from the predictor columns and confirm both shapes.
target_column = 'Taxable.Income'
Y = fraud_data[target_column]
X = fraud_data.drop(columns=target_column)
X.shape, Y.shape

((600, 5), (600,))

### Data is in required format for modelling.

### Model Building & Training

In [63]:
# Splitting data into training and testing data

In [64]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# The split is not stratified, so the class ratio may differ between the two parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=1,
    test_size=0.20,
)

In [65]:
# Shapes of the training features and training labels after the split.
X_train.shape, Y_train.shape

((480, 5), (480,))

In [66]:
# Cast the categorical 0/1 labels produced by pd.cut to plain integers.
# Both splits are converted so that the labels used for fitting and the labels
# used for scoring share the same dtype (previously only Y_train was cast and
# Y_test stayed a category).
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

### Model building

In [88]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Baseline: a single decision tree with default hyperparameters.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split, so an unseeded tree (and its reported metrics) can change from
# run to run. The seed matches the one used for the split and the forest.
dt_model = DecisionTreeClassifier(random_state=1)
dt_model.fit(X_train, Y_train)

# Random forest of 25 trees, seeded for reproducibility.
# Kept as the last expression so the fitted estimator is displayed.
rf_model = RandomForestClassifier(n_estimators=25, random_state=1)
rf_model.fit(X_train, Y_train)

RandomForestClassifier(n_estimators=25, random_state=1)

### Model Evaluation

### Decision Tree - Metrics for training & test data

In [89]:
# Decision-tree predictions for the training rows and the held-out test rows.
Y_Train_pred, Y_test_pred = (
    dt_model.predict(features) for features in (X_train, X_test)
)

In [90]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Decision-tree report for both splits.
# precision_score / recall_score are called with their default positive label (1),
# which is the "Good" class here; they do not describe how well class 0 ("Risky")
# is detected. Confusion-matrix rows are the true classes, columns the predictions.
dt_banner = '************************************************************************************'
dt_sections = [
    (
        'Decision Tree results for training data.',
        [
            ('Accuracy score for Training data \t:', round(accuracy_score(Y_train, Y_Train_pred),4)),
            ('Precision score for Training data \t:', round(precision_score(Y_train, Y_Train_pred),4)),
            ('recall score for Training data \t\t:', round(recall_score(Y_train, Y_Train_pred),4)),
            ('Confusion Matrix for Training data:\t\t\n', confusion_matrix(Y_train, Y_Train_pred)),
        ],
    ),
    (
        'Decision Tree results for test data.',
        [
            ('Accuracy score for Test data \t\t:', round(accuracy_score(Y_test, Y_test_pred),4)),
            ('Precision score for Test data \t\t:', round(precision_score(Y_test, Y_test_pred),4)),
            ('recall score for Test data \t\t:', round(recall_score(Y_test, Y_test_pred),4)),
            ('Confusion Matrix for Test data \t\t:\n', confusion_matrix(Y_test, Y_test_pred)),
        ],
    ),
]

for section_index, (section_title, section_rows) in enumerate(dt_sections):
    if section_index > 0:
        # Blank gap between the training report and the test report.
        print('\n')
    print(dt_banner)
    print(section_title)
    print('\n')
    for row_label, row_value in section_rows:
        print(row_label, row_value)

************************************************************************************
Decision Tree results for training data.


Accuracy score for Training data 	: 1.0
Precision score for Training data 	: 1.0
recall score for Training data 		: 1.0
Confusion Matrix for Training data:		
 [[101   0]
 [  0 379]]


************************************************************************************
Decision Tree results for test data.


Accuracy score for Test data 		: 0.575
Precision score for Test data 		: 0.7738
recall score for Test data 		: 0.6701
Confusion Matrix for Test data 		:
 [[ 4 19]
 [32 65]]


### Random Forest - Metrics for training & test data

In [91]:
# Random-forest predictions for the training rows and the held-out test rows.
Y_Train_pred_rf, Y_test_pred_rf = (
    rf_model.predict(features) for features in (X_train, X_test)
)

In [92]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Random-forest report for both splits.
# precision_score / recall_score are called with their default positive label (1),
# which is the "Good" class here; they do not describe how well class 0 ("Risky")
# is detected. Confusion-matrix rows are the true classes, columns the predictions.
rf_banner = '************************************************************************************'
rf_sections = [
    (
        'Random Forest results for training data.',
        [
            ('Accuracy score for Training data \t:', round(accuracy_score(Y_train, Y_Train_pred_rf),4)),
            ('Precision score for Training data \t:', round(precision_score(Y_train, Y_Train_pred_rf),4)),
            ('recall score for Training data \t\t:', round(recall_score(Y_train, Y_Train_pred_rf),4)),
            ('Confusion Matrix for Training data:\t\t\n', confusion_matrix(Y_train, Y_Train_pred_rf)),
        ],
    ),
    (
        'Random Forest results for test data.',
        [
            ('Accuracy score for Test data \t\t:', round(accuracy_score(Y_test, Y_test_pred_rf),4)),
            ('Precision score for Test data \t\t:', round(precision_score(Y_test, Y_test_pred_rf),4)),
            ('recall score for Test data \t\t:', round(recall_score(Y_test, Y_test_pred_rf),4)),
            ('Confusion Matrix for Test data \t\t:\n', confusion_matrix(Y_test, Y_test_pred_rf)),
        ],
    ),
]

for section_index, (section_title, section_rows) in enumerate(rf_sections):
    if section_index > 0:
        # Blank gap between the training report and the test report.
        print('\n')
    print(rf_banner)
    print(section_title)
    print('\n')
    for row_label, row_value in section_rows:
        print(row_label, row_value)

************************************************************************************
Random Forest results for training data.


Accuracy score for Training data 	: 0.9938
Precision score for Training data 	: 0.9921
recall score for Training data 		: 1.0
Confusion Matrix for Training data:		
 [[ 98   3]
 [  0 379]]


************************************************************************************
Random Forest results for test data.


Accuracy score for Test data 		: 0.8
Precision score for Test data 		: 0.8174
recall score for Test data 		: 0.9691
Confusion Matrix for Test data 		:
 [[ 2 21]
 [ 3 94]]


In [101]:

# Closing remark comparing the two models on test accuracy.
# NOTE(review): the recorded test confusion matrices show 97 of the 120 test rows
# belong to class 1, so always predicting 1 would already score about 0.808 —
# slightly above the forest's 0.8 — and the forest identifies only 2 of the 23
# class-0 ("Risky") rows. Accuracy alone may overstate the improvement.
closing_note = 'Accuracy is improved in Random Forest model'
print('\n')
print(closing_note)




Accuracy is improved in Random Forest model
