## Problem3

Build a Decision Tree & Random Forest model on the fraud data. Treat those who have taxable_income <= 30000 as Risky and others as Good (discretize the taxable income column)

In [34]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

data = pd.read_csv('Fraud_check.csv')
data.isnull().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

In [35]:
data.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


In [36]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


In [37]:
data.shape

(600, 6)

In [38]:
#target variable = risky
data[data['Taxable.Income']<=30000]['Taxable.Income']

10     29732
12     11794
16     24987
19     10987
21     14310
       ...  
544    29916
557    12810
567    16316
575    10735
591    27394
Name: Taxable.Income, Length: 124, dtype: int64

In [39]:
#target variable = good
data[data['Taxable.Income']>30000]['Taxable.Income']

0      68833
1      33700
2      36925
3      50190
4      81002
       ...  
595    76340
596    69967
597    47334
598    98592
599    96519
Name: Taxable.Income, Length: 476, dtype: int64

In [40]:
# We need to discretize Taxable.Income <= 30000 as Risky (0)and others as Good (1)
data['group'] = pd.DataFrame(np.where(data['Taxable.Income'] > 30000, 1, 0))

In [43]:
data.drop(['Urban'],axis=1,inplace=True)
data.sample(10)

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,group
390,YES,Single,64437,57324,17,1
555,NO,Single,31629,53848,10,1
417,NO,Divorced,38183,76625,2,1
80,NO,Married,44947,28184,26,1
173,YES,Single,84835,105110,16,1
174,YES,Single,44795,149857,8,1
453,YES,Single,97341,41504,26,1
37,NO,Married,68513,66912,5,1
462,NO,Divorced,16690,149327,17,0
596,YES,Divorced,69967,55369,2,1


In [44]:
# We need to encode Undergrad and Marital.Status
lb = LabelEncoder()
data["Undergrad"] = lb.fit_transform(data["Undergrad"])
data["Marital.Status"] = lb.fit_transform(data["Marital.Status"])

In [45]:
data.sample(10)

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,group
559,1,0,72026,180424,26,1
380,1,1,36156,197699,2,1
88,0,1,86652,63896,24,1
80,0,1,44947,28184,26,1
203,1,0,56536,117260,4,1
433,0,1,15532,129265,14,0
303,1,0,78796,189061,13,1
244,1,2,68268,154647,27,1
89,1,1,26741,176050,20,0
9,1,0,98152,155482,4,1


In [46]:
data.dtypes

Undergrad          int32
Marital.Status     int32
Taxable.Income     int64
City.Population    int64
Work.Experience    int64
group              int32
dtype: object

In [47]:
# no zero variance
data.var()

Undergrad          2.500167e-01
Marital.Status     6.756149e-01
Taxable.Income     6.866930e+08
City.Population    2.485030e+09
Work.Experience    7.818357e+01
group              1.642293e-01
dtype: float64

In [48]:
colnames = list(data.columns)
predictors = colnames[:5]
target = colnames[5]

In [49]:
predictors

['Undergrad',
 'Marital.Status',
 'Taxable.Income',
 'City.Population',
 'Work.Experience']

In [50]:
target

'group'

In [51]:
# Splitting data into training and testing data set
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size = 0.3)

In [52]:
from sklearn.tree import DecisionTreeClassifier as DT
model = DT(criterion = 'gini')
model.fit(train[predictors], train[target])

DecisionTreeClassifier()

In [53]:
# Prediction on Test Data
preds = model.predict(test[predictors])
pd.crosstab(test[target], preds, rownames=['Actual'], colnames=['Predictions'])


Predictions,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,37,2
1,0,141


In [54]:
np.mean(preds == test[target]) # Test Data Accuracy 

0.9888888888888889

In [55]:
# Print the confusion matrix (alternate way)
from sklearn import metrics
metrics.confusion_matrix(test[target], preds)

array([[ 37,   2],
       [  0, 141]], dtype=int64)

In [56]:
# Print the precision and recall, for  2 classes
print(metrics.classification_report(test[target], preds, digits=2))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97        39
           1       0.99      1.00      0.99       141

    accuracy                           0.99       180
   macro avg       0.99      0.97      0.98       180
weighted avg       0.99      0.99      0.99       180



In [57]:
# Prediction on Train Data
preds = model.predict(train[predictors])
pd.crosstab(train[target], preds, rownames = ['Actual'], colnames = ['Predictions'])

Predictions,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,85,0
1,0,335


In [58]:
np.mean(preds == train[target]) # Train Data Accuracy

1.0

In [59]:
# Print the confusion matrix (alternate way)
from sklearn import metrics
metrics.confusion_matrix(train[target], preds)

array([[ 85,   0],
       [  0, 335]], dtype=int64)

In [60]:
# Print the precision and recall, for  2 classes
print(metrics.classification_report(train[target], preds, digits=2))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        85
           1       1.00      1.00      1.00       335

    accuracy                           1.00       420
   macro avg       1.00      1.00      1.00       420
weighted avg       1.00      1.00      1.00       420



In [61]:
# Building  Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(criterion = 'entropy', random_state = 22)
rfc.fit(train[predictors], train[target])

RandomForestClassifier(criterion='entropy', random_state=22)

In [62]:
# Evaluating on Training set
rfc_pred_train = rfc.predict(train[predictors])
metrics.confusion_matrix(train[target], rfc_pred_train)

array([[ 85,   0],
       [  0, 335]], dtype=int64)

In [63]:
# Print the precision and recall and f1 score, for 2 classes
print(metrics.classification_report(train[target], rfc_pred_train, digits=2))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        85
           1       1.00      1.00      1.00       335

    accuracy                           1.00       420
   macro avg       1.00      1.00      1.00       420
weighted avg       1.00      1.00      1.00       420



In [64]:
# Evaluating on Test set
rfc_pred_test = rfc.predict(test[predictors])
metrics.confusion_matrix(test[target], rfc_pred_test)

array([[ 37,   2],
       [  0, 141]], dtype=int64)

In [65]:
# Print the precision and recall and f1 score, for 2 classes
print(metrics.classification_report(test[target], rfc_pred_test, digits=2))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97        39
           1       0.99      1.00      0.99       141

    accuracy                           0.99       180
   macro avg       0.99      0.97      0.98       180
weighted avg       0.99      0.99      0.99       180

