In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
fraud=pd.read_csv('Fraud_check (1).csv')
fraud.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


In [3]:
fraud["income"]="<=30000"
fraud.loc[fraud["Taxable.Income"]>=30000,"income"]="Good"
fraud.loc[fraud["Taxable.Income"]<=30000,"income"]="Risky"

In [4]:
fraud["income"].unique()

array(['Good', 'Risky'], dtype=object)

In [5]:
fraud["income"].value_counts()

Good     476
Risky    124
Name: income, dtype: int64

In [6]:
### dropping the Taxable.Income columns
fraud = fraud.drop(['Taxable.Income'],axis=1)
fraud.rename(columns={"Marital.Status":"marital","City.Population":"population","Work.Experience":"workexp"},inplace=True)
fraud.isnull().sum()

Undergrad     0
marital       0
population    0
workexp       0
Urban         0
income        0
dtype: int64

No missing values

In [7]:
fraud.head()

Unnamed: 0,Undergrad,marital,population,workexp,Urban,income
0,NO,Single,50047,10,YES,Good
1,YES,Divorced,134075,18,YES,Good
2,NO,Married,160205,30,YES,Good
3,YES,Single,193264,15,YES,Good
4,NO,Married,27533,28,NO,Good


Need to convert  string data into float. Model cannot fit a string data

In [8]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
for column_name in fraud.columns:
    if fraud[column_name].dtype == object:
        fraud[column_name] = le.fit_transform(fraud[column_name])
    else:
        pass

In [9]:
fraud.head()
## converted string data into float

Unnamed: 0,Undergrad,marital,population,workexp,Urban,income
0,0,2,50047,10,1,0
1,1,0,134075,18,1,0
2,0,1,160205,30,1,0
3,1,2,193264,15,1,0
4,0,1,27533,28,0,0


In [10]:
features = fraud.iloc[:,0:5]
labels = pd.DataFrame(fraud.iloc[:,5])

In [11]:
fraud["income"].value_counts()

0    476
1    124
Name: income, dtype: int64

In [12]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features,labels, test_size=0.3, stratify=labels)

In [13]:
print(y_train["income"].value_counts())

0    333
1     87
Name: income, dtype: int64


In [14]:
print(y_test["income"].value_counts())

0    143
1     37
Name: income, dtype: int64


In [15]:
## factorizing some float datatypes (Undergrad,marital,Urban)
fraud["Undergrad"],_ = pd.factorize(fraud["Undergrad"])
fraud["marital"],_ = pd.factorize(fraud["marital"])
fraud["Urban"],_ = pd.factorize(fraud["Urban"])


In [16]:
##Converting the column names into the list format
colnames = list(fraud.columns)
predictors = colnames[:5]
target = colnames[5]

In [17]:
fraud.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   Undergrad   600 non-null    int64
 1   marital     600 non-null    int64
 2   population  600 non-null    int64
 3   workexp     600 non-null    int64
 4   Urban       600 non-null    int64
 5   income      600 non-null    int32
dtypes: int32(1), int64(5)
memory usage: 25.9 KB


In [19]:
###Splitting the data in train and test data
##One of the way to split the data
fraud["is_train"] = np.random.uniform(0,1,len(fraud))<=0.70
fraud["is_train"]
train,test = fraud[fraud["is_train"]==True],fraud[fraud["is_train"]==False]

In [20]:
from sklearn.tree import DecisionTreeClassifier as DT

model = DT(criterion = 'entropy')
model.fit(x_train,y_train)

DecisionTreeClassifier(criterion='entropy')

In [21]:
from sklearn.metrics import accuracy_score
##Prediciton on train data 
pred_train = pd.DataFrame(model.predict(x_train))
pred_train

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,1
...,...
415,0
416,0
417,0
418,0


In [22]:
### Finding the accuracy of train data
acc_train = accuracy_score(y_train,pred_train)
acc_train

1.0

Train data for model have 100% accuracy

In [23]:
##Confusion matrix for train data
from sklearn.metrics import confusion_matrix

In [24]:
cm = pd.DataFrame(confusion_matrix(y_train,pred_train))
cm

Unnamed: 0,0,1
0,333,0
1,0,87


In [25]:
##Prediction on test data
pred_test = pd.DataFrame(model.predict(x_test))
pred_test

Unnamed: 0,0
0,1
1,0
2,0
3,0
4,0
...,...
175,0
176,1
177,0
178,0


In [26]:
acc_test = accuracy_score(y_test,pred_test)
acc_test

0.6333333333333333

Test data for model have 63.3% accuracy 

In [27]:
#confusion matrix for test data
cm_test = confusion_matrix(y_test,pred_test)
cm_test

array([[106,  37],
       [ 29,   8]], dtype=int64)

In [28]:
pip install pydotplus

Note: you may need to restart the kernel to use updated packages.


In [29]:
pip install --upgrade scikit-learn==0.20.3

Collecting scikit-learn==0.20.3
  Using cached scikit-learn-0.20.3.tar.gz (11.8 MB)
Building wheels for collected packages: scikit-learn
  Building wheel for scikit-learn (setup.py): started
  Building wheel for scikit-learn (setup.py): finished with status 'error'
  Running setup.py clean for scikit-learn
Failed to build scikit-learn
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.23.2
    Uninstalling scikit-learn-0.23.2:
      Successfully uninstalled scikit-learn-0.23.2
    Running setup.py install for scikit-learn: started
    Running setup.py install for scikit-learn: finished with status 'error'
  Rolling back uninstall of scikit-learn
  Moving to c:\users\user-pc\anaconda3\lib\site-packages\scikit_learn-0.23.2.dist-info\
   from C:\Users\User-PC\anaconda3\Lib\site-packages\~cikit_learn-0.23.2.dist-info
  Moving to c:\users\user-pc\anaconda3\lib\site-packages\sklearn\
   from C:\Users\User-PC\anacon

  ERROR: Command errored out with exit status 1:
   command: 'C:\Users\User-PC\anaconda3\python.exe' -u -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\User-PC\\AppData\\Local\\Temp\\pip-install-13a7sa1c\\scikit-learn\\setup.py'"'"'; __file__='"'"'C:\\Users\\User-PC\\AppData\\Local\\Temp\\pip-install-13a7sa1c\\scikit-learn\\setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' bdist_wheel -d 'C:\Users\User-PC\AppData\Local\Temp\pip-wheel-fwp2v7b3'
       cwd: C:\Users\User-PC\AppData\Local\Temp\pip-install-13a7sa1c\scikit-learn\
  Complete output (678 lines):
  Partial import of sklearn during the build process.
  blas_opt_info:
  blas_mkl_info:
  No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
  customize MSVCCompiler
    libraries mkl_rt not found in ['C:/Users/User-PC/anaconda3\\Library\\lib']
    NOT A

  creating build\lib.win-amd64-3.8\sklearn\_build_utils
  copying sklearn\_build_utils\__init__.py -> build\lib.win-amd64-3.8\sklearn\_build_utils
  creating build\lib.win-amd64-3.8\sklearn\compose
  copying sklearn\compose\_column_transformer.py -> build\lib.win-amd64-3.8\sklearn\compose
  copying sklearn\compose\_target.py -> build\lib.win-amd64-3.8\sklearn\compose
  copying sklearn\compose\__init__.py -> build\lib.win-amd64-3.8\sklearn\compose
  creating build\lib.win-amd64-3.8\sklearn\compose\tests
  copying sklearn\compose\tests\test_column_transformer.py -> build\lib.win-amd64-3.8\sklearn\compose/tests
  copying sklearn\compose\tests\test_target.py -> build\lib.win-amd64-3.8\sklearn\compose/tests
  copying sklearn\compose\tests\__init__.py -> build\lib.win-amd64-3.8\sklearn\compose/tests
  creating build\lib.win-amd64-3.8\sklearn\covariance
  copying sklearn\covariance\elliptic_envelope.py -> build\lib.win-amd64-3.8\sklearn\covariance
  copying sklearn\covariance\empirical_covari

  copying sklearn\semi_supervised\tests\test_label_propagation.py -> build\lib.win-amd64-3.8\sklearn\semi_supervised/tests
  copying sklearn\semi_supervised\tests\__init__.py -> build\lib.win-amd64-3.8\sklearn\semi_supervised/tests
  creating build\lib.win-amd64-3.8\sklearn\cluster
  copying sklearn\cluster\affinity_propagation_.py -> build\lib.win-amd64-3.8\sklearn\cluster
  copying sklearn\cluster\bicluster.py -> build\lib.win-amd64-3.8\sklearn\cluster
  copying sklearn\cluster\birch.py -> build\lib.win-amd64-3.8\sklearn\cluster
  copying sklearn\cluster\dbscan_.py -> build\lib.win-amd64-3.8\sklearn\cluster
  copying sklearn\cluster\hierarchical.py -> build\lib.win-amd64-3.8\sklearn\cluster
  copying sklearn\cluster\k_means_.py -> build\lib.win-amd64-3.8\sklearn\cluster
  copying sklearn\cluster\mean_shift_.py -> build\lib.win-amd64-3.8\sklearn\cluster
  copying sklearn\cluster\setup.py -> build\lib.win-amd64-3.8\sklearn\cluster
  copying sklearn\cluster\spectral.py -> build\lib.win-

    copying sklearn\ensemble\__init__.py -> build\lib.win-amd64-3.8\sklearn\ensemble
    creating build\lib.win-amd64-3.8\sklearn\ensemble\tests
    copying sklearn\ensemble\tests\test_bagging.py -> build\lib.win-amd64-3.8\sklearn\ensemble\tests
    copying sklearn\ensemble\tests\test_base.py -> build\lib.win-amd64-3.8\sklearn\ensemble\tests
    copying sklearn\ensemble\tests\test_forest.py -> build\lib.win-amd64-3.8\sklearn\ensemble\tests
    copying sklearn\ensemble\tests\test_gradient_boosting.py -> build\lib.win-amd64-3.8\sklearn\ensemble\tests
    copying sklearn\ensemble\tests\test_gradient_boosting_loss_functions.py -> build\lib.win-amd64-3.8\sklearn\ensemble\tests
    copying sklearn\ensemble\tests\test_iforest.py -> build\lib.win-amd64-3.8\sklearn\ensemble\tests
    copying sklearn\ensemble\tests\test_partial_dependence.py -> build\lib.win-amd64-3.8\sklearn\ensemble\tests
    copying sklearn\ensemble\tests\test_voting_classifier.py -> build\lib.win-amd64-3.8\sklearn\ensemble\t

In [30]:
## visualizing the decision tree

import six
import sys
sys.modules['sklearn.externals.six'] = six
from sklearn.tree import export_graphviz
import pydotplus
from sklearn.externals.six import StringIO

In [31]:
dot_data = StringIO()
export_graphviz(model, out_file = dot_data ,filled = True,rounded =True,feature_names = predictors,class_names = target, impurity = False)
export_graphviz

<function sklearn.tree._export.export_graphviz(decision_tree, out_file=None, *, max_depth=None, feature_names=None, class_names=None, label='all', filled=False, leaves_parallel=False, impurity=True, node_ids=False, proportion=False, rotate=False, rounded=False, special_characters=False, precision=3)>

In [32]:
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph

<pydotplus.graphviz.Dot at 0x19e9558f220>

In [33]:
conda install python-graphviz

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [34]:
###PDF file of Decision tree
graph.write_pdf('fraud.pdf')

InvocationException: GraphViz's executables not found

In [None]:
##PNG file of Decision tree
graph.write_png('fraud.png')