In [1]:
#Importing required libraries
import numpy as np
import pandas as pd
import time
from sklearn.tree import DecisionTreeClassifier

## CART 1

In [2]:

class Node:
    '''This Class will store the data related to the specific Nodes'''
    def __init__(self, feature=None, threshold=None, left=None, right=None,*,value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value
        
    def is_leaf_node(self):
        '''Function to check whether the tree reached to its leaf Node'''
        return self.value is not None


class DecisionTreeClassifier1:
    '''this class will take training data and build the tree using different methods'''
    def __init__(self, min_samples_split=2, max_depth=6, n_features=None):
        self.min_samples_split=min_samples_split
        self.max_depth=max_depth
        self.n_features=n_features
        self.root=None
    
    def fit(self, X, y):
        '''fits the data into a tree structure'''
        self.n_features = X.shape[1] if not self.n_features else min(X.shape[1],self.n_features)
        self.root = self._grow_tree(X, y)
        
    #function to grow the tree from node
    def _grow_tree(self, X, y, depth=0):
        n_samples, n_feats = X.shape
       # print('samples',n_samples,n_feats,self.n_features)
        n_labels = len(np.unique(y))
        
        # check the stopping criteria
        if (depth>=self.max_depth or n_labels<=1 or n_samples<self.min_samples_split):
            return  Node(value = np.bincount(y).argmax())#Node(value=leaf_value)

        feat_idxs = np.random.choice(n_feats, self.n_features, replace=False)
        
        if len(X) <= 2:
            for feat_idx in feat_idxs:
               # print(feat_idx)
                X_column = X[:, feat_idx]
                #print(X_column)
                thresholds = np.unique(X_column)
                if len(thresholds) == 2:
                    best_feature = feat_idx
                    best_thresh = thresholds[0]
        else:
            # find the best split
            best_feature, best_thresh = self._best_split(X, y, feat_idxs)
            
        if best_thresh == None or best_feature == None:
            return Node(value = np.bincount(y).argmax())
        
        # create child nodes
        left_idxs, right_idxs = self._split(X[:, best_feature], best_thresh)
        #print(left_idxs,right_idxs,depth)
        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth+1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth+1)
        return Node(best_feature, best_thresh, left, right)
    
    def gini_index(self, y):
        n_samples = len(y)
        counts = np.unique(y, return_counts=True)[1]
        impurity = 1
        for count in counts:
            prob = count / n_samples
            impurity -= prob ** 2
        return impurity 

    def _best_split(self, X, y, feat_idxs):
        best_gain = 1
        split_idx, split_threshold = None, None
        
        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            thresholds = np.unique(X_column)
            
            for thr in thresholds:
                # calculate the information gain
                gain = self.impurity_gain(y, X_column, thr)

                if gain!=0 and gain < best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_threshold = thr

        return split_idx, split_threshold
    

    def impurity_gain(self, y, X_column, threshold):
        '''Calculates impurity for particular split'''
        # create subsets
        left_subset, right_subset = self._split(X_column, threshold)

        if len(left_subset) == 0 or len(right_subset) == 0:
            return 0
        
        # calculate the gini impurity of each subset
        n = len(y)
        n_left, n_right = len(left_subset), len(right_subset)
        gini_l, gini_r = self.gini_index(y[left_subset]), self.gini_index(y[right_subset])
        
        return (n_left/n)*gini_l + (n_right/n)*gini_r  #returning impurity gain for particular split
        
    #function to split data into two parts
    def _split(self, X, split_thresh):
        '''Splits data based on the threshold value'''
        left_idxs = np.argwhere(X <= split_thresh).flatten()
        right_idxs = np.argwhere(X > split_thresh).flatten()
        return  left_idxs, right_idxs

    def predict(self, X):
        '''Taking each row from test data and makes predictions'''
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        '''Passing over the data points till end point/ leaf node'''
        if node.is_leaf_node():
            return node.value

        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)
        

## CART 2

In [3]:

class Node:
    '''This Class will store the data related to the specific Nodes'''
    def __init__(self, feature=None, threshold=None, left=None, right=None,*,value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value
        
    def is_leaf_node(self):
        '''Function to check whether the tree reached to its leaf Node'''
        return self.value is not None


class DecisionTreeClassifier2:
    '''this class will take training data and build the tree using different methods'''
    def __init__(self, min_samples_split=2, max_depth=6, n_features=100):
        self.min_samples_split=min_samples_split
        self.max_depth=max_depth
        self.n_features=n_features
        self.root=None
    
    def fit(self, X, y):
        '''fits the data into a tree structure'''
        self.n_features = X.shape[1] if not self.n_features else min(X.shape[1],self.n_features)
        self.root = self._grow_tree(X, y)
        
    #function to grow the tree from node
    def _grow_tree(self, X, y, depth=0):
        n_samples, n_feats = X.shape
       # print('samples',n_samples,n_feats,self.n_features)
        n_labels = len(np.unique(y))
        
        # check the stopping criteria
        if (depth>=self.max_depth or n_labels<=1 or n_samples<self.min_samples_split):
            return  Node(value = np.bincount(y).argmax())#Node(value=leaf_value)

        feat_idxs = np.random.choice(n_feats, self.n_features, replace=False)
        
       
       # print('feat_idxs',feat_idxs)
        # find the best split
        best_feature, best_thresh = self._best_split(X, y, feat_idxs)
        if best_thresh == None or best_feature == None:
            return Node(value = np.bincount(y).argmax())
        #print(best_feature,best_thresh)
        
        # create child nodes
        left_idxs, right_idxs = self._split(X[:, best_feature], best_thresh)
        #print(left_idxs,right_idxs,depth)
        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth+1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth+1)
        return Node(best_feature, best_thresh, left, right)
    
    def gini_index(self, y):
        n_samples = len(y)
        counts = np.unique(y, return_counts=True)[1]
        impurity = 1
        for count in counts:
            prob = count / n_samples
            impurity -= prob ** 2
        return impurity 

    def _best_split(self, X, y, feat_idxs):
        best_gain = 1
        split_idx, split_threshold = None, None
        if len(X) <= 2:
            for feat_idx in feat_idxs:
               # print(feat_idx)
                X_column = X[:, feat_idx]
                #print(X_column)
                thresholds = np.unique(X_column)
                if len(thresholds) == 2:
                    best_feature = feat_idx
                    best_thresh = thresholds[0]
                    #print("best_thresh", best_feature,best_thresh)
                    return best_feature, best_thresh

        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            thr = np.unique(X_column)
            
            if len(thr) >= 2 and len(thr) <=5:
                thresholds = thr
            else:
                thresholds = np.unique(X_column.mean())
    
            for thr in thresholds:
                # calculate the information gain
                gain = self.impurity_gain(y, X_column, thr)
                #print(gain)

                if gain!=0 and gain !=1 and gain < best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_threshold = thr
                    #print('thr',split_idx,split_threshold)
        #print(best_gain,split_threshold,split_idx)

        return split_idx, split_threshold
    

    def impurity_gain(self, y, X_column, threshold):
        '''Calculates impurity for particular split'''
        # create subsets
        left_subset, right_subset = self._split(X_column, threshold)

        if len(left_subset) == 0 or len(right_subset) == 0:
            return 0
        
        # calculate the gini impurity of each subset
        #n = len(y)
        #n_left, n_right = len(left_subset), len(right_subset)
        #gini_l, gini_r = self.gini_index(y[left_subset]), self.gini_index(y[right_subset])
        
        #return (n_left/n)*gini_l + (n_right/n)*gini_r  #returning impurity gain for particular split
        return (len(left_subset)/len(y))*self.gini_index(y[left_subset]) + ((len(right_subset)/len(y))*self.gini_index(y[right_subset]))
    
    #function to split data into two parts
    def _split(self, X, split_thresh):
        '''Splits data based on the threshold value'''
        left_idxs = np.argwhere(X <= split_thresh).flatten()
        right_idxs = np.argwhere(X > split_thresh).flatten()
        return  left_idxs, right_idxs

    def predict(self, X):
        '''Taking each row from test data and makes predictions'''
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        '''Passing over the data points till end point/ leaf node'''
        if node.is_leaf_node():
            return node.value

        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)
        

### Function to split our data into train and test

In [4]:
def TrainTestSplit(X,y,test_size,random_state=None):
    np.random.seed(random_state)    #Random Number Generator
    permuted_indices = np.random.permutation(len(X))     #Shuffling indexes  
    test_size = int(test_size * len(X))
    test_indices = permuted_indices[:test_size]
    train_indices = permuted_indices[test_size:]
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    return X_train, X_test, y_train, y_test

### Functions for Evaluation Metrics

In [127]:

def accuracy_score(y_true, y_pred):
    TP = sum(y_true & y_pred)
    FP = sum(1-y_true & y_pred ) 
    TN = sum(1-y_true & 1-y_pred)
    FN = sum(y_true & 1-y_pred)
    return (TP+TN)/(TP+TN+FP+FN)

def precision(y_true,y_pred):
    TP = sum(y_true & y_pred) 
    FP = sum(1-y_true & y_pred )
    
    return (TP/(TP+FP))

def recall(y_true, y_pred):
    TP = sum(y_true & y_pred)
    FN = sum(y_true & 1-y_pred)
    return ((TP)/(TP+FN))

def f1_score(y_true, y_pred):
    Recall = recall(y_true, y_pred)
    Precision = precision(y_true, y_pred)
    
    return 2*(Recall*Precision)/(Recall+Precision)

In [6]:
cart1=DecisionTreeClassifier1()
cart2=DecisionTreeClassifier2()
dt=DecisionTreeClassifier()

## 1. Testing on HR-Employee-Attrition Data

In [7]:
#Loading sample data 
df = pd.read_csv("E:\Downloads\WA_Fn-UseC_-HR-Employee-Attrition.csv")
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [8]:
df.EmployeeCount.value_counts()

1    1470
Name: EmployeeCount, dtype: int64

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

### Split X & y

In [10]:
X = df.drop(columns=['Attrition','EmployeeNumber','EmployeeCount'], axis = 1)
y = df['Attrition']

In [11]:
X

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,94,...,1,80,0,8,0,1,6,4,0,5
1,49,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,61,...,4,80,1,10,3,3,10,7,1,7
2,37,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,92,...,2,80,0,7,3,3,0,0,0,0
3,33,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,56,...,3,80,0,8,3,3,8,7,3,0
4,27,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,40,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,Travel_Frequently,884,Research & Development,23,2,Medical,3,Male,41,...,3,80,1,17,3,3,5,2,0,3
1466,39,Travel_Rarely,613,Research & Development,6,1,Medical,4,Male,42,...,1,80,1,9,5,3,7,7,1,7
1467,27,Travel_Rarely,155,Research & Development,4,3,Life Sciences,2,Male,87,...,2,80,1,6,0,3,6,2,0,3
1468,49,Travel_Frequently,1023,Sales,2,3,Medical,4,Male,63,...,4,80,0,17,3,2,9,6,0,8


In [12]:
# label encoder for target
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y=le.fit_transform(y)

In [13]:
y

array([1, 0, 1, ..., 0, 0, 0])

In [14]:
#Encoding all attributes using get_dummies
X = pd.get_dummies(X, drop_first = True)
X = X.values

In [15]:
X

array([[  41, 1102,    1, ...,    0,    1,    1],
       [  49,  279,    8, ...,    1,    0,    0],
       [  37, 1373,    2, ...,    0,    1,    1],
       ...,
       [  27,  155,    4, ...,    1,    0,    1],
       [  49, 1023,    2, ...,    1,    0,    0],
       [  34,  628,    8, ...,    1,    0,    0]], dtype=int64)

### Train test splitting

In [16]:
#train_test_split
x_train, x_test, y_train, y_test = TrainTestSplit(X,y,test_size = 0.25,random_state = 1)

In [17]:
models={
    "CART_Classifier1":cart1,
    "CART_Classifier2":cart2,
    "DecisionTreeClassifier":dt
}

In [18]:
#model building and evaluation
for name, model in models.items():
    print("***********************************",name,"******************************************")
    start = time.time()
    model.fit(x_train,y_train)
    pred = model.predict(x_test)
    print("Accuracy :", accuracy_score(y_test,pred))
    print("Precision :", precision(y_test,pred))
    print("Recall :",recall(y_test,pred))
    print("F1_Score :",f1_score(y_test,pred))
    print('Time-----------------',time.time()-start)

*********************************** CART_Classifier1 ******************************************
Accuracy : 0.8038147138964578
Precision : 0.4523809523809524
Recall : 0.27941176470588236
F1_Score : 0.3454545454545454
Time----------------- 2.724505662918091
*********************************** CART_Classifier2 ******************************************
Accuracy : 0.8256130790190735
Precision : 0.5555555555555556
Recall : 0.29411764705882354
F1_Score : 0.3846153846153846
Time----------------- 0.39900684356689453
*********************************** DecisionTreeClassifier ******************************************
Accuracy : 0.7874659400544959
Precision : 0.41935483870967744
Recall : 0.38235294117647056
F1_Score : 0.39999999999999997
Time----------------- 0.014961957931518555


### Earlier Results

<img src="1.Comparison with Sklearn with maxdepth10.PNG" align="left" width = 500/>

## 2. Testing on Cancer Data

In [177]:
#Load the data set
df1 = pd.read_csv("cancer.csv")
df1

Unnamed: 0,"diagnosis(1=m, 0=b)",radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,1,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,1,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,1,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,1,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,1,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,1,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [178]:
#Check for missing values
df1.isna().sum()

diagnosis(1=m, 0=b)        0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

In [179]:
#check the shape of data
df1.shape

(569, 31)

### Split data into features and target

In [180]:
#selecting only the attribute columns
X = df1.iloc[:,1:].values
#selecting only the target column
y = df1.iloc[:,0].values

In [181]:
x_train, x_test, y_train, y_test = TrainTestSplit(X,y,test_size = 0.25,random_state = 1)

In [190]:
#model building and evaluation
for name, model in models.items():
    print("***********************************",name,"******************************************")
    start = time.time()
    model.fit(x_train,y_train)
    pred = model.predict(x_test)
    print("Accuracy :", accuracy_score(y_test,pred))
    print("Precision :", precision(y_test,pred))
    print("Recall :",recall(y_test,pred))
    print("F1_Score :",f1_score(y_test,pred))
    print('Time-----------------',time.time()-start)

*********************************** CART_Classifier1 ******************************************
Accuracy : 0.9295774647887324
Precision : 0.8947368421052632
Recall : 0.9272727272727272
F1_Score : 0.9107142857142856
Time----------------- 4.276276350021362
*********************************** CART_Classifier2 ******************************************
Accuracy : 0.9295774647887324
Precision : 0.9245283018867925
Recall : 0.8909090909090909
F1_Score : 0.9074074074074073
Time----------------- 0.07810783386230469
*********************************** DecisionTreeClassifier ******************************************
Accuracy : 0.9436619718309859
Precision : 0.9607843137254902
Recall : 0.8909090909090909
F1_Score : 0.9245283018867925
Time----------------- 0.015598535537719727


### Earlier Results

<img src="2.Comparison with Sklearn with maxdepth10.PNG" align="left" width = 500/>

## 3. Testing on Insurance Fraud Claims Detection Data

In [25]:
#Load the data set
df2 = pd.read_csv("insurance_claims.csv")
df2

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported,_c39
0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,466132,...,YES,71610,6510,13020,52080,Saab,92x,2004,Y,
1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,468176,...,?,5070,780,780,3510,Mercedes,E400,2007,Y,
2,134,29,687698,2000-09-06,OH,100/300,2000,1413.14,5000000,430632,...,NO,34650,7700,3850,23100,Dodge,RAM,2007,N,
3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,608117,...,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y,
4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,610706,...,NO,6500,1300,650,4550,Accura,RSX,2009,N,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,3,38,941851,1991-07-16,OH,500/1000,1000,1310.80,0,431289,...,?,87200,17440,8720,61040,Honda,Accord,2006,N,
996,285,41,186934,2014-01-05,IL,100/300,1000,1436.79,0,608177,...,?,108480,18080,18080,72320,Volkswagen,Passat,2015,N,
997,130,34,918516,2003-02-17,OH,250/500,500,1383.49,3000000,442797,...,YES,67500,7500,7500,52500,Suburu,Impreza,1996,N,
998,458,62,533940,2011-11-18,IL,500/1000,2000,1356.92,5000000,441714,...,YES,46980,5220,5220,36540,Audi,A5,1998,N,


In [26]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 40 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   months_as_customer           1000 non-null   int64  
 1   age                          1000 non-null   int64  
 2   policy_number                1000 non-null   int64  
 3   policy_bind_date             1000 non-null   object 
 4   policy_state                 1000 non-null   object 
 5   policy_csl                   1000 non-null   object 
 6   policy_deductable            1000 non-null   int64  
 7   policy_annual_premium        1000 non-null   float64
 8   umbrella_limit               1000 non-null   int64  
 9   insured_zip                  1000 non-null   int64  
 10  insured_sex                  1000 non-null   object 
 11  insured_education_level      1000 non-null   object 
 12  insured_occupation           1000 non-null   object 
 13  insured_hobbies    

In [27]:
#Lets check which columns contains '?'
df2[df2.columns[(df2 == '?').any()]]#.nunique()

Unnamed: 0,collision_type,property_damage,police_report_available
0,Side Collision,YES,YES
1,?,?,?
2,Rear Collision,NO,NO
3,Front Collision,?,NO
4,?,NO,NO
...,...,...,...
995,Front Collision,YES,?
996,Rear Collision,YES,?
997,Side Collision,?,YES
998,Rear Collision,?,YES


### Data Processing

In [28]:
#lets replace the categorical data in columns property_damage and police_report_available with suitable number
# I am replacing '?' with most frequent number of that column
df2['property_damage'].replace('YES', 1, inplace=True)
df2['property_damage'].replace('NO', 0, inplace=True)
df2['property_damage'].replace('?', 0, inplace=True)
df2['police_report_available'].replace('YES', 1, inplace=True)
df2['police_report_available'].replace('NO',0, inplace=True)
df2['police_report_available'].replace('?', 0, inplace=True)
#lets replace label data with 0 & 1
df2['fraud_reported'].replace('Y', 1, inplace = True)
df2['fraud_reported'].replace('N', 0, inplace = True)

#Lets use policy_csl and make two different columns
df2['csl_per_person'] = df2.policy_csl.str.split('/', expand=True)[0]
df2['csl_per_accident'] = df2.policy_csl.str.split('/', expand=True)[1]

#lets Derive the age of the vehicle based on the year value by assuming the data is collected in the year 2018
df2['vehicle_age'] = 2023 - df2['auto_year']

In [29]:
#lets drop unwanted columns
df2.drop(columns = ['policy_number','policy_bind_date','_c39','insured_zip','policy_csl',"auto_year","incident_date",'umbrella_limit'], inplace = True)

In [30]:
df2.head()

Unnamed: 0,months_as_customer,age,policy_state,policy_deductable,policy_annual_premium,insured_sex,insured_education_level,insured_occupation,insured_hobbies,insured_relationship,...,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,fraud_reported,csl_per_person,csl_per_accident,vehicle_age
0,328,48,OH,1000,1406.91,MALE,MD,craft-repair,sleeping,husband,...,71610,6510,13020,52080,Saab,92x,1,250,500,19
1,228,42,IN,2000,1197.22,MALE,MD,machine-op-inspct,reading,other-relative,...,5070,780,780,3510,Mercedes,E400,1,250,500,16
2,134,29,OH,2000,1413.14,FEMALE,PhD,sales,board-games,own-child,...,34650,7700,3850,23100,Dodge,RAM,0,100,300,16
3,256,41,IL,2000,1415.74,FEMALE,PhD,armed-forces,board-games,unmarried,...,63400,6340,6340,50720,Chevrolet,Tahoe,1,250,500,9
4,228,44,IL,1000,1583.91,MALE,Associate,sales,board-games,unmarried,...,6500,1300,650,4550,Accura,RSX,0,500,1000,14


### Splitting data into X & y

In [31]:
# lets saperate label and features
X = df2.drop(columns = 'fraud_reported')
y = df2['fraud_reported'].values

In [32]:
X

Unnamed: 0,months_as_customer,age,policy_state,policy_deductable,policy_annual_premium,insured_sex,insured_education_level,insured_occupation,insured_hobbies,insured_relationship,...,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,csl_per_person,csl_per_accident,vehicle_age
0,328,48,OH,1000,1406.91,MALE,MD,craft-repair,sleeping,husband,...,1,71610,6510,13020,52080,Saab,92x,250,500,19
1,228,42,IN,2000,1197.22,MALE,MD,machine-op-inspct,reading,other-relative,...,0,5070,780,780,3510,Mercedes,E400,250,500,16
2,134,29,OH,2000,1413.14,FEMALE,PhD,sales,board-games,own-child,...,0,34650,7700,3850,23100,Dodge,RAM,100,300,16
3,256,41,IL,2000,1415.74,FEMALE,PhD,armed-forces,board-games,unmarried,...,0,63400,6340,6340,50720,Chevrolet,Tahoe,250,500,9
4,228,44,IL,1000,1583.91,MALE,Associate,sales,board-games,unmarried,...,0,6500,1300,650,4550,Accura,RSX,500,1000,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,3,38,OH,1000,1310.80,FEMALE,Masters,craft-repair,paintball,unmarried,...,0,87200,17440,8720,61040,Honda,Accord,500,1000,17
996,285,41,IL,1000,1436.79,FEMALE,PhD,prof-specialty,sleeping,wife,...,0,108480,18080,18080,72320,Volkswagen,Passat,100,300,8
997,130,34,OH,500,1383.49,FEMALE,Masters,armed-forces,bungie-jumping,other-relative,...,1,67500,7500,7500,52500,Suburu,Impreza,250,500,27
998,458,62,IL,2000,1356.92,MALE,Associate,handlers-cleaners,base-jumping,wife,...,1,46980,5220,5220,36540,Audi,A5,500,1000,25


In [33]:
X = pd.get_dummies(X, drop_first = True)
X = X.values

In [34]:
X.shape

(1000, 1142)

In [160]:
#train_test_split
x_train, x_test, y_train, y_test = TrainTestSplit(X,y,test_size = 0.25,random_state = 42)

## CART1

In [165]:
start = time.time()
cart1.fit(x_train,y_train)
pred = cart1.predict(x_test)
print("Accuracy :", accuracy_score(y_test,pred))
print("Precision :", precision(y_test,pred))
print("Recall :",recall(y_test,pred))
print("F1_Score :",f1_score(y_test,pred))
print('Time-----------------',time.time()-start)

Accuracy : 0.772
Precision : 0.75
Recall : 0.22388059701492538
F1_Score : 0.3448275862068966
Time----------------- 0.12777209281921387


## CART2

In [174]:
start = time.time()
cart2.fit(x_train,y_train)
pred = cart2.predict(x_test)
print("Accuracy :", accuracy_score(y_test,pred))
print("Precision :", precision(y_test,pred))
print("Recall :",recall(y_test,pred))
print("F1_Score :",f1_score(y_test,pred))
print('Time-----------------',time.time()-start)

Accuracy : 0.772
Precision : 0.7777777777777778
Recall : 0.208955223880597
F1_Score : 0.32941176470588235
Time----------------- 0.09946823120117188


### Earlier Results

<img src="3.Comparison with Sklearn with maxdepth10.PNG" align="left" width = 700/>

## DecisionTreeClassifier

In [175]:
start = time.time()
dt.fit(x_train,y_train)
pred = dt.predict(x_test)
print("Accuracy :", accuracy_score(y_test,pred))
print("Precision :", precision(y_test,pred))
print("Recall :",recall(y_test,pred))
print("F1_Score :",f1_score(y_test,pred))
print('Time-----------------',time.time()-start)

Accuracy : 0.788
Precision : 0.6346153846153846
Recall : 0.4925373134328358
F1_Score : 0.5546218487394958
Time----------------- 0.06582379341125488


In [66]:
dt = DecisionTreeClassifier()