In [1]:
import pandas as pd 
import numpy as np 
import os
from DecisionTree import DecisionTree, treeNode
from pathlib import Path

def error_rate(predy,y):
    if np.reshape(predy,(-1)).shape != np.reshape(y,(-1)).shape:
        raise ValueError("The sample size are not equal.")
    return np.mean(np.reshape(predy,(-1,1))!=np.reshape(y,(-1,1)))


Car dataset. Include only categorical columns.

In [2]:
data_path = Path('./data/car')
colnames = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'label']
training_path = data_path/'train.csv'
test_path = data_path/'test.csv'

train_data = pd.read_csv(training_path, header=None, names=colnames) 
test_data = pd.read_csv(test_path, header=None, names=colnames) 
train_data.head() #viewing some row of the dataset

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,label
0,low,vhigh,4,4,big,med,acc
1,low,high,5more,4,med,high,vgood
2,vhigh,med,2,2,big,high,unacc
3,high,high,2,2,small,high,unacc
4,vhigh,low,3,2,big,low,unacc


In [3]:
column = train_data.columns.to_numpy()[:-1]
x = train_data[column].to_numpy()
y = train_data.label.to_numpy()

In [4]:
#training_error = np.zeros(6,3)
#testing_error = np.zeros(6,3)
error_table = np.zeros((6,6))
for i, criterion in enumerate(["entropy", "gini", "me"]):
    for max_depth in range(1,7): #should be not larger than column.shape[0], DecisionTree will take minimum of {column.shape[0], max_depth}.
        mytree = DecisionTree(trainx = x, trainy = y, column = column, criterion = criterion, max_depth = max_depth, entropy_base = 6)
        mytree.fit()
        predy = mytree.predict(train_data.to_numpy())
        error_table[max_depth-1, 2*i] = error_rate(predy,train_data.label.to_numpy())
        #print("training error for max_depth = {} is {}".format(max_depth, error_table[max_depth-1, 2*i]))
        predy = mytree.predict(test_data.to_numpy())
        error_table[max_depth-1, 2*i+1] = error_rate(predy,test_data.label.to_numpy())
        #print("testing error for max_depth = {} is {}".format(max_depth, error_table[max_depth-1, 2*i+1]))

print(error_table)

[[0.302      0.2967033  0.302      0.2967033  0.302      0.2967033 ]
 [0.286      0.29807692 0.286      0.29807692 0.292      0.31318681]
 [0.25       0.2706044  0.245      0.25824176 0.247      0.2706044 ]
 [0.233      0.29258242 0.236      0.26236264 0.228      0.27472527]
 [0.21       0.26098901 0.212      0.24450549 0.202      0.25412088]
 [0.2        0.26098901 0.202      0.24450549 0.19       0.25412088]]


Bank dataset. Include both categorical and numerical columns.

In [5]:
data_path = Path('./data/bank')
colnames = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 
          'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']
training_path = data_path/'train.csv'
test_path = data_path/'test.csv'

train_data = pd.read_csv(training_path, header=None, names=colnames) 
test_data = pd.read_csv(test_path, header=None, names=colnames) 

train_data.head() #viewing some row of the dataset

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,41,services,married,secondary,no,0,yes,no,unknown,5,may,114,2,-1,0,unknown,no
1,48,blue-collar,single,secondary,no,312,yes,yes,cellular,3,feb,369,2,-1,0,unknown,no
2,55,technician,married,secondary,no,1938,no,yes,cellular,18,aug,193,1,386,3,success,yes
3,54,admin.,married,tertiary,no,59,yes,no,cellular,10,jul,268,1,-1,0,unknown,no
4,34,management,single,tertiary,no,2646,no,no,cellular,14,apr,142,1,-1,0,unknown,yes


In [6]:
thresholds = train_data[["age", "balance", "day", "duration", "campaign", "pdays", "previous"]].median()
#thain and test should use the same thresholds.
#consider unknown as a catagory
def bank_preprocessing(df):
    #for col in ["default", "housing", "loan", "y"]:
        #df.loc[df[col] == "yes", col] = 1
        #df.loc[df[col] == "no", col] = 0

    month_map = {"jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6, "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12}
    df.month = df.month.map(month_map)
    #numeric: age balance day duration campaign pdays(-1 means client was not previously contacted) previous
    for col in ["age", "balance", "day", "duration", "campaign", "pdays", "previous"]:
        df.loc[df[col] <= thresholds[col], col] = 0
        df.loc[df[col] > thresholds[col], col] = 1
        df[col] = df[col].map({0: "low", 1: "high"})

    return df

train_data = bank_preprocessing(train_data)
test_data = bank_preprocessing(test_data)
train_data.head() #viewing some row of the dataset

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,high,services,married,secondary,no,low,yes,no,unknown,low,5,low,low,high,low,unknown,no
1,high,blue-collar,single,secondary,no,low,yes,yes,cellular,low,2,high,low,high,low,unknown,no
2,high,technician,married,secondary,no,high,no,yes,cellular,high,8,high,low,high,high,success,yes
3,high,admin.,married,tertiary,no,low,yes,no,cellular,low,7,high,low,high,low,unknown,no
4,low,management,single,tertiary,no,high,no,no,cellular,low,4,low,low,high,low,unknown,yes


In [7]:
column = train_data.columns.to_numpy()[:-1]
x = train_data[column].to_numpy()
y = train_data.y.to_numpy()

In [8]:
#training_error = np.zeros(6,3)
#testing_error = np.zeros(6,3)
error_table = np.zeros((16,6))
for i, criterion in enumerate(["entropy", "gini", "me"]):
    for max_depth in range(1,17): #should be not larger than column.shape[0], DecisionTree will take minimum of {column.shape[0], max_depth}.
        mytree = DecisionTree(trainx = x, trainy = y, column = column, criterion = criterion, max_depth = max_depth, entropy_base = 6)
        mytree.fit()
        predy = mytree.predict(train_data.to_numpy())
        error_table[max_depth-1, 2*i] = error_rate(predy,train_data.y.to_numpy())
        #print("training error for max_depth = {} is {}".format(max_depth, error_table[max_depth-1, 2*i]))
        predy = mytree.predict(test_data.to_numpy())
        error_table[max_depth-1, 2*i+1] = error_rate(predy,test_data.y.to_numpy())
        #print("testing error for max_depth = {} is {}".format(max_depth, error_table[max_depth-1, 2*i+1]))

print(error_table)

[[0.1192 0.1248 0.1088 0.1166 0.1088 0.1166]
 [0.1192 0.1248 0.1042 0.1088 0.1042 0.1088]
 [0.1176 0.1262 0.098  0.1112 0.0978 0.1112]
 [0.1134 0.1294 0.0916 0.113  0.0918 0.1118]
 [0.1114 0.1302 0.0892 0.1142 0.0882 0.113 ]
 [0.111  0.1302 0.0886 0.1146 0.0878 0.1136]
 [0.111  0.1302 0.0886 0.1146 0.0878 0.1136]
 [0.111  0.1302 0.0886 0.1146 0.0878 0.1136]
 [0.1108 0.1304 0.0886 0.1146 0.0878 0.1136]
 [0.1108 0.1304 0.0884 0.115  0.0878 0.1136]
 [0.1108 0.1304 0.0884 0.115  0.0878 0.1136]
 [0.1108 0.1304 0.0884 0.115  0.0878 0.1136]
 [0.1108 0.1304 0.0884 0.115  0.0878 0.1136]
 [0.1108 0.1304 0.0884 0.115  0.0878 0.1136]
 [0.1108 0.1304 0.0884 0.115  0.0878 0.1136]
 [0.1108 0.1304 0.0884 0.115  0.0878 0.1136]]


In [9]:
train_data[["job", "education", "contact", "poutcome"]].mode()

Unnamed: 0,job,education,contact,poutcome
0,blue-collar,secondary,cellular,unknown


In [10]:
value, counts = np.unique(train_data.poutcome.to_numpy(), return_counts = True)
print(value, counts)

['failure' 'other' 'success' 'unknown'] [ 508  205  152 4135]


In [11]:
#replace unknown by most frequent value

def fill_unknown(df):
    fill_value = ["blue-collar", "secondary", "cellular", "failure"]
    #numeric: age balance day duration campaign pdays(-1 means client was not previously contacted) previous
    for i, col in enumerate(["job", "education", "contact", "poutcome"]):
        df.loc[df[col] == "unknown", col] = fill_value[i]
    return df

train_data = fill_unknown(train_data)
test_data = fill_unknown(test_data)
train_data.head() #viewing some row of the dataset

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,high,services,married,secondary,no,low,yes,no,cellular,low,5,low,low,high,low,failure,no
1,high,blue-collar,single,secondary,no,low,yes,yes,cellular,low,2,high,low,high,low,failure,no
2,high,technician,married,secondary,no,high,no,yes,cellular,high,8,high,low,high,high,success,yes
3,high,admin.,married,tertiary,no,low,yes,no,cellular,low,7,high,low,high,low,failure,no
4,low,management,single,tertiary,no,high,no,no,cellular,low,4,low,low,high,low,failure,yes


In [12]:
column = train_data.columns.to_numpy()[:-1]
x = train_data[column].to_numpy()
y = train_data.y.to_numpy()

In [13]:
#training_error = np.zeros(6,3)
#testing_error = np.zeros(6,3)
error_table = np.zeros((16,6))
for i, criterion in enumerate(["entropy", "gini", "me"]):
    for max_depth in range(1,17): #should be not larger than column.shape[0], DecisionTree will take minimum of {column.shape[0], max_depth}.
        mytree = DecisionTree(trainx = x, trainy = y, column = column, criterion = criterion, max_depth = max_depth, entropy_base = 6)
        mytree.fit()
        predy = mytree.predict(train_data.to_numpy())
        error_table[max_depth-1, 2*i] = error_rate(predy,train_data.y.to_numpy())
        #print("training error for max_depth = {} is {}".format(max_depth, error_table[max_depth-1, 2*i]))
        predy = mytree.predict(test_data.to_numpy())
        error_table[max_depth-1, 2*i+1] = error_rate(predy,test_data.y.to_numpy())
        #print("testing error for max_depth = {} is {}".format(max_depth, error_table[max_depth-1, 2*i+1]))

print(error_table)

[[0.1192 0.1248 0.1088 0.1166 0.1088 0.1166]
 [0.1192 0.1248 0.1052 0.1104 0.1052 0.1104]
 [0.1178 0.1262 0.103  0.1128 0.103  0.1128]
 [0.1144 0.1284 0.1    0.114  0.1004 0.1124]
 [0.1122 0.13   0.0998 0.1144 0.0998 0.1124]
 [0.112  0.13   0.0998 0.1144 0.0998 0.1124]
 [0.1118 0.13   0.0998 0.1144 0.0998 0.1124]
 [0.1116 0.1306 0.0998 0.1144 0.0998 0.1124]
 [0.1116 0.1306 0.0998 0.1144 0.0998 0.1124]
 [0.1116 0.1306 0.0998 0.1144 0.0998 0.1124]
 [0.1116 0.1306 0.0998 0.1144 0.0998 0.1124]
 [0.1116 0.1306 0.0998 0.1144 0.0998 0.1124]
 [0.1116 0.1306 0.0998 0.1144 0.0998 0.1124]
 [0.1116 0.1306 0.0998 0.1144 0.0998 0.1124]
 [0.1116 0.1306 0.0998 0.1144 0.0998 0.1124]
 [0.1116 0.1306 0.0998 0.1144 0.0998 0.1124]]
