In [1]:
import pandas as pd 
import numpy as np 
#import os
from DecisionTree import DecisionTree
from pathlib import Path

def error_rate(predy,y):
    if np.reshape(predy,(-1)).shape != np.reshape(y,(-1)).shape:
        raise ValueError("The sample size are not equal.")
    return np.mean(np.reshape(predy,(-1,1))!=np.reshape(y,(-1,1)))


In [2]:
print(np.__version__)
print(pd.__version__)

1.21.5
1.4.3


Car dataset. Include only categorical columns.

In [3]:
data_path = Path('./data/car')
colnames = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'label']
training_path = data_path/'train.csv'
test_path = data_path/'test.csv'

train_data = pd.read_csv(training_path, header=None, names=colnames) 
test_data = pd.read_csv(test_path, header=None, names=colnames) 
train_data.head() #viewing some row of the dataset

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,label
0,low,vhigh,4,4,big,med,acc
1,low,high,5more,4,med,high,vgood
2,vhigh,med,2,2,big,high,unacc
3,high,high,2,2,small,high,unacc
4,vhigh,low,3,2,big,low,unacc


In [4]:
column = train_data.columns.to_numpy()[:-1]
x = train_data[column].to_numpy()
y = train_data.label.to_numpy()

In [5]:
#training_error = np.zeros(6,3)
#testing_error = np.zeros(6,3)
error_table = np.zeros((6,6))
for i, criterion in enumerate(["entropy", "gini", "me"]):
    for max_depth in range(1,7): #should be not larger than column.shape[0], DecisionTree will take minimum of {column.shape[0], max_depth}.
        mytree = DecisionTree(trainx = x, trainy = y, column = column, criterion = criterion, max_depth = max_depth, entropy_base = 6)
        mytree.fit()
        predy = mytree.predict(train_data.to_numpy())
        error_table[max_depth-1, 2*i] = error_rate(predy,train_data.label.to_numpy())
        #print("training error for max_depth = {} is {}".format(max_depth, error_table[max_depth-1, 2*i]))
        predy = mytree.predict(test_data.to_numpy())
        error_table[max_depth-1, 2*i+1] = error_rate(predy,test_data.label.to_numpy())
        #print("testing error for max_depth = {} is {}".format(max_depth, error_table[max_depth-1, 2*i+1]))

report = pd.DataFrame(error_table, columns = ["entropy_train", "entropy_test", "gini_train", "gini_test", "me_train", "me_test"])
report.insert(loc=0, column="depth", value=np.arange(1,7))
print(report.to_string(index=False))

print("averages  : {}".format(error_table.mean(axis = 0)))

 depth  entropy_train  entropy_test  gini_train  gini_test  me_train  me_test
     1          0.302      0.296703       0.302   0.296703     0.302 0.296703
     2          0.286      0.298077       0.286   0.298077     0.292 0.313187
     3          0.250      0.270604       0.245   0.258242     0.247 0.270604
     4          0.233      0.292582       0.236   0.262363     0.228 0.274725
     5          0.210      0.260989       0.212   0.244505     0.202 0.254121
     6          0.200      0.260989       0.202   0.244505     0.190 0.254121
averages  : [0.24683333 0.27999084 0.24716667 0.26739927 0.2435     0.27724359]


Bank dataset. Include both categorical and numerical columns.

In [6]:
data_path = Path('./data/bank')
colnames = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 
          'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']
training_path = data_path/'train.csv'
test_path = data_path/'test.csv'

train_data = pd.read_csv(training_path, header=None, names=colnames) 
test_data = pd.read_csv(test_path, header=None, names=colnames) 

train_data.head() #viewing some row of the dataset

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,41,services,married,secondary,no,0,yes,no,unknown,5,may,114,2,-1,0,unknown,no
1,48,blue-collar,single,secondary,no,312,yes,yes,cellular,3,feb,369,2,-1,0,unknown,no
2,55,technician,married,secondary,no,1938,no,yes,cellular,18,aug,193,1,386,3,success,yes
3,54,admin.,married,tertiary,no,59,yes,no,cellular,10,jul,268,1,-1,0,unknown,no
4,34,management,single,tertiary,no,2646,no,no,cellular,14,apr,142,1,-1,0,unknown,yes


In [7]:
thresholds = train_data[["age", "balance", "day", "duration", "campaign", "pdays", "previous"]].median()
#thain and test should use the same thresholds.
#consider unknown as a catagory
def bank_preprocessing(df):
    #for col in ["default", "housing", "loan", "y"]:
        #df.loc[df[col] == "yes", col] = 1
        #df.loc[df[col] == "no", col] = 0

    month_map = {"jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6, "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12}
    df.month = df.month.map(month_map)
    #numeric: age balance day duration campaign pdays(-1 means client was not previously contacted) previous
    for col in ["age", "balance", "day", "duration", "campaign", "pdays", "previous"]:
        df.loc[df[col] <= thresholds[col], col] = 0
        df.loc[df[col] > thresholds[col], col] = 1
        df[col] = df[col].map({0: "low", 1: "high"})

    return df

train_data = bank_preprocessing(train_data)
test_data = bank_preprocessing(test_data)
train_data.head() #viewing some row of the dataset

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,high,services,married,secondary,no,low,yes,no,unknown,low,5,low,low,high,low,unknown,no
1,high,blue-collar,single,secondary,no,low,yes,yes,cellular,low,2,high,low,high,low,unknown,no
2,high,technician,married,secondary,no,high,no,yes,cellular,high,8,high,low,high,high,success,yes
3,high,admin.,married,tertiary,no,low,yes,no,cellular,low,7,high,low,high,low,unknown,no
4,low,management,single,tertiary,no,high,no,no,cellular,low,4,low,low,high,low,unknown,yes


In [8]:
column = train_data.columns.to_numpy()[:-1]
x = train_data[column].to_numpy()
y = train_data.y.to_numpy()

In [9]:
#training_error = np.zeros(6,3)
#testing_error = np.zeros(6,3)
error_table = np.zeros((16,6))
for i, criterion in enumerate(["entropy", "gini", "me"]):
    for max_depth in range(1,17): #should be not larger than column.shape[0], DecisionTree will take minimum of {column.shape[0], max_depth}.
        mytree = DecisionTree(trainx = x, trainy = y, column = column, criterion = criterion, max_depth = max_depth, entropy_base = 16)
        mytree.fit()
        predy = mytree.predict(train_data.to_numpy())
        error_table[max_depth-1, 2*i] = error_rate(predy,train_data.y.to_numpy())
        #print("training error for max_depth = {} is {}".format(max_depth, error_table[max_depth-1, 2*i]))
        predy = mytree.predict(test_data.to_numpy())
        error_table[max_depth-1, 2*i+1] = error_rate(predy,test_data.y.to_numpy())
        #print("testing error for max_depth = {} is {}".format(max_depth, error_table[max_depth-1, 2*i+1]))

report = pd.DataFrame(error_table, columns = ["entropy_train", "entropy_test", "gini_train", "gini_test", "me_train", "me_test"])
report.insert(loc=0, column="depth", value=np.arange(1,17))
print(report.to_string(index=False))

print("averages  : {}".format(error_table.mean(axis = 0)))

 depth  entropy_train  entropy_test  gini_train  gini_test  me_train  me_test
     1         0.1192        0.1248      0.1088     0.1166    0.1088   0.1166
     2         0.1192        0.1248      0.1042     0.1088    0.1042   0.1088
     3         0.1176        0.1262      0.0980     0.1112    0.0978   0.1112
     4         0.1134        0.1294      0.0916     0.1130    0.0918   0.1118
     5         0.1114        0.1302      0.0892     0.1142    0.0882   0.1130
     6         0.1110        0.1302      0.0886     0.1146    0.0878   0.1136
     7         0.1110        0.1302      0.0886     0.1146    0.0878   0.1136
     8         0.1110        0.1302      0.0886     0.1146    0.0878   0.1136
     9         0.1108        0.1304      0.0886     0.1146    0.0878   0.1136
    10         0.1108        0.1304      0.0884     0.1150    0.0878   0.1136
    11         0.1108        0.1304      0.0884     0.1150    0.0878   0.1136
    12         0.1108        0.1304      0.0884     0.1150    0.

In [10]:
train_data[["job", "education", "contact", "poutcome"]].mode()

Unnamed: 0,job,education,contact,poutcome
0,blue-collar,secondary,cellular,unknown


In [11]:
value, counts = np.unique(train_data.poutcome.to_numpy(), return_counts = True)
print(value, counts)

['failure' 'other' 'success' 'unknown'] [ 508  205  152 4135]


In [12]:
#replace unknown by most frequent value (mode)

def fill_unknown(df):
    fill_value = ["blue-collar", "secondary", "cellular", "failure"]
    for i, col in enumerate(["job", "education", "contact", "poutcome"]):
        df.loc[df[col] == "unknown", col] = fill_value[i]
    return df

train_data = fill_unknown(train_data)
test_data = fill_unknown(test_data)
train_data.head() #viewing some row of the dataset

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,high,services,married,secondary,no,low,yes,no,cellular,low,5,low,low,high,low,failure,no
1,high,blue-collar,single,secondary,no,low,yes,yes,cellular,low,2,high,low,high,low,failure,no
2,high,technician,married,secondary,no,high,no,yes,cellular,high,8,high,low,high,high,success,yes
3,high,admin.,married,tertiary,no,low,yes,no,cellular,low,7,high,low,high,low,failure,no
4,low,management,single,tertiary,no,high,no,no,cellular,low,4,low,low,high,low,failure,yes


In [13]:
column = train_data.columns.to_numpy()[:-1]
x = train_data[column].to_numpy()
y = train_data.y.to_numpy()

In [14]:
#training_error = np.zeros(6,3)
#testing_error = np.zeros(6,3)
error_table = np.zeros((16,6))
for i, criterion in enumerate(["entropy", "gini", "me"]):
    for max_depth in range(1,17): #should be not larger than column.shape[0], DecisionTree will take minimum of {column.shape[0], max_depth}.
        mytree = DecisionTree(trainx = x, trainy = y, column = column, criterion = criterion, max_depth = max_depth, entropy_base = 16)
        mytree.fit()
        predy = mytree.predict(train_data.to_numpy())
        error_table[max_depth-1, 2*i] = error_rate(predy,train_data.y.to_numpy())
        #print("training error for max_depth = {} is {}".format(max_depth, error_table[max_depth-1, 2*i]))
        predy = mytree.predict(test_data.to_numpy())
        error_table[max_depth-1, 2*i+1] = error_rate(predy,test_data.y.to_numpy())
        #print("testing error for max_depth = {} is {}".format(max_depth, error_table[max_depth-1, 2*i+1]))
        
report = pd.DataFrame(error_table, columns = ["entropy_train", "entropy_test", "gini_train", "gini_test", "me_train", "me_test"])
report.insert(loc=0, column="depth", value=np.arange(1,17))
print(report.to_string(index=False))

print("averages  : {}".format(error_table.mean(axis = 0)))

 depth  entropy_train  entropy_test  gini_train  gini_test  me_train  me_test
     1         0.1192        0.1248      0.1088     0.1166    0.1088   0.1166
     2         0.1192        0.1248      0.1052     0.1104    0.1052   0.1104
     3         0.1178        0.1262      0.1030     0.1128    0.1030   0.1128
     4         0.1144        0.1284      0.1000     0.1140    0.1004   0.1124
     5         0.1122        0.1300      0.0998     0.1144    0.0998   0.1124
     6         0.1120        0.1300      0.0998     0.1144    0.0998   0.1124
     7         0.1118        0.1300      0.0998     0.1144    0.0998   0.1124
     8         0.1116        0.1306      0.0998     0.1144    0.0998   0.1124
     9         0.1116        0.1306      0.0998     0.1144    0.0998   0.1124
    10         0.1116        0.1306      0.0998     0.1144    0.0998   0.1124
    11         0.1116        0.1306      0.0998     0.1144    0.0998   0.1124
    12         0.1116        0.1306      0.0998     0.1144    0.

In [15]:
mytree.tree.split_name

'poutcome'

In [16]:
mytree = DecisionTree(trainx = x, trainy = y, column = column, criterion = 'entropy', max_depth = 16, entropy_base = 16)
for col in column:
    print("information gain from using {} as root is {}".format(col, mytree._IG(train_data[col].to_numpy(), train_data.y.to_numpy())))

information gain from using age as root is 0.0001901141874276896
information gain from using job as root is 0.0032942337434569546
information gain from using marital as root is 0.0011303557373564047
information gain from using education as root is 0.0015585380939867333
information gain from using default as root is 0.0001809463675113668
information gain from using balance as root is 0.0009297334661190274
information gain from using housing as root is 0.0034883264328749025
information gain from using loan as root is 0.0012130987025118691
information gain from using contact as root is 4.884919638579836e-05
information gain from using day as root is 0.00020175186527073008
information gain from using month as root is 0.00911739426050565
information gain from using duration as root is 0.014900575346940201
information gain from using campaign as root is 0.0010229265009810834
information gain from using pdays as root is 0.0
information gain from using previous as root is 0.00342328174937169
i