In [None]:
# Decision Tree Classification Algorithm for BankNote case study

In [1]:
# Load csv file function
from csv import reader
def load_csv(filename):
    dataset=list()
    open_file=open(filename,'r')
    read_file=reader(open_file)
    for row in read_file:
        if not row:
            continue
        dataset.append(row)
    return dataset

In [2]:
# convert String column value to float value
def convert_str_to_float(dataset,column):
    for row in dataset:
        row[column]=float(row[column])

In [3]:
# Accuracy metrics for a model
def accuracy_metrics(actual,predicted):
    correct=0
    for i in range(len(actual)):
        if actual[i]==predicted[i]:
            correct+=1
    return correct/float(len(actual))*100.0

In [4]:
# Split a dataset into kfold
from random import seed
from random import randrange
def KFold(dataset,folds):
    fold_value=list()
    dataset_copy=list(dataset)
    fold_size=int(len(dataset)/folds)
    for _ in range(folds):
        fold=list()
        while len(fold)<fold_size:
            index=randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        fold_value.append(fold)
    return fold_value

In [5]:
# Evaluate an algorithm using KFold Cross validation split
def evaluate_model(dataset,algorithm,folds,*args):
    folds=KFold(dataset,folds)
    accuracy=list()
    for fold in folds:
        train_set=list(folds)
        train_set.remove(fold)
        train_set=sum(train_set, [])
        test_set=list()
        for row in fold:
            row_copy=list(row)
            test_set.append(row_copy)
            row_copy[-1]=None
        predicted=algorithm(train_set,test_set,*args)
        actual=[row[-1] for row in fold]
        accuracy_value=accuracy_metrics(actual,predicted)
        accuracy.append(accuracy_value)
    return accuracy

In [68]:
# Decision Tree Classification algorithm 
def decision_tree_classification(train,test,max_depth,min_size):
    predictions=list()
    # function
    tree=build_tree(train,max_depth,min_size)
    for row in test:
        predict=prediction(tree,row)
        predictions.append(predict)
    return predictions

In [69]:
# Build tree funtion 
def build_tree(dataset,max_depth,min_size):
    # function
    root=split_tree(dataset)
    split(root,max_depth,min_size,1)
    return root

In [70]:
# Split tree function
def split_tree(dataset):
    class_values=list(set([row[-1] for row in dataset]))
    b_index,b_value,b_score,b_groups=999,999,999,None
    for index in range(len(dataset[0])-1):
        for row in dataset:
            # group split function
            groups=split_data(index,row[index],dataset)
            # gini index
            gini=gini_index(groups,class_values)
            b_index,b_value,b_score,b_groups=index,row[index],gini,groups
    return {'index':b_index,'value':b_value,'groups':b_groups}

In [71]:
# Building split data function
def split_data(index,value,dataset):
    left,right=list(),list()
    for row in dataset:
        if row[index]<value:
            left.append(row)
        else:
            right.append(row)
    return left,right

In [72]:
# Building gini_index function
def gini_index(groups,classes):
    n_instance=float(sum([len(group) for group in groups]))
    gini=0.0
    for group in groups:
        size=float(len(group))
        if size==0:
            continue
        score=0.0
        for class_val in classes:
            p=[row[-1] for row in group].count(class_val)/size
            score=p*p
        gini+=(1.0-score)*(size/n_instance)
    return gini

In [73]:
# create child splits for a node
def split(node,max_depth,min_size,depth):
    left,right=node['groups']
    del(node['groups'])
    # Checking for no split
    if not left or not right:
        node['left']=node['right']=to_terminal(left+right)
        return
    # checking for max depth
    if depth>=max_depth:
        node['left'],node['right']=to_terminal(left),to_terminal(right)
        return
    # processing left child
    if len(left)<=min_size:
        node['left']=to_terminal(left)
    else:
        node['left']=split_tree(left)
        split(node['left'],max_depth,min_size,depth+1)
    # processing right child
    if len(right)<=min_size:
        node['right']=to_terminal(right)
    else:
        node['right']=split_tree(right)
        split(node['right'],max_depth,min_size,depth+1)      
    

In [74]:
def to_terminal(group):
    outcomes=[row[-1] for row in group]
    return max(set(outcomes),key=outcomes.count)

In [75]:
def prediction(node,row):
    if row[node['index']]<node['value']:
        if isinstance(node['left'],dict):
            return prediction(node['left'],row)
        else:
            return node['left']
    else:
        if isinstance(node['right'],dict):
            return prediction(node['right'],row)
        else:
            return node['right']

In [76]:
# Test Cart on Bank Note Dataset
seed(1)
filename='data_banknote_authentication.csv'
dataset=load_csv(filename)
for column in range(len(dataset[0])):
    convert_str_to_float(dataset,column)
folds=10
max_depth=5
min_size=10
accuracy=evaluate_model(dataset,decision_tree_classification,folds,max_depth,min_size)
print("accuracy",accuracy)

('accuracy', [58.3941605839416, 55.47445255474452, 59.12408759124088, 49.63503649635037, 51.82481751824818, 53.284671532846716, 56.20437956204379, 59.854014598540154, 54.01459854014598, 58.3941605839416])


In [77]:
print("------------Mean Accuracy-------------------")
print(sum(accuracy)/float(len(accuracy)))

------------Mean Accuracy-------------------
55.6204379562
