In [6]:
import pandas as pd
import numpy as np
import math
from sklearn.ensemble import RandomForestClassifier
import statistics
from statistics import mode
import sklearn
import sklearn.model_selection

from keras.models import Sequential
from keras.layers import Dense

def my_model(nodes, attr_count, loss):
    model = Sequential()
    model.add(Dense(12, input_dim=attr_count, activation='relu'))
    model.add(Dense(nodes, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # compile the keras model
    model.compile(loss=loss, optimizer='adam')
    # model.compile(loss='mean_absolute_error', optimizer='adam', metrics=[metrics.mean_absolute_error])
    return model

def perform(csv_name,seperator,predict_class,loss):
    data = pd.read_csv(csv_name, sep=seperator)

    predict = predict_class

    # Divide data into train and test splits
    X = np.array(data.drop([predict], 1))
    Y = np.array(data[predict])
    x_train,x_test,y_train,y_test = sklearn.model_selection.train_test_split( X,Y, test_size=0.1 )


    model = my_model(5,data.drop([predict], 1).columns.size,loss)

    model.fit(x_train, y_train, epochs=4, batch_size=10)

    t = "Accuracy is "
    if loss == 'mean_absolute_error':
        t = "Mean Absolute Error is "

    print(t, model.evaluate(x_test, y_test, batch_size=16))

    return model

# Split the data based on an attribute and an attribute value
def test_split(i, value, data):
    right = []
    left = []
    for row in data:
        if row[i] >= value:
            right.append(row)
        else:
            left.append(row)
    return left, right


# Calculate the Gini index for a split dataset
def gini_index(groups, categories):
    # calculate number of instances on 'this' node
    total_count = 0.0
    for sub in groups:
        total_count += len(sub)

    # Gini index (weighted)
    gini_score = 0.0
    for group in groups:
        size = float(len(group))
        if size != 0.0:
            score = 0.0
            for categ_val in categories:
                p = [row[-1] for row in group].count(categ_val) / size
                score += math.pow(p, 2)
            gini_score += (size / total_count) * (1.0 - score)

    return gini_score


# Select the best split point for a dataset
def splitter(data):
    categ_values = list(set(row[-1] for row in data))
    # Keep track of best gini score row
    best_index = -1
    best_value = -1
    best_score = 1
    left_right = None

    for index in range(len(data[0]) - 1):
        for row in data:
            categories = test_split(index, row[index], data)

            gini_score = gini_index(categories, categ_values)
            if best_score > gini_score:
                best_index, best_value, best_score, left_right = index, row[index], gini_score, categories

    # Return a dictionary consist of best gini score attribute value, left and right sets and index of that attribute
    return {'index': best_index, 'value': best_value, 'categories': left_right}


# Convert a node into a leaf. Normally a node is a dictionary but this function transforms a dict into a value.
# I later use this utility to differentiate a leaf node from others.
def to_terminal(group):
    target_values = [row[-1] for row in group]
    try:
        return mode(target_values)
    except statistics.StatisticsError:
        # A bad split happened unfortunately. In result of it, count of categories are equal! So I return the first one.
        return target_values[0]


# Create child splits for a node or make terminal
def birth(node, max_depth, min_size, current_depth):
    # Pull child nodes' info from parent.
    left, right = node['categories']
    del (node['categories'])


    # BASE CASES

    # If there is only 1 child
    if not left:
        node['left'] = to_terminal(right)
        node['right'] = to_terminal(right)
        return
    elif not right:
        node['left'] = to_terminal(left)
        node['right'] = to_terminal(left)
        return

    # Check if I hit max depth
    if current_depth >= max_depth:
        node['left'] = to_terminal(left)
        node['right'] = to_terminal(right)
        return


    # RECURSIVE CASES

    # process right child
    if len(right) > min_size:
        node['right'] = splitter(right)
        birth(node['right'], max_depth, min_size, current_depth + 1)
    else:
        node['right'] = to_terminal(right)

    # process left child
    if len(left) > min_size:
        node['left'] = splitter(left)
        birth(node['left'], max_depth, min_size, current_depth + 1)
    else:
        node['left'] = to_terminal(left)


def print_tree(node, attributes, depth=0):
    if isinstance(node, dict):
        print('%s[%s < %.3f]' % (depth * ' ', attributes[node['index']], node['value']))
        print_tree(node['left'], attributes,depth + 1)
        print_tree(node['right'], attributes,depth + 1)
    else:
        print('%s[%s]' % (depth * ' ', node))


# Tree builder
def build_tree(train, max_depth, min_size):
    root = splitter(train)
    birth(root, max_depth, min_size, current_depth=1)
    return root


# Predict 1 data
def predict_one(dt, X):
    if X[dt['index']] < dt['value']:
        # If not a leaf, i.e terminal node
        if isinstance(dt['left'], dict):
            return predict_one(dt['left'], X)
        else:
            return dt['left']
    else:
        # If not a leaf, i.e terminal node
        if isinstance(dt['right'], dict):
            return predict_one(dt['right'], X)
        else:
            return dt['right']


# Predict all
def predict_dt(dt, X, options):
    predictions = list()
    for row in X:
        prediction = predict_one(dt, row)
        predictions.append(prediction)
    return predictions


# Build tree with gini
def build_dt(X, y, options):
    # I used options as a dict to be able to fetch these parameters.
    max_depth = options['max_depth']
    min_size = options['min_size']

    # Concatenate train and target lists to fetch to function.
    temp = np.column_stack((X, y))

    return build_tree(temp, max_depth, min_size)



In [12]:
# Decision tree for divorce.csv
data = pd.read_csv("divorce.csv", sep=";")

predict = "Class"

# Divide data into train and test splits
X = np.array(data.drop([predict], 1))
Y = np.array(data[predict])
x_train,x_test,y_train,y_test = sklearn.model_selection.train_test_split( X,Y, test_size=0.1 )

options = {'max_depth': 10, 'min_size': 5, 'data':data}

my_tree = build_dt(x_train, y_train, options)
print_tree(my_tree,data.columns,0)


predicted = predict_dt(my_tree,x_test,None)

print()
print("My decision tree ")
print("Real values:")
print(y_test)
print("Predicted values:")
print(predicted)


rforest = RandomForestClassifier(n_estimators=100)
rforest = rforest.fit(x_train,y_train)

print()
acc = rforest.score(x_test,y_test)
print("Random Forest score ", acc)

[Atr18 < 2.000]
 [Atr26 < 2.000]
  [Atr40 < 3.000]
   [Atr1 < 0.000]
    [0]
    [0]
   [1]
  [1]
 [Atr1 < 2.000]
  [1]
  [Atr1 < 2.000]
   [1]
   [1]

My decision tree 
Real values:
[1 1 0 0 0 1 0 0 1 1 0 1 0 0 0 1 1]
Predicted values:
[1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1]

Random Forest score  1.0


In [11]:
# Decision tree for forecast.csv
data = pd.read_csv("forecast.csv", sep=";")

predict = "Target (Total orders)"

# Divide data into train and test splits
X = np.array(data.drop([predict], 1))
Y = np.array(data[predict])
x_train,x_test,y_train,y_test = sklearn.model_selection.train_test_split( X,Y, test_size=0.1 )

options = {'max_depth': 10, 'min_size': 5, 'data':data}

my_tree = build_dt(x_train, y_train, options)
print_tree(my_tree,data.columns,0)


predicted = predict_dt(my_tree,x_test,None)
print()
print("Real values:")
print(y_test)
print("Predicted values:")
print(predicted)
print()


[Non-urgent order < 218.856]
 [Non-urgent order < 170.566]
  [Non-urgent order < 118.552]
   [Non-urgent order < 89.526]
    [129.412]
    [Non-urgent order < 96.494]
     [202.02200000000002]
     [233.12599999999998]
   [Non-urgent order < 148.139]
    [Non-urgent order < 130.465]
     [Non-urgent order < 120.629]
      [235.59799999999998]
      [Non-urgent order < 123.143]
       [402.60699999999997]
       [Non-urgent order < 123.286]
        [231.035]
        [236.304]
     [Non-urgent order < 134.425]
      [255.06099999999998]
      [Non-urgent order < 144.124]
       [213.50900000000001]
       [263.043]
    [Non-urgent order < 150.257]
     [238.826]
     [268.64]
  [Non-urgent order < 172.783]
   [308.178]
   [Non-urgent order < 178.433]
    [253.847]
    [Non-urgent order < 206.206]
     [Non-urgent order < 193.768]
      [281.42]
      [336.87199999999996]
     [298.459]
 [Non-urgent order < 235.106]
  [363.402]
  [Non-urgent order < 275.076]
   [346.035]
   [416.83]

Real

In [4]:
# Keras Sequantial MLP
divorce_model = perform("divorce.csv", ";", "Class",'binary_crossentropy')

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Accuracy is  0.3280900716781616


In [5]:
# Keras Sequantial MLP
forecast_model = perform("forecast.csv", ";", "Target (Total orders)",'mean_absolute_error')

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Mean Absolute Error is  295.3548278808594


There are the implementation of decision tree by myself.
And I used Keras Sequential model for MLP.
