In [29]:
import matplotlib.pyplot as plt
import numpy as np
import h5py
import pandas as pd
import math

In [30]:
features_cont = pd.read_pickle('dataRF_cont.pkl')
features_disc = pd.read_pickle('dataRF_disc.pkl')
features = pd.read_pickle('dataRF.pkl')
labels = np.load('labelsRF.npy', allow_pickle=True)
print(features.shape)
features.head()
print(labels)
np.random.seed()

(190440, 30)
[12 12 12 ...  9  9  9]


In the Random Forest Class, we will have the following parameters:

- x = Feature
- y = Labels
- n_tress = Number of uncorrelated trees
- n_features = Number of features to pass to the tree
- sample_size = Number of input samples to be randoly selected from the set
- depth = The depth of each tree
- min_leaf = Minimum number of nodes in a split

In [31]:
class Random_Forest():
    def __init__(self, x, y, n_trees, n_features, sample_size, max_depth=10, min_leaf=5):
        self.x = x
        self.y = y
        self.n_features = n_features
        self.sample_size = sample_size
        self.max_depth = max_depth 
        self.min_leaf = min_leaf
        self.trees = [self.plant_tree(i) for i in range(n_trees)]
    
    def plant_tree(self, tree_num):
        #print('Tree Number ' + str(tree_num + 1) + ' is being created.')
        indices = np.random.permutation(len(self.y))[:self.sample_size]
        f_indices = np.random.permutation(self.x.shape[1])[:self.n_features]
        return Decision_Tree(self.x.iloc[indices], self.y[indices], self.n_features, f_indices,
                            indices=np.array(range(self.sample_size)),
                            max_depth = self.max_depth, min_leaf=self.min_leaf)
    
    def predict(self, x):
        predictor = np.array([t.predict(x) for t in self.trees])
        return predictor

In [32]:
class Decision_Tree():
    def __init__(self, x, y, n_features, f_indices,indices, max_depth=10, min_leaf=5):
        self.x = x
        self.y = y
        self.n_features = n_features
        self.f_indices = f_indices
        self.indices = indices
        self.max_depth = max_depth
        self.min_leaf = min_leaf
        
        self.n_tot, self.f_tot = len(indices), x.shape[1]
        
        counts = np.bincount(y[indices])
        self.value = np.argmax(counts)
        self.split_index = None
        self.score = 0
        self.do_split()
        
    def do_split(self):
        for i in self.f_indices: 
            self.find_best_split(i)
        if self.is_leaf: 
            return
        x = self.split_point
        
        left = np.nonzero(x<=self.split)[0]
        right = np.nonzero(x>self.split)[0]
        lf_indices = np.random.permutation(self.x.shape[1])[:self.n_features]
        rf_indices = np.random.permutation(self.x.shape[1])[:self.n_features]
        self.lhs = Decision_Tree(self.x, self.y, self.n_features, lf_indices,
                                self.indices[left], max_depth=self.max_depth-1, min_leaf=self.min_leaf)
        self.rhs = Decision_Tree(self.x, self.y, self.n_features, rf_indices,
                                self.indices[right], max_depth=self.max_depth-1, min_leaf=self.min_leaf)
                
    def find_best_split(self, split_index):
        
        x = self.x.values[self.indices,split_index]
        y = self.y[self.indices]
        sort_index = np.argsort(x)
        sort_x = x[sort_index]
        sort_y = y[sort_index]
        
        right_node = y
        left_node = []
        left_node = np.asarray(left_node)
        
        for i in range(0,self.n_tot-self.min_leaf-1):
            xi = sort_x[i]
            yi = sort_y[i]
            #xi, yi = x[i], y[i]
            np.append(left_node, right_node[0:1])
            right_node = right_node[1:]
            if i < self.min_leaf or xi == sort_x[i+1]:
                continue
                
            IG = self.information_gain(y, right_node, left_node)
            if IG > self.score:
                self.split_index = split_index
                self.score = IG
                self.split = xi
    

    def information_gain(self, y, r_node, l_node):
        groups = [l_node, r_node]
        Number_all = len(y)
        IG = self.gini(y)
        for group in groups:
            IG -= self.gini(group)*len(group)/Number_all
        return IG

    def gini(self, y):
        class_ids = [i for i in range(23)]
        Number_group = len(y)
        if Number_group == 0:
            return 0
        
        sum_of_classes = 0.
        for class_id in class_ids:
            p = list(y).count(class_id)/Number_group
            sum_of_classes += p**2
        return 1. - sum_of_classes
    
    @property
    def split_point(self): 
        point = self.x.values[self.indices,self.split_index]
        return point

    @property
    def is_leaf(self): 
        if self.score == 0 or self.max_depth <= 0:
            return True
        else:
            return False
    
    def predict(self, x):
        predictor = np.array([self.predict_single(xi) for xi in x])
        return predictor

    def predict_single(self, xi):
        if self.is_leaf: 
            return self.value
        if xi[self.split_index] <= self.split:
            t = self.lhs
        else:
            t = self.rhs
        return t.predict_single(xi)

In [33]:
def accuracy(d, y):
    t = 0
    f = 0
    for i in range(len(y)):
        counts = np.bincount(d[:,i])
        if np.argmax(counts) == y[i]:
            t = t + 1
        else:
            f = f + 1
    return (t / (t + f)) * 100

In [34]:
def splitter(data,y,perctrain, percv, perctest):
    a = (len(data)*perctrain)/100
    data_train = data[:int(a)]
    label_train = y[:int(a)]
    
    b = (len(data)*percv)/100
    data_val = data[int(a):int(b)+int(a)]
    label_val = y[int(a):int(b)+int(a)]
    
    c = (len(data)*perctest)/100
    data_test = data[int(b)+int(a):int(c)+int(b)+int(a)]
    label_test = y[int(b)+int(a):int(c)+int(b)+int(a)]
    return data_train, label_train, data_val, label_val, data_test, label_test

In [35]:
data_np = np.asarray(features)
label_np = np.asarray(labels)
shuffle=np.random.permutation(data_np.shape[0])
data_np = data_np[shuffle]
label_np = label_np[shuffle]

train_features, train_labels, val_features, val_labels, test_features, test_labels= splitter(data_np, label_np, 70, 20, 10) 
print(train_features.shape)
print(train_labels.shape)
print(val_features.shape)
print(val_labels.shape)
print(test_features.shape)
print(test_labels.shape)

(133308, 30)
(133308,)
(38088, 30)
(38088,)
(19044, 30)
(19044,)


In [37]:
np.random.seed(1)
MyForest = Random_Forest(pd.DataFrame(train_features), train_labels, 40, 5, 80)
prediction_train = MyForest.predict(np.asarray(train_features))
print('Training Accuracy: ' + str(accuracy(prediction_train, train_labels)) + '%')
prediction_val = MyForest.predict(np.asarray(val_features))
print('Validation Accuracy: ' + str(accuracy(prediction_val, val_labels)) + '%')

Training Accuracy: 18.60128424400636%
Validation Accuracy: 18.65942028985507%


In [38]:
np.random.seed(1)
MyForest2 = Random_Forest(pd.DataFrame(train_features), train_labels, 80, 5, 80)
prediction_train = MyForest2.predict(np.asarray(train_features))
print('Training Accuracy: ' + str(accuracy(prediction_train, train_labels)) + '%')
prediction_val = MyForest2.predict(np.asarray(val_features))
print('Validation Accuracy: ' + str(accuracy(prediction_val, val_labels)) + '%')

Training Accuracy: 18.95010051909863%
Validation Accuracy: 19.050619617727367%


In [39]:
np.random.seed(1)
MyForest3 = Random_Forest(pd.DataFrame(train_features), train_labels, 40, 5, 120)
prediction_train = MyForest3.predict(np.asarray(train_features))
print('Training Accuracy: ' + str(accuracy(prediction_train, train_labels)) + '%')
prediction_val = MyForest3.predict(np.asarray(val_features))
print('Validation Accuracy: ' + str(accuracy(prediction_val, val_labels)) + '%')

Training Accuracy: 18.963603084586072%
Validation Accuracy: 19.163516068052928%


In [40]:
np.random.seed(1)
MyForest4 = Random_Forest(pd.DataFrame(train_features), train_labels, 80, 5, 120)
prediction_train = MyForest4.predict(np.asarray(train_features))
print('Training Accuracy: ' + str(accuracy(prediction_train, train_labels)) + '%')
prediction_val = MyForest4.predict(np.asarray(val_features))
print('Validation Accuracy: ' + str(accuracy(prediction_val, val_labels)) + '%')

Training Accuracy: 20.730188735859816%
Validation Accuracy: 20.964608275572356%


In [41]:
np.random.seed(1)
MyForest5 = Random_Forest(pd.DataFrame(train_features), train_labels, 40, 5, 40)
prediction_train = MyForest5.predict(np.asarray(train_features))
print('Training Accuracy: ' + str(accuracy(prediction_train, train_labels)) + '%')
prediction_val = MyForest5.predict(np.asarray(val_features))
print('Validation Accuracy: ' + str(accuracy(prediction_val, val_labels)) + '%')

Training Accuracy: 18.372490773246916%
Validation Accuracy: 18.630539802562488%


In [42]:
np.random.seed(1)
MyForest6 = Random_Forest(pd.DataFrame(train_features), train_labels, 80, 5, 40)
prediction_train = MyForest6.predict(np.asarray(train_features))
print('Training Accuracy: ' + str(accuracy(prediction_train, train_labels)) + '%')
prediction_val = MyForest6.predict(np.asarray(val_features))
print('Validation Accuracy: ' + str(accuracy(prediction_val, val_labels)) + '%')

Training Accuracy: 17.76562546883908%
Validation Accuracy: 18.268220961982777%


In [43]:
np.random.seed(1)
MyForest7 = Random_Forest(pd.DataFrame(train_features), train_labels, 80, 3, 120)
prediction_train = MyForest7.predict(np.asarray(train_features))
print('Training Accuracy: ' + str(accuracy(prediction_train, train_labels)) + '%')
prediction_val = MyForest7.predict(np.asarray(val_features))
print('Validation Accuracy: ' + str(accuracy(prediction_val, val_labels)) + '%')

Training Accuracy: 19.86902511477181%
Validation Accuracy: 20.271476580550303%


In [44]:
np.random.seed(1)
MyForest8 = Random_Forest(pd.DataFrame(train_features), train_labels, 80, 8, 120)
prediction_train = MyForest8.predict(np.asarray(train_features))
print('Training Accuracy: ' + str(accuracy(prediction_train, train_labels)) + '%')
prediction_val = MyForest8.predict(np.asarray(val_features))
print('Validation Accuracy: ' + str(accuracy(prediction_val, val_labels)) + '%')

Training Accuracy: 20.261349656434724%
Validation Accuracy: 20.318735559756355%


In [45]:
np.random.seed(1)
MyForest9 = Random_Forest(pd.DataFrame(train_features), train_labels, 80, 5, 120, max_depth=20, min_leaf=2)
prediction_train = MyForest9.predict(np.asarray(train_features))
print('Training Accuracy: ' + str(accuracy(prediction_train, train_labels)) + '%')
prediction_val = MyForest9.predict(np.asarray(val_features))
print('Validation Accuracy: ' + str(accuracy(prediction_val, val_labels)) + '%')

Training Accuracy: 19.358178053830226%
Validation Accuracy: 19.27378701953371%


In [46]:
np.random.seed(1)
MyForest10 = Random_Forest(pd.DataFrame(train_features), train_labels, 80, 5, 120, max_depth=5, min_leaf=2)
prediction_train = MyForest10.predict(np.asarray(train_features))
print('Training Accuracy: ' + str(accuracy(prediction_train, train_labels)) + '%')
prediction_val = MyForest10.predict(np.asarray(val_features))
print('Validation Accuracy: ' + str(accuracy(prediction_val, val_labels)) + '%')

Training Accuracy: 13.479311069103131%
Validation Accuracy: 13.58958202058391%


In [47]:
np.random.seed(1)
MyForest11 = Random_Forest(pd.DataFrame(train_features), train_labels, 80, 5, 120, max_depth=20, min_leaf=10)
prediction_train = MyForest11.predict(np.asarray(train_features))
print('Training Accuracy: ' + str(accuracy(prediction_train, train_labels)) + '%')
prediction_val = MyForest11.predict(np.asarray(val_features))
print('Validation Accuracy: ' + str(accuracy(prediction_val, val_labels)) + '%')

Training Accuracy: 20.478890989287965%
Validation Accuracy: 20.943604284814114%


In [48]:
np.random.seed(1)
MyForest12 = Random_Forest(pd.DataFrame(train_features), train_labels, 80, 5, 120, max_depth=5, min_leaf=10)
prediction_train = MyForest12.predict(np.asarray(train_features))
print('Training Accuracy: ' + str(accuracy(prediction_train, train_labels)) + '%')
prediction_val = MyForest12.predict(np.asarray(val_features))
print('Validation Accuracy: ' + str(accuracy(prediction_val, val_labels)) + '%')

Training Accuracy: 19.2043988357788%
Validation Accuracy: 19.457571938668348%


In [49]:
np.random.seed(1)
MyForest13 = Random_Forest(pd.DataFrame(train_features), train_labels, 80, 5, 120, max_depth=10, min_leaf=2)
prediction_train = MyForest13.predict(np.asarray(train_features))
print('Training Accuracy: ' + str(accuracy(prediction_train, train_labels)) + '%')
prediction_val = MyForest13.predict(np.asarray(val_features))
print('Validation Accuracy: ' + str(accuracy(prediction_val, val_labels)) + '%')

Training Accuracy: 15.323161400666127%
Validation Accuracy: 15.422180214240704%


In [50]:
np.random.seed(1)
MyForest14 = Random_Forest(pd.DataFrame(train_features), train_labels, 80, 5, 120, max_depth=10, min_leaf=10)
prediction_train = MyForest14.predict(np.asarray(train_features))
print('Training Accuracy: ' + str(accuracy(prediction_train, train_labels)) + '%')
prediction_val = MyForest14.predict(np.asarray(val_features))
print('Validation Accuracy: ' + str(accuracy(prediction_val, val_labels)) + '%')

Training Accuracy: 20.478890989287965%
Validation Accuracy: 20.943604284814114%


In [51]:
final_features = np.concatenate((train_features, val_features))
final_labels = np.concatenate((train_labels, val_labels))
print(final_features.shape)
print(final_labels.shape)

(171396, 30)
(171396,)


In [52]:
import time

In [55]:
start = time.time()
np.random.seed(1)
MyForest_last = Random_Forest(pd.DataFrame(final_features), final_labels, 80, 5, 120)
prediction_test = MyForest_last.predict(np.asarray(test_features))
print('Testing Accuracy: ' + str(accuracy(prediction_test, test_labels)) + '%')
elapsed_time = time.time() - start
print('Execution time: ' + str(elapsed_time))

Testing Accuracy: 19.197647553035075%
Execution time: 107.831716299057
