In [1]:
import pandas as pd
import numpy as np
from scipy.stats import mode
import math
import random

In [2]:
class TreeNode(object):
    def __init__(self, key, num):
        self.key = key
        self.leaf = False
        self.num = num
        self.result = -1
        self.children = []

In [3]:
class RF(object):
    def __init__(self, m, head, branch, goal, random_feature):
        self.m = m
        self.branch = branch
        self.train_data, self.test_data = self.generate_data(head, goal)
        head.remove(goal)
        self.head = head
        self.goal = goal
        self.d = int(random_feature * len(head))
        self.forest = []
        print('Initiated successfully!')

    def generate_data(self, head, goal):
        csv_data = pd.read_csv('ML3AllSites.csv', encoding='ansi')
        data = csv_data[head]
        data.dropna(axis=0, subset=[goal], inplace=True)

        data['mcfiller2'] = data['mcfiller2'] + 1
        data['mcdv1'] = data['mcdv1'] + 4
        data['mcdv2'] = data['mcdv2'] + 4

        data = data - 1
        # print(data.min())
        sp = int(0.8 * len(data))
        train_data, test_data = data[:sp], data[sp:]
        return train_data, test_data

    def entropy(self, p):
        if p <= 0 or p >= 1:
            return 0
        else:
            return -p * math.log2(p) - (1 - p) * math.log2(1 - p)

    def information_gain(self, data):
        m = len(data)
        ig = np.zeros(self.k)
        py = data[:, -1].sum() / m
        hy = self.entropy(py)

        py0 = 1 - py
        py1 = py
        for i in range(self.k):
            px1 = (data[:, i].sum()) / m
            px0 = 1 - px1
            if px1 == 1 or px0 == 1:
                ig[i] = 0
                continue
            px0y1 = 0  # joint
            px1y1 = 0
            for j in range(m):
                if data[j][i] == 0 and data[j][self.k] == 1:
                    px0y1 += 1
                if data[j][i] == 1 and data[j][self.k] == 1:
                    px1y1 += 1

            f01 = px0y1
            f11 = px1y1
            f00 = m - data[:, i].sum() - f01
            f10 = data[:, i].sum() - f11
            t = (px0 * py0 * m - f00) ** 2 / px0 * py0 * m + (px0 * py1 * m - f01) ** 2 / px0 * py1 * m \
                + (px1 * py0 * m - f10) ** 2 / px1 * py0 * m + (px1 * py1 * m - f11) ** 2 / px1 * py1 * m
            if t > self.t0:
                px0y1 = px0y1 / m / px0  # conditional
                px1y1 = px1y1 / m / px1
                ig[i] = hy - px0 * self.entropy(px0y1) - px1 * self.entropy(px1y1)
        return ig

    def build_decision_tree(self, data):
        # ig = self.information_gain(data)
        # split_point = ig.argmax()
        # root = TreeNode(split_point)

        head = list(data.columns[:-1])
        if not head:
            root = TreeNode('', 0)
            root.leaf = True
            if data[self.goal].mode().empty:
                root.result = -1
            else:
                root.result = int(data[self.goal].mode()[0])
            return root

        split_point = random.choice(head)
        root = TreeNode(split_point, self.branch[split_point])

        split = data[split_point].min() != data[split_point].max()
        if split:
            for i in range(root.num):
                new_data = data[data[split_point] == i]
                new_data.drop(split_point, axis=1, inplace=True)
                root.children.append(self.build_decision_tree(new_data))
        else:
            root.leaf = True
            root.result = int(data[self.goal].mode()[0])
        return root

    def build_forest(self):
        for i in range(self.m):
            data = pd.DataFrame()
            while data.empty:
                feature = random.sample(self.head, self.d)
                idx = feature + [self.goal]
                data = self.train_data[idx].dropna(subset=feature)
            root = self.build_decision_tree(data)
            self.forest.append(root)
            print('No.', i, 'Decision Tree built successfully!')
        print('Random Forest built successfully!')

    def predict(self, row):
        result = []
        for tree in self.forest:
            root = tree
            while not root.leaf and not pd.isnull(row[root.key]):
                root = root.children[int(row[root.key])]
            if root.result != -1:
                result.append(root.result)
        # print(result)
        prediction = mode(result)[0]
        # print('prediction', prediction)
        return prediction

    def error(self, data):
        error = 0
        for index, row in data.iterrows():
            prediction = self.predict(row)
            # print(row[self.goal], prediction)
            # print(row[self.goal] != prediction)
            # print(row, prediction)
            if row[self.goal] != prediction:
                error += 1
        error = error / len(data)
        return error

    def main(self):
        self.build_forest()
        train_error = self.error(self.train_data)
        print('Train error: ', train_error)
        test_error = self.error(self.test_data)
        print('Test error: ', test_error)
        return train_error, test_error

In [4]:
'''
head = ['big5_01', 'big5_02', 'big5_03', 'big5_04', 'big5_05',
        'big5_06', 'big5_07', 'big5_08', 'big5_09', 'big5_10']
branch = {'big5_01': 7, 'big5_02': 7, 'big5_03': 7, 'big5_04': 7, 'big5_05': 7,
          'big5_06': 7, 'big5_07': 7, 'big5_08': 7, 'big5_09': 7, 'big5_10': 7}
goal = 'big5_05'
'''
head = ['mcfiller1', 'mcfiller2', 'mcfiller3',
        'mcmost1', 'mcmost2', 'mcmost3', 'mcmost4', 'mcmost5',
        'mcsome1', 'mcsome2', 'mcsome3', 'mcsome4', 'mcsome5',
        'mcdv1', 'mcdv2']
branch = {'mcfiller1': 4, 'mcfiller2': 5, 'mcfiller3': 4,
          'mcmost1': 2, 'mcmost2': 2, 'mcmost3': 2, 'mcmost4': 2, 'mcmost5': 2,
          'mcsome1': 2, 'mcsome2': 2, 'mcsome3': 2, 'mcsome4': 2, 'mcsome5': 2,
          'mcdv1': 7, 'mcdv2': 7}
goal = 'mcdv1'

In [6]:
rf = RF(50, head, branch, goal, 0.5) # n: tree numbers; 
                                     # head: all features; 
                                     # branch: number of branches for each feature; 
                                     # goal: predict feature; 
                                     # p: percentage of how many features are selected within a decision tree
rf.main()

  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versu

Initiated successfully!
No. 0 Decision Tree built successfully!
No. 1 Decision Tree built successfully!
No. 2 Decision Tree built successfully!
No. 3 Decision Tree built successfully!
No. 4 Decision Tree built successfully!
No. 5 Decision Tree built successfully!
No. 6 Decision Tree built successfully!
No. 7 Decision Tree built successfully!
No. 8 Decision Tree built successfully!
No. 9 Decision Tree built successfully!
No. 10 Decision Tree built successfully!
No. 11 Decision Tree built successfully!
No. 12 Decision Tree built successfully!
No. 13 Decision Tree built successfully!
No. 14 Decision Tree built successfully!
No. 15 Decision Tree built successfully!
No. 16 Decision Tree built successfully!
No. 17 Decision Tree built successfully!
No. 18 Decision Tree built successfully!
No. 19 Decision Tree built successfully!
No. 20 Decision Tree built successfully!
No. 21 Decision Tree built successfully!
No. 22 Decision Tree built successfully!
No. 23 Decision Tree built successfully!
No



Train error:  0.204442200908632
Test error:  0.3467741935483871


(0.204442200908632, 0.3467741935483871)