In [1]:
import pandas as pd
import numpy as np
import random
import math

from collections import Counter

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
train_data = pd.read_csv('/content/drive/My Drive/LOCS/combined-swell-classification-eda-train-dataset.csv')
test_data = pd.read_csv('/content/drive/My Drive/LOCS/combined-swell-classification-eda-test-dataset.csv')
val_data = pd.read_csv('/content/drive/My Drive/LOCS/combined-swell-classification-eda-validation-dataset.csv')

train_X = train_data        # 78788 
test_X = test_data          # 9849     
val_X = val_data            # 9849 

# train_Y, test_Y, val_Y
train_Y = train_X["condition"]
test_Y = test_X["condition"]
val_Y = val_X["condition"]

In [4]:
train_X = train_X.drop(columns=['NasaTLX class', 'Condition Label', 'NasaTLX Label', 'condition', 'subject_id'])
test_X = test_X.drop(columns=['NasaTLX class', 'Condition Label', 'NasaTLX Label', 'condition', 'subject_id'])
val_X = val_X.drop(columns=['NasaTLX class', 'Condition Label', 'NasaTLX Label', 'condition', 'subject_id'])

In [5]:
class Node:
    def __init__(self):
        self.split_feature = None
        self.split_point = None
        self.result = None
        self.childs = None
        

In [6]:
class DecisionTree:
    
    def __init__(self):
        self.root = None
        
    def DT_print(self, cur_node = None, cnt = 0):
        if cnt == 0:
            cur_node = self.root
        print(' ' * cnt, "Level ", cnt," :: ", cur_node.split_feature, cur_node.split_point, cur_node.result)
        if cur_node.childs is None:
            return
        for child in cur_node.childs:
            self.DT_print(child, cnt + 1)
        
    def gini_impurity(self, data_Y):  # input label dataset of a group
        impurity = 1
        label_counts = Counter(data_Y)
        for label in label_counts:
            p_of_label = label_counts[label] / len(data_Y)
            impurity -= p_of_label ** 2
        return impurity
    
    def information_gain(self, unsplited_data_Y, splited_data_Y):
        gain = self.gini_impurity(unsplited_data_Y)
        # print(gain)
        # print(splited_data_Y)
        for subset in splited_data_Y:
            # print("-", gini_impurity(subset), " X ( ", len(subset), " / ", len(unsplited_data_Y), " )")
            gain -= self.gini_impurity(subset) * (len(subset)/ len(unsplited_data_Y))
        return gain
    
    def split(self, data_X, data_Y, column):
        data_X_subsets = []    # 분할 후의 data_X 그룹을 저장하는 배열
        data_Y_subsets = []    # 분할 후의 data_Y 그룹을 저장하는 배열
        split_point = 0.0      # 최적 분할 기준 값
        split_point_gain = 0.0  # 최적 분할 지점에서의 information_gain

        data_X = data_X.sort_values(by=column)
        data_Y = data_Y[data_X.index]
        data_X = data_X.reset_index(drop=True)
        data_Y = data_Y.reset_index(drop=True)

        for i in range(1, len(data_Y)):
            candidate_splited_data_X = []
            candidate_splited_data_Y = []
            if data_Y[i-1] != data_Y[i]:
                # print(i, data_Y[i-1], data_Y[i])
                candidate_point = (data_X[column].iloc[i-1] + data_X[column].iloc[i]) / 2
                candidate_splited_data_Y.append(data_Y[:i])
                candidate_splited_data_Y.append(data_Y[i:])
                gain = self.information_gain(data_Y, candidate_splited_data_Y)
                if gain > split_point_gain:
                    candidate_splited_data_X.append(data_X[:i])
                    candidate_splited_data_X.append(data_X[i:])
                    split_point = candidate_point
                    split_point_gain = gain
                    data_X_subsets = candidate_splited_data_X
                    data_Y_subsets = candidate_splited_data_Y
                    # print("== Updated :: ", split_point_gain, split_point)
        return split_point_gain, split_point, data_X_subsets, data_Y_subsets
    
    def find_best_split(self, data_X, data_Y, bagging=False):
        # print("=-=-New Group=-=-")
        best_feature = ''    # 데이터를 분할 할 feature
        best_gain = 0.0       # 데이터를 특정 feature로 분할했을 때 가장 높게 측정된 information_gain
        best_split_point = 0.0

        columns = data_X.columns
        if bagging:     # feature bagging
          num_feature = math.ceil(math.sqrt(len(data_X.columns)))
          col_indices = np.random.choice(len(data_X.columns), num_feature, replace=False)
          columns = [data_X.columns[col_idx] for col_idx in col_indices]

        for column in columns:
            # print("check column :: ", column)
            gain, split_point = self.split(data_X, data_Y, column)[0:2]
            if gain > best_gain:
                best_gain = gain
                best_feature = column
                best_split_point = split_point
        return best_feature, best_gain, best_split_point
    
    def fit(self, data_X, data_Y, cnt=0, bagging=False):
        root = Node()

        data_X = data_X.reset_index(drop=True)
        data_Y = data_Y.reset_index(drop=True)

        best_feature, best_gain, best_split_point = self.find_best_split(data_X, data_Y, bagging)
        if best_gain == 0:
            root.result = data_Y[0]
            # print(' ' * cnt, "== No Split ", cnt," :: ", root.result)
            return root
        data_X_subsets, data_Y_subsets = self.split(data_X, data_Y, best_feature)[2:]

        # print(' ' * cnt, "== split ", cnt," :: ", best_feature, best_gain)
        childs = []
        for i in range(len(data_X_subsets)):
            childs.append(self.fit(data_X_subsets[i], data_Y_subsets[i], cnt+1))

        root.split_feature = best_feature
        root.split_point = best_split_point
        root.childs = childs
        if cnt == 0:
            self.root = root
        return root
    
    def predict(self, dataset):
        result = []
        for i in dataset.index:
            cur_node = self.root
            while cur_node.result is None:
                value = dataset.loc[i, cur_node.split_feature]
                if value < cur_node.split_point:
                    cur_node = cur_node.childs[0]
                else :
                    cur_node = cur_node.childs[1]
            result.append(cur_node.result)
        return result
    
    def score(self, data_X, data_Y):
        predict = self.predict(data_X)
        cnt = 0
        for i in range(len(data_Y)):
            if predict[i] == data_Y[i]:
                cnt += 1
                
        return cnt / len(data_Y)

In [7]:
newDT = DecisionTree()
newDT.fit(train_X.head(100), train_Y.head(100))

<__main__.Node at 0x7f0213782e80>

In [8]:
newDT.score(val_X.head(100), val_Y.head(100))

0.5

In [9]:
class RandomForest:
    def __init__(self):
        self.trees = []
        self.result = None

    def bagging_data(self, data_X, data_Y):
        indices = [random.randint(0, len(data_X)-1) for x in range(len(data_X))]
        data_X_subset = data_X.iloc[indices]
        data_Y_subset = data_Y.iloc[indices]
        data_X_subset = data_X_subset.reset_index(drop=True)
        data_Y_subset = data_Y_subset.reset_index(drop=True)
        return data_X_subset, data_Y_subset

    def fit(self, data_X, data_Y, tree_num = 10):
        self.trees = []
        for i in range(tree_num):
          data_X_subset, data_Y_subset = self.bagging_data(data_X, data_Y)
          tree = DecisionTree()
          tree.fit(data_X_subset, data_Y_subset, bagging=True)
          self.trees.append(tree)

    def predict(self, dataset):
        results = []
        for tree in self.trees:
          result = tree.predict(dataset)
          results.append(result)
        results = np.transpose(results)
        results = [Counter(result).most_common()[0][0] for result in results] # max 값이 두개라면 어떡할지 고민
        return results

    def score(self, data_X, data_Y):
        predict = self.predict(data_X)
        cnt = 0
        for i in range(len(data_Y)):
            if predict[i] == data_Y[i]:
                cnt += 1
                
        return cnt / len(data_Y)

In [15]:
RF = RandomForest()

In [19]:
RF.fit(train_X.head(1000), train_Y.head(1000))

In [20]:
RF.score(test_X.head(1000), test_Y.head(1000))

0.828

In [21]:
RF2 = RandomForest()

In [22]:
RF2.fit(train_X.head(5000), train_Y.head(5000))

In [24]:
RF2.score(test_X.head(5000), test_Y.head(5000))

0.9756