In [52]:
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [53]:
def entropy(classes):
    unique_classes = np.unique(classes)
    probabilities = np.array([0.1] * len(unique_classes))
    i = 0
    for cl in unique_classes:
        n = len(classes[classes == cl])
        probabilities[i] = (n + 0.1 - 0.1) / len(classes)
        i += 1
    return -np.sum(probabilities * np.log2(probabilities))


def gini(classes):
    unique_classes = np.unique(classes)
    probabilities = np.array([0.1] * len(unique_classes))
    i = 0
    for cl in unique_classes:
        n = len(classes[classes == cl])
        probabilities[i] = (n + 0.1 - 0.1) / len(classes)
        i += 1
    return np.sum(probabilities * (1 - probabilities))


def miss_error(classes):
    max = Counter(y).most_common(1)[0][1] / len(classes)
    return 1 - max


impurities = {"entropy": entropy, "gini": gini, "miss_error": miss_error}

features = []

class Tree:
    children = []
    isLeaf = True

    def __init__(self, data, impurity, max_depth, depth = 0):
        features = []
        y = data[:, 0]
        x = data[:, 1:]
        self.pr = Counter(y).most_common(1)[0][1] / len(y)
        if len(np.unique(y)) == 1:
            self.isLeaf = True
            self.cl = np.unique(y)[0]
        elif depth == max_depth:
            self.isLeaf = True
            cl = Counter(y).most_common(1)[0][0]
            self.cl = cl
        else:
            maxIG = 0
            I_node = impurity(y)
            for i in range(len(x[0])):
                if i not in features:
                    feature_data = x[:, i]
                    for j in range(len(feature_data)):
                        left = y[feature_data <= feature_data[j]]
                        right = y[feature_data > feature_data[j]]
                        if len(left) == 0 or len(right) == 0:
                            continue
                        I_left = impurity(left) * len(left) / len(y)
                        I_right = impurity(right) * len(right) / len(y)
                        IG = I_node - I_left - I_right
                        if maxIG < IG:
                            maxIG = IG
                            self.sep_feature = i
                            self.separator = feature_data[j]
            if maxIG > 0:
                features.append(self.sep_feature)
                self.isLeaf = False
                left = np.array([False] * len(data))
                right = np.array([False] * len(data))
                for i in range(len(y)):
                    if data[i][self.sep_feature + 1] <= self.separator:
                        left[i] = True
                    else:
                        right[i] = True
                self.children.append(Tree(data[left], impurity, max_depth, depth + 1))
                self.children.append(Tree(data[right], impurity, max_depth, depth + 1))
            else:
                self.isLeaf = True
                cl = Counter(y).most_common(1)[0][0]
                self.cl = cl

    def classify(self, x):
        if self.isLeaf:
            return self.cl, self.pr
        else:
            if x[self.sep_feature] <= self.separator:
                return self.children[0].classify(x)
            else:
                return self.children[1].classify(x)

In [54]:
def getRandomSampleFromData(data):
    N = len(data)
    arr = np.array([i for i in range(N)])
    sample = np.random.choice(arr, size=N, replace=True)
    return data[sample]

In [78]:
class RandomForest:
    def __init__(self, data, n, maxdepth):
        self.feats = []
        N = len(data)
        forest = []
        for i in range(n):
            sample = getRandomSampleFromData(data)
            x = data[:, 1:]
            y = data[:, 0]
            features = np.random.permutation(len(x[0]))[:int(len(x[0]) * 0.7)]
            self.feats.append(features)
            forest.append(Tree(np.column_stack((y, x[:, features])), gini, maxdepth))
            print ("Tree number ", i)
        self.forest = np.array(forest)
        

def classifyy(forest, X, features):
    classes = {}
    i = 0
    for tree in forest:
        cl, pr = tree.classify(X[features[i]])
        print (cl, pr)
        if cl in classes:
            classes[cl] += pr
            classes[cl] /= 2
        else:
            classes[cl] = pr
    max = -1
    cl = None
    for clas in classes:
        if classes[clas] > max:
            max = classes[clas]
            cl = clas
    prob = classes[cl]
    return cl, prob



In [65]:
data = np.array(pd.read_csv('/Users/user/PycharmProjects/MLHW1/cancer.csv'))
data_tr = data[10:543]
data_eval_m = data[0:10]
data_eval_b = data[543:553]
forest = RandomForest(data_tr, 20, 3)

Tree number  0


Tree number  1


Tree number  2


Tree number  3


Tree number  4


Tree number  5


Tree number  6


Tree number  7


Tree number  8


Tree number  9


Tree number  10


Tree number  11


Tree number  12


Tree number  13


Tree number  14


Tree number  15


Tree number  16


Tree number  17


Tree number  18


Tree number  19


In [80]:
fig = plt.figure()
for elem in data_eval_b:
    print (classifyy(forest.forest, elem[1:], forest.feats))

B 0.9696048632218845
B 0.9696048632218845
M 0.8333333333333334
M 0.8333333333333334
B 0.9696048632218845
B 0.9696048632218845
M 0.8333333333333334
B 0.9696048632218845
B 0.9696048632218845
B 0.9696048632218845
B 0.9696048632218845
B 0.9696048632218845
M 0.8333333333333334
B 0.9696048632218845
B 0.9696048632218845
B 0.9696048632218845
M 0.8333333333333334
B 0.9696048632218845
B 0.9696048632218845
B 0.9696048632218845
('B', 0.9696048632218845)
B 0.9696048632218845
B 0.9696048632218845
M 0.8333333333333334
M 0.8333333333333334
B 0.9696048632218845
B 0.9696048632218845
M 0.8333333333333334
B 0.9696048632218845
B 0.9696048632218845
B 0.9696048632218845
B 0.9696048632218845
B 0.9696048632218845
M 0.8333333333333334
B 0.9696048632218845
B 0.9696048632218845
B 0.9696048632218845
M 0.8333333333333334
B 0.9696048632218845
B 0.9696048632218845
B 0.9696048632218845
('B', 0.9696048632218845)
B 0.9696048632218845
B 0.9696048632218845
M 0.8333333333333334
M 0.8333333333333334
B 0.9696048632218845
B 0

<Figure size 432x288 with 0 Axes>