In [1]:
# Column names for the positions of a data row; Question.__repr__ looks them up by index.
header = ["humidity", "temperature", "room"]

In [2]:
def unique_vals(rows, col):
    """Return the set of distinct values found at index `col` across `rows`."""
    return {record[col] for record in rows}

In [3]:
def class_counts(rows):
    """Count how many rows carry each label.

    The label is taken from the last element of every row. The returned
    dict maps label -> number of occurrences, with keys in order of first
    appearance.
    """
    tally = {}
    for record in rows:
        tally[record[-1]] = tally.get(record[-1], 0) + 1
    return tally

In [4]:
def is_numeric(value):
    """Return True when `value` is an int or a float (bool counts, being an int subclass)."""
    return isinstance(value, (int, float))

In [5]:
class Question:
    """A test that compares one column of a row against a stored value.

    `column` is the index into a row and `value` is the threshold (for
    numeric data) or the reference value (for anything else).
    """

    def __init__(self, column, value):
        self.column, self.value = column, value

    def match(self, example):
        """Return whether `example` satisfies the question.

        Numeric cell values are tested with `>=`; all other values are
        tested for equality with the stored value.
        """
        candidate = example[self.column]
        return candidate >= self.value if is_numeric(candidate) else candidate == self.value

    def moreless(self, example):
        """Ordering-based variant of `match`.

        Numeric cell values are tested with `>=`; all other values are
        tested with `<=` against the stored value.
        """
        candidate = example[self.column]
        if not is_numeric(candidate):
            return candidate <= self.value
        return candidate >= self.value

    def __repr__(self):
        # Human-readable form; the operator shown depends on the type of the
        # stored value, and the column name comes from the module-level header.
        operator_text = ">=" if is_numeric(self.value) else "=="
        return "Is %s %s %s?" % (
            header[self.column], operator_text, str(self.value))

In [6]:
def partition(rows, question):
    """Split `rows` into (rows where question.match is truthy, the remaining rows).

    The relative order of rows is preserved inside each list.
    """
    matched, unmatched = [], []
    for record in rows:
        (matched if question.match(record) else unmatched).append(record)
    return matched, unmatched

def partitionMoreLess(rows, question):
    """Split `rows` into (rows where question.moreless is truthy, the remaining rows).

    The relative order of rows is preserved inside each list.
    """
    buckets = {True: [], False: []}
    for record in rows:
        buckets[bool(question.moreless(record))].append(record)
    return buckets[True], buckets[False]

In [7]:
def gini(rows):
    """Return the Gini impurity of `rows`: 1 minus the sum of squared label fractions.

    Labels are counted with class_counts (last element of each row).
    For an empty `rows` no label is seen and the result is 1.
    """
    label_counts = class_counts(rows)
    total = float(len(rows))
    impurity = 1
    # Subtract one squared fraction at a time, in label order, so the
    # floating-point result is accumulated the same way on every run.
    for occurrences in label_counts.values():
        fraction = occurrences / total
        impurity -= fraction ** 2
    return impurity

In [8]:
def info_gain(left, right, current_uncertainty):
    """Return how much a split reduces impurity.

    The result is `current_uncertainty` minus the Gini impurity of the two
    child sets, each weighted by its share of the rows.
    """
    n_left, n_right = len(left), len(right)
    weight_left = float(n_left) / (n_left + n_right)
    weighted_left = weight_left * gini(left)
    weighted_right = (1 - weight_left) * gini(right)
    return current_uncertainty - weighted_left - weighted_right

In [9]:
def find_best_split(rows):
    """Find the question that yields the largest information gain on `rows`.

    Every distinct value of every feature column (all columns except the
    last, which holds the label) is tried as a candidate Question.

    Returns:
        (best_gain, best_question). When no candidate improves on the
        current impurity, or `rows` is empty, this is (0, None).
    """
    best_gain = 0
    best_question = None

    # Nothing to split; also avoids indexing rows[0] on an empty list.
    if len(rows) == 0:
        return best_gain, best_question

    current_uncertainty = gini(rows)
    n_features = len(rows[0]) - 1  # last column is the label

    for col in range(n_features):
        for val in unique_vals(rows, col):
            question = Question(col, val)

            # Score the candidate with Question.match, the same test that
            # build_tree uses to split and classify uses to route a row, so
            # the measured gain belongs to the split actually applied.
            true_rows, false_rows = partition(rows, question)

            # A question that does not divide the rows carries no information.
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue

            gain = info_gain(true_rows, false_rows, current_uncertainty)

            # Strict '>' keeps the first of several equally good questions.
            if gain > best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question

In [10]:
class Leaf:
    """Terminal tree node holding the label counts of the rows that reached it."""

    def __init__(self, rows):
        label_tally = class_counts(rows)
        # Mapping label -> number of training rows with that label.
        self.predictions = label_tally

In [11]:
class Decision_Node:
    """Internal tree node: a question plus the subtrees for its two outcomes."""

    def __init__(self, question, true_branch, false_branch):
        self.question, self.true_branch, self.false_branch = (
            question, true_branch, false_branch)

In [12]:
def build_tree(rows):
    """Recursively grow a decision tree from `rows`.

    When find_best_split reports a gain of 0 the rows become a Leaf.
    Otherwise the rows are split with the best question and a
    Decision_Node is built from the two recursively grown subtrees
    (true branch first, then false branch).
    """
    best_gain, best_question = find_best_split(rows)
    if best_gain != 0:
        matched, unmatched = partition(rows, best_question)
        return Decision_Node(
            best_question, build_tree(matched), build_tree(unmatched))
    return Leaf(rows)

In [13]:
def print_tree(node, spacing=""):
    """Print the tree rooted at `node`, indenting each level by two spaces."""
    if not isinstance(node, Leaf):
        print(spacing + str(node.question))
        deeper = spacing + "  "
        branches = (('--> True:', node.true_branch),
                    ('--> False:', node.false_branch))
        for caption, child in branches:
            print(spacing + caption)
            print_tree(child, deeper)
        return

    # Leaf: show the label counts stored at this node.
    print(spacing + "Predict", node.predictions)

In [14]:
def classify(row, node):
    """Route `row` down the tree rooted at `node` and return the leaf's predictions.

    At every Decision_Node the stored question is applied to the row with
    Question.match; a truthy answer follows the true branch, otherwise the
    false branch. The walk stops at the first Leaf.
    """
    current = node
    while not isinstance(current, Leaf):
        if current.question.match(row):
            current = current.true_branch
        else:
            current = current.false_branch
    return current.predictions

In [15]:
def print_leaf(counts):
    """Convert leaf label counts into percentage strings.

    Each count is divided by the sum of all counts, scaled to 100 and
    truncated to an integer, e.g. {"a": 1, "b": 3} -> {"a": "25%", "b": "75%"}.
    """
    total = sum(counts.values()) * 1.0
    return {
        label: str(int(occurrences / total * 100)) + "%"
        for label, occurrences in counts.items()
    }

In [16]:
# Sampling ranges per room: GenerateNewData draws the temperature from
# [minheat, maxheat] and the humidity from [minhum, maxhum].
rooms = [
    {
        "label": "Freezer",
        "minheat": -10.0, "maxheat": 10.0,
        "minhum": 5.7, "maxhum": 40.9,
    },
    {
        "label": "Kitchen",
        "minheat": 15.65, "maxheat": 32.10,
        "minhum": 25, "maxhum": 70.2,
    },
    {
        "label": "Livingroom",
        "minheat": 18.6, "maxheat": 28.0,
        "minhum": 30.65, "maxhum": 65,
    },
    {
        "label": "Greenhouse",
        "minheat": 30.12, "maxheat": 55.6,
        "minhum": 38.7, "maxhum": 80.98,
    },
]

In [26]:
import random
import pandas as pd
def GenerateNewData(rooms, nPoints, rng=None):
    """Generate random [humidity, temperature, label] rows for every room.

    Args:
        rooms: iterable of dicts with the keys "label", "minheat",
            "maxheat", "minhum" and "maxhum".
        nPoints: upper bound on the number of rows per room. When it is
            greater than 100 each room gets between nPoints - nPoints // 2
            and nPoints rows, otherwise between 1 and nPoints rows.
        rng: optional object providing `randint` and `uniform` (for example
            a seeded `random.Random`) for reproducible output. Defaults to
            the module-level `random` functions.

    Returns:
        A list of [humidity, temperature, label] lists, grouped by room in
        the order of `rooms`. The total row count is also printed.
    """
    source = random if rng is None else rng

    # Floor division keeps the bound an int: randint rejects non-integral
    # floats, and newer Python versions reject float arguments entirely.
    lower = nPoints - nPoints // 2 if nPoints > 100 else 1

    data = []
    for room in rooms:
        n = source.randint(lower, nPoints)
        for _ in range(n):
            t = source.uniform(room["minheat"], room["maxheat"])
            h = source.uniform(room["minhum"], room["maxhum"])
            data.append([h, t, room["label"]])
    print(len(data))
    return data

# Generate the synthetic data set (at most 1000 rows per room), wrap it in a
# DataFrame and show how many rows each label received (column 2 is the label).
newdata = GenerateNewData(rooms,1000)
newvalues = pd.DataFrame(newdata)
newvalues[2].value_counts()

2731


Livingroom    942
Kitchen       635
Greenhouse    624
Freezer       530
Name: 2, dtype: int64

In [27]:
# Shuffle all rows (frac=1 samples every row without replacement) and renumber the index from 0.
newvalues = newvalues.sample(frac=1).reset_index(drop=True) # Randomizes the row order

In [28]:
def splitData(data, ntrees, ownvalue):
    """Recursively halve `data` into 2**ntrees disjoint subsets.

    Args:
        data: a pandas DataFrame. Rows are removed from the second half by
            index label, so the split assumes unique index labels.
        ntrees: number of halving levels (>= 1). One level yields two
            subsets, each extra level doubles the count.
        ownvalue: string tag extended with "a"/"b" on each recursive call;
            it is passed along but never influences the result.

    Returns:
        A list of DataFrames that together contain every row of `data`.

    Raises:
        ValueError: if `ntrees` is smaller than 1 (previously this recursed
            until the interpreter's recursion limit was hit).
    """
    if ntrees < 1:
        raise ValueError("ntrees must be >= 1, got %r" % (ntrees,))

    # Fixed random_state: the same frame is always split the same way.
    splitOne = data.sample(frac=0.5, random_state=200)
    splitTwo = data.drop(splitOne.index)

    if ntrees == 1:
        return [splitOne, splitTwo]

    fb = splitData(splitOne, ntrees - 1, ownvalue + "a")
    fb2 = splitData(splitTwo, ntrees - 1, ownvalue + "b")

    # Interleave the subsets coming from the two halves.
    subsets = []
    for from_first, from_second in zip(fb, fb2):
        subsets.append(from_first)
        subsets.append(from_second)
    return subsets

In [35]:
# Five halving levels give 2**5 = 32 disjoint subsets of the shuffled data.
subsets = splitData(newvalues,5,"a")
print("nSubsets:",len(subsets))
print("nValues/subset:",len(subsets[0]))
# Show the first subset as a sanity check.
print(subsets[0])

nSubsets: 32
nValues/subset: 86
              0          1           2
1401  35.504378  20.678655  Livingroom
1148  38.276266  20.874772     Kitchen
2602  56.602357  34.371317  Greenhouse
372   39.428697  20.687358     Kitchen
529   69.837929  19.089643     Kitchen
...         ...        ...         ...
2565  69.444692  34.801238  Greenhouse
356   28.627182  17.781601     Kitchen
2094  26.430031  29.849362     Kitchen
716   43.211236  26.829672  Livingroom
1385  80.083631  36.967069  Greenhouse

[86 rows x 3 columns]


In [36]:
# Train one decision tree per subset and collect the trees in `forrest`.
forrest = []
i = 1  # incremented once per subset below; not read anywhere in this cell
for iSet in subsets:
    iList = iSet.values.tolist()  # DataFrame -> list of row lists for build_tree
    i+=1
    tree = build_tree(iList)
    forrest.append(tree)
    print_tree(tree)  # prints every tree in full, which makes the output long

Is temperature >= 31.990900829965742?
--> True:
  Predict {'Greenhouse': 23}
--> False:
  Is temperature >= 15.93598635770735?
  --> True:
    Is temperature >= 22.223625399544936?
    --> True:
      Is temperature >= 27.42149187166944?
      --> True:
        Is humidity >= 51.54174197093124?
        --> True:
          Is humidity >= 56.279265767496135?
          --> True:
            Predict {'Kitchen': 1}
          --> False:
            Predict {'Livingroom': 1}
        --> False:
          Predict {'Kitchen': 5}
      --> False:
        Is humidity >= 45.51911233783494?
        --> True:
          Is humidity >= 63.61078104248174?
          --> True:
            Is humidity >= 63.89248707588036?
            --> True:
              Predict {'Livingroom': 2}
            --> False:
              Predict {'Kitchen': 1}
          --> False:
            Predict {'Livingroom': 13}
        --> False:
          Is temperature >= 24.30229940620845?
          --> True:
            Is tempe

--> True:
  Predict {'Greenhouse': 18}
--> False:
  Is temperature >= 16.514060842538292?
  --> True:
    Is temperature >= 27.961342504582312?
    --> True:
      Predict {'Kitchen': 11}
    --> False:
      Is humidity >= 32.556022544081195?
      --> True:
        Is temperature >= 20.601879984551495?
        --> True:
          Is humidity >= 66.37892385787966?
          --> True:
            Predict {'Kitchen': 1}
          --> False:
            Is temperature >= 21.167241725734566?
            --> True:
              Is humidity >= 34.75230031833316?
              --> True:
                Is humidity >= 54.379745648225914?
                --> True:
                  Is humidity >= 56.156670655573635?
                  --> True:
                    Is temperature >= 26.080370042295314?
                    --> True:
                      Is temperature >= 26.48905016835102?
                      --> True:
                        Predict {'Livingroom': 3}
                      -->

Is temperature >= 30.4025460479245?
--> True:
  Is temperature >= 31.14766143587033?
  --> True:
    Predict {'Greenhouse': 23}
  --> False:
    Is humidity >= 56.47454785725385?
    --> True:
      Predict {'Kitchen': 1}
    --> False:
      Predict {'Greenhouse': 1}
--> False:
  Is temperature >= 15.98055664507344?
  --> True:
    Is temperature >= 18.775537724770224?
    --> True:
      Is temperature >= 25.129128741599704?
      --> True:
        Is humidity >= 48.66755965919147?
        --> True:
          Is humidity >= 59.52581331495701?
          --> True:
            Predict {'Kitchen': 2}
          --> False:
            Predict {'Livingroom': 2}
        --> False:
          Is humidity >= 45.863416123323795?
          --> True:
            Predict {'Kitchen': 3}
          --> False:
            Is humidity >= 45.14579266975731?
            --> True:
              Predict {'Livingroom': 1}
            --> False:
              Predict {'Kitchen': 2}
      --> False:
        Is

In [37]:
def predictionCounter(row,trees,labels):
    """Count, per label, how many trees predict that label for `row`.

    Each tree is queried with classify; a label receives one vote from a
    tree when it appears (with a non-None value) in the predictions
    returned for the row. The result is a list of vote counts aligned
    with `labels`.
    """
    votes = [0] * len(labels)
    for tree in trees:
        leaf_counts = classify(row, tree)
        for position, label in enumerate(labels):
            if leaf_counts.get(label) is not None:
                votes[position] += 1
    return votes

In [38]:
def rndForest(trees,data,labels=None):
    """Print the ensemble's majority vote next to the actual label of every row.

    Args:
        trees: the trained trees to query.
        data: iterable of rows whose last element is the actual label.
        labels: candidate labels to count votes for. Defaults to the four
            room labels used in this notebook.

    For each row the label with the most votes (the first one on ties) is
    printed together with its vote count and the number of trees.
    """
    if labels is None:
        labels = ["Livingroom", "Kitchen", "Greenhouse", "Freezer"]
    for row in data:
        predictions = predictionCounter(row, trees, labels)
        high = max(predictions)
        # The actual label is read from the last column, as class_counts does.
        print("Actual:",row[-1],labels[predictions.index(high)],":", high,"/",len(trees))

In [39]:
# With nPoints=1 every room contributes exactly one row, so this yields one test row per room.
testing_data = GenerateNewData(rooms, 1)
# NOTE(review): the line below is disabled; GenerateNewData returns a plain list, which has no .sample().
#testing_data = testing_data.sample(frac=1).reset_index(drop=True) # Randomizes the array!

4


In [40]:
# Print, for each test row, the actual label, the label with the most tree votes and the vote count.
rndForest(forrest,testing_data)

Actual: Freezer Freezer : 32 / 32
Actual: Kitchen Livingroom : 25 / 32
Actual: Livingroom Kitchen : 23 / 32
Actual: Greenhouse Greenhouse : 29 / 32
