In [1]:
import pandas as pd
import numpy as np
from google.colab import drive
import time
# Mount Google Drive at /content/drive (Colab only) so the dataset path used below resolves.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Location of the iris CSV inside the mounted Drive folder.
iris_path = "/content/drive/MyDrive/ML/Lab 8/iris.csv"
# Raw dataset; later cells read the class label from its 'variety' column.
df = pd.read_csv(iris_path)

In [3]:
def discretize(df, target="variety", n_bins=3, verbose=True):
  """Replace every numeric feature column of ``df`` with equal-width bin labels.

  The frame is modified in place and nothing is returned, so existing
  ``discretize(df)`` calls keep working unchanged.

  Args:
    df: DataFrame whose feature columns should be binned.
    target: name of the label column, which is left untouched.
    n_bins: number of equal-width bins; labels are the strings "1".."n_bins".
    verbose: when True, print the name of each column as it is binned.
  """
  labels = [str(k) for k in range(1, n_bins + 1)]
  for name in list(df):
    if name == target:
      continue
    col = df[name]
    # Columns that are already binned (categorical) or otherwise non-numeric
    # are skipped, so re-running the cell is a no-op instead of a crash.
    if not pd.api.types.is_numeric_dtype(col):
      continue
    if verbose:
      print(name)
    df[name] = pd.cut(col, n_bins, labels=labels)  # equal-width bins over the column's range

In [4]:
# Bin the feature columns of df in place; the 'variety' label column is left as-is.
discretize(df)

sepal.length
sepal.width
petal.length
petal.width


In [5]:
# Separate the feature columns from the class label column.
X = df.drop(columns=['variety'])
y = df['variety']

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state is fixed at 0 for this split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [7]:
class Node:
    """One vertex of a decision tree.

    Attributes:
        leaf: True once ``is_leaf`` has been called on this node.
        out: value stored by ``is_leaf``; -1 until then.
        attr_value: value given at construction or by ``set_attribute``.
        children: child nodes, in the order they were added.
    """

    def __init__(self, attr):
        self.leaf, self.out = False, -1
        self.attr_value = attr
        self.children = list()

    def set_attribute(self, value):
        """Overwrite the stored attribute value."""
        self.attr_value = value

    def is_leaf(self, value):
        """Mark this node as a leaf and store ``value`` as its output."""
        self.leaf, self.out = True, value

    def add_child(self, obj):
        """Append ``obj`` to the end of the children list."""
        self.children += [obj]


In [8]:
from math import log2, inf
class DecisionTree:
  """ID3-style classifier for features already binned into the labels "1", "2", "3".

  Every internal node tests one attribute and has exactly one child per bin
  label, in ``BIN_LABELS`` order; leaves store the predicted class.
  """

  # Bin labels a feature may take; child k of an internal node handles BIN_LABELS[k].
  BIN_LABELS = ("1", "2", "3")

  def __init__(self):
    self.root_node = Node("")
    self.max_depth = 0

  def entropy(self, X_c, y_c):
    """Return the Shannon entropy (base 2) of the labels in ``y_c``.

    ``X_c`` is unused and kept only for signature compatibility.
    An empty ``y_c`` has entropy 0.
    """
    total = len(y_c)
    if total == 0:
      return 0
    val = 0
    for count in y_c.value_counts():
      if count:  # skip zero counts (possible for categorical labels); log2(0) is undefined
        p = count / total
        val -= p * log2(p)
    return val

  def split_by_attr(self, X, y, attr):
    """Partition ``(X, y)`` by the value of column ``attr``.

    Returns a list with one ``(X_part, y_part)`` tuple per entry of
    ``BIN_LABELS``, in that order. Parts may be empty. ``X`` and ``y`` must
    have the same length and row order.
    """
    parts = []
    for label in self.BIN_LABELS:
      # A positional boolean mask avoids index alignment, so bootstrap samples
      # with repeated index labels are partitioned correctly.
      mask = (X[attr] == label).to_numpy()
      parts.append((X.loc[mask], y.loc[mask]))
    return parts

  def _majority_label(self, y_s):
    """Return the most frequent label in ``y_s`` (first one on ties)."""
    return y_s.value_counts().idxmax()

  def recur(self, X_s, y_s, remaining_attributes, node, depth = 0):
    """Grow the subtree rooted at ``node`` from the samples ``(X_s, y_s)``.

    ``remaining_attributes`` lists the columns still available for splitting;
    it is not modified. ``y_s`` must be non-empty.
    """
    if depth > self.max_depth or len(remaining_attributes) == 0:
      node.is_leaf(self._majority_label(y_s))
      return
    parent_info = self.entropy(X_s, y_s)
    if parent_info == 0:
      # Pure node: every sample carries the same label.
      node.is_leaf(y_s.unique()[0])
      return
    best_children = []
    best_attribute = ""
    max_gain = -inf
    for att in remaining_attributes:
      children = self.split_by_attr(X_s, y_s, att)
      children_entropy = 0
      for X_c, y_c in children:
        children_entropy += (len(X_c) / len(X_s)) * self.entropy(X_c, y_c)
      gain = parent_info - children_entropy
      if gain > max_gain:  # strict: the first attribute wins ties
        best_attribute = att
        max_gain = gain
        best_children = children
    if max_gain < 0 and depth > 1:
      # Guard kept from the original code; with correct partitions the gain
      # can only be negative through floating-point rounding.
      node.is_leaf(self._majority_label(y_s))
      return
    node.set_attribute(best_attribute)
    child_attributes = [a for a in remaining_attributes if a != best_attribute]
    fallback = self._majority_label(y_s)
    for X_c, y_c in best_children:
      child = Node("")
      node.add_child(child)
      if len(y_c) == 0:
        # No training sample reached this bin: predict the parent's majority class.
        child.is_leaf(fallback)
      else:
        # Each subtree gets its own copy so siblings cannot affect one another.
        self.recur(X_c, y_c, list(child_attributes), child, depth + 1)

  def fit(self, X, y):
    """Build the tree from features ``X`` and labels ``y``; return the root node.

    Raises:
      ValueError: if ``y`` is empty.
    """
    if len(y) == 0:
      raise ValueError("cannot fit a DecisionTree on empty data")
    self.root_node = Node("")  # start from a fresh root so refitting does not stack children
    self.max_depth = len(list(X))
    self.recur(X, y, list(X), self.root_node)
    return self.root_node

  def predict(self, X):
    """Return a Series (default integer index) with one prediction per row of ``X``."""
    return pd.Series([self.predict_helper(row) for _, row in X.iterrows()])

  def predict_helper(self, X):
    """Predict the class for a single row ``X`` (a Series keyed by column name)."""
    return self.predict_recur(X, self.root_node)

  def predict_recur(self, X, node):
    """Walk down from ``node`` following the bin labels in row ``X`` until a leaf."""
    if node.leaf:
      return node.out
    # Bin label "k" selects child k-1.
    return self.predict_recur(X, node.children[int(X.loc[node.attr_value]) - 1])


In [9]:
import math
class RandomForest:
  """Majority-vote ensemble of DecisionTree models grown on bootstrap samples."""

  def __init__(self):
    self.ntrees = 0
    self.trees = []

  def bagging(self, X_p, y_p, random_state=None):
    """Draw one bootstrap sample of rows and a random subset of feature columns.

    Rows: ``len(X_p) // 2`` (at least 1) drawn with replacement; ``X_p`` and
    ``y_p`` are sampled by position so they stay aligned whatever their index
    or the name of ``y_p``. Columns: ``int(sqrt(n_cols)) + 1`` drawn without
    replacement, capped at ``n_cols``.

    Args:
      X_p: feature DataFrame.
      y_p: label Series with the same length and row order as ``X_p``.
      random_state: optional int seed; None leaves the sampling unseeded.

    Returns:
      ``(X_sample, y_sample)``.
    """
    n_rows = max(1, len(X_p) // 2)
    positions = (
        pd.Series(range(len(X_p)))
        .sample(n=n_rows, replace=True, random_state=random_state)
        .to_numpy()
    )
    X_b = X_p.iloc[positions]
    y_b = y_p.iloc[positions]
    n_cols = X_b.shape[1]
    # Cap the request: sampling more columns than exist without replacement raises.
    n_keep = min(n_cols, int(math.sqrt(n_cols)) + 1)
    X_b = X_b.sample(n=n_keep, axis='columns', random_state=random_state)
    return X_b, y_b

  def fit(self, X, y, ntrees = 4, random_state=None):
    """Train ``ntrees`` trees, each on its own bagged sample of ``(X, y)``.

    Trees from any previous ``fit`` call are discarded. When ``random_state``
    is an int, tree ``i`` is bagged with seed ``random_state + i``.
    """
    self.ntrees = ntrees
    self.trees = []  # refitting replaces the forest instead of growing it
    for i in range(ntrees):
      seed = None if random_state is None else random_state + i
      X_p, y_p = self.bagging(X, y, seed)
      tree = DecisionTree()
      tree.fit(X_p, y_p)
      self.trees.append(tree)

  def predict(self, X):
    """Return, for every row of ``X``, the class predicted by most trees.

    The result is a Series with one entry per row; when several classes tie,
    the first one in sorted order is returned.

    Raises:
      ValueError: if the forest holds no trees.
    """
    if not self.trees:
      raise ValueError("RandomForest has no trees; call fit() before predict()")
    preds = [tree.predict(X) for tree in self.trees]
    # Rows are trees and columns are samples, so mode() votes per sample.
    df_preds = pd.concat(preds, axis=1).T
    return df_preds.mode().iloc[0]

In [12]:
# Time only the training of a 100-tree forest (third argument of fit is ntrees).
start = time.perf_counter()
model = RandomForest()
model.fit(X_train, y_train, 100)
end = time.perf_counter()
# Prediction happens after `end` is taken, so it is not part of the measured time.
y_preds = model.predict(X_test)

In [14]:
# Elapsed training time in seconds (difference of two time.perf_counter() readings).
print(end - start)

5.892108531000019


In [None]:
# Compare predictions with the true labels position by position.
actual = list(y_test)
tot = len(y_test)
corr = sum(1 for truth, guess in zip(actual, list(y_preds)) if truth == guess)
print("Testing Accuracy:", corr / tot)
# Rows of the table are true classes, columns are predicted classes.
print("\nConfusion Matrix:\n", pd.crosstab(pd.Series(actual), y_preds))

Testing Accuracy: 1.0

Confusion Matrix:
 0           Setosa  Versicolor  Virginica
row_0                                    
Setosa          11           0          0
Versicolor       0          13          0
Virginica        0           0          6


In [None]:
# Iterating a DataFrame yields its column labels, so this lists the feature names.
print(list(X_train))

['sepal.length', 'sepal.width', 'petal.length', 'petal.width']
