In [1]:
from __future__ import print_function
import numpy as np
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from collections import Counter
import time
import random
from random import sample 
import matplotlib.pyplot as plt
import sklearn
from sklearn import tree

In [2]:
#scikit learn classifier with train values
def scikit_learn(train):
    """Fit a scikit-learn decision tree on a training frame.

    The last column of ``train`` is taken as the target and every other
    column as a feature.  Returns the fitted ``DecisionTreeClassifier``.
    """
    label_column = train.columns[-1]
    feature_columns = train.columns[:-1]
    features = train[feature_columns].values.tolist()
    targets = train[label_column].values.tolist()
    model = sklearn.tree.DecisionTreeClassifier()
    model.fit(features, targets)
    return model
    

In [3]:
#Accuracy of a fitted scikit-learn classifier on the test set.
def scikit_predict(clf,test):
    """Return the accuracy of a fitted classifier on a test frame.

    Parameters
    ----------
    clf : object exposing ``predict(rows)``; called once with the list of
        all feature rows and expected to return one prediction per row.
    test : pandas.DataFrame whose last column holds the true label and whose
        remaining columns are the features.

    Returns
    -------
    float
        Fraction of rows whose prediction equals the true label.

    Raises
    ------
    ValueError
        If ``test`` has no rows (accuracy is undefined).
    """
    x_test = test[test.columns[:-1]].values.tolist()
    y_test = test[test.columns[-1]].values.tolist()
    if not y_test:
        raise ValueError("cannot compute accuracy on an empty test set")

    # One batched call instead of one predict() call per row; the result is
    # kept in its own name so the ``test`` argument is never overwritten.
    predictions = clf.predict(x_test)
    correct = sum(
        1 for predicted, actual in zip(predictions, y_test) if predicted == actual
    )
    return 1.0 * correct / len(y_test)
        
    

In [4]:
#Count of each class label (last column) in the dataset.
def class_counts(dataset):
    """Tally how often each label occurs in the last column of ``dataset``.

    Returns a ``collections.Counter`` mapping label -> number of rows.
    """
    label_column = dataset.columns[-1]
    return Counter(dataset[label_column])
    

In [5]:

def most_frequent(counter):
    """Return the key with the largest positive count in ``counter``.

    On ties the key encountered first during iteration wins.  When the
    mapping is empty (or holds no positive count) the result is ``0``.
    """
    best_label, best_count = 0, 0
    for label, count in counter.items():
        if count > best_count:
            best_label, best_count = label, count
    return best_label

In [6]:
def gini(rows):
    """Gini impurity of the labels found in the last column of ``rows``.

    Computed as one minus the sum of squared label shares.  With no rows
    there are no labels to sum over and the result is ``1``.
    """
    label_counts = class_counts(rows)
    n_rows = len(rows)
    squared_shares = sum(
        (label_counts[label] * 1.0 / n_rows) ** 2 for label in label_counts
    )
    return 1 - squared_shares

In [7]:
def info_gain(upper, lower, current):
    """Impurity reduction obtained by splitting into ``upper`` and ``lower``.

    ``current`` is the Gini impurity before the split; the value returned is
    ``current`` minus the size-weighted Gini impurity of the two partitions.
    """
    n_lower = len(lower)
    n_upper = len(upper)
    lower_weight = n_lower * 1.0 / (n_lower + n_upper)
    weighted_impurity = lower_weight * gini(lower) + (1 - lower_weight) * gini(upper)
    return current - weighted_impurity

In [8]:
class Output:
    """Leaf of the tree: remembers the most frequent label of its rows."""

    def __init__(self, rows):
        label_counts = class_counts(rows)
        # Label returned for every sample that reaches this leaf.
        self.prediction = most_frequent(label_counts)
    
        
   

In [9]:
class Decision_Node:
    """Internal tree node holding a threshold test on one attribute.

    ``true`` is the subtree built from rows above the cutoff and ``false``
    the subtree built from the remaining rows.
    """

    def __init__(self, cutoff, attr, true, false):
        # The test itself: which attribute, compared against which value.
        self.attr = attr
        self.cutoff = cutoff
        # The two subtrees.
        self.true = true
        self.false = false

In [10]:
def best_split(dataset):
    """Search every feature/threshold pair for the highest information gain.

    Every distinct value of every feature column (all columns but the last)
    is tried as a threshold.  Candidates that leave one side empty are
    ignored.  Returns ``(gain, attribute, cutoff)``; when no candidate
    separates the rows the initial placeholder ``(-10, 0, 0)`` is returned.
    """
    parent_impurity = gini(dataset)
    winner = (-10, 0, 0)

    for column in dataset.columns[:-1]:
        for threshold in set(dataset[column]):
            above, below = part(dataset, threshold, column)
            if len(above) != 0 and len(below) != 0:
                candidate_gain = info_gain(above, below, parent_impurity)
                # Strict comparison: the first candidate seen wins a tie.
                if candidate_gain > winner[0]:
                    winner = (candidate_gain, column, threshold)

    return winner
            
            


In [11]:
def part(dataset,cutoff,attr):
    """Split ``dataset`` on ``attr`` at ``cutoff``.

    Returns ``(upper, lower)``: the rows whose value is strictly greater
    than the cutoff, and the rows whose value is less than or equal to it.
    """
    column = dataset[attr]
    above = dataset[column > cutoff]
    at_or_below = dataset[column <= cutoff]
    return above, at_or_below
    

In [12]:
def build_tree(dataset,depth,greedy): 
    """Recursively grow a decision tree over ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame; the last column is the label.
    depth : remaining depth budget.  Only consulted when ``greedy`` is
        false; a branch becomes a leaf when it reaches 0.
    greedy : when true, keep splitting until the best split has no positive
        gain (``depth`` is ignored).  When false, stop on ``depth == 0`` or
        fewer than 5 rows.

    Returns
    -------
    Output or Decision_Node
        The root of the (sub)tree.
    """
    # In depth-limited mode the stopping rule does not depend on the split,
    # so decide first and skip the expensive split search for leaves.
    if not greedy and (depth == 0 or len(dataset) < 5):
        return Output(dataset)

    best_gain, best_attr, best_cutoff = best_split(dataset)

    if greedy and best_gain <= 0:
        return Output(dataset)

    # best_split can come back without a usable split (e.g. every feature is
    # constant over these rows).  Its placeholder attribute is then not a
    # feature column and partitioning on it would raise KeyError, so fall
    # back to a leaf instead.
    if best_attr not in dataset.columns[:-1]:
        return Output(dataset)

    upper,lower = part(dataset,best_cutoff,best_attr)

    # A split that leaves one side empty separates nothing; stop here.
    if len(upper) == 0 or len(lower) == 0:
        return Output(dataset)

    true_branch = build_tree(upper,depth-1,greedy)
    false_branch = build_tree(lower,depth-1,greedy)

    return Decision_Node(best_cutoff,best_attr,true_branch,false_branch)
        

In [13]:
#Check a given row for the condition.
def check(row,node):
    """Tell whether ``row``'s value for the node's attribute exceeds its cutoff."""
    value = row[node.attr]
    return bool(value > node.cutoff)


In [14]:
#Classify one row of the iris dataset by walking the tree down to a leaf.
def classify_iris(row,node):
    """Walk the tree from ``node`` down to a leaf and return its prediction.

    At every ``Decision_Node`` the ``true`` branch is followed when ``check``
    succeeds for ``row`` and the ``false`` branch otherwise.
    """
    current = node
    while not isinstance(current, Output):
        current = current.true if check(row, current) else current.false
    return current.prediction
    

In [15]:
#Dividing the data into train and test.
def divide_data(dataset):
    """Split ``dataset`` into ``(train, test)`` by index label.

    Rows whose index label is a multiple of 3 go to the test frame and all
    other rows to the training frame.  The rows are selected with a boolean
    mask rather than rebuilt from ``iterrows()``, so each column keeps its
    dtype and an empty split still carries the original columns.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``train`` and ``test``, in the original row order.
    """
    is_test = (dataset.index % 3 == 0)
    return dataset[~is_test], dataset[is_test]
    

In [16]:
#Evaluating on the test set
def accuracy(test,tree):
    """Return the fraction of rows in ``test`` that ``tree`` classifies correctly.

    Parameters
    ----------
    test : pandas.DataFrame whose last column is the true label.
    tree : root node of a tree, as accepted by ``classify_iris``.

    Raises
    ------
    ValueError
        If ``test`` has no rows (accuracy is undefined).
    """
    total = len(test)
    if total == 0:
        raise ValueError("cannot compute accuracy on an empty test set")

    correct = 0
    for _, row in test.iterrows():
        # The last entry of the row is the true label.
        if row.tolist()[-1] == classify_iris(row, tree):
            correct += 1
    return 1.0 * correct / total
    

In [17]:
def random_features(dataset,m):
    """Keep ``m`` randomly chosen feature columns plus the label column.

    Features are drawn without replacement from every column except the
    last; the last column is always appended at the end of the result.
    """
    candidates = list(dataset.columns[:-1])
    label_column = dataset.columns[-1]
    kept = sample(candidates, m) + [label_column]
    return dataset[kept]

In [18]:
#Q1(a) Extending the Decision tree to build random forests.
def random_forest(dataset,m,n,depth): 
    """Grow ``n`` depth-limited trees, each on ``m`` randomly chosen features.

    Every tree is built with ``build_tree`` in non-greedy mode on a fresh
    random feature subset of ``dataset``.  Returns the list of tree roots.
    """
    return [
        build_tree(random_features(dataset, m), depth, False)
        for _ in range(n)
    ]


In [19]:
def test_random_forests(tree_arr,test):
    """Return the majority-vote accuracy of a forest on ``test``.

    Parameters
    ----------
    tree_arr : sequence of tree roots, each accepted by ``classify_iris``.
    test : pandas.DataFrame whose last column is the true label.

    Returns
    -------
    float
        Fraction of rows for which the most common prediction among the
        trees equals the true label.

    Raises
    ------
    ValueError
        If the forest has no trees or ``test`` has no rows.
    """
    if len(tree_arr) == 0:
        raise ValueError("cannot vote with an empty forest")
    total = len(test)
    if total == 0:
        raise ValueError("cannot compute accuracy on an empty test set")

    # ``correct`` rather than ``accuracy`` so the sibling function of that
    # name is not shadowed inside this body.
    correct = 0
    for _, row in test.iterrows():
        votes = [classify_iris(row, root) for root in tree_arr]
        if row.tolist()[-1] == most_common(votes):
            correct += 1
    return 1.0 * correct / total
        
    

In [20]:
def most_common(lst):
    """Return the element that occurs most often in ``lst``.

    Ties are resolved in favour of the element that appears first in
    ``lst``, so the result does not depend on set/hash iteration order.
    Counting is done in a single pass.

    Raises
    ------
    ValueError
        If ``lst`` is empty.
    """
    if len(lst) == 0:
        raise ValueError("most_common() arg is an empty sequence")
    # most_common(1) keeps first-encountered order among equal counts.
    return Counter(lst).most_common(1)[0][0]

In [22]:
#Q1 (D) Accuracy on the IRIS dataset.
# Reads 'iris.csv' from the working directory; the functions above treat the
# last column as the class label and every other column as a feature.
dataset = pd.read_csv('iris.csv') 
# divide_data sends every row whose index is a multiple of 3 to the test set.
train,test = divide_data(dataset)
# Forest of 20 trees, each grown on 2 randomly chosen features with a depth limit of 2.
tree_arr = random_forest(train,2,20,2)

acc1 = test_random_forests(tree_arr, test)
# Single tree in greedy mode: it keeps splitting until no split has positive
# gain; the depth argument (3) is not consulted in that mode.
tree_train = build_tree(train,3,True)
acc2 = accuracy(test,tree_train)

# No random seed is set, so the forest accuracy can change from run to run.
print("Accuracy using Random Forests ", acc1) 
print("Accuracy with single Tree", acc2)

Accuracy using Random Forests  0.94
Accuracy with single Tree 0.9


In [23]:
def forest_cross_valid(total_data, estimator_list=(1, 2, 5, 10, 50, 100),
                       n_folds=5, m=2, depth=2):
    """Pick the forest size with the best k-fold cross-validated accuracy.

    The rows of ``total_data`` are cut, in their current order, into
    ``n_folds`` contiguous blocks.  For every candidate forest size each
    block is held out once while a forest is trained on the remaining rows;
    the candidate's score is the mean held-out accuracy.

    Parameters
    ----------
    total_data : pandas.DataFrame; the last column is the label.  Shuffle it
        beforehand if the rows are ordered by class.
    estimator_list : forest sizes (number of trees) to compare.
    n_folds : number of folds; the default 5 on 150 rows gives the 30-row
        blocks that used to be hard-coded.
    m : number of random features per tree.
    depth : depth limit of every tree.

    Returns
    -------
    (float, int)
        ``(max_acc, opt_est)``: the best mean accuracy and the forest size
        that achieved it (the first one on ties).  ``(0, 0)`` when no
        candidate scores above zero.

    Raises
    ------
    ValueError
        If ``n_folds`` is below 2 or exceeds the number of rows.
    """
    n_rows = len(total_data)
    if n_folds < 2 or n_folds > n_rows:
        raise ValueError("n_folds must be between 2 and the number of rows")

    # Positional fold boundaries; integer arithmetic spreads any remainder.
    bounds = [(k * n_rows) // n_folds for k in range(n_folds + 1)]
    folds = list(zip(bounds[:-1], bounds[1:]))
    # Hold out the last block first, then blocks 0..k-2: this is the order
    # the folds were evaluated in before, so seeded runs draw the same
    # random numbers for the same fold.
    folds = [folds[-1]] + folds[:-1]

    max_acc = 0
    opt_est = 0
    for est in estimator_list:
        acc_sum = 0.0
        for start, stop in folds:
            test_data = total_data.iloc[start:stop]
            train_data = pd.concat([total_data.iloc[:start], total_data.iloc[stop:]])

            tree_arr = random_forest(train_data, m, est, depth)
            acc_sum += test_random_forests(tree_arr, test_data)

        avg_acc = acc_sum / n_folds

        if(avg_acc>max_acc):
            max_acc = avg_acc
            opt_est = est

    return max_acc,opt_est 


In [None]:
#Q1(e) Shuffle the rows before cross-validation.
# sample(frac = 1) returns every row in random order. No random_state is
# given, so the order differs on each run, and re-running this cell shuffles
# the already-shuffled `dataset` again.

dataset = dataset.sample(frac = 1)

#Finding optimal accuracy and number of estimators.

max_acc, opt_est = forest_cross_valid(dataset)
print ("the optimal estimator is",opt_est)