# DecisionTree - House Votes

In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv
from collections import Counter, defaultdict


In [73]:
df = pd.read_csv("house-votes-84.csv")

In [74]:
df.head()

Unnamed: 0,Class Name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,republican,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y


In [75]:
type(df)

pandas.core.frame.DataFrame

In [76]:
df.replace('?', np.nan, inplace = True)

In [77]:
df = df[df.notna().all(axis=1)]

In [78]:
features = list(df.columns)
features

['Class Name',
 ' handicapped-infants',
 ' water-project-cost-sharing',
 ' adoption-of-the-budget-resolution',
 ' physician-fee-freeze',
 ' el-salvador-aid',
 ' religious-groups-in-schools',
 ' anti-satellite-test-ban',
 ' aid-to-nicaraguan-contras',
 ' mx-missile',
 ' immigration',
 ' synfuels-corporation-cutback',
 ' education-spending',
 ' superfund-right-to-sue',
 ' crime',
 ' duty-free-exports',
 ' export-administration-act-south-africa']

In [79]:
rows = df.index
print(rows)

Int64Index([  5,   8,  19,  23,  25,  26,  27,  28,  29,  30,
            ...
            418, 419, 420, 421, 422, 423, 426, 427, 430, 431],
           dtype='int64', length=232)


In [80]:
Y = df["Class Name"]
Y.value_counts()

democrat      124
republican    108
Name: Class Name, dtype: int64

In [81]:
class Node:
    def __init__(self, value):
        self.value = value
        self.left = None
        self.right = None

In [82]:
class DecisionTree():
    def __init__(self, data, features, rows, min_samples_split =2, max_depth = 5, depth = None, **kwargs): 
        self.data = data  
        self.features = list(self.data.columns)
        self.rows = data.index
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.depth = depth
        self.counts = Counter()
        self.root = None
        self.child = None
                
    def __repr__(self): #Pronounced "wrapper" if we print the object it should print the following string
        return "DecisionTree(data = {}, features = {}, Names = {})".format(self.data, self.features, self.features[0])
            
    def class_size(self):
        #This version will return the class size of the two classes p and e
        class_size = df.groupby(self.features[0]).size()/len(self.data)
        return class_size
        print(class_size)
        
    #GINI formula  g(x) = 1-(x**2 + (1-x)**2)
    def gini(self):
        t = len(self.data)
        p = self.data.groupby(self.features).size()/t
        return 1 - sum(p **2)

    #Entropy formula H(x) = -(x log_2(x) + (1-x) log_2(1-x)) /// 1/2 H(x)
    def entropy(self):
        t = len(self.data)
        p = self.data.groupby(self.features[0]).size()/t
        return sum(p*np.log(p.values))
    
    #Define subset for gini calculation
    def subf(self):
        groups, keys = self.data.groupby[self.features].groups, keys()
        return groups, keys
    
    #Get the impurity of the node using the GINI formula
    def impurity(self, prop):
        pur = 0
        for k, subf in self.data.groupby(prop):
            g = self.gini(subf)
            pur += g*len(subf/len(self.data))
            return pur
        
    #This function gets the minimum impurity in the sorted list
    def gini_sort(self, data):
        new_node = []
        for f in self.features[1:]:
            new_node.append(f, impurity(self.data, f))
            new_node.sort(key=lambda t: t[1])
            
    #Check if a leaf node - stop there   
    def is_leaf_node(self):
        if impurity == 0 or depth >= self.max_depth:
            return True  
               
    #Information gain
    def info_gain (self):
        self.root_node = self.impurity
        #best_gain
        best_gain = self.root_node - impurity
        return best_gain
              
    def insert(self, value):
        new_node = Node(value)
        if self.root is None:
            self.root = new_node
            return True
        temp = self.root
        while(True):
            if new_node.value == temp.value:
                return False
            if new_node.value < temp.value:
                if temp.left is None:
                    temp.left = new_node
                    return True
                temp = temp.left
                if temp.left is None:
                    temp.left = new_node
                    return True
                temp = temp.left
            else:
                if temp.right is None:
                    temp.right = new_node
                    return True
                temp = temp.right
    
    def grow_tree(self, features, node):
        if node.is_leaf_node():
            return True
        else:
            subf(self)
            impurity(self)
            gini_sort(self, data)
            info_gain(self, data)
            insert(self, value)
               
    def predict(self, features):
        return np.array([self._grow_tree])

In [83]:
tree1 = DecisionTree(df, features, rows)

In [84]:
print(tree1.root)

None


In [85]:
tree1.features[0]

'Class Name'

In [86]:
dir(tree1)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'child',
 'class_size',
 'counts',
 'data',
 'depth',
 'entropy',
 'features',
 'gini',
 'gini_sort',
 'grow_tree',
 'impurity',
 'info_gain',
 'insert',
 'is_leaf_node',
 'max_depth',
 'min_samples_split',
 'predict',
 'root',
 'rows',
 'subf']

In [87]:
tree1.gini()

0.9892612960760999

In [88]:
tree1.class_size()

Class Name
democrat      0.534483
republican    0.465517
dtype: float64

In [89]:
tree1.entropy()

-0.6907671705264811

In [90]:
tree1.data

Unnamed: 0,Class Name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
5,democrat,n,y,y,n,y,y,n,n,n,n,n,n,y,y,y,y
8,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,y
19,democrat,y,y,y,n,n,n,y,y,y,n,y,n,n,n,y,y
23,democrat,y,y,y,n,n,n,y,y,y,n,n,n,n,n,y,y
25,democrat,y,n,y,n,n,n,y,y,y,y,n,n,n,n,y,y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423,democrat,n,y,y,n,n,y,y,y,y,n,y,n,n,y,y,y
426,democrat,y,n,y,n,n,n,y,y,y,y,n,n,n,n,y,y
427,republican,n,n,n,y,y,y,y,y,n,y,n,y,y,y,n,y
430,republican,n,n,y,y,y,y,n,n,y,y,n,y,y,y,n,y


In [91]:
tree1.max_depth

5

In [98]:
#tree1.impurity(" handicapped-infants")

In [99]:
hp = {"max_depth": 3, "min_samples": 50}
hp

{'max_depth': 3, 'min_samples': 50}