# DecisionTrees Class Draft

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv
from collections import Counter, defaultdict


### Notes about this notebook:  
In this notebook, I tried to put the whole DecisionTree project into one class.  I like this better - it is
much more logical to me

In [30]:
df = pd.read_csv("mushroom.csv")

In [31]:
df

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [32]:
type(df)

pandas.core.frame.DataFrame

In [33]:
df.replace('?', np.nan, inplace = True)

In [34]:
X = df.drop("class", axis = 1)
X.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,s,n,t,p,f,c,n,k,e,...,s,w,w,p,w,o,p,k,s,u
1,x,s,y,t,a,f,c,b,k,e,...,s,w,w,p,w,o,p,n,n,g
2,b,s,w,t,l,f,c,b,n,e,...,s,w,w,p,w,o,p,n,n,m
3,x,y,w,t,p,f,c,n,n,e,...,s,w,w,p,w,o,p,k,s,u
4,x,s,g,f,n,f,w,b,k,t,...,s,w,w,p,w,o,e,n,a,g


#### Value counts:
Here again I could get the value counts for the categories in a column but not in the entire dataframe.  
Apparently it only works for a Series but not for a DataFrame.

In [35]:
Y = df["class"]
Y.value_counts()

e    4208
p    3916
Name: class, dtype: int64

In [36]:
cap = X["cap-shape"]
cap.value_counts()

x    3656
f    3152
k     828
b     452
s      32
c       4
Name: cap-shape, dtype: int64

In [37]:
features = list(X.columns)

#### Root node
I found this for hyperparameters to use in getting the root node, but I'm not sure how to apply it

In [38]:
hp = {"max_depth": 3, "min_samples": 50}
hp

{'max_depth': 3, 'min_samples': 50}

In [11]:
#root = DecisionTree(Y, X, **hp)

#### Making the Decision Tree Class
I would like to get the csv from within the class but can't seem to do it.  I put the root node into 
the attributes but see above about this.  I don't have a code for how to grow the tree


In [39]:
class DecisionTree():
    def __init__(self, file, X, y, min_samples_split =2, max_depth = 5, depth = None): 
        self.file = file  
        self.X = X
        self.y = y
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.depth = depth
        self.features = list(self.X.columns)
        #self.node_type = node_type if node_type else "root"
        self.counts = Counter()
        self.root = None
        self.child = None
        
    def __repr__(self): #Pronounced "wrapper" if we print the object it should print the following string
        return "DecisionTree(file = {}, X = {}, y = {})".format(self.file, self.X, self.y)
            
    def get_data(self):  #These set method "get data".  Must put self into the paren
        df = pd.read_csv("mushrooms.csv")
        return df

    def class_size(self):
        #This version will return the class size of the two classes p and e
        class_size = df.groupby(self.y).size()/len(self.file)
        return class_size
        print(class_size)
        
    #GINI formula:  g(x) = 1-(x**2 + (1-x)**2)
    def gini(self):
        t = len(self.file)
        p = df.groupby(self.y).size()/t
        return 1- sum(p **2)
    
    #Entropy formula H(x) = -(x log_2(x) + (1-x) log_2(1-x)) /// 1/2 H(x)
    def entropy(self):
        t = len(self.file)
        p = df.groupby(self.y).size()/t
        return - sum(p*np.log(p.values))
    
    #This is where I start to get mixed up
    #Define subset for gini calculation
    def subframe(self):
        groups, keys = df.groupby[self.X].groups,keys()
        return groups, keys 
    
    #Calculate gini impurity for that subset   #property is like "cap-color")
    def gini_impurity(self, property):
        impurity = 0
        for keys, subframe in self.file.groupby(property):
            g = gini(subframe)
            avg = g*len(subframe)/len(self.file)
            impurity += avg
            return impurity
        
    #Calculate gini impurity for multiple subsets
    def gini_impurities(self, X):
       #Iterate over the columns
        impurities = []
        for x in X:
            impurities.append(x, impurities)
            return impurities

    #Count elements
    def count_elements(self, X):
        n = "number of observations"
        y1 = "number of first class elements"
        y2 = "number of second class elements"
        x = "numeric observations"
        
        #Insuring the correct type
        if y1_count is None:
            y1_count = 0
            
        if y2_count is None:
            y2_count = 0
            
        n = y1_count + y2_count
        
        #If n is 0, then return lowest possible gini impurity
        if n == 0:
            return 0.0
        
        #Getting the probability for each  of the classes
        p1 = y1_count /n
        p2 = y2_count /n
            
    #Compare gini impurity for subset with data.gini
    def compare_impurity(self, gini, impurity):
        return gini - impurity
        
    #Check if a leaf node - stop there   
    def is_leaf_node(self):
        if impurity == 0 or depth >= self.max_depth:
            return True
    
    #Information gain
    def info_gain_root(self):
        # parent node
        self.root = self.impurities.min()
        #best_gain
        best_gain = self.root - impurities
        return best_gain
        
    def info_gain(self):
        #child node
        self.child = self.impurities()
        best_gain = self.child - impurities
        return best_gain
   
    #Splitting criteria
    def split_criteria(self, X, y):
        if gain > best_gain:
            best_gain = gain
            return 
    
    #Generate split
    def _split(self, X_column, split):
        left_idxs = "fill in this"
        right_idxs = "fill in this"
        return left_idxs, rigth_idxs
    
    #Grow tree
    def grow_tree(self,       ):
        "fill in"
     
                                     
    def _traverse_tree(self, x, node):
        if node.is_leaf_node():
            return node.value
        if X.feature > 0.5:
            return (x, left)
        return self._traverse_tree(x, node.right)
    
    def predict(self, X):
        #traverse tree -This is a recursive function
        return np.array([self._traverse_tree(x, self.root) for x in X])
    
    #Repeat steps from subframe to leaf node for other child nodes    
    

In [40]:
tree1 = DecisionTree(df, X, Y, 100) # instantiation

In [41]:
#tree1  #Why does node give this output, where is the repr

In [42]:
tree1.class_size() 

class
e    0.517971
p    0.482029
dtype: float64

In [43]:
tree1.file

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [44]:
tree1.X

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,s,n,t,p,f,c,n,k,e,...,s,w,w,p,w,o,p,k,s,u
1,x,s,y,t,a,f,c,b,k,e,...,s,w,w,p,w,o,p,n,n,g
2,b,s,w,t,l,f,c,b,n,e,...,s,w,w,p,w,o,p,n,n,m
3,x,y,w,t,p,f,c,n,n,e,...,s,w,w,p,w,o,p,k,s,u
4,x,s,g,f,n,f,w,b,k,t,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,k,s,n,f,n,a,c,b,y,e,...,s,o,o,p,o,o,p,b,c,l
8120,x,s,n,f,n,a,c,b,y,e,...,s,o,o,p,n,o,p,b,v,l
8121,f,s,n,f,n,a,c,b,n,e,...,s,o,o,p,o,o,p,b,c,l
8122,k,y,n,f,y,f,c,n,b,t,...,k,w,w,p,w,o,e,w,v,l


In [45]:
X

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,s,n,t,p,f,c,n,k,e,...,s,w,w,p,w,o,p,k,s,u
1,x,s,y,t,a,f,c,b,k,e,...,s,w,w,p,w,o,p,n,n,g
2,b,s,w,t,l,f,c,b,n,e,...,s,w,w,p,w,o,p,n,n,m
3,x,y,w,t,p,f,c,n,n,e,...,s,w,w,p,w,o,p,k,s,u
4,x,s,g,f,n,f,w,b,k,t,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,k,s,n,f,n,a,c,b,y,e,...,s,o,o,p,o,o,p,b,c,l
8120,x,s,n,f,n,a,c,b,y,e,...,s,o,o,p,n,o,p,b,v,l
8121,f,s,n,f,n,a,c,b,n,e,...,s,o,o,p,o,o,p,b,c,l
8122,k,y,n,f,y,f,c,n,b,t,...,k,w,w,p,w,o,e,w,v,l


In [46]:
tree1.y

0       p
1       e
2       e
3       p
4       e
       ..
8119    e
8120    e
8121    e
8122    p
8123    e
Name: class, Length: 8124, dtype: object

In [47]:
Y

0       p
1       e
2       e
3       p
4       e
       ..
8119    e
8120    e
8121    e
8122    p
8123    e
Name: class, Length: 8124, dtype: object

In [48]:
tree1.gini()

0.49935405449893955

In [49]:
tree1.entropy()

0.6925010959051001

In [50]:
tree1.max_depth

5

#### These functions don't work.

In [51]:
tree1.gini_impurity("cap-shape")

NameError: name 'gini' is not defined

In [52]:
tree1.gini_impurities()

TypeError: gini_impurities() missing 1 required positional argument: 'X'

In [53]:
tree1.best_gain()

AttributeError: 'DecisionTree' object has no attribute 'best_gain'