In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import log

In [2]:
# Load the play-tennis weather table and label its rows D1..Dn.
tennis = pd.read_csv('tennis-weather/tennis.csv')
# Derive the labels from the actual row count so a CSV with a different
# number of rows does not raise a length-mismatch error.
tennis.index = ['D'+str(i) for i in range(1, len(tennis)+1)]
df = tennis.copy()  # working copy; `tennis` itself is left untouched
out_attr = 'play'  # name of the target (output) column
outputs = set(df[out_attr].values)  # distinct values of the target column
df

Unnamed: 0,outlook,temp,humidity,windy,play
D1,sunny,hot,high,False,no
D2,sunny,hot,high,True,no
D3,overcast,hot,high,False,yes
D4,rainy,mild,high,False,yes
D5,rainy,cool,normal,False,yes
D6,rainy,cool,normal,True,no
D7,overcast,cool,normal,True,yes
D8,sunny,mild,high,False,no
D9,sunny,cool,normal,False,yes
D10,rainy,mild,normal,False,yes


In [47]:
def entropy(df):
    """Base-2 entropy of the `out_attr` column of `df`.

    Each distinct value of the column contributes ``-p * log2(p)``, where
    ``p`` is the fraction of rows holding that value. A frame without rows
    yields 0.
    """
    n_rows = float(len(df))
    class_counts = df[out_attr].value_counts().tolist()
    result = 0
    for count in class_counts:
        fraction = count / n_rows
        result = result - fraction * log(fraction, 2)
    return result

In [46]:
def sub_df(df,attr,val=True,greater=True):
    """Return the rows of `df` selected by comparing column `attr` with `val`.

    When `val` is exactly an ``int`` or a ``float`` it acts as a threshold:
    rows with ``df[attr] >= val`` are kept if `greater` is true, rows with
    ``df[attr] < val`` otherwise. Any other kind of value (strings, bools,
    ...) selects the rows whose `attr` equals `val`; `greater` is ignored.
    """
    # Exact type match on purpose: bool is a subclass of int but must be
    # matched by equality, not treated as a threshold.
    is_threshold = type(val) in (int, float)
    if not is_threshold:
        mask = df[attr] == val
    elif greater:
        mask = df[attr] >= val
    else:
        mask = df[attr] < val
    return df[mask]

In [45]:
def info_gain(current_entropy, df, attr, val=None):
    """Information gain of splitting `df` on `attr`.

    The entropy of the current node minus the weighted entropy of the
    child nodes.

    current_entropy -> entropy of `df` (computed by the caller)
    df              -> the rows at the current node
    attr            -> the column to split on
    val             -> partitioning value (the optimum split) for an attribute
                       with continuous values; must be a plain int or float.
                       With any other value the attribute is treated as
                       discrete and gets one child per distinct value.
    """
    total = len(df)

    if type(val) == int or type(val) == float: # continuous (numeric) attribute
        greater = sub_df(df,attr,val,True)
        less = sub_df(df,attr,val,False)
        p = len(greater)/total
        return current_entropy - p*entropy(greater) - (1-p)*entropy(less)

    # Discrete attribute: one child per distinct value. Rows are selected
    # with a plain equality mask so a numeric-looking category is never
    # mistaken for a threshold.
    ig = current_entropy
    for value in set(df[attr].values):
        sub = df[df[attr] == value]
        ig -= (len(sub)/total)*entropy(sub)
    return ig

In [44]:
def sum_square_error(lst):
    """Sum of squared deviations of the values in `lst` from their mean.

    An empty input has no deviations, so it yields 0 instead of dividing
    by zero.
    """
    values = list(lst)
    if not values:
        return 0
    m = sum(values)/len(values)
    return sum((v-m)**2 for v in values)

In [43]:
def best_split(df, attr):
    """Find the best threshold for splitting `df` on the continuous column `attr`.

    The rows are sorted by `attr`; every midpoint between two consecutive,
    different values of `attr` is a candidate threshold.

    * If the `out_attr` column is numeric (and not boolean), the candidate
      that minimises the summed squared error of the two sides wins.
    * Otherwise the candidate with the highest information gain wins, where
      rows with ``attr >= threshold`` form one side and the rest the other.

    On ties the later candidate is kept. When no candidate improves on the
    unsplit data (e.g. all values of `attr` are equal) the smallest value of
    `attr` is returned.

    Raises ValueError if `df` has no rows.
    """
    # sort_values returns a new frame, so its result has to be kept.
    newdf = df.loc[:,[attr,out_attr]].sort_values(by=attr)
    if len(newdf) == 0:
        raise ValueError("best_split needs at least one row")
    in_values = list(newdf[attr])
    out_values = list(newdf[out_attr])
    split_val = in_values[0]

    target = newdf[out_attr]
    numeric_target = (pd.api.types.is_numeric_dtype(target)
                      and not pd.api.types.is_bool_dtype(target))

    if numeric_target:
        min_sqe = sum_square_error(out_values)
        for i in range(1,len(in_values)):
            if in_values[i] == in_values[i-1]:
                continue  # no threshold can separate equal values
            sqe = sum_square_error(out_values[:i]) + sum_square_error(out_values[i:])
            if sqe <= min_sqe:
                min_sqe = sqe
                split_val = (in_values[i]+in_values[i-1])/2.0
        return split_val

    # Discrete output: the gain is computed here directly from the two
    # partitions, so only `entropy` is needed.
    current_entropy = entropy(newdf)
    total = float(len(newdf))
    max_infog = None
    for i in range(1,len(in_values)):
        if in_values[i] == in_values[i-1]:
            continue
        candidate = (in_values[i]+in_values[i-1])/2.0
        upper = newdf[newdf[attr] >= candidate]
        lower = newdf[newdf[attr] < candidate]
        p = len(upper)/total
        ig = current_entropy - p*entropy(upper) - (1-p)*entropy(lower)
        if max_infog is None or ig >= max_infog:
            max_infog = ig
            split_val = candidate
    return split_val
            
        
    

In [64]:
def max_ig(df):
    """Return the name of the attribute of `df` with the highest information gain.

    Every column except `out_attr` is a candidate. A numeric (non-boolean)
    column is evaluated at the threshold chosen by `best_split`; any other
    column is evaluated as a discrete attribute. On ties the column that
    comes first in `df` wins.

    Raises ValueError if `df` has no column besides `out_attr`.
    """
    keys = list(df.keys())
    keys.remove(out_attr)
    if not keys:
        raise ValueError("max_ig needs at least one attribute besides the output column")
    ent = entropy(df)

    mkey, mig = None, None
    for attr in keys:
        column = df[attr]
        val = None
        # Decide from the column's dtype, not from a sampled row, so the
        # check does not depend on how many rows `df` has.
        if pd.api.types.is_numeric_dtype(column) and not pd.api.types.is_bool_dtype(column):
            # info_gain only treats plain int/float values as thresholds.
            val = float(best_split(df, attr))
        ig = info_gain(ent, df, attr, val)
        if mig is None or ig > mig:
            mkey, mig = attr, ig
    return mkey

In [58]:
def best_split_overall(df):
    """Choose the attribute (and threshold, if any) to split `df` on.

    Returns a ``(attribute, threshold)`` pair.

    * Numeric (non-boolean) `out_attr` column: every attribute is split at
      its `best_split` threshold and the attribute whose two sides have the
      smallest summed squared error is returned together with that
      threshold. Rows equal to the threshold go to the "greater" side.
    * Any other `out_attr` column: the attribute picked by `max_ig` is
      returned. Its threshold is the `best_split` value when that attribute
      is numeric, and None when it is discrete.

    Raises ValueError if a numeric-output `df` has no column besides `out_attr`.
    """
    def is_continuous(column):
        # Booleans count as numeric for pandas but are categories here.
        return (pd.api.types.is_numeric_dtype(column)
                and not pd.api.types.is_bool_dtype(column))

    # The decision depends on the values of the output column, not on the
    # type of its name.
    if not is_continuous(df[out_attr]):
        mkey = max_ig(df)
        if is_continuous(df[mkey]):
            return mkey, best_split(df, mkey)
        return mkey, None

    keys = list(df.keys())
    keys.remove(out_attr)
    if not keys:
        raise ValueError("best_split_overall needs at least one attribute besides the output column")

    mkey, mval, min_sqe = None, None, None
    for keyi in keys:
        val = best_split(df, keyi)
        lesslst = list(df[df[keyi]<val][out_attr])
        greaterlst = list(df[df[keyi]>=val][out_attr])
        # An empty side contributes no error.
        sqe = sum(sum_square_error(side) for side in (lesslst, greaterlst) if side)
        if min_sqe is None or sqe < min_sqe:
            min_sqe = sqe
            mkey = keyi
            mval = val
    return mkey, mval

In [67]:
class Node:
    """One node of the decision tree."""
    depth_limit = 12

    def __init__(self, df, children = None,level = 1, attr = None, prevval = None, val = None,
                 greater = None, used_attr = None):
        '''df   ->  the dataframe for the current node
        children -> list with the child nodes (a fresh empty list when omitted)
        level   ->  depth of the node; the root is at level 1
        attr    ->  the attribute used to decide the next split
        val     ->  the value used for greater/less than checking for the 
                    next split (only for numeric values of the respective attribute)
        greater ->  greater/less than flag from the value corresponding to the threshold that 
                    resulted in the previous split (only for numeric values of the respective attribute)
        prevval ->  the value of the previous attribute that led to this branch (only
                    for non-numeric or discrete values of the respective attribute)
        used_attr -> attributes already used on the path to this node
        ent     ->  entropy at the current node (computed from df)
        '''
        self.df, self.attr = df, attr
        self.val, self.greater = val, greater
        self.prevval = prevval
        # None defaults instead of [] so that nodes never share one list.
        self.children = [] if children is None else children # This is a list with the child nodes.
        self.used_attr = [] if used_attr is None else list(used_attr)
        self.ent = entropy(df)
        self.level = level
    
    def add_children(self):
        pass

    def _predict_here(self):
        '''Prediction from the rows stored at this node: the mean of the output
        column when it is numeric, otherwise its most frequent value (ties are
        broken at random). Returns None when the node holds no rows.'''
        target = self.df[out_attr]
        if len(target) == 0:
            return None
        if pd.api.types.is_numeric_dtype(target) and not pd.api.types.is_bool_dtype(target):
            return np.mean(target.values)
        vcounts = target.value_counts()
        winners = list(vcounts[vcounts == vcounts.max()].index)
        if len(winners) == 1:
            return winners[0]
        return np.random.choice(winners)
    
    def find(self, row):
        '''Function to find the prediction for a given row/entry/object.

        `row` must support ``row[attribute]`` lookups. The row is passed down
        to the child whose branch matches it; a leaf answers from its own
        rows. If no child matches (e.g. a value that was not seen when the
        tree was built) the prediction is made from this node's rows.
        '''
        if len(self.children) == 0:
            return self._predict_here()

        attr = self.attr
        if self.val is not None:
            # Numeric split: children are told apart by their `greater` flag.
            greater = bool(row[attr] >= self.val)
            for child in self.children:
                if child.greater == greater:
                    return child.find(row)
        else:
            # Discrete split: children are told apart by the value that led to them.
            for child in self.children:
                if child.prevval == row[attr]:
                    return child.find(row)
        return self._predict_here()
            

In [93]:
max_depth = 13
def build_tree(node, df, root=None, level = 1, used_attr = None):
    """Recursively grow a decision tree over `df` and return its top node.

    node      -> the node to expand; ignored when `root` is None
    df        -> the rows belonging to `node`
    root      -> root of the tree being built; pass None to start a new tree,
                 in which case the root node is created here at level 1
    level     -> depth of `node`
    used_attr -> discrete attributes already split on along the path to `node`

    A node is left as a leaf when no unused attribute remains, when `level`
    exceeds `max_depth`, or when the chosen split is not useful: information
    gain below 0.01 for a discrete split; an empty side or a single distinct
    output value for a numeric split.
    """
    used_attr = [] if used_attr is None else list(used_attr)
    if root is None:
        node = Node(df, children = [],level = 1, attr = None, prevval = None, val = None,
                    greater = None, used_attr = None)
        root = node
        level = 1

    keys = [k for k in df.keys() if k != out_attr and k not in used_attr]
    if len(keys) == 0 or level > max_depth:
        return node

    # Only attributes that are still unused are offered as split candidates.
    attr, val = best_split_overall(df[keys + [out_attr]])

    if val is not None:
        # Numeric attribute: binary split at the threshold `val`.
        lower = df[df[attr] < val]
        upper = df[df[attr] >= val]
        if len(lower) == 0 or len(upper) == 0 or df[out_attr].nunique() <= 1:
            return node
        node.attr, node.val = attr, val
        children = []
        for part, is_greater in ((lower, False), (upper, True)):
            # A numeric attribute may be split again further down, so it is
            # not added to used_attr; both sides are non-empty and strictly
            # smaller than df, so the recursion still terminates.
            nd = Node(part, children = [],level = level+1, attr = None, prevval = None, val = None,
                      greater = is_greater, used_attr = used_attr)
            nd = build_tree(nd, part, root=root, level = level+1, used_attr = used_attr)
            children.append(nd)
        node.children = children
        return node

    ent = entropy(df)
    if info_gain(ent, df, attr) < 10**(-2):
        return node

    # Record the attribute this node splits on; Node.find uses it to route rows.
    node.attr = attr
    children = []
    for i in set(df[attr].values):
        sub = df[df[attr] == i]
        nd = Node(sub, children = [],level = level+1, attr = None, prevval = i, val = None,
                  greater = None, used_attr = used_attr+[attr])
        nd = build_tree(nd, sub, root=root, level = level+1, used_attr = used_attr+[attr])
        children.append(nd)
    node.children = children
    return node
                
            
            
            
        
    
    







In [None]:
# Grow the tree over the full tennis table; root=None asks build_tree to
# create the root node itself.
root = build_tree(
    None,
    tennis,
    root=None,
    level=1,
    used_attr=[],
)

In [95]:
##### AFTER THIS IS ONLY ROUGH CODE CHECKING #####


##### AFTER THIS IS ONLY ROUGH CODE CHECKING #####


##### AFTER THIS IS ONLY ROUGH CODE CHECKING #####



##### AFTER THIS IS ONLY ROUGH CODE CHECKING #####






















###### AFTER THIS IS ONLY ROUGH CODE CHECKING #####
###################################################

In [12]:
# Two ways of counting the 'sunny' rows; both lines print the same number.
print(tennis.loc[tennis['outlook'] == 'sunny'].count().loc['outlook'])
print(tennis['outlook'].value_counts().loc['sunny'])

5
5


In [13]:
# Scratch checks: base-2 log, iterating value_counts, and that bool is
# neither exactly int nor exactly float.
print(log(2, 2))
for c in tennis['outlook'].value_counts().tolist():
    print(c)
print(type(True) is int)
print(type(True) is float)

1.0
5
5
4
False
False


In [14]:
# Row selection by a column value, then by index label.
print(tennis.loc[tennis['outlook'] == 'sunny'])
print()
print(tennis.loc[tennis.index == 'D1'])

    outlook  temp humidity  windy play
D1    sunny   hot     high  False   no
D2    sunny   hot     high   True   no
D8    sunny  mild     high  False   no
D9    sunny  cool   normal  False  yes
D11   sunny  mild   normal   True  yes

   outlook temp humidity  windy play
D1   sunny  hot     high  False   no


In [15]:
# Scratch checks: iterating a set, identity of None / aliases / copies.
s = set(tennis['outlook'].values)
print(s)
for i in s:
    print(i)
nnnn = None
# Same answer as `0 is None`, without comparing a literal with `is`
# (which newer Python versions warn about).
print(isinstance(0, type(None)))
a = [1,2,3]
b = a
print(a is b)
dd = tennis.copy()
print(dd is tennis)
# Number the rows 1..n from the actual row count instead of a fixed 14.
dd['num'] = pd.Series(list(range(1, len(dd)+1)), index=dd.index)

{'sunny', 'overcast', 'rainy'}
sunny
overcast
rainy
False
True
False


In [16]:
# Scratch checks: class counts, a random pick, single-row selection, column names.
print(dd['play'].value_counts().loc['yes'])
print(np.random.choice([1, 2, 3, 4, 5]))
row1 = tennis.loc[['D1']]
row1['outlook'] == 'sunny'  # result is discarded: not the last expression of the cell
print(tennis.keys())

9
4
Index(['outlook', 'temp', 'humidity', 'windy', 'play'], dtype='object')


In [17]:
def fn(root):
    """Give `root` two new level-2 child nodes, each built over `tennis`.

    `root.children` is replaced in place; nothing is returned.
    """
    # Node computes its own entropy and has no `ent` parameter, so none is passed.
    left = Node(tennis, children = [],level = 2, attr = None, prevval = None, val = None,
                 greater = None)
    right = Node(tennis, children = [],level = 2, attr = None, prevval = None, val = None,
                 greater = None)
    root.children = [left, right]
    #return root

In [24]:

# Build a stand-alone root node over the whole table and show its child list.
root = Node(
    tennis,
    children=[],
    level=1,
    attr=None,
    prevval=None,
    val=None,
    greater=None,
)
root.children
#a = fn(root)
#root.children[0].level, a.children[1].level

[]

In [19]:
# Scratch checks: first-row slice, iterating a column, in-place list sorting.
print(tennis['outlook'].iloc[0:1], '\n')
for i in tennis['outlook'].tolist():
    print(i)
ll = tennis['outlook'].tolist()
print(ll)

ll.sort()  # sorts in place
print(ll)

D1    sunny
Name: outlook, dtype: object 

sunny
sunny
overcast
rainy
rainy
rainy
overcast
sunny
sunny
rainy
sunny
overcast
overcast
rainy
['sunny', 'sunny', 'overcast', 'rainy', 'rainy', 'rainy', 'overcast', 'sunny', 'sunny', 'rainy', 'sunny', 'overcast', 'overcast', 'rainy']
['overcast', 'overcast', 'overcast', 'overcast', 'rainy', 'rainy', 'rainy', 'rainy', 'rainy', 'sunny', 'sunny', 'sunny', 'sunny', 'sunny']


In [25]:
# Adding two string columns concatenates them row by row.
df['outlook']+df['play']

D1         sunnyno
D2         sunnyno
D3     overcastyes
D4        rainyyes
D5        rainyyes
D6         rainyno
D7     overcastyes
D8         sunnyno
D9        sunnyyes
D10       rainyyes
D11       sunnyyes
D12    overcastyes
D13    overcastyes
D14        rainyno
dtype: object

In [26]:
# Entropy of the output column restricted to the 'overcast' rows.
entropy(tennis[tennis['outlook']=='overcast'])

0.0

In [31]:
# Column names without the output column, then the first row as a one-row frame.
a = [key for key in df.keys()]
a.remove(out_attr)
df.iloc[[0]]

Unnamed: 0,outlook,temp,humidity,windy,play
D1,sunny,hot,high,False,no


In [36]:
# Scratch check: keep only the elements of l1 that are below 5.
l1 = [*range(10)]
l2 = list(filter(lambda i: i < 5, l1))
l2

[0, 1, 2, 3, 4]

In [40]:
# Output values of the rows whose outlook sorts before 'sunny' (string comparison).
lesslst = df.loc[df['outlook'] < 'sunny', out_attr].tolist()
lesslst

['yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'no']