# Libraries

In [2]:
import pandas as pd
import numpy as np
import math

# Reading Dataset

In [3]:
df = pd.read_csv("./assets/nursery.csv")
df

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,final evaluation
0,usual,proper,complete,1,convenient,convenient,nonprob,recommended,recommend
1,usual,proper,complete,1,convenient,convenient,nonprob,priority,priority
2,usual,proper,complete,1,convenient,convenient,nonprob,not_recom,not_recom
3,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended,recommend
4,usual,proper,complete,1,convenient,convenient,slightly_prob,priority,priority
...,...,...,...,...,...,...,...,...,...
12955,great_pret,very_crit,foster,more,critical,inconv,slightly_prob,priority,spec_prior
12956,great_pret,very_crit,foster,more,critical,inconv,slightly_prob,not_recom,not_recom
12957,great_pret,very_crit,foster,more,critical,inconv,problematic,recommended,spec_prior
12958,great_pret,very_crit,foster,more,critical,inconv,problematic,priority,spec_prior


## Shuffling

In [4]:
df = df.sample(frac=1.).reset_index(drop=True)
df

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,final evaluation
0,usual,very_crit,complete,1,less_conv,convenient,slightly_prob,recommended,priority
1,great_pret,proper,foster,3,less_conv,convenient,nonprob,priority,spec_prior
2,great_pret,very_crit,completed,2,convenient,convenient,slightly_prob,recommended,priority
3,pretentious,improper,completed,3,critical,inconv,slightly_prob,not_recom,not_recom
4,pretentious,less_proper,foster,1,less_conv,inconv,slightly_prob,not_recom,not_recom
...,...,...,...,...,...,...,...,...,...
12955,great_pret,very_crit,completed,1,convenient,inconv,nonprob,priority,spec_prior
12956,great_pret,proper,incomplete,more,less_conv,inconv,problematic,not_recom,not_recom
12957,pretentious,less_proper,incomplete,3,less_conv,convenient,nonprob,recommended,priority
12958,pretentious,improper,completed,3,less_conv,inconv,problematic,priority,spec_prior


## Splitting data into "train" and "test"

In [5]:
train = df.sample(frac=0.8).reset_index(drop=True)
test = df.drop(index=train.index).reset_index(drop=True)

print(f"dataset shape:\n{df.shape}\n{type(df)}\n")
print(f"train shape:\n{train.shape}\n{type(train)}\n")
print(f"test shape:\n{test.shape}\n{type(test)}")
train

dataset shape:
(12960, 9)
<class 'pandas.core.frame.DataFrame'>

train shape:
(10368, 9)
<class 'pandas.core.frame.DataFrame'>

test shape:
(2592, 9)
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,final evaluation
0,great_pret,very_crit,incomplete,1,critical,convenient,problematic,priority,spec_prior
1,great_pret,very_crit,incomplete,2,convenient,inconv,problematic,priority,spec_prior
2,pretentious,improper,completed,2,critical,convenient,nonprob,not_recom,not_recom
3,pretentious,less_proper,incomplete,2,critical,inconv,nonprob,recommended,priority
4,usual,proper,foster,2,less_conv,inconv,nonprob,not_recom,not_recom
...,...,...,...,...,...,...,...,...,...
10363,pretentious,proper,complete,1,critical,inconv,problematic,priority,priority
10364,great_pret,less_proper,completed,2,convenient,convenient,slightly_prob,priority,priority
10365,pretentious,less_proper,complete,2,convenient,inconv,problematic,recommended,priority
10366,usual,improper,incomplete,more,less_conv,inconv,problematic,priority,priority


# Entropy & IG

In [6]:
# function to calculate the entropy of entire dataset
def base_entropy(dataset):

    target = dataset.iloc[:, -1]
    targets = list(set(target))

    lst = [] # number of each element of targets in target    
    for i in targets:
        lst.append(target.value_counts()[i])
    
    entropy = 0
    _sum = sum(lst)
    for i in range(len(targets)):
        entropy -= (lst[i] / _sum) * (np.log2(lst[i] / _sum))

    return entropy

# function to calculate the entropy of attributes
def entropy(dataset, feature, attribute):
    
    target = dataset.iloc[:, -1]
    targets = list(set(target))
    lst = [0 for i in range(len(targets))]
    flag = 0
    
    for k in targets:
        for i,j in zip(feature, target):
            if i == attribute and j == k:
                lst[flag] += 1
        flag += 1
        
    # Remove all zeros in list - they cus problem in calculating log
    lst[:] = (val for val in lst if val != 0)     
    entropy = 0
    _sum = sum(lst)
    
    for i in range(len(lst)):
        entropy -= (lst[i] / _sum) * (np.log2(lst[i] / _sum))
    
    return entropy

# function that calculates the information gain
def Information_Gain(dataset, feature):
    
    Distinct = list(set(feature))
    Info_Gain = 0
    for i in Distinct:
        t = feature[feature==i].shape[0]  
        Info_Gain = Info_Gain + (t / len(feature)) * entropy(dataset, feature, i)
    Info_Gain = base_entropy(dataset) - Info_Gain
    return Info_Gain
    

In [7]:
H_T = base_entropy(df)
print(f"entropy of entire dataset = {H_T}")
print(f"Information Gain of Parent's occupation               = {Information_Gain(df, df['parents'])}")
print(f"Information Gain of Child's nursery                   = {Information_Gain(df, df['has_nurs'])}")
print(f"Information Gain of Form of the family                = {Information_Gain(df, df['form'])}")
print(f"Information Gain of Number of children                = {Information_Gain(df, df['children'])}")
print(f"Information Gain of Housing conditions                = {Information_Gain(df, df['housing'])}")
print(f"Information Gain of Financial standing of the family  = {Information_Gain(df, df['finance'])}")
print(f"Information Gain of Social conditions                 = {Information_Gain(df, df['social'])}")
print(f"Information Gain of Health conditions                 = {Information_Gain(df, df['health'])}")

entropy of entire dataset = 1.7164959001837934
Information Gain of Parent's occupation               = 0.07293460750309944
Information Gain of Child's nursery                   = 0.19644928048811572
Information Gain of Form of the family                = 0.005572591715219843
Information Gain of Number of children                = 0.011886431475775838
Information Gain of Housing conditions                = 0.019602025022871894
Information Gain of Financial standing of the family  = 0.0043331270252005005
Information Gain of Social conditions                 = 0.022232616894018342
Information Gain of Health conditions                 = 0.9587749604699762


In [8]:
def find_most_informative_feature(train_data, label):
    feature_list = train_data.columns.drop(label) 
                                          
    max_info_gain = -1
    max_info_feature = None
    
    for feature in feature_list:  #for each feature in the dataset
        feature_info_gain = Information_Gain(train_data, train_data[feature])
        if max_info_gain < feature_info_gain: #selecting feature name with highest information gain
            max_info_gain = feature_info_gain
            max_info_feature = feature
            
    return max_info_feature

In [9]:
print(find_most_informative_feature(train, "final evaluation"))

health


# Generating The Tree

In [10]:
def ID3(data, attribute, target, max_recursion, recursion=0):
    recursion += 1
    
    _val = data.mode()
    _val.drop(labels=target, axis=1, inplace=True)
    vals = list(data[target])
    
    if (data.empty) | (len(attribute) <= 0):
        return _val
    
    elif vals.count(vals[0]) == len(vals):
        return vals[0]
    
    else:
        #Returns a new decision tree based on the examples given.
        best = find_most_informative_feature(data, target)

        # Create a new decision tree/node with the best attribute and an empty
        # dictionary object--we'll fill that up next.
        tree = {best:{}}
        # Create a new decision tree/sub-node for each of the values in the
        # best attribute field
        for val in list(set(df[best])):
            # depth of the tree
            if recursion >= max_recursion:
                return tree
            
            # Create a subtree for the current value under the "best" field
            example = data.loc[data[best] == val]
            del example[best]
#             example.drop(labels=best, axis=1, inplace=True)
            new_attribute = attribute[:]
            new_attribute.remove(best)
            subtree = ID3(example, new_attribute, target, max_recursion, recursion)

            tree[best][val] = subtree
    
    return tree

In [11]:
attributes = train.columns.drop('final evaluation').to_list()
label = "final evaluation"

max_recursion = 8 #maximum depth of the tree
# max_recursion = len(attributes)

tree = ID3(train, attributes, label, max_recursion, 0)
test_tree = ID3(test, attributes, label, max_recursion, 0)

In [12]:
tree

{'health': {'priority': {'has_nurs': {'improper': {'parents': {'usual': 'priority',
      'pretentious': {'housing': {'critical': {'children': {'more': 'spec_prior',
          '3': 'spec_prior',
          '1': {'form': {'incomplete': 'spec_prior',
            'foster': 'spec_prior',
            'complete': 'priority',
            'completed': 'spec_prior'}},
          '2': 'spec_prior'}},
        'convenient': {'finance': {'inconv': {'children': {'more': 'spec_prior',
            '3': 'spec_prior',
            '1': {'form': {'incomplete': 'priority',
              'foster': 'spec_prior',
              'complete': 'priority',
              'completed': 'priority'}},
            '2': {'form': {'incomplete': 'spec_prior',
              'foster': 'spec_prior',
              'complete': 'priority',
              'completed': 'priority'}}}},
          'convenient': 'priority'}},
        'less_conv': {'children': {'more': 'spec_prior',
          '3': 'spec_prior',
          '1': {'form': {'in

In [13]:
test_tree

{'health': {'priority': {'has_nurs': {'improper': {'parents': {'usual': 'priority',
      'pretentious': {'housing': {'critical': 'spec_prior',
        'convenient': {'finance': {'inconv': {'children': {'more': 'spec_prior',
            '3': 'spec_prior',
            '1': {'form': {'incomplete': 'priority',
              'foster': 'spec_prior',
              'complete': Empty DataFrame
              Columns: [social]
              Index: [],
              'completed': Empty DataFrame
              Columns: [social]
              Index: []}},
            '2': {'form': {'incomplete': Empty DataFrame
              Columns: [social]
              Index: [],
              'foster': 'spec_prior',
              'complete': 'priority',
              'completed': Empty DataFrame
              Columns: [social]
              Index: []}}}},
          'convenient': 'priority'}},
        'less_conv': {'children': {'more': 'spec_prior',
          '3': 'spec_prior',
          '1': {'form': {'incomple

In [17]:
list(set(train['children']))

['more', '3', '1', '2']

In [15]:
test_tree

{'health': {'priority': {'has_nurs': {'improper': {'parents': {}},
    'very_crit': {'form': {}},
    'critical': {'parents': {}},
    'less_proper': {'parents': {}},
    'proper': {'parents': {}}}},
  'not_recom': 'not_recom',
  'recommended': {'has_nurs': {'improper': {'parents': {}},
    'very_crit': {'social': {}},
    'critical': {'parents': {}},
    'less_proper': {'parents': {}},
    'proper': {'parents': {}}}}}}