In [150]:
import numpy as np

def get_target(y_data):
    """Return the distinct values of ``y_data`` in order of first appearance.

    Parameters
    ----------
    y_data : iterable
        Sequence of labels or feature values.

    Returns
    -------
    list
        Each distinct value exactly once, ordered by first occurrence.
    """
    distinct = []
    for value in y_data:
        # Skip anything that has already been collected.
        if value in distinct:
            continue
        distinct.append(value)
    return distinct

In [151]:
def getFrequency(y_data, label):
    """Return the relative frequency of ``label`` within ``y_data``.

    Parameters
    ----------
    y_data : sequence
        Observed values.
    label : object
        Value whose share of ``y_data`` is requested.

    Returns
    -------
    float
        Number of entries equal to ``label`` divided by ``len(y_data)``;
        ``0.0`` when ``y_data`` is empty.
    """
    length = len(y_data)
    if length == 0:
        # An empty sample contains no occurrence of any label; avoid 0/0.
        return 0.0

    matching = sum(1 for item in y_data if label == item)
    return matching / length

In [152]:
import math

def getEntropy(y_data):
    """Return the base-2 entropy of the values in ``y_data``.

    Computes ``-sum(f * log2(f))`` over the relative frequency ``f`` of each
    distinct value. Returns ``0`` when ``y_data`` has no values.
    """
    frequencies = [getFrequency(y_data, label) for label in get_target(y_data)]

    total = 0
    for freq in frequencies:
        # freq is never 0 here: every label comes from y_data itself.
        total = total - freq * math.log(freq, 2)
    return total

In [153]:
def getGiniIndex(y_data):
    """Return the Gini impurity of the values in ``y_data``.

    The impurity is ``sum(f * (1 - f))`` over the relative frequency ``f`` of
    each distinct value, i.e. ``1 - sum(f ** 2)``. It is ``0`` for a sample with
    a single distinct value and never negative.

    Parameters
    ----------
    y_data : sequence
        Observed labels.

    Returns
    -------
    float or int
        The Gini impurity (``0`` when ``y_data`` has no values).
    """
    gini = 0

    for tg in get_target(y_data):
        f = getFrequency(y_data, tg)
        # Accumulate with '+': each term f*(1-f) is a non-negative share of
        # the impurity. Subtracting here would return the negated impurity.
        gini = gini + f * (1 - f)

    return gini

In [154]:
import numpy as np

def getProbability(column):
    """Return the relative frequency of every distinct value in ``column``.

    The result is a list of floats ordered like ``get_target(column)``
    (first appearance of each value). An empty ``column`` gives ``[]``.
    """
    total = len(column)
    shares = []

    for value in get_target(column):
        # Count the entries equal to the current distinct value.
        hits = sum(1 for entry in column if value == entry)
        shares.append(hits / total)

    return shares

In [155]:
def getConditionalEntropy(column, feature_value, y_data):
    """Return the base-2 entropy of the labels whose feature equals ``feature_value``.

    Parameters
    ----------
    column : sequence
        Feature values, positionally aligned with ``y_data``.
    feature_value : object
        Feature value selecting the rows to keep.
    y_data : sequence
        Labels indexed by the positions of ``column``.

    Returns
    -------
    float or int
        ``-sum(p * log2(p))`` over the label proportions of the selected
        rows; ``0`` when no row matches.
    """
    # Labels of the rows where the feature takes the requested value.
    selected = [y_data[pos] for pos, item in enumerate(column) if feature_value == item]

    result = 0
    for p in np.array(getProbability(selected)):
        result = result - p * math.log(p, 2)

    return result

In [156]:
def getConditionalGiniIndex(column, feature_value, y_data):
    """Return the NEGATED Gini impurity of the labels whose feature equals ``feature_value``.

    Parameters
    ----------
    column : sequence
        Feature values, positionally aligned with ``y_data``.
    feature_value : object
        Feature value selecting the rows to keep.
    y_data : sequence
        Labels indexed by the positions of ``column``.

    Returns
    -------
    float or int
        ``-sum(p * (1 - p))`` over the label proportions of the selected
        rows, so the value is always <= 0 (``0`` when no row matches).

    Notes
    -----
    NOTE(review): the sign is intentionally left as is. getInformationGain
    ADDS the weighted sum of these values, so flipping the sign here alone
    would invert the feature ranking.
    """
    # Labels of the rows where the feature takes the requested value.
    selected = [y_data[pos] for pos, item in enumerate(column) if feature_value == item]

    result = 0
    for p in np.array(getProbability(selected)):
        result = result - p * (1 - p)

    return result

In [157]:
def getInformationGain(column, y_data, impurity_measure):
    """Score a split of ``y_data`` on the feature values in ``column``.

    Parameters
    ----------
    column : sequence
        Values of one feature, positionally aligned with ``y_data``.
    y_data : sequence
        Labels.
    impurity_measure : str
        ``'entropy'`` selects the entropy formula; any other value selects
        the Gini formula.

    Returns
    -------
    float
        For ``'entropy'``: ``getEntropy(y_data)`` minus the weighted sum of
        ``getConditionalEntropy`` over the distinct feature values.
        Otherwise: ``getGiniIndex(y_data)`` PLUS the weighted sum of
        ``getConditionalGiniIndex``; that helper yields non-positive
        values, so the addition lowers the score as conditional impurity grows.
        Weights are the relative frequencies of the feature values.
    """
    use_entropy = impurity_measure == 'entropy'
    conditional = getConditionalEntropy if use_entropy else getConditionalGiniIndex

    # Weights and per-value impurities share the order of get_target(column).
    weights = getProbability(column)
    per_value = [conditional(column, value, y_data) for value in get_target(column)]

    weighted = 0
    for weight, impurity in zip(weights, per_value):
        weighted = weighted + (weight * impurity)

    if use_entropy:
        return getEntropy(y_data) - weighted
    return getGiniIndex(y_data) + weighted

In [158]:
def computeBestSplit(x_data, y_data, feature_names, impurity_measure):
    """Pick the feature with the highest ``getInformationGain`` score.

    Parameters
    ----------
    x_data : numpy.ndarray
        Samples as rows and features as columns (columns are visited through
        ``x_data.T``).
    y_data : sequence
        Labels, one per row of ``x_data``.
    feature_names : sequence
        Names positionally aligned with the columns of ``x_data``.
    impurity_measure : str
        Forwarded unchanged to ``getInformationGain``.

    Returns
    -------
    tuple
        ``(feature_names[i], i)`` where ``i`` is the ``np.argmax`` of the
        per-column scores (first maximum on ties).
    """
    gains = [getInformationGain(column, y_data, impurity_measure) for column in x_data.T]
    best = np.argmax(gains)
    return feature_names[best], best

In [159]:
def getInducedSubDatasets(x_data, y_data, bestSplit):
    """Partition the dataset by the value of the chosen split feature.

    Parameters
    ----------
    x_data : numpy.ndarray
        Samples as rows and features as columns.
    y_data : sequence
        Labels, one per row of ``x_data``.
    bestSplit : sequence
        Only ``bestSplit[1]`` is used: the column index of the split feature.

    Returns
    -------
    tuple of numpy.ndarray
        ``(subsets_x, subsets_y)``: two 1-D object arrays with one entry per
        distinct feature value, in order of first appearance. ``subsets_x[i]``
        is the list of rows having that value and ``subsets_y[i]`` is the list
        of their labels, both in original row order.
    """
    feature_index = bestSplit[1]

    # Single pass: dicts keep insertion order, so groups come out in order of
    # first appearance of each feature value.
    rows_by_value = {}
    labels_by_value = {}
    for index, row in enumerate(x_data):
        key = row[feature_index]
        rows_by_value.setdefault(key, []).append(row)
        labels_by_value.setdefault(key, []).append(y_data[index])

    # Groups generally differ in size. np.array() on such ragged nested lists
    # raises ValueError on recent NumPy, and collapses into a dense
    # multi-dimensional array when the sizes happen to match. Filling object
    # arrays explicitly gives the same 1-D layout in every case.
    subsets_x = np.empty(len(rows_by_value), dtype=object)
    subsets_y = np.empty(len(rows_by_value), dtype=object)
    for position, key in enumerate(rows_by_value):
        subsets_x[position] = rows_by_value[key]
        subsets_y[position] = labels_by_value[key]

    return subsets_x, subsets_y

In [160]:
def pure(y_data):
    """Return ``True`` when every entry of ``y_data`` equals its first entry.

    ``y_data`` must be non-empty: the first entry is read with ``y_data[0]``.
    """
    reference = y_data[0]
    # Pure means that no entry differs from the first one.
    return not any(entry != reference for entry in y_data)

In [161]:
def learn(x_data, y_data, feature_names, impurity_measure='entropy'):
    """Recursively build a decision tree over categorical features.

    Parameters
    ----------
    x_data : array-like
        Samples as rows and features as columns.
    y_data : sequence
        Labels, one per row of ``x_data``. Must be non-empty.
    feature_names : array-like
        Names positionally aligned with the columns of ``x_data``.
    impurity_measure : str, default 'entropy'
        ``'entropy'`` or ``'gini'``.

    Returns
    -------
    dict, label or str
        * A nested dict ``{feature_name: {feature_value: subtree}}`` for an
          internal node.
        * A label taken from ``y_data`` for a leaf: the common label when
          all labels agree, or the most frequent label when no feature is
          left to split on.
        * The message string below when ``impurity_measure`` is not
          recognised.
    """
    if impurity_measure not in ('entropy', 'gini'):
        return 'Please enter a valid impurity measure (entropy or gini)'

    x_data = np.asarray(x_data)
    feature_names = np.asarray(feature_names)

    # Check purity BEFORE looking for a split: a pure node may have no
    # columns left, and searching for a split over zero columns would fail.
    if pure(y_data):
        return y_data[0]

    # Features exhausted but labels still mixed: fall back to the most
    # frequent label (first one wins on ties).
    if x_data.ndim < 2 or x_data.shape[1] == 0:
        return max(get_target(y_data), key=lambda label: getFrequency(y_data, label))

    best_name, best_index = computeBestSplit(x_data, y_data, feature_names, impurity_measure)

    # Unpack instead of wrapping the pair in np.array(): the two parts do
    # not share a common shape in general.
    subsets_x, subsets_y = getInducedSubDatasets(x_data, y_data, (best_name, best_index))

    # Drop the used feature by position so that names stay aligned with the
    # columns removed from each subset below.
    remaining_names = np.delete(feature_names, best_index)

    branches = {}
    for subset_x, subset_y in zip(subsets_x, subsets_y):
        subset_x = np.array(subset_x)
        subset_y = np.array(subset_y)

        # Every row of a subset shares the same value for the split feature.
        value = subset_x[0][best_index]
        reduced_x = np.delete(subset_x, best_index, 1)

        branches[value] = learn(reduced_x, subset_y, remaining_names, impurity_measure)

    return {best_name: branches}

In [162]:
def predict(tree, new, feature_names):
    """Classify one sample by walking a nested-dict decision tree.

    Parameters
    ----------
    tree : dict or label
        Tree shaped as ``{feature_name: {feature_value: subtree_or_label}}``.
        Anything that is not a dict is treated as a leaf label.
    new : array-like
        Feature values of the sample, positionally aligned with
        ``feature_names``.
    feature_names : array-like
        Feature names used to locate each tested feature inside ``new``.

    Returns
    -------
    object
        The leaf label reached for ``new``.

    Raises
    ------
    KeyError
        If the sample holds a feature value that has no branch in the tree.
    IndexError
        If a feature tested by the tree is missing from ``feature_names``.
    """
    names = np.asarray(feature_names)
    sample = np.asarray(new)

    node = tree
    # Descend while the current node is an internal node. Testing for dict
    # (rather than for one specific label type) keeps leaves of any type
    # working, including a tree that consists of a single leaf.
    while isinstance(node, dict):
        root = next(iter(node))
        value = sample[np.where(names == root)][0]
        node = node[root][value]

    return node
            

In [163]:
# Demo: train a tree on a small categorical dataset and classify two samples.
# Each row is one sample; the columns follow the order of `feature_names`
# below (outlook, temperature, humidity, wind).
X_data= np.array ([['Sunny','Hot','High','Weak'],
        ['Sunny','Hot','High','Strong'],
        ['Overcast','Hot','High','Weak'],
        ['Rain','Mild','High','Weak'],
        ['Rain','Cool','Normal','Weak'],
        ['Rain','Cool','Normal','Strong'],
        ['Overcast','Cool','Normal','Strong'],
        ['Sunny','Mild','High','Weak'],
        ['Sunny','Cool','Normal','Weak'],
        ['Rain','Mild','Normal','Weak'],
        ['Sunny','Mild','Normal','Strong'],
        ['Overcast','Mild','High','Strong'],
        ['Overcast','Hot','Normal','Weak'],
        ['Rain','Mild','High','Strong']])

# One 'yes'/'no' label per row of X_data.
y_data=np.array(['no','no','yes','yes','yes','no','yes','no','yes','yes','yes','yes','yes','no'])

# Kept as a NumPy array: learn() and predict() compare it element-wise with a feature name.
feature_names=np.array(['outlook','temperature','humidity','wind'])

# Build the tree with the Gini criterion (the default would be 'entropy').
tree = learn(X_data,y_data,feature_names,impurity_measure='gini')

# Two unseen samples, with values in the same order as feature_names.
new=np.array(['Sunny','Cool','Normal','Strong'])
new1=np.array(['Sunny','Cool','High','Strong'])

# Show the learned tree as a nested dict.
print(tree)

# Classify each sample and print the predicted label.
a = predict(tree,new,feature_names)
print(a)

b = predict(tree,new1,feature_names)
print(b)

{'outlook': {'Sunny': {'humidity': {'High': 'no', 'Normal': 'yes'}}, 'Overcast': 'yes', 'Rain': {'wind': {'Weak': 'yes', 'Strong': 'no'}}}}
yes
no
