# Import required Library

In [90]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math


# Node of Tree

In [91]:
class Node():
    feature = "no split"
    leaf = False
    prediction = ""
    child1 = None
    child2 = None
    child3 = None
    child4 = None

# Function to discretize Data (4 bins used)

In [92]:
def discretize_data(df):
    bins = 4
    for col, data in df.iteritems():
        if col != "class" and col < 8:
            minimum = int(data.min())
            maximum = int(data.max())
            width = (maximum - minimum)/ (bins)
            bin1 = int(minimum + width)
            bin2 = int(bin1 + width)
            bin3 = int(bin2 + width)
            bin4 = int(bin3 + width)
            for i, value in enumerate(data):
                val = int(value)
                if(val >= minimum and val <= bin1):
                    data.at[i] = 1
                elif(val > bin1 and val <=bin2):
                    data.at[i] = 2
                elif(val > bin2 and val <=bin3):
                    data.at[i] = 3
                elif(val > bin3 and val <=maximum):
                    data.at[i] = 4
                else:
                    print("Nan")
    return df


# Entropy Calculation

In [93]:
# target is class label and df is the dataset
def entropy(df, target):
    entropy_value = 0.0
    positive_case = df[target[0] == True]
    negative_case = df[target[0] == False]
    p = float(positive_case.shape[0])
    n = float(negative_case.shape[0])
    if(p == 0 or n == 0):
        entropy_value = 0
    else:
        entropy_value = ((-1*p)/(p + n))*math.log(p/(p+n), 2) + ((-1*n)/(p + n))*math.log(n/(p+n), 2)
    return entropy_value      

In [94]:
def subset_entropy(df, subset_df, target):
#     Calculate the second part of gain formula using entropy and division
    num_data = df.shape[0]
    ans = float(0)
    for bin in subset_df:
        ans += float(bin.shape[0]/num_data)*entropy(bin, target)
    return ans


# Gain Calculation

In [95]:
def gain(df, attribute, target):
#    We have 4 bins so subdivide the data and use gain formula
    gain = 1
    if(attribute > 7):
        bin1 = df[df[attribute] == 0]
        bin2 = df[df[attribute] == 1]
        gain = entropy(df, target) - subset_entropy(df, [bin1, bin2], target)
        return gain

    else:
        
        bin1 = df[df[attribute] == 1]
        bin2 = df[df[attribute] == 2] 
        bin3 = df[df[attribute] == 3] 
        bin4 = df[df[attribute] == 4] 
        gain = entropy(df, target) - subset_entropy(df, [bin1, bin2, bin3, bin4], target)
        print(attribute, "  ", gain)

        return gain



# Choose Split Feature

In [96]:
def choose_feature_split(df, attribute, target):
    max_gain = float("-inf")
    best_feature = None
    for feature in attribute:
        g = gain(df, feature, target)
# Choose Split Feature        if(max_gain < g):
            max_gain = g
            best_feature = feature
    return max_gain, best_feature

IndentationError: unexpected indent (<ipython-input-96-4ce8a4dd8931>, line 7)

# ID3 Algorithm to construct tree

In [97]:
def ID3(df, attribute, target, depth):
    node = Node()
    positive_example = df[target[0] == True]
    negative_example = df[target[0] == False]
    
    positive_number = positive_example.shape[0]
    negative_number = negative_example.shape[0]
    if(positive_number == 0 or negative_number == 0 or len(attribute) == 0 or depth == 3):#depth
        node.leaf = True
        if(positive_number > negative_number):
            node.prediction = True
        else:
            node.prediction = False
        return node        
    else:
        max_gain, best_feature = choose_feature_split(df, attribute, target)
        node.feature = best_feature
        if(best_feature > 7):
            bin1 = df[df[best_feature] == 0]
            bin2 = df[df[best_feature] == 1]  
            
            attribute_cpy1 = attribute.copy()
            attribute_cpy1.remove(best_feature)
            node.child1 = ID3(bin1, attribute_cpy1, target, depth + 1)

            attribute_cpy2 = attribute.copy()
            attribute_cpy2.remove(best_feature)
            node.child2 = ID3(bin2, attribute_cpy2, target, depth + 1)
            
        else:
            
            bin1 = df[df[best_feature] == 1]
            bin2 = df[df[best_feature] == 2] 
            bin3 = df[df[best_feature] == 3] 
            bin4 = df[df[best_feature] == 4] 
        
            attribute_cpy1 = attribute.copy()
            attribute_cpy1.remove(best_feature)
            node.child1 = ID3(bin1, attribute_cpy1, target, depth + 1)

            attribute_cpy2 = attribute.copy()
            attribute_cpy2.remove(best_feature)
            node.child2 = ID3(bin2, attribute_cpy2, target, depth + 1)

            attribute_cpy3 = attribute.copy()
            attribute_cpy3.remove(best_feature)

            node.child3 = ID3(bin3, attribute_cpy3, target, depth + 1)

            attribute_cpy4 = attribute.copy()
            attribute_cpy4.remove(best_feature)
            node.child4 = ID3(bin4, attribute_cpy4, target, depth + 1)
        
        return node


# Prediction

In [98]:
def predict(node, single_data):
#     print(single_data)
    if(node.leaf):
        return node.prediction
    if(node.feature > 7):
        if(single_data[node.feature] == 0):
            return predict(node.child1, single_data)
        elif(single_data[node.feature] == 1):
            return predict(node.child2, single_data)
    else:
        if(single_data[node.feature] == 1):
            return predict(node.child1, single_data)
        elif(single_data[node.feature] == 2):
            return predict(node.child2, single_data)
        elif(single_data[node.feature] == 3):
            return predict(node.child3, single_data)
        elif(single_data[node.feature] == 4):
            return predict(node.child4, single_data)

    

In [99]:
def prediction_test_data(root, df, df_output):
    num_data = df.shape[0]
    prediction_correct = 0
    for row, data in df.iterrows():
        prediction = predict(root, data)
        
        if(prediction == df_output[0][row]):
            prediction_correct += 1
            
    predcition_per = (prediction_correct/num_data)*100
    print("The classifier classified = ",predcition_per, "%")   
    

# # Read Data and Visualize

In [100]:
df = pd.read_csv('pokemonStats.csv', header = None)
df_out = pd.read_csv('pokemonLegendary.csv', header =None)

df = discretize_data(df)
df_features = df.copy()
attr = df_features.columns.tolist()
depth = 0
tree = ID3(df, attr, df_out, depth)

  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.

0    0.5367714428519792
1    0.16692306261879897
2    0.2591240918370702
3    0.18840379801349771
4    0.3557207287547639
5    0.30371264552786914
6    0.2853998459675635
7    0.04639444670832926


  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from 

1    0.015198159414379853
2    0.01998577533867163
3    0.012036110611498785
4    0.0693910391239605
5    0.05711275552919792
6    0.11604403407672748
7    0.04449372325972589


  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd fro

1    0.02335814983654405
2    0.02211669083415435
3    0.07170539669166276
4    0.024476832030277507
5    0.20746467003831426
7    0.07627590924204397


  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.

1    0.05386290235607394
2    0.048840276405491156
3    0.05897469594473248
4    0.07079382650400046
5    0.07699331098031592
7    0.02671921434382729


  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd fro

1    0.20612075248941286
2    0.09360150854509919
3    0.04187780985080103
4    0.19273784283023176
5    0.09945764408989577
7    0.2901516117007078


  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.

  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.

1    0.000714779834250806
2    0.013359827264941365
3    0.0021471536351172216
4    0.018730675534251756
5    0.016424223226692125
6    0.000893110816659104
7    0.02025646192072214


  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  """
  af

1    0.0007482601814257639
2    0.007571993980441949
3    0.004328356953428231
4    0.013081346385415277
5    0.017125762074528816
6    0.00023345080954462194
7    0.02677212863720363


  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd from sys.path.
  """
  after removing the cwd fro

In [101]:
print(tree.child4.child1.child4.leaf)
print(tree.child4.child1.child4.prediction)

True
True


In [102]:
df = pd.read_csv('pokemonStats.csv', header = None)
df_out = pd.read_csv('pokemonLegendary.csv', header =None)
df = discretize_data(df)

prediction_test_data(tree, df, df_out)

The classifier classified =  88.43537414965986 %
