In [225]:
import numpy as np

In [226]:
# Load the training data
M = np.genfromtxt('./monks-1.train', missing_values=0, skip_header=0, delimiter=',', dtype=int)
ytrn = M[:, 0]
Xtrn = M[:, 1:]
# Load the test data
M = np.genfromtxt('./monks-1.test', missing_values=0, skip_header=0, delimiter=',', dtype=int)
ytst = M[:, 0]
Xtst = M[:, 1:]

In [227]:
def partition(a):
    return {c: (a==c).nonzero()[0] for c in np.unique(a)}

In [228]:
def entropy(s):
    res = 0
    val, counts = np.unique(s, return_counts=True)
    freqs = counts.astype('float')/len(s)
    for p in freqs:
        if p != 0.0:
            res -= p * np.log2(p)
    return res

In [229]:
def mutual_information(y, x):

    res = entropy(y)

    # We partition x, according to attribute values x_i
    val, counts = np.unique(x, return_counts=True)
    freqs = counts.astype('float')/len(x)

    # We calculate a weighted average of the entropy
    for p, v in zip(freqs, val):
        res -= p * entropy(y[x == v])

    return res

In [230]:
attribute_labels= [val for val in range(1, Xtrn.shape[1]+1)]
attribute_value_pair_list=[]
for i in attribute_labels:
    attribute_values=[val for val in np.unique(Xtrn[:,i-1])]
    for j in attribute_values:
        attribute_value_pair_list.append([i,j])

In [262]:
def find_best_attribute(Xtrn,ytrn,attribute_value_pair_list):
    entropy_parent = entropy(ytrn)
    count=0
    for pair in attribute_value_pair_list:
        #print(pair)
        index=pair[0]-1 #label of the attribute
        vall=pair[1] #value of the attribute
        part=partition(Xtrn[:,index])
        count_left=0
        for i in part[vall]:
            count_left += 1
        y_sample_left=ytrn.take(part[vall], axis=0)
        temp=[]
        for key,value in part.items():
            if(key != index):
                for val in value:
                    temp.append(val)
        y_sample_right=ytrn.take(temp,axis=0)
        #count_left = len(part[value]) #no of rows for atrribute x==a
        count_right = len(temp) #no of rows for atrribute x!=a
        entropy_left=entropy(y_sample_left)
        entropy_right=entropy(y_sample_right)
        total_examples=count_left+count_right
        weight_left = count_left / total_examples
        weight_right = count_right / total_examples
        average_entropy_children = (weight_left*entropy_left)+(weight_right*entropy_right)
        information_gain = entropy_parent - average_entropy_children 
        #print(information_gain)
        count = count + 1
        if count > 1:
            if information_gain > max_information_gain:
                selected_pair=pair
                selected_attribute = index+1
                selected_attr_value= vall
                max_information_gain = information_gain
        else:
                selected_pair=pair
                selected_attribute = index+1
                selected_attr_value= vall
                max_information_gain = information_gain
    #print(selected_attribute)
    #print(selected_attr_value)
    
    return (selected_pair)

In [288]:
Xtrn.shape[0]

124

In [289]:
def find_example_subset(Xtrn,selected_pair):
    Xtrn_new=[]
    for i in range(Xtrn.shape[0]):
        #print(type(Xtrn[i,selected_pair[0]-1]))
        #print(type(selected_pair[1]))
        if Xtrn[i,selected_pair[0]-1] == selected_pair[1]:
            Xtrn_new.append(Xtrn[i,:])
    return Xtrn_new

#finding the size of example subset
def find_size_example_subset(x):
    size_subset = x.shape[0]
    return size_subset

In [290]:
attr=find_best_attribute(Xtrn,ytrn,attribute_value_pair_list)

In [291]:
a=find_example_subset(Xtrn,attr)
a

[1 1 2 3 1 2]
[1 2 1 1 1 2]
[1 2 1 2 1 1]
[1 3 1 3 1 2]
[1 3 2 2 1 2]
[1 3 2 3 1 1]
[2 1 1 2 1 1]
[2 1 1 2 1 2]
[2 2 1 3 1 1]
[2 2 1 3 1 2]
[2 2 2 1 1 1]
[2 3 1 1 1 1]
[2 3 1 2 1 1]
[2 3 1 3 1 2]
[2 3 2 2 1 1]
[2 3 2 2 1 2]
[3 1 1 1 1 1]
[3 1 1 1 1 2]
[3 1 1 2 1 1]
[3 1 2 1 1 1]
[3 2 1 1 1 1]
[3 2 1 2 1 2]
[3 2 2 1 1 1]
[3 2 2 1 1 2]
[3 2 2 3 1 1]
[3 3 1 1 1 1]
[3 3 1 3 1 2]
[3 3 2 1 1 1]
[3 3 2 3 1 2]


[]

In [242]:
def id3(x, y, attribute_value_pairs=attribute_value_pair_list, depth=0, max_depth=5):

a=id3(Xtrn,ytrn)

IndentationError: expected an indented block (<ipython-input-242-f1f4328af0e0>, line 3)

In [220]:
b=find_size_example_subset(a)

[5, 1]

In [None]:
gain = np.array([mutual_information(ytrn, x_attr) for x_attr in Xtrn.T])
selected_attr_label = np.argmax(gain)+1
#selected_attr_label

In [73]:
sets = partition(Xtrn[:, selected_attr_label-1])
sets.items()

dict_items([(1, array([  8,   9,  13,  30,  36,  41,  47,  48,  65,  66,  70,  76,  77,
        79,  83,  84,  87,  88,  89,  92,  97,  99, 101, 102, 104, 107,
       112, 116, 120], dtype=int64)), (2, array([  2,   4,   5,  10,  17,  19,  23,  26,  28,  31,  35,  37,  42,
        49,  54,  59,  61,  62,  67,  74,  85,  90,  91,  93,  94,  96,
       105, 108, 113, 114, 121], dtype=int64)), (3, array([  0,   1,   3,   6,  11,  14,  15,  20,  24,  25,  32,  38,  45,
        46,  50,  55,  57,  63,  64,  68,  71,  78,  80,  82,  86,  95,
       103, 110, 117, 122], dtype=int64)), (4, array([  7,  12,  16,  18,  21,  22,  27,  29,  33,  34,  39,  40,  43,
        44,  51,  52,  53,  56,  58,  60,  69,  72,  73,  75,  81,  98,
       100, 106, 109, 111, 115, 118, 119, 123], dtype=int64))])

In [74]:
res = {}
#impurity={}
for k, v in sets.items():
    y_subset = ytrn.take(v, axis=0)
    print(k)
    print(y_subset)
    #impurity[k]=entropy(y_subset)
    #x_subset = Xtrn.take(v, axis=0)
    #res["x_%d = %d" % (selected_attr, k)] = recursive_split(x_subset, y_subset)

#return res

1
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
2
[1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1]
3
[1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 1 1]
4
[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 1 1]
