In [31]:

import numpy as np

import matplotlib.pyplot as plt
import tensorflow as tf
from collections import Counter
import random

Use https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz MNIST
dataset for this question and select class 0, 1 and 2. Note you are not allowed
to use libraries which can take data, fit the model, predict the classes and give
accuracy. Perform following tasks.

In [32]:
link = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz'
path = tf.keras.utils.get_file('mnist.npz', link)    
DATASET = np.load(path)
x_train, y_train = DATASET['x_train'], DATASET['y_train']
x_test, y_test = DATASET['x_test'], DATASET['y_test']

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(60000, 28, 28)
(60000,)
(10000, 28, 28)
(10000,)


In [33]:
train_selective = np.isin(y_train , [0,1,2]) 
test_selective = np.isin(y_test , [0,1,2])

x_train ,y_train = x_train[train_selective], y_train[train_selective]
x_test , y_test = x_test[test_selective], y_test[test_selective]


print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)


(18623, 28, 28)
(18623,)
(3147, 28, 28)
(3147,)


Apply PCA and reduce the dimension to p = 10. You can use the entire
train set of these 3 classes to obtain PCA matrix. For the remaining parts,
use the reduced dimension dataset.

In [34]:
x_train_flat = x_train.reshape(x_train.shape[0] , -1) 
x_test_flat = x_test.reshape(x_test.shape[0] ,  -1)

x_train_flat = x_train_flat.T 
x_test_flat = x_test_flat.T 

print(x_train_flat.shape)

X = x_train_flat
mean_X = np.mean(X, axis=1, keepdims=True)
print(mean_X.shape)

X_centered = X - mean_X

S = (X_centered @ np.transpose(X_centered)) / (X_centered.shape[1] - 1) 

eigenvalues, eigenvectors = np.linalg.eig(S)
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]

U = eigenvectors
p = 10

Y = np.transpose(U[:, :p]) @ X_centered


print(Y.shape)
Y = Y.T 



(784, 18623)
(784, 1)
(10, 18623)


Now learn a decision tree using the train set. You need to grow a deci-
sion tree with 3 terminal nodes. This is similar to what we did in the

baseball salary example. For the first split, consider all p dimensions. For
each dimension, consider one split which will divide the space into two
regions. Find the total Gini index. Similarly find the total Gini index
for all 50 dimensions. Find the best split by searching for minimum Gini
index. Suppose, you split across 10th dimension. Choose one of the splits,
and repeat the steps to find best split. Once you find it, the entire p
dimensional space is divided into three regions.

In [35]:

def find_gini_index_per_dimension(d ,Y):
    dimension = d 
    val_of_dimension = []
    total_sum = 0 
    list_less = []
    list_more = []
    for i in range(Y.shape[0]):
        total_sum += Y[i][dimension]
        val_of_dimension.append(Y[i][dimension]) # the dimension are from 0 to 9 as there are 10 dimensions
        
    mid_val = total_sum/(len(val_of_dimension))
    # print(val_of_dimension)
    # print(mid_val)
    
    for i in range(Y.shape[0]):
        if (Y[i][dimension] < mid_val):
            list_less.append(i)
        else:
            list_more.append(i)

    # for x in list_less:
    #     print(y_train[x])
    # for x in list_more:
    #     print(y_train[x]) 
    dict_less = {}
    dict_more = {} 
    for x in list_less:
        if y_train[x] in dict_less:
            dict_less[y_train[x]] += 1
        else:
            dict_less[y_train[x]] = 1
    for x in list_more:
        if y_train[x] in dict_more:
            dict_more[y_train[x]] += 1
        else:
            dict_more[y_train[x]] = 1
    
    # print(len(list_less))
    # print(len(list_more))
    # print(dict_less)
    # print(dict_more)    
    gini_index_less = 0 
    gini_index_more = 0
    for x in dict_less:
        gini_index_less += (dict_less[x]/len(list_less))*(1-(dict_less[x]/len(list_less)))
    for x in dict_more:
        gini_index_more += (dict_more[x]/len(list_more))*(1-(dict_more[x]/len(list_more)))
    
    gini_index = (gini_index_less)*(len(list_less)/Y.shape[0]) + (gini_index_more)*(len(list_more)/Y.shape[0])
    
    return gini_index
    
print(find_gini_index_per_dimension(0,Y))


0.44158414128431744


This is for the first split . We will see all the dimensions and choose the one with the minimum gini index .

In [36]:
def get_mean(d ,Y):
    dimension = d 
    sum1 = 0 
    for x in range(Y.shape[0]):
        sum1 += Y[x][dimension]
    return sum1/Y.shape[0]

def find_first_dimension(Y):
    gini_indices = np.zeros(p)
    for dimension in range(p):
        gini_indices[dimension] = find_gini_index_per_dimension(dimension , Y)

    decided_dimension_first = np.argmin(gini_indices) #X1
    decided_mean_first = get_mean(decided_dimension_first ,Y) # t1
    
    return decided_dimension_first, decided_mean_first

decided_dimension_first , decided_mean_first = find_first_dimension(Y)

print(decided_dimension_first)
print(decided_mean_first)

0
(3.218980267980891e-14+0j)


In [37]:


def find_gini_index_per_dimension_left_split(d ,Y , decided_dimension_first , decided_mean_first):
    dimension = d 
    val_of_dimension = []
    total_sum = 0 
    list_less = []
    list_more = []
    for i in range(Y.shape[0]):
        if (Y[i][decided_dimension_first] < decided_mean_first):
            total_sum += Y[i][dimension]
            val_of_dimension.append(Y[i][dimension])
        
    mid_val = (total_sum)/len(val_of_dimension)
    # print(mid_val)
    for i in range(Y.shape[0]):
        if (Y[i][dimension] < mid_val and Y[i][decided_dimension_first] < decided_mean_first):
            list_less.append(i)
        elif (Y[i][dimension] > mid_val and Y[i][decided_dimension_first] < decided_mean_first):
            list_more.append(i)
    
    # print(list_less)
    # print(list_more)
    
    dict_less = {}
    dict_more = {} 
    for x in list_less:
        if y_train[x] in dict_less:
            dict_less[y_train[x]] += 1
        else:
            dict_less[y_train[x]] = 1
    for x in list_more:
        if y_train[x] in dict_more:
            dict_more[y_train[x]] += 1
        else:
            dict_more[y_train[x]] = 1
    
    
    gini_index_less = 0 
    gini_index_more = 0
    for x in dict_less:
        gini_index_less += (dict_less[x]/len(list_less))*(1-(dict_less[x]/len(list_less)))
    for x in dict_more:
        gini_index_more += (dict_more[x]/len(list_more))*(1-(dict_more[x]/len(list_more)))
    
    # print(dict_less)
    # print(dict_more)
    gini_index = (gini_index_less)*(len(list_less)/(len(list_less) + len(list_more))) + (gini_index_more)*(len(list_more)/(len(list_less) + len(list_more)))
    # print(gini_index)
    return gini_index , mid_val

# print(find_gini_index_per_dimension(0))
# find_gini_index_per_dimension_left_split(0)


def find_dimension_left(Y ,decided_dimension_first , decided_mean_first):
    gini_indices_left = np.zeros(p)
    gini_mean_left = np.zeros(p)

    for dimension in range(p):
        gini_index , mean = find_gini_index_per_dimension_left_split(dimension, Y , decided_dimension_first , decided_mean_first) 
        gini_indices_left[dimension] = gini_index
        gini_mean_left[dimension] = mean
        
    decided_dimension_second_left = np.argmin(gini_indices_left)
    decided_mean_second_left = gini_mean_left[decided_dimension_second_left]

    return decided_dimension_second_left , decided_mean_second_left , gini_indices_left

decided_dimension_second_left , decided_mean_second_left , gini_indices_left = find_dimension_left(Y , decided_dimension_first , decided_mean_first)

# print(decided_dimension_second_left) #X2 (left)
# print(decided_mean_second_left) # t2 (left)
# print(gini_indices_left[decided_dimension_second_left])


def find_gini_index_per_dimension_right_split(d , Y , decided_dimension_first , decided_mean_first):
    dimension = d 
    val_of_dimension = []
    total_sum = 0 
    list_less = []
    list_more = []
    for i in range(Y.shape[0]):
        if (Y[i][decided_dimension_first] >= decided_mean_first):
            total_sum += Y[i][dimension]
            val_of_dimension.append(Y[i][dimension])
    
    mid_val = (total_sum/(len(val_of_dimension)))
    # print(mid_val)
    for i in range(Y.shape[0]):
        if (Y[i][dimension] < mid_val and Y[i][decided_dimension_first] >= decided_mean_first):
            list_less.append(i)
        elif (Y[i][dimension] > mid_val and Y[i][decided_dimension_first] >= decided_mean_first):
            list_more.append(i)
    
    # print(list_less)
    # print(list_more)
    
    dict_less = {}
    dict_more = {} 
    for x in list_less:
        if y_train[x] in dict_less:
            dict_less[y_train[x]] += 1
        else:
            dict_less[y_train[x]] = 1
    for x in list_more:
        if y_train[x] in dict_more:
            dict_more[y_train[x]] += 1
        else:
            dict_more[y_train[x]] = 1
    
    gini_index_less = 0 
    gini_index_more = 0
    for x in dict_less:
        gini_index_less += (dict_less[x]/len(list_less))*(1-(dict_less[x]/len(list_less)))

    for x in dict_more:
        gini_index_more += (dict_more[x]/len(list_more))*(1-(dict_more[x]/len(list_more)))

    gini_index = (gini_index_less)*(len(list_less)/(len(list_less) + len(list_more))) + (gini_index_more)*(len(list_more)/(len(list_less) + len(list_more)))
    # print(gini_index)
    # print(gini_index)
    return gini_index , mid_val

# print(find_gini_index_per_dimension(0))
# find_gini_index_per_dimension_left_split(0)
def find_dimension_right(Y ,decided_dimension_first , decided_mean_first):
    
    gini_indices_right = np.zeros(p)
    gini_mean_right = np.zeros(p)
 
    for dimension in range(p):
        gini_index , mean = find_gini_index_per_dimension_right_split(dimension , Y , decided_dimension_first , decided_mean_first) 
        gini_indices_right[dimension] = gini_index
        gini_mean_right[dimension] = mean
        
    decided_dimension_second_right = np.argmin(gini_indices_right)
    decided_mean_second_right = gini_mean_right[decided_dimension_second_right]
    
    return  decided_dimension_second_right , decided_mean_second_right , gini_indices_right

decided_dimension_second_right , decided_mean_second_right , gini_indices_right = find_dimension_right(Y, decided_dimension_first , decided_mean_first)
# print(decided_dimension_second_right) #X2 (left)
# print(decided_mean_second_right) # t2 (left)
# print(gini_indices_right[decided_dimension_second_right])



  gini_mean_left[dimension] = mean
  gini_mean_right[dimension] = mean


Now deciding the class for each of the region .

In [38]:
def classifier_1(decided_dimension_first , decided_mean_first ,decided_dimension_second , decided_mean_second ,Y):
    
    #Region 1 X1 < t1 and X2 < t2
    #Region 2 X1 < t1 and X2 > t2
    #Region 3 X1 > t1
    list_R1 = []
    list_R2 = []
    list_R3 = []
    dict_R1 = {}
    dict_R2= {}
    dict_R3 = {}
    for i in range(Y.shape[0]):
        if (Y[i][decided_dimension_first] < decided_mean_first and Y[i][decided_dimension_second] < decided_mean_second):
            list_R1.append(i) 
        elif (Y[i][decided_dimension_first] < decided_mean_first and Y[i][decided_dimension_second] >= decided_mean_second):
            list_R2.append(i)
        
        elif (Y[i][decided_dimension_first] > decided_mean_first):
            list_R3.append(i)
            
    for x in list_R1:
        if y_train[x] in dict_R1:
            dict_R1[y_train[x]] += 1
        else:
            dict_R1[y_train[x]] = 1

    for x in list_R2:
        if y_train[x] in dict_R2:
            dict_R2[y_train[x]] += 1
        else:
            dict_R2[y_train[x]] = 1

    for x in list_R3:
        if y_train[x] in dict_R3:
            dict_R3[y_train[x]] += 1
        else:
            dict_R3[y_train[x]] = 1

    decided_class_R1 = max(dict_R1, key=dict_R1.get)
    decided_class_R2 = max(dict_R2, key=dict_R2.get)
    decided_class_R3 = max(dict_R3, key=dict_R3.get)
    # print(dict_R1)
    # print(dict_R2)
    # print(dict_R3)
    # print(decided_class_R1)
    # print(decided_class_R2)
    # print(decided_class_R3)
    return decided_class_R1, decided_class_R2, decided_class_R3
    
    
def classifier_2(decided_dimension_first , decided_mean_first ,decided_dimension_second , decided_mean_second , Y):
    #print("here")
    #Region 1 X1 < t1 
    #Region 2 X1 > t1 and X2 < t2
    #Region 3 X1 > t1 and X2 > t2   
    list_R1 = []
    list_R2 = []
    list_R3 = []
    dict_R1 = {}
    dict_R2= {}
    dict_R3 = {}
    for i in range(Y.shape[0]):
        if (Y[i][decided_dimension_first] < decided_mean_first):
            list_R1.append(i) 
        elif (Y[i][decided_dimension_first] >= decided_mean_first and Y[i][decided_dimension_second] < decided_mean_second):
            list_R2.append(i)
        
        elif (Y[i][decided_dimension_first] >= decided_mean_first and Y[i][decided_dimension_second] >= decided_mean_second):
            list_R3.append(i)
            
    for x in list_R1:
        if y_train[x] in dict_R1:
            dict_R1[y_train[x]] += 1
        else:
            dict_R1[y_train[x]] = 1

    for x in list_R2:
        if y_train[x] in dict_R2:
            dict_R2[y_train[x]] += 1
        else:
            dict_R2[y_train[x]] = 1

    for x in list_R3:
        if y_train[x] in dict_R3:
            dict_R3[y_train[x]] += 1
        else:
            dict_R3[y_train[x]] = 1

    decided_class_R1 = max(dict_R1, key=dict_R1.get)
    decided_class_R2 = max(dict_R2, key=dict_R2.get)
    decided_class_R3 = max(dict_R3, key=dict_R3.get)
    # print(dict_R1)
    # print(dict_R2)
    # print(dict_R3)
    # print(decided_class_R1)
    # print(decided_class_R2)
    # print(decided_class_R3)
    
    return decided_class_R1, decided_class_R2, decided_class_R3
    

Deciding the classes for the test data .

In [39]:
X_test = x_test_flat
mean_X_test = np.mean(X_test, axis=1, keepdims=True)
# print(mean_X.shape)

X_centered_test = X_test - mean_X

S_test = (X_centered_test @ np.transpose(X_centered_test)) / (X_centered_test.shape[1] - 1) 

eigenvalues, eigenvectors = np.linalg.eig(S_test)
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]

U_test = eigenvectors


Y_test = np.transpose(U_test[:, :p]) @ X_centered_test
# print(Y_test.shape)
Y_test = Y_test.T


correct_count = 0
class_counters =[0]*3
total_counters =[0]*3

choice = random.randint(0, 1)  
if (choice == 1):
    decided_dimension_second = decided_dimension_second_left 
    decided_mean_second = decided_mean_second_left
    decided_class_R1 , decided_class_R2, decided_class_R3 = classifier_1(decided_dimension_first , decided_mean_first ,decided_dimension_second , decided_mean_second , Y)
    


    decided_classes = []
    for i in range(Y_test.shape[0]):
        if (Y_test[i][decided_dimension_first] < decided_mean_first and Y_test[i][decided_dimension_second] < decided_mean_second):
            decided_classes.append(decided_class_R1)
        elif (Y_test[i][decided_dimension_first] < decided_mean_first and Y_test[i][decided_dimension_second] > decided_mean_second):
            decided_classes.append(decided_class_R2)
        elif (Y_test[i][decided_dimension_first] > decided_mean_first):
            decided_classes.append(decided_class_R3)
            

    for i in range(len(decided_classes)):
        if decided_classes[i] == y_test[i]:
            correct_count += 1
            class_counters[y_test[i]] += 1
        total_counters[y_test[i]] += 1

    
    
else:
    decided_dimension_second = decided_dimension_second_right
    decided_mean_second = decided_mean_second_right
    decided_class_R1 , decided_class_R2, decided_class_R3 = classifier_2(decided_dimension_first , decided_mean_first ,decided_dimension_second , decided_mean_second ,Y)
    


    decided_classes = []
    for i in range(Y_test.shape[0]):
        if (Y_test[i][decided_dimension_first] < decided_mean_first ):
            decided_classes.append(decided_class_R1)
        elif (Y_test[i][decided_dimension_first] >= decided_mean_first and Y_test[i][decided_dimension_second] < decided_mean_second):
            decided_classes.append(decided_class_R2)
        elif (Y_test[i][decided_dimension_first] >= decided_mean_first and Y_test[i][decided_dimension_second] >= decided_mean_second):
            decided_classes.append(decided_class_R3)
        
    # print(decided_classes)
    for i in range(len(decided_classes)):
        if decided_classes[i] == y_test[i]:
            correct_count += 1
            class_counters[y_test[i]] += 1
        total_counters[y_test[i]] += 1
    
for i in range(3):
    print("Class ", i, " Accuracy : ", (class_counters[i]/total_counters[i])*100 , "%")
print("Total accuracy is : " + str((correct_count/len(decided_classes))*100) , "%")




Class  0  Accuracy :  87.34693877551021 %
Class  1  Accuracy :  99.91189427312776 %
Class  2  Accuracy :  42.92635658914728 %
Total accuracy is : 77.3117254528122 %


Now use bagging, develop 5 different datasets from the original dataset.
Learn trees for all these datasets. For test samples, use majority voting
(atleast 3 trees should predict the same class) to find the class of a given
sample. In case there is a tie, that is two trees predict one class and other
two trees predict another class, then you can choose either of the classes.
Report the total accuracy and class-wise accuracy.

These are the 5 different datasets created from the main dataset . 

In [40]:
# print(Y.shape) 
num_datasets = 100
sample_size = Y.shape[0] 
num_features = Y.shape[1]
bootstrap_datasets = []

all_indices_data = []
stored_train = y_train


for i in range(num_datasets):
  indices = np.random.choice(sample_size, size=sample_size, replace=True)
  all_indices_data.append(indices) 

  bootstrap_sample = Y[indices] 
  bootstrap_datasets.append(bootstrap_sample) 

bootstrap_datasets = np.array(bootstrap_datasets) 
# print(bootstrap_datasets[0].shape)
# print(all_indices_data[0].shape)


# we already created the test dataset Y_test

decided_classes = [[] for _ in range(Y_test.shape[0])]

 
def class_guesser(Y , decided_dimension_first , decided_mean_first , decided_dimension_second_left ,decided_mean_second_left , gini_indices_left , decided_dimension_second_right , decided_mean_second_right , gini_indices_right ):
  
  choice = random.randint(0 ,1)
  if (choice == 1):
    decided_dimension_second = decided_dimension_second_left 
    decided_mean_second = decided_mean_second_left
    decided_class_R1 , decided_class_R2, decided_class_R3 = classifier_1(decided_dimension_first , decided_mean_first,decided_dimension_second , decided_mean_second , Y)
    

    for i in range(Y_test.shape[0]):
        if (Y_test[i][decided_dimension_first] < decided_mean_first and Y_test[i][decided_dimension_second] < decided_mean_second):
            decided_classes[i].append(decided_class_R1)
        elif (Y_test[i][decided_dimension_first] < decided_mean_first and Y_test[i][decided_dimension_second] > decided_mean_second):
            decided_classes[i].append(decided_class_R2)
        elif (Y_test[i][decided_dimension_first] > decided_mean_first):
            decided_classes[i].append(decided_class_R3)
            

    
    
    
  else:
      decided_dimension_second = decided_dimension_second_right
      decided_mean_second = decided_mean_second_right
      decided_class_R1 , decided_class_R2, decided_class_R3 = classifier_2(decided_dimension_first , decided_mean_first,decided_dimension_second , decided_mean_second ,Y)
      

      for i in range(Y_test.shape[0]):
          if (Y_test[i][decided_dimension_first] < decided_mean_first ):
              decided_classes[i].append(decided_class_R1)
          elif (Y_test[i][decided_dimension_first] >= decided_mean_first and Y_test[i][decided_dimension_second] < decided_mean_second):
              decided_classes[i].append(decided_class_R2)
          elif (Y_test[i][decided_dimension_first] >= decided_mean_first and Y_test[i][decided_dimension_second] >= decided_mean_second):
              decided_classes[i].append(decided_class_R3)


for i in range(num_datasets):
  y_train = []
  for x in all_indices_data[i]:
    y_train.append(stored_train[x])


  
  decided_dimension_first , decided_mean_first = find_first_dimension(bootstrap_datasets[i]) 
  decided_dimension_second_left , decided_mean_second_left , gini_indices_left = find_dimension_left(bootstrap_datasets[i] , decided_dimension_first , decided_mean_first )
  decided_dimension_second_right , decided_mean_second_right , gini_indices_right = find_dimension_right(bootstrap_datasets[i] , decided_dimension_first , decided_mean_first )

  class_guesser(bootstrap_datasets[i] , decided_dimension_first , decided_mean_first , decided_dimension_second_left ,decided_mean_second_left , gini_indices_left , decided_dimension_second_right , decided_mean_second_right , gini_indices_right) 

# decided_dimension_first , decided_mean_first = find_first_dimension(Y)
# print("xxxxxxxxxxxxxxxxxxxxxxx")
# print(decided_dimension_first , decided_mean_first)
# decided_dimension_second_left , decided_mean_second_left , gini_indices_left = find_dimension_left(Y , decided_dimension_first , decided_mean_first)
# print("xxxxxxxxxxxxxxxxxxxxxxx")
# print(decided_dimension_second_left , decided_mean_second_left , gini_indices_left)
# decided_dimension_second_right , decided_mean_second_right , gini_indices_right = find_dimension_right(Y, decided_dimension_first , decided_mean_first)
# print("xxxxxxxxxxxxxxxxxxxxxxx")
# print(decided_dimension_second_right , decided_mean_second_right , gini_indices_right)

# class_guesser(Y , decided_dimension_first , decided_mean_first , decided_dimension_second_left ,decided_mean_second_left , gini_indices_left , decided_dimension_second_right , decided_mean_second_right , gini_indices_right) 


i = 0
correct_count = 0
correct_classes = [0]*3
total_classes = [0]*3
# print(decided_classes)
# print("jjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjj")
for subarray in decided_classes:
  counter = Counter(subarray)
  most_common_element = counter.most_common(1)[0][0]
  # print(most_common_element)
  if (most_common_element == y_test[i]):
    correct_count += 1   
    correct_classes[y_test[i]] += 1
  total_classes[y_test[i]] += 1

  i += 1  

for i in range(3):
  print( f"The accuracy of the class {i} is : " + str((correct_classes[i]/total_classes[i]) * 100 ) + " %") 

print( "Total accuracy is : " + str((correct_count/Y_test.shape[0]) * 100) + " %")        


  gini_mean_left[dimension] = mean
  gini_mean_right[dimension] = mean


The accuracy of the class 0 is : 99.08163265306122 %
The accuracy of the class 1 is : 90.9251101321586 %
The accuracy of the class 2 is : 52.71317829457365 %
Total accuracy is : 80.93422306959009 %
