In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score

from scipy.stats import entropy


# import some data to play with
iris = datasets.load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.33, random_state=42)

# def change_weights(X_train,y_train,X_test,y_test,weights):

In [2]:
pk = np.array([1/5, 2/5, 2/5])  # fair coin
H = entropy(pk)
print(H)

1.0549201679861442


In [3]:
H=-0.2*(np.log(0.2))-0.4*(np.log(0.4))-0.4*(np.log(0.4))
print(H)

1.0549201679861442


In [11]:
def calculate_probabilities(list_labels, uniq_labels):
    '''
    Author: Sara Nassar 
    this function calculates the probabilities of each label in the list of labels
    it is calculated by number of labels in class A/all labels
    number of labels in class B/all labels
    and so on
    '''
    
    # A dictionary to store the probabilities
    probabilities = dict.fromkeys(uniq_labels, 0)
    
    # Total number of labels
    total_labels = len(list_labels)
    
    for label in uniq_labels:
        # Counting the number of times the label occurs in the list
        count = list_labels.count(label)
        
        # Calculating the probability of the label
        probability = count / total_labels
        
        # Storing the calculated probability in the dictionary
        probabilities[label] = probability
        
    return probabilities    
    
    
# test your function
list_labels=[1,2,0,1,2,0]
uniq_labels=[0,1,2]
print(calculate_probabilities(list_labels,uniq_labels))
# this should print somehting like 0.33,0.33,0.33

{0: 0.3333333333333333, 1: 0.3333333333333333, 2: 0.3333333333333333}


In [12]:
def calc_entropy_from_probabilities(list_probas):
    '''
    Author: Sara Nassar 
    list_probas is the list of probabiities
    the formula for entropy is
    sum(-proba*log(proba))
    
    '''
    
    entropy_value = 0

    for proba in list_probas:
        # If the probability is not zero
        if proba != 0:
            entropy_value += -proba * np.log(proba)
     
    return entropy_value


# test your function
list_probas=[1/5, 2/5, 2/5]
print(calc_entropy_from_probabilities(list_probas))
# above should print 1.054...

1.0549201679861442


In [13]:
def information_gain(old_entropy,new_entropies,count_items):
    '''
    Author: Sara Nassar 
    from the list of new entropies, calculate the overall new entropy
    
    formula is something like:
    overall_new_entropy = entropy1*proportion1 + entropy2*proportion2+ entropy3*proportion3 ...
    
    igain=old_entropy-overall_new_entropy
    '''
    
    overall_new_entropy = 0
    
    # Calculating the total number of items
    total_items = sum(count_items)
    
    for i in range(len(new_entropies)):
        # Calculating the proportion of items in the current partition
        proportion = count_items[i] / total_items
        
        # Adding the entropy of the current partition weighted by its proportion to the overall new entropy
        overall_new_entropy += new_entropies[i] * proportion
        
    # Calculating the information gain
    information_gain = old_entropy - overall_new_entropy
    
    return information_gain

#test your function
old_entropy=1
new_entropies=[0,0.65]
count_items=[4,6]
print(information_gain(old_entropy,new_entropies,count_items))
# above should print 0.61

0.61


In [14]:
num_feats=X_train.shape[1]
def initialize_weights(number_features):
    '''
    the first set of weights corresponding to the features
    For now, it defaults to 2
    '''
    
    weights=np.array([2 for i in range(number_features)])
    return weights

In [15]:
print(initialize_weights(num_feats))

[2 2 2 2]


In [16]:
y_test

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0,
       0, 1, 2, 2, 1, 2])

In [19]:
# initialization
''' Author: Raghad Alqirshi'''

# step 1 calculate the probabilities of 0, 1 and 2 in the y_test array
uniq_labels=[0,1,2]
y_test=list(y_test)
proba_init=calculate_probabilities(y_test, uniq_labels) #get the probabilities for y_test
print("Initial proba=",proba_init)

# step 2 calculate the initial entropy of y_test, using the probability values
# you might have to convert the dictionary to a list
# get only the probability values
list_probas=list(proba_init.values())#get list from dictionary proba_init
print(list_probas)
entropy_init=calc_entropy_from_probabilities(list_probas)# call the entropy calculation function using the list probabilities
print("Initial entropy = ",entropy_init)

Initial proba= {0: 0.38, 1: 0.3, 2: 0.32}
[0.38, 0.3, 0.32]
Initial entropy =  1.0934927418975058


In [38]:
wt_init=initialize_weights(num_feats)
# right now the initialize_weights function only returns 2,2,2 
print(wt_init)
res=[0 for i in range(len(X_test))]
# multiply the weights with each feature and calculate the sum
for f in range(len(X_test)):
    res[f]=round(np.sum(wt_init*X_test[f]),2)
#res=np.sum(wt_init*num_feats)# use np.sum() function
print(res)

[2 2 2 2]
[29.6, 23.0, 39.0, 29.8, 31.6, 21.4, 26.8, 34.8, 28.8, 27.2, 33.6, 18.6, 21.0, 19.2, 21.4, 31.8, 35.0, 26.2, 28.6, 34.0, 19.4, 31.6, 20.8, 33.8, 40.2, 34.4, 33.6, 36.4, 19.0, 19.4, 18.8, 24.0, 31.2, 20.0, 18.2, 31.4, 31.2, 20.8, 20.4, 21.8, 31.0, 31.0, 32.0, 22.0, 21.6, 25.2, 31.4, 33.6, 30.8, 38.8]


In [41]:
# choose a threshold between minimum and maximum
threshold=np.random.uniform(min(res),max(res))# use np.random.uniform() function
print(threshold)

# make two groups
group1=[]
group2=[]
res=np.array(res)
for i in range(res.shape[0]):
    if res[i]<threshold:
        group1.append(y_test[i])
    else:
        group2.append(y_test[i])

35.25541482448425


In [44]:
proba_gr1=calculate_probabilities(group1, uniq_labels)# caluclate the probabilities of items in group 1
proba_gr1=list(proba_gr1.values())# convert the dictionary into a list
entropy_group1=calc_entropy_from_probabilities(proba_gr1)# calculate entropy for group 1
count_group1=len(proba_gr1)

proba_gr2=calculate_probabilities(group2, uniq_labels)# caluclate the probabilities of items in group 2
proba_gr2=list(proba_gr2.values())# convert the dictionary into a list
entropy_group2=calc_entropy_from_probabilities(proba_gr2)# calculate entropy for group 2
count_group2=len(proba_gr2)

new_entropies=[entropy_group1,entropy_group2]
count_items=[count_group1,count_group2]
ig=information_gain(entropy_init,new_entropies,count_items)
print("IG=",ig)

IG= 0.5529108851546053


In [45]:
def change_weights(weights):
    new_weights=[]
    for i in range(weights[-1].shape[0]):
        new_weights.append(np.random.uniform(0,1))
    return np.array(new_weights)

In [46]:
def apply_and_measure_accuracy(X,y,weights):    
    res=np.sum(X*weights[-1],axis=1)
    res = np.tanh(res)
    res[res>0.5]=1
    res[res<=0.5]=0
    acc=accuracy_score(y, res)
    return acc
    
def get_train_test_accuracy(X_train,y_train,X_test,y_test,weights):
    train_acc=apply_and_measure_accuracy(X_train,y_train,weights)
    test_acc=apply_and_measure_accuracy(X_test,y_test,weights)
    return train_acc,test_acc
    

In [47]:
wt_init=[initialize_weights(num_feats)]
res=np.sum(X_test*wt_init[-1],axis=1)
res = np.tanh(res)
res[res>0.5]=1
res[res<=0.5]=0
print(res.shape)
acc=accuracy_score(y_test, res)
test_accuracies=[acc]

res=np.sum(X_train*wt_init[-1],axis=1)
res = np.tanh(res)
res[res>0.5]=1
res[res<=0.5]=0
print(res.shape)
acc=accuracy_score(y_train, res)

train_accuracies=[acc]
print("Initial test acc",test_accuracies)




def train_weights(X_train,y_train,X_test,y_test,weights,train_accuracies,test_accuracies):
    print("Trial number ",len(weights))
    
    train_acc,test_acc=get_train_test_accuracy(X_train,y_train,X_test,y_test,weights)
    
    
    
    # store the accuracy in this list of accuracies
    train_accuracies.append(train_acc)
    test_accuracies.append(test_acc)
    print("train",train_acc,"test",test_acc)
    print(test_accuracies[-1],test_accuracies[-2])
    
    # exit condition
    if test_accuracies[-1]<test_accuracies[-2]:
        print("returning")
        return weights,train_accuracies,test_accuracies
    
    # change the weights according to the accuracy
    new_weights=change_weights(weights)
    weights.append(new_weights)
    return train_weights(X_train,y_train,X_test,y_test,weights,train_accuracies,test_accuracies)
    

(50,)
(100,)
Initial test acc [0.3]


In [48]:
weights,train_accuracies,test_accuracies=train_weights(X_train,y_train,X_test,y_test,wt_init,train_accuracies,test_accuracies)

Trial number  1
train 0.35 test 0.3
0.3 0.3
Trial number  2
train 0.35 test 0.3
0.3 0.3
Trial number  3
train 0.35 test 0.3
0.3 0.3
Trial number  4
train 0.35 test 0.3
0.3 0.3
Trial number  5
train 0.35 test 0.3
0.3 0.3
Trial number  6
train 0.35 test 0.3
0.3 0.3
Trial number  7
train 0.35 test 0.3
0.3 0.3
Trial number  8
train 0.35 test 0.3
0.3 0.3
Trial number  9
train 0.35 test 0.3
0.3 0.3
Trial number  10
train 0.35 test 0.3
0.3 0.3
Trial number  11
train 0.35 test 0.3
0.3 0.3
Trial number  12
train 0.35 test 0.3
0.3 0.3
Trial number  13
train 0.35 test 0.3
0.3 0.3
Trial number  14
train 0.35 test 0.3
0.3 0.3
Trial number  15
train 0.35 test 0.3
0.3 0.3
Trial number  16
train 0.35 test 0.3
0.3 0.3
Trial number  17
train 0.35 test 0.3
0.3 0.3
Trial number  18
train 0.35 test 0.3
0.3 0.3
Trial number  19
train 0.35 test 0.3
0.3 0.3
Trial number  20
train 0.35 test 0.3
0.3 0.3
Trial number  21
train 0.35 test 0.3
0.3 0.3
Trial number  22
train 0.35 test 0.3
0.3 0.3
Trial number  23
tr

In [49]:
### Test the weights in the pre-final iteration
res=np.sum(X_test*weights[-2],axis=1)
res = np.tanh(res)
res[res>0.5]=1
res[res<=0.5]=0
acc=accuracy_score(y_test, res)
print(acc)
print(res)

0.62
[1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1. 0. 1.
 1. 1. 1. 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 1.
 1. 1.]
