## Active Learning Assignment

### Importing Libraries 
Libraries used are numpy, pandas, matplotlib and sklearn for classifiers

In [0]:
import os
import time
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.datasets import load_digits
from sklearn.datasets import fetch_mldata
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import check_random_state
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances

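### Fetching Datasets
The datasets used for testing the algorithms are Iris and the handwritten-digits dataset from sklearn (loaded with `load_digits` and referred to as MNIST in this notebook).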

In [0]:
def fetch_data_iris():
    """Load the scikit-learn Iris dataset.

    Prints the shapes of the feature matrix and the label vector, then
    returns them as an ``(X, y)`` tuple with the features cast to float64.
    """
    dataset = load_iris()
    features, labels = dataset.data.astype('float64'), dataset.target
    print ('Dataset : ', features.shape, labels.shape)
    return (features, labels)

def fetch_data_mnist():
    """Load the scikit-learn handwritten-digits dataset.

    Despite the function name, the data comes from ``load_digits`` and not
    from a download of the full MNIST set. Prints the shapes of the feature
    matrix and the label vector, then returns them as an ``(X, y)`` tuple
    with the features cast to float64.
    """
    digits = load_digits()
    features, labels = digits.data.astype('float64'), digits.target
    print ('Dataset : ', features.shape, labels.shape)
    return (features, labels)

### Classifier Models
A common superclass is defined for the different classifiers, mainly for use in query by committee. SVM is the main classifier used by all the other query frameworks. The classifiers are SVM, Random Forest, Naive Bayes and Multinomial Logistic Regression.

In [0]:
class BaseModel(object):
    """Common parent of the classifier wrappers defined in this file."""

    def __init__(self):
        """Create the wrapper; no state is set up here."""
        return None

    def fit_predict(self):
        """Placeholder that does nothing and returns ``None``.

        The concrete wrappers in this file define their own ``fit_predict``.
        """
        return None


class SvmModel(BaseModel):
    """Linear-kernel ``SVC`` wrapper with probability estimates enabled."""

    def fit_predict(self, X_train, y_train, X_val, X_test):
        """Fit a fresh SVC on the training data and predict val/test labels.

        The fitted estimator is kept on ``self.classifier`` and the
        predictions on ``self.val_y_predicted`` / ``self.test_y_predicted``.

        Returns:
            ``(X_train, X_val, X_test, val_predictions, test_predictions)``;
            the three feature arrays are passed through unchanged.
        """
        model = SVC(C=1, kernel='linear', probability=True)
        self.classifier = model
        model.fit(X_train, y_train)
        self.test_y_predicted = model.predict(X_test)
        self.val_y_predicted = model.predict(X_val)
        outputs = (X_train, X_val, X_test, self.val_y_predicted, self.test_y_predicted)
        return outputs


class LogModel(BaseModel):
    """Multinomial, L1-penalised ``LogisticRegression`` wrapper (saga solver)."""

    def fit_predict(self, X_train, y_train, X_val, X_test):
        """Fit a fresh logistic regression and predict val/test labels.

        The regularisation strength is ``C = 50 / n_train``, so it shrinks
        as the training set grows. The fitted estimator is kept on
        ``self.classifier`` and the predictions on ``self.val_y_predicted``
        / ``self.test_y_predicted``.

        Returns:
            ``(X_train, X_val, X_test, val_predictions, test_predictions)``;
            the three feature arrays are passed through unchanged.
        """
        n_train = X_train.shape[0]
        model = LogisticRegression(
            C=50. / n_train,
            multi_class='multinomial',
            penalty='l1',
            solver='saga',
            tol=0.1,
        )
        self.classifier = model
        model.fit(X_train, y_train)
        self.test_y_predicted = model.predict(X_test)
        self.val_y_predicted = model.predict(X_val)
        outputs = (X_train, X_val, X_test, self.val_y_predicted, self.test_y_predicted)
        return outputs

class RfModel(BaseModel):
    """``RandomForestClassifier`` wrapper using 500 trees."""

    def fit_predict(self, X_train, y_train, X_val, X_test):
        """Fit a fresh random forest and predict val/test labels.

        The fitted estimator is kept on ``self.classifier`` and the
        predictions on ``self.val_y_predicted`` / ``self.test_y_predicted``.

        Returns:
            ``(X_train, X_val, X_test, val_predictions, test_predictions)``;
            the three feature arrays are passed through unchanged.
        """
        forest = RandomForestClassifier(n_estimators=500)
        self.classifier = forest
        forest.fit(X_train, y_train)
        self.test_y_predicted = forest.predict(X_test)
        self.val_y_predicted = forest.predict(X_val)
        outputs = (X_train, X_val, X_test, self.val_y_predicted, self.test_y_predicted)
        return outputs

class NbcModel(BaseModel):
    """``GaussianNB`` (Gaussian naive Bayes) wrapper."""

    def fit_predict(self, X_train, y_train, X_val, X_test):
        """Fit a fresh Gaussian naive Bayes model and predict val/test labels.

        The fitted estimator is kept on ``self.classifier`` and the
        predictions on ``self.val_y_predicted`` / ``self.test_y_predicted``.

        Returns:
            ``(X_train, X_val, X_test, val_predictions, test_predictions)``;
            the three feature arrays are passed through unchanged.
        """
        bayes = GaussianNB()
        self.classifier = bayes
        bayes.fit(X_train, y_train)
        self.test_y_predicted = bayes.predict(X_test)
        self.val_y_predicted = bayes.predict(X_val)
        outputs = (X_train, X_val, X_test, self.val_y_predicted, self.test_y_predicted)
        return outputs


### Train Model
Class for training and testing the different classifiers by interfacing with the classifier superclass.

In [0]:
class TrainModel:
    """Drive one classifier wrapper and keep its test-accuracy history."""

    def __init__(self, model_object):
        """Instantiate the wrapper class whose *name* is ``model_object``.

        The name is looked up in this module's globals, so an unknown name
        raises ``KeyError``.
        """
        self.accuracies = []
        wrapper_cls = globals()[model_object]
        self.model_object = wrapper_cls()
        self.model_name = model_object

    def train(self, X_train, y_train, X_val, X_test):
        """Fit the wrapped model and store its val/test predictions.

        Returns the ``(X_train, X_val, X_test)`` arrays handed back by the
        wrapper's ``fit_predict``.
        """
        fitted = self.model_object.fit_predict(X_train, y_train, X_val, X_test)
        X_train, X_val, X_test, self.val_y_predicted, self.test_y_predicted = fitted
        return (X_train, X_val, X_test)

    def get_test_accuracy(self, i, y_test):
        """Append the test accuracy (in percent) to ``accuracies`` and print it.

        Uses the predictions stored by the most recent ``train`` call;
        ``i`` is only used as the iteration number in the printed line.
        """
        matches = self.test_y_predicted.ravel() == y_test.ravel()
        classif_rate = np.mean(matches) * 100
        self.accuracies.append(classif_rate)
        print('Iteration: %d Model: %s Accuracy: %f' % (i,self.model_name,classif_rate))
        


### Uncertainty Sampling
A superclass is defined for the three uncertainty-sampling strategies: Least Confidence, Margin Sampling and Entropy Sampling. Each strategy supports both pool-based and stream-based querying.

In [0]:
class uncertainity_sampling(object) :
    """Base class for the uncertainty-sampling strategies in this file.

    Both query hooks are no-ops here; the concrete strategies define their
    own ``query_pool`` and ``query_stream``.
    """

    def __init__(self) :
        """Create the strategy object; no state is set up."""
        return None

    def query_pool(self) :
        """Pool-based query hook; does nothing in the base class."""
        return None

    def query_stream(self) :
        """Stream-based query hook; does nothing in the base class."""
        return None

class least_confidence(uncertainity_sampling) :
    """Least-confidence strategy: prefer samples whose top class probability is lowest."""

    @staticmethod
    def query_pool(conf_unlab , batch_size) :
        """Return the row indices of the ``batch_size`` least confident samples.

        ``conf_unlab`` holds one row of class probabilities per sample; rows
        are ranked by their largest probability, lowest first.
        """
        # Negate, sort ascending, negate back: each row ends up in descending order.
        descending = np.negative(np.sort(np.negative(conf_unlab), axis=1))
        top_confidence = descending[:, 0]
        ranking = np.argsort(top_confidence)
        return ranking[:batch_size]

    @staticmethod
    def query_stream(conf_unlab) :
        """Return True when the lowest top-class probability over all rows is below 0.5."""
        top_confidence = np.negative(np.sort(np.negative(conf_unlab)))[:, 0]
        weakest = np.sort(top_confidence)[0]
        return bool(weakest < 0.5)

class margin_sampling(uncertainity_sampling) :
    """Margin strategy: prefer samples whose two best classes are closest in probability."""

    @staticmethod
    def query_pool(conf_unlab , batch_size) :
        """Return the row indices of the ``batch_size`` samples with the smallest margin.

        The margin of a row is its largest class probability minus its
        second largest one.
        """
        # Negate, sort ascending, negate back: each row ends up in descending order.
        descending = np.negative(np.sort(np.negative(conf_unlab), axis=1))
        best, runner_up = descending[:, 0], descending[:, 1]
        ranking = np.argsort(best - runner_up)
        return ranking[:batch_size]

    @staticmethod
    def query_stream(conf_unlab) :
        """Return True when the margin of the first row is below 0.1.

        Only row 0 of ``conf_unlab`` is inspected.
        """
        descending = np.negative(np.sort(np.negative(conf_unlab)))
        gap = descending[0, 0] - descending[0, 1]
        return bool(gap < 0.1)

class  entropy_sampling(uncertainity_sampling) :
    """Entropy strategy: prefer samples whose predicted class distribution has the highest entropy."""

    @staticmethod
    def _row_entropy(conf_unlab) :
        """Return the base-2 Shannon entropy of every row of ``conf_unlab``.

        Classes with probability 0 contribute 0 (the limit of ``p * log2(p)``)
        instead of the NaN that ``0 * log2(0)`` evaluates to.
        """
        probs = np.asarray(conf_unlab, dtype=float)
        # log2(1) == 0, so substituting 1 for non-positive entries zeroes their term.
        safe = np.where(probs > 0, probs, 1.0)
        return -(probs * np.log2(safe)).sum(axis=1)

    @staticmethod
    def query_pool(conf_unlab , batch_size) :
        """Return the row indices of the ``batch_size`` highest-entropy samples.

        Rows containing an exact zero probability used to evaluate to NaN,
        which sorted to the front of the descending ranking; they are now
        ranked by their true entropy.
        """
        entropy = entropy_sampling._row_entropy(conf_unlab)
        return (np.argsort(entropy)[::-1])[:batch_size]

    @staticmethod
    def query_stream(conf_unlab ) :
        """Return True when the normalised entropy of ``conf_unlab`` exceeds 0.6.

        The entropy is summed over all entries and divided by
        ``log2(n_classes)``. An all-zero input, or fewer than two class
        columns (where the normaliser would be 0), yields False.
        """
        n_classes = conf_unlab.shape[1]
        if np.count_nonzero(conf_unlab) == 0 or n_classes < 2 :
            return False
        normalised = entropy_sampling._row_entropy(conf_unlab).sum() / np.log2(n_classes)
        return bool(normalised > 0.6)

### Normalization
Class for normalizing the data before training and testing, and for inverting that normalization, for better accuracy.

In [0]:
class Normalize():
    """Min-max scaling helper that remembers the scaler it fitted."""

    def normalize(self, X_train, X_unlab, X_test):
        """Fit a ``MinMaxScaler`` on ``X_train`` and scale all three arrays with it.

        The fitted scaler is stored on ``self.scaler`` so ``inverse`` can
        undo the transformation later.
        """
        scaler = MinMaxScaler()
        self.scaler = scaler
        scaled_train = scaler.fit_transform(X_train)
        scaled_unlab, scaled_test = (scaler.transform(part) for part in (X_unlab, X_test))
        return (scaled_train, scaled_unlab, scaled_test)

    def inverse(self, X_train, X_unlab, X_test):
        """Map the three scaled arrays back through the stored scaler.

        Requires a prior ``normalize`` call on this instance, because that
        is where ``self.scaler`` is created.
        """
        undo = self.scaler.inverse_transform
        return tuple(undo(part) for part in (X_train, X_unlab, X_test))

### Random Initial Labels
Function used to randomly select the initial set of labelled samples.

In [0]:
def random_initial_train(initial_num, X_total, y_total, seed=21):
    """Draw a reproducible random subset to serve as the initial labelled set.

    Args:
        initial_num: number of samples to draw (without replacement).
        X_total: feature array, indexed along its first axis.
        y_total: label array aligned with ``X_total``.
        seed: value passed to ``np.random.seed`` before drawing. The default
            of 21 reproduces the previously hard-coded behaviour.

    Returns:
        ``(permutation, X_train, y_train)`` where ``permutation`` holds the
        chosen row indices and ``X_train`` is flattened to two dimensions.

    Note:
        This reseeds NumPy's *global* random generator as a side effect, so
        later ``np.random`` draws in the same process continue from this seed.
    """
    np.random.seed(seed)
    permutation = np.random.choice(len(X_total), initial_num, replace=False)
    X_train = X_total[permutation]
    y_train = y_total[permutation]
    # Flatten any trailing feature dimensions so each sample is one row.
    X_train = X_train.reshape((X_train.shape[0], -1))

    return (permutation, X_train, y_train)

###Pool Based Uncertainity Sampling


In [0]:
class uncertainity_active_pool(object):
    """Pool-based active learning driven by an uncertainty-sampling strategy.

    Attributes set by ``__init__``:
        top_pooled: maximum number of samples labelled per iteration.
        model_object: name of the classifier wrapper class handed to ``TrainModel``.
        sample_selection_function: strategy class looked up by name in this
            module's globals; its ``query_pool`` is used to pick samples.
    """

    def __init__(self, top_pooled, model_object, selection_function):
        self.top_pooled = top_pooled
        self.model_object = model_object
        self.sample_selection_function = globals()[selection_function]

    def run(self, X_train_full, y_train_full, X_test, y_test,initial_samples,max_samples):
        """Run the active-learning loop until ``max_samples`` labels are used.

        Starts from ``initial_samples`` randomly chosen labelled samples and
        treats the rest of ``X_train_full`` as the unlabelled pool. Each
        iteration moves the most uncertain pool samples into the training
        set, re-fits the scaler and the model, and records test accuracy.

        Returns:
            The list of test accuracies (percent), one entry for the initial
            model plus one per iteration.
        """
        (permutation, X_train, y_train) = random_initial_train(initial_samples, X_train_full, y_train_full)
        self.queried = initial_samples

        # np.delete returns a new array, so the caller's data is left untouched.
        X_val = np.delete(X_train_full, permutation, axis=0)
        y_val = np.delete(y_train_full, permutation, axis=0)
        print ()

        normalizer = Normalize()
        X_train, X_val, X_test = normalizer.normalize(X_train, X_val, X_test)

        self.clf_model = TrainModel(self.model_object)
        (X_train, X_val, X_test) = self.clf_model.train(X_train, y_train, X_val, X_test)
        active_iteration = 1
        self.clf_model.get_test_accuracy(1, y_test)

        while self.queried < max_samples:

            active_iteration += 1

            probas_val = self.clf_model.model_object.classifier.predict_proba(X_val)

            # Never request more labels than the remaining budget allows, so a
            # batch size above 1 cannot push ``queried`` past ``max_samples``.
            batch_size = min(self.top_pooled, max_samples - self.queried)
            uncertain_samples = self.sample_selection_function.query_pool(probas_val, batch_size)
            if len(uncertain_samples) == 0:
                # Nothing was selected; looping again could not make progress.
                break

            # Back to raw feature space: the scaler is re-fitted below on the
            # enlarged training set.
            X_train, X_val, X_test = normalizer.inverse(X_train, X_val, X_test)

            X_train = np.concatenate((X_train, X_val[uncertain_samples]))
            y_train = np.concatenate((y_train, y_val[uncertain_samples]))

            X_val = np.delete(X_val, uncertain_samples, axis=0)
            y_val = np.delete(y_val, uncertain_samples, axis=0)

            normalizer = Normalize()
            X_train, X_val, X_test = normalizer.normalize(X_train, X_val, X_test)

            # Count the labels actually obtained, not the number requested.
            self.queried += len(uncertain_samples)
            (X_train, X_val, X_test) = self.clf_model.train(X_train, y_train, X_val, X_test)
            self.clf_model.get_test_accuracy(active_iteration, y_test)

        return self.clf_model.accuracies

In [0]:
def call_uncertainity_active_pool(model,sampling_method,max_samples,initial_samples):
    """Plot pool-based uncertainty sampling against a random-sampling baseline.

    Loads the digits data, holds out 20% for testing, runs
    ``uncertainity_active_pool`` with a batch size of 1, then fits a linear
    SVC on growing random subsets (raw, un-normalised features) as the
    baseline. Both accuracy curves are printed and plotted against the
    number of labelled samples.
    """
    features, labels = fetch_data_mnist()

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

    learner = uncertainity_active_pool(1, model, sampling_method)
    accuracies = learner.run(X_train, y_train, X_test, y_test, initial_samples, max_samples)

    # Random baseline: one fixed random draw of max_samples rows, used prefix by prefix.
    _, X_random, y_random = random_initial_train(max_samples, X_train, y_train)
    random_accuracies = []
    baseline = SVC(C=1, kernel='linear', probability=True)

    for n_labelled in range(initial_samples, max_samples + 1):
        baseline.fit(X_random[:n_labelled], y_random[:n_labelled])
        baseline_pred = baseline.predict(X_test)
        random_accuracies.append(accuracy_score(y_test, baseline_pred)*100)
    print("accuracies",accuracies)
    print("random_accuracies",random_accuracies)

    n_points = max_samples - initial_samples + 1
    x_axis = np.linspace(initial_samples, max_samples, num=n_points, endpoint=True)
    plt.plot(x_axis, accuracies, 'r',label='active')
    plt.plot(x_axis, random_accuracies, 'blue',label='random')
    plt.legend()
    plt.xlabel('Sample Size')
    plt.ylabel('Accuracy')
    plt.show()


###Stream Based Uncertainity Sampling

In [0]:
class uncertainity_active_stream(object):
    """Stream-based active learning driven by an uncertainty-sampling strategy.

    ``model_object`` is the name of the classifier wrapper class handed to
    ``TrainModel``; ``selection_function`` is the name of the strategy class,
    looked up in this module's globals, whose ``query_stream`` decides
    whether a candidate gets labelled.
    """

    def __init__(self, model_object, selection_function):
        self.model_object = model_object
        self.sample_selection_function = globals()[selection_function]

    def run(self, X_train_full, y_train_full, X_test, y_test,initial_samples,max_samples):
        """Label randomly drawn candidates one at a time until ``max_samples`` is reached.

        Each iteration draws one random unlabelled sample; if the strategy
        accepts it, the sample joins the training set and the scaler and
        model are re-fitted. Rejected candidates stay in the unlabelled set
        and the loop draws again, so the loop only ends once enough
        candidates have been accepted.

        Returns:
            The list of test accuracies (percent), one entry for the initial
            model plus one per accepted sample.
        """
        seed_rows, X_train, y_train = random_initial_train(initial_samples, X_train_full, y_train_full)
        self.queried = initial_samples

        X_val = np.delete(np.copy(X_train_full), seed_rows, axis=0)
        y_val = np.delete(np.copy(y_train_full), seed_rows, axis=0)
        print ()

        normalizer = Normalize()
        X_train, X_val, X_test = normalizer.normalize(X_train, X_val, X_test)

        self.clf_model = TrainModel(self.model_object)
        X_train, X_val, X_test = self.clf_model.train(X_train, y_train, X_val, X_test)
        active_iteration = 1
        self.clf_model.get_test_accuracy(1, y_test)

        while self.queried < max_samples:
            active_iteration += 1

            rand_index = np.random.choice(len(X_val))
            candidate = X_val[rand_index].reshape(1, X_val.shape[1])
            probas_val = self.clf_model.model_object.classifier.predict_proba(candidate)
            if not self.sample_selection_function.query_stream(probas_val):
                continue

            # Back to raw feature space before growing the training set; the
            # scaler is re-fitted on the enlarged set below.
            X_train, X_val, X_test = normalizer.inverse(X_train, X_val, X_test)

            accepted_row = X_val[rand_index].reshape(1, X_val.shape[1])
            X_train = np.concatenate((X_train, accepted_row))
            y_train = np.append(y_train, y_val[rand_index])

            X_val = np.delete(X_val, rand_index, axis=0)
            y_val = np.delete(y_val, rand_index, axis=0)

            normalizer = Normalize()
            X_train, X_val, X_test = normalizer.normalize(X_train, X_val, X_test)

            self.queried += 1
            X_train, X_val, X_test = self.clf_model.train(X_train, y_train, X_val, X_test)
            self.clf_model.get_test_accuracy(active_iteration, y_test)

        return self.clf_model.accuracies

In [0]:
def call_uncertainity_active_stream(model,sampling_method,max_samples,initial_samples):
    """Plot stream-based uncertainty sampling against a random-sampling baseline.

    Loads the digits data, holds out 20% for testing, runs
    ``uncertainity_active_stream``, then fits a linear SVC on growing random
    subsets (raw, un-normalised features) as the baseline. Both accuracy
    curves are plotted against the number of labelled samples.
    """
    features, labels = fetch_data_mnist()

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

    learner = uncertainity_active_stream(model, sampling_method)
    accuracies = learner.run(X_train, y_train, X_test, y_test, initial_samples, max_samples)

    # Random baseline: one fixed random draw of max_samples rows, used prefix by prefix.
    _, X_random, y_random = random_initial_train(max_samples, X_train, y_train)
    random_accuracies = []
    baseline = SVC(C=1, kernel='linear', probability=True)

    for n_labelled in range(initial_samples, max_samples + 1):
        baseline.fit(X_random[:n_labelled], y_random[:n_labelled])
        baseline_pred = baseline.predict(X_test)
        random_accuracies.append(accuracy_score(y_test, baseline_pred)*100)

    n_points = max_samples - initial_samples + 1
    x_axis = np.linspace(initial_samples, max_samples, num=n_points, endpoint=True)
    plt.plot(x_axis, accuracies, 'r',label='active')
    plt.plot(x_axis, random_accuracies, 'blue',label='random')
    plt.legend()
    plt.xlabel('Sample Size')
    plt.ylabel('Accuracy')
    plt.show()

###Vote Entropy for QBC
Function for vote entropy calculation for Query by Committee Sampling

In [0]:
def vote_entropy(svm,logistic,random_forest,X_val):
    """Return the vote entropy of the three-member committee for each sample.

    For every row of ``X_val`` the hard predictions of the three committee
    members are collected, and the entropy (natural log) of the vote
    distribution ``V(label) / committee_size`` is computed. The result is 0
    when all members agree and ``log(3)`` when all three disagree.

    The vote fractions were previously divided by a hard-coded class count
    of 10 rather than by the committee size, and the votes were stored in a
    length-10 array, which failed for labels >= 10 or non-integer labels.
    The ranking of samples, and which samples exceed 0.5, is the same as
    before; only the absolute values change.

    Args:
        svm, logistic, random_forest: objects exposing
            ``model_object.classifier.predict``.
        X_val: samples to score.

    Returns:
        A float array with one entropy value per sample.
    """
    committee = (svm, logistic, random_forest)
    votes = np.stack([
        np.asarray(member.model_object.classifier.predict(X_val))
        for member in committee
    ])
    committee_size = votes.shape[0]

    # agreement[k, i] = number of members (including k) voting like member k
    # on sample i. A label with v votes appears v times here, so averaging
    # -log(v / C) over the members equals -sum_label (v/C) * log(v/C).
    agreement = (votes[:, np.newaxis, :] == votes[np.newaxis, :, :]).sum(axis=1)
    entropy = -np.log(agreement / committee_size).mean(axis=0)

    # Adding 0.0 turns the -0.0 of unanimous votes into a plain 0.0.
    return entropy + 0.0

###KL Divergence
Function for KL Divergence calculation for Query by Committee Sampling 

In [0]:
def kld(svm,logistic,random_forest,X_val):
    """Return the mean KL divergence of each committee member from the consensus.

    For every row of ``X_val`` the class-probability vectors of the three
    members are averaged into a consensus distribution, and the KL
    divergence (natural log) of each member from that consensus is
    averaged over the members. Terms where a member assigns probability 0
    contribute 0.

    Args:
        svm, logistic, random_forest: objects exposing
            ``model_object.classifier.predict_proba``.
        X_val: samples to score.

    Returns:
        A float array with one disagreement value per sample.
    """
    member_probs = [
        np.asarray(member.model_object.classifier.predict_proba(X_val), dtype=float)
        for member in (svm, logistic, random_forest)
    ]
    consensus = (member_probs[0] + member_probs[1] + member_probs[2]) / 3

    divergence = np.zeros(consensus.shape[0])
    for probs in member_probs:
        # Only divide where the member's probability is positive (the
        # consensus is then positive too); elsewhere the ratio stays 1 so
        # that log(ratio) == 0 and the term vanishes without any 0/0 or log(0).
        ratio = np.divide(probs, consensus, out=np.ones_like(consensus), where=probs > 0)
        divergence += (probs * np.log(ratio)).sum(axis=1)

    return divergence / 3


###Pool Based Query by Committee Sampling

In [0]:
class qbc_active_pool(object):
    """Pool-based query-by-committee active learning.

    The committee consists of the SVM, logistic-regression and random-forest
    wrappers. ``dis_way`` is the name of the disagreement function, looked
    up in this module's globals and called with the three committee members
    and the unlabelled pool.
    """

    def __init__(self,dis_way):
        self.dis_way = dis_way

    def run(self, X_train_full, y_train_full, X_test, y_test,initial_samples,max_samples):
        """Run query-by-committee until ``max_samples`` labels have been used.

        Each iteration scores the whole unlabelled pool with the
        disagreement function, moves the single highest-scoring sample into
        the training set, re-fits the scaler and all three committee
        members, and records their test accuracies.

        Returns:
            The SVM's list of test accuracies (percent). The accuracy
            histories of all three members are printed before returning.
        """
        (permutation, X_train, y_train) = random_initial_train(initial_samples, X_train_full, y_train_full)
        self.queried = initial_samples
        self.svm=TrainModel(SvmModel.__name__ )
        self.logistic=TrainModel(LogModel.__name__ )
        self.random_forest=TrainModel(RfModel.__name__ )
        committee = (self.svm, self.logistic, self.random_forest)

        # np.delete returns a new array, so the caller's data is left untouched.
        X_val = np.delete(X_train_full, permutation, axis=0)
        y_val = np.delete(y_train_full, permutation, axis=0)

        normalizer = Normalize()
        X_train, X_val, X_test = normalizer.normalize(X_train, X_val, X_test)

        for member in committee:
            (X_train, X_val, X_test) = member.train(X_train, y_train, X_val, X_test)

        active_iteration = 1
        for member in committee:
            member.get_test_accuracy(1, y_test)

        while self.queried < max_samples:

            active_iteration += 1

            # Both disagreement measures are "higher means more disagreement",
            # so the most informative sample is always the argmax.
            dis_array = globals()[self.dis_way](self.svm,self.logistic,self.random_forest,X_val)
            index_selected = np.argmax(dis_array)

            # Back to raw feature space: the scaler is re-fitted below on the
            # enlarged training set.
            X_train, X_val, X_test = normalizer.inverse(X_train, X_val, X_test)

            X_train = np.concatenate((X_train, X_val[index_selected:index_selected+1]))
            y_train = np.concatenate((y_train, y_val[index_selected:index_selected+1]))

            X_val = np.delete(X_val, index_selected, axis=0)
            y_val = np.delete(y_val, index_selected, axis=0)

            normalizer = Normalize()
            X_train, X_val, X_test = normalizer.normalize(X_train, X_val, X_test)

            self.queried += 1

            for member in committee:
                (X_train, X_val, X_test) = member.train(X_train, y_train, X_val, X_test)
            for member in committee:
                member.get_test_accuracy(active_iteration, y_test)

        print ('Intermediate accuracies SVM: ', self.svm.accuracies)
        print ('Intermediate Accuracies Logistic Regression: ', self.logistic.accuracies)
        print ('Intermediate Accuracies Random Forests: ', self.random_forest.accuracies)
        return self.svm.accuracies

In [0]:
def call_qbc_active_pool(dis_way,max_samples,initial_samples):
    """Plot pool-based query-by-committee against a random-sampling baseline.

    Loads the digits data, holds out 20% for testing, runs
    ``qbc_active_pool`` with the given disagreement measure, then fits a
    linear SVC on growing random subsets (raw, un-normalised features) as
    the baseline. Both accuracy curves are plotted against the number of
    labelled samples.
    """
    features, labels = fetch_data_mnist()

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

    learner = qbc_active_pool(dis_way)
    accuracies = learner.run(X_train, y_train, X_test, y_test, initial_samples, max_samples)

    # Random baseline: one fixed random draw of max_samples rows, used prefix by prefix.
    _, X_random, y_random = random_initial_train(max_samples, X_train, y_train)
    random_accuracies = []
    baseline = SVC(C=1, kernel='linear', probability=True)

    for n_labelled in range(initial_samples, max_samples + 1):
        baseline.fit(X_random[:n_labelled], y_random[:n_labelled])
        baseline_pred = baseline.predict(X_test)
        random_accuracies.append(accuracy_score(y_test, baseline_pred)*100)

    n_points = max_samples - initial_samples + 1
    x_axis = np.linspace(initial_samples, max_samples, num=n_points, endpoint=True)
    plt.plot(x_axis, accuracies, 'r',label='active')
    plt.plot(x_axis, random_accuracies, 'blue',label='random')
    plt.legend()
    plt.xlabel('Sample Size')
    plt.ylabel('Accuracy')
    plt.show()

###Stream Based Query by Committee Sampling

In [0]:
class qbc_active_stream(object):
    """Stream-based query-by-committee active learning.

    The committee consists of the SVM, logistic-regression and random-forest
    wrappers. ``dis_way`` is the name of the disagreement function, looked
    up in this module's globals; a candidate is labelled when its
    disagreement exceeds the threshold registered for that measure.
    """

    # Disagreement a candidate must exceed to be labelled, per measure.
    _STREAM_THRESHOLDS = {'kld': 0.04, 'vote_entropy': 0.5}

    def __init__(self,dis_way):
        self.dis_way = dis_way

    def run(self, X_train_full, y_train_full, X_test, y_test,initial_samples,max_samples):
        """Label randomly drawn candidates until ``max_samples`` is reached.

        Each iteration draws one random unlabelled sample and scores it
        with the disagreement function. Accepted samples join the training
        set, after which the scaler and all three committee members are
        re-fitted. Rejected candidates stay in the unlabelled set.

        Returns:
            The SVM's list of test accuracies (percent). The accuracy
            histories of all three members are printed before returning.

        Raises:
            KeyError: if ``dis_way`` names nothing in this module's globals.
            ValueError: if ``dis_way`` has no acceptance threshold. The
                accept flag used to be left unassigned in that case.
        """
        disagreement = globals()[self.dis_way]
        if self.dis_way not in self._STREAM_THRESHOLDS:
            raise ValueError(
                "no stream threshold defined for disagreement measure %r" % (self.dis_way,)
            )
        threshold = self._STREAM_THRESHOLDS[self.dis_way]

        (permutation, X_train, y_train) = random_initial_train(initial_samples, X_train_full, y_train_full)
        self.queried = initial_samples
        self.samplecount = [initial_samples]
        self.svm=TrainModel(SvmModel.__name__ )
        self.logistic=TrainModel(LogModel.__name__ )
        self.random_forest=TrainModel(RfModel.__name__ )
        committee = (self.svm, self.logistic, self.random_forest)

        # np.delete returns a new array, so the caller's data is left untouched.
        X_val = np.delete(X_train_full, permutation, axis=0)
        y_val = np.delete(y_train_full, permutation, axis=0)

        normalizer = Normalize()
        X_train, X_val, X_test = normalizer.normalize(X_train, X_val, X_test)

        for member in committee:
            (X_train, X_val, X_test) = member.train(X_train, y_train, X_val, X_test)

        active_iteration = 1
        for member in committee:
            member.get_test_accuracy(1, y_test)

        while self.queried < max_samples:

            active_iteration += 1

            rand_index = np.random.choice(X_val.shape[0])
            candidate = X_val[rand_index].reshape(1,X_val.shape[1])
            dis_array = disagreement(self.svm,self.logistic,self.random_forest,candidate)

            if not dis_array[0] > threshold:
                continue

            # Back to raw feature space: the scaler is re-fitted below on the
            # enlarged training set.
            X_train, X_val, X_test = normalizer.inverse(X_train, X_val, X_test)

            X_train = np.concatenate((X_train, X_val[rand_index].reshape(1,X_val.shape[1])))
            y_train = np.append(y_train, y_val[rand_index])

            X_val = np.delete(X_val, rand_index, axis=0)
            y_val = np.delete(y_val, rand_index, axis=0)

            normalizer = Normalize()
            X_train, X_val, X_test = normalizer.normalize(X_train, X_val, X_test)

            self.queried += 1

            for member in committee:
                (X_train, X_val, X_test) = member.train(X_train, y_train, X_val, X_test)
            for member in committee:
                member.get_test_accuracy(active_iteration, y_test)

        print ('Intermediate accuracies SVM: ', self.svm.accuracies)
        print ('Intermediate Accuracies Logistic Regression: ', self.logistic.accuracies)
        print ('Intermediate Accuracies Random Forests: ', self.random_forest.accuracies)

        return self.svm.accuracies

In [0]:
def call_qbc_active_stream(dis_way,max_samples,initial_samples):
    """Plot stream-based query-by-committee against a random-sampling baseline.

    Loads the digits data, holds out 20% for testing, runs
    ``qbc_active_stream`` with the given disagreement measure, then fits a
    linear SVC on growing random subsets (raw, un-normalised features) as
    the baseline. Both accuracy curves are plotted against the number of
    labelled samples.
    """
    features, labels = fetch_data_mnist()

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

    learner = qbc_active_stream(dis_way)
    accuracies = learner.run(X_train, y_train, X_test, y_test, initial_samples, max_samples)

    # Random baseline: one fixed random draw of max_samples rows, used prefix by prefix.
    _, X_random, y_random = random_initial_train(max_samples, X_train, y_train)
    random_accuracies = []
    baseline = SVC(C=1, kernel='linear', probability=True)

    for n_labelled in range(initial_samples, max_samples + 1):
        baseline.fit(X_random[:n_labelled], y_random[:n_labelled])
        baseline_pred = baseline.predict(X_test)
        random_accuracies.append(accuracy_score(y_test, baseline_pred)*100)

    n_points = max_samples - initial_samples + 1
    x_axis = np.linspace(initial_samples, max_samples, num=n_points, endpoint=True)
    plt.plot(x_axis, accuracies, 'r',label='active')
    plt.plot(x_axis, random_accuracies, 'blue',label='random')
    plt.legend()
    plt.xlabel('Sample Size')
    plt.ylabel('Accuracy')
    plt.show()

###Pool Based Diversity Sampling

In [0]:
class diversity_sampling_pool(object):
    """Pool-based diversity sampling.

    ``model_object`` is the name of the classifier wrapper class handed to
    ``TrainModel``.
    """

    def __init__(self, model_object):
        self.model_object = model_object

    def run(self, X_train_full, y_train_full, X_test, y_test,initial_samples,max_samples):
        """Grow the labelled set with the pool sample farthest from it.

        The initial labelled set holds one randomly chosen member of each of
        ``initial_samples`` k-means clusters. Each iteration then labels the
        unlabelled sample whose distance to its nearest labelled sample (in
        normalised feature space) is largest, re-fits the scaler and the
        model, records test accuracy and prints that distance.

        Returns:
            The list of test accuracies (percent), one entry for the initial
            model plus one per iteration.
        """
        self.queried = initial_samples
        n_clusters = initial_samples
        kmeans = KMeans(n_clusters, random_state=0).fit(X_train_full)

        # One random representative per cluster forms the initial labelled set.
        np.random.seed(21)
        representatives = []
        for cluster_id in range(n_clusters):
            members = np.flatnonzero(kmeans.labels_ == cluster_id)
            representatives.append(np.random.choice(members))
        initial_indices = np.array(representatives, dtype=int)

        X_train = X_train_full[initial_indices]
        y_train = y_train_full[initial_indices]
        X_val = np.delete(np.copy(X_train_full), initial_indices, axis=0)
        y_val = np.delete(np.copy(y_train_full), initial_indices, axis=0)

        normalizer = Normalize()
        X_train, X_val, X_test = normalizer.normalize(X_train, X_val, X_test)

        self.clf_model = TrainModel(self.model_object)
        X_train, X_val, X_test = self.clf_model.train(X_train, y_train, X_val, X_test)
        active_iteration = 1
        self.clf_model.get_test_accuracy(1, y_test)

        while self.queried < max_samples:
            active_iteration += 1

            # Distance from each pool sample to its nearest labelled sample.
            nearest_labelled = euclidean_distances(X_val, X_train).min(axis=1)
            selected_index = np.argmax(nearest_labelled)
            value = nearest_labelled[selected_index]

            # Back to raw feature space: the scaler is re-fitted below on the
            # enlarged training set.
            X_train, X_val, X_test = normalizer.inverse(X_train, X_val, X_test)

            chosen = slice(selected_index, selected_index + 1)
            X_train = np.concatenate((X_train, X_val[chosen]))
            y_train = np.concatenate((y_train, y_val[chosen]))

            X_val = np.delete(X_val, selected_index, axis=0)
            y_val = np.delete(y_val, selected_index, axis=0)

            normalizer = Normalize()
            X_train, X_val, X_test = normalizer.normalize(X_train, X_val, X_test)

            self.queried += 1
            X_train, X_val, X_test = self.clf_model.train(X_train, y_train, X_val, X_test)
            self.clf_model.get_test_accuracy(active_iteration, y_test)
            print(value)

        return self.clf_model.accuracies

In [0]:
def call_diversity_sampling_pool(model,max_samples,initial_samples):
    """Plot pool-based diversity sampling against a random-sampling baseline.

    Loads the digits data, holds out 20% for testing, runs
    ``diversity_sampling_pool``, then fits a linear SVC on growing random
    subsets (raw, un-normalised features) as the baseline. Both accuracy
    curves are printed and plotted against the number of labelled samples.
    """
    features, labels = fetch_data_mnist()

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

    learner = diversity_sampling_pool(model)
    accuracies = learner.run(X_train, y_train, X_test, y_test, initial_samples, max_samples)

    # Random baseline: one fixed random draw of max_samples rows, used prefix by prefix.
    _, X_random, y_random = random_initial_train(max_samples, X_train, y_train)
    random_accuracies = []
    baseline = SVC(C=1, kernel='linear', probability=True)

    for n_labelled in range(initial_samples, max_samples + 1):
        baseline.fit(X_random[:n_labelled], y_random[:n_labelled])
        baseline_pred = baseline.predict(X_test)
        random_accuracies.append(accuracy_score(y_test, baseline_pred)*100)
    print("accuracies",accuracies)
    print("random_accuracies",random_accuracies)

    n_points = max_samples - initial_samples + 1
    x_axis = np.linspace(initial_samples, max_samples, num=n_points, endpoint=True)
    plt.plot(x_axis, accuracies, 'r',label='active')
    plt.plot(x_axis, random_accuracies, 'blue',label='random')
    plt.legend()
    plt.xlabel('Sample Size')
    plt.ylabel('Accuracy')
    plt.show()

###Stream Based Diversity Sampling

In [0]:
class diversity_sampling_stream(object):
    """Stream-based diversity sampling.

    ``model_object`` is the name of the classifier wrapper class handed to
    ``TrainModel``.
    """

    def __init__(self, model_object):
        self.model_object = model_object

    def run(self, X_train_full, y_train_full, X_test, y_test,initial_samples,max_samples):
        """Label random candidates that lie far enough from the labelled set.

        The initial labelled set holds one randomly chosen member of each of
        ``initial_samples`` k-means clusters. Each iteration then draws one
        random unlabelled sample and labels it when its distance to the
        nearest labelled sample (in normalised feature space) is at least 1.
        Rejected candidates stay in the unlabelled set.

        Returns:
            The list of test accuracies (percent), one entry for the initial
            model plus one per accepted sample.

        Note:
            The loop ends only once enough candidates have been accepted, so
            it does not terminate if no remaining sample meets the distance
            threshold.
        """
        self.queried = initial_samples
        n_clusters=initial_samples
        kmeans = KMeans(n_clusters, random_state=0).fit(X_train_full)

        # Seed once so that both the cluster representatives and the
        # candidate stream below are reproducible.
        np.random.seed(21)
        representatives = []
        for cluster_id in range(n_clusters):
            members = np.flatnonzero(kmeans.labels_ == cluster_id)
            representatives.append(np.random.choice(members))
        initial_indices = np.array(representatives, dtype=int)

        X_train = X_train_full[initial_indices]
        y_train = y_train_full[initial_indices]
        # np.delete returns a new array, so the caller's data is left untouched.
        X_val = np.delete(X_train_full, initial_indices, axis=0)
        y_val = np.delete(y_train_full, initial_indices, axis=0)

        normalizer = Normalize()
        X_train, X_val, X_test = normalizer.normalize(X_train, X_val, X_test)

        self.clf_model = TrainModel(self.model_object)
        (X_train, X_val, X_test) = self.clf_model.train(X_train, y_train, X_val, X_test)
        active_iteration = 1
        self.clf_model.get_test_accuracy(1, y_test)

        while self.queried < max_samples:

            active_iteration += 1

            # The generator must NOT be reseeded here. Reseeding on every
            # iteration made each draw return the same index, so a rejected
            # candidate was redrawn forever.
            selected_index = np.random.choice(X_val.shape[0])
            candidate = X_val[selected_index:selected_index+1]
            dist = np.min(euclidean_distances(candidate, X_train))
            if dist < 1:
                continue

            # Back to raw feature space: the scaler is re-fitted below on the
            # enlarged training set.
            X_train, X_val, X_test = normalizer.inverse(X_train, X_val, X_test)

            X_train = np.concatenate((X_train, X_val[selected_index:selected_index+1]))
            y_train = np.concatenate((y_train, y_val[selected_index:selected_index+1]))

            X_val = np.delete(X_val, selected_index, axis=0)
            y_val = np.delete(y_val, selected_index, axis=0)

            normalizer = Normalize()
            X_train, X_val, X_test = normalizer.normalize(X_train, X_val, X_test)

            self.queried += 1
            (X_train, X_val, X_test) = self.clf_model.train(X_train, y_train, X_val, X_test)
            self.clf_model.get_test_accuracy(active_iteration, y_test)

        return self.clf_model.accuracies

In [0]:
def call_diversity_sampling_stream(model,max_samples,initial_samples):
    """Plot stream-based diversity sampling against a random-sampling baseline.

    The digits data returned by ``fetch_data_mnist`` is split 80/20 into train
    and test parts. ``diversity_sampling_stream(model).run(...)`` produces the
    active-learning accuracies; the baseline refits a linear SVC on a growing
    prefix of the samples returned by ``random_initial_train`` and scores it
    on the test part. Both accuracy lists are printed and plotted against the
    number of labelled samples.

    Parameters
    ----------
    model : passed through unchanged to ``diversity_sampling_stream``.
    max_samples : int
        Largest labelled-set size, used for both curves.
    initial_samples : int
        Smallest labelled-set size, used for both curves.
    """
    from sklearn.model_selection import train_test_split

    features, labels = fetch_data_mnist()
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42)

    # The active learner runs first so the order of random-number use is kept.
    sampler = diversity_sampling_stream(model)
    accuracies = sampler.run(
        X_train, y_train, X_test, y_test, initial_samples, max_samples)

    # Baseline pool; the first returned value (the permutation) is not needed.
    _, X_random, y_random = random_initial_train(max_samples, X_train, y_train)
    baseline_clf = SVC(C=1, kernel='linear', probability=True)

    random_accuracies = []
    # One refit per labelled-set size, from initial_samples up to max_samples.
    for n_labelled in range(initial_samples, max_samples + 1):
        baseline_clf.fit(X_random[:n_labelled], y_random[:n_labelled])
        baseline_pred = baseline_clf.predict(X_test)
        random_accuracies.append(accuracy_score(y_test, baseline_pred)*100)

    print("accuracies",accuracies)
    print("random_accuracies",random_accuracies)

    n_points = max_samples - initial_samples + 1
    sample_sizes = np.linspace(initial_samples, max_samples, num=n_points, endpoint=True)
    plt.plot(sample_sizes, accuracies, 'r', label='active')
    plt.plot(sample_sizes, random_accuracies, 'blue', label='random')
    plt.legend()
    plt.xlabel('Sample Size')
    plt.ylabel('Accuracy')
    plt.show()

### Cluster Strategy without Retention of Initial Labels

In [0]:
def cluster_strategy(limited_budget):
    """Label the training set by cluster majority vote and compare accuracies.

    The digits data from ``fetch_data_mnist`` is split 80/20, the training
    part is clustered with KMeans (20 clusters), and a few points are queried
    from every cluster. Each cluster, including its queried points, is then
    labelled with the most frequent true label among the points queried from
    it. Two linear SVCs are compared on the test part: one trained on the
    whole training set with these propagated labels, one trained only on the
    queried points with their true labels. Both accuracies are printed.

    Parameters
    ----------
    limited_budget : int
        Target number of points to query. Each cluster receives
        ``ceil(cluster_size * limited_budget / n_train)`` queries.

    NOTE: because the per-cluster share is rounded up, the total number of
    queried points can exceed ``limited_budget`` by up to ``n_clusters - 1``.
    """
    (X, y) = fetch_data_mnist()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    n_clusters = 20
    cluster_labels = KMeans(n_clusters, random_state=0).fit(X_train).labels_

    queried_per_cluster = []
    y_train_new = np.zeros(y_train.shape[0])

    for i in range(n_clusters):
        # Training-set row indices belonging to cluster i, in ascending order.
        cluster_current = np.flatnonzero(cluster_labels == i)
        # Proportional share of the budget, rounded up.
        select_limit = int(np.ceil((len(cluster_current) * limited_budget) / X_train.shape[0]))
        # Re-seeding per cluster keeps each cluster's draw reproducible on its own.
        np.random.seed(33)
        selected = np.random.choice(cluster_current, select_limit, replace=False)
        queried_per_cluster.append(selected)
        # Majority true label of the queried points labels the whole cluster.
        y_train_new[cluster_current] = np.bincount(y_train[selected]).argmax()

    queried_indices = np.concatenate(queried_per_cluster).astype(int)

    classifier = SVC(C=1, kernel='linear', probability=True)
    classifier.fit(X_train, y_train_new)
    accuracy_with = accuracy_score(y_test, classifier.predict(X_test))

    # Baseline: the same classifier refitted on the queried points only.
    classifier.fit(X_train[queried_indices], y_train[queried_indices])
    accuracy_without = accuracy_score(y_test, classifier.predict(X_test))

    print("Accuracy with only limited labelled points: ",accuracy_without)
    print("Accuracy after labelling using clustering: ",accuracy_with)

### Cluster Strategy with Retention of Initial Labels

In [0]:
def cluster_strategy_2(limited_budget):
    """Cluster-majority labelling that keeps the true labels of queried points.

    The digits data from ``fetch_data_mnist`` is split 80/20, the training
    part is clustered with KMeans (20 clusters), and a few points are queried
    from every cluster. Each cluster is labelled with the most frequent true
    label among the points queried from it, after which the queried points
    get their own true labels back. Two linear SVCs are compared on the test
    part: one trained on the whole training set with these labels, one
    trained only on the queried points. Both accuracies are printed.

    Parameters
    ----------
    limited_budget : int
        Target number of points to query. Each cluster receives
        ``ceil(cluster_size * limited_budget / n_train)`` queries.

    NOTE: because the per-cluster share is rounded up, the total number of
    queried points can exceed ``limited_budget`` by up to ``n_clusters - 1``.
    """
    (X, y) = fetch_data_mnist()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    n_clusters = 20
    cluster_labels = KMeans(n_clusters, random_state=0).fit(X_train).labels_

    queried_per_cluster = []
    y_train_new = np.zeros(y_train.shape[0])

    for i in range(n_clusters):
        # Training-set row indices belonging to cluster i, in ascending order.
        cluster_current = np.flatnonzero(cluster_labels == i)
        # Proportional share of the budget, rounded up.
        select_limit = int(np.ceil((len(cluster_current) * limited_budget) / X_train.shape[0]))
        # Re-seeding per cluster keeps each cluster's draw reproducible on its own.
        np.random.seed(33)
        selected = np.random.choice(cluster_current, select_limit, replace=False)
        queried_per_cluster.append(selected)
        # Majority true label of the queried points labels the whole cluster ...
        y_train_new[cluster_current] = np.bincount(y_train[selected]).argmax()
        # ... but the queried points themselves retain their true labels.
        y_train_new[selected] = y_train[selected]

    queried_indices = np.concatenate(queried_per_cluster).astype(int)

    classifier = SVC(C=1, kernel='linear', probability=True)
    classifier.fit(X_train, y_train_new)
    accuracy_with = accuracy_score(y_test, classifier.predict(X_test))

    # Baseline: the same classifier refitted on the queried points only.
    classifier.fit(X_train[queried_indices], y_train[queried_indices])
    accuracy_without = accuracy_score(y_test, classifier.predict(X_test))

    print("Accuracy with only limited labelled points: ",accuracy_without)
    print("Accuracy after labelling using clustering: ",accuracy_with)

### Main Menu

In [0]:
def _read_menu_choice(prompt):
    """Show ``prompt`` and return the reply as an int, or None if it is not an integer."""
    try:
        return int(input(prompt))
    except ValueError:
        return None


def start():
    """Run the interactive text menu for the active-learning experiments.

    The menu loops until an exit option is chosen. At the top level, "0"
    exits and anything that is not 1-4 (including non-numeric text) prints
    an error and shows the menu again. The cluster-analysis sub-menu
    validates its two choices the same way, so an invalid reply returns to
    the main menu instead of silently running an experiment. In the
    remaining sub-menus any reply outside the listed options, including "0"
    and non-numeric text, leaves the menu.

    Every experiment is launched with the fixed arguments ``50, 20``.
    """
    while True:
        print('Active Learning Machine Learning Assignment 2')
        print()
        print('Please choose the Query Strategy Framework or select 4 for Cluster Analysis :')
        print('1.Uncertainty Sampling')
        print('2.Query by Committee')
        print('3.Diversity Sampling')
        print('4.Cluster Analysis')
        print('0.Exit')
        a = _read_menu_choice('Enter the number: ')

        if a == 0:
            break

        if a not in (1, 2, 3, 4):
            print('Please enter a valid number!')
            continue

        if a == 4:
            print()
            print('Please choose from the following:')
            print('1.Cluster Analysis with Retention of initial labels')
            print('2.Cluster Analysis without Retention')
            print('0.Exit')
            b = _read_menu_choice('Enter the number:')

            if b == 0:
                break
            if b not in (1, 2):
                print('Please enter a valid number!')
                continue

            print()
            print('Please choose from budget options:')
            print('1.Low Budget(30 samples)')
            print('2.High Budget(100 samples)')
            c = _read_menu_choice('Enter the number:')
            if c not in (1, 2):
                print('Please enter a valid number!')
                continue

            budget = 30 if c == 1 else 100
            if b == 1:
                cluster_strategy_2(budget)
            else:
                cluster_strategy(budget)
            continue

        print()
        print('Please choose the type of sampling:')
        print('1.Pool-based')
        print('2.Stream-based')
        print('0.Exit')
        b = _read_menu_choice('Enter the number: ')

        # Anything but the two sampling types (including "0") leaves the menu.
        if b not in (1, 2):
            break

        if a == 1:
            print()
            print('Please choose the Uncertainty Measure:')
            print('1.Margin')
            print('2.Entropy')
            print('3.Least Confidence')
            print('0.exit')
            c = _read_menu_choice('Enter the number: ')

            if c not in (1, 2, 3):
                break

            # Built only here so the measures are looked up only when needed.
            measure = {1: margin_sampling, 2: entropy_sampling, 3: least_confidence}[c]
            if b == 1:
                call_uncertainity_active_pool(SvmModel.__name__, measure.__name__, 50, 20)
            else:
                call_uncertainity_active_stream(SvmModel.__name__, measure.__name__, 50, 20)

        elif a == 2:
            print()
            print('Please choose the Disagreement Measure:')
            print('1.Vote Entropy')
            print('2.KL Divergence')
            print('0.exit')
            c = _read_menu_choice('Enter the number: ')

            if c not in (1, 2):
                break

            disagreement = 'vote_entropy' if c == 1 else 'kld'
            if b == 1:
                call_qbc_active_pool(disagreement, 50, 20)
            else:
                call_qbc_active_stream(disagreement, 50, 20)

        else:
            # a == 3: diversity sampling has no further sub-menu.
            if b == 1:
                call_diversity_sampling_pool(SvmModel.__name__, 50, 20)
            else:
                call_diversity_sampling_stream(SvmModel.__name__, 50, 20)

### Run this cell for the Main Menu

In [0]:
# Launch the interactive menu; start() keeps looping until an exit option is chosen.
start()