In [42]:
import forestci as fci

In [43]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import time
import json
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from scipy import stats
from pylab import rcParams
from sklearn.utils import check_random_state
from sklearn.datasets import load_digits
from sklearn.datasets import fetch_mldata
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, \
    GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

max_queried = 3000

In [44]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [45]:
def split(train_size):
    X_train_full = X[:train_size]
    y_train_full = y[:train_size]
    X_test = X[train_size:]
    y_test = y[train_size:]
    return (X_train_full, y_train_full, X_test, y_test)


In [46]:
#!pip install forestci


In [47]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import sklearn.model_selection as xval
from sklearn.datasets.mldata import fetch_mldata
import forestci as fci

class BaseModel(object):

    def __init__(self):
        pass

    def fit_predict(self):
        pass

class LogModel(BaseModel):

    model_type = 'Multinominal Logistic Regression' 
    
    def fit_predict(self, X_train, y_train, X_val, X_test, c_weight):
        print ('training multinomial logistic regression')
        train_samples = X_train.shape[0]
        self.classifier = LogisticRegression(
            C=50. / train_samples,
            multi_class='multinomial',
            penalty='l1',
            solver='saga',
            tol=0.1,
            class_weight=c_weight,
            )
        self.classifier.fit(X_train, y_train)
        self.test_y_predicted = self.classifier.predict(X_test)
        self.val_y_predicted = self.classifier.predict(X_val)
        return (X_train, X_val, X_test, self.val_y_predicted,
                self.test_y_predicted)

class RfModel(BaseModel):

    model_type = 'Random Forest'
    
    def fit_predict(self, X_train, y_train, X_val, X_test, c_weight):
        print ('training random forest...')
        self.predictor = RandomForestRegressor(n_estimators=500, criterion='mse')
        self.predictor.fit(X_train, y_train)
        self.test_y_predicted = self.predictor.predict(X_test)
        self.val_y_predicted = self.predictor.predict(X_val)
        return (X_train, X_val, X_test, self.val_y_predicted, self.test_y_predicted)
    
    def score(self, y_predicted, y_test):
        print ('scoring random forest...')
        return self.predictor.score(y_predicted, y_test)
    
    def variance(self, X_train, X_test):
        print('variance of data...')
        return fci.random_forest_error(self.predictor,X_train,X_test)

In [48]:
class TrainModel:

    def __init__(self, model_object):        
        self.scores = []
        self.model_object = model_object()        

    def print_model_type(self):
        print (self.model_object.model_type)

    # we train normally and get probabilities for the validation set. i.e., we use the probabilities to select the most uncertain samples

    def train(self, X_train, y_train, X_val, X_test, c_weight):
        print ('Train set:', X_train.shape, 'y:', y_train.shape)
        print ('Val   set:', X_val.shape)
        print ('Test  set:', X_test.shape)
        t0 = time.time()
        (X_train, X_val, X_test, self.val_y_predicted,
         self.test_y_predicted) = \
            self.model_object.fit_predict(X_train, y_train, X_val, X_test, c_weight)
        self.run_time = time.time() - t0
        return (X_train, X_val, X_test)  # we return them in case we use PCA, with all the other algorithms, this is not needed.

    # we want accuracy only for the test set
    def get_test_accuracy(self, i, y_test):
        """
        Only Useful for classification models
        """
        classif_rate = np.mean(self.test_y_predicted.ravel() == y_test.ravel()) * 100
        self.scores.append(classif_rate)               
        print('--------------------------------')
        print('Iteration:',i)
        print('--------------------------------')
        print('y-test set:',y_test.shape)
        print('Example run in %.3f s' % self.run_time,'\n')
        print("Accuracy rate for %f " % (classif_rate))
        print('--------------------------------')
        
    def get_test_score(self, i, X_test, y_test):
        """
        Use for regression models
        """
        test_score = self.model_object.score(X_test, y_test) #rsquared value
        mse = mean_squared_error(y_test, self.test_y_predicted)
        self.scores.append(test_score)
        print('--------------------------------')
        print('Iteration:',i)
        print('--------------------------------')
        print('y-test set:',y_test.shape)
        print('Example run in %.3f s' % self.run_time,'\n')
        print("Score for %f " % (test_score))
        print("MSE for %f " % (mse))
        print('--------------------------------')
        
        



We create a modular selection function class representation, 'BaseSelectionFunction' is a base class for various sample selection methods. Using this architecture, you can implement new selection methods and use them in addition or instead of previous methods, for experimental purposes. Our current implementations include random-selection, entropy-selection, margin sampling-selection and minimum standard deviation-selection.

In [49]:
class BaseSelectionFunction(object):

    def __init__(self):
        pass

    def select(self):
        pass


class RandomSelection(BaseSelectionFunction):

    @staticmethod
    def select(probas_val, initial_labeled_samples):
        random_state = check_random_state(0)
        selection = np.random.choice(probas_val.shape[0], initial_labeled_samples, replace=False)

#     print('uniques chosen:',np.unique(selection).shape[0],'<= should be equal to:',initial_labeled_samples)

        return selection
class VarianceSelection(BaseSelectionFunction):
    @staticmethod
    def select(var_list, initial_labeled_samples):
        #import pdb; pdb.set_trace()
        selection = (np.argsort(var_list)[::-1])[:initial_labeled_samples]
        return selection

class EntropySelection(BaseSelectionFunction):

    @staticmethod
    def select(probas_val, initial_labeled_samples):
        #import pdb; pdb.set_trace()
        #e = (-probas_val * np.log2(probas_val)).sum(axis=1)
        e = (-probas_val * np.log2(probas_val)).sum()
        selection = (np.argsort(e)[::-1])[:initial_labeled_samples]
        return selection
      
      
class MarginSamplingSelection(BaseSelectionFunction):

    @staticmethod
    def select(probas_val, initial_labeled_samples):
        #import pdb; pdb.set_trace()
        #rev = np.sort(probas_val, axis=1)[:, ::-1]
        rev = np.sort(probas_val)[::-1]
        values = rev[:, 0] - rev[:, 1]
        selection = np.argsort(values)[:initial_labeled_samples]
        return selection


We have a class that is used to normalize using a MinMax Scaler in the range of [0,1].

In [50]:
class Normalize(object):
    
    def normalize(self, X_train, X_val, X_test):
        self.scaler = MinMaxScaler()
        X_train = self.scaler.fit_transform(X_train)
        X_val   = self.scaler.transform(X_val)
        X_test  = self.scaler.transform(X_test)
        return (X_train, X_val, X_test) 
    
    def inverse(self, X_train, X_val, X_test):
        X_train = self.scaler.inverse_transform(X_train)
        X_val   = self.scaler.inverse_transform(X_val)
        X_test  = self.scaler.inverse_transform(X_test)
        return (X_train, X_val, X_test) 

Initially we would like to get a random sampling from the unlabeled data-pool, this is done using random.choice without replacement.

In [51]:
def get_k_random_samples(initial_labeled_samples, X_train_full, 
                         y_train_full,index_test_full,index_train_full):
    random_state = check_random_state(0)
    permutation = np.random.choice(trainset_size,
                                   initial_labeled_samples,
                                   replace=False)
    print ()
    print ('initial random chosen samples', permutation.shape)#,permutation)
    #import pdb; pdb.set_trace()
    X_train = X_train_full[permutation]
    y_train = y_train_full[permutation]
    X_train = X_train.reshape((X_train.shape[0], -1))
    
    #getting index split
    index_train = index_train_full[permutation]
    
    bin_count = np.bincount(y_train.astype('int64'))
    unique = np.unique(y_train.astype('int64'))
    print (
        'initial train set:',
        X_train.shape,
        y_train.shape,
        'unique(labels):',
        bin_count,
        unique,
        )
    return (permutation, X_train, y_train, index_train)

This is the main class that initiates the active-learning process according to the algorithm described in the introduction. In short, we select 'k' random samples, train a model, select the most informative samples, remove from the validation set, query their labels and retrain using those samples until reaching the stop criteria.

In [52]:
class TheAlgorithm(object):

    scores = []

    def __init__(self, initial_labeled_samples, model_object, selection_function):
        self.initial_labeled_samples = initial_labeled_samples
        self.model_object = model_object
        self.sample_selection_function = selection_function
        self.x_train = None
        self.y_train = None
        
    def get_data(self):
        return (self.x_train,self.y_train, self.index_train)

    def run(self, X_train_full, y_train_full, X_test, y_test,index_test_full,index_train_full):

        # initialize process by applying base learner to labeled training data set to obtain Classifier

        (permutation, X_train, y_train, index_train) = \
            get_k_random_samples(self.initial_labeled_samples,
                                 X_train_full, y_train_full,index_test_full,index_train_full)
        self.queried = self.initial_labeled_samples
        self.samplecount = [self.initial_labeled_samples]

        # permutation, X_train, y_train = get_equally_k_random_samples(self.initial_labeled_samples,classes)

        # assign the val set the rest of the 'unlabelled' training data

        X_val = np.array([])
        y_val = np.array([])
        X_val = np.copy(X_train_full)
        X_val = np.delete(X_val, permutation, axis=0)
        y_val = np.copy(y_train_full)
        y_val = np.delete(y_val, permutation, axis=0)
        
        index_val = np.array([])
        index_val = np.copy(index_train_full)
        index_val = np.delete(index_val, permutation, axis=0)
        
        print ('val set:', X_val.shape, y_val.shape,index_val.shape, permutation.shape)
        print ()

        # normalize data

        normalizer = Normalize()
        X_train, X_val, X_test = normalizer.normalize(X_train, X_val, X_test)   
        
        self.current_model = TrainModel(self.model_object)
        (X_train, X_val, X_test) = self.current_model.train(X_train, y_train, X_val, X_test, 'balanced')
        active_iteration = 1
        #self.current_model.get_test_accuracy(1, y_test)
        self.current_model.get_test_score(1, X_test, y_test)

        # fpfn = self.current_model.test_y_predicted.ravel() != y_val.ravel()
        # print(fpfn)
        # self.fpfncount = []
        # self.fpfncount.append(fpfn.sum() / y_test.shape[0] * 100)

        while self.queried < max_queried:

            active_iteration += 1

            # get validation probabilities

            y_val_pred = \
                self.current_model.model_object.predictor.predict(X_val)
            print ('y_val predicted:',
                   self.current_model.val_y_predicted.shape,
                   self.current_model.val_y_predicted)
            #print ('ipc:', y_val_pred.shape, '\n',
                   #np.argmax(y_val_pred, axis=1))
            

            # select samples using a selection function

            uncertain_samples = \
                self.sample_selection_function.select(y_val_pred, self.initial_labeled_samples)

            # normalization needs to be inversed and recalculated based on the new train and test set.
 
            X_train, X_val, X_test = normalizer.inverse(X_train, X_val, X_test)   

            # get the uncertain samples from the validation set
            #import pdb; pdb.set_trace()
            print ('trainset before', X_train.shape, y_train.shape)
            X_train = np.concatenate((X_train, X_val[uncertain_samples]))
            y_train = np.concatenate((y_train, y_val[uncertain_samples]))
            index_train = np.concatenate((index_train, index_val[uncertain_samples])) 
            
            print ('trainset after', X_train.shape, y_train.shape,index_train.shape)
            self.samplecount.append(X_train.shape[0])

            bin_count = np.bincount(y_train.astype('int64'))
            unique = np.unique(y_train.astype('int64'))
            print (
                'updated train set:',
                X_train.shape,
                y_train.shape,
                'unique(labels):',
                bin_count,
                unique,
                )
            print("updated index:",
                 index_train.shape)
            X_val = np.delete(X_val, uncertain_samples, axis=0)
            y_val = np.delete(y_val, uncertain_samples, axis=0)
            
            index_val = np.delete(index_val, uncertain_samples, axis=0)
            print ('val set:', X_val.shape, y_val.shape,index_val.shape)
            print ()

            # normalize again after creating the 'new' train/test sets
            normalizer = Normalize()
            X_train, X_val, X_test = normalizer.normalize(X_train, X_val, X_test)               

            self.queried += self.initial_labeled_samples
            (X_train, X_val, X_test) = self.current_model.train(X_train, y_train, X_val, X_test, 'balanced')
            #self.current_model.get_test_accuracy(active_iteration, y_test)
            self.current_model.get_test_score(active_iteration, X_test, y_test)
            
            # Test var
        var = self.current_model.model_object.variance(X_train, X_test)
        self.x_train = X_train
        self.y_train = y_train
        self.index_train = index_train
        #Print indices used
        #print('Index values used:',
              #index_train)
                
        print ('final active learning scores',
               self.current_model.scores)
        print('Variance of X_train', var)

We download the data, split to train validation and test, we run the experiment by iterating over all of our training algorithms X all of our selection functions X all possible k's in the range of [10,25,50,125,250]. The accuracy results are kept in a dictionary and pickle-saved to a unique file as soon as the model finishes training - this is crucial when using google colaboratory as it tends to disconnect from time to time. We also limit our training to a maximum of 500 queried samples.

In [53]:
#Importing P100 Data -PICK P100 METRICS WITH V100 IPC new_df
p100_only = False
if p100_only:
    root_path = '/Users/yzamora/Desktop/ActiveLearningFrameworkTutorial/'
    df_P100 = pd.read_csv(root_path + 'p100_all_data.csv', index_col = 0)
    df_p100_ipc = df_P100.drop(columns=['shared_utilization','stall_other','single_precision_fu_utilization','architecture','input','application_name','kernelname'])
    df_p100_ipc = df_p100_ipc['ipc']
    p100_ipc_values = df_p100_ipc.values

    df_p100_normd = df_P100.drop(columns=['shared_utilization','stall_other','single_precision_fu_utilization','architecture','input','application_name','kernelname','ipc'])
    df_p100_norm = df_p100_normd.values
    df_p100_norm = MinMaxScaler().fit_transform(df_p100_norm)

    from sklearn.model_selection import train_test_split
    X_P = df_p100_norm
    Y_P = p100_ipc_values
    index = df_P100.index
    # Split the data up in train and test sets
    X_train, X_test, y_train, y_test, index_train, index_test = train_test_split(X_P, Y_P, index, test_size=0.33, random_state=42)



In [54]:
#PICK P100 METRICS WITH V100 IPC new_df
p100_v100IPC = True
if p100_v100IPC:
    root_path = '/Users/yzamora/Desktop/ActiveLearningFrameworkTutorial/'
    df_both100 = pd.read_csv('/Users/yzamora/power/nvidia_gpus/all_apps/new_df.csv', index_col = 0)
    df_both100_ipc = df_both100.drop(columns=['shared_utilization','stall_other','single_precision_fu_utilization','architecture','input','application_name','kernelname'])
    df_both100_ipc = df_both100_ipc['ipc']
    both100_ipc_values = df_both100_ipc.values

    df_both100_normd = df_both100.drop(columns=['shared_utilization','stall_other','single_precision_fu_utilization','architecture','input','application_name','kernelname','ipc'])
    df_both100_norm = df_both100_normd.values
    df_both100_norm = MinMaxScaler().fit_transform(df_both100_norm)

    from sklearn.model_selection import train_test_split
    X_both = df_both100_norm
    Y_both = both100_ipc_values
    index = df_both100.index
    # Split the data up in train and test sets
    X_train, X_test, y_train, y_test, index_train, index_test = train_test_split(X_both, Y_both, index, test_size=0.33, random_state=42)



In [58]:
print(index_train.shape[0])

13789


In [59]:
trainset_size = index_train.shape[0]

In [61]:
#(X, y) = download()
#(X_train_full, y_train_full, X_test, y_test) = split(trainset_size)

X = X_P
y = Y_P
X_train_full = X_train
y_train_full = y_train
X_test = X_test
y_test = y_test

print ('train:', X_train_full.shape, y_train_full.shape)
print ('test :', X_test.shape, y_test.shape)
classes = len(np.unique(y))
print ('unique classes', classes)

def pickle_save(fname, data):
  filehandler = open(fname,"wb")
  pickle.dump(data,filehandler)
  filehandler.close() 
  print('saved', fname, os.getcwd(), os.listdir())

def pickle_load(fname):
  print(os.getcwd(), os.listdir())
  file = open(fname,'rb')
  data = pickle.load(file)
  file.close()
  print(data)
  return data
 
def experiment(d, models, selection_functions, Ks, repeats, contfrom):
    algos_temp = []
    print ('stopping at:', max_queried)
    count = 0
    for model_object in models:
      if model_object.__name__ not in d:
          d[model_object.__name__] = {}
      
      for selection_function in selection_functions:
        if selection_function.__name__ not in d[model_object.__name__]:
            d[model_object.__name__][selection_function.__name__] = {}
        
        for k in Ks:
            d[model_object.__name__][selection_function.__name__][str(k)] = []           
            
            for i in range(0, repeats):
                count+=1
                if count >= contfrom:
                    print ('Count = %s, using model = %s, selection_function = %s, k = %s, iteration = %s.' % (count, model_object.__name__, selection_function.__name__, k, i))
                    alg = TheAlgorithm(k, 
                                       model_object, 
                                       selection_function
                                       )
                    alg.run(X_train_full, y_train_full, X_test, y_test,index_test,index_train)
                    d[model_object.__name__][selection_function.__name__][str(k)].append(alg.current_model.scores)
                    fname = 'Active-learning-experiment-' + str(count) + '.pkl'
                    pickle_save(fname, d)
                    
                    activelearn_data = alg.get_data()
                    
                    #import pdb; pdb.set_trace()
                    
                    #Creating hash for finalized shape
                    print("\n \n Finalized training set shape: ",activelearn_data[0].shape)
                    #final_shape = hash_data(pd.DataFrame(activelearn_data[0]))
                    print("\n \n Index shape:", pd.DataFrame(activelearn_data[2]).shape)
                    #Save indices to file
                    with open('indices_size'+ str(k) + '_both.txt', 'w') as f:
                        for item in activelearn_data[2]:
                            f.write("%s\n" % item)
                            
                    if count % 5 == 0:
                        print(json.dumps(d, indent=2, sort_keys=True))
                    print ()
                    print ('---------------------------- FINISHED ---------------------------')
                    print ()
    return d

#how do i find final chosen X_train data points
repeats = 1
models = [RfModel] 
#selection_functions = [RandomSelection, MarginSamplingSelection, EntropySelection]
selection_functions = [VarianceSelection]
Ks = [500] #[125,250,50,25,10] 
d = {}
stopped_at = -1 

# print('directory dump including pickle files:', os.getcwd(), np.sort(os.listdir()))  
# d = pickle_load('Active-learning-experiment-' + str(stopped_at) + '.pkl')  
# print(json.dumps(d, indent=2, sort_keys=True))

d = experiment(d, models, selection_functions, Ks, repeats, stopped_at+1)

print (d)
results = json.loads(json.dumps(d, indent=2, sort_keys=True))
print(results)


train: (13789, 112) (13789,)
test : (6793, 112) (6793,)
unique classes 30419
stopping at: 3000
Count = 1, using model = RfModel, selection_function = VarianceSelection, k = 500, iteration = 0.

initial random chosen samples (500,)
initial train set: (500, 112) (500,) unique(labels): [377 123] [0 1]
val set: (13289, 112) (13289,) (13289,) (500,)

Train set: (500, 112) y: (500,)
Val   set: (13289, 112)
Test  set: (6793, 112)
training random forest...
scoring random forest...
--------------------------------
Iteration: 1
--------------------------------
y-test set: (6793,)
Example run in 6.053 s 

Score for 0.458461 
MSE for 0.092545 
--------------------------------
y_val predicted: (13289,) [1.17949903 0.88701434 0.4988815  ... 0.48856424 0.50541279 1.28189181]
trainset before (500, 112) (500,)
trainset after (1000, 112) (1000,) (1000,)
updated train set: (1000, 112) (1000,) unique(labels): [612 388] [0 1]
updated index: (1000,)
val set: (12789, 112) (12789,) (12789,)

Train set: (1000,

In [38]:
"""
def performance_plot(fully_supervised_accuracy, dic, models, selection_functions, Ks, repeats):  
    fig, ax = plt.subplots()
    ax.plot([0,500],[fully_supervised_accuracy, fully_supervised_accuracy],label = 'algorithm-upper-bound')
    for model_object in models:
      for selection_function in selection_functions:
        for idx, k in enumerate(Ks):
            x = np.arange(float(Ks[idx]), 500 + float(Ks[idx]), float(Ks[idx]))            
            Sum = np.array(dic[model_object][selection_function][k][0])
            for i in range(1, repeats):
                Sum = Sum + np.array(dic[model_object][selection_function][k][i])
            mean = Sum / repeats
            ax.plot(x, mean ,label = model_object + '-' + selection_function + '-' + str(k))
    ax.legend()
    ax.set_xlim([50,500])
    ax.set_ylim([40,100])
    ax.grid(True)
    plt.show()

#models_str = ['SvmModel', 'RfModel', 'LogModel']
models_str = ['RfModel']#, 'LogModel']

#selection_functions_str = ['RandomSelection', 'MarginSamplingSelection', 'EntropySelection']
selection_functions_str = ['RandomSelection']#, 'EntropySelection']
Ks_str = ['250','125']#,'50','25','10'] 
repeats = 1
random_forest_upper_bound = 97
svm_upper_bound = 94.
log_upper_bound = 92.47
total_experiments = len(models_str) * len(selection_functions_str) * len(Ks_str) * repeats

print('So which is the better model? under the stopping condition and hyper parameters - random forest is the winner!')
performance_plot(random_forest_upper_bound, d, ['RfModel'] , selection_functions_str, Ks_str, 1)
#performance_plot(svm_upper_bound, d, ['SvmModel'] , selection_functions_str    , Ks_str, 1)
#performance_plot(log_upper_bound, d, ['LogModel'] , selection_functions_str    , Ks_str, 1)"""

"\ndef performance_plot(fully_supervised_accuracy, dic, models, selection_functions, Ks, repeats):  \n    fig, ax = plt.subplots()\n    ax.plot([0,500],[fully_supervised_accuracy, fully_supervised_accuracy],label = 'algorithm-upper-bound')\n    for model_object in models:\n      for selection_function in selection_functions:\n        for idx, k in enumerate(Ks):\n            x = np.arange(float(Ks[idx]), 500 + float(Ks[idx]), float(Ks[idx]))            \n            Sum = np.array(dic[model_object][selection_function][k][0])\n            for i in range(1, repeats):\n                Sum = Sum + np.array(dic[model_object][selection_function][k][i])\n            mean = Sum / repeats\n            ax.plot(x, mean ,label = model_object + '-' + selection_function + '-' + str(k))\n    ax.legend()\n    ax.set_xlim([50,500])\n    ax.set_ylim([40,100])\n    ax.grid(True)\n    plt.show()\n\n#models_str = ['SvmModel', 'RfModel', 'LogModel']\nmodels_str = ['RfModel']#, 'LogModel']\n\n#selection_func

In [None]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))