## Active Learning on  3 Class Problem

In [2]:
import sys, os, imp

In [3]:
imp.load_source('activelearn', '../activelearn.py')

<module 'activelearn' from '../activelearn.pyc'>

In [4]:
from activelearn import *

In [115]:
import numpy as np
from keras.preprocessing import sequence
from keras.preprocessing import text

class loader(object):
    def __init__(self, init_seed, maxlen, nb_words, skip_top, test_split):
        self.start_char = 1
        self.oov_char = 2
        self.index_from = 3
        
        label_type = '/label.4class.' # '/rating.'
        
        data_dir = "datasets/scale_data/scaledata/"
        files = ["Dennis+Schwartz", "James+Berardinelli", "Scott+Renshaw", "Steve+Rhodes"]
        texts, ratings = [], []
        for file in files:
            with open(data_dir + file + "/subj." + file, "r") as f:
                texts += list(f)
            with open(data_dir + file + label_type + file, "r") as f:
                ratings += list(f)
        tokenizer = text.Tokenizer(filters='')
        tokenizer.fit_on_texts(texts)
        X = tokenizer.texts_to_sequences(texts)
        Y = [float(rating) for rating in ratings]

        # Shuffle data:
        np.random.seed(init_seed)
        np.random.shuffle(X)
        np.random.seed(init_seed)
        np.random.shuffle(Y)

        # Parse data
        X = [[self.start_char] + [w + self.index_from for w in x] for x in X]

        new_X = []
        new_Y = []
        for x, y in zip(X, Y):
            for i in xrange(0, len(x), maxlen):
                new_X.append(x[i:i+maxlen])
                new_Y.append(y)
        X = new_X
        Y = new_Y

        # by convention, use 2 as OOV word
        # reserve 'index_from' (=3 by default) characters: 0 (padding), 1 (start), 2 (OOV)
        X = [[self.oov_char if (w >= nb_words or w < skip_top) else w for w in x] for x in X]

        self.X_train = X[:int(len(X)*(1-test_split))]
        self.Y_train = Y[:int(len(X)*(1-test_split))]
        self.mean_y_train = np.mean(self.Y_train)
        self.std_y_train = np.std(self.Y_train)
        #self.Y_train = [(y - self.mean_y_train) / self.std_y_train for y in self.Y_train]

        self.X_test = X[int(len(X)*(1-test_split)):]
        self.Y_test = Y[int(len(X)*(1-test_split)):]

        print(len(self.X_train), 'train sequences')
        print(len(self.X_test), 'test sequences')

        print("Pad sequences (samples x time)")
        self.X_train = sequence.pad_sequences(self.X_train, maxlen=maxlen)
        self.X_test = sequence.pad_sequences(self.X_test, maxlen=maxlen)
        print('X_train shape:', self.X_train.shape)
        print('X_test shape:', self.X_test.shape)

In [116]:
p_W, p_U, p_dense, p_emb, weight_decay, batch_size, maxlen = ["0.25", "0.25", "0.25", "0.25", "1e-4", "128", "200"]
batch_size = int(batch_size)
maxlen = int(maxlen)
#folder = "/scratch/home/Projects/rnn_dropout/exps/"

# Global params:
nb_words = 20000
skip_top = 0
test_split = 0.2
init_seed = 1
global_seed = 0

num_classes = 4

In [117]:
# Load data:
print("Loading data...")
dataset = loader(init_seed, maxlen, nb_words, skip_top, test_split)

Loading data...
(10620, 'train sequences')
(2655, 'test sequences')
Pad sequences (samples x time)
('X_train shape:', (10620, 200))
('X_test shape:', (2655, 200))


In [118]:
X_train, X_test, Y_train, Y_test = dataset.X_train, dataset.X_test, dataset.Y_train, dataset.Y_test

X_train = np.asarray(X_train)
X_test  = np.asarray(X_test)
Y_train = np.asarray(Y_train)
Y_test  = np.asarray(Y_test)

In [122]:
np.sum(Y_train == 0.0)

1337

In [123]:
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU, SimpleRNN
from keras.regularizers import l2

In [124]:
Y_train_1hot = np.zeros([Y_train.shape[0], num_classes])
Y_train_1hot[Y_train==0] = [1,0,0,0]
Y_train_1hot[Y_train==1] = [0,1,0,0]
Y_train_1hot[Y_train==2] = [0,0,1,0]
Y_train_1hot[Y_train==3] = [0,0,0,1]


Y_test_1hot = np.zeros([Y_test.shape[0], num_classes])
Y_test_1hot[Y_test==0] = [1,0,0,0]
Y_test_1hot[Y_test==1] = [0,1,0,0]
Y_test_1hot[Y_test==2] = [0,0,1,0]
Y_test_1hot[Y_test==3] = [0,0,1,0]

In [132]:
# Build model:
print('Build model...')
model = Sequential()
model.add(Embedding(nb_words + dataset.index_from, 100, embeddings_regularizer=l2(weight_decay), 
                    input_length=maxlen)) # batch_input_shape=(batch_size, maxlen)

model.add(SpatialDropout1D(p_emb))

#model.add(LSTM(128, W_regularizer=l2(weight_decay), U_regularizer=l2(weight_decay),
#               b_regularizer=l2(weight_decay), dropout_W=p_W, dropout_U=p_U))

model.add(LSTM(128, dropout=p_U, recurrent_dropout=p_W))

model.add(Dropout(p_dense))
#model.add(Dense(4, W_regularizer=l2(weight_decay), b_regularizer=l2(weight_decay)))
model.add(Dense(4, activation='sigmoid'))

#optimiser = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=False)
optimiser = 'adam'
model.compile(loss='mean_squared_error', optimizer=optimiser)

# Potentially load weights
# model.load_weights("path")

Build model...


In [133]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 200, 100)          2000300   
_________________________________________________________________
spatial_dropout1d_5 (Spatial (None, 200, 100)          0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dropout_5 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 4)                 516       
Total params: 2,118,064
Trainable params: 2,118,064
Non-trainable params: 0
_________________________________________________________________


In [134]:
tensorflow_train_size = batch_size * (len(X_train) / batch_size)
model.fit(X_train[:tensorflow_train_size], np.array(Y_train_1hot[:tensorflow_train_size]),
           batch_size=batch_size, epochs=15)#, callbacks=[modeltest_1, modeltest_2])

# Potentially save weights
# model.save_weights("path", overwrite=True)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7fd9a8293050>

In [135]:
# Evaluate model
# Dropout approximation for training data:
train_prob = model.predict(X_train, batch_size=128, verbose=1)
np.sum(np.argmax(train_prob, axis=1) == Y_train)*1.0/Y_train.shape[0]



0.83502824858757063

In [136]:
# Dropout approximation for test data:
test_prob = model.predict(X_test, batch_size=500, verbose=1)
np.sum(np.argmax(test_prob, axis=1) == Y_test)*1.0/Y_test.shape[0]



0.43427495291902074

## Keras IMBD Dataset

In [137]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb

In [138]:
max_features = 20000
maxlen = 80  # cut texts after this number of words (among top max_features most common words)
batch_size = 32

In [139]:
print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

Loading data...
(25000, 'train sequences')
(25000, 'test sequences')


In [354]:
X_train = x_train[:500]
Y_train = y_train[:500]

X_test = x_test[:5000]
Y_test = y_test[:5000]

# Pool is the unsupervised dataset
x_pool = x_train[10000:20000]
y_pool = y_train[10000:20000]

In [356]:
print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('x_train shape:', X_train.shape)
print('x_test shape:', X_test.shape)

X_pool = sequence.pad_sequences(x_pool, maxlen=maxlen)
Y_pool = y_pool
print('x_pool shape:', X_pool.shape)

Pad sequences (samples x time)
('x_train shape:', (500, 80))
('x_test shape:', (5000, 80))
('x_pool shape:', (10000, 80))


In [309]:
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

Build model...


In [310]:
# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [311]:
print('Train...')
model.fit(X_train, Y_train,
          batch_size=batch_size,
          epochs=3,
          validation_data=(X_test, Y_test))


Train...
Train on 500 samples, validate on 5000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fd958d184d0>

In [169]:
score, acc = model.evaluate(X_test, Y_test, batch_size=batch_size)

print('Test score:', score)
print('Test accuracy:', acc)

('Test score:', 0.49929289445877073)
('Test accuracy:', 0.80640000000000001)


In [147]:
score, acc = model.evaluate(X_test, Y_test, batch_size=batch_size)

print('Test score:', score)
print('Test accuracy:', acc)

('Test score:', 1.008508877468109)
('Test accuracy:', 0.6714)


### Active Learning Procedure
1. Allocate an unsupervised set and a test set
2. Initially pick some (say 100) from the unsupervised set, get the labels and set it as the train set
3. Iterate: pick 10 (vary this parameter) most promising from the unsupervised and add to the train set - retrain

In [330]:
import sys, os, imp
imp.load_source('activelearn', '../activelearn.py')
from activelearn import *

In [454]:
# Pool is the unsupervised dataset
x_pool = x_train[10000:20000]
y_pool = y_train[10000:20000]

X_pool = sequence.pad_sequences(x_pool, maxlen=maxlen)
Y_pool = y_pool
print('x_pool shape:', X_pool.shape)

('x_pool shape:', (10000, 80))


In [439]:
y_train.shape

(25020,)

In [366]:
def init_pick(pool_data, pool_labels, num):
    '''Pick num number of datapoints from the unsupervised data pool
    Remove them from the pool and return the data.
    Returns chosen datapoints and the updated pool_data'''
    if len(pool_data) < num:
        raise Exception('pool data is too small')
        
    #np.random.seed(0)
    indices = np.random.choice(range(len(pool_data)), num, replace=False)
    datapoints = pool_data[indices]
    labels = pool_labels[indices]
    pool_data = np.delete(pool_data, indices, axis=0)
    pool_labels = np.delete(pool_labels, indices, axis=0)
    print("Picked " + str(num) + " datapoints\nSize of updated unsupervised pool = " + str(len(pool_data)) + "\n")
    return datapoints, labels, pool_data, pool_labels

In [455]:
# Initially
X_train, Y_train, X_pool, Y_pool = init_pick(X_pool, Y_pool, 100)

Picked 100 datapoints
Size of updated unsupervised pool = 9900



In [456]:
X_pool.shape

(9900, 80)

In [457]:
results = {}
# Initial training
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(X_train, Y_train, batch_size=batch_size, epochs=3) #, validation_data=(X_test, Y_test))

score, acc = model.evaluate(X_test, Y_test, batch_size=batch_size)
#print('Test score:', score)
print('Test accuracy:', acc)
results[X_train.shape[0]] = acc

Build model...
Train...
Epoch 1/3
Epoch 2/3
Epoch 3/3
('Test accuracy:', 0.53300000000000003)


In [459]:
results

{100: 0.53300000000000003}

In [445]:
def active_pick(acq_fn, num_samples, pool_data, pool_labels, pool_subset_count = 2000):
    """Inputs: Unsupervised data, an acquisition function and number of samples to return
    Output: The datapoints from unsupervised data which has the highest value as per the acquisition function
    """
    #unsup_data = np.array(unsup_data)
    if len(pool_data) < num_samples:
        raise Exception('pool data is exhausted')
        
    if pool_subset_count > len(pool_data):
        pool_subset_count = len(pool_data)

    #values = [acq_fn(x) for x in pool_data]
    pool_subset_random_index = np.random.choice(range(len(pool_data)), pool_subset_count, replace=False)
    X_pool_subset = pool_data[pool_subset_random_index]
    y_pool_subset = pool_labels[pool_subset_random_index]

    print('Search over Pool of Unlabeled Data')

    values = acq_fn(X_pool_subset)
    pos = np.argpartition(values, -num_samples)[-num_samples:]
    datapoints = X_pool_subset[pos]
    labels = y_pool_subset[pos]
    #print pool_subset_random_index[:10]
    pool_data = np.delete(pool_data, (pool_subset_random_index[pos]), axis=0)
    pool_labels = np.delete(pool_labels, (pool_subset_random_index[pos]), axis=0)
    print("Picked " + str(num_samples) + " datapoints\nSize of updated Unsupervised pool = " + str(len(pool_data)))

    return datapoints, labels, pool_data, pool_labels

In [460]:
def var_ratio(pool_data):
    # Var ratio active learning acquisition function
    # D_probs - Deterministic probs as opposed to MC sampling
    D_probs = model.predict_proba(pool_data)  
    return 1.0 - np.max(D_probs, axis=1)

def random_acq(pool_data):
    return np.random.rand(len(pool_data))

In [466]:
import timeit
timeit.time.time()

1516319308.311072

In [None]:
start = timeit.time.time()
acquisition_iterations = 100
for i in range(acquisition_iterations):
    print('\n\nACQUISITION ITERATION ' + str(i+1) + ' of ' + str(acquisition_iterations))
    
    X_picked, Y_picked, X_pool, Y_pool = active_pick(random_acq, 10, X_pool, Y_pool)
    print('Acquired Points added to the training set')
    X_train = np.concatenate((X_train, X_picked), axis=0)
    Y_train = np.concatenate((Y_train, Y_picked), axis=0)
    print('Train Data size: ' + str(X_train.shape))  
    print('Unlabeled Pool size: ' + str(X_pool.shape))

    print('Train Again with the added points')

    model.fit(X_train, Y_train, batch_size=batch_size, epochs=3) #, validation_data=(X_test, Y_test))

    score, acc = model.evaluate(X_test, Y_test, batch_size=batch_size)
    #print('Test score:', score)
    print('Test accuracy:', acc)
    results[X_train.shape[0]] = acc

end = timeit.time.time()
print('\n Total time = ' + str(end-start) + 's')



ACQUISITION ITERATION 1 of 100
Search over Pool of Unlabeled Data
Picked 10 datapoints
Size of updated Unsupervised pool = 9890
Acquired Points added to the training set
Train Data size: (110, 80)
Unlabeled Pool size: (9890, 80)
Train Again with the added points
Epoch 1/3
Epoch 2/3
Epoch 3/3
('Test accuracy:', 0.58040000000000003)


ACQUISITION ITERATION 2 of 100
Search over Pool of Unlabeled Data
Picked 10 datapoints
Size of updated Unsupervised pool = 9880
Acquired Points added to the training set
Train Data size: (120, 80)
Unlabeled Pool size: (9880, 80)
Train Again with the added points
Epoch 1/3
Epoch 2/3
Epoch 3/3
('Test accuracy:', 0.58660000000000001)


ACQUISITION ITERATION 3 of 100
Search over Pool of Unlabeled Data
Picked 10 datapoints
Size of updated Unsupervised pool = 9870
Acquired Points added to the training set
Train Data size: (130, 80)
Unlabeled Pool size: (9870, 80)
Train Again with the added points
Epoch 1/3
Epoch 2/3
Epoch 3/3
('Test accuracy:', 0.58440000000000

('Test accuracy:', 0.65680000000000005)


ACQUISITION ITERATION 14 of 100
Search over Pool of Unlabeled Data
Picked 10 datapoints
Size of updated Unsupervised pool = 9760
Acquired Points added to the training set
Train Data size: (240, 80)
Unlabeled Pool size: (9760, 80)
Train Again with the added points
Epoch 1/3
Epoch 2/3
Epoch 3/3
('Test accuracy:', 0.65280000000000005)


ACQUISITION ITERATION 15 of 100
Search over Pool of Unlabeled Data
Picked 10 datapoints
Size of updated Unsupervised pool = 9750
Acquired Points added to the training set
Train Data size: (250, 80)
Unlabeled Pool size: (9750, 80)
Train Again with the added points
Epoch 1/3
Epoch 2/3
Epoch 3/3
('Test accuracy:', 0.65739999999999998)


ACQUISITION ITERATION 16 of 100
Search over Pool of Unlabeled Data
Picked 10 datapoints
Size of updated Unsupervised pool = 9740
Acquired Points added to the training set
Train Data size: (260, 80)
Unlabeled Pool size: (9740, 80)
Train Again with the added points
Epoch 1/3
Epoch 2/3
Ep

In [458]:
acq1

{100: 0.51419999999999999,
 110: 0.59219999999999995,
 120: 0.56699999999999995,
 130: 0.63319999999999999,
 140: 0.61480000000000001,
 150: 0.60519999999999996,
 160: 0.60499999999999998,
 170: 0.58440000000000003,
 180: 0.61040000000000005,
 190: 0.5988,
 200: 0.58760000000000001,
 210: 0.65000000000000002,
 220: 0.60819999999999996,
 230: 0.63560000000000005,
 240: 0.61799999999999999,
 250: 0.63539999999999996,
 260: 0.66279999999999994,
 270: 0.65359999999999996,
 280: 0.61519999999999997,
 290: 0.62980000000000003,
 300: 0.64100000000000001,
 310: 0.59399999999999997,
 320: 0.61599999999999999,
 330: 0.61419999999999997,
 340: 0.63759999999999994,
 350: 0.65859999999999996,
 360: 0.6462,
 370: 0.66820000000000002,
 380: 0.65359999999999996,
 390: 0.64959999999999996,
 400: 0.66739999999999999,
 410: 0.64959999999999996,
 420: 0.63719999999999999,
 430: 0.63539999999999996,
 440: 0.63219999999999998,
 450: 0.65480000000000005,
 460: 0.60899999999999999,
 470: 0.64639999999999997,


In [451]:
acq1 = results

In [429]:
X_picked, Y_picked, X_pool, Y_pool = active_pick(var_ratio, 10, X_pool, Y_pool)

Search over Pool of Unlabeled Data
Picked 10 datapoints
Size of updated Unsupervised pool = 9890



In [430]:
Y_picked

array([0, 0, 1, 1, 0, 0, 0, 0, 0, 1])

In [431]:
X_train.shape

(100, 80)

In [432]:
print('Acquired Points added to the training set')
X_train = np.concatenate((X_train, X_picked), axis=0)
Y_train = np.concatenate((Y_train, Y_picked), axis=0)
print('Train Data size: ' + str(X_train.shape))  
print('Unlabeled Pool size: ' + str(X_pool.shape))

print('Train Again with the added points')

model.fit(X_train, Y_train, batch_size=batch_size, epochs=3) #, validation_data=(X_test, Y_test))

score, acc = model.evaluate(X_test, Y_test, batch_size=batch_size)
#print('Test score:', score)
print('Test accuracy:', acc)
results[X_train.shape[0]] = acc

Acquired Points added to the training set
Train Data size: (110, 80)
Unlabeled Pool size: (9890, 80)
Train Again with the added points
Epoch 1/3
Epoch 2/3
Epoch 3/3
('Test accuracy:', 0.55100000000000005)


In [348]:
len(a)

4