In [1]:
import torch
cuda = torch.cuda.is_available()
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sys
sys.path.append("../../semi-supervised")
from models import ProdLDADeepGenerativeModel

In [2]:
y_dim = 2
z_dim = 100
h_dim = [100, 100]

num_topics = y_dim
a = 1.0
prior_mean = np.log(a) - np.mean(np.log(a))
prior_var = (((1.0 / a) * (1 - (2.0 / num_topics))) + (1.0 / (num_topics * num_topics)) * np.sum((1.0 / a)))

In [3]:
def initialize_model(in_dim, y_dim, z_dim, h_dim, prior_mean, prior_var):
    model = ProdLDADeepGenerativeModel([in_dim, y_dim, z_dim, h_dim], prior_mean, prior_var)
    
    return model

In [4]:
from inference import SVI, ImportanceWeightedSampler
from itertools import cycle
from torch.autograd import Variable

def train_semi_supervised(model, labelled, unlabelled, validation, cuda, epochs=4):
    # You can use importance weighted samples [Burda, 2015] to get a better estimate
    # on the log-likelihood.
    sampler = ImportanceWeightedSampler(mc=1, iw=1)

    def binary_cross_entropy(r, x):
        return -torch.sum(x * torch.log(r + 1e-8) + (1 - x) * torch.log(1 - r + 1e-8), dim=-1)

    if cuda:
        model = model.cuda()
    elbo = SVI(model, likelihood=binary_cross_entropy, sampler=sampler)

    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, betas=(0.9, 0.999))


    if cuda: 
        model = model.cuda()

    alpha = 1.0 * len(unlabelled) / len(labelled)

    for epoch in range(epochs):
        model.train()
        total_loss, accuracy = (0, 0)
        for (x, y), (u, _) in zip(cycle(labelled), unlabelled):
            #x, y, u = torch.from_numpy(x).float(), torch.from_numpy(y).float(), torch.from_numpy(u).float()
            # Wrap in variables
            x, y, u = Variable(x), Variable(y), Variable(u)

            if cuda:
                # They need to be on the same device and be synchronized.
                x, y = x.cuda(device=0), y.cuda(device=0)
                u = u.cuda(device=0)

            # print(x.sum())
            L = -elbo(x, y)
            U = -elbo(u)

            # Add auxiliary classification loss q(y|x)
            logits = model.classify(x)

            # Regular cross entropy
            classication_loss = torch.sum(y * torch.log(logits + 1e-8), dim=1).mean()

            J_alpha = L - alpha * classication_loss + U

            J_alpha.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_loss += J_alpha.data.item()
            accuracy += torch.mean((torch.max(logits, 1)[1].data == torch.max(y, 1)[1].data).float())

        if epoch % 1 == 0:
            model.eval()
            m = len(unlabelled)
            print("Epoch: {}".format(epoch))
            print("[Train]\t\t J_a: {:.2f}, accuracy: {:.2f}".format(total_loss / m, accuracy / m))

            total_loss, accuracy = (0, 0)
            for x, y in validation:
                x, y = Variable(x), Variable(y)

                if cuda:
                    x, y = x.cuda(device=0), y.cuda(device=0)

                L = -elbo(x, y)
                U = -elbo(x)

                logits = model.classify(x)
                classication_loss = -torch.sum(y * torch.log(logits + 1e-8), dim=1).mean()

                J_alpha = L + alpha * classication_loss + U

                total_loss += J_alpha.data.item()

                _, pred_idx = torch.max(logits, 1)
                _, lab_idx = torch.max(y, 1)
                accuracy += torch.mean((torch.max(logits, 1)[1].data == torch.max(y, 1)[1].data).float())

            m = len(validation)
            print("[Validation]\t J_a: {:.2f}, accuracy: {:.2f}".format(total_loss / m, accuracy / m))
            
    return total_loss / m, accuracy / m

# Representativeness Score

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize


def add_vector_sparse(X,v):
    rows, cols = X.shape
    row_start_stop = np.lib.stride_tricks.as_strided(X.indptr, shape=(rows, 2),
                            strides=2*X.indptr.strides)
    for row, (start, stop) in enumerate(row_start_stop):
        data = X.data[start:stop]
        data += v[row]

def calc_representativeness_scores(corpus, lambda_=0.9, B=0.3, tokenized=False):
    
    if tokenized:
        X = corpus
    else:
        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(corpus)
    
    D = X.shape[0]
    
    p_w = X.sum(axis=0) / X.sum()
    p_w_given_d_i = normalize(X, norm='l1', axis=1)
    
    log_p_w = np.log(p_w)
    dist_disjoint = p_w_given_d_i.dot((1-lambda_) * log_p_w.T)
    
    p_w = np.squeeze(np.asarray(p_w))
    
    log_p_w_given_d_i = p_w_given_d_i.transpose().copy()

    log_p_w_given_d_i.data = lambda_*log_p_w_given_d_i.data 
    add_vector_sparse(log_p_w_given_d_i, lambda_*p_w)

    log_p_w_given_d_i.data = np.log(log_p_w_given_d_i.data)
    add_vector_sparse(log_p_w_given_d_i, -(1-lambda_)*np.log(p_w))
    
    log_p_w_given_d_i = log_p_w_given_d_i.transpose()
    
    dist_common = p_w_given_d_i.dot(log_p_w_given_d_i.sum(axis=0).T)
    
    dist_common = np.squeeze(np.asarray(dist_common))
    dist_disjoint = np.squeeze(np.asarray(dist_disjoint))
    
    dist_all = dist_common + dist_disjoint
    
    entropy = p_w_given_d_i.copy()
    entropy.data = p_w_given_d_i.data*np.log(p_w_given_d_i.data)

    entropy = np.squeeze(np.asarray(entropy.sum(axis=1)))
    
    kl_sum = dist_all - D * entropy
    kl_sum *= (B / D) 
    z_i = np.exp(kl_sum)
    
    return z_i

# Generate MC samples

In [6]:
def mc_samples(num_mc_samples, model, x_batch):
    model.train()
    mc_samples_ = [model.classify(x_batch) for _ in range(num_mc_samples)]
    return torch.stack(mc_samples_)

In [7]:
def bald_acq(mc_samples):
    #expected_entropy = -(mc_samples * (mc_samples + 1e-10).log()).sum(dim=-1).mean(dim=0)  # [batch size]
    expected_entropy = -np.mean(np.sum(mc_samples * np.log(mc_samples + 1e-10), axis=-1), axis=0)
    #expected_p = mc_samples.mean(dim=0)
    expected_p = np.mean(mc_samples, axis=0)
    #entropy_expected_p = - (expected_p * (expected_p + 1e-10).log()).sum(dim=-1)  # [batch size]
    entropy_expected_p = - np.sum(expected_p * np.log(expected_p + 1e-10), axis=-1)

    BALD_acq = entropy_expected_p - expected_entropy
    
    return BALD_acq

In [8]:
def query_new_data(num_data, num_mc_samples, model, unlabelled_data, repr_scores=None):
    unlabelled_data = torch.from_numpy(unlabelled_data).float()
    if cuda:
        unlabelled_data = unlabelled_data.cuda()
    mc_samples_ = mc_samples(num_mc_samples, model, unlabelled_data).cpu().detach().numpy()
    bald_acq_ = bald_acq(mc_samples_)
    
    if repr_scores is not None:
        bald_acq_ = bald_acq_ * repr_scores
    #sorted_, indices = bald_acq_.sort()
    indices = bald_acq_.argsort()
    return indices[::-1][:num_data]

# Data

In [9]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/atakanguney94/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
from nltk.tokenize import word_tokenize
import os


X_ = []
y_ = []

pos_path = "./data/aclImdb/train/pos/"
poses = os.listdir(pos_path)[:]
pos_path + poses[0]

print("Reading positives...")
counter = 1
for f in poses:
    if counter%2500 == 0:
        print(counter,"/", len(poses))
    counter += 1
    
    with open(pos_path + f) as file:
        lines = file.readlines()
        lines = ''.join(lines)

        #words = word_tokenize(lines)
        X_.append(lines)
        y_.append(1)
        

neg_path = "./data/aclImdb/train/neg/"
negs = os.listdir(neg_path)[:]

print("Reading negatives...")
counter = 1
for f in negs:
    if counter%2500 == 0:
        print(counter, "/", len(negs))
    counter += 1
    
    with open(neg_path + f) as file:
        lines = file.readlines()
        lines = ''.join(lines)

        #words = word_tokenize(lines)
        X_.append(lines)
        y_.append(0)

Reading positives...
2500 / 12500
5000 / 12500
7500 / 12500
10000 / 12500
12500 / 12500
Reading negatives...
2500 / 12500
5000 / 12500
7500 / 12500
10000 / 12500
12500 / 12500


In [11]:
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [12]:
tf = CountVectorizer(stop_words="english", max_features=10000)

In [13]:
docs = tf.fit_transform(X_)

In [14]:
repr_scores = calc_representativeness_scores(docs, tokenized=True)

In [15]:
row_ids =np.arange(len(X_))

In [16]:
x_dim = len(tf.vocabulary_)

In [17]:
docs = docs.todense()

In [18]:
ohe = OneHotEncoder()
labels = ohe.fit_transform(np.expand_dims(y_, -1))

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [19]:
labels = labels.todense()

In [20]:
X_train, X_valid, y_train, y_valid, row_ids_train, row_ids_valid = train_test_split(docs, labels, row_ids, test_size=0.20)

In [21]:
y_train.shape

(20000, 2)

In [22]:
x_labelled, x_unlabelled, y_labelled, y_unlabelled, row_ids_labelled, row_ids_unlabelled = train_test_split(X_train, y_train, row_ids_train, train_size=0.02)



In [23]:
x_labelled.shape, x_unlabelled.shape, y_labelled.shape, y_unlabelled.shape

((400, 10000), (19600, 10000), (400, 2), (19600, 2))

In [24]:
def create_batch(data, batch_size):
    x, y = data
    batch_idx = np.random.choice(x.shape[0], batch_size, replace=False)

    return torch.from_numpy(x[batch_idx]).float(), torch.from_numpy(y[batch_idx]).float()

In [25]:
def create_data_sets(labelled, unlabelled, batch_size):
    num_labelled = labelled[0].shape[0]
    num_unlabelled = unlabelled[0].shape[0]

    train_labelled = [create_batch(labelled, batch_size) for _ in range(num_labelled // batch_size)]
    train_unlabelled = [create_batch(unlabelled, batch_size) for _ in range(num_unlabelled // batch_size)]
    
    return train_labelled, train_unlabelled

In [26]:
def create_validation_set(validation, batch_size):
    num_validation = validation[0].shape[0]
    
    validation = [create_batch(validation, batch_size) for _ in range(num_validation // batch_size)]
    
    return validation

In [27]:
labelled, unlabelled = (x_labelled, y_labelled), (x_unlabelled, y_unlabelled)

In [None]:
train_labelled, train_unlabelled = create_data_sets(labelled, unlabelled, 50)

In [28]:
validation = create_validation_set((X_valid, y_valid), 50)

# Active Learning

In [29]:
NUM_MC_SAMPLES = 10
NUM_QUERY = 200

In [30]:
def rearange_datasets(labelled, unlabelled, new_data):
    labelled_x, labelled_y = labelled
    unlabelled_x, unlabelled_y = unlabelled
    
    new_data_x, new_data_y = unlabelled_x[new_data], unlabelled_y[new_data]
    
    new_labelled_x = np.append(labelled_x, new_data_x, axis=0)
    new_labelled_y = np.append(labelled_y, new_data_y, axis=0)
    
    new_unlabelled_x = np.delete(unlabelled_x, new_data, axis=0)
    new_unlabelled_y = np.delete(unlabelled_y, new_data, axis=0)
    
    return (new_labelled_x, new_labelled_y), (new_unlabelled_x, new_unlabelled_y)

In [31]:
torch.cuda.set_device(0)

In [32]:
torch.cuda.current_device()

0

In [33]:
len(train_labelled)

NameError: name 'train_labelled' is not defined

In [None]:
errors = []
batch_size=50
for i in range(50):
    model = initialize_model(x_dim, y_dim, z_dim, h_dim, prior_mean, prior_var)
    train_labelled, train_unlabelled = create_data_sets(labelled, unlabelled, batch_size)
    
    print(labelled[0].shape[0], unlabelled[0].shape[0])
    error, acc = train_semi_supervised(model, train_labelled, train_unlabelled, validation[:-1], cuda, epochs=5)
    errors.append((labelled[0].shape[0], unlabelled[0].shape[0], error, acc))
    
    
    #new_data = np.random.choice(unlabelled[0].shape[0], NUM_QUERY)
    new_data = query_new_data(NUM_QUERY, NUM_MC_SAMPLES, model, unlabelled[0], repr_scores=repr_scores[row_ids_unlabelled])
    row_ids_unlabelled = np.delete(row_ids_unlabelled, new_data, axis=0)
    
    labelled, unlabelled = rearange_datasets(labelled, unlabelled, new_data)
    torch.cuda.empty_cache()

400 19600


  x = self.activation(x)
  return self.output_activation(x)


Epoch: 0
[Train]		 J_a: 2075.56, accuracy: 0.99
[Validation]	 J_a: 2211.55, accuracy: 0.50
Epoch: 1
[Train]		 J_a: 1928.45, accuracy: 1.00
[Validation]	 J_a: 2260.55, accuracy: 0.49
Epoch: 2
[Train]		 J_a: 1855.08, accuracy: 1.00
[Validation]	 J_a: 2295.59, accuracy: 0.49
Epoch: 3
[Train]		 J_a: 1789.36, accuracy: 1.00
[Validation]	 J_a: 2298.19, accuracy: 0.49
Epoch: 4
[Train]		 J_a: 1734.71, accuracy: 1.00
[Validation]	 J_a: 2258.45, accuracy: 0.49
600 19400
Epoch: 0
[Train]		 J_a: 2044.33, accuracy: 0.99
[Validation]	 J_a: 2228.18, accuracy: 0.51
Epoch: 1
[Train]		 J_a: 1891.79, accuracy: 1.00
[Validation]	 J_a: 2278.49, accuracy: 0.51
Epoch: 2
[Train]		 J_a: 1807.79, accuracy: 1.00
[Validation]	 J_a: 2291.58, accuracy: 0.51
Epoch: 3
[Train]		 J_a: 1758.34, accuracy: 1.00
[Validation]	 J_a: 2302.85, accuracy: 0.51
Epoch: 4
[Train]		 J_a: 1720.97, accuracy: 1.00
[Validation]	 J_a: 2260.01, accuracy: 0.51
800 19200
Epoch: 0
[Train]		 J_a: 2048.72, accuracy: 0.98
[Validation]	 J_a: 215

Epoch: 3
[Train]		 J_a: 2149.29, accuracy: 0.99
[Validation]	 J_a: 2005.60, accuracy: 0.52
Epoch: 4
[Train]		 J_a: 2118.33, accuracy: 1.00
[Validation]	 J_a: 2046.98, accuracy: 0.52
4000 16000
Epoch: 0
[Train]		 J_a: 2397.73, accuracy: 0.78
[Validation]	 J_a: 1995.58, accuracy: 0.55
Epoch: 1
[Train]		 J_a: 2249.78, accuracy: 0.88
[Validation]	 J_a: 1944.50, accuracy: 0.52
Epoch: 2
[Train]		 J_a: 2171.93, accuracy: 0.97
[Validation]	 J_a: 1930.42, accuracy: 0.53
Epoch: 3
[Train]		 J_a: 2131.61, accuracy: 0.99
[Validation]	 J_a: 1919.01, accuracy: 0.52
Epoch: 4
[Train]		 J_a: 2101.05, accuracy: 1.00
[Validation]	 J_a: 1901.22, accuracy: 0.52
4200 15800
Epoch: 0
[Train]		 J_a: 2451.03, accuracy: 0.65
[Validation]	 J_a: 2055.88, accuracy: 0.52
Epoch: 1
[Train]		 J_a: 2302.60, accuracy: 0.88
[Validation]	 J_a: 2008.09, accuracy: 0.54
Epoch: 2
[Train]		 J_a: 2220.57, accuracy: 0.97
[Validation]	 J_a: 2177.15, accuracy: 0.54
Epoch: 3
[Train]		 J_a: 2181.01, accuracy: 0.99
[Validation]	 J_a: 2

Epoch: 1
[Train]		 J_a: 2194.44, accuracy: 0.70
[Validation]	 J_a: 1979.67, accuracy: 0.54
Epoch: 2
[Train]		 J_a: 2131.89, accuracy: 0.82
[Validation]	 J_a: 1944.89, accuracy: 0.57
Epoch: 3
[Train]		 J_a: 2070.74, accuracy: 0.89
[Validation]	 J_a: 1913.37, accuracy: 0.56
Epoch: 4
[Train]		 J_a: 2032.45, accuracy: 0.93
[Validation]	 J_a: 1888.43, accuracy: 0.57
7600 12400
Epoch: 0
[Train]		 J_a: 2308.20, accuracy: 0.52
[Validation]	 J_a: 2053.72, accuracy: 0.51
Epoch: 1
[Train]		 J_a: 2176.56, accuracy: 0.65
[Validation]	 J_a: 1992.77, accuracy: 0.54
Epoch: 2
[Train]		 J_a: 2105.66, accuracy: 0.82
[Validation]	 J_a: 2044.99, accuracy: 0.57
Epoch: 3
[Train]		 J_a: 2061.03, accuracy: 0.90
[Validation]	 J_a: 3301.26, accuracy: 0.57
Epoch: 4
[Train]		 J_a: 2032.04, accuracy: 0.94
[Validation]	 J_a: 2947.98, accuracy: 0.56
7800 12200
Epoch: 0
[Train]		 J_a: 2309.56, accuracy: 0.53
[Validation]	 J_a: 2035.75, accuracy: 0.51
Epoch: 1
[Train]		 J_a: 2180.99, accuracy: 0.69
[Validation]	 J_a: 1

In [None]:
torch.cuda.empty_cache()

In [None]:
len(errors)

In [None]:
num_labelled = []
num_unlabelled = []
cost = []
acc = []

for n_l, n_u, c, a in errors:
    num_labelled.append(n_l)
    num_unlabelled.append(n_u)
    cost.append(c)
    acc.append(a)

In [None]:
plt.figure(figsize=(10, 10))
plt.plot(num_labelled, cost)
plt.xlabel("Number of labelled data")
plt.ylabel("Cost")
plt.savefig("n_labelled_vs_cost-represent.png")

In [None]:
plt.figure(figsize=(10, 10))
plt.plot(num_labelled, acc)
plt.xlabel("Number of labelled data")
plt.ylabel("Acc")
plt.savefig("n_labelled_vs_acc-represent.png")

In [None]:
import pickle

In [None]:
errors_2 = [(n_l, n_u, c, a.item()) for n_l, n_u, c, a in errors]

In [None]:
errors_2[0]

In [None]:
with open("IMDB-data-erros-represent.pkl", "wb") as f:
    pickle.dump(errors_2, f)