In [3]:
import torch
cuda = torch.cuda.is_available()
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sys
sys.path.append("../../semi-supervised")

In [4]:
from importlib import reload
import models
reload(models)

<module 'models' from '../../semi-supervised/models/__init__.py'>

In [5]:
from models import ProdLDA

In [6]:
class NetArchtec(object):
    """Plain container for the network and training hyper-parameters.

    Every constructor argument is stored unchanged as an instance attribute
    of the same name; no validation or conversion is performed.
    """

    def __init__(self, num_input, en1_units=100, en2_units=100, num_topic=50, batch_size=200, optimizer="Adam",
                 learning_rate=0.002, momentum=0.99, num_epoch=80, init_mult=1.0, variance=0.995,
                 start=True, nogpu=True):
        # Collected in signature order so the instance __dict__ keeps that order.
        settings = dict(
            num_input=num_input,
            en1_units=en1_units,
            en2_units=en2_units,
            num_topic=num_topic,
            batch_size=batch_size,
            optimizer=optimizer,
            learning_rate=learning_rate,
            momentum=momentum,
            num_epoch=num_epoch,
            init_mult=init_mult,
            variance=variance,
            start=start,
            nogpu=nogpu,
        )
        for attr_name, attr_value in settings.items():
            setattr(self, attr_name, attr_value)

In [7]:
# Hyper-parameter bundle: 1995 inputs, two 100-unit encoder layers, 50 topics.
# All other fields keep their defaults (batch_size=200, nogpu=True, ...).
net_arch = NetArchtec(1995, 100, 100, 50)
# model = ProdLDA(net_arch)

In [6]:
import pickle

def to_onehot(data, min_length):
    """Return how often each non-negative integer id occurs in ``data``.

    Despite the name, the result is a count vector rather than a strict
    one-hot encoding: position ``i`` holds the number of times ``i`` appears.
    The output has at least ``min_length`` entries (longer if ``data``
    contains an id >= ``min_length``).
    """
    counts = np.bincount(data, minlength=min_length)
    return counts

def make_data():
    """Load the 20 Newsgroups splits and vocabulary into module-level globals.

    Sets the globals ``data_tr``/``data_te`` (per-document word-count
    matrices), ``tensor_tr``/``tensor_te`` (the same data as float tensors),
    ``vocab`` (the unpickled vocabulary object) and ``vocab_size``.
    Tensors are moved to the GPU when ``net_arch.nogpu`` is false.

    Returns nothing; prints progress and the resulting array shapes.
    """
    global data_tr, data_te, tensor_tr, tensor_te, vocab, vocab_size
    train_path = './pytorch-avitm-master/data/20news_clean/train.txt.npy'
    test_path = './pytorch-avitm-master/data/20news_clean/test.txt.npy'
    vocab_path = './pytorch-avitm-master/data/20news_clean/vocab.pkl'

    # Each .npy file stores one array of word ids per document, i.e. a pickled
    # object array. NumPy >= 1.16.3 refuses to load those unless
    # allow_pickle=True is passed explicitly.
    # NOTE(security): unpickling executes arbitrary code; only load files
    # from a trusted source.
    data_tr = np.load(train_path, encoding="latin1", allow_pickle=True)
    data_te = np.load(test_path, encoding="latin1", allow_pickle=True)

    # Context manager so the file handle is closed deterministically.
    with open(vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)
    vocab_size = len(vocab)

    #--------------convert to one-hot representation------------------
    print('Converting data to one-hot representation')
    # Documents whose word ids sum to zero are dropped.
    # NOTE(review): this also drops a document made up solely of word id 0 -
    # confirm that is intended.
    data_tr = np.array([to_onehot(doc.astype('int'), vocab_size) for doc in data_tr if np.sum(doc) != 0])
    data_te = np.array([to_onehot(doc.astype('int'), vocab_size) for doc in data_te if np.sum(doc) != 0])

    #--------------print the data dimensions--------------------------
    print('Data Loaded')
    print('Dim Training Data ', str(data_tr.shape))
    print('Dim Test Data', str(data_te.shape))

    #--------------make tensor datasets-------------------------------
    tensor_tr = torch.from_numpy(data_tr).float()
    tensor_te = torch.from_numpy(data_te).float()
    if not net_arch.nogpu:
        tensor_tr = tensor_tr.cuda()
        tensor_te = tensor_te.cuda()

In [7]:
# Populates the globals data_tr, data_te, tensor_tr, tensor_te, vocab and vocab_size.
make_data()

Converting data to one-hot representation
Data Loaded
Dim Training Data  (11258, 1995)
Dim Test Data (7487, 1995)


In [8]:
# Mean and variance of the Gaussian prior handed to ProdLDA, derived from a
# scalar concentration `a` and the number of topics.
# NOTE(review): the formula looks like the Laplace approximation to a
# symmetric Dirichlet prior - confirm against the model's reference.
num_topics = 50
a = 1.0

log_a = np.log(a)
prior_mean = log_a - np.mean(log_a)

inv_a = 1.0 / a
prior_var = (
    inv_a * (1 - (2.0 / num_topics))
    + (1.0 / (num_topics * num_topics)) * np.sum(inv_a)
)
#log_prior_var = np.log(prior_var)

In [9]:
prior_var

0.9603999999999999

In [10]:
# Layer sizes come from the loaded vocabulary and the shared hyper-parameters
# rather than repeated literals, so the model cannot silently drift from the
# data: [input dim, latent/topic dim, [encoder hidden sizes]].
model = ProdLDA(
    [vocab_size, num_topics, [net_arch.en1_units, net_arch.en2_units]],
    prior_mean,
    prior_var,
)

In [11]:
model

ProdLDA(
  (encoder): Encoder(
    (hidden): ModuleList(
      (0): Linear(in_features=1995, out_features=100, bias=True)
      (1): Linear(in_features=100, out_features=100, bias=True)
    )
    (sample): GaussianSample(
      (mu): Linear(in_features=100, out_features=50, bias=True)
      (log_var): Linear(in_features=100, out_features=50, bias=True)
    )
    (mean_norm): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (logvar_norm): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (decoder): Decoder(
    (activation): Softmax()
    (drop): Dropout(p=0.2)
    (hidden): ModuleList(
      (0): Linear(in_features=50, out_features=1995, bias=True)
    )
    (norm): BatchNorm1d(1995, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (output_activation): Softmax()
  )
)

In [12]:
def binary_cross_entropy(r, x):
    """Element-wise binary cross-entropy between ``r`` and ``x``, summed over the last axis.

    ``r`` holds the predicted probabilities and ``x`` the targets. A small
    constant is added inside each logarithm to avoid ``log(0)``.
    Returns a tensor with the last dimension reduced away.
    """
    eps = 1e-8
    positive_term = x * torch.log(r + eps)
    negative_term = (1 - x) * torch.log(1 - r + eps)
    return -torch.sum(positive_term + negative_term, dim=-1)

# Adam over every model parameter. The learning rate here (3e-4) is set
# independently of net_arch.learning_rate (0.002 by default), which is not used.
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, betas=(0.9, 0.999))

In [13]:
from torch.autograd import Variable

In [14]:
# Train the model by minimising the negative ELBO with shuffled mini-batches.
for epoch in range(50):
    # Fresh permutation each epoch, cut into chunks of batch_size row indices.
    all_indices = torch.randperm(tensor_tr.size(0)).split(net_arch.batch_size)
    model.train()
    total_loss = 0
    for batch_indices in all_indices:
        if not net_arch.nogpu:
            batch_indices = batch_indices.cuda()

        # Tensors take part in autograd directly; the deprecated Variable
        # wrapper is not needed.
        u = tensor_tr[batch_indices]

        # Zero gradients *before* backward so leftovers from an interrupted
        # or earlier run of this cell cannot leak into this step.
        optimizer.zero_grad()

        reconstruction = model(u)

        # ELBO = reconstruction log-likelihood minus the KL term.
        # model.kl_divergence is read right after the forward pass -
        # presumably the forward call refreshes it; verify in the model code.
        likelihood = -binary_cross_entropy(reconstruction, u)
        elbo = likelihood - model.kl_divergence

        L = -torch.mean(elbo)

        L.backward()
        optimizer.step()

        total_loss += L.item()

    m = len(all_indices)

    # Report the mean per-batch loss every 10th epoch.
    if epoch % 10 == 0:
        print(f"Epoch: {epoch}\tL: {total_loss/m:.2f}")

  x = self.activation(x)
  return self.output_activation(x)


Epoch: 0	L: 1027.61
Epoch: 10	L: 669.82
Epoch: 20	L: 654.45
Epoch: 30	L: 643.58
Epoch: 40	L: 639.05


In [18]:
# Switch to inference mode and pass 10 random 50-dimensional latent draws
# through model.sample (presumably the decoder - verify in the model code).
model.eval()
# no_grad: nothing is back-propagated here, so skip building the autograd
# graph. The deprecated Variable wrapper is not needed for plain tensors.
with torch.no_grad():
    x_mu = model.sample(torch.randn(10, 50))

In [26]:
# Detach from the autograd graph and expose the values as a NumPy array
# (.numpy() requires the tensor to live on the CPU).
res = x_mu.detach().numpy()

In [30]:
# Cut-off for treating a word as "generated": the mean over all entries of res.
threshold = res.mean()

In [32]:
# Row (document) and column (word) indices of every entry above the threshold.
doc, word = np.where(res > threshold)

In [36]:
# Flip the vocabulary mapping so values (word indices) look up their keys (tokens).
inverse_vocab = {index: token for token, index in vocab.items()}

In [40]:
# Group the selected words by document index, in the order np.where returned them.
generated_docs = {}
for doc_id, word_id in zip(doc, word):
    generated_docs.setdefault(doc_id, []).append(inverse_vocab[word_id])

In [41]:
generated_docs

{0: ['write',
  'one',
  'use',
  'get',
  'say',
  'article',
  'know',
  'people',
  'make',
  'go',
  'like',
  'think',
  'see',
  'time',
  'also',
  'work',
  'take',
  'year',
  'new',
  'good',
  'want',
  'come',
  'thing',
  'even',
  'way',
  'well',
  'look',
  'give',
  'system',
  'may',
  'need',
  'problem',
  'find',
  'god',
  'file',
  'try',
  'many',
  'much',
  'first',
  'two',
  'right',
  'call',
  'run',
  'point',
  'question',
  'anyone',
  'tell',
  'post',
  'believe',
  'drive',
  'seem',
  'program',
  'something',
  'number',
  'please',
  'really',
  'since',
  'mean',
  'back',
  'include',
  'day',
  'still',
  'state',
  'help',
  'read',
  'government',
  'information',
  'start',
  'key',
  'law',
  'case',
  'game',
  'thanks',
  'last',
  'must',
  'part',
  'let',
  'better',
  'never',
  'might',
  'ask',
  'sure',
  'support',
  'another',
  'group',
  'car',
  'without',
  'name',
  'send',
  'keep',
  'fact',
  'lot',
  'man',
  'someone',


In [2]:
from models import ProdLDADeepGenerativeModel