<a href="https://colab.research.google.com/github/ayakow1/ttic31220-japanparliament-analysis/blob/main/NVDM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NVDM (Neural Variational Document Model)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read the speech database
# and the cloned repository from paths under /content/drive/MyDrive.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
! git clone https://github.com/ysmiao/nvdm.git

fatal: destination path 'nvdm' already exists and is not an empty directory.


In [None]:
! git clone https://github.com/YongfeiYan/Neural-Document-Modeling.git

fatal: destination path 'Neural-Document-Modeling' already exists and is not an empty directory.


In [None]:
# Copy the cloned repository into the Drive project folder; the next cell puts
# that Drive copy on sys.path.
!cp -r Neural-Document-Modeling /content/drive/MyDrive/議事録/

In [None]:
import sys
# Put the Drive copy of Neural-Document-Modeling first on the import path so
# that `from data_utils import ...` in the imports cell resolves to it.
sys.path.insert(0,'/content/drive/MyDrive/議事録/Neural-Document-Modeling')

In [None]:
import os
from os import path
import json
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from collections import OrderedDict
import numpy as np
import re
from string import punctuation
import sqlite3
import pandas as pd
from data_utils import read_pre_embedding


In [None]:
# Open the SQLite database of speeches stored on Drive; it is queried below
# and closed in the last cell of the notebook.
conn = sqlite3.connect('/content/drive/MyDrive/議事録/speech.db')

In [None]:
# Training split: speeches dated 2020-01-01 through 2021-12-31 (inclusive).
# The date bounds are bound as query parameters rather than pasted into an
# f-string that had no placeholders.
train = pd.read_sql_query(
    "SELECT * FROM speech WHERE speech_date >= ? AND speech_date <= ?",
    conn,
    params=('2020-01-01', '2021-12-31'),
)

In [None]:
# Test split: speeches dated 2022-01-01 through the end of April 2023.
# The upper bound is the exclusive '2023-05-01' because '2023-04-31' is not a
# calendar date; for valid dates the selected rows are the same.
test = pd.read_sql_query(
    "SELECT * FROM speech WHERE speech_date >= ? AND speech_date < ?",
    conn,
    params=('2022-01-01', '2023-05-01'),
)

In [None]:
# Keep only the speech text of each split, as plain Python lists.
# NOTE(review): this rebinds `train`/`test` from DataFrames to lists, and the
# names are rebound again to functions in the training cell further down.
train, test = train['speech'].tolist(), test['speech'].tolist()

In [None]:
# Adapted from https://github.com/YongfeiYan/Neural-Document-Modeling/blob/master/dataset.py
def save_dataset(save_dir, corpus, vocab):
    """Write the train/test count matrices to ``save_dir`` as UTF-8 text files.

    Files written:
      * ``corpus/train.txt`` and ``corpus/test.txt``: one document per line,
        each kept word repeated as many times as it occurs.
      * ``train.feat`` and ``test.feat``: one document per line in the form
        ``1 <id>:<count> <id>:<count> ...`` using the new 1-based word ids.
      * ``vocab``: one ``<word> <id>`` pair per line.

    Documents that contain none of the kept words are omitted from every
    file, so output line numbers need not match input row numbers.

    Args:
        save_dir: Output directory; ``save_dir/corpus`` is created if missing.
        corpus: ``[train, test]`` pair of 2-D count matrices (documents x
            full-vocabulary columns) supporting ``.nonzero()`` and ``[i, j]``.
        vocab: Mapping ``word -> column index`` of the words to keep.

    Returns:
        OrderedDict mapping each kept word to its new 1-based id, numbered in
        the iteration order of ``vocab``.
    """
    train, test = corpus
    # Renumber the kept words 1..len(vocab) in the iteration order of `vocab`.
    new_vocab = OrderedDict()
    for k in vocab.keys():
        new_vocab[k] = len(new_vocab) + 1
    # Original column index -> word, for kept words only.
    itos = {v: k for k, v in vocab.items()}

    def _bow(data):
        """Return (feat lines, text lines) for the documents of one matrix."""
        bow = {}  # row -> ['id:count', ...]
        wf = {}   # row -> [word, word, ...]
        for i, j in zip(*data.nonzero()):
            # Register the row even if this column is dropped, so the output
            # order follows the first appearance of each row.
            doc_bow = bow.setdefault(i, [])
            doc_words = wf.setdefault(i, [])
            if j not in itos:
                continue
            f = int(data[i, j])
            w = itos[j]
            doc_words.extend([w] * f)
            doc_bow.append('{}:{}'.format(new_vocab[w], f))
        # Drop documents that ended up with no kept word.
        bow = [' '.join(v) for v in bow.values() if len(v) > 0]
        wf = [' '.join(v) for v in wf.values() if len(v) > 0]
        return bow, wf

    train_bow, train_txt = _bow(train)
    test_bow, test_txt = _bow(test)

    # save data
    os.makedirs(path.join(save_dir, 'corpus'), exist_ok=True)

    def _write_lines(dst, lines):
        # Explicit UTF-8: the words are Japanese text and must not depend on
        # the platform's default encoding.
        with open(dst, 'w', encoding='utf-8') as f:
            for line in lines:
                f.write('{}\n'.format(line))

    _write_lines(path.join(save_dir, 'corpus/train.txt'), train_txt)
    _write_lines(path.join(save_dir, 'corpus/test.txt'), test_txt)
    # The leading '1' is a placeholder first field; readers skip it.
    _write_lines(path.join(save_dir, 'train.feat'), ['1 {}'.format(line) for line in train_bow])
    _write_lines(path.join(save_dir, 'test.feat'), ['1 {}'.format(line) for line in test_bow])
    _write_lines(path.join(save_dir, 'vocab'), ['{} {}'.format(k, v) for k, v in new_vocab.items()])

    return new_vocab


def should_filter_word(w):
    """Return True unless ``w`` consists solely of lowercase ASCII letters.

    NOTE(review): ``create_data`` keeps only the words for which this returns
    False, i.e. tokens matching ``[a-z]+``. On a Japanese corpus that discards
    every Japanese word - confirm this is intended.
    """
    matched = re.fullmatch(r'[a-z]+', w)
    return matched is None


def create_data(train, test, n_vocab, save_dir):
    """Build a trimmed bag-of-words dataset from raw documents and save it.

    A ``CountVectorizer`` is fitted on ``train + test``. Words for which
    ``should_filter_word`` returns True are discarded, and of the remaining
    words the ``n_vocab`` with the highest total count are kept. Both splits
    are then vectorized and written by ``save_dataset``.

    Args:
        train: List of training documents (strings).
        test: List of test documents (strings).
        n_vocab: Maximum number of words to keep.
        save_dir: Output directory, created if missing.

    Returns:
        None. The dataset files are written under ``save_dir``.
    """
    os.makedirs(save_dir, exist_ok=True)
    counter = CountVectorizer()
    # fit_transform tokenizes the combined corpus once instead of a separate
    # fit() followed by transform() over the same documents.
    totals = counter.fit_transform(train + test).sum(axis=0)
    vocab = counter.vocabulary_

    # (word, total count) pairs sorted by ascending count; the stable sort
    # keeps ties in the vectorizer's vocabulary order.
    ranked = sorted(
        [(word, totals[0, vocab[word]]) for word in vocab.keys() if not should_filter_word(word)],
        key=lambda item: item[1],
    )
    # The last n_vocab entries are the most frequent words. The mapping is in
    # ascending-frequency order, which is the order save_dataset uses when it
    # assigns new ids.
    kept = {word: vocab[word] for word, _ in ranked[-n_vocab:]}

    train_counts = counter.transform(train)
    test_counts = counter.transform(test)
    save_dataset(save_dir, [train_counts, test_counts], kept)



In [None]:
# Build the bag-of-words dataset, keeping at most `n_vocab` words.
# NOTE: this cell must run before the training cell below, which rebinds the
# names `train` and `test` to functions.
n_vocab = 4000
save_dir = f'data/gijiroku-{n_vocab}'
create_data(train=train, test=test, n_vocab=n_vocab, save_dir=save_dir)

print('finished')

finished


In [None]:
# List the files written by create_data for the 4000-word dataset.
!ls data/gijiroku-4000


corpus	test.feat  train.feat  vocab


The model and training code below are adapted from https://github.com/visionshao/NVDM

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class NVDM(nn.Module):
    """Neural Variational Document Model: a VAE over bag-of-words vectors.

    The encoder maps a bag-of-words vector to the mean and log standard
    deviation of a diagonal Gaussian over ``n_topic`` latent dimensions. The
    decoder maps a latent sample to a log-softmax distribution over the
    vocabulary.

    Args:
        vocab_size: Length of the bag-of-words input vectors.
        n_hidden: Width of the encoder's hidden layer.
        n_topic: Dimensionality of the latent document vector.
        n_sample: Number of latent samples averaged to estimate the
            reconstruction loss; must be at least 1.

    Raises:
        ValueError: If ``n_sample`` is smaller than 1.
    """

    def __init__(self, vocab_size, n_hidden, n_topic, n_sample):
        super(NVDM, self).__init__()
        if n_sample < 1:
            raise ValueError('n_sample must be >= 1, got {}'.format(n_sample))

        self.vocab_size = vocab_size
        self.n_hidden = n_hidden
        self.n_topic = n_topic
        self.n_sample = n_sample

        # encoder architecture
        # encode doc to vectors
        self.enc_vec = nn.Linear(self.vocab_size, self.n_hidden)
        # get mean of Gaussian distribution
        self.mean = nn.Linear(self.n_hidden, self.n_topic)
        # get log_sigma of Gaussian distribution
        self.log_sigma = nn.Linear(self.n_hidden, self.n_topic)

        # decoder architecture
        self.dec_vec = nn.Linear(self.n_topic, self.vocab_size)

    def encoder(self, x):
        """Return ``(mean, log_sigma, kld)`` for a batch of bag-of-words rows.

        ``kld`` is the per-document KL divergence between the encoded
        Gaussian and a standard normal prior, summed over latent dimensions.
        """
        # torch.tanh replaces the deprecated F.tanh.
        enc_vec = torch.tanh(self.enc_vec(x))
        # getting variational parameters
        mean = self.mean(enc_vec)
        log_sigma = self.log_sigma(enc_vec)
        # KL(N(mean, sigma^2) || N(0, I)) with sigma = exp(log_sigma)
        kld = -0.5 * torch.sum(1 - torch.square(mean) + 2 * log_sigma - torch.exp(2 * log_sigma), 1)
        return mean, log_sigma, kld

    def decoder(self, mean, log_sigma, x):
        """Return the per-document reconstruction loss.

        The loss is the negative log-likelihood of the word counts ``x``
        under the decoder's softmax, averaged over ``n_sample`` latent draws.
        """
        sigma = torch.exp(log_sigma)
        losses = []
        for _ in range(self.n_sample):
            # Reparameterization trick: the noise must be standard normal
            # (torch.rand would draw uniform noise in [0, 1)). randn_like also
            # keeps the noise on the same device and dtype as `mean`.
            eps = torch.randn_like(mean)
            doc_vec = sigma * eps + mean
            log_probs = F.log_softmax(self.dec_vec(doc_vec), dim=1)
            losses.append(-torch.sum(log_probs * x, dim=1))
        # Stack to (n_sample, batch) and average over the sample axis. stack
        # keeps the autograd graph, unlike torch.tensor() on a list of tensors.
        return torch.stack(losses, dim=0).mean(dim=0)

    def forward(self, x):
        """Return ``(sample, kld, recons_loss)`` for a batch ``x``.

        ``sample`` is one stochastic draw from the encoded Gaussian, so it
        differs between calls even in eval mode.
        """
        # Kept as an attribute for backward compatibility.
        self.batch_size = len(x)
        mean, log_sigma, kld = self.encoder(x)
        sample = torch.exp(log_sigma) * torch.randn_like(mean) + mean
        recons_loss = self.decoder(mean, log_sigma, x)
        return sample, kld, recons_loss


In [None]:
from torch.utils.data import Dataset
import numpy as np
import torch


class FeatDataset(Dataset):
    """Dataset of dense bag-of-words vectors read from a ``.feat`` file.

    Each line of the file has the form ``<label> <id>:<count> <id>:<count> ...``
    with 1-based word ids; the first field is ignored. Lines whose total count
    is zero (including blank lines) are skipped.

    Args:
        file_path: Path of the ``.feat`` file.
        vocab_size: Length of each dense vector. Every word id in the file
            must be <= ``vocab_size``, otherwise indexing raises IndexError.

    Each item is ``(bow_vector, word_count)``: a float64 numpy array of length
    ``vocab_size`` and the document's total word count.
    """

    def __init__(self, file_path, vocab_size):
        data, word_count = self.data_set(file_path)
        transformed_docs = self.transform(docs=data, vocab_size=vocab_size)
        self.data = transformed_docs
        self.word_count = word_count

    def __getitem__(self, item):
        return self.data[item], self.word_count[item]

    def __len__(self):
        return len(self.data)

    def data_set(self, file_path):
        """Parse ``file_path`` into sparse documents and their word counts.

        Returns:
            ``(data, word_count)`` where ``data`` is a list of
            ``{0-based word id: count}`` dicts and ``word_count`` is the list
            of the matching total counts.
        """
        data = []
        word_count = []
        # `with` closes the file even if a malformed line raises while parsing.
        with open(file_path) as fin:
            for line in fin:
                id_freqs = line.split()
                doc = {}
                count = 0
                # Skip the first field of the line.
                for id_freq in id_freqs[1:]:
                    items = id_freq.split(':')
                    # ids in the file are 1-based; python starts from 0
                    doc[int(items[0]) - 1] = int(items[1])
                    count += int(items[1])
                if count > 0:
                    data.append(doc)
                    word_count.append(count)
        return data, word_count

    def transform(self, docs, vocab_size):
        """Expand sparse ``{word id: count}`` dicts to dense count vectors."""
        transformed_docs = []
        for doc in docs:
            bow_doc = np.zeros(vocab_size)
            for word_id, freq in doc.items():
                bow_doc[word_id] = freq
            transformed_docs.append(bow_doc)

        return transformed_docs


In [None]:
from torch.utils.data import DataLoader

def test(dataloader, model):
    loss_sum = 0.0
    ppx_sum = 0.0
    kld_sum = 0.0
    word_count = 0
    doc_count = 0
    for data_batch, count_batch in dataloader:
        data_batch = data_batch.float()
        sample, kld, recons_loss = model(data_batch)
        loss = kld + recons_loss
        loss_sum += torch.sum(loss)
        kld_sum += torch.mean(kld)
        word_count += torch.sum(count_batch)
        count_batch = torch.add(count_batch, 1e-12)
        ppx_sum += torch.sum(torch.div(loss, count_batch))
        doc_count += len(data_batch)

    print_ppx = torch.exp(loss_sum / word_count)
    print_ppx_perdoc = torch.exp(ppx_sum / doc_count)
    print_kld = kld_sum / len(dataloader)
    print('| Perplexity: {:.9f}'.format(print_ppx),
          '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
          '| KLD: {:.5}'.format(print_kld))


def train(dataloader, model, epoch_num, lr=0.0001):
    """Train ``model`` with Adam and print per-epoch perplexity and KLD.

    NOTE: defining this function rebinds the notebook-level name ``train``,
    which earlier cells use for the list of training speeches.

    Args:
        dataloader: Iterable of ``(bag_of_words_batch, word_count_batch)``
            pairs that also supports ``len()``.
        model: ``nn.Module`` returning ``(sample, kld, recons_loss)`` for a
            float batch, with one ``kld`` / ``recons_loss`` value per document.
        epoch_num: Number of passes over ``dataloader``.
        lr: Adam learning rate (default 0.0001).

    Prints:
        One line per epoch with that epoch's corpus-level perplexity,
        per-document perplexity and mean KLD per batch.
    """
    optim = torch.optim.Adam(model.parameters(), lr=lr)
    model.train()
    for epoch in range(epoch_num):
        # Reset the running statistics every epoch. Accumulating them across
        # epochs made each printed line a cumulative figure, and the KLD grew
        # because its sum over all epochs was divided by batches per epoch.
        loss_sum = 0.0
        ppx_sum = 0.0
        kld_sum = 0.0
        word_count = 0
        doc_count = 0
        for data_batch, count_batch in dataloader:
            data_batch = data_batch.float()
            sample, kld, recons_loss = model(data_batch)
            loss = kld + recons_loss

            optim.zero_grad()
            loss.mean().backward()
            optim.step()

            # Accumulate detached values so the statistics do not keep every
            # batch's autograd graph alive.
            batch_loss = loss.detach()
            loss_sum += torch.sum(batch_loss)
            kld_sum += torch.mean(kld.detach())
            word_count += torch.sum(count_batch)
            # Small epsilon guards the per-document division.
            count_batch = torch.add(count_batch, 1e-12)
            ppx_sum += torch.sum(torch.div(batch_loss, count_batch))
            doc_count += len(data_batch)
        print_ppx = torch.exp(loss_sum / word_count)
        print_ppx_perdoc = torch.exp(ppx_sum / doc_count)
        print_kld = kld_sum / len(dataloader)
        print('| Epoch train: {:d} |'.format(epoch + 1),
              '| Perplexity: {:.9f}'.format(print_ppx),
              '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
              '| KLD: {:.5}'.format(print_kld))

# Hyperparameters
DATA_DIR = 'data/gijiroku-4000'
# Size the model from the vocabulary actually written to disk (one
# "<word> <id>" line per word, ids 1..N) instead of a hard-coded 2000.
# FeatDataset raises IndexError if any word id exceeds vocab_size.
# Binary mode: only the lines are counted, so the text encoding is irrelevant.
with open(DATA_DIR + '/vocab', 'rb') as vocab_file:
    vocab_size = sum(1 for line in vocab_file if line.strip())
batch_size = 64
n_hidden = 500
n_topic = 50
n_sample = 1
# Dataloaders
train_dataset = FeatDataset(DATA_DIR + '/train.feat', vocab_size)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
test_dataset = FeatDataset(DATA_DIR + '/test.feat', vocab_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)
# Model
model = NVDM(vocab_size, n_hidden, n_topic, n_sample)
# Training
train(dataloader=train_dataloader, model=model, epoch_num=30)
# Evaluation
model.eval()
test(test_dataloader, model)

| Epoch train: 1 | | Perplexity: 1660.499267578 | Per doc ppx: 1703.76550 | KLD: 0.062899
| Epoch train: 2 | | Perplexity: 1266.129272461 | Per doc ppx: 1342.20459 | KLD: 0.26687
| Epoch train: 3 | | Perplexity: 998.591552734 | Per doc ppx: 1089.50342 | KLD: 0.70759
| Epoch train: 4 | | Perplexity: 809.885742188 | Per doc ppx: 905.25708 | KLD: 1.3437
| Epoch train: 5 | | Perplexity: 676.040527344 | Per doc ppx: 770.29742 | KLD: 2.1103
| Epoch train: 6 | | Perplexity: 578.771118164 | Per doc ppx: 669.42279 | KLD: 2.956
| Epoch train: 7 | | Perplexity: 506.316955566 | Per doc ppx: 592.84583 | KLD: 3.8602
| Epoch train: 8 | | Perplexity: 450.634124756 | Per doc ppx: 532.88696 | KLD: 4.8092
| Epoch train: 9 | | Perplexity: 406.605133057 | Per doc ppx: 485.02399 | KLD: 5.7928
| Epoch train: 10 | | Perplexity: 371.089172363 | Per doc ppx: 446.14313 | KLD: 6.8084
| Epoch train: 11 | | Perplexity: 341.824249268 | Per doc ppx: 413.97357 | KLD: 7.8588
| Epoch train: 12 | | Perplexity: 317.267456

In [None]:
# Persist the trained model to Drive. This saves the whole module object
# rather than only its state_dict.
# NOTE(review): loading it back presumably requires the NVDM class to be
# defined or importable under the same name - confirm before reuse.
torch.save(model, '/content/drive/MyDrive/議事録/model.pt')

In [None]:
# Decoder weights as a topic-by-word matrix: dec_vec is
# nn.Linear(n_topic, vocab_size), so its weight is (vocab_size, n_topic) and
# the transpose is (n_topic, vocab_size).
decoder_weight = model.dec_vec.weight
beta = decoder_weight.detach().cpu().t()

In [None]:
def validate(model, val_loader):
    """Collect the model's latent samples for every batch of ``val_loader``.

    The model is switched to eval mode and run without gradient tracking.
    For each ``(features, _)`` pair the features are cast to float and the
    first element of the model's output is kept.

    Returns:
        numpy array with the kept outputs concatenated along axis 0.
    """
    model.eval()
    with torch.no_grad():
        collected = [model(features.float())[0] for features, _ in val_loader]
    return torch.cat(collected, dim=0).cpu().numpy()

In [None]:
# Re-load the test-period speeches (2022-01-01 through April 2023) so the raw
# text can be inspected next to the learned representations.
# The upper bound is the exclusive '2023-05-01' because '2023-04-31' is not a
# calendar date; for valid dates the selected rows are the same.
testdata = pd.read_sql_query(
    "SELECT * FROM speech WHERE speech_date >= ? AND speech_date < ?",
    conn,
    params=('2022-01-01', '2023-05-01'),
)

In [None]:
# Keep only the speech text, as a plain Python list.
testdata = testdata['speech'].tolist()

In [None]:
# Show the first test-period speech.
testdata[0]

'質問 国会 建設業者 工事 受注 建設工事 受注 動態 統計 データ 書換え 問題 書換え GDP 値 過大 問題 前半 問題 統計データ 賃上げ 課題 お手元 資料 一月二十五日 朝日新聞 一面 統計 過大 記事 GDP 計算 材料 受注 統計 総額 元請 下請 受注 合算 数字 GDP 計算 材料 元請 受注 国交省 お答え'

In [None]:
# Latent representation of every document served by the test dataloader.
# NOTE(review): save_dataset and FeatDataset both skip documents with no kept
# words, so row k here is not guaranteed to correspond to testdata[k] - verify
# the alignment before comparing them.
repre = validate(model, test_dataloader)

In [None]:
# Show the latent vector of the first test document. The array is named
# `repre`; `repr` is the Python builtin and cannot be indexed.
repre[0]

array([ 0.68302435, -0.3627932 ,  0.71582747,  0.32786137, -0.14184505,
       -0.9739588 , -1.1596297 , -1.3929586 , -0.08689603,  1.660855  ,
       -0.22471714,  0.95406926, -1.2456349 , -0.66945314,  1.7411592 ,
       -0.90390086, -0.31726658, -0.36820558, -0.5545904 , -1.9979192 ,
       -0.15012038, -0.39461854,  0.35303116,  0.10400656,  0.5261552 ,
       -0.12791753,  0.39760688, -2.0116277 ,  0.71056074, -0.714365  ,
       -1.0988259 , -0.02518792, -0.2885734 ,  1.0826325 , -0.59357595,
        0.63868916, -0.74680114,  1.7173587 , -1.6159464 ,  2.2673683 ,
       -0.8096608 , -0.51813006,  1.3363781 ,  0.23885822,  0.24959758,
       -0.9270577 ,  0.66196156,  0.90200037, -0.10782486, -0.8426756 ],
      dtype=float32)

In [None]:
# Release the SQLite connection opened at the top of the notebook.
conn.close()