In [1]:
import torch
from utils import *
import os
import re
import tarfile
import time
import zipfile
import numpy as np
from torch.utils import data

In [2]:
# utils
def extract(fname):
    """Extract a zip or tar archive into the directory that contains it.

    Args:
        fname: Path to the archive. Its extension, as reported by
            ``os.path.splitext``, must be ``.zip``, ``.tar`` or ``.gz``.

    Returns:
        ``os.path.dirname(fname)``, i.e. the directory the archive members
        were extracted into.

    Raises:
        AssertionError: If the extension is not one of the supported ones.
    """
    base_dir = os.path.dirname(fname)
    ext = os.path.splitext(fname)[1]
    if ext == '.zip':
        opener = zipfile.ZipFile
    elif ext in ('.tar', '.gz'):
        opener = tarfile.open
    else:
        # Raised explicitly rather than via `assert False` so the check is
        # not stripped when Python runs with -O.
        raise AssertionError('Only zip/tar files can be extracted.')
    # The context manager closes the archive handle even if extraction fails.
    # NOTE(review): extractall trusts member paths inside the archive; only
    # call this on archives from a trusted source.
    with opener(fname, 'r') as fp:
        fp.extractall(base_dir)
    return base_dir

class TokenEmbedding(data.Dataset):
    """Pretrained token embedding loaded from a word2vec-style text file.

    Index 0 is reserved for the unknown token ``'<unk>'`` and maps to an
    all-zero vector; tokens read from the file follow in file order.

    Attributes:
        idx_to_token: List of tokens; a token's position is its index.
        idx_to_vec: ``numpy`` array with one row per token.
        unknown_idx: Index used for tokens missing from the vocabulary (0).
        token_to_idx: Dict mapping each token to its index.
    """

    # Name of the text file expected inside the extracted archive directory.
    # Kept as a class attribute so a subclass can point at a different file.
    vec_filename = 'word2vec_vi_syllables_100dims.txt'

    def __init__(self, embedding_name):
        """Load the embedding stored in the archive at ``embedding_name``."""
        self.idx_to_token, self.idx_to_vec = self._load_embedding(
            embedding_name)
        self.unknown_idx = 0
        self.token_to_idx = {token: idx for idx, token in
                             enumerate(self.idx_to_token)}

    def _load_embedding(self, embedding_name):
        """Extract the archive and parse the vector file.

        Each line is expected to be ``token v1 v2 ...`` separated by single
        spaces. Lines carrying at most one number (for example a
        ``<count> <dim>`` header) are skipped.

        Args:
            embedding_name: Path to the archive, passed to ``extract``.

        Returns:
            A ``(idx_to_token, idx_to_vec)`` pair: the token list starting
            with ``'<unk>'`` and a ``numpy`` array whose first row is zeros.
        """
        idx_to_token, idx_to_vec = ['<unk>'], []
        data_dir = extract(embedding_name)
        vec_path = os.path.join(data_dir, self.vec_filename)
        # Decode explicitly as UTF-8 so the Vietnamese tokens are read the
        # same way on every platform instead of depending on the locale's
        # default encoding (assumes the file is UTF-8 — TODO confirm).
        with open(vec_path, 'r', encoding='utf-8') as f:
            for line in f:
                elems = line.rstrip().split(' ')
                token, elems = elems[0], [float(elem) for elem in elems[1:]]
                if len(elems) > 1:
                    idx_to_token.append(token)
                    idx_to_vec.append(elems)
        # Prepend the all-zero vector for '<unk>' with the same width as the
        # loaded vectors.
        idx_to_vec = [[0] * len(idx_to_vec[0])] + idx_to_vec
        return idx_to_token, np.array(idx_to_vec)

    def __getitem__(self, tokens):
        """Return the vectors for an iterable of tokens.

        Tokens not present in the vocabulary map to ``unknown_idx``.
        """
        indices = [self.token_to_idx.get(token, self.unknown_idx)
                   for token in tokens]
        vecs = self.idx_to_vec[np.array(indices)]
        return vecs

    def __len__(self):
        """Return the number of tokens, including ``'<unk>'``."""
        return len(self.idx_to_token)

In [3]:
# Path to the zipped pretrained word2vec vectors, relative to the working directory.
model_w2v = 'model/word2vec_vi_syllables_100dims.zip'
# Extracts the archive next to itself and loads all vectors into memory.
vec = TokenEmbedding(model_w2v)

In [4]:
# Vocabulary size: tokens loaded from the file plus the '<unk>' entry at index 0.
vocab_size = len(vec)
vocab_size

979461

In [5]:
import torch.nn as nn
# model
class RNN(nn.Module):
    """Unidirectional LSTM classifier over sequences of token indices.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_size: Width of each embedding vector.
        hidden_size: Hidden state width of the LSTM.
        num_classes: Number of output logits (default 3).
        num_layers: Number of stacked LSTM layers (default 2).
    """
    def __init__(self, vocab_size, embedding_size, hidden_size,
                 num_classes=3, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers=num_layers,
                           bidirectional=False, batch_first=True)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input):
        """Return one row of class logits per input sequence.

        Args:
            input: Integer tensor of token indices, one sequence per row
                (the LSTM is built with ``batch_first=True``).

        Returns:
            Tensor with ``num_classes`` logits for each sequence.
        """
        embedded = self.embedding(input)
        # The LSTM's first return value holds the last layer's hidden state
        # at every time step; it is not needed here.
        # hn holds the hidden state at the last time step of every layer:
        # hn = [num layers * num directions, batch size, hid dim]
        _, (hn, _) = self.rnn(embedded)
        # hn[-1] is the final hidden state of the topmost layer.
        return self.classifier(hn[-1])
# embedding_size must equal the width of the pretrained vectors
# (100 according to the archive's file name — TODO confirm).
myRNN = RNN(vocab_size, embedding_size=100, hidden_size=256)
# Freeze the embedding table so the pretrained vectors receive no gradient updates.
myRNN.embedding.weight.requires_grad = False
# Overwrite the randomly initialised weights with the pretrained vectors.
# copy_ returns the destination tensor, which is what this cell displays.
myRNN.embedding.weight.data.copy_(torch.from_numpy(vec.idx_to_vec))

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0090, -0.1823,  0.0022,  ..., -0.0641, -0.0212, -0.1909],
        [ 0.0081, -0.1886,  0.0997,  ..., -0.0978, -0.0280, -0.1687],
        ...,
        [ 0.1463, -0.0747, -0.1573,  ...,  0.0366, -0.1794,  0.1295],
        [ 0.1547, -0.0508, -0.0108,  ...,  0.0473, -0.2727,  0.2101],
        [ 0.1907,  0.0742,  0.0737,  ...,  0.0943, -0.1364,  0.0139]])

In [6]:
# check
# Smoke test: a batch of two sequences with two token indices each;
# the model should return one row of 3 logits per sequence.
ine = torch.tensor([[1,2], [1,3]])
out = myRNN(ine)
out

tensor([[ 0.0438,  0.0081, -0.0181],
        [ 0.0438,  0.0083, -0.0182]], grad_fn=<AddmmBackward>)