In [1]:
import torch

In [2]:
import torch.nn as nn

In [14]:
class MyLinear(nn.Module):
    """Minimal fully connected layer computing ``y = x @ W + b``.

    The weight and bias are registered as ``nn.Parameter`` objects so that
    ``parameters()`` yields them and an optimizer can actually update them.

    Args:
        input_size: Number of input features (columns of ``x``).
        output_size: Number of output features (columns of the result).
    """

    def __init__(self, input_size, output_size):
        super().__init__()

        # ``torch.FloatTensor(n, m)`` hands back uninitialized memory, which
        # may contain NaN/inf; draw the starting values from a small uniform
        # range instead. The guard keeps a zero-width layer constructible.
        bound = input_size ** -0.5 if input_size > 0 else 0.0
        self.W = nn.Parameter(
            torch.empty(input_size, output_size).uniform_(-bound, bound)
        )
        self.b = nn.Parameter(torch.empty(output_size).uniform_(-bound, bound))

    def forward(self, x):
        """Apply the affine map to a 2-D input.

        Args:
            x: Tensor of shape ``(batch, input_size)`` (``torch.mm`` only
                accepts 2-D operands).

        Returns:
            Tensor of shape ``(batch, output_size)``.
        """
        y = torch.mm(x, self.W) + self.b

        return y

In [15]:
def ground_truth(x):
    """Evaluate ``3*x0 + x1 - 2*x2`` for every row of ``x``.

    Args:
        x: 2-D array-like indexable as ``x[:, k]`` with at least three columns.

    Returns:
        One value per row of ``x``.
    """
    first, second, third = x[:, 0], x[:, 1], x[:, 2]
    return 3 * first + second - 2 * third

In [16]:
def train(model, x, y, optim):
    """Run a single gradient-descent step and return the batch loss.

    The loss is the sum of squared errors divided by the batch size
    (``x.size(0)``).

    Args:
        model: Callable module mapping ``x`` to predictions.
        x: Input batch; its first dimension is the batch size.
        y: Targets. If ``y`` holds the same number of elements as the
            prediction but has a different shape (e.g. ``(N,)`` against
            ``(N, 1)``), it is reshaped to the prediction's shape.
        optim: Optimizer holding the model's parameters.

    Returns:
        The loss as a tensor detached from the autograd graph.
    """
    # Clear gradients left over from the previous step.
    optim.zero_grad()

    # Feed-forward.
    y_hat = model(x)

    # Without this, a (N,) target against a (N, 1) prediction broadcasts to
    # an (N, N) difference matrix and the loss is silently wrong.
    if y.shape != y_hat.shape and y.numel() == y_hat.numel():
        y = y.reshape(y_hat.shape)

    # Squared error summed over the batch, averaged by the batch size.
    loss = ((y - y_hat) ** 2).sum() / x.size(0)

    # Back-propagation.
    loss.backward()

    # One step of gradient descent.
    optim.step()

    # ``.detach()`` is the supported replacement for the legacy ``.data``.
    return loss.detach()

In [18]:
"Natural language processing is one of the biggest streams in artificial intelligence, and it becomes very popular after seq2seq's invention."

"Natural language processing is one of the biggest streams in artificial intelligence, and it becomes very popular after seq2seq's invention."

In [36]:
from torchtext import data

class DataLoader(object):
    """Load a tab-separated (label, text) corpus into torchtext iterators.

    After construction the instance exposes ``label`` and ``text`` fields
    (with vocabularies built from the training split) plus ``train_iter``
    and ``valid_iter``.

    Args:
        train_fn: Path of the training TSV file.
        valid_fn: Path of the validation TSV file.
        batch_size: Batch size handed to both iterators.
        device: Integer device index; values below zero select ``'cpu'``,
            otherwise ``'cuda:<device>'`` is used.
        max_vocab: ``max_size`` passed when building the text vocabulary.
        min_freq: ``min_freq`` passed when building the text vocabulary.
        use_eos: When true, ``'<EOS>'`` is set as the text field's EOS token.
        shuffle: Forwarded to the iterators.
    """

    def __init__(self, train_fn, valid_fn,
                 batch_size=64,
                 device=-1,
                 max_vocab=999999,
                 min_freq=1,
                 use_eos=False,
                 shuffle=True
                 ):
        super(DataLoader, self).__init__()

        # Each input line carries two columns: a label and a text.
        eos_token = '<EOS>' if use_eos else None
        self.label = data.Field(sequential=False,
                                use_vocab=True,
                                unk_token=None)
        self.text = data.Field(use_vocab=True,
                               batch_first=True,
                               include_lengths=False,
                               eos_token=eos_token)

        # The columns are TAB-delimited, so both files are read with
        # TabularDataset using the same (label, text) layout.
        column_fields = [('label', self.label), ('text', self.text)]
        train_set, valid_set = data.TabularDataset.splits(path='',
                                                          train=train_fn,
                                                          validation=valid_fn,
                                                          format='tsv',
                                                          fields=column_fields)

        # Translate the integer device index into a device string.
        if device >= 0:
            device_name = 'cuda:%d' % device
        else:
            device_name = 'cpu'

        # Sentences are sorted by length so similar lengths share a batch.
        iterators = data.BucketIterator.splits((train_set, valid_set),
                                               batch_size=batch_size,
                                               device=device_name,
                                               shuffle=shuffle,
                                               sort_key=lambda example: len(example.text),
                                               sort_within_batch=True)
        self.train_iter, self.valid_iter = iterators

        # Finally, build the token-to-index mapping of each field from the
        # training split only.
        self.label.build_vocab(train_set)
        self.text.build_vocab(train_set, max_size=max_vocab, min_freq=min_freq)

In [5]:
from torchtext import data, datasets

# NOTE(review): presumably the vocabulary indices of the padding,
# beginning-of-sentence ('<BOS>') and end-of-sentence ('<EOS>') tokens;
# nothing in this cell reads them, so confirm against the built vocab.
PAD, BOS, EOS = 1, 2, 3

class Dataloader():
    """Build torchtext fields and bucketed iterators for a parallel corpus.

    Two fields are always created: ``src`` and ``tgt``. When ``train_fn``,
    ``valid_fn`` and ``exts`` are all given, the corpora are loaded through
    ``TranslationDataset``, ``train_iter``/``valid_iter`` are created, and
    both vocabularies are built from the training set. Otherwise only the
    fields exist and vocabularies can be attached with ``load_vocab``.

    Args:
        train_fn: Path prefix of the training corpus, or ``None``.
        valid_fn: Path prefix of the validation corpus, or ``None``.
        batch_size: Batch size handed to both iterators.
        device: Either a device string such as ``'cpu'``/``'cuda:0'`` or an
            integer index (negative selects ``'cpu'``, otherwise
            ``'cuda:<index>'``).
        max_vocab: ``max_size`` used when building both vocabularies.
        max_length: Forwarded to ``TranslationDataset`` and used as the
            weight of the source length in the bucketing sort key.
        fix_length: ``fix_length`` of both fields.
        use_bos: When true, the target field uses ``'<BOS>'`` as init token.
        use_eos: When true, the target field uses ``'<EOS>'`` as EOS token.
        shuffle: Whether the training iterator shuffles (the validation
            iterator never does).
        exts: Pair of file extensions (source, target) forwarded to
            ``TranslationDataset``, or ``None``.
        dsl: When true, the source field also gets ``'<BOS>'``/``'<EOS>'``.
    """

    def __init__(self,
                 train_fn=None,
                 valid_fn=None,
                 batch_size=64,
                 device='cpu',
                 max_vocab=99999999,
                 max_length=255,
                 fix_length=None,
                 use_bos=True,
                 use_eos=True,
                 shuffle=True,
                 exts=None,
                 dsl=False
                 ):

        super(Dataloader, self).__init__()

        self.src = data.Field(sequential=True,
                              use_vocab=True,
                              batch_first=True,
                              include_lengths=True,
                              fix_length=fix_length,
                              init_token='<BOS>' if dsl else None,
                              eos_token='<EOS>' if dsl else None
                              )

        self.tgt = data.Field(sequential=True,
                              use_vocab=True,
                              batch_first=True,
                              include_lengths=True,
                              fix_length=fix_length,
                              init_token='<BOS>' if use_bos else None,
                              eos_token='<EOS>' if use_eos else None
                              )

        if train_fn is not None and valid_fn is not None and exts is not None:
            train = TranslationDataset(path=train_fn,
                                       exts=exts,
                                       fields=[('src', self.src),
                                               ('tgt', self.tgt)
                                               ],
                                       max_length=max_length
                                       )
            valid = TranslationDataset(path=valid_fn,
                                       exts=exts,
                                       fields=[('src', self.src),
                                               ('tgt', self.tgt)
                                               ],
                                       max_length=max_length
                                       )

            device_name = self._device_name(device)

            # Source length dominates the key (weighted by max_length), with
            # target length ordering examples of equal source length.
            def length_key(example):
                return len(example.tgt) + (max_length * len(example.src))

            self.train_iter = data.BucketIterator(train,
                                                  batch_size=batch_size,
                                                  device=device_name,
                                                  shuffle=shuffle,
                                                  sort_key=length_key,
                                                  sort_within_batch=True
                                                  )

            self.valid_iter = data.BucketIterator(valid,
                                                  batch_size=batch_size,
                                                  device=device_name,
                                                  shuffle=False,
                                                  sort_key=length_key,
                                                  sort_within_batch=True
                                                  )

            # Vocabularies come from the training set only.
            self.src.build_vocab(train, max_size=max_vocab)
            self.tgt.build_vocab(train, max_size=max_vocab)

    @staticmethod
    def _device_name(device):
        """Map an integer device index to a device string; pass others through."""
        if isinstance(device, int):
            return 'cuda:%d' % device if device >= 0 else 'cpu'
        return device

    def load_vocab(self, src_vocab, tgt_vocab):
        """Attach already-built vocabularies to the source and target fields."""
        self.src.vocab = src_vocab
        self.tgt.vocab = tgt_vocab
        
class TranslationDataset(data.Dataset):
    """Dataset of sentence pairs read from two line-aligned text files."""

    @staticmethod
    def sort_key(ex):
        """Return a bucketing key built from the source and target lengths.

        The target attribute is named ``trg`` under this class's default
        field naming, but callers may supply their own field names; ``tgt``
        is accepted as a fallback so the key works for both.
        """
        target = ex.trg if hasattr(ex, 'trg') else ex.tgt
        return data.interleave_keys(len(ex.src), len(target))

    def __init__(self, path, exts, fields, max_length=None, **kwargs):
        """Read both sides of the corpus and build the example list.

        Args:
            path: Common path prefix of the two files; a trailing ``'.'`` is
                appended when missing.
            exts: Two extensions (source, target) appended to ``path``.
            fields: Either a list of ``(name, field)`` pairs, or a pair of
                bare fields which are then named ``'src'`` and ``'trg'``.
            max_length: When truthy, pairs in which either side has more
                whitespace-separated tokens than this are skipped.
            **kwargs: Forwarded to the base dataset constructor.
        """
        # Imported locally so this class does not rely on another notebook
        # cell having imported ``os``.
        import os

        if not isinstance(fields[0], (tuple, list)):
            fields = [('src', fields[0]), ('trg', fields[1])]

        if not path.endswith('.'):
            path += '.'

        src_path, trg_path = tuple(os.path.expanduser(path + x) for x in exts)

        examples = []
        with open(src_path, encoding='utf-8') as src_file, \
                open(trg_path, encoding='utf-8') as trg_file:
            for src_line, trg_line in zip(src_file, trg_file):
                src_line, trg_line = src_line.strip(), trg_line.strip()

                # Pairs with an empty side carry no training signal.
                if src_line == '' or trg_line == '':
                    continue

                # Drop pairs where either side exceeds the length limit.
                if max_length and max_length < max(len(src_line.split()),
                                                   len(trg_line.split())
                                                   ):
                    continue

                examples.append(data.Example.fromlist([src_line, trg_line], fields))

        super().__init__(examples, fields, **kwargs)