In [1]:
from collections import defaultdict

import torch.nn as nn
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import random
import numpy as np

# data를 읽어옴
def read_txt(path_to_file):
    txt_ls = []
    label_ls = []

    with open(path_to_file) as f:
        for i, line in enumerate(f.readlines()[1:]):
            id_num, txt, label = line.split('\t')
            txt_ls.append(txt)
            label_ls.append(int(label.replace('\n','')))
    return txt_ls, label_ls

def add_start_end_token(token_ls):
    sos = '<SOS>'
    eos = '<EOS>'
    
    return_ls = []
    sequence_length = [] # Packing을 위해, original sequence length를 기록
    
    for tokens in token_ls:
        tokens = [sos] + tokens + [eos] # text
        return_ls.append(tokens)
        sequence_length.append(len(tokens))
    
    return return_ls, sequence_length


# Sequence Length를 맞추기 위한 padding
def add_padding(token_ls, max_len):
    pad = '<PAD>'

    for i, tokens in enumerate(token_ls):
        if len(tokens)< max_len:
            token_ls[i] += [pad] * (max_len - len(tokens))
    
        elif len(tokens) > max_len:
            token_ls[i] = tokens[:max_len]    
    return token_ls


# 단어에 대한 idx 부여
def convert_token_to_idx(token_ls):
     
    for tokens in token_ls:
        yield [token2idx[token] for token in tokens.split(' ')]
    return
        
# torch Variable로 변환
def convert_to_variable(x):
    return Variable(torch.LongTensor(x))

## Data loading

In [2]:
x_train, y_train = read_txt('../ratings_train.txt')
x_test, y_test = read_txt('../ratings_test.txt')

In [3]:
x_train = [x.split() for x in x_train]
x_test = [x.split() for x in x_test]

In [4]:
x_train[0]

['아', '더빙..', '진짜', '짜증나네요', '목소리']

## SOS EOS 토큰 추가

In [5]:
token2idx = defaultdict(lambda : len(token2idx))
pad = token2idx['<PAD>'] #0
sos = token2idx['<SOS>'] #1
eos = token2idx['<EOS>'] #2

In [6]:
x_train, x_train_sequence_length = add_start_end_token(x_train)
x_test, x_test_sequence_length = add_start_end_token(x_test)

In [7]:
x_train[0]

['<SOS>', '아', '더빙..', '진짜', '짜증나네요', '목소리', '<EOS>']

In [8]:
x_train_sequence_length[0]

7

## Original Sequence_length 계산 

In [9]:
max_sequence_length = 30
x_train = add_padding(x_train, max_sequence_length)
x_test = add_padding(x_test, max_sequence_length)

x_train = list(convert_token_to_idx(x_train))
x_test = list(convert_token_to_idx(x_test))

NameError: name 'sent' is not defined

## Converting to Variable 

In [24]:
x_train = convert_to_variable(x_train)
x_test = convert_to_variable(x_test)

y_train = convert_to_variable(y_train)
y_test = convert_to_variable(y_test)

ValueError: expected sequence of length 7 at dim 1 (got 6)

In [20]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hid_size):
        super(RNN, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.embed = nn.Embedding(vocab_size, embed_size)
        
        self.hid_size = hid_size
        
        self.rnn = nn.RNN(embed_size, hid_size, num_layers=1, batch_first=True)
        self.outputs = []
        
    def forward(self, x):
        embeded = self.embed(x) # batch_size, sequence_length, embed_size
        return
        

In [21]:
params = {
    'vocab_size' : len(token2idx),
    'embed_size' : 5,
    'hid_size' : 5,
}

In [22]:
model = RNN(**params)

In [23]:
model(x_train[0])

TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not list