In [2]:
import torch
# from torchtext.datasets import YelpReviewFull
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import ngrams_iterator

In [11]:
import csv

labels = []
texts = []
with open('yelp_review_full_csv/train.csv', 'r') as fr:
    reader = csv.reader(fr)
    for line in reader:
        labels.append(line[0])
        texts.append(line[1])
print(labels[:3])
print(texts[:3])

['5', '2', '4']
["dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.", "Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, y

In [39]:
test_labels = []
test_texts = []
with open('yelp_review_full_csv/test.csv', 'r') as fr:
    reader = csv.reader(fr)
    for line in reader:
        test_labels.append(line[0])
        test_texts.append(line[1])

In [13]:
tokenizer = get_tokenizer('basic_english')
# train_iter = YelpReviewFull(split='train')

def yield_tokens(data):
    for text in data:
        tokens = tokenizer(text)
        yield list(ngrams_iterator(tokens, 2))

vocab = build_vocab_from_iterator(yield_tokens(texts), min_freq=5, specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

text_pipeline = lambda x: vocab(list(ngrams_iterator(tokenizer(x), 2)))
label_pipeline = lambda x: int(x) - 1


In [19]:
vocab(['here', 'is', 'an', 'example'])

[58, 14, 83, 5073]

In [20]:
from torch.utils.data import DataLoader

def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))  # '1', '2', '3', '4' -> [0, 1, 2, 3]
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)  # [475, 21, 30, 5297]
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)  # input의 누적 합계를 반환
    text_list = torch.cat(text_list)  # batch 내의 모든 단어가 일렬로 들어감 -> nn.Embedding 에 들어가기 위해 하나로 합쳐짐

    return label_list, text_list, offsets

In [21]:
from torch import nn

class FastText(nn.Module):
    def __init__(self, vocab_size, embedding_size, num_class, dropout_p):
        super(FastText, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embedding_size, sparse=True)
        self.embedding.weight.data.uniform(-0.5, 0.5)
        self.dropout = nn.Dropout(dropout_p)
        self.linear = nn.Linear(embedding_size, num_class, bias=True)
        self.linear.weight.data.uniform(-0.5,0.5)
        self.linear.bias.data.zero_()

        
    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        embedded = self.dropout(embedded)
        return self.linear(embedded)

In [22]:
import time
from torch.nn.utils import clip_grad_norm_

log_interval = 3000

def train(model, dataloader, criterion, optimizer, scheduler, clip):
    model.train()
    acc, count = 0, 0
    s_time = time.time()
    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)  # |predicted_label| = (batch, num_classes)
        loss = criterion(predicted_label, label)

        loss.backward()
        clip_grad_norm_(model.parameters(), clip, norm_type=2)
        optimizer.step()
        scheduler.step()

        acc += (predicted_label.argmax(1) == label).sum().item()  # 같으면 1 -> 쭉 더함 
        count += label.size(0)  # batch 때문에 size(0)으로 카운트 셈

        if idx % log_interval == 0 and idx > 0:
            elasped = (time.time() - s_time)
            print('accuracy: {}, time: {}[s]'.format(acc/count, int(elasped)))
            s_time = time.time()   
    return acc/count

def evaluate(model, dataloader):
    model.eval()
    v_total_acc, v_total_count = 0, 0

    with torch.no_grad():
        for (v_label, v_text, v_offsets) in dataloader:
            v_predicted_label = model(v_text, v_offsets)
            v_total_acc += (v_predicted_label.argmax(1) == v_label).sum().item()
            v_total_count += v_label.size(0)

    return v_total_acc/v_total_count


In [23]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
import os

In [33]:
for i in range(len(labels)-1):
    num_class = max(label_pipeline(labels[i]), label_pipeline(labels[i+1])) + 1
num_class

5

In [34]:
# train_iter = AmazonReviewFull(split='train')
# num_class = len(set([label for (label, text) in train_iter]))
num_class = 5
print("num_class: ", num_class)

save_dir = './saved_model/YelpF'
if not os.path.exists(save_dir):
    os.mkdir(save_dir)
save_path = os.path.join(save_dir, 'ckpt.pth')

num_class:  5


In [48]:
train_dataset = []
with open('yelp_review_full_csv/train.csv', 'r') as fr:
    reader = csv.reader(fr)
    for line in reader:
        train_dataset.append(line)
print(len(train_dataset))

test_dataset = []
with open('yelp_review_full_csv/test.csv', 'r') as fr:
    reader = csv.reader(fr)
    for line in reader:
        test_dataset.append(line)
print(len(test_dataset))

650000
50000


In [54]:
vocab_size = len(vocab)
embedding_size = 64
dropout_p = 0.2
model = FastText(vocab_size, embedding_size, num_class, dropout_p)
max_epoch = 5
lr = 0.1
lr_decay = 0.99
step_size = 1000
batch_size = 64
clip = 3

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adagrad(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size, gamma=lr_decay)

# train_iter, test_iter = AmazonReviewFull()
# train_dataset = to_map_style_dataset(train_iter)
# test_dataset = to_map_style_dataset(test_iter)
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)

In [55]:
def run_train():
  best_accu = 0

  for epoch in range(0, max_epoch):
    print('-' * 10 + 'epoch: {}/{}'.format(epoch+1, max_epoch))
    epoch_start_time = time.time()
    total_acc = train(model, train_dataloader, criterion, optimizer, scheduler, clip)
    accu_val = evaluate(model, valid_dataloader)
    if best_accu < accu_val:
      best_accu = accu_val
      torch.save({'epoch': epoch + 1, 'model_state_dict': model.state_dict()}, save_path)
      print('-' * 59)
      print('epoch {:3d} | time: {:5.2f}s | valid accuracy: {:8.3f} '.format(epoch+1, time.time() - epoch_start_time, accu_val))
      print('-' * 59)

run_train()

----------epoch: 1/5
accuracy: 0.6047307147617461, time: 184[s]
accuracy: 0.6165300574904182, time: 182[s]
accuracy: 0.622883915676036, time: 181[s]
-----------------------------------------------------------
epoch   1 | time: 594.40s | valid accuracy:    0.641 
-----------------------------------------------------------
----------epoch: 2/5
accuracy: 0.7286529490169943, time: 182[s]
accuracy: 0.7320498666888852, time: 184[s]
accuracy: 0.7342326547050327, time: 186[s]
----------epoch: 3/5
accuracy: 0.8055283655448184, time: 186[s]
accuracy: 0.8064046617230461, time: 194[s]
accuracy: 0.8065891984223975, time: 193[s]
----------epoch: 4/5
accuracy: 0.8637173025658114, time: 193[s]
accuracy: 0.8630150183302783, time: 184[s]
accuracy: 0.8626958115764916, time: 183[s]
----------epoch: 5/5
accuracy: 0.9012881122959013, time: 183[s]
accuracy: 0.9009618188635228, time: 183[s]
accuracy: 0.9001308882346406, time: 182[s]


In [56]:
accuracy = evaluate(model, test_dataloader)

print("Accuracy: ", accuracy)

Accuracy:  0.61544


In [57]:
load_path = save_dir + '/ckpt.pth'
checkpoint = torch.load(load_path)

model.load_state_dict(checkpoint['model_state_dict'])
epoch = checkpoint['epoch']
print('epoch: ', epoch)
accuracy = evaluate(model, test_dataloader)

print("Accuracy: ", accuracy)

epoch:  1
Accuracy:  0.63884
