In [1]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import ngrams_iterator

In [12]:
import csv

# Read the training split: column 0 of every CSV row is the class label (kept as a
# string), the remaining columns are the text fields of the sample.
labels = []
texts = []
# newline='' is what the csv module requires for files handed to csv.reader;
# without it, newlines embedded inside quoted fields are not interpreted correctly.
with open('yahoo_answers_csv/train.csv', 'r', encoding='UTF8', newline='') as fr:
    reader = csv.reader(fr)
    for line in reader:
        labels.append(line[0])
        texts.append(line[1:])
print(labels[:2])
print(texts[:2])

['5', '6']
[["why doesn't an optical mouse work on a glass table?", 'or even on some surfaces?', 'Optical mice use an LED and a camera to rapidly capture images of the surface beneath the mouse.  The infomation from the camera is analyzed by a DSP (Digital Signal Processor) and used to detect imperfections in the underlying surface and determine motion. Some materials, such as glass, mirrors or other very shiny, uniform surfaces interfere with the ability of the DSP to accurately analyze the surface beneath the mouse.  \\nSince glass is transparent and very uniform, the mouse is unable to pick up enough imperfections in the underlying surface to determine motion.  Mirrored surfaces are also a problem, since they constantly reflect back the same image, causing the DSP not to recognize motion properly. When the system is unable to see surface changes associated with movement, the mouse will not work properly.'], ['What is the best off-road motorcycle trail ?', 'long-distance trail throug

In [16]:
# Flatten every training row into one string: its first three text fields,
# separated by single spaces.
text = [fields[0] + ' ' + fields[1] + ' ' + fields[2] for fields in texts]

text[:2]

["why doesn't an optical mouse work on a glass table? or even on some surfaces? Optical mice use an LED and a camera to rapidly capture images of the surface beneath the mouse.  The infomation from the camera is analyzed by a DSP (Digital Signal Processor) and used to detect imperfections in the underlying surface and determine motion. Some materials, such as glass, mirrors or other very shiny, uniform surfaces interfere with the ability of the DSP to accurately analyze the surface beneath the mouse.  \\nSince glass is transparent and very uniform, the mouse is unable to pick up enough imperfections in the underlying surface to determine motion.  Mirrored surfaces are also a problem, since they constantly reflect back the same image, causing the DSP not to recognize motion properly. When the system is unable to see surface changes associated with movement, the mouse will not work properly.",
 'What is the best off-road motorcycle trail ? long-distance trail throughout CA i hear that th

In [17]:
tokenizer = get_tokenizer('basic_english')


def yield_tokens(data):
    """Lazily produce, for each string in ``data``, the list of its n-gram tokens.

    Each string is tokenized with the module-level ``tokenizer`` and the tokens
    are expanded with ``ngrams_iterator(..., 2)`` (the tokens plus their bigrams).
    One list is yielded per input string.
    """
    for sentence in data:
        unigrams = tokenizer(sentence)
        with_bigrams = list(ngrams_iterator(unigrams, 2))
        yield with_bigrams

# Vocabulary over the n-gram tokens of the training text. min_freq=3 is the minimum
# frequency a token needs to be included; anything not in the vocabulary maps to
# the "<unk>" index via the default index set below.
vocab = build_vocab_from_iterator(
    yield_tokens(text),
    min_freq=3,
    specials=["<unk>"],
)
vocab.set_default_index(vocab["<unk>"])


def text_pipeline(x):
    """Convert a raw string into the list of vocabulary ids of its n-gram tokens."""
    return vocab(list(ngrams_iterator(tokenizer(x), 2)))


def label_pipeline(x):
    """Convert a label string into an integer class id shifted down by one ('1' -> 0)."""
    return int(x) - 1


In [18]:
# Sanity check: look up the vocabulary ids of four single-word tokens.
vocab(['here', 'is', 'an', 'example'])

[203, 11, 63, 583]

In [19]:
from torch.utils.data import DataLoader

def collate_batch(batch):
    """Turn a list of ``(label, text)`` samples into EmbeddingBag-style tensors.

    Args:
        batch: iterable of ``(raw_label, raw_text)`` pairs; each element is passed
            through ``label_pipeline`` / ``text_pipeline`` respectively.

    Returns:
        A tuple ``(labels, tokens, offsets)`` of int64 tensors:
        ``labels``  - one class id per sample,
        ``tokens``  - the token ids of all samples concatenated into one 1-D tensor,
        ``offsets`` - the index in ``tokens`` at which each sample starts.
    """
    targets = []
    id_chunks = []
    lengths = []
    for raw_label, raw_text in batch:
        targets.append(label_pipeline(raw_label))
        ids = torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
        id_chunks.append(ids)
        lengths.append(ids.size(0))

    labels_tensor = torch.tensor(targets, dtype=torch.int64)
    # Sample i starts where samples 0..i-1 end: the cumulative sum of the preceding
    # lengths, with a leading 0 and the last length dropped.
    starts = [0, *lengths][:-1]
    offsets = torch.tensor(starts).cumsum(dim=0)
    # All token ids of the batch go into one flat tensor; the offsets tell
    # nn.EmbeddingBag where each sample begins.
    flat_tokens = torch.cat(id_chunks)

    return labels_tensor, flat_tokens, offsets

In [35]:
from torch import nn

class FastText(nn.Module):
    """Bag-of-n-grams classifier: EmbeddingBag -> Dropout -> Linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_size: dimensionality of each embedding vector.
        num_class: number of output logits.
        dropout_p: dropout probability applied to the pooled embedding.
    """

    def __init__(self, vocab_size, embedding_size, num_class, dropout_p):
        super().__init__()
        # sparse=True makes the embedding produce sparse gradients.
        self.embedding = nn.EmbeddingBag(vocab_size, embedding_size, sparse=True)
        nn.init.normal_(self.embedding.weight, mean=0.0, std=0.5)

        self.dropout = nn.Dropout(p=dropout_p)

        self.linear = nn.Linear(embedding_size, num_class, bias=True)
        nn.init.normal_(self.linear.weight, mean=0.0, std=0.5)
        nn.init.zeros_(self.linear.bias)

    def forward(self, text, offsets):
        """Return class logits for a flattened batch.

        ``text`` holds the token ids of all samples back to back and ``offsets``
        the start index of each sample, as expected by ``nn.EmbeddingBag``.
        """
        pooled = self.embedding(text, offsets)
        return self.linear(self.dropout(pooled))

In [21]:
import time
from torch.nn.utils import clip_grad_norm_

log_interval = 3000  # print the running training accuracy every this many batches


def train(model, dataloader, criterion, optimizer, scheduler, clip):
    """Run one pass over ``dataloader`` and return the training accuracy.

    Args:
        model: module called as ``model(text, offsets)`` and returning logits of
            shape (batch, num_classes).
        dataloader: iterable of ``(label, text, offsets)`` batches.
        criterion: loss called as ``criterion(logits, label)``.
        optimizer: optimizer over ``model.parameters()``.
        scheduler: learning-rate scheduler; it is stepped once per *batch*, so its
            step size counts batches, not epochs.
        clip: maximum L2 norm passed to ``clip_grad_norm_``.

    Returns:
        Fraction of correctly classified samples accumulated over the whole pass
        (predictions are taken in training mode), or ``0.0`` when the dataloader
        yields no samples.
    """
    model.train()
    acc, count = 0, 0
    s_time = time.time()
    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)  # |predicted_label| = (batch, num_classes)
        loss = criterion(predicted_label, label)

        loss.backward()
        clip_grad_norm_(model.parameters(), clip, norm_type=2)
        optimizer.step()
        scheduler.step()

        # Each correct prediction counts as 1; accumulate over the batch.
        acc += (predicted_label.argmax(1) == label).sum().item()
        # Batches may differ in size, so count the samples actually seen.
        count += label.size(0)

        if idx % log_interval == 0 and idx > 0:
            elapsed = (time.time() - s_time)
            print('accuracy: {}, time: {}[s]'.format(acc/count, int(elapsed)))
            s_time = time.time()
    # An empty dataloader would otherwise raise ZeroDivisionError here.
    return acc / count if count else 0.0

def evaluate(model, dataloader):
    """Return the classification accuracy of ``model`` over ``dataloader``.

    The model is switched to eval mode and run without gradient tracking.

    Args:
        model: module called as ``model(text, offsets)`` and returning logits of
            shape (batch, num_classes).
        dataloader: iterable of ``(label, text, offsets)`` batches.

    Returns:
        Fraction of samples whose arg-max logit equals the label, or ``0.0`` when
        the dataloader yields no samples.
    """
    model.eval()
    v_total_acc, v_total_count = 0, 0

    with torch.no_grad():
        for (v_label, v_text, v_offsets) in dataloader:
            v_predicted_label = model(v_text, v_offsets)
            v_total_acc += (v_predicted_label.argmax(1) == v_label).sum().item()
            v_total_count += v_label.size(0)

    # An empty dataloader would otherwise raise ZeroDivisionError here.
    return v_total_acc / v_total_count if v_total_count else 0.0


In [22]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
import os

In [29]:
# Number of target classes = number of distinct label strings in the training split.
num_class = len(set(labels))
print(num_class)

# Directory and file the best checkpoint is written to. makedirs creates missing
# parent directories too (plain os.mkdir fails when './saved_model' does not exist)
# and exist_ok=True makes the cell safe to re-run.
save_dir = './saved_model/YahA'
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, 'ckpt.pth')

10


In [30]:
# Read the test split: column 0 is the class label (kept as a string); columns 1-3
# are joined with single spaces into one text string per sample.
test_labels = []
test_text = []
# newline='' is what the csv module requires for files handed to csv.reader;
# without it, newlines embedded inside quoted fields are not interpreted correctly.
with open('yahoo_answers_csv/test.csv', 'r', encoding='UTF8', newline='') as fr:
    reader = csv.reader(fr)
    for line in reader:
        test_labels.append(line[0])
        temp = line[1] +' ' + line[2] + ' ' + line[3]
        test_text.append(temp)

print(test_labels[:2])
print(test_text[:2])

['9', '2']
["What makes friendship click? How does the spark keep going? good communication is what does it.  Can you move beyond small talk and say what's really on your mind.  If you start doing this, my expereince is that potentially good friends will respond or shun you.  Then you know who the really good friends are.", 'Why does Zebras have stripes? What is the purpose or those stripes? Who do they serve the Zebras in the wild life? this provides camouflage - predator vision is such that it is usually difficult for them to see complex patterns']


In [40]:
# Pair every label with its text as a two-element list [label, text]; this is the
# (label, text) sample layout that collate_batch unpacks.
train_dataset = [[labels[idx], text[idx]] for idx in range(len(labels))]
test_dataset = [[test_labels[idx], test_text[idx]] for idx in range(len(test_labels))]

print(len(train_dataset))
print(len(test_dataset))

1400000
60000


In [42]:
# Inspect the first two [label, text] samples of the test set.
test_dataset[:2]

[['9',
  "What makes friendship click? How does the spark keep going? good communication is what does it.  Can you move beyond small talk and say what's really on your mind.  If you start doing this, my expereince is that potentially good friends will respond or shun you.  Then you know who the really good friends are."],
 ['2',
  'Why does Zebras have stripes? What is the purpose or those stripes? Who do they serve the Zebras in the wild life? this provides camouflage - predator vision is such that it is usually difficult for them to see complex patterns']]

In [43]:
# Seed the global RNG (used for weight initialisation and DataLoader shuffling) so
# that a fresh Restart & Run All reproduces the same run.
seed = 42
torch.manual_seed(seed)

# --- Model ---
vocab_size = len(vocab)
embedding_size = 64
dropout_p = 0.2
model = FastText(vocab_size, embedding_size, num_class, dropout_p)

# --- Optimisation hyperparameters ---
max_epoch = 5
lr = 0.1
lr_decay = 0.99
step_size = 1000  # train() steps the scheduler once per batch, so this counts batches
batch_size = 64
clip = 3  # max gradient L2 norm handed to clip_grad_norm_ in train()

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adagrad(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size, gamma=lr_decay)

# --- Data: 95% / 5% train/validation split of the training set ---
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(
    train_dataset,
    [num_train, len(train_dataset) - num_train],
    # Dedicated generator: the split stays the same regardless of how much of the
    # global RNG was consumed before this point.
    generator=torch.Generator().manual_seed(seed),
)

train_dataloader = DataLoader(split_train_, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
# Accuracy does not depend on sample order, so evaluation loaders are not shuffled.
valid_dataloader = DataLoader(split_valid_, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [44]:
def run_train():
    """Train for ``max_epoch`` epochs, checkpointing the best validation accuracy.

    Uses the notebook-level ``model``, ``train_dataloader``, ``valid_dataloader``,
    ``criterion``, ``optimizer``, ``scheduler``, ``clip``, ``max_epoch`` and
    ``save_path``. After every epoch the validation accuracy is printed; whenever
    it exceeds the best value seen so far, the epoch number and the model's
    ``state_dict`` are saved to ``save_path``.
    """
    best_accu = 0

    for epoch in range(0, max_epoch):
        print('-' * 10 + 'epoch: {}/{}'.format(epoch+1, max_epoch))
        epoch_start_time = time.time()
        train(model, train_dataloader, criterion, optimizer, scheduler, clip)
        accu_val = evaluate(model, valid_dataloader)
        if best_accu < accu_val:
            best_accu = accu_val
            torch.save({'epoch': epoch + 1, 'model_state_dict': model.state_dict()}, save_path)
        # Report every epoch, not only the improving ones; otherwise epochs whose
        # validation accuracy drops leave no trace in the log.
        print('-' * 59)
        print('epoch {:3d} | time: {:5.2f}s | valid accuracy: {:8.3f} '.format(epoch+1, time.time() - epoch_start_time, accu_val))
        print('-' * 59)

# Start training; in the recorded run below one epoch took roughly 1060 s.
run_train()

----------epoch: 1/5
accuracy: 0.6673400533155615, time: 151[s]
accuracy: 0.6860653432761207, time: 150[s]
accuracy: 0.6946190006665925, time: 151[s]
accuracy: 0.700048433463878, time: 149[s]
accuracy: 0.7037926638224118, time: 152[s]
accuracy: 0.70671160629965, time: 151[s]
-----------------------------------------------------------
epoch   1 | time: 1059.75s | valid accuracy:    0.726 
-----------------------------------------------------------
----------epoch: 2/5
accuracy: 0.788252874041986, time: 151[s]
accuracy: 0.7894908140309949, time: 151[s]
accuracy: 0.7897733585157205, time: 151[s]
accuracy: 0.7901750374968752, time: 150[s]
accuracy: 0.7906962452503167, time: 152[s]
accuracy: 0.7909603980334425, time: 152[s]
-----------------------------------------------------------
epoch   2 | time: 1062.13s | valid accuracy:    0.728 
-----------------------------------------------------------
----------epoch: 3/5
accuracy: 0.8299629290236588, time: 151[s]
accuracy: 0.8305777162139644, ti

In [45]:
# Test accuracy of the model currently in memory, i.e. the weights left by the
# last training epoch. run_train() never reloads the best checkpoint, so this is
# not necessarily the best-validation model.
accuracy = evaluate(model, test_dataloader)

print("Accuracy: ", accuracy)

Accuracy:  0.7234166666666667


In [46]:
# Reload the checkpoint file under save_dir that run_train() wrote for the epoch
# with the best validation accuracy, then evaluate those weights on the test set.
load_path = save_dir + '/ckpt.pth'
checkpoint = torch.load(load_path)

# Overwrites the in-memory weights of `model` with the checkpointed ones.
model.load_state_dict(checkpoint['model_state_dict'])
epoch = checkpoint['epoch']  # 1-based epoch number stored alongside the weights
print('epoch: ', epoch)
accuracy = evaluate(model, test_dataloader)

print("Accuracy: ", accuracy)

epoch:  2
Accuracy:  0.7300833333333333
