In [48]:
#!pip install torchtext torchdata datasets spacy
#!python -m spacy download en_core_web_md


In [49]:
# # Using TPU
# !pip install torchtext torchdata cloud-tpu-client==0.10 torch==1.12.1 https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-1.12-cp37-cp37m-linux_x86_64.whl
# import torch_xla
# import torch_xla.core.xla_model as xm
# device = xm.xla_device()


In [50]:
import functools
import datasets
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f'selected device: {device}')
tokenizer = get_tokenizer('spacy', 'en_core_web_md')
train_data, test_data = datasets.load_dataset('imdb', split=['train', 'test'])


def tokenize_data(example, tokenizer):
    tokens = {'tokens': tokenizer(example['text'])}
    return tokens


train_data = train_data.map(tokenize_data, fn_kwargs={'tokenizer': tokenizer})
test_data = test_data.map(tokenize_data, fn_kwargs={'tokenizer': tokenizer})
print(train_data, test_data)

test_size = 0.2
train_valid_data = train_data.train_test_split(test_size=test_size)
train_data = train_valid_data['train']
valid_data = train_valid_data['test']

min_freq = 5
special_tokens = ['<unk>', '<pad>']

vocab = build_vocab_from_iterator(
    train_data['tokens'], min_freq=min_freq, specials=special_tokens
)

print(vocab(['here', 'is', 'an', 'example']))

unk_index = vocab['<unk>']
pad_index = vocab['<pad>']
vocab.set_default_index(unk_index)

text_pipeline = lambda x: vocab(tokenize_data(x))
label_pipeline = lambda x: 1 if x == 'pos' else 0

selected device: cuda:0


Reusing dataset imdb (C:\Users\sms20\.cache\huggingface\datasets\imdb\plain_text\1.0.0\2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/2 [00:00<?, ?it/s]

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

Dataset({
    features: ['text', 'label', 'tokens'],
    num_rows: 25000
}) Dataset({
    features: ['text', 'label', 'tokens'],
    num_rows: 25000
})
[161, 9, 43, 489]


In [51]:
def numericalize_data(example, vocab):
    ids = {'ids': [vocab[token] for token in example['tokens']]}
    return ids


train_data = train_data.map(numericalize_data, fn_kwargs={'vocab': vocab})
valid_data = valid_data.map(numericalize_data, fn_kwargs={'vocab': vocab})
test_data = test_data.map(numericalize_data, fn_kwargs={'vocab': vocab})

train_data.set_format(type='torch', columns=['ids', 'label'])
valid_data.set_format(type='torch', columns=['ids', 'label'])
test_data.set_format(type='torch', columns=['ids', 'label'])

next(iter(test_data))
print(next(iter(test_data)))

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

{'label': tensor(0), 'ids': tensor([   11,   142,  1335,    17,  1301,     5,   267,  1724,     8,   299,
           64,    21,     6,   193,     4,  2614,    17,  1301,   117,   163,
          268,    32,   721,     0,     3,   518,    17,  2542,     5,  8282,
            4,    11,   827,     8,    47,    16,     3,    11,    74,    86,
            3,    26,    12,     9,     8,    60,   268,  1335,    17,  1301,
           20, 19405,   691,     9,     8,  1153,  2471,    27,     2,   217,
           28,     4, 13084, 23390,     3,   717,  3954,   750,     3,  4902,
         4634,     3,  4869,    13,    85,    30,  1141,     2,  1040,     3,
            5,  2235,    38,    17,  2035,   120,    69,    33,    36,  3375,
           21,     6,    50,  1335,    17,  1301,    50,   978,     4,    27,
           11,   171,   305,    67,    32,   173,     7,    31,    55,    67,
           44,   119, 19405,   691,     9,    60,  1335,    17,  1301,   268,
            4,    54,    15,    33, 

In [52]:
# Hyperparameters
BATCH_SIZE = 64
LR = 0.0005  # learning rate
EPOCHS = 12  # epoch

n_classes = 2
vocab_size = len(vocab)
embed_dim = 128  # 각 단어를 128차원으로 조정(임베딩 계층을 통과한 후 각 벡터의 크기)
hidden_dim = 256
n_layers = 1


In [53]:
class BasicRNN(nn.Module):
    def __init__(
        self,
        n_layers,
        hidden_dim,
        n_vocab,
        embed_dim,
        n_classes,
        pad_index,
        dropout_p=0.2,
    ):
        super(BasicRNN, self).__init__()
        self.n_layers = n_layers  # ------ RNN 계층에 대한 개수
        self.embed = nn.Embedding(n_vocab, embed_dim, pad_index)  # ------ 워드 임베딩 적용
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(dropout_p)  # ------ 드롭아웃 적용
        self.rnn = nn.RNN(
            embed_dim, self.hidden_dim, num_layers=self.n_layers, batch_first=True
        )
        self.out = nn.Linear(self.hidden_dim, n_classes)

    def forward(self, text):
        x = self.embed(text)
        h_0 = self._init_state(batch_size=x.size(0))
        x, _ = self.rnn(x, h_0)
        h_t = x[:, -1, :]
        self.dropout(h_t)
        logit = torch.sigmoid(self.out(h_t))
        return logit

    def _init_state(self, batch_size=1):
        weight = next(self.parameters()).data  # ------ 모델의 파라미터 값을 가져와서 weight 변수에 저장
        return weight.new(
            self.n_layers, batch_size, self.hidden_dim
        ).zero_()  # ------ 크기가 (계층의 개수, 배치 크기, 은닉층의 뉴런/유닛 개수)인 은닉 상태(텐서)를 생성하여 0으로 초기화한 후 반환


In [54]:
model = BasicRNN(
    n_layers=n_layers,
    hidden_dim=hidden_dim,
    n_vocab=vocab_size,
    embed_dim=embed_dim,
    n_classes=n_classes,
    pad_index=pad_index,
    dropout_p=0.5,
).to(device)

criterion = torch.nn.CrossEntropyLoss().to(device)  # loss function
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 5, gamma=0.1)


def collate(batch, pad_index):
    batch_ids = [b['ids'] for b in batch]
    batch_ids = nn.utils.rnn.pad_sequence(
        batch_ids, padding_value=pad_index, batch_first=True
    )
    batch_label = [b['label'] for b in batch]
    batch_label = torch.stack(batch_label)

    batch = {'ids': batch_ids, 'label': batch_label}
    return batch


collate = functools.partial(collate, pad_index=pad_index)

# Refill Generators & Put in the DataLoader
train_iter, valid_iter, test_iter = train_data, valid_data, test_data

train_dataloader, valid_dataloader, test_dataloader = (
    DataLoader(train_iter, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate),
    DataLoader(valid_iter, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate),
    DataLoader(test_iter, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate),
)


In [55]:
def train(dataloader, model, criterion, optimizer, device):
    model.train()
    epoch_loss = 0
    epoch_accuracy = 0

    for batch in dataloader:
        tokens = batch['ids'].to(device)
        labels = batch['label'].to(device)
        predictions = model(tokens)
        loss = criterion(predictions, labels)
        accuracy = get_accuracy(predictions, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_accuracy += accuracy.item()

    return epoch_loss / len(dataloader), epoch_accuracy / len(dataloader)


def evaluate(dataloader, model, criterion, device):
    model.eval()
    epoch_loss = 0
    epoch_accuracy = 0

    with torch.no_grad():
        for batch in dataloader:
            tokens = batch['ids']
            labels = batch['label']
            predictions = model(tokens)
            loss = criterion(predictions, labels)
            accuracy = get_accuracy(predictions, labels)
            epoch_loss += loss.item()
            epoch_accuracy += accuracy.item()

    return epoch_loss / len(dataloader), epoch_accuracy / len(dataloader)


def get_accuracy(predictions, labels):
    batch_size = predictions.shape[0]
    predicted_classes = predictions.argmax(1, keepdim=True)
    correct_predictions = predicted_classes.eq(labels.view_as(predicted_classes)).sum()
    accuracy = correct_predictions.float() / batch_size
    return accuracy


for epoch in range(EPOCHS):
    train_loss, train_acc = train(train_dataloader, model, criterion, optimizer, device)
    valid_loss, valid_acc = evaluate(valid_dataloader, model, criterion, device)

    print(f'epoch: {epoch+1}')
    print(f'train_loss: {train_loss:.3f}, train_acc: {train_acc:.3f}')
    print(f'valid_loss: {valid_loss:.3f}, valid_acc: {valid_acc:.3f}')


RuntimeError: CUDA out of memory. Tried to allocate 1.32 GiB (GPU 0; 2.00 GiB total capacity; 261.33 MiB already allocated; 0 bytes free; 1.54 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

: 

In [None]:
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(1, 1, 1)
ax.plot(train_losses, label='train loss')
ax.plot(valid_losses, label='valid loss')
plt.legend()
ax.set_xlabel('updates')
ax.set_ylabel('loss')


In [None]:
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(1, 1, 1)
ax.plot(train_accs, label='train accuracy')
ax.plot(valid_accs, label='valid accuracy')
plt.legend()
ax.set_xlabel('updates')
ax.set_ylabel('accuracy')


In [None]:
test_loss, test_acc = evaluate(test_dataloader, model, criterion, device)

epoch_test_loss = np.mean(test_loss)
epoch_test_acc = np.mean(test_acc)

print(f'test_loss: {epoch_test_loss:.3f}, test_acc: {epoch_test_acc:.3f}')


In [None]:
def predict_sentiment(text, model, tokenizer, vocab, device):
    tokens = tokenizer(text)
    ids = [vocab[t] for t in tokens]
    tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)
    prediction = model(tensor).squeeze(dim=0)
    probability = torch.softmax(prediction, dim=-1)
    predicted_class = prediction.argmax(dim=-1).item()
    predicted_probability = probability[predicted_class].item()
    return predicted_class, predicted_probability


text = "This film is terrible!"
predict_sentiment(text, model, tokenizer, vocab, device)

text = "This film is great!"
predict_sentiment(text, model, tokenizer, vocab, device)


In [None]:
IMDB_label = {0: "neg", 1: "pos"}


def predict(text, text_pipeline):
    with torch.no_grad():
        text = torch.tensor(text_pipeline(text))
        output = model(text.view)
        return output.argmax(1).item()


ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."

model = model.to('cpu')

print(f"This is a {predict(ex_text_str, text_pipeline)} mood")

en_text_str = """I'm struggling to finish this

I'm 6 episodes in and this series feels kind of off. Having read the comics, the characters don't seem like theyre the same personalities. It's very slow, and the dialogue is terrible. The only thing keeping me watching at this point is a hope that I can see more of the endless (especially delirium).
Netflix should stay away from making anymore adaptations (especially after the cowboy bebop flop). I really think Gaiman made a mistake choosing Netflix, and I hope to live long enough to see another company remake it.
Ill update my review if the show is any better once I finish."""

print(f"This is a {IMDB_label[predict(ex_text_str, text_pipeline)]} mood")


RuntimeError: For unbatched 2-D input, hx should also be 2-D but got 3-D tensor