In [1]:
#!pip install torchtext torchdata datasets


In [2]:
# # Using TPU
# !pip install torchtext torchdata cloud-tpu-client==0.10 torch==1.12.1 https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-1.12-cp37-cp37m-linux_x86_64.whl
# import torch_xla
# import torch_xla.core.xla_model as xm
# device = xm.xla_device()


In [12]:
import functools
import datasets

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time
from torchtext.vocab import Vocab

from torchtext.data.utils import get_tokenizer
import spacy

from torchtext.vocab import build_vocab_from_iterator

from torch.utils.data import DataLoader

spacy_en = spacy.load('en_core_web_md')

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f'selected device: {device}')
tokenizer = get_tokenizer('spacy', 'en_core_web_md')
train_data, test_data = datasets.load_dataset('imdb', split=['train', 'test'])

def tokenize_data(example, tokenizer):
    tokens = {'tokens': tokenizer(example['text'])}
    return tokens
 
def tokenize(example):
    tokens = {'tokens': spacy_en.tokenizer(example['text'])}
    return tokens

#train_data = train_data.map(tokenize_data, fn_kwargs={'tokenizer': tokenizer})
#test_data = test_data.map(tokenize_data, fn_kwargs={'tokenizer': tokenizer})
train_data = train_data.map(tokenize)
test_data = test_data.map(tokenize)
print(train_data, test_data)

test_size = 0.2
train_valid_data = train_data.train_test_split(test_size=test_size)
train_data = train_valid_data['train']
valid_data = train_valid_data['test']

min_freq = 5
special_tokens = ['<unk>', '<pad>']

vocab = build_vocab_from_iterator(
    train_data['tokens'], min_freq=min_freq, specials=special_tokens
)

print(vocab(['here', 'is', 'an', 'example']))

unk_index = vocab['<unk>']
pad_index = vocab['<pad>']
vocab.set_default_index(unk_index)


text_pipeline = lambda x: vocab(tokenize_data(x))
label_pipeline = lambda x: 1 if x == 'pos' else 0


selected device: cuda:0


Reusing dataset imdb (C:\Users\sms20\.cache\huggingface\datasets\imdb\plain_text\1.0.0\2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/2 [00:00<?, ?it/s]

0ex [00:00, ?ex/s]

TypeError: Provided `function` which is applied to all elements of table returns a variable of type <class 'list'>. Make sure provided `function` returns a variable of type `dict` (or a pyarrow table) to update the dataset or `None` if you are only interested in side effects.

In [4]:
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))

    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)


In [5]:
# Refill Generators & Put in the DataLoader
train_iter, test_iter = AG_NEWS(split=('train', 'test'))
BATCH_SIZE = 100
dataloader = DataLoader(
    train_iter, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)


In [6]:
class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)


In [7]:
train_iter = AG_NEWS(split='train')
num_class = len(set([label for (label, text) in train_iter]))
vocab_size = len(vocab)
emsize = 64
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)


In [8]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

# Hyperparameters
EPOCHS = 12  # epoch
LR = 4.8  # learning rate
# BATCH_SIZE = 64

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None
train_iter, test_iter = AG_NEWS(split=('train', 'test'))

train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(
    train_dataset, [num_train, len(train_dataset) - num_train]
)

train_dataloader = DataLoader(
    split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)

valid_dataloader = DataLoader(
    split_valid_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)

test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)


In [9]:
epoch_: int = None


def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches '
                '| accuracy {:8.3f}'.format(
                    epoch, idx, len(dataloader), total_acc / total_count
                )
            )
            total_acc, total_count = 0, 0
            start_time = time.time()


def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)

    return total_acc / total_count


In [10]:
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    epoch_ = epoch
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val

    print(
        '| end of epoch {:3d} | time: {:5.2f}s | '
        'valid accuracy {:8.3f}'.format(epoch, time.time() - epoch_start_time, accu_val)
    )
    print('-' * 59)


| epoch   1 |   500/ 1140 batches | accuracy    0.693
| epoch   1 |  1000/ 1140 batches | accuracy    0.862
| end of epoch   1 | time:  8.63s | valid accuracy    0.886
-----------------------------------------------------------
| epoch   2 |   500/ 1140 batches | accuracy    0.891
| epoch   2 |  1000/ 1140 batches | accuracy    0.899
| end of epoch   2 | time:  7.10s | valid accuracy    0.897
-----------------------------------------------------------
| epoch   3 |   500/ 1140 batches | accuracy    0.909
| epoch   3 |  1000/ 1140 batches | accuracy    0.911
| end of epoch   3 | time:  7.19s | valid accuracy    0.907
-----------------------------------------------------------
| epoch   4 |   500/ 1140 batches | accuracy    0.919
| epoch   4 |  1000/ 1140 batches | accuracy    0.918
| end of epoch   4 | time:  7.10s | valid accuracy    0.911
-----------------------------------------------------------
| epoch   5 |   500/ 1140 batches | accuracy    0.925
| epoch   5 |  1000/ 1140 batches 

In [11]:
print('Checking the results of test dataset.')
accu_test = evaluate(test_dataloader)
print('test accuracy {:8.3f}'.format(accu_test))


Checking the results of test dataset.
test accuracy    0.906


In [12]:
IMDB_label_back = {0: "neg", 1: "pos"}


def predict(text, text_pipeline):
    with torch.no_grad():
        text = torch.tensor(text_pipeline(text))
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item() + 1


ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."

model = model.to('cpu')

print(f"This is a {IMDB_label_back[predict(ex_text_str, text_pipeline)]} mood")

en_text_str = """I'm struggling to finish this

I'm 6 episodes in and this series feels kind of off. Having read the comics, the characters don't seem like theyre the same personalities. It's very slow, and the dialogue is terrible. The only thing keeping me watching at this point is a hope that I can see more of the endless (especially delirium).
Netflix should stay away from making anymore adaptations (especially after the cowboy bebop flop). I really think Gaiman made a mistake choosing Netflix, and I hope to live long enough to see another company remake it.
Ill update my review if the show is any better once I finish."""

print(f"This is a {IMDB_label_back[predict(ex_text_str, text_pipeline)]} mood")


This is a Sports news
