## Lets build a NLP classification model for news articles using the AG News Dataset. This will classifiy if a news article is a World, Sports, Business or Sci/Tech Topic article.

In [48]:
import torch
import torchtext
from torchtext.datasets import text_classification
import os
import pandas as pd

## Get Data from the PyTorch Text Classification [Datasets](https://pytorch.org/text/datasets.html?highlight=textclassification). We are using the AG News dataset for this model

In [2]:
# check directory for data if it doesnt already exist
if not os.path.isdir('./.data'):
    os.mkdir('./.data')
    
#Get train and text dataset to tensor
train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
    root='./.data', ngrams=2, vocab=None)



.data\ag_news_csv.tar.gz: 11.8MB [00:00, 22.6MB/s]
120000lines [00:10, 11480.79lines/s]
120000lines [00:20, 5864.85lines/s]
7600lines [00:01, 6062.22lines/s]


In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:

# check directory for data if it doesnt already exist
if not os.path.isdir('./.data'):
    os.mkdir('./.data')
    
#Get train and text dataset to tensor
yelp_train_dataset, yelp_test_dataset = text_classification.DATASETS['YelpReviewFull'](
    root='./.data', ngrams=2, vocab=None)


.data\yelp_review_full_csv.tar.gz: 196MB [00:06, 30.9MB/s] 
650000lines [03:12, 3377.03lines/s]
650000lines [05:30, 1966.15lines/s]
50000lines [00:20, 2399.17lines/s]


In [49]:
#File path to the csv file
csv_file = "./.data/yelp_review_full_csv/train.csv"

# Read csv file into dataframe
df = pd.read_csv(csv_file, names=["label", "review"])

# Print first 5 rows in the dataframe
df.head()

Unnamed: 0,label,review
0,5,dr. goldberg offers everything i look for in a...
1,2,"Unfortunately, the frustration of being Dr. Go..."
2,4,Been going to Dr. Goldberg for over 10 years. ...
3,4,Got a letter in the mail last week that said D...
4,1,I don't know what Dr. Goldberg was like before...


In [55]:
def addGender(df):
    if df['label'] >= 3:
        return 'F'
    else:
        return 'M'

df['gender'] = df.apply(addGender, axis=1)

In [56]:
df.head()

Unnamed: 0,label,review,gender
0,5,dr. goldberg offers everything i look for in a...,F
1,2,"Unfortunately, the frustration of being Dr. Go...",M
2,4,Been going to Dr. Goldberg for over 10 years. ...,F
3,4,Got a letter in the mail last week that said D...,F
4,1,I don't know what Dr. Goldberg was like before...,M


In [57]:
df['label'].value_counts()

5    130000
4    130000
3    130000
2    130000
1    130000
Name: label, dtype: int64

In [58]:
df['gender'].value_counts()

F    390000
M    260000
Name: gender, dtype: int64

In [8]:
yelp_train_dataset.get_vocab()

<torchtext.vocab.Vocab at 0x20e82f76400>

In [19]:
yelp_train_dataset.get_labels()
#0 - 4 : rating classes (4 is highly recommended)

{0, 1, 2, 3, 4}

##  Construct an Optimizer, an iterable containing the parameters to optimize. Then, you can specify optimizer-specific options such as the learning rate, weight decay, etc.

Optimizers are algorithms or methods used to change the attributes of the neural network such as weights and learning rate to reduce the losses.

Basically the optimizer tells us how bad we are doing after each layer.

In [14]:
import torch.nn as nn
import torch.nn.functional as F
class TextSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)


## Functions used to generate batch

Since the text entries have different lengths, a custom function generate_batch() is used to generate data batches and offsets. The function is passed to collate_fn in torch.utils.data.DataLoader. The input to collate_fn is a list of tensors with the size of batch_size, and the collate_fn function packs them into a mini-batch.

In [15]:
def generate_batch(batch):
    label = torch.tensor([entry[0] for entry in batch])
    text = [entry[1] for entry in batch]
    offsets = [0] + [len(entry) for entry in text]
    # torch.Tensor.cumsum returns the cumulative sum
    # of elements in the dimension dim.
    # torch.Tensor([1.0, 2.0, 3.0]).cumsum(dim=0)

    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)
    return text, offsets, label

## Define functions to train the model and evaluate results.

In [16]:
from torch.utils.data import DataLoader

def train_func(sub_train_):

    # Train the model
    train_loss = 0
    train_acc = 0
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch)
    for i, (text, offsets, cls) in enumerate(data):
        optimizer.zero_grad()
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        output = model(text, offsets)
        loss = criterion(output, cls)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == cls).sum().item()

    # Adjust the learning rate
    scheduler.step()

    return train_loss / len(sub_train_), train_acc / len(sub_train_)

def test(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, offsets, cls in data:
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text, offsets)
            loss = criterion(output, cls)
            loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()

    return loss / len(data_), acc / len(data_)

In [17]:
VOCAB_SIZE = len(yelp_train_dataset.get_vocab())
EMBED_DIM = 32
BATCH_SIZE = 16
NUN_CLASS = len(yelp_train_dataset.get_labels())
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)

## Train the Model

CrossEntropyLoss criterion combines nn.LogSoftmax() and nn.NLLLoss() in a single class. It is useful when training a classification problem with C classes. SGD implements stochastic gradient descent method as optimizer. The initial learning rate is set to 4.0. StepLR is used here to adjust the learning rate through epochs.


Softmax is an activation function of a neural network to normalize the output of a network to a probability distribution over predicted output classes.

In [20]:
import time
from torch.utils.data.dataset import random_split
N_EPOCHS = 10
min_valid_loss = float('inf')

#activation function
criterion = torch.nn.CrossEntropyLoss().to(device)
#Stochastic Gradient descient with optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)

scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

train_len = int(len(yelp_train_dataset) * 0.95)
sub_train_, sub_valid_ = \
    random_split(yelp_train_dataset, [train_len, len(yelp_train_dataset) - train_len])

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(sub_train_)
    valid_loss, valid_acc = test(sub_valid_)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 1 minutes, 30 seconds
	Loss: 0.0527(train)	|	Acc: 63.7%(train)
	Loss: 0.0001(valid)	|	Acc: 60.0%(valid)
Epoch: 2  | time in 1 minutes, 33 seconds
	Loss: 0.0497(train)	|	Acc: 66.0%(train)
	Loss: 0.0000(valid)	|	Acc: 64.1%(valid)
Epoch: 3  | time in 1 minutes, 35 seconds
	Loss: 0.0471(train)	|	Acc: 68.0%(train)
	Loss: 0.0000(valid)	|	Acc: 60.6%(valid)
Epoch: 4  | time in 1 minutes, 31 seconds
	Loss: 0.0448(train)	|	Acc: 69.7%(train)
	Loss: 0.0000(valid)	|	Acc: 64.0%(valid)
Epoch: 5  | time in 1 minutes, 34 seconds
	Loss: 0.0426(train)	|	Acc: 71.5%(train)
	Loss: 0.0000(valid)	|	Acc: 64.7%(valid)
Epoch: 6  | time in 1 minutes, 35 seconds
	Loss: 0.0405(train)	|	Acc: 73.0%(train)
	Loss: 0.0001(valid)	|	Acc: 65.1%(valid)
Epoch: 7  | time in 1 minutes, 44 seconds
	Loss: 0.0384(train)	|	Acc: 74.6%(train)
	Loss: 0.0000(valid)	|	Acc: 60.2%(valid)
Epoch: 8  | time in 1 minutes, 54 seconds
	Loss: 0.0363(train)	|	Acc: 76.3%(train)
	Loss: 0.0001(valid)	|	Acc: 62.8%(valid)
Epoch: 9

# Test with testing dataset

In [33]:
#print('Checking the results of test dataset...')
#test_loss, test_acc = test(yelp_test_dataset)
#print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')

## Test with String text

In [46]:
import re
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer

def predict(text, model, vocab, ngrams):
    tokenizer = get_tokenizer("basic_english")
    with torch.no_grad():
        text = torch.tensor([vocab[token]
                            for token in ngrams_iterator(tokenizer(text), ngrams)])
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item() + 1

ex_text_str = "Best tacos ever!!"
ex_text_str_1 = "Worst tacos ever!!"
ex_text_str_2 = "I would eat here again. Almost as good a taco bell!"

vocab = yelp_train_dataset.get_vocab()
model = model.to("cpu")

print([predict(ex_text_str, model, vocab, 2)])
print([predict(ex_text_str_1, model, vocab, 2)])
print([predict(ex_text_str_2, model, vocab, 2)])

[5]
[1]
[3]


Resources:
This example is from the [PyTorch Beginner Tutorial](https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html)