<a href="https://colab.research.google.com/github/adldotori/PapersWithCode/blob/master/1_Convolutional_Neural_Networks_for_Sentence_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Convolutional Neural Networks for Sentence Classification

## Preparing Data

In [None]:
MODEL_VARIATIONS = 'rand' # 'rand', 'static', 'non-static', 'multichannel'
DATASET = 'MR' # 'MR', 'SST-1', 'SST-2', 'Subj', 'TREC', 'CR', 'MPQA'
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1
DROPOUT = 0.5

In [99]:
import pandas as pd
import re

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()

data = {'text':[], 'label':[]}
with open('/content/drive/MyDrive/review_polarity/rt-polarity.pos','rb') as f:
    pos = f.readlines()
    for i in pos:
        data['text'].append(clean_str(i.decode('latin1')))
        data['label'].append(1)

with open('/content/drive/MyDrive/review_polarity/rt-polarity.neg','rb') as f:
    neg = f.readlines()
    for i in neg:
        data['text'].append(clean_str(i.decode('latin1')))
        data['label'].append(0)

df = pd.DataFrame(data=data)
df.to_csv('data.csv', index=None)

In [101]:
import torch
from torchtext import data
from torchtext import datasets
import random
import numpy as np
import spacy

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(sequential=True, tokenize='spacy', lower=True, batch_first = True)
LABEL = data.LabelField(dtype = torch.float)

train_data = data.TabularDataset(path='./data.csv',
                                 format='csv', 
                                 fields=[('text', TEXT),('label', LABEL)],
                                 skip_header=True)

train_data, valid_data, test_data = train_data.split(random_state = random.seed(SEED), split_ratio=[0.8,0.1,0.1])

Build the vocab and load the pre-trained word embeddings.

In [102]:
from torchtext.vocab import Vectors
from gensim.models import KeyedVectors

MAX_VOCAB_SIZE = 25000
# word2vec_model = KeyedVectors.load_word2vec_format('eng_w2v')
# vectors = Vectors(name="eng_w2v")
TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors="glove.6B.100d",
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_data)

As before, we create the iterators.

In [103]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    sort_key = lambda x: len(x.text),
    batch_size = BATCH_SIZE, 
    device = device)

## Build the Model

In [117]:
class CNN1d(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.convs = nn.ModuleList([
                                    nn.Conv1d(in_channels = embedding_dim, 
                                              out_channels = n_filters, 
                                              kernel_size = fs)
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        #text = [batch size, sent len]
        embedded = self.embedding(text)# [batch size, sent len, emb dim]
        embedded = embedded.permute(0, 2, 1) # [batch size, emb dim, sent len]
        conved = [F.relu(conv(embedded)) for conv in self.convs] # [batch size, n_filters, sent len - filter_sizes[n] + 1]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved] # [batch size, n_filters]
        cat = self.dropout(torch.cat(pooled, dim = 1)) # [batch size, n_filters * len(filter_sizes)]  
        return self.fc(cat)

In [118]:
INPUT_DIM = len(TEXT.vocab)
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

In [161]:
model = CNN1d(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

Next, we'll load the pre-trained embeddings

In [162]:
pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)
# model.embedding.weight.requires_grad = False

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.1509,  0.2089,  0.7544,  ..., -0.2359,  0.4850,  0.0263],
        [ 0.5719, -0.7835,  0.4352,  ..., -0.2164, -0.7241,  0.2962],
        [-0.0230,  0.6082, -1.5700,  ...,  0.5421, -0.8635, -0.2364]])

In [163]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,775,301 trainable parameters


Then zero the initial weights of the unknown and padding tokens.

In [164]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = nn.init.uniform_(torch.empty(EMBEDDING_DIM), -0.5, 0.5)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

## Train the Model

Training is the same as before. We initialize the optimizer, loss function (criterion) and place the model and criterion on the GPU (if available)

In [165]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

We implement the function to calculate accuracy...

In [166]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

We define a function for training our model...

**Note**: as we are using dropout again, we must remember to use `model.train()` to ensure the dropout is "turned on" while training.

In [167]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

We define a function for testing our model...

**Note**: again, as we are now using dropout, we must remember to use `model.eval()` to ensure the dropout is "turned off" while evaluating.

In [168]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [169]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [170]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut4-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 1s
	Train Loss: 0.619 | Train Acc: 64.74%
	 Val. Loss: 0.528 |  Val. Acc: 75.30%
Epoch: 02 | Epoch Time: 0m 0s
	Train Loss: 0.463 | Train Acc: 78.21%
	 Val. Loss: 0.472 |  Val. Acc: 77.00%
Epoch: 03 | Epoch Time: 0m 0s
	Train Loss: 0.338 | Train Acc: 85.84%
	 Val. Loss: 0.459 |  Val. Acc: 78.28%
Epoch: 04 | Epoch Time: 0m 0s
	Train Loss: 0.239 | Train Acc: 90.71%
	 Val. Loss: 0.486 |  Val. Acc: 77.92%
Epoch: 05 | Epoch Time: 0m 0s
	Train Loss: 0.156 | Train Acc: 94.68%
	 Val. Loss: 0.587 |  Val. Acc: 76.18%


We get test results comparable to the previous 2 models!

In [171]:
model.load_state_dict(torch.load('tut4-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.492 | Test Acc: 77.08%


## User Input

And again, as a sanity check we can check some input sentences

**Note**: As mentioned in the implementation details, the input sentence has to be at least as long as the largest filter height used. We modify our `predict_sentiment` function to also accept a minimum length argument. If the tokenized input sentence is less than `min_len` tokens, we append padding tokens (`<pad>`) to make it `min_len` tokens.

In [76]:
import spacy
nlp = spacy.load('en')

def predict_sentiment(model, sentence, min_len = 5):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [77]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


An example negative review...

In [78]:
predict_sentiment(model, "This film is terrible")

0.4916306436061859

An example positive review...

In [79]:
predict_sentiment(model, "This film is great")

0.11324454843997955

In [80]:
predict_sentiment(model, "i'm so hungry, aren't you?")

0.6346037983894348

In [81]:
predict_sentiment(model, "I don't understand the people who say this movie is great.")

0.6579892635345459