In [23]:
import torch

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext import data
from torchtext import datasets
from transformers import BertTokenizer, BertModel
from torchtext import data
import torch.nn as nn
import torch.optim as optim

from transformers import BertTokenizer

import pandas as pd
import numpy as np

import jsonlines
import time
import random
import numpy as np

from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [2]:
%load_ext autoreload
%autoreload 2

from seewhence import models
from seewhence import train

In [3]:
# Directories & Paths
OUTPUT_DIR = 'data'

# Load and split Sarcasm Data

In [4]:
# reddit_path = '/home/ben/data/acl_2020/reddit.jsonl'
# twitter_path = '/home/ben/data/acl_2020/twitter.jsonl'
# train_split = 0.8
# %run ./preprocessing.ipynb

# Tokenizer

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [6]:
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

In [7]:
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

In [8]:
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id


In [9]:
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

In [10]:
def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence) 
    tokens = tokens[:max_input_length-2]
    return tokens

# Fields

In [11]:
TEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = tokenize_and_cut,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = init_token_idx,
                  eos_token = eos_token_idx,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)

LABEL = data.LabelField(dtype = torch.float)

# Torch Text Dataset

In [12]:
# Fields
fields = {'combined': ('text', TEXT), 'label': ('label', LABEL)}

In [13]:
train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path = OUTPUT_DIR,
                                        train = 'train.csv',
                                        validation ='valid.csv',
                                        test = 'test.csv',
                                        format = 'csv',
                                        fields = fields
)

In [14]:
LABEL.build_vocab(train_data)

In [15]:
BATCH_SIZE = 32

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    device = device)

## Build the Model

In [16]:
bert = BertModel.from_pretrained('bert-base-uncased')

In [17]:
HIDDEN_DIM = 64
OUTPUT_DIM = 1
N_LAYERS = 1
BIDIRECTIONAL = True
DROPOUT = 0.5
output_path = 'models/bert_'
model = models.BERTGRUSentiment(bert,
                         HIDDEN_DIM,
                         OUTPUT_DIM,
                         N_LAYERS,
                         BIDIRECTIONAL,
                         DROPOUT)

In [18]:
print(f'The model has {train.count_parameters(model):,} trainable parameters')

The model has 109,802,625 trainable parameters


## Train the Model

In [19]:
optimizer = optim.Adam(model.parameters())

In [20]:
criterion = nn.BCEWithLogitsLoss()

In [21]:
model = model.to(device)
criterion = criterion.to(device)

In [27]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss, train_acc = train.train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = train.evaluate(model, valid_iterator, criterion)
        
    end_time = time.time()
        
    epoch_mins, epoch_secs = train.epoch_time(start_time, end_time)
        
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        if output_path:
            torch.save(model.state_dict(), output_path)
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 2m 10s
	Train Loss: 0.654 | Train Acc: 60.41%
	 Val. Loss: 0.588 |  Val. Acc: 66.63%
Epoch: 02 | Epoch Time: 2m 11s
	Train Loss: 0.598 | Train Acc: 66.73%
	 Val. Loss: 0.552 |  Val. Acc: 69.76%
Epoch: 03 | Epoch Time: 2m 11s
	Train Loss: 0.564 | Train Acc: 70.05%
	 Val. Loss: 0.537 |  Val. Acc: 72.67%
Epoch: 04 | Epoch Time: 2m 11s
	Train Loss: 0.528 | Train Acc: 73.15%
	 Val. Loss: 0.528 |  Val. Acc: 72.05%
Epoch: 05 | Epoch Time: 2m 11s
	Train Loss: 0.492 | Train Acc: 75.85%
	 Val. Loss: 0.507 |  Val. Acc: 74.83%


In [28]:
model.load_state_dict(torch.load('tut6-model.pt'))
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

FileNotFoundError: [Errno 2] No such file or directory: 'tut6-model.pt'

## Results

In [None]:
train.predict(model, tokenizer, "Saudi Arabia offers Germany 200 mosques - one ...")

In [None]:
results = test[['combined','label']]
results['pred'] = train.results.combined.apply(lambda x : predict(model, tokenizer, x))