In [None]:
# Install requirements, then download and unzip embeddings

! pip install -r requirements.txt
! wget https://nlp.stanford.edu/data/glove.twitter.27B.zip
! unzip glove.twitter.27B.zip

In [None]:
import random

import torch

from functions.util import load_datasets, split_data

# Set up paths and constants
embeddings_path = 'glove.twitter.27B.200d.txt'
vocab_path = "./vocab.txt"
SPECIAL_TOKENS = ['<UNK>', '<PAD>', '<SOS>', '<EOS>']

# Seed the random number generators before any data is split or any model is
# built, so repeated runs start from the same random state.
# NOTE(review): only Python's `random` and torch are seeded here; if the helper
# modules draw from another source (e.g. NumPy), seed that as well.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Download and split data
train_sentences, train_labels, test_sentences, test_labels, label2i = load_datasets()
# split=0.85 presumably keeps 85% for training and the rest for dev -- TODO confirm
training_sentences, training_labels, dev_sentences, dev_labels = split_data(train_sentences, train_labels, split=0.85)

In [None]:
from functions.tokenizer import Tokenizer

# Set up tokenizer and make vocab
tokenizer = Tokenizer()
all_data = train_sentences + test_sentences
tokenized_data = tokenizer.tokenize(all_data)

# Every distinct token in the tokenized data plus the special tokens, sorted
vocab = sorted({w for ws in tokenized_data for w in ws} | set(SPECIAL_TOKENS))

# Write one token per line to vocab_path, the path the embedding helpers are
# given, rather than to a second hard-coded file name that could drift from it.
with open(vocab_path, 'w') as vf:
    vf.write('\n'.join(vocab))

In [None]:
from functions.embeddings import read_pretrained_embeddings, get_oovs, update_embeddings

# Step 1: load the pretrained embeddings (both the embeddings file and the
# vocab file are handed to the reader)
glove_word2i, glove_embeddings = read_pretrained_embeddings(
    embeddings_path,
    vocab_path,
)

# Step 2: find the out-of-vocabularies from the vocab file and the GloVe index
oovs = get_oovs(vocab_path, glove_word2i)

# Step 3: add the OOVs to word2i and embeddings
word2i, embeddings = update_embeddings(
    glove_word2i,
    glove_embeddings,
    oovs,
)

In [None]:
from functions.ironymodel import IronyDetector

# Build the model on top of the embeddings produced above
model = IronyDetector(
    embeddings_tensor=embeddings,
    input_dim=embeddings.shape[1],  # size of the embeddings' second dimension
    pad_idx=word2i["<PAD>"],  # index stored in word2i for the '<PAD>' token
    hidden_dim=128,
    output_size=len(label2i),  # number of entries in label2i
)

In [None]:
from functions.processing import make_batches, encode_sentences, encode_labels
import torch

# Hyperparameters
batch_size = 8
epochs = 3
learning_rate = 0.00005
weight_decay = 0
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)

# Tokenize the training sentences one batch at a time
batch_tokenized = [
    tokenizer(sentence_batch)
    for sentence_batch in make_batches(training_sentences, batch_size)
]
batch_labels = make_batches(training_labels, batch_size)

# NOTE: dev_sentences and test_sentences are replaced by their tokenized form
# here, so running this cell a second time would feed tokens back into the
# tokenizer. Re-run the data-loading cell first if this cell must be repeated.
dev_sentences = tokenizer(dev_sentences)
test_sentences = tokenizer(test_sentences)

# Encode features and labels
train_features = [encode_sentences(tokens, word2i) for tokens in batch_tokenized]
train_labels = [encode_labels(label_batch) for label_batch in batch_labels]
dev_features = encode_sentences(dev_sentences, word2i)
dev_labels = [int(label) for label in dev_labels]

In [None]:
from functions.ironymodel import training_loop

# Train model
# All arguments are passed positionally, so their order has to match the
# signature of training_loop (imported from functions.ironymodel, not shown here).
# The value it returns is kept as trained_model.
trained_model = training_loop(
    epochs,
    train_features,  # per-batch output of encode_sentences
    train_labels,  # per-batch output of encode_labels
    dev_features,
    dev_labels,  # list of ints
    optimizer,
    model,
    label2i,
)

In [None]:
from functions.ironymodel import predict
from functions.util import f1_score, avg_f1_score

# Test model
test_features = encode_sentences(test_sentences, word2i)
test_labels = [int(label) for label in test_labels]
preds = predict(trained_model, test_features)

# These scores are computed on the test set, so they are named test_* rather
# than dev_*. label2i['1'] is the index that label '1' maps to.
test_f1 = f1_score(preds, test_labels, label2i['1'])
test_avg_f1 = avg_f1_score(preds, test_labels, set(label2i.values()))
print(f"Test F1 {test_f1}")
print(f"Avg Test F1 {test_avg_f1}")