## Introduction
I use pretrained GloVe Twitter embeddings to build an irony detector for tweets.

Data comes from: https://github.com/Cyvhee/SemEval2018-Task3

In [1]:
# TODO list for this notebook.
# Kept as comments rather than a bare string literal so that the cell does not
# echo the whole list back as its output when it is run.
#
# TODO:
# - Change training data, GloVe embeddings, hyperparameters
# - Download on PC and try to run on GPU
# - Run model until F1 is good, then save
# - Commit #2
# - Can random forests be used to prevent overfitting?
# - Add baseline?
# - Writeup: key skills demonstrated, intro to task, testing results
# - Commit #3
# - Figure out how to add a try-it-out demo
# - Build the try-it-out shell
# - Final commit

'\nTODO:\n6. Can use random forests to prevent overfitting?\n7. Do writeup: introduction, code explanation at each cell, results\n- write util.py descriptions\n- add baseline?\n8. Commit #1\n- change training data, glove embeddings, hyperparameters\n- download on pc and try to run on gpu\n-  Run model until good f1 then save\n- Commit #2\n9. Figure out how to add try out\n10. Build try out shell\n- Final Commit\n'

In [2]:
# Download and extract the GloVe Twitter embeddings.
# Both commands are intentionally left commented out; uncomment them to fetch and
# unpack the archive when the embeddings file is not already in the working directory.

# ! wget https://nlp.stanford.edu/data/glove.twitter.27B.zip
# ! unzip glove.twitter.27B.zip


In [3]:
import util

# --- Configuration ---
# Location of the pretrained GloVe vectors, the vocabulary file, and the
# special tokens used when the vocabulary is built.
embeddings_path = "glove.twitter.27B.25d.txt"
vocab_path = "./vocab.txt"
SPECIAL_TOKENS = ["<UNK>", "<PAD>", "<SOS>", "<EOS>"]

# --- Data ---
# Load the train/test sets together with the label mapping ...
(
    train_sentences,
    train_labels,
    test_sentences,
    test_labels,
    label2i,
) = util.load_datasets()

# ... then carve a dev set out of the training data (split=0.85).
(
    training_sentences,
    training_labels,
    dev_sentences,
    dev_labels,
) = util.split_data(train_sentences, train_labels, split=0.85)

In [4]:
# Set up tokenizer and make vocab
import os

tokenizer = util.Tokenizer()
all_data = train_sentences + test_sentences
tokenized_data = tokenizer.tokenize(all_data)

# The next cell passes `vocab_path` to util.get_oovs, so create the vocabulary file
# whenever it is missing instead of relying on a manually uncommented snippet.
# An existing file is left untouched, so a previously generated vocabulary is reused.
# Assumes tokenized_data is a list of token lists, as the original snippet did.
if not os.path.exists(vocab_path):
    vocab = sorted({w for ws in tokenized_data + [SPECIAL_TOKENS] for w in ws})
    with open(vocab_path, "w") as vf:
        vf.write("\n".join(vocab))

In [5]:
# Build the word index and embedding matrix in three steps:
#   1. read the pretrained GloVe vectors,
#   2. find the out-of-vocabulary words from the vocab file,
#   3. add those words to word2i and to the embeddings.
glove_word2i, glove_embeddings = util.get_glove_embeddings(embeddings_path)
oovs = util.get_oovs(vocab_path, glove_word2i)
word2i, embeddings = util.update_embeddings(
    glove_word2i,
    glove_embeddings,
    oovs,
)

Reading embeddings from glove.twitter.27B.25d.txt...


In [6]:
# Initialize model
import torch

# Seed torch's global RNG before the model is built so that any randomly
# initialised parameters, and later stochastic steps that draw from torch's RNG,
# are repeatable from one "Restart & Run All" to the next.
SEED = 42
torch.manual_seed(SEED)

model = util.IronyDetector(
    input_dim=embeddings.shape[1],  # size of the embeddings' second axis
    hidden_dim=8,
    embeddings_tensor=embeddings,
    pad_idx=word2i["<PAD>"],  # index assigned to the padding token
    output_size=len(label2i),  # number of entries in the label mapping
)

In [7]:
# Hyperparameters
batch_size = 8
epochs = 1
learning_rate = 0.00005
weight_decay = 0
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)

# Create batches: tokenize the training sentences one batch at a time and
# batch the training labels with the same batch size.
batch_tokenized = [
    tokenizer(sentence_batch)
    for sentence_batch in util.make_batches(training_sentences, batch_size)
]
batch_labels = util.make_batches(training_labels, batch_size)

# NOTE: dev_sentences and test_sentences are rebound to the tokenizer's output here,
# so re-running this cell on its own passes tokenizer output back into the tokenizer.
# Re-run the data-loading cell first if this cell needs to be executed again.
dev_sentences = tokenizer(dev_sentences)
test_sentences = tokenizer(test_sentences)

# Encode data (train_labels is rebound from the raw labels to the encoded batches)
train_features = [util.encode_sentences(tokens, word2i) for tokens in batch_tokenized]
train_labels = [util.encode_labels(label_batch) for label_batch in batch_labels]
dev_features = util.encode_sentences(dev_sentences, word2i)
dev_labels = list(map(int, dev_labels))

In [8]:
# Train model
# util.training_loop receives, in order: the epoch count, the batched training
# features and labels, the dev features and labels, the optimizer, the model and
# the label mapping. Its return value is kept as `trained_model`, which the
# test cell below passes to util.predict.
trained_model = util.training_loop(
    epochs,
    train_features,
    train_labels,
    dev_features,
    dev_labels,
    optimizer,
    model,
    label2i,
)

Training...


  0%|          | 0/408 [00:00<?, ?it/s]

epoch 1, loss: 0.6955968020301239
Evaluating dev...
Dev F1 0.6350710900473934
Avg Dev F1 0.3175355450236967


In [9]:
# Test model
# Evaluate the trained model on the test split. `test_sentences` was already
# tokenized in the batching cell, so it only needs encoding here.
test_features = util.encode_sentences(test_sentences, word2i)
test_labels = [int(l) for l in test_labels]
preds = util.predict(trained_model, test_features)

# These are test-set metrics, so they are named test_* (not dev_*) to avoid
# confusing them with the dev scores reported during training.
test_f1 = util.f1_score(preds, test_labels, label2i['1'])
test_avg_f1 = util.avg_f1_score(preds, test_labels, set(label2i.values()))

# NOTE(review): in the recorded run the average F1 is half of the label-'1' F1,
# which is what two labels would give if only label '1' scored above zero —
# worth confirming that the model is not predicting a single class.
print(f"Test F1 {test_f1}")
print(f"Avg Test F1 {test_avg_f1}")

Test F1 0.5680365296803653
Avg Test F1 0.28401826484018267


In [10]:
# Save model
# NOTE(review): save_model() is called with no arguments, so `trained_model` is not
# passed explicitly here — confirm that util.save_model persists the model trained above.
util.save_model()

Saved model at: /Users/alvinchen/Documents/GitHub/semeval_irony_detector/pretrained_model/
