<a href="https://colab.research.google.com/github/binxuankong/basic-tweet-bot/blob/master/OffensEvalLSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# http://pytorch.org/
# Colab bootstrap: work out the wheel tag for this interpreter and the CUDA
# version on the machine, then install the matching torch 0.4.1 wheel.
from os.path import exists
# NOTE(review): newer `wheel` releases may no longer ship `pep425tags` -- verify before re-running.
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
# Interpreter/ABI tag used in the wheel file name below.
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
# Shell pipeline: find libcudart in the linker cache and rewrite its version suffix as "cuXY".
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
# Fall back to the CPU wheel when no NVIDIA device node is present.
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

# The wheel URL is assembled from the two values computed above; torch is pinned to 0.4.1.
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision

In [3]:
# Mount Google Drive so the dataset paths under /content/drive used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [25]:
import pandas as pd
import copy
import re
import numpy as np
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import codecs
import random
import csv

from tqdm import tqdm 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from torch.autograd import Variable
from sklearn.metrics import f1_score

# NLTK resources for the preprocessing helpers below:
# tokenizer models, the English stop-word list, and WordNet for lemmatization.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Path of the training TSV (despite the name, this is a file, not a directory).
data_directory = "/content/drive/My Drive/OffensEval/data/startkit/training-v1/offenseval-training-v1.tsv"
print("Reading offenseval-training-v1.tsv...")
training_data = pd.read_csv(data_directory, sep='\t', header=0)
# Single-column frames: the raw tweets and the sub-task A label of every row.
tweets = training_data[["tweet"]]
task_a_labels = training_data[["subtask_a"]]
# Sub-task B labels exist only for rows labelled 'OFF' in A; sub-task C only for 'TIN' rows in B.
task_b_labels = training_data.query("subtask_a == 'OFF'")[["subtask_b"]]
task_c_labels = training_data.query("subtask_b == 'TIN'")[["subtask_c"]]
# Independent copy that the preprocessing cell fills in, leaving `tweets` untouched.
clean_tweets = copy.deepcopy(tweets)

Reading offenseval-training-v1.tsv...


In [0]:
def clean_data(tweet):
  """Strip placeholder tokens and contraction suffixes, then blank out non-letters.

  Every occurrence of the fragments below is removed (plain substring removal,
  so a fragment inside a longer word is removed too). Afterwards each character
  that is not an ASCII letter is replaced by a single space.
  """
  cleaned = tweet
  for fragment in ('@USER', 'URL', "'s", "'ve", "n't", "'re", "'ll", "'d"):
    cleaned = cleaned.replace(fragment, '')
  letters_only = re.sub(r'[^a-zA-Z]', ' ', cleaned)
  return letters_only

def tokenize(tweet):
  """Lower-case the tweet and split it with NLTK's `word_tokenize`."""
  lowered = tweet.lower()
  return word_tokenize(lowered)

def remove_stop_words(tweets):
  """Filter one tweet's token list.

  Despite the parameter name, `tweets` is iterated token by token. A token is
  kept only if it is not an English stop word, is not made solely of spaces,
  and is longer than one character.
  """
  stop_words = set(stopwords.words('english'))
  return [
    token for token in tweets
    if token not in stop_words
    and token.replace(' ', '') != ''
    and len(token) > 1
  ]

def lemmatize_and_stem(tweets):
  """Lemmatize then Porter-stem each token of one tweet.

  Despite the parameter name, `tweets` is iterated token by token. Results of
  length one or less are dropped.
  """
  lemmatizer, stemmer = WordNetLemmatizer(), PorterStemmer()
  stems = (stemmer.stem(lemmatizer.lemmatize(token)) for token in tweets)
  return [stem for stem in stems if len(stem) > 1]

def word_to_index(tweets):
  """Build a token -> integer index mapping over all tokenized tweets.

  Indices are assigned in order of first occurrence, starting at 0.

  Args:
    tweets: iterable of token lists.

  Returns:
    dict mapping each distinct token to its index.

  NOTE(review): index 0 is a real token here, while `tweet_to_tensor` also
  pads with 0, so padding and the first vocabulary word are indistinguishable
  to the model. Fixing that needs a coordinated change to the vocabulary size
  passed to the embedding layers.
  """
  word2index = {}
  for tweet in tweets:
    for token in tweet:
      # A dict gives O(1) membership checks (the previous list scan was
      # quadratic). setdefault assigns only on first sight, so numbering
      # still follows first-occurrence order.
      word2index.setdefault(token, len(word2index))
  return word2index

def label_to_index(labels):
  """Map each distinct label to an integer, numbered by first occurrence from 0."""
  label2index = {}
  for label in labels:
    if label in label2index:
      continue
    label2index[label] = len(label2index)
  return label2index

def tweet_to_tensor(tweets, word2index, max_len):
  """Encode tokenized tweets as a zero-padded LongTensor of word indices.

  Args:
    tweets: iterable of token lists.
    word2index: token -> index mapping; tokens missing from it are skipped.
    max_len: number of columns of the result.

  Returns:
    LongTensor of shape (len(tweets), max_len). Each row holds the indices of
    the known tokens, left-aligned and right-padded with 0. Tweets with more
    than `max_len` known tokens are truncated; previously they raised a shape
    mismatch, which could happen for test tweets longer than any training tweet.
  """
  max_len = int(max_len)  # callers pass a numpy integer
  tweet_tensor = torch.zeros((len(tweets), max_len), dtype=torch.long)
  for row, tweet in enumerate(tweets):
    indices = [word2index[token] for token in tweet if token in word2index][:max_len]
    if indices:
      tweet_tensor[row, :len(indices)] = torch.tensor(indices, dtype=torch.long)
  return tweet_tensor

def get_tensors_by_label(tensors, labels, keyword, max_len):
  """Collect the rows of `tensors` whose label equals `keyword`.

  Args:
    tensors: 2-D tensor with one row per entry of `labels`.
    labels: list of labels (its `count` method is used).
    keyword: label value to select.
    max_len: number of columns of the result.

  Returns:
    LongTensor of shape (labels.count(keyword), max_len) holding the selected
    rows in their original order, or None (after printing a message) when the
    number of rows and labels differ.
  """
  if len(labels) != tensors.shape[0]:
    print("Unmatching sizes")
    return
  selected = Variable(torch.zeros((labels.count(keyword), max_len))).long()
  row = 0
  for position, label in enumerate(labels):
    if label != keyword:
      continue
    selected[row] = tensors[position]
    row += 1
  return selected

def label_to_tensor(labels, label2index):
  """Encode labels as a FloatTensor of their indices.

  Labels missing from `label2index` are silently skipped, so the result can be
  shorter than `labels`.
  """
  known = (label for label in labels if label in label2index)
  return torch.FloatTensor([label2index[label] for label in known])

In [7]:
# Preprocessing pipeline over the training tweets. `tqdm.pandas` is re-registered
# before every step only to change the progress-bar caption.
tqdm.pandas(desc="Cleaning Data...")
clean_tweets['tweet'] = tweets['tweet'].progress_apply(clean_data)
tqdm.pandas(desc="Tokenizing Data...")
clean_tweets['tokens'] = clean_tweets['tweet'].progress_apply(tokenize)
tqdm.pandas(desc="Removing Stop Words...")
# From here on the 'tokens' column is overwritten in place by each step.
clean_tweets['tokens'] = clean_tweets['tokens'].progress_apply(remove_stop_words)
tqdm.pandas(desc="Lemmatizing And Stemming...")
clean_tweets['tokens'] = clean_tweets['tokens'].progress_apply(lemmatize_and_stem)

Cleaning Data...: 100%|██████████| 13240/13240 [00:00<00:00, 82996.81it/s]
Tokenizing Data...: 100%|██████████| 13240/13240 [00:02<00:00, 5458.37it/s]
Removing Stop Words...: 100%|██████████| 13240/13240 [00:02<00:00, 6207.36it/s]
Lemmatizing And Stemming...: 100%|██████████| 13240/13240 [00:07<00:00, 1854.08it/s]


In [8]:
# Turn the preprocessed training data into index tensors for the three sub-tasks.
tokenized_tweets = clean_tweets['tokens'].tolist()

# Longest token list in the training set; used as the padded width everywhere below.
tweet_lengths = [len(tweet) for tweet in tokenized_tweets]
max_length = np.max(np.array(tweet_lengths))

word2index = word_to_index(tokenized_tweets)

labels_a = task_a_labels['subtask_a'].values.tolist()
labels_b = task_b_labels['subtask_b'].values.tolist()
labels_c = task_c_labels['subtask_c'].values.tolist()

# Label -> index maps; numbering depends on the order labels first appear in the data.
a2index = label_to_index(labels_a)
b2index = label_to_index(labels_b)
c2index = label_to_index(labels_c)

# Task B keeps only the rows labelled "OFF" in task A; task C keeps only the
# rows labelled "TIN" in task B, mirroring how the label frames were filtered.
tweets_a_tensor = tweet_to_tensor(tokenized_tweets, word2index, max_length)
tweets_b_tensor = get_tensors_by_label(tweets_a_tensor, labels_a, "OFF", max_length)
tweets_c_tensor = get_tensors_by_label(tweets_b_tensor, labels_b, "TIN", max_length)

labels_a_tensor = label_to_tensor(labels_a, a2index)
labels_b_tensor = label_to_tensor(labels_b, b2index)
labels_c_tensor = label_to_tensor(labels_c, c2index)

print("Task A tensor size:")
print(tweets_a_tensor.shape)
print("Task B tensor size:")
print(tweets_b_tensor.shape)
print("Task C tensor size:")
print(tweets_c_tensor.shape)

Task A tensor size:
torch.Size([13240, 39])
Task B tensor size:
torch.Size([4400, 39])
Task C tensor size:
torch.Size([3876, 39])


In [0]:
def train_valid_split(dataset, labels, validation_split):
  """Split data and labels into train/validation parts without shuffling.

  The first floor(validation_split * N) rows become the validation set and the
  remaining rows the training set.

  Returns:
    (train_data, valid_data, train_labels, valid_labels)
  """
  split = int(np.floor(validation_split * dataset.shape[0]))
  valid_data, train_data = dataset[:split], dataset[split:]
  valid_labels, train_labels = labels[:split], labels[split:]
  return train_data, valid_data, train_labels, valid_labels

def check_accuracy(output, target, num_class=2):
  """Fraction of predictions that match `target`.

  Args:
    output: raw model outputs. For num_class == 2 a 1-D tensor of logits
      (thresholded through sigmoid + round); otherwise a (batch, classes)
      tensor whose arg-max over dim 1 is the predicted class.
    target: tensor of true class indices, float or long.
    num_class: 2 selects the binary path, anything else the multi-class path.

  Returns:
    0-dim float tensor holding the accuracy.
  """
  if num_class == 2:
    predicted = torch.round(torch.sigmoid(output))
  else:
    # The arg-max of the logits equals the arg-max of their log-softmax, so the
    # softmax (previously called without `dim`, which is deprecated) is not
    # needed. Staying in torch also avoids comparing an ndarray with a tensor.
    predicted = output.detach().argmax(dim=1)
  # Cast the target so float labels can be compared with long predictions.
  correct = (predicted == target.to(predicted.dtype)).float()
  acc = correct.sum()/len(correct)
  return acc

def check_f1_score(output, target, num_class=2):
  """Macro-averaged F1 score of the model outputs against `target`.

  Args:
    output: raw model outputs. For num_class == 2 a 1-D tensor of logits;
      otherwise a (batch, classes) tensor.
    target: true class indices (tensor or array-like).
    num_class: 2 selects the binary path, anything else the multi-class path.

  Returns:
    The value returned by sklearn's `f1_score(..., average='macro')`.
  """
  if num_class == 2:
    predicted = torch.round(torch.sigmoid(output))
  else:
    # The arg-max of the logits equals the arg-max of their log-softmax; this
    # avoids the deprecated implicit-dim `F.log_softmax` call.
    predicted = output.argmax(dim=1)
  # Hand sklearn plain numpy arrays on both sides rather than a tensor/ndarray mix.
  predicted = predicted.detach().cpu().numpy()
  if torch.is_tensor(target):
    target = target.detach().cpu().numpy()
  score = f1_score(target, predicted, average='macro')
  return score

In [0]:
class CNN(nn.Module):
  """Single-layer convolutional text classifier.

  Pipeline: embedding -> one Conv2d spanning `window_size` tokens and the full
  embedding width -> ReLU -> max over time -> dropout -> linear output layer.
  """

  def __init__(self, vocab_size, embedding_dim, out_channels, window_size, output_dim, dropout):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    # One input channel (the text); each filter covers window_size x embedding_dim.
    self.conv = nn.Conv2d(1, out_channels, (window_size, embedding_dim))
    self.dropout = nn.Dropout(dropout)
    self.fc = nn.Linear(out_channels, output_dim)

  def forward(self, x):
    # x: (batch, max sent length) of word indices
    # -> (batch, 1, max sent length, embedding dim) after adding the channel axis
    tokens = self.embedding(x).unsqueeze(1)
    # conv output has a trailing width of 1, which is squeezed away:
    # (batch, n filters, max sent length - window size + 1)
    activations = F.relu(self.conv(tokens).squeeze(3))
    # max-pool over the whole remaining length -> (batch, n filters)
    pooled = F.max_pool1d(activations, activations.shape[2]).squeeze(2)
    return self.fc(self.dropout(pooled))

In [0]:
class LSTMClassifier(nn.Module):
  """Single-layer LSTM text classifier.

  Pipeline: embedding -> LSTM (batch_first) -> dropout on the final hidden
  state -> linear output layer producing `output_dim` logits.

  Args:
    vocab_size: number of rows of the embedding table.
    embedding_dim: embedding width.
    hidden_dim: LSTM hidden size.
    label_size: output size of `hidden2label`, a layer `forward` never uses.
    output_dim: number of logits returned by `forward`.
    dropout: dropout probability applied to the final hidden state.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_dim, label_size, output_dim, dropout):
    super(LSTMClassifier, self).__init__()
    self.hidden_dim = hidden_dim
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
    # Not used by forward(); kept so parameter names and initialisation order
    # stay the same as before.
    self.hidden2label = nn.Linear(hidden_dim, label_size)
    self.dropout_layer = nn.Dropout(dropout)
    self.fc = nn.Linear(hidden_dim, output_dim)

  def init_hidden(self, batch_size):
    """Return zero (h0, c0) states of shape (1, batch_size, hidden_dim).

    The states are allocated from the LSTM's own weights so they follow the
    module's device and dtype. Previously they were always CPU float32, which
    fails once the model is moved to a GPU or cast to another dtype.
    """
    h0 = self.lstm.weight_hh_l0.new_zeros(1, batch_size, self.hidden_dim)
    return (h0, h0.clone())

  def forward(self, x):
    # x: (batch, seq) of word indices
    self.hidden = self.init_hidden(x.shape[0])
    embedded = self.embedding(x)
    _, (ht, _) = self.lstm(embedded, self.hidden)
    # ht[-1] is the hidden state after the last time step, padding included.
    output = self.dropout_layer(ht[-1])
    preds = self.fc(output)
    return preds

In [0]:
def train_part(model, dataset, labels, optimizer, epochs=1, num_class=2, validation_split=0.2):
  """Train `model` with full-batch updates and report metrics every epoch.

  Each epoch performs exactly one optimizer step on the whole training split,
  then evaluates on the validation split and prints one summary line.

  Args:
    model: module mapping the feature tensor to logits.
    dataset, labels: full feature and label tensors, split by `train_valid_split`.
    optimizer: optimizer already bound to the model's parameters.
    epochs: number of update steps.
    num_class: 2 uses BCEWithLogitsLoss; otherwise CrossEntropyLoss with long targets.
    validation_split: fraction of leading rows held out for validation.

  Returns:
    Validation accuracy of the last epoch.
  """
  feature_train, feature_valid, target_train, target_valid = train_valid_split(dataset, labels, validation_split)
  binary = num_class == 2
  loss_fn = nn.BCEWithLogitsLoss() if binary else nn.CrossEntropyLoss()
  if not binary:
    # CrossEntropyLoss expects integer class indices.
    target_train, target_valid = target_train.long(), target_valid.long()

  for e in range(1, epochs+1):
    # one full-batch training step
    model.train()
    optimizer.zero_grad()
    train_logits = model(feature_train).squeeze(1)
    train_loss = loss_fn(train_logits, target_train)
    epoch_acc = check_accuracy(train_logits, target_train, num_class)
    train_loss.backward()
    optimizer.step()
    epoch_loss = train_loss.item()

    # validation pass, no gradients
    model.eval()
    with torch.no_grad():
      valid_logits = model(feature_valid).squeeze(1)
      valid_loss = loss_fn(valid_logits, target_valid).item()
      valid_acc = check_accuracy(valid_logits, target_valid, num_class)

    print(f'| Epoch: {e:02} | Train Loss: {epoch_loss:.3f} | Train Acc: {epoch_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

  return valid_acc

In [13]:
# Bayesian-optimisation libraries used for the hyper-parameter search below.
# No versions are pinned here.
!pip install GPy GPyOpt

import GPy
import GPyOpt
from GPyOpt.methods import BayesianOptimization

Collecting GPy
[?25l  Downloading https://files.pythonhosted.org/packages/98/7d/e55ffc3b16b68e8b50ccecacec56715bcf49d5c2f204f5ba60374d419611/GPy-1.9.6.tar.gz (873kB)
[K    100% |████████████████████████████████| 880kB 21.5MB/s 
[?25hCollecting GPyOpt
[?25l  Downloading https://files.pythonhosted.org/packages/9c/40/ca8f080d74d9f4e29069faa944fcfb083e8693b6daaba0f1e4bc65c88650/GPyOpt-1.2.5.tar.gz (55kB)
[K    100% |████████████████████████████████| 61kB 20.7MB/s 
Collecting paramz>=0.9.0 (from GPy)
[?25l  Downloading https://files.pythonhosted.org/packages/fd/78/b0f0164a32518bfd3b98cb2e149b7a4d5504d13fb503b31a6c59b958ed18/paramz-0.9.4.tar.gz (70kB)
[K    100% |████████████████████████████████| 71kB 23.1MB/s 
Building wheels for collected packages: GPy, GPyOpt, paramz
  Building wheel for GPy (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/97/82/1d/32a361e1ff2b4d9129a60343831dd99cdc74440e2db1c55264
  Building wheel for GPyOpt (setup.py) ... [?25ldone


In [0]:
# Use GPyOpt for hyper-parameter search
domain = [{'name': 'embedding_dim', 'type': 'discrete', 'domain': (100, 120, 140, 160, 180, 200)},
          {'name': 'n_out_channels', 'type': 'discrete', 'domain': (100, 120, 140, 160, 180, 200)},
          {'name': 'hidden_dim', 'type': 'discrete', 'domain': (32, 64, 96, 128)},
          {'name': 'drop_out', 'type': 'continuous', 'domain': (0.2, 0.5)},
          {'name': 'lr', 'type': 'continuous', 'domain': (0.0001, 0.01)},
          {'name': 'momentum', 'type': 'continuous', 'domain': (0.5, 0.9)}
         ]

In [15]:
# Task A: one output logit (binary classification with BCEWithLogitsLoss).
INPUT_DIM = len(word2index)
OUTPUT_DIM = 1

# window size
# (only used by the commented-out CNN variant)
WINDOW_SIZE = 1

def taskA_tuning(params):
  """Objective for the task A search: train a fresh LSTM for 10 epochs and
  return its final validation accuracy.

  `params[0]` is read as one candidate whose positions follow `domain`.
  """
  param = params[0]
  # CNN(vocab_size, embedding_dim, out_channels, window_size, output_dim, dropout)
  # model = CNN(INPUT_DIM, int(param[0]), int(param[1]), WINDOW_SIZE, OUTPUT_DIM, param[3])
  # LSTMClassifier(vocab_size, embedding_dim, hidden_dim, label_size, output_dim, dropout)
  model = LSTMClassifier(INPUT_DIM, int(param[0]), int(param[2]), 2, OUTPUT_DIM, param[3])
  optimizer = optim.SGD(model.parameters(), lr=param[4], momentum=param[5])
  acc = train_part(model, tweets_a_tensor, labels_a_tensor, optimizer, epochs=10)
  return acc

# maximize=True: the objective is an accuracy, so larger is better.
taskA_opt = BayesianOptimization(f=taskA_tuning,
                                 domain=domain,
                                 model_type='GP',
                                 acquisition_type='EI',
                                 acquisition_jitter=0.05,
                                 maximize=True)

taskA_opt.run_optimization(max_iter=20)

| Epoch: 01 | Train Loss: 0.728 | Train Acc: 33.23% | Val. Loss: 0.724 | Val. Acc: 33.57% |
| Epoch: 02 | Train Loss: 0.726 | Train Acc: 33.36% | Val. Loss: 0.720 | Val. Acc: 33.57% |
| Epoch: 03 | Train Loss: 0.720 | Train Acc: 33.87% | Val. Loss: 0.714 | Val. Acc: 33.57% |
| Epoch: 04 | Train Loss: 0.715 | Train Acc: 34.87% | Val. Loss: 0.707 | Val. Acc: 33.57% |
| Epoch: 05 | Train Loss: 0.708 | Train Acc: 36.93% | Val. Loss: 0.700 | Val. Acc: 33.57% |
| Epoch: 06 | Train Loss: 0.701 | Train Acc: 42.10% | Val. Loss: 0.694 | Val. Acc: 33.72% |
| Epoch: 07 | Train Loss: 0.694 | Train Acc: 50.16% | Val. Loss: 0.687 | Val. Acc: 66.43% |
| Epoch: 08 | Train Loss: 0.687 | Train Acc: 57.23% | Val. Loss: 0.681 | Val. Acc: 66.43% |
| Epoch: 09 | Train Loss: 0.681 | Train Acc: 62.74% | Val. Loss: 0.676 | Val. Acc: 66.43% |
| Epoch: 10 | Train Loss: 0.676 | Train Acc: 65.00% | Val. Loss: 0.671 | Val. Acc: 66.43% |
| Epoch: 01 | Train Loss: 0.673 | Train Acc: 64.95% | Val. Loss: 0.670 | Val. Ac

In [16]:
# Report the hyper-parameters that the next cell uses to build the task A model.
# NOTE(review): `X[-1]` appears to be the last point the optimiser evaluated,
# not necessarily the best one (GPyOpt exposes that as `x_opt`) -- confirm, and
# change this cell together with the model-building cell.
print("The hyperparameters used for the model in task A are")
print(f'embedding_dim: {int(taskA_opt.X[-1][0])}')
print(f'hidden_dim: {int(taskA_opt.X[-1][2])}')
print(f'drop_out: {taskA_opt.X[-1][3]:.2f}')
print(f'learning_rate: {taskA_opt.X[-1][4]:.4f}')
# Column 5 of the search domain is the SGD momentum; it was previously
# printed under the wrong name "weight_decay".
print(f'momentum: {taskA_opt.X[-1][5]:.4f}')

The hyperparameters used for the model in task A are
embedding_dim: 100
hidden_dim: 32
drop_out: 0.20
learning_rate: 0.0001
weight_decay: 0.9000


In [17]:
# CNN(vocab_size, embedding_dim, out_channels, window_size, output_dim, dropout)
# test_model_A = CNN(INPUT_DIM, int(taskA_opt.X[-1][0]), int(taskA_opt.X[-1][1]), WINDOW_SIZE, OUTPUT_DIM, taskA_opt.X[-1][3])
# LSTMClassifier(vocab_size, embedding_dim, hidden_dim, label_size, output_dim, dropout)
# The dropout rate must stay a float: wrapping it in int() truncated every
# value of the 0.2-0.5 search range to 0 and silently disabled dropout.
test_model_A = LSTMClassifier(INPUT_DIM, int(taskA_opt.X[-1][0]), int(taskA_opt.X[-1][2]), 2, OUTPUT_DIM, taskA_opt.X[-1][3])
optimizer_A = optim.SGD(test_model_A.parameters(), lr=taskA_opt.X[-1][4], momentum=taskA_opt.X[-1][5])
train_part(test_model_A, tweets_a_tensor, labels_a_tensor, optimizer_A, epochs=50)

| Epoch: 01 | Train Loss: 0.667 | Train Acc: 66.84% | Val. Loss: 0.668 | Val. Acc: 66.43% |
| Epoch: 02 | Train Loss: 0.667 | Train Acc: 66.84% | Val. Loss: 0.668 | Val. Acc: 66.43% |
| Epoch: 03 | Train Loss: 0.667 | Train Acc: 66.84% | Val. Loss: 0.668 | Val. Acc: 66.43% |
| Epoch: 04 | Train Loss: 0.667 | Train Acc: 66.84% | Val. Loss: 0.668 | Val. Acc: 66.43% |
| Epoch: 05 | Train Loss: 0.667 | Train Acc: 66.84% | Val. Loss: 0.667 | Val. Acc: 66.43% |
| Epoch: 06 | Train Loss: 0.667 | Train Acc: 66.84% | Val. Loss: 0.667 | Val. Acc: 66.43% |
| Epoch: 07 | Train Loss: 0.667 | Train Acc: 66.84% | Val. Loss: 0.667 | Val. Acc: 66.43% |
| Epoch: 08 | Train Loss: 0.667 | Train Acc: 66.84% | Val. Loss: 0.667 | Val. Acc: 66.43% |
| Epoch: 09 | Train Loss: 0.667 | Train Acc: 66.84% | Val. Loss: 0.667 | Val. Acc: 66.43% |
| Epoch: 10 | Train Loss: 0.667 | Train Acc: 66.84% | Val. Loss: 0.667 | Val. Acc: 66.43% |
| Epoch: 11 | Train Loss: 0.666 | Train Acc: 66.84% | Val. Loss: 0.667 | Val. Ac

tensor(0.6643)

In [34]:
# Rebuild the validation slice with the same 0.2 fraction that train_part uses
# by default, then score the trained task A model on it.
_, feature_valid, _, target_valid = train_valid_split(tweets_a_tensor, labels_a_tensor, 0.2)

test_model_A.eval()

with torch.no_grad():
  predictions = test_model_A(feature_valid).squeeze(1)
  acc = check_accuracy(predictions, target_valid)
  score = check_f1_score(predictions, target_valid)

print(f'Validation Accuracy: {acc:.4f}')
print(f'Validation F1 Score: {score:.4f}')

Validation Accuracy: 0.6643
Validation F1 Score: 0.3991


In [18]:
# Task B: one output logit (binary classification with BCEWithLogitsLoss).
INPUT_DIM = len(word2index)
OUTPUT_DIM = 1

# window size
# (only used by the commented-out CNN variant)
WINDOW_SIZE = 1

def taskB_tuning(params):
  """Objective for the task B search: train a fresh LSTM for 10 epochs and
  return its final validation accuracy.

  `params[0]` is read as one candidate whose positions follow `domain`.
  """
  param = params[0]
  # CNN(vocab_size, embedding_dim, out_channels, window_size, output_dim, dropout)
  # model = CNN(INPUT_DIM, int(param[0]), int(param[1]), WINDOW_SIZE, OUTPUT_DIM, param[3])
  # LSTMClassifier(vocab_size, embedding_dim, hidden_dim, label_size, output_dim, dropout)
  model = LSTMClassifier(INPUT_DIM, int(param[0]), int(param[2]), 2, OUTPUT_DIM, param[3])
  optimizer = optim.SGD(model.parameters(), lr=param[4], momentum=param[5])
  acc = train_part(model, tweets_b_tensor, labels_b_tensor, optimizer, epochs=10)
  return acc

# maximize=True: the objective is an accuracy, so larger is better.
taskB_opt = BayesianOptimization(f=taskB_tuning,
                                 domain=domain,
                                 model_type='GP',
                                 acquisition_type='EI',
                                 acquisition_jitter=0.05,
                                 maximize=True)

taskB_opt.run_optimization(max_iter=20)

| Epoch: 01 | Train Loss: 0.616 | Train Acc: 86.28% | Val. Loss: 0.600 | Val. Acc: 89.43% |
| Epoch: 02 | Train Loss: 0.606 | Train Acc: 87.16% | Val. Loss: 0.585 | Val. Acc: 89.43% |
| Epoch: 03 | Train Loss: 0.590 | Train Acc: 87.67% | Val. Loss: 0.568 | Val. Acc: 89.43% |
| Epoch: 04 | Train Loss: 0.576 | Train Acc: 87.61% | Val. Loss: 0.552 | Val. Acc: 89.43% |
| Epoch: 05 | Train Loss: 0.560 | Train Acc: 87.70% | Val. Loss: 0.536 | Val. Acc: 89.43% |
| Epoch: 06 | Train Loss: 0.546 | Train Acc: 87.73% | Val. Loss: 0.522 | Val. Acc: 89.43% |
| Epoch: 07 | Train Loss: 0.533 | Train Acc: 87.76% | Val. Loss: 0.510 | Val. Acc: 89.43% |
| Epoch: 08 | Train Loss: 0.520 | Train Acc: 87.76% | Val. Loss: 0.498 | Val. Acc: 89.43% |
| Epoch: 09 | Train Loss: 0.511 | Train Acc: 87.76% | Val. Loss: 0.488 | Val. Acc: 89.43% |
| Epoch: 10 | Train Loss: 0.500 | Train Acc: 87.76% | Val. Loss: 0.479 | Val. Acc: 89.43% |
| Epoch: 01 | Train Loss: 0.647 | Train Acc: 76.68% | Val. Loss: 0.639 | Val. Ac

In [19]:
# Report the hyper-parameters that the next cell uses to build the task B model.
# NOTE(review): `X[-1]` appears to be the last point the optimiser evaluated,
# not necessarily the best one (GPyOpt exposes that as `x_opt`) -- confirm, and
# change this cell together with the model-building cell.
print("The hyperparameters used for the model in task B are")
print(f'embedding_dim: {int(taskB_opt.X[-1][0])}')
print(f'hidden_dim: {int(taskB_opt.X[-1][2])}')
print(f'drop_out: {taskB_opt.X[-1][3]:.2f}')
print(f'learning_rate: {taskB_opt.X[-1][4]:.4f}')
# Column 5 of the search domain is the SGD momentum; it was previously
# printed under the wrong name "weight_decay".
print(f'momentum: {taskB_opt.X[-1][5]:.4f}')

The hyperparameters used for the model in task B are
embedding_dim: 100
hidden_dim: 64
drop_out: 0.34
learning_rate: 0.0100
weight_decay: 0.8571


In [20]:
# Build the task B model from the last row of the optimiser's evaluation
# history and train it for 50 epochs.
# CNN(vocab_size, embedding_dim, out_channels, window_size, output_dim, dropout)
# test_model_B = CNN(INPUT_DIM, int(taskB_opt.X[-1][0]), int(taskB_opt.X[-1][1]), WINDOW_SIZE, OUTPUT_DIM, taskB_opt.X[-1][3])
# LSTMClassifier(vocab_size, embedding_dim, hidden_dim, label_size, output_dim, dropout)
test_model_B = LSTMClassifier(INPUT_DIM, int(taskB_opt.X[-1][0]), int(taskB_opt.X[-1][2]), 2, OUTPUT_DIM, taskB_opt.X[-1][3])
optimizer_B = optim.SGD(test_model_B.parameters(), lr=taskB_opt.X[-1][4], momentum=taskB_opt.X[-1][5])
train_part(test_model_B, tweets_b_tensor, labels_b_tensor, optimizer_B, epochs=50)

| Epoch: 01 | Train Loss: 0.725 | Train Acc: 25.23% | Val. Loss: 0.703 | Val. Acc: 10.57% |
| Epoch: 02 | Train Loss: 0.705 | Train Acc: 39.43% | Val. Loss: 0.666 | Val. Acc: 89.43% |
| Epoch: 03 | Train Loss: 0.667 | Train Acc: 74.20% | Val. Loss: 0.620 | Val. Acc: 89.43% |
| Epoch: 04 | Train Loss: 0.623 | Train Acc: 87.33% | Val. Loss: 0.571 | Val. Acc: 89.43% |
| Epoch: 05 | Train Loss: 0.578 | Train Acc: 87.76% | Val. Loss: 0.525 | Val. Acc: 89.43% |
| Epoch: 06 | Train Loss: 0.533 | Train Acc: 87.76% | Val. Loss: 0.484 | Val. Acc: 89.43% |
| Epoch: 07 | Train Loss: 0.496 | Train Acc: 87.76% | Val. Loss: 0.450 | Val. Acc: 89.43% |
| Epoch: 08 | Train Loss: 0.465 | Train Acc: 87.76% | Val. Loss: 0.424 | Val. Acc: 89.43% |
| Epoch: 09 | Train Loss: 0.441 | Train Acc: 87.76% | Val. Loss: 0.403 | Val. Acc: 89.43% |
| Epoch: 10 | Train Loss: 0.423 | Train Acc: 87.76% | Val. Loss: 0.387 | Val. Acc: 89.43% |
| Epoch: 11 | Train Loss: 0.409 | Train Acc: 87.76% | Val. Loss: 0.375 | Val. Ac

tensor(0.8943)

In [35]:
# Rebuild the validation slice with the same 0.2 fraction that train_part uses
# by default, then score the trained task B model on it.
_, feature_valid, _, target_valid = train_valid_split(tweets_b_tensor, labels_b_tensor, 0.2)

test_model_B.eval()

with torch.no_grad():
  predictions = test_model_B(feature_valid).squeeze(1)
  acc = check_accuracy(predictions, target_valid)
  score = check_f1_score(predictions, target_valid)

print(f'Validation Accuracy: {acc:.4f}')
print(f'Validation F1 Score: {score:.4f}')

Validation Accuracy: 0.8943
Validation F1 Score: 0.4721


In [21]:
# Task C: three output logits (multi-class, trained with CrossEntropyLoss).
INPUT_DIM = len(word2index)
OUTPUT_DIM = 3

# window size
# (only used by the commented-out CNN variant)
WINDOW_SIZE = 1

def taskC_tuning(params):
  """Objective for the task C search: train a fresh LSTM for 10 epochs and
  return its final validation accuracy.

  `params[0]` is read as one candidate whose positions follow `domain`.
  """
  param = params[0]
  # CNN(vocab_size, embedding_dim, out_channels, window_size, output_dim, dropout)
  # model = CNN(INPUT_DIM, int(param[0]), int(param[1]), WINDOW_SIZE, OUTPUT_DIM, param[3])
  # LSTMClassifier(vocab_size, embedding_dim, hidden_dim, label_size, output_dim, dropout)
  # label_size stays 2 here; it only sizes a layer that forward() never calls.
  model = LSTMClassifier(INPUT_DIM, int(param[0]), int(param[2]), 2, OUTPUT_DIM, param[3])
  optimizer = optim.SGD(model.parameters(), lr=param[4], momentum=param[5])
  acc = train_part(model, tweets_c_tensor, labels_c_tensor, optimizer, epochs=10, num_class=3)
  return acc

# maximize=True: the objective is an accuracy, so larger is better.
taskC_opt = BayesianOptimization(f=taskC_tuning,
                                 domain=domain,
                                 model_type='GP',
                                 acquisition_type='EI',
                                 acquisition_jitter=0.05,
                                 maximize=True)

taskC_opt.run_optimization(max_iter=20)

| Epoch: 01 | Train Loss: 1.050 | Train Acc: 51.63% | Val. Loss: 1.044 | Val. Acc: 60.00% |
| Epoch: 02 | Train Loss: 1.042 | Train Acc: 53.11% | Val. Loss: 1.038 | Val. Acc: 60.00% |
| Epoch: 03 | Train Loss: 1.039 | Train Acc: 54.18% | Val. Loss: 1.032 | Val. Acc: 60.00% |
| Epoch: 04 | Train Loss: 1.032 | Train Acc: 55.14% | Val. Loss: 1.025 | Val. Acc: 60.00% |
| Epoch: 05 | Train Loss: 1.024 | Train Acc: 56.43% | Val. Loss: 1.019 | Val. Acc: 60.00% |
| Epoch: 06 | Train Loss: 1.023 | Train Acc: 56.37% | Val. Loss: 1.013 | Val. Acc: 60.00% |
| Epoch: 07 | Train Loss: 1.010 | Train Acc: 59.24% | Val. Loss: 1.007 | Val. Acc: 60.00% |
| Epoch: 08 | Train Loss: 1.005 | Train Acc: 59.88% | Val. Loss: 1.001 | Val. Acc: 60.00% |
| Epoch: 09 | Train Loss: 0.998 | Train Acc: 59.40% | Val. Loss: 0.996 | Val. Acc: 60.00% |
| Epoch: 10 | Train Loss: 0.995 | Train Acc: 60.01% | Val. Loss: 0.992 | Val. Acc: 60.00% |
| Epoch: 01 | Train Loss: 1.121 | Train Acc: 24.73% | Val. Loss: 1.113 | Val. Ac

In [22]:
# Report the hyper-parameters that the next cell uses to build the task C model.
# NOTE(review): `X[-1]` appears to be the last point the optimiser evaluated,
# not necessarily the best one (GPyOpt exposes that as `x_opt`) -- confirm, and
# change this cell together with the model-building cell.
print("The hyperparameters used for the model in task C are")
print(f'embedding_dim: {int(taskC_opt.X[-1][0])}')
print(f'hidden_dim: {int(taskC_opt.X[-1][2])}')
print(f'drop_out: {taskC_opt.X[-1][3]:.2f}')
print(f'learning_rate: {taskC_opt.X[-1][4]:.4f}')
# Column 5 of the search domain is the SGD momentum; it was previously
# printed under the wrong name "weight_decay".
print(f'momentum: {taskC_opt.X[-1][5]:.4f}')

The hyperparameters used for the model in task C are
embedding_dim: 100
hidden_dim: 128
drop_out: 0.20
learning_rate: 0.0001
weight_decay: 0.9000


In [23]:
# Build the task C model from the last row of the optimiser's evaluation
# history and train it for 50 epochs on the three-class objective.
# CNN(vocab_size, embedding_dim, out_channels, window_size, output_dim, dropout)
# test_model_C = CNN(INPUT_DIM, int(taskC_opt.X[-1][0]), int(taskC_opt.X[-1][1]), WINDOW_SIZE, OUTPUT_DIM, taskC_opt.X[-1][3])
# LSTMClassifier(vocab_size, embedding_dim, hidden_dim, label_size, output_dim, dropout)
test_model_C = LSTMClassifier(INPUT_DIM, int(taskC_opt.X[-1][0]), int(taskC_opt.X[-1][2]), 2, OUTPUT_DIM, taskC_opt.X[-1][3])
optimizer_C = optim.SGD(test_model_C.parameters(), lr=taskC_opt.X[-1][4], momentum=taskC_opt.X[-1][5])
train_part(test_model_C, tweets_c_tensor, labels_c_tensor, optimizer_C, epochs=50, num_class=3)

| Epoch: 01 | Train Loss: 1.042 | Train Acc: 62.14% | Val. Loss: 1.048 | Val. Acc: 60.00% |
| Epoch: 02 | Train Loss: 1.042 | Train Acc: 62.27% | Val. Loss: 1.048 | Val. Acc: 60.00% |
| Epoch: 03 | Train Loss: 1.041 | Train Acc: 62.11% | Val. Loss: 1.048 | Val. Acc: 60.00% |
| Epoch: 04 | Train Loss: 1.041 | Train Acc: 62.40% | Val. Loss: 1.048 | Val. Acc: 60.00% |
| Epoch: 05 | Train Loss: 1.040 | Train Acc: 62.14% | Val. Loss: 1.047 | Val. Acc: 60.00% |
| Epoch: 06 | Train Loss: 1.041 | Train Acc: 62.04% | Val. Loss: 1.047 | Val. Acc: 60.00% |
| Epoch: 07 | Train Loss: 1.040 | Train Acc: 62.21% | Val. Loss: 1.046 | Val. Acc: 60.00% |
| Epoch: 08 | Train Loss: 1.040 | Train Acc: 62.14% | Val. Loss: 1.045 | Val. Acc: 60.00% |
| Epoch: 09 | Train Loss: 1.038 | Train Acc: 62.21% | Val. Loss: 1.045 | Val. Acc: 60.00% |
| Epoch: 10 | Train Loss: 1.039 | Train Acc: 62.33% | Val. Loss: 1.044 | Val. Acc: 60.00% |
| Epoch: 11 | Train Loss: 1.035 | Train Acc: 62.24% | Val. Loss: 1.043 | Val. Ac

tensor(0.6000)

In [36]:
# Rebuild the validation slice with the same 0.2 fraction that train_part uses
# by default, then score the trained task C model on it.
# Unlike inside train_part, target_valid is not converted with .long() here.
_, feature_valid, _, target_valid = train_valid_split(tweets_c_tensor, labels_c_tensor, 0.2)

test_model_C.eval()

with torch.no_grad():
  # squeeze(1) only drops dim 1 when it has size 1, so it is a no-op for 3 logits.
  predictions = test_model_C(feature_valid).squeeze(1)
  acc = check_accuracy(predictions, target_valid, num_class=3)
  score = check_f1_score(predictions, target_valid, num_class=3)

print(f'Validation Accuracy: {acc:.4f}')
print(f'Validation F1 Score: {score:.4f}')

Validation Accuracy: 0.6000
Validation F1 Score: 0.2500


In [0]:
def indices_to_labels(indices, label2index):
  """Translate predicted class indices back to label strings.

  For every index, each label whose mapped number equals it is emitted, in the
  mapping's iteration order. Indices that match no label contribute nothing,
  so the result can be shorter than `indices`.
  """
  return [
    key
    for index in indices
    for key, num in label2index.items()
    if index == num
  ]

In [38]:
# Paths of the three test TSVs (despite the names, these are files, not directories).
testA_dir = "/content/drive/My Drive/OffensEval/data/taskA/testset-taska.tsv"
testB_dir = "/content/drive/My Drive/OffensEval/data/taskB/testset-taskb.tsv"
testC_dir = "/content/drive/My Drive/OffensEval/data/taskC/test_set_taskc.tsv"

def load_and_clean_test_set(path, task_name):
  """Read one test TSV and run the training preprocessing pipeline on it.

  Replaces three copy-pasted versions of the same four-step pipeline.

  Args:
    path: location of the tab-separated test file with 'id' and 'tweet' columns.
    task_name: task letter inserted into the progress-bar captions.

  Returns:
    (data, tweets, clean): the raw frame, its single-column 'tweet' view, and
    an independent copy with a cleaned 'tweet' column plus a 'tokens' column.
  """
  data = pd.read_csv(path, sep='\t', header=0)
  tweets = data[["tweet"]]
  clean = copy.deepcopy(tweets)
  # (caption, source column, target column, function) -- same order as for the training data
  steps = [
    ("Cleaning Data", 'tweet', 'tweet', clean_data),
    ("Tokenizing Data", 'tweet', 'tokens', tokenize),
    ("Removing Stop Words", 'tokens', 'tokens', remove_stop_words),
    ("Lemmatizing And Stemming", 'tokens', 'tokens', lemmatize_and_stem),
  ]
  for caption, source, target, step in steps:
    tqdm.pandas(desc=f"{caption} for Task {task_name}...")
    clean[target] = clean[source].progress_apply(step)
  return data, tweets, clean

testA_data, testA_tweets, clean_testA = load_and_clean_test_set(testA_dir, "A")
testB_data, testB_tweets, clean_testB = load_and_clean_test_set(testB_dir, "B")
testC_data, testC_tweets, clean_testC = load_and_clean_test_set(testC_dir, "C")

testA_ids = testA_data["id"].tolist()
testB_ids = testB_data["id"].tolist()
testC_ids = testC_data["id"].tolist()

tokenized_testA = clean_testA['tokens'].tolist()
tokenized_testB = clean_testB['tokens'].tolist()
tokenized_testC = clean_testC['tokens'].tolist()

# Encode with the training vocabulary and padded width; unseen tokens are dropped.
testA_tensor = tweet_to_tensor(tokenized_testA, word2index, max_length)
testB_tensor = tweet_to_tensor(tokenized_testB, word2index, max_length)
testC_tensor = tweet_to_tensor(tokenized_testC, word2index, max_length)

print("Test A tensor size:")
print(testA_tensor.shape)
print("Test B tensor size:")
print(testB_tensor.shape)
print("Test C tensor size:")
print(testC_tensor.shape)

Cleaning Data for Task A...: 100%|██████████| 860/860 [00:00<00:00, 58087.24it/s]
Tokenizing Data for Task A...: 100%|██████████| 860/860 [00:00<00:00, 5217.38it/s]
Removing Stop Words for Task A...: 100%|██████████| 860/860 [00:00<00:00, 5948.87it/s]
Lemmatizing And Stemming for Task A...: 100%|██████████| 860/860 [00:00<00:00, 1972.76it/s]
Cleaning Data for Task B...: 100%|██████████| 240/240 [00:00<00:00, 38679.46it/s]
Tokenizing Data for Task B...: 100%|██████████| 240/240 [00:00<00:00, 4958.73it/s]
Removing Stop Words for Task B...: 100%|██████████| 240/240 [00:00<00:00, 5565.72it/s]
Lemmatizing And Stemming for Task B...: 100%|██████████| 240/240 [00:00<00:00, 2053.27it/s]
Cleaning Data for Task C...: 100%|██████████| 213/213 [00:00<00:00, 34241.18it/s]
Tokenizing Data for Task C...: 100%|██████████| 213/213 [00:00<00:00, 4915.61it/s]
Removing Stop Words for Task C...: 100%|██████████| 213/213 [00:00<00:00, 5311.04it/s]
Lemmatizing And Stemming for Task C...: 100%|██████████| 213

Test A tensor size:
torch.Size([860, 39])
Test B tensor size:
torch.Size([240, 39])
Test C tensor size:
torch.Size([213, 39])





In [0]:
# Predict task A labels for the test set and write them as "id,label" rows
# (no header, no index) to testA.csv.
test_model_A.eval()

with torch.no_grad():
  output = test_model_A(testA_tensor).squeeze(1)
  # sigmoid + round turns each logit into a 0./1. class index
  output = torch.round(torch.sigmoid(output))

preds_testA = indices_to_labels(output, a2index)

taskA_df = pd.DataFrame(testA_ids, columns=['id'])
taskA_df['predict'] = preds_testA
taskA_df.to_csv('testA.csv', header=False, index=False)

In [0]:
# Predict task B labels for the test set and write them as "id,label" rows
# (no header, no index) to testB.csv.
test_model_B.eval()

with torch.no_grad():
  output = test_model_B(testB_tensor).squeeze(1)
  # sigmoid + round turns each logit into a 0./1. class index
  output = torch.round(torch.sigmoid(output))

preds_testB = indices_to_labels(output, b2index)

taskB_df = pd.DataFrame(testB_ids, columns=['id'])
taskB_df['predict'] = preds_testB
taskB_df.to_csv('testB.csv', header=False, index=False)

In [0]:
# Predict task C labels for the test set and write them as "id,label" rows
# (no header, no index) to testC.csv.
test_model_C.eval()

with torch.no_grad():
  output = test_model_C(testC_tensor).squeeze(1)
  # Normalise over the class dimension explicitly; calling log_softmax without
  # `dim` is deprecated and relies on an implicit choice.
  output = F.log_softmax(output, dim=1)
  # index of the most likely class per row, as a numpy array
  output = output.data.max(1)[1].numpy()

preds_testC = indices_to_labels(output, c2index)

taskC_df = pd.DataFrame(testC_ids, columns=['id'])
taskC_df['predict'] = preds_testC
taskC_df.to_csv('testC.csv', header=False, index=False)