In [0]:
# Loading packages

import torch
from torch.autograd import Variable
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm 
import codecs
import random

# We fix the seeds to get consistent results

SEED = 1111
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [0]:
def augment_negatives(corpus, labels):
    p, n = [], []
    for idx, label in enumerate(labels):
        if label == 0: 
            p.append(corpus[idx])
        else:
            n.append(corpus[idx])

    for negative in n:
        r = random.randint(0, len(p)) - 1
        
        even = random.randint(0, 5)
        
        new_negative = ""
        
        if even % 2 == 1:
            new_negative = negative + " " + p[r]
        else:
            new_negative = p[r] + " " + negative
            
        corpus.append(new_negative)
        labels.append(1)  
        
    return corpus, labels

In [0]:
def shuffle_corpus(train_corpus, train_labels_a):
    u = list(zip(train_corpus, train_labels_a))    
    random.shuffle(u)
    train_corpus, train_labels_a = zip(*u)

In [0]:
import csv
import numpy as np

def read_csv(path):
    rows = []
    with open(path) as tsvfile:
        reader = csv.DictReader(tsvfile, dialect='excel-tab')
        for row in reader:
            rows.append(row)
    return rows

In [5]:
!pip install nltk

import re
import nltk

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

nltk.download("stopwords")
nltk.download("wordnet")

stop = stopwords.words("english")

ps = PorterStemmer()

def keep_only_spaces(text):
  return re.sub(r'([^\s\w]|_)+', '', text)

def remove_stop_words(text):
    text.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [0]:
import csv
import re

def translator(user_string):
    user_string = user_string.split(" ")
    j = 0
    for _str in user_string:
        fileName = "slang.txt"
        accessMode = "r"
        with open(fileName, accessMode) as myCSVfile:
            dataFromFile = csv.reader(myCSVfile, delimiter="=")
            _str = keep_only_spaces(_str)
            changed = False
            for row in dataFromFile:
                if _str.upper() == row[0]:
                    user_string[j] = row[1]
                    changed = True
            if not changed:
                user_string[j] = _str
            myCSVfile.close()
        j = j + 1
    # Replacing commas with spaces for final output.
    return ' '.join(user_string)

In [7]:
rows = read_csv('offenseval-training-v1.tsv')
train_corpus = [translator(row['tweet'].lower()).lower() for row in rows]
train_labels_a = [1 if row['subtask_a'] == 'OFF' else 0 for row in rows]
train_labels_b = [row['subtask_b'] for row in rows]
train_labels_c = [row['subtask_c'] for row in rows]

# train_corpus, train_labels_a = augment_negatives(train_corpus, train_labels_a)
  
print(train_corpus)
print(train_labels_a)

print(len(train_corpus))
print(len(train_labels_a))

[1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 

In [0]:
from nltk.stem.wordnet import WordNetLemmatizer

lmtzr = WordNetLemmatizer()

def get_tokenized_corpus(corpus):
    tokenized_corpus = []

    for sentence in corpus:
        tokenized_sentence = []
        for token in sentence.split(' '): 
            if token not in stop:
#                   tokenized_sentence.append(token)
              
                tokenized_sentence.append(lmtzr.lemmatize(token))
#                 tokenized_sentence.append(ps.stem(token))
#             else:
#                 print(token)
        tokenized_corpus.append(tokenized_sentence)
 
    return tokenized_corpus

In [9]:
tokenized_corpus = get_tokenized_corpus(train_corpus)
print(len(tokenized_corpus))

13240


In [10]:
def get_vocabulary(tokenized_corpus):
    vocabulary = [] # Let us put all the tokens (mostly words) 
                    # appearing in the vocabulary in a list
    for sentence in tokenized_corpus:
        for token in sentence:
            if token not in vocabulary:
                vocabulary.append(token)
    return vocabulary
  
vocabulary = get_vocabulary(tokenized_corpus)
print(vocabulary)
print(len(vocabulary))

18766


In [0]:
def get_word2idx(tokenized_corpus, vocabulary):  
  word2idx = {w: idx+1 for (idx, w) in enumerate(vocabulary)}
  # we reserve the 0 index for the placeholder token
  word2idx['<pad>'] = 0
  return word2idx

word2idx = get_word2idx(tokenized_corpus, vocabulary)

In [0]:
def get_idx2word(vocabulary):
    return {idx: w for (idx, w) in enumerate(vocabulary)}
  
idx2word = get_idx2word(vocabulary)

In [13]:
def parse_input(tokenized_corpus, word2idx, labels, max_len):
  # we index our sentences
  vectorized_sentences = [[word2idx[token] for token in sentence if token in word2idx] for sentence in tokenized_corpus]
#   print(vectorized_sentences)
  
  # we create a tensor of a fixed size filled with zeroes for padding
  sentences_tensor = Variable(torch.zeros((len(vectorized_sentences), max_len))).long()
  sentences_lengths = [len(sentence) for sentence in vectorized_sentences]
  
  # we fill it with our vectorized sentences 
  
  for idx, (sentence, sentence_len) in enumerate(zip(vectorized_sentences, sentences_lengths)):
      sentences_tensor[idx, :sentence_len] = torch.LongTensor(sentence)

  labels_tensor = torch.FloatTensor(labels)
  
  return sentences_tensor, labels_tensor

sentences_lengths = [len(sentence) for sentence in tokenized_corpus]
max_len = np.max(np.array(sentences_lengths))

train_sentences_tensor, train_labels_tensor = parse_input(tokenized_corpus, word2idx, train_labels_a, max_len)
print(train_sentences_tensor)
print(train_labels_tensor)

print(train_sentences_tensor.shape)
print(train_labels_tensor.shape)

tensor([[    1,     2,     3,  ...,     0,     0,     0],
        [    1,     1,     6,  ...,     0,     0,     0],
        [   14,    15,    16,  ...,     0,     0,     0],
        ...,
        [    1,  2732,  2696,  ...,     0,     0,     0],
        [    1,  1322,     0,  ...,     0,     0,     0],
        [18765,   963,  1032,  ...,     0,     0,     0]])
tensor([1., 1., 0.,  ..., 1., 1., 0.])
torch.Size([13240, 81])
torch.Size([13240])


In [14]:
valid = read_csv('validation.tsv')

print(valid)

valid_corpus = [translator(row['tweet'].lower().replace("&amp", " ")).lower() for row in valid]
# valid_corpus = [row['tweet'].lower() for row in valid]
valid_labels_a = [1 if row['subtask_a'] == 'OFF' else 0 for row in valid]

offensives = np.count_nonzero(valid_labels_a)
nonoffensives = len(valid_labels_a) - offensives

# TODO: maybe augment

print(offensives)
print(nonoffensives)

print(valid_corpus)
print(valid_labels_a)

print(len(valid_corpus))
print(len(valid_labels_a))

[OrderedDict([('tweet', '@BreitbartNews OK Shannon, YOU tell the veterans in those locker rooms they have to stay there until the celebration of what they fought for is over.'), ('subtask_a', 'NOT'), ('subtask_b', 'NULL'), ('subtask_c', 'NULL')]), OrderedDict([('tweet', '@LeftyGlenn @jaredeker @BookUniverse @hashtagzema @RalphLombardi @NathanHRubin Fine... Because i could afford a gun if i wanted to. I could fit it into my budget. My budgeting is fine??? Here in canada we have gun insurance and gun control? And lotsa p'), ('subtask_a', 'NOT'), ('subtask_b', 'NULL'), ('subtask_c', 'NULL')]), OrderedDict([('tweet', 'Hot Mom Sucks Off Step Son In Shower 8 min https://t.co/Y0zi9f5z6J'), ('subtask_a', 'OFF'), ('subtask_b', 'UNT'), ('subtask_c', 'NULL')]), OrderedDict([('tweet', 'bro these are some cute butt plugs I’m trying to cop https://t.co/RsnxRF4HTi'), ('subtask_a', 'OFF'), ('subtask_b', 'UNT'), ('subtask_c', 'NULL')]), OrderedDict([('tweet', 'Arizona Supreme Court strikes down state l

In [15]:
tokenized_valid_corpus = get_tokenized_corpus(valid_corpus)
valid_sentences_tensor, valid_labels_tensor = parse_input(tokenized_valid_corpus, word2idx, valid_labels_a, max_len)
print(valid_sentences_tensor)
print(valid_labels_tensor)

print(valid_sentences_tensor.shape)
print(valid_labels_tensor.shape)

tensor([[ 1015, 13666,   215,  ...,     0,     0,     0],
        [ 1223,   198,  8835,  ...,     0,     0,     0],
        [ 1462,   154,   983,  ...,     0,     0,     0],
        ...,
        [  202,  2974,   888,  ...,     0,     0,     0],
        [  309,   317,  1671,  ...,     0,     0,     0],
        [ 1600,   295, 12701,  ...,     0,     0,     0]])
tensor([0., 0., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
        1., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
        1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
        0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.

In [16]:
test = read_csv('testset-taska.tsv')

print(test)

test_corpus = [row['tweet'] for row in test]
# test_labels_a = [1 if row['subtask_a'] == 'OFF' else 0 for row in test]

print(test_corpus)
# print(test_labels_a)

FileNotFoundError: ignored

In [0]:
def accuracy(output, target):
    output = torch.round(torch.sigmoid(output))
    correct = (output == target).float()
    acc = correct.sum()/len(correct) 
    return acc    

In [0]:
def f_measure(output, gold):  
  pred = torch.round(torch.sigmoid(output))
  pred = pred.detach().cpu().numpy()
   
  test_pos_preds = np.sum(pred)
  test_pos_real = np.sum(gold)
    
  true_positives = (np.logical_and(pred, gold)).astype(int)
  true_positives = np.sum(true_positives)
  print(true_positives)
  
  precision = true_positives/test_pos_preds
  recall = true_positives/test_pos_real
  
  fscore = 2.0*precision*recall/(precision+recall)
  print("Test: Recall: %.2f, Precision: %.2f, F-measure: %.2f\n" % (recall, precision, fscore))  

In [0]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, output_dim):
        super(LSTMTagger, self).__init__()

        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.linear = nn.Linear(hidden_dim, output_dim)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # the axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 40, self.hidden_dim),
                torch.zeros(1, 40, self.hidden_dim))

    def forward(self, sentence):
#         print("sentence shape " + str(sentence.shape))
      
        embeds = self.word_embeddings(sentence)
        
#         print("embeedding shape after layer " + str(embeds.shape))
        
        # if bidirectional num_directions = 2
        # input to lstm must have dims (seq_len, batch, input_size), (num_layers * num_directions, batch, hidden_size)
        seq_len = embeds.size(1)
        batch = embeds.size(0)
        input_size = embeds.size(2)
        lstm_out, self.hidden = self.lstm(embeds.permute(1, 0, 2), self.hidden)
        
#         print("lstm aici " + str(lstm_out.shape))
        # keep only the last output of the sequence
        last = lstm_out.permute(1, 0, 2)[:, -1, :]
        res = self.linear(last)
        
#         print("res shape " + str(res.shape))
#         print(res.squeeze()[-1])
        return res.squeeze()
#         tag_scores = F.log_softmax(tag_space, dim=1)
#         return tag_scores

In [0]:
import torchvision

feature_train = train_sentences_tensor
target_train = train_labels_tensor

feature_valid = valid_sentences_tensor
target_valid = valid_labels_tensor

# print(feature_train.shape)
# print(feature_valid.shape)
training_set = torch.utils.data.TensorDataset(feature_train, target_train)
train_input = torch.utils.data.DataLoader(training_set, batch_size=40,
                        shuffle=True)
validation_set = torch.utils.data.TensorDataset(feature_valid, target_valid)
valid_input = torch.utils.data.DataLoader(validation_set, batch_size=40,
                        shuffle=True)

EPOCHS = 100

INPUT_DIM = len(word2idx)
EMBEDDING_DIM = 200
OUTPUT_DIM = 1
HIDDEN_DIM = 100

# def __init__(self, embedding_dim, hidden_dim, vocab_size, output_dim):
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, INPUT_DIM, OUTPUT_DIM)
model.hidden = model.init_hidden()

optimizer = optim.Adam(model.parameters(), lr=0.0008)
loss_fn = nn.BCEWithLogitsLoss()

for epoch in range(EPOCHS):
    
    model.train()
    epoch_loss = 0
    epoch_predictions = []
    for batch, labels in train_input:
        optimizer.zero_grad()
        model.hidden = model.init_hidden()

        predictions = model(batch)

        loss = loss_fn(predictions, labels)
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()
        
        epoch_predictions.append(predictions.cpu().detach().numpy().tolist())
        
        
    model.eval()
#     print(torch.tensor(epoch_predictions).view(-1))
#     print(torch.tensor(epoch_predictions).view(-1).shape)
    epoch_acc = accuracy(torch.tensor(epoch_predictions).view(-1), target_train)
  
    with torch.no_grad():
        valid_loss = 0
        predictions = []
        for batch, labels in valid_input:
            predictions_valid = model(batch)
            loss = loss_fn(predictions_valid, labels)
            valid_loss += loss.item()
            predictions.append(predictions_valid.cpu().numpy().tolist())
        
        valid_acc = accuracy(torch.tensor(predictions).view(-1), target_valid)

    print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.3f} | Train Acc: {epoch_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')
    
model.eval()

# feature = feature_valid
# target = target_valid

# with torch.no_grad():
 
#     predictions = model(feature).squeeze(1)
#     loss = loss_fn(predictions, target)
#     acc = accuracy(predictions, target)
#     print(f'| Test Loss: {loss:.3f} | Test Acc: {acc*100:.2f}%')
#     f_measure(predictions, valid_labels_a)

    
# print(np.count_nonzero(valid_labels_a))

| Epoch: 00 | Train Loss: 211.147 | Train Acc: 66.72% | Val. Loss: 4.566 | Val. Acc: 75.94% |
| Epoch: 01 | Train Loss: 210.663 | Train Acc: 66.76% | Val. Loss: 4.657 | Val. Acc: 75.94% |
| Epoch: 02 | Train Loss: 210.551 | Train Acc: 66.76% | Val. Loss: 4.504 | Val. Acc: 75.94% |
| Epoch: 03 | Train Loss: 210.536 | Train Acc: 66.77% | Val. Loss: 4.620 | Val. Acc: 75.94% |
| Epoch: 04 | Train Loss: 210.500 | Train Acc: 66.76% | Val. Loss: 4.604 | Val. Acc: 75.94% |
| Epoch: 05 | Train Loss: 210.482 | Train Acc: 66.77% | Val. Loss: 4.599 | Val. Acc: 75.94% |
| Epoch: 06 | Train Loss: 210.471 | Train Acc: 66.76% | Val. Loss: 4.579 | Val. Acc: 75.94% |
| Epoch: 07 | Train Loss: 210.491 | Train Acc: 66.76% | Val. Loss: 4.572 | Val. Acc: 75.94% |
| Epoch: 08 | Train Loss: 210.455 | Train Acc: 66.76% | Val. Loss: 4.563 | Val. Acc: 75.94% |
| Epoch: 09 | Train Loss: 210.454 | Train Acc: 66.76% | Val. Loss: 4.564 | Val. Acc: 75.94% |
| Epoch: 10 | Train Loss: 210.506 | Train Acc: 66.76% | Val.