In [2]:
# Loading packages
!pip install nltk

exec(open('utilities_cw.py').read())

# We fix the seeds to get consistent results
SEED = 1111
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
one.py is being run directly


In [0]:
from sklearn.model_selection import train_test_split

train_file = 'offenseval-training-v1.tsv'

# Ratio 1 - OFF, 2 - NOT
corpus, labels = parse_dataset_task_a(train_file)

In [4]:
print(len(corpus))
print(len(labels))

off = np.count_nonzero(labels)
notoff = len(labels) - off

print("OFF " + str(off))
print("NOT " + str(notoff))

13240
13240
OFF 4400
NOT 8840


In [0]:
train_corpus, valid_corpus, train_labels, valid_labels = train_test_split(corpus, labels, test_size=0.3, random_state=42)

In [0]:
tokenize_f = tokenize_stemming

In [9]:
tokenized_train_corpus = tokenize_f(train_corpus)
print(len(tokenized_train_corpus))

9268


In [10]:
vocabulary = get_vocabulary(tokenized_train_corpus)
print(vocabulary)
print(len(vocabulary))

['user', 'truli', 'anyth', 'avoid', 'gun', 'control', 'wrong', 'character', 'fuck', 'truth', 'guy', 'ignor', 'see', 'need', 'add', 'law', 'democrat', 'run', 'thing', '', 'area', 'much', 'less', 'crime', 'boom', 'mayb', 'book', 'deal', 'isnt', 'good', 'idea', 'fisadeclassif', 'maga', 'qanon', 'trustsess', 'wow', 'corrupt', 'deep', 'url', 'unfirtun', 'seen', 'hispan', 'justifi', 'shit', 'trump', 'argu', 'two', 'famili', 'member', 'mine', 'believ', 'help', 'pr', 'lot', 'rel', 'fan', 'amp', 'agre', 'major', 'immigr', 'come', 'bad', 'upset', 'rubio', 'want', 'doj', 'investig', 'kerri', 'iran', 'meddl', 'via', 'attent', 'public', 'antifa', 'target', 'list', 'account', 'high', 'profil', 'name', 'mani', 'random', 'support', 'twitter', 'suspend', 'pleas', 'report', 'check', 'doesnt', 'partli', 'fund', 'tell', 'know', 'bum', 'write', 'ass', 'bottom', 'proof', 'higher', 'intellig', 'pass', 'money', 'outsid', 'hope', 'real', 'person', 'perfect', 'tweet', 'obama', 'devil', 'incarn', 'jesu', 'christ

In [11]:
word2idx = get_word2idx(tokenized_train_corpus, vocabulary)
print(len(word2idx))

12583


In [12]:
idx2word = get_idx2word(vocabulary)
print(len(idx2word))

12582


In [13]:
sentences_lengths = [len(sentence) for sentence in tokenized_train_corpus]

# sentences_lengths_all = [len(sentence) for sentence in tokenized_corpus]
# CHECK IF THIS IS ALRIGHT
max_len = np.max(np.array(sentences_lengths))
print(max_len)

train_sentences_tensor, train_labels_tensor = parse_input(tokenized_train_corpus, word2idx, train_labels, max_len)

print(train_sentences_tensor)
print(train_labels_tensor)

print(train_sentences_tensor.shape)
print(train_labels_tensor.shape)

81
tensor([[    1,     1,     2,  ...,     0,     0,     0],
        [    1,     7,     8,  ...,     0,     0,     0],
        [    1,     1,    14,  ...,     0,     0,     0],
        ...,
        [    1,     1,   441,  ...,     0,     0,     0],
        [    1,  4946,   659,  ...,     0,     0,     0],
        [    1,     1, 11322,  ...,     0,     0,     0]])
tensor([0., 1., 0.,  ..., 0., 0., 0.])
torch.Size([9268, 81])
torch.Size([9268])


In [14]:
test_file = 'validation.tsv'
test_corpus, test_labels = parse_dataset_task_a(test_file)

offensives = np.count_nonzero(test_labels)
nonoffensives = len(test_labels) - offensives

print("OFF " + str(offensives))
print("NOT " + str(nonoffensives))

print(test_corpus)
print(test_labels)

print(len(test_corpus))
print(len(test_labels))

OFF 77
NOT 243
['breitbartnews ok shannon you tell the veterans in those locker rooms they have to stay there until the celebration of what they fought for is over', 'leftyglenn jaredeker bookuniverse hashtagzema ralphlombardi nathanhrubin fine because i could afford a gun if i wanted to i could fit it into my budget my budgeting is fine here in canada we have gun insurance and gun control and lotsa p', 'hot mom sucks off step son in shower 8 min ', 'bro these are some cute butt plugs im trying to cop ', 'arizona supreme court strikes down state legislation prohibiting medical marijuana use on public college c ', 'arguing gun control is wrong of me whoever has a strong opinion doesnt really care about facts they have a emotional connection to the issue and its a waste of me to argue when am i gonna learn that laughing out loud', 'doctors interest in medical marijuana far outpaces available research  via thecanniforniancom ', 'a mustread and a mustshare for all your friends who support 

In [0]:
def parse_in(tokenized_train_corpus, word2idx, labels, max_len):
    # we index our sentences
    vectorized_sentences = [[word2idx[token] for token in sentence if token in word2idx] for sentence in tokenized_train_corpus]
    
    # we create a tensor of a fixed size filled with zeroes for padding
    sentences_tensor = Variable(torch.zeros((len(vectorized_sentences), max_len))).long()
    sentences_lengths = [len(sentence) for sentence in vectorized_sentences]

    # we fill it with our vectorized sentences 
    for idx, (sentence, sentence_len) in enumerate(zip(vectorized_sentences, sentences_lengths)):
        sentences_tensor[idx, :sentence_len] = torch.LongTensor(sentence)
            
    labels_tensor = torch.FloatTensor(labels)

    return sentences_tensor, labels_tensor

In [16]:
tokenized_valid_corpus = tokenize_f(valid_corpus)

print(len(tokenized_valid_corpus))
print(len(valid_corpus))
print(len(valid_labels))

valid_sentences_tensor, valid_labels_tensor = parse_in(tokenized_valid_corpus, word2idx, valid_labels, max_len)

print(valid_sentences_tensor)
print(valid_labels_tensor)

print(max_len)

print(valid_sentences_tensor.shape)
print(valid_labels_tensor.shape)

3972
3972
3972
tensor([[   1,    1, 4818,  ...,    0,    0,    0],
        [   1,    1,    5,  ...,    0,    0,    0],
        [   1,  577, 1595,  ...,    0,    0,    0],
        ...,
        [ 163,  345,  346,  ...,    0,    0,    0],
        [   1,    1,    1,  ...,    0,    0,    0],
        [   1,    1,  129,  ...,    0,    0,    0]])
tensor([0., 0., 0.,  ..., 0., 0., 1.])
81
torch.Size([3972, 81])
torch.Size([3972])


In [17]:
tokenized_test_corpus = tokenize_f(test_corpus)
test_sentences_tensor, test_labels_tensor = parse_input(tokenized_test_corpus, word2idx, test_labels, max_len)

print(test_sentences_tensor)
print(test_labels_tensor)

print(test_sentences_tensor.shape)
print(test_labels_tensor.shape)

tensor([[1632,  118,   92,  ...,    0,    0,    0],
        [ 993,  165, 3130,  ...,    0,    0,    0],
        [1872, 2459, 1255,  ...,    0,    0,    0],
        ...,
        [1000, 2152,  494,  ...,    0,    0,    0],
        [ 493,  439, 1176,  ...,    0,    0,    0],
        [6089,  255, 7495,  ...,    0,    0,    0]])
tensor([0., 0., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
        1., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
        1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
        0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 1., 0., 0., 0., 0.

In [0]:
def train_model_custom(model, optimizer, loss_fn, feature_train, target_train, feature_valid, target_valid, feature_test, target_test):    
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        predictions = model(feature_train).squeeze(1)
        loss = loss_fn(predictions, target_train)
        acc = accuracy(predictions, target_train)
        loss.backward()
        optimizer.step()

        epoch_loss = loss.item()
        epoch_acc = acc
  
        model.eval()
  
        with torch.no_grad():

            predictions_valid = model(feature_valid).squeeze(1)
            loss = loss_fn(predictions_valid, target_valid)
            acc = accuracy(predictions_valid, target_valid)
            valid_loss = loss.item()
            valid_acc = acc

        print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.3f} | Train Acc: {epoch_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')
    
    model.eval()

    with torch.no_grad():
 
        predictions = model(feature_test).squeeze(1)
        u = torch.FloatTensor(target_test)
        loss = loss_fn(predictions, u)
        acc = accuracy(predictions, u)
        print(f'| Test Loss: {loss:.3f} | Test Acc: {acc*100:.2f}%')
        f_measure(predictions, target_test)

    print(np.count_nonzero(target_test))

In [19]:
epochs=100

INPUT_DIM = len(word2idx)

EMBEDDING_DIM = 100
OUTPUT_DIM = 1
LEARNING_RATE = 0.0008

# the hyperparamerts specific to CNN
# we define the number of filters
N_OUT_CHANNELS = 100
# we define the window size
WINDOW_SIZE = 1
# we apply the dropout with the probability 0.5
DROPOUT = 0.5

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_OUT_CHANNELS, WINDOW_SIZE, OUTPUT_DIM, DROPOUT)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.BCEWithLogitsLoss()

train_model_custom(model, optimizer, loss_fn, train_sentences_tensor, train_labels_tensor, valid_sentences_tensor, valid_labels_tensor, test_sentences_tensor, test_labels)

| Epoch: 00 | Train Loss: 0.934 | Train Acc: 35.32% | Val. Loss: 0.828 | Val. Acc: 33.53% |
| Epoch: 01 | Train Loss: 0.870 | Train Acc: 37.76% | Val. Loss: 0.776 | Val. Acc: 34.44% |
| Epoch: 02 | Train Loss: 0.813 | Train Acc: 40.88% | Val. Loss: 0.734 | Val. Acc: 40.99% |
| Epoch: 03 | Train Loss: 0.767 | Train Acc: 46.55% | Val. Loss: 0.699 | Val. Acc: 49.50% |
| Epoch: 04 | Train Loss: 0.740 | Train Acc: 49.92% | Val. Loss: 0.672 | Val. Acc: 58.76% |
| Epoch: 05 | Train Loss: 0.711 | Train Acc: 54.11% | Val. Loss: 0.653 | Val. Acc: 64.17% |
| Epoch: 06 | Train Loss: 0.690 | Train Acc: 57.16% | Val. Loss: 0.640 | Val. Acc: 65.71% |
| Epoch: 07 | Train Loss: 0.677 | Train Acc: 59.82% | Val. Loss: 0.634 | Val. Acc: 66.44% |
| Epoch: 08 | Train Loss: 0.665 | Train Acc: 62.40% | Val. Loss: 0.631 | Val. Acc: 66.47% |
| Epoch: 09 | Train Loss: 0.655 | Train Acc: 64.11% | Val. Loss: 0.633 | Val. Acc: 66.44% |
| Epoch: 10 | Train Loss: 0.661 | Train Acc: 63.96% | Val. Loss: 0.637 | Val. Ac

In [0]:
testing = read_csv('testset-taska.tsv')

tweets = [row['tweet'] for row in testing]
tokenized_corpus = tokenize_f(tweets)

test_tensor, labels_tensor = parse_input(tokenized_corpus, word2idx, test_labels, max_len)

In [22]:
predictions_test = model(test_tensor).squeeze(1)

output = torch.round(torch.sigmoid(predictions_test))

print(output)

print(len(output))

tensor([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 

In [0]:
ids = [row['id'] for row in testing]
out = ['OFF' if e == 1 else 'NOT' for e in output.detach().numpy()]
zipped = list(zip(ids, out))

with open('predictions_task_a_split_stemming.csv', "w") as f:
    writer = csv.writer(f, dialect='excel')
    for row in zipped:
        writer.writerow(row)