In [15]:
# Loading packages
!pip install nltk

exec(open('utilities_cw.py').read())

# We fix the seeds to get consistent results
SEED = 1111
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
one.py is being run directly


In [0]:
from sklearn.model_selection import train_test_split

train_file = 'offenseval-training-v1.tsv'

# Ratio 1 - OFF, 2 - NOT
corpus, labels = parse_dataset_task_b(train_file)

In [0]:
train_corpus, valid_corpus, train_labels, valid_labels = train_test_split(corpus, labels, test_size=0.3, random_state=42)

train_corpus, train_labels = augment_untargeted(train_corpus, train_labels, 1600)
valid_corpus, valid_labels = augment_untargeted(valid_corpus, valid_labels, 900)

In [0]:
tokenize_f = tokenize_stemming

In [19]:
tokenized_train_corpus = tokenize_f(train_corpus)
print(len(tokenized_train_corpus))

4680


In [20]:
vocabulary = get_vocabulary(tokenized_train_corpus)
print(vocabulary)
print(len(vocabulary))

['user', 'im', 'save', 'good', 'shitttt', 'anoth', 'falsehood', 'dont', 'think', 'anyon', 'want', 'abolish', '2nd', 'amend', 'grow', 'major', 'sensibl', 'gun', 'control', 'form', 'better', 'background', 'check', 'ban', 'assault', 'rifl', 'trump', 'say', 'anyth', 'rile', 'base', 'snake', 'oil', 'russian', 'spi', 'receiv', 'much', 'money', 'putin', 'incom', 'evil', 'soy', 'conspiraci', 'nut', 'disgrac', 'danger', 'implod', 'remov', 'word', 'mueller', 'constip', 'diarrhea', 'url', 'exactli', 'willi', 'half', 'brain', 'would', 'consid', 'conserv', '', 'guy', 'liter', 'poster', 'child', 'liber', 'lmfaooooo', 'foh', 'w', 'free', 'thought', 'shit', 'lmaoooo', 'win', 'could', 'implement', 'last', 'part', 'plan', 'use', 'guillotin', 'china', 'blade', 'mexico', 'maga', 'lockthemallup', 'confirmkavanaugh', 'wwg1wga', 'patriotsunit', 'godblessamerica', 'godblesstheworld', 'godblessourmilitari', '100thmonkey', 'farright', 'psychopath', 'leader', 'invis', 'press', 'creat', 'catkil', 'excus', 'bring'

In [21]:
word2idx = get_word2idx(tokenized_train_corpus, vocabulary)
print(len(word2idx))

6682


In [22]:
idx2word = get_idx2word(vocabulary)
print(len(idx2word))

6681


In [23]:
sentences_lengths = [len(sentence) for sentence in tokenized_train_corpus]

# sentences_lengths_all = [len(sentence) for sentence in tokenized_corpus]
# CHECK IF THIS IS ALRIGHT
max_len = np.max(np.array(sentences_lengths))
print(max_len)

train_sentences_tensor, train_labels_tensor = parse_input(tokenized_train_corpus, word2idx, train_labels, max_len)

print(train_sentences_tensor)
print(train_labels_tensor)

print(train_sentences_tensor.shape)
print(train_labels_tensor.shape)

74
tensor([[   1,    2,    3,  ...,    0,    0,    0],
        [   1,    6,    7,  ...,    0,    0,    0],
        [   1,   34,   35,  ...,    0,    0,    0],
        ...,
        [   1, 1009, 2484,  ...,    0,    0,    0],
        [   1,    1,    1,  ...,    0,    0,    0],
        [   1,  461,  868,  ...,    0,    0,    0]])
tensor([0., 1., 1.,  ..., 0., 0., 0.])
torch.Size([4680, 74])
torch.Size([4680])


In [0]:
test_file = 'validation.tsv'
test_corpus, test_labels = parse_dataset_task_b(test_file)

In [0]:
def parse_in(tokenized_train_corpus, word2idx, labels, max_len):
    # we index our sentences
    vectorized_sentences = [[word2idx[token] for token in sentence if token in word2idx] for sentence in tokenized_train_corpus]
    
    # we create a tensor of a fixed size filled with zeroes for padding
    sentences_tensor = Variable(torch.zeros((len(vectorized_sentences), max_len))).long()
    sentences_lengths = [len(sentence) for sentence in vectorized_sentences]

    # we fill it with our vectorized sentences 
    for idx, (sentence, sentence_len) in enumerate(zip(vectorized_sentences, sentences_lengths)):
        sentences_tensor[idx, :sentence_len] = torch.LongTensor(sentence)
            
    labels_tensor = torch.FloatTensor(labels)

    return sentences_tensor, labels_tensor

In [0]:
tokenized_valid_corpus = tokenize_f(valid_corpus)

valid_sentences_tensor, valid_labels_tensor = parse_in(tokenized_valid_corpus, word2idx, valid_labels, max_len)

In [0]:
tokenized_test_corpus = tokenize_f(test_corpus)
test_sentences_tensor, test_labels_tensor = parse_input(tokenized_test_corpus, word2idx, test_labels, max_len)

In [0]:
def train_model_custom(model, optimizer, loss_fn, feature_train, target_train, feature_valid, target_valid, feature_test, target_test):    
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        predictions = model(feature_train).squeeze(1)
        loss = loss_fn(predictions, target_train)
        acc = accuracy(predictions, target_train)
        loss.backward()
        optimizer.step()

        epoch_loss = loss.item()
        epoch_acc = acc
  
        model.eval()
  
        with torch.no_grad():

            predictions_valid = model(feature_valid).squeeze(1)
            loss = loss_fn(predictions_valid, target_valid)
            acc = accuracy(predictions_valid, target_valid)
            valid_loss = loss.item()
            valid_acc = acc

        print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.3f} | Train Acc: {epoch_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')
    
    model.eval()

    with torch.no_grad():
 
        predictions = model(feature_test).squeeze(1)
        u = torch.FloatTensor(target_test)
        loss = loss_fn(predictions, u)
        acc = accuracy(predictions, u)
        print(f'| Test Loss: {loss:.3f} | Test Acc: {acc*100:.2f}%')
        f_measure(predictions, target_test)

    print(np.count_nonzero(target_test))

In [29]:
epochs=100

INPUT_DIM = len(word2idx)

EMBEDDING_DIM = 100
OUTPUT_DIM = 1
LEARNING_RATE = 0.0008

# the hyperparamerts specific to CNN
# we define the number of filters
N_OUT_CHANNELS = 100
# we define the window size
WINDOW_SIZE = 2
# we apply the dropout with the probability 0.5
DROPOUT = 0.5

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_OUT_CHANNELS, WINDOW_SIZE, OUTPUT_DIM, DROPOUT)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.BCEWithLogitsLoss()

train_model_custom(model, optimizer, loss_fn, train_sentences_tensor, train_labels_tensor, valid_sentences_tensor, valid_labels_tensor, test_sentences_tensor, test_labels)

| Epoch: 00 | Train Loss: 0.734 | Train Acc: 55.56% | Val. Loss: 0.700 | Val. Acc: 52.93% |
| Epoch: 01 | Train Loss: 0.714 | Train Acc: 55.66% | Val. Loss: 0.684 | Val. Acc: 55.50% |
| Epoch: 02 | Train Loss: 0.702 | Train Acc: 55.62% | Val. Loss: 0.678 | Val. Acc: 58.69% |
| Epoch: 03 | Train Loss: 0.682 | Train Acc: 57.52% | Val. Loss: 0.674 | Val. Acc: 59.19% |
| Epoch: 04 | Train Loss: 0.689 | Train Acc: 57.20% | Val. Loss: 0.672 | Val. Acc: 58.83% |
| Epoch: 05 | Train Loss: 0.679 | Train Acc: 58.25% | Val. Loss: 0.672 | Val. Acc: 59.82% |
| Epoch: 06 | Train Loss: 0.670 | Train Acc: 60.41% | Val. Loss: 0.672 | Val. Acc: 60.09% |
| Epoch: 07 | Train Loss: 0.654 | Train Acc: 61.39% | Val. Loss: 0.674 | Val. Acc: 58.38% |
| Epoch: 08 | Train Loss: 0.660 | Train Acc: 60.96% | Val. Loss: 0.677 | Val. Acc: 58.60% |
| Epoch: 09 | Train Loss: 0.647 | Train Acc: 62.93% | Val. Loss: 0.678 | Val. Acc: 58.65% |
| Epoch: 10 | Train Loss: 0.641 | Train Acc: 64.25% | Val. Loss: 0.679 | Val. Ac

In [0]:
testing = read_csv('testset-taskb.tsv')

tweets = [row['tweet'] for row in testing]
tokenized_corpus = tokenize_f(tweets)

test_tensor, labels_tensor = parse_input(tokenized_corpus, word2idx, test_labels, max_len)

In [34]:
predictions_test = model(test_tensor).squeeze(1)

output = torch.round(torch.sigmoid(predictions_test))

print(output)

print(len(output))

tensor([1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1.,
        1., 0., 0., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1.,
        1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1.,
        1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1.,
        0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1.,
        0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0.,
        1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 0., 1., 1., 1., 1., 0., 

In [0]:
ids = [row['id'] for row in testing]
out = ['UNT' if e == 0 else 'TIN' for e in output.detach().numpy()]
zipped = list(zip(ids, out))

with open('predictions_task_b_split_augmentation.csv', "w") as f:
    writer = csv.writer(f, dialect='excel')
    for row in zipped:
        writer.writerow(row)