In [24]:
# Loading packages
!pip install nltk

exec(open('utilities_cw.py').read())

# We fix the seeds to get consistent results
SEED = 1111
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
one.py is being run directly


In [0]:
from sklearn.model_selection import train_test_split

# File name of the training data, passed to the parser below.
train_file = 'offenseval-training-v1.tsv'
# parse_dataset_task_b is defined outside this notebook section; its two return
# values are kept as `corpus` and `labels` and are later extended with the
# validation data and shuffled into the training set.
corpus, labels = parse_dataset_task_b(train_file)

In [0]:
# Sanity check: number of parsed examples and number of parsed labels.
# `corpus` / `labels` are the names bound by the loading cell above;
# `train_corpus` / `train_labels` are only created further down, so referring
# to them here fails on a fresh kernel (Restart & Run All).
print(len(corpus))
print(len(labels))


In [0]:
# Despite the `test_` prefix, this points at the validation file; the examples
# parsed from it are appended to `corpus` / `labels` in the next cell.
test_file = 'validation.tsv'
# Parsed with the same helper as the training file.
test_corpus, test_labels = parse_dataset_task_b(test_file)

In [0]:
# Fold the validation examples into the training data so the final model is
# fitted on everything that has labels.
# NOTE(review): `+=` extends the existing objects in place (assuming both are
# lists - confirm against parse_dataset_task_b). Re-running this cell without
# first re-running the cell that loads `corpus` / `labels` appends the
# validation examples a second time.
corpus += test_corpus
labels += test_labels

In [0]:
# All labelled data is used for training, so nothing is held out for validation.
# train_test_split(..., test_size=0) is rejected with a ValueError by current
# scikit-learn releases, so the data is shuffled directly and the validation
# lists are created empty to keep the same four names available downstream.
from sklearn.utils import shuffle

train_corpus, train_labels = shuffle(corpus, labels, random_state=42)
valid_corpus, valid_labels = [], []

# train_corpus, train_labels = augment_untargeted(train_corpus, train_labels, 1600)
# valid_corpus, valid_labels = augment_untargeted(valid_corpus, valid_labels, 900)

# Show only a short preview instead of dumping the whole corpus into the output.
print(train_corpus[:5])
print(train_labels[:5])

print(len(train_corpus))
print(len(train_labels))

In [0]:
# Select the tokenizer applied to the training corpus in the next cell.
# tokenize_stemming is defined outside this notebook section (presumably in
# utilities_cw.py); any text fed to the trained model should go through the
# same function so its tokens match the vocabulary built from this one.
tokenize_f = tokenize_stemming

In [0]:
# Tokenize the training corpus with the tokenizer chosen above; the result is
# iterated sentence by sentence in later cells (vocabulary, lengths, tensors).
tokenized_train_corpus = tokenize_f(train_corpus)
# Number of tokenized entries, for comparison with the corpus size printed earlier.
print(len(tokenized_train_corpus))

In [0]:
# Build the vocabulary from the tokenized training corpus
# (get_vocabulary is defined outside this notebook section).
vocabulary = get_vocabulary(tokenized_train_corpus)

# Print a bounded preview rather than the entire vocabulary, which would flood
# the cell output; the full size is reported on the next line.
print(list(vocabulary)[:20])
print(len(vocabulary))

In [0]:
# Token -> integer index lookup, built by a helper defined outside this section.
# Later cells use it to index sentences (`word2idx[token]`) and use its length
# as the model's INPUT_DIM.
word2idx = get_word2idx(tokenized_train_corpus, vocabulary)
# Number of entries in the lookup.
print(len(word2idx))

In [0]:
# Companion lookup built from the vocabulary by a helper defined outside this
# section (presumably index -> token, the inverse of word2idx - TODO confirm).
# It is not read by any later cell visible in this notebook section.
idx2word = get_idx2word(vocabulary)
# Number of entries in the lookup.
print(len(idx2word))

In [0]:
# Token count of every training sentence; the largest one is the padded width
# handed to parse_input.
sentences_lengths = list(map(len, tokenized_train_corpus))
max_len = np.array(sentences_lengths).max()

# parse_input (defined outside this section) turns the tokenized sentences and
# their labels into the two tensors the model is trained on.
train_sentences_tensor, train_labels_tensor = parse_input(
    tokenized_train_corpus,
    word2idx,
    train_labels,
    max_len,
)

# Contents first ...
print(train_sentences_tensor, train_labels_tensor, sep='\n')

# ... then the shapes.
print(train_sentences_tensor.shape, train_labels_tensor.shape, sep='\n')

In [0]:
def parse_in(tokenized_corpus, word2idx, labels, max_len):
    """Convert tokenized sentences and labels into padded tensors.

    Args:
        tokenized_corpus: iterable of sentences, each an iterable of tokens.
        word2idx: mapping from token to integer index. Tokens that are not
            keys of this mapping are dropped.
        labels: sequence of numeric labels, converted with torch.FloatTensor.
        max_len: width of the returned sentence tensor. Sentences with more
            known tokens than this are truncated to their first ``max_len``
            indices; previously such a sentence raised a shape-mismatch error,
            which happens whenever unseen data contains a sentence longer than
            the longest training sentence.

    Returns:
        (sentences_tensor, labels_tensor): a LongTensor of shape
        (number of sentences, max_len), right-padded with zeros, and a
        FloatTensor built from ``labels``.
    """
    # Accept numpy integers (e.g. the result of np.max) as well as plain ints.
    width = int(max_len)

    # Index every sentence, skipping out-of-vocabulary tokens, then clip to width.
    indexed_sentences = [
        [word2idx[token] for token in sentence if token in word2idx][:width]
        for sentence in tokenized_corpus
    ]

    # Zero-filled tensor of fixed size; zero acts as the padding value.
    sentences_tensor = torch.zeros((len(indexed_sentences), width), dtype=torch.long)

    # Copy each indexed sentence into the left part of its row.
    for row, indices in enumerate(indexed_sentences):
        if indices:
            sentences_tensor[row, :len(indices)] = torch.tensor(indices, dtype=torch.long)

    labels_tensor = torch.FloatTensor(labels)

    return sentences_tensor, labels_tensor

In [0]:
def train_model_custom(model, optimizer, loss_fn, feature_train, target_train, n_epochs=None):
    """Train ``model`` on the full training tensors, one update per epoch.

    Every epoch runs a single forward/backward pass over the whole of
    ``feature_train`` (no mini-batching) and prints the loss and accuracy of
    that pass. The reported accuracy is computed from the predictions made in
    training mode, i.e. the same forward pass that produced the loss.

    Args:
        model: module called as ``model(feature_train)``; its output is
            squeezed on dimension 1 before the loss is computed.
        optimizer: optimizer whose ``zero_grad``/``step`` drive the update.
        loss_fn: callable ``loss_fn(predictions, target_train)`` returning a
            tensor that supports ``backward()`` and ``item()``.
        feature_train: input passed to the model.
        target_train: targets passed to the loss and to ``accuracy``.
        n_epochs: number of epochs to run. When omitted, the notebook-level
            ``epochs`` variable is used, which keeps existing calls working.

    Returns:
        None. Progress is reported through ``print``.
    """
    if n_epochs is None:
        # Fall back to the notebook-wide setting used by existing callers.
        n_epochs = epochs

    for epoch in range(n_epochs):
        model.train()
        optimizer.zero_grad()
        predictions = model(feature_train).squeeze(1)
        loss = loss_fn(predictions, target_train)
        # `accuracy` is a helper defined outside this notebook section.
        acc = accuracy(predictions, target_train)
        loss.backward()
        optimizer.step()

        epoch_loss = loss.item()
        epoch_acc = acc

        print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.3f} | Train Acc: {epoch_acc*100:.2f}% | ')


In [0]:
# Number of training epochs (read by train_model_custom)
epochs = 100

# General model / optimiser settings
INPUT_DIM = len(word2idx)
EMBEDDING_DIM = 100
OUTPUT_DIM = 1
LEARNING_RATE = 0.0008

# Hyperparameters specific to the CNN
N_OUT_CHANNELS = 100  # number of filters
WINDOW_SIZE = 2       # window size
DROPOUT = 0.5         # dropout applied with probability 0.5

model = CNN(
    INPUT_DIM,
    EMBEDDING_DIM,
    N_OUT_CHANNELS,
    WINDOW_SIZE,
    OUTPUT_DIM,
    DROPOUT,
)

# The model emits raw logits, so the loss applies the sigmoid itself.
loss_fn = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

train_model_custom(
    model,
    optimizer,
    loss_fn,
    train_sentences_tensor,
    train_labels_tensor,
)

In [0]:
# Rows of the unlabelled test set; each row is indexed by column name below.
testing = read_csv('testset-taskb.tsv')

tweets = [row['tweet'] for row in testing]

# Tokenize with the same function used for the training corpus (tokenize_f).
# word2idx was built from those tokens, so a different tokenizer here would
# produce tokens that do not line up with the vocabulary.
tokenized_corpus = tokenize_f(tweets)

# The test set has no labels. parse_input still expects a label sequence, so it
# gets size-matched placeholders instead of the validation labels, which belong
# to a different data set; the resulting labels_tensor carries no information.
placeholder_labels = [0] * len(tokenized_corpus)
test_tensor, labels_tensor = parse_input(tokenized_corpus, word2idx, placeholder_labels, max_len)

In [0]:
# The training loop leaves the model in training mode. Switch to evaluation
# mode so that dropout (the CNN is built with DROPOUT = 0.5; assuming it uses
# standard dropout layers) does not randomly perturb the predictions, and skip
# gradient tracking since no backward pass follows.
model.eval()
with torch.no_grad():
    predictions_test = model(test_tensor).squeeze(1)

# Logits -> probabilities -> hard 0/1 decisions.
output = torch.round(torch.sigmoid(predictions_test))

print(output)

print(len(output))

In [0]:
# Pair every test-set id with its predicted class name: 0 -> 'UNT', otherwise 'TIN'.
ids = [row['id'] for row in testing]
out = ['UNT' if e == 0 else 'TIN' for e in output.detach().numpy()]
zipped = list(zip(ids, out))

# newline='' lets the csv module control line endings itself; without it the
# writer's own '\r\n' terminators can be translated again by the file object,
# which shows up as blank lines between rows on some platforms.
with open('predictions_task_b_full_training_stemming_window_2.csv', "w", newline='') as f:
    writer = csv.writer(f, dialect='excel')
    writer.writerows(zipped)