<a href="https://colab.research.google.com/github/adesam146/nlpcw/blob/sam_preprocessing/NLP_CW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install allennlp



In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import sampler

import torchvision.datasets as dset

import numpy as np
import pandas as pd

import torchvision.transforms as T

from allennlp.modules.elmo import Elmo, batch_to_ids


In [4]:
#Use pretrained ELMO weights. 
options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

elmo = Elmo(options_file, weight_file, 2, dropout=0)

02/19/2019 10:13:18 - INFO - allennlp.modules.elmo -   Initializing ELMo


In [5]:
#Elmo test
sentences = [['First', 'sentence', '.'], ['Another', '.'], 
             ["Oh", "here", "we", "Go", "now", "you", "fool", "."], 
             ["meaninglesswordnotinvocab"]]
             
character_ids = batch_to_ids(sentences)

# The shape would be (#sentences, length_of_longest_sentence, len of longest word(in original corpus?)=50)
# Note if a word has n characters the first n+2 entries in its size 50 embedding would be used to
# represent it. '+2' because it seems like they implicitly add a start and end of word
# 'character'
print(character_ids.shape)

embeddings = elmo(character_ids)
print(embeddings.keys())
embed = embeddings["elmo_representations"]

# The length of embed is given by the num_output_representations we specfied
# when creating the ELMO class, I believe this are just the number of different
# weighted combination of the 3 layers in the "ELMO network" so could theortically
# be any integer but is usually 1 or 2
print(len(embed))
print(embed[0].shape)
print(embed[1].shape)


torch.Size([4, 8, 50])
dict_keys(['elmo_representations', 'mask'])
2
torch.Size([4, 8, 1024])
torch.Size([4, 8, 1024])


In [None]:
# ELMO takes a list of parsed sentences as an input
# It generates an embedding of length 1024 per word
# We then need to find a good method of combining the word vecs to create 
# a sentence embedding (this article is good: https://medium.com/huggingface/universal-word-sentence-embeddings-ce48ddc8fc3a). 


In [None]:
from google.colab import files

#File upload instructions:
  #1. Run this cell
  #2. Press "Choose Files" at the bottom of this cell
  #3. Select offenseval-training-v1.tsv locally to load it as the variable noisy_values  
  
file1 = files.upload()


In [6]:
train = pd.read_csv("OffensEval_task_data/start-kit/training-v1/offenseval-training-v1.tsv", delimiter="\t")
print(train.shape)
print(train.head())

(13240, 5)
      id                                              tweet subtask_a  \
0  86426  @USER She should ask a few native Americans wh...       OFF   
1  90194  @USER @USER Go home you’re drunk!!! @USER #MAG...       OFF   
2  16820  Amazon is investigating Chinese employees who ...       NOT   
3  62688  @USER Someone should'veTaken" this piece of sh...       OFF   
4  43605  @USER @USER Obama wanted liberals &amp; illega...       NOT   

  subtask_b subtask_c  
0       UNT       NaN  
1       TIN       IND  
2       NaN       NaN  
3       UNT       NaN  
4       NaN       NaN  


In [7]:
train.count()

id           13240
tweet        13240
subtask_a    13240
subtask_b     4400
subtask_c     3876
dtype: int64

In [8]:
total = train['id'].count().item()
off_count = train[train['subtask_a'] == "OFF"]['id'].count()

print("Number of offensive", off_count)
print("Number of inoffensive", total - off_count)

Number of offensive 4400
Number of inoffensive 8840


**The above shows that the training dataset is not very balanced (in offensive is about twice as much). How could this be addressed. Get more data? Augment offensive comments by adding neutral words to create more data or concat offensive and inoffensive comments to make new offensive comments?**

In [9]:
training_percent = 0.8
training_size = int(training_percent * total)
validation_size = total - training_size

corpus = train['tweet'].to_numpy()
labels = train['subtask_a']
labels[labels == 'OFF'] = 1.
labels[labels == 'NOT'] = 0.
labels = labels.to_numpy(dtype=np.double).reshape(-1, 1)

# train_dataset, validation_dataset = torch.utils.data.random_split(TensorDataset(corpus, labels), [training_size, validation_size])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [10]:
indices = np.random.shuffle(list(range(total)))

training_sents = corpus[:training_size]
training_labels = labels[:training_size]

validation_sents = corpus[training_size:]
validation_labels = labels[training_size:]

In [11]:
import re

def get_tokenised_corpus(corpus):
    """
    This assumes the corpus can be iterated through and
    retains the order in which the sentences appeared in the corpus
    """
    tokenized_corpus = [] # Let us put the tokenized corpus in a list
    for sentence in corpus:
        tokenized_sentence = []
        for token in re.split(r'\s', sentence.lower()): # simplest split is 
            if token:
              # To avoid the empty string
              tokenized_sentence.append(token)
        tokenized_corpus.append(tokenized_sentence)
    
    return tokenized_corpus

In [12]:
training_ids = batch_to_ids(get_tokenised_corpus(training_sents))
validation_ids = batch_to_ids(get_tokenised_corpus(validation_sents))

print(training_ids[:2])

tensor([[[259,  65, 118,  ..., 261, 261, 261],
         [259, 116, 105,  ..., 261, 261, 261],
         [259, 116, 105,  ..., 261, 261, 261],
         ...,
         [  0,   0,   0,  ...,   0,   0,   0],
         [  0,   0,   0,  ...,   0,   0,   0],
         [  0,   0,   0,  ...,   0,   0,   0]],

        [[259,  65, 118,  ..., 261, 261, 261],
         [259,  65, 118,  ..., 261, 261, 261],
         [259, 104, 112,  ..., 261, 261, 261],
         ...,
         [  0,   0,   0,  ...,   0,   0,   0],
         [  0,   0,   0,  ...,   0,   0,   0],
         [  0,   0,   0,  ...,   0,   0,   0]]])


In [13]:
class SimpleClassifier(nn.Module):
    def __init__(self, out_channels, window_size, dropout):
        super(SimpleClassifier, self).__init__()
        self.embeddings = Elmo(options_file, weight_file, 1, dropout=0)
        embedding_dim = 1024
        
        self.conv = nn.Conv2d(in_channels=1, out_channels=out_channels, kernel_size=(window_size, embedding_dim))
        
        self.dropout = nn.Dropout(dropout)
        
        self.fc = nn.Linear(out_channels, 1)
        
    def forward(self, x):
        # x is the output from batch_to_ids
        
        
        # Only looking at one (the first) layer from elmo for now 
        # which is my I am indexing at 0
        embedded = self.embeddings(x)['elmo_representations'][0]
        
        #images have 3 RGB channels 
        #for the text we add 1 channel
        embedded = embedded.unsqueeze(1)
        
        #(batch size, 1, max sent length, embedding dim)
        
        feature_maps = self.conv(embedded)
        # (batch size, out_channels, max sent length - window size +1, 1)
        
        feature_maps = feature_maps.squeeze(3)
        feature_maps = F.relu(feature_maps)
        
        #the max pooling layer
        pooled = F.max_pool1d(feature_maps, feature_maps.shape[2])
        pooled = pooled.squeeze(2)
        # (batch size, out_channels)
        
        dropped = self.dropout(pooled)
 
        return self.fc(dropped)

In [21]:
data_loader = DataLoader(TensorDataset(training_ids, torch.from_numpy(training_labels)), batch_size=1, shuffle=True)

In [15]:
 def accuracy(output, target):
    with torch.no_grad():
        prob_output = torch.sigmoid(output)

        prob_output[prob_output > 0.5] = 1.
        prob_output[prob_output <= 0.5] = 0.

        acc = (prob_output == target).sum(dtype=torch.float) / output.shape[0]
 
    return acc

In [18]:
NUM_EPOCHS = 10
model = SimpleClassifier(out_channels=100, window_size=3, dropout=0.5)
optimizer = optim.SGD(model.parameters(), lr=0.01)
loss_fn = nn.BCEWithLogitsLoss()

02/19/2019 10:21:45 - INFO - allennlp.modules.elmo -   Initializing ELMo


In [None]:
for epoch in range(NUM_EPOCHS):
    losses = []
    accs = []
    for sent_ids, target in data_loader:
        print("Got here 1")
        model.train()
        optimizer.zero_grad()
        
        logits = model(sent_ids)
        logits = logits.type(torch.DoubleTensor)
        print("Got here 2")

        target = target.type(torch.DoubleTensor)
        loss = loss_fn(logits, target)
        
        print("Got here 3")
        
        loss.backward()
        optimizer.step()
        
#         model.eval()
#         with torch.no_grad():
#             logits = model(validation_ids)
#             losses.append(lost_fn(logits, validation_labels).item())
#             accs.append(accuracy(logits, target))
            
    print(f'| Epoch: {epoch:02} | Val. Loss: {np.mean(losses):.3f} | Val. Acc: {np.mean(accs)*100:.2f}% |')
        
        
    

Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1
Got here 2
Got here 3
Got here 1