<a href="https://colab.research.google.com/github/antsh3k/NN-learning/blob/master/5_Agressive_Tweets_Classification_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



---


BEfore doing anything **Switch to GPU**


---




# Prepare your environment 

In [0]:
# IMPORTS (try to organize/group your imports)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import re
import json
import spacy
from os import path

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

import sklearn.metrics
from sklearn.metrics import f1_score, accuracy_score, precision_score, \
                            recall_score

In [0]:
# Any global variables
SEED = 15
DATA_PATH = '/tmp/'
MAX_SEQ_LEN = 40
nlp = spacy.load('en')
DEVICE = 'cuda'

In [0]:
# Set SEEDs
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Examine and Prepare the Data


## In deep learning it is not very often that we load the whole dataset into memory, especially the text portion of datasets.

In [0]:
# Download the data and save into raw_data.txt
import requests
url = "https://raw.githubusercontent.com/w-is-h/DeepLearningNLP-Medical/master/Session_5/data/tweets.json"
response = requests.get(url, stream=True)

# Save the dataset into a file
f_raw_data = open(path.join(DATA_PATH, 'raw_data.txt'), 'wb')
f_raw_data.write(response.content)
f_raw_data.close()

In [0]:
# To checkout files we now use bash commands
!head /content/raw_data.txt

head: cannot open '/content/raw_data.txt' for reading: No such file or directory


In [0]:
f_x_raw = open(path.join(DATA_PATH, 'x_raw.txt'), 'w')
# We load labels because they are small
y = []


for line in open(path.join(DATA_PATH, 'raw_data.txt'), 'r'):
  # Each line is in fact a json document
  doc = json.loads(line) 

  # : Write text to the file and append labels to 'y',
  #each row must contain the text of one tweet
  f_x_raw.write("{}\n".format(doc['content']))
  y.append(int(doc['annotation']['label'][0]))

# Close the file
f_x_raw.close()

# This is a typical way to add sanity checks to your code, can be very helpful.
assert type(y[0]) == int

### Before cleaning we should analyse the dataset and understand what to remove or keep, but I've already done that so we skip it.

In [0]:
# : Cleaning
# Every time a character (excluding numbers) is repeated more than 2 times, 
# reduce to 2 - e.g. "0000 yesssssssss!!!!!!" -> "0000 yess!!"
def clean_text(text):
  clean_text = re.sub(r'([^0-9]{1})\1{2,}', r'\1\1', text)
  return clean_text

In [0]:
# Test the clean_text function
test_text = "0000 yesssssss!!!!!!"
test_out = clean_text(test_text)
print(test_out)

real_out = "0000 yess!!"
assert real_out == test_out

0000 yess!!


# Download Word Embeddings

It is very rare to train your own embeddings, if your domain is not exteremly specific. Usually we use pretrained embeddings.

In this case we are going to use embeddings from GloVe: Global Vectors for Word Representation. They have pretrained vectors for twitter datasets. 


**NOTE:**

The downside of doing this is that we can't continue the trainig of the vectors unless they are in the gensim word2vec format. 




---


More info at: https://nlp.stanford.edu/projects/glove/

In [0]:
# JUPYTER/COLAB ONLY!!!
# Download the data
!wget http://nlp.stanford.edu/data/glove.twitter.27B.zip
!unzip glove*.zip

--2019-10-08 13:49:05--  http://nlp.stanford.edu/data/glove.twitter.27B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.twitter.27B.zip [following]
--2019-10-08 13:49:05--  https://nlp.stanford.edu/data/glove.twitter.27B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip [following]
--2019-10-08 13:49:05--  http://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1520408563 (1.4G) [appli

In [0]:
# Load the vectors
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Convert glove file to word2vec format
glove_file = datapath('/content/glove.twitter.27B.200d.txt')
tmp_file = get_tmpfile("tmp_word2vec.txt")
_ = glove2word2vec(glove_file, tmp_file)

# Load the newly generated file
model = KeyedVectors.load_word2vec_format(tmp_file)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# Sanity - Check similarity 
model.most_similar("house")

  if np.issubdtype(vec.dtype, np.int):


[('room', 0.799299418926239),
 ('home', 0.7727779746055603),
 ('apartment', 0.7143650054931641),
 ('party', 0.7122235894203186),
 ('out', 0.6893113255500793),
 ('my', 0.683701753616333),
 ('dad', 0.680922269821167),
 ("'s", 0.6800175309181213),
 ('going', 0.6792358756065369),
 ('up', 0.6730260848999023)]

In [0]:
embeddings = [] # A list of embeddings for each word in the word2vec vocab

# Embeddings is a list, meaning we know that embeddings[1] is a vector for the 
#word with ID=1, but we don't know what word is that. That is why we need 
#the id2word and word2id mappings.
id2word = {}
word2id = {}

# Loop over all words in the vocabulary and add the values
for word in model.vocab.keys():
  id2word[len(embeddings)] = word
  word2id[word] = len(embeddings)
  embeddings.append(model[word])

# Add <UNK> and <PAD>
word = "<UNK>"
id2word[len(embeddings)] = word
word2id[word] = len(embeddings)
embeddings.append(np.random.rand(len(embeddings[0])))
word = "<PAD>"
id2word[len(embeddings)] = word
word2id[word] = len(embeddings)
embeddings.append(np.zeros(len(embeddings[0])))

# : Convert the embeddings list into a numpy array
#embeddings = 0#?

# Convert the embeddings list into a tensor
embeddings = torch.tensor(embeddings, dtype=torch.float32)

# Sanity
assert len(embeddings) == len(id2word) == len(word2id)
assert model['house'][0] == embeddings[word2id['house']][0]

# Convert words to integers

Usually we don't want to keep our input in the string format, it is very time/memory costly to load text all the time. We want to convert text into integers. That is why we have our mapping `word2id`

In [0]:
x_ind = []
for text in open(path.join(DATA_PATH, 'x_raw.txt')):
  # : clean text
  text = clean_text(text.strip())
  # Covnert text to lowercased tokens, skip punct and white-space
  tkns = [tkn.lower_ for tkn in nlp.tokenizer(text) if not tkn.is_punct and
          len(tkn.lower_.strip()) > 0]
  # Convert each token into its id
  ind_tkns = [word2id.get(tkn, word2id.get("<UNK>")) for tkn in tkns]
  # Append to x_ind
  x_ind.append(ind_tkns)


In [0]:
print(x_ind[1])

[147, 32, 124, 2567, 124, 109, 243, 26, 45, 80773, 1193514, 13, 25700, 70, 55, 408, 22480, 33, 41, 11, 1697, 183, 15927, 273, 63]


In [0]:
# : convert the indexes for x_ind[1] back to words
" ".join([id2word[i] for i in x_ind[1]])

"she is as dirty as they come and that crook <UNK> the dems are so fucking corrupt it 's a joke make republicans look like"

# Analyse the dataset

In [0]:
#calculate all the statistics
tweet_lengths = [len(tweet) for tweet in x_ind]
pos = np.sum(y) # Number of positive examples
neg = len(y) - pos # Number of negative examples 
avg = np.average(tweet_lengths) # Average tweet length
md = np.median(tweet_lengths) # Median tweet length
mx = np.max(tweet_lengths) # Maximum tweet length
mi = np.min(tweet_lengths) # Minimum tweet length

print("Number of positive examples: {}".format(neg))
print("Number of negative examples: {}".format(pos))
print("Average length of the tweets: {}".format(avg))
print("Median length of the tweets: {}".format(md))
print("Max length of the tweets: {}".format(mx))
print("Min length of the tweets: {}".format(mi))

Number of positive examples: 12179
Number of negative examples: 7822
Average length of the tweets: 13.369781510924454
Median length of the tweets: 12.0
Max length of the tweets: 363
Min length of the tweets: 0


In [0]:
# TODO: Calculate the tweet lengths, if length > 40 set to 40, if length == 0 set to 1
prim_tweet_lens = [] # Append tweet lengths here
for tweet in x_ind:
  if len(tweet) > 40:
    prim_tweet_lens.append(40)
  elif len(tweet) == 0:
    prim_tweet_lens.append(1)
  else:
    prim_tweet_lens.append(len(tweet))
    

# Prepare the dataset

In [0]:
# Pad everything to the same length, or remove the extra
x_ind_pad = []
for i in range(len(x_ind)):
  tweet = x_ind[i]
  tweet = tweet[0:MAX_SEQ_LEN]
  if len(tweet) < MAX_SEQ_LEN:
    tweet.extend([word2id['<PAD>']] * (MAX_SEQ_LEN - len(tweet)))
  x_ind_pad.append(tweet)

# Print the stats again
tweet_lengths = [len(tweet) for tweet in x_ind_pad]
pos = np.sum(y) # Number of positive examples
neg = len(y) - pos # Number of negative examples 
avg = np.average(tweet_lengths) # Average tweet length
md = np.median(tweet_lengths) # Median tweet length
mx = np.max(tweet_lengths) # Maximum tweet length
mi = np.min(tweet_lengths) # Minimum tweet length

print("Number of positive examples: {}".format(neg))
print("Number of negative examples: {}".format(pos))
print("Average length of the tweets: {}".format(avg))
print("Median length of the tweets: {}".format(md))
print("Max length of the tweets: {}".format(mx))
print("Min length of the tweets: {}".format(mi))

Number of positive examples: 12179
Number of negative examples: 7822
Average length of the tweets: 40.0
Median length of the tweets: 40.0
Max length of the tweets: 40
Min length of the tweets: 40


In [0]:
# Split into train/test and move to pytorch 
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test, l_train, l_test = train_test_split(x_ind_pad, y, prim_tweet_lens, test_size=0.2, random_state=SEED)

x_train = torch.tensor(x_train, dtype=torch.long)
y_train = torch.tensor(y_train, dtype=torch.long)
l_train = torch.tensor(l_train, dtype=torch.long)

x_test = torch.tensor(x_test, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)
l_test = torch.tensor(l_test, dtype=torch.long)

# Build the network

In [0]:
class RNN(nn.Module):
  def __init__(self, embeddings, padding_idx):
    super(RNN, self).__init__()
    # Get the required sizes
    vocab_size = len(embeddings)
    embedding_size = len(embeddings[0])
    
    # Initialize embeddings
    self.embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_idx)
    self.embeddings.load_state_dict({'weight': embeddings})
    # Disable training for the embeddings - IMPORTANT
    self.embeddings.weight.requires_grad = False

    hidden_size = 300 # the weight dimension

    # Create the RNN cell
    #self.rnn = nn.LSTM(input_size=200, hidden_size=hidden_size, num_layers=1, dropout=0.5) to make an LSTM
    #self.rnn = nn.GRU(input_size=200, hidden_size=hidden_size, num_layers=1, dropout=0.5) to make a GRU
    self.rnn = nn.RNN(input_size=200, hidden_size=hidden_size, num_layers=1, dropout=0.5) 
    self.fc1 = nn.Linear(hidden_size, 2)

  def forward(self, x, lns):
    # Embed the input: from id -> vec
    x = self.embeddings(x) # x.shape = batch_size x sequence_length x emb_size
    
    # Tell RNN to ignore padding and set the batch_first to True
    x = torch.nn.utils.rnn.pack_padded_sequence(x, lns, batch_first=True, enforce_sorted=False) 

    # TODO: run 'x' through the RNN
    x, hidden = self.rnn(x) #?

    # Add the padding again
    x, hidden = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
    
    # For each example in batch select the value at the length of that sentence
    row_indices = torch.arange(0, x.size(0)).long()
    x = x[row_indices, lns-1, :]

    # TODO: Push x through the fc network
    x = self.fc1(x)
    return x

In [0]:
# TODO:
device =  torch.device(DEVICE) # Create a torch device
net = RNN(embeddings, padding_idx=word2id['<PAD>']) # Create an instance of the RNN, take care what input parameters does it require
criterion = nn.CrossEntropyLoss() # Set the criterion to Cross Entropy Loss
parameters = filter(lambda p: p.requires_grad, net.parameters()) # Get only the parameters that require training
optimizer = optim.Adam(parameters, lr=0.001)  # Set the optimizer to Adam with lr = 0.001
net.to(device) # Move the network to device

  "num_layers={}".format(dropout, num_layers))


RNN(
  (embeddings): Embedding(1193516, 200, padding_idx=1193515)
  (rnn): RNN(200, 300, dropout=0.5)
  (fc1): Linear(in_features=300, out_features=2, bias=True)
)

# Train

In [0]:
device = torch.device(DEVICE)
# Move data to the right device only test, train is in batches
x_test = x_test.to(device)
y_test = y_test.to(device)
l_test = l_test.to(device)

losses = []
accs = []
accs_dev = []

batch_size = 1000

# : calculate the number of batches given training size len(x_train)
num_batches = int(np.ceil(len(x_train) / batch_size))
for epoch in range(80):
  # : Switch network to train mode
  net.train()

  # Create the running loss array
  running_loss = []
  for i in range(num_batches):
    start = i * batch_size
    end = (i+1) * batch_size
    
    # : Get the batch
    x_train_batch = x_train[start:end] #?
    y_train_batch = y_train[start:end] #?  
    l_train_batch = l_train[start:end] #? 

    # : Move the batches to the right device
    x_train_batch = x_train_batch.to(device) #?
    y_train_batch = y_train_batch.to(device) #?
    l_train_batch = l_train_batch.to(device) #?

    # zero gradients
    optimizer.zero_grad()
    # Get outputs for our batch
    outputs = net(x_train_batch, l_train_batch)
    # Get loss
    loss = criterion(outputs, y_train_batch)
    # Do the backward step
    loss.backward()
    # Do the optimizer step
    optimizer.step()

    # Add the loss to the running_loss
    running_loss.append(loss.item())

  if epoch % 5 == 0:
      net.eval()
      outputs = net(x_train_batch, l_train_batch)
      acc = sklearn.metrics.accuracy_score([1 if x > 0.5 else 0 for x in torch.max(outputs, 1)[1].cpu().detach().numpy()], y_train_batch.cpu().numpy())
      outputs_dev = net(x_test, l_test)
      acc_dev = sklearn.metrics.accuracy_score(torch.max(outputs_dev, 1)[1].cpu().detach().numpy(), y_test.cpu().numpy())
      f1_dev = f1_score(y_test.cpu().numpy(), torch.max(outputs_dev, 1)[1].cpu().detach().numpy())
      p_dev = precision_score(y_test.cpu().numpy(), torch.max(outputs_dev, 1)[1].cpu().detach().numpy())
      r_dev = recall_score(y_test.cpu().numpy(), torch.max(outputs_dev, 1)[1].cpu().detach().numpy())
      
      print("Epoch: {:4} Loss: {:.5f} Acc: {:.3f} Acc Dev: {:.3f} F1 Dev: {:.3f} p Dev: {:.3f} r Dev: {:.3f}".format(epoch, np.average(running_loss), acc, acc_dev, f1_dev, p_dev, r_dev))
      
print('Finished Training')

Epoch:    0 Loss: 0.65219 Acc: 0.657 Acc Dev: 0.667 F1 Dev: 0.569 p Dev: 0.566 r Dev: 0.572
Epoch:    5 Loss: 0.49421 Acc: 0.764 Acc Dev: 0.726 F1 Dev: 0.649 p Dev: 0.639 r Dev: 0.660
Epoch:   10 Loss: 0.40767 Acc: 0.826 Acc Dev: 0.731 F1 Dev: 0.640 p Dev: 0.659 r Dev: 0.622
Epoch:   15 Loss: 0.35099 Acc: 0.858 Acc Dev: 0.756 F1 Dev: 0.707 p Dev: 0.656 r Dev: 0.767
Epoch:   20 Loss: 0.29109 Acc: 0.895 Acc Dev: 0.789 F1 Dev: 0.732 p Dev: 0.715 r Dev: 0.750
Epoch:   25 Loss: 0.19174 Acc: 0.929 Acc Dev: 0.796 F1 Dev: 0.758 p Dev: 0.696 r Dev: 0.832
Epoch:   30 Loss: 0.14220 Acc: 0.953 Acc Dev: 0.838 F1 Dev: 0.808 p Dev: 0.742 r Dev: 0.887
Epoch:   35 Loss: 0.08511 Acc: 0.971 Acc Dev: 0.842 F1 Dev: 0.818 p Dev: 0.735 r Dev: 0.921
Epoch:   40 Loss: 0.07452 Acc: 0.976 Acc Dev: 0.865 F1 Dev: 0.836 p Dev: 0.785 r Dev: 0.895
Epoch:   45 Loss: 0.03410 Acc: 0.988 Acc Dev: 0.884 F1 Dev: 0.861 p Dev: 0.794 r Dev: 0.941
Epoch:   50 Loss: 0.02309 Acc: 0.992 Acc Dev: 0.880 F1 Dev: 0.857 p Dev: 0.788 r

In [0]:
tweet = "This NN class is Class is fabbb" # Write a tweet
print(tweet)

This NN class is Class is fabbb


In [0]:
# Tokenize using the same procedure as used for the dataset

tweet = clean_text(tweet.strip())
# Covnert text to lowercased tokens, skip punct and white-space
tkns = [tkn.lower_ for tkn in nlp.tokenizer(tweet) if not tkn.is_punct and
          len(tkn.lower_.strip()) > 0] 
print(tkns)

['this', 'nn', 'class', 'is', 'class', 'is', 'fabb']


In [0]:
# Convert tokens to indices 
# Convert tokens to indices - watch out for unk words
inds = [word2id.get(tkn, word2id.get("<UNK>")) for tkn in tkns]
print(inds)

[53, 222, 726, 32, 726, 32, 368814]


In [0]:
# Add padding to match len(inds) == 40

inds.extend([word2id['<PAD>']] * (MAX_SEQ_LEN - len(inds)))
print(inds)

[53, 222, 726, 32, 726, 32, 368814, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515]


In [0]:
# Move to torch
inds = torch.tensor([inds]).to(device)
print(inds)

tensor([[     53,     222,     726,      32,     726,      32,  368814, 1193515,
         1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515,
         1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515,
         1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515,
         1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515, 1193515]],
       device='cuda:0')


In [0]:
# Predict
net.eval() # Switch network to eval mode
torch.softmax(net(inds, torch.tensor([5])), dim=1)

tensor([[0.9497, 0.0503]], device='cuda:0', grad_fn=<SoftmaxBackward>)

In [0]:
# TODO: Get all predictions for x_test and apply softmax
out = torch.softmax(net(x_test, l_test), dim=1)
#?
print(out)

tensor([[1.1310e-06, 1.0000e+00],
        [3.4313e-04, 9.9966e-01],
        [9.9994e-01, 5.5441e-05],
        ...,
        [9.9758e-01, 2.4162e-03],
        [1.0032e-04, 9.9990e-01],
        [1.1559e-04, 9.9988e-01]], device='cuda:0', grad_fn=<SoftmaxBackward>)


In [0]:
# Find a couple of examples where the Net is sure it is correct
out = out.detach().cpu().numpy()
#
for i in range(200):
  pred = np.argmax(out[i])
  if pred != y_test[i] and out[i][pred] > 0.9:
    print(i)

0
22
24
33
39
40
50
62
64
69
78
82
115
138
151
155
162
184
189
192
198


In [0]:
# Print one example
ind = 189
print(y_test[ind])
print(out[ind])
print(" ".join([id2word[i] for i in x_test[ind].cpu().detach().numpy() if id2word[i] != '<PAD>']))

tensor(0, device='cuda:0')
[1.3419524e-04 9.9986577e-01]
ahahaahahahaha yourr a funny kid
