In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)
[K     |████████████████████████████████| 827.9 MB 877 kB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-py3-none-any.whl size=829180942 sha256=9cfe637049e29d4f8e1a413ec6e245fe9bf1552fb43a596b3cb2ede4c1cbd74f
  Stored in directory: /tmp/pip-ephem-wheel-cache-z9bhjjkt/wheels/11/95/ba/2c36cc368c0bd339b44a791c2c1881a1fb714b78c29a4cb8f5
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [3]:
from torchtext.legacy.data import Field, LabelField
from torchtext.legacy.data import TabularDataset
from torchtext.legacy.data import Iterator, BucketIterator

In [4]:
import torch
import torchtext
# from torchtext.data import Field, LabelField # For torch<=0.8.0, the importing of functions should be `from torchtext.data`
# from torchtext.data import TabularDataset
# from torchtext.data import Iterator, BucketIterator
import spacy
import en_core_web_lg
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
from tqdm import tqdm, trange
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report, confusion_matrix

manual_seed = 77
torch.manual_seed(manual_seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
n_gpu = torch.cuda.device_count()
if n_gpu > 0:
    torch.cuda.manual_seed(manual_seed)

cpu


In [5]:
spacy_en = en_core_web_lg.load()

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings (tokens)
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]


In [6]:
TEXT = Field(sequential=True, tokenize=tokenize_en, lower=True)
LABEL = Field(sequential=False, unk_token = None)

train, val = TabularDataset.splits(
               path="./drive/My Drive/CNN/", # the root directory where the data lies
               train='train_sentiment.tsv', validation="val_sentiment.tsv", # file names
               format='tsv',
               skip_header=True, # if your tsv file has a header, make sure to pass this to ensure it doesn't get proceesed as data!
               fields=[('tweet', TEXT), ('label', LABEL)])

In [7]:
test = TabularDataset.splits(
               path="/content/drive/MyDrive/CNN/", # the root directory where the data lies
               test="masks.tsv", # file names
               format='tsv',
               skip_header=True, # if your tsv file has a header, make sure to pass this to ensure it doesn't get proceesed as data!
               fields=[(None, None), ('tweet', TEXT), (None, None), ('label', LABEL)])
test = test[0]

In [8]:
from zipfile import ZipFile
  
# specifying the zip file name
file_name = "./drive/My Drive/CNN/emb.zip"
  
# opening the zip file in READ mode
with ZipFile(file_name, 'r') as zip:
    # printing all the contents of the zip file
    zip.printdir()
  
    # extracting all the files
    print('Extracting all the files now...')
    zip.extractall()
    print('Done!')

File Name                                             Modified             Size
glove.twitter.27B.100d.txt                     2015-12-22 16:04:54   1021671926
glove.twitter.27B.200d.txt                     2015-12-22 16:04:54   2057595650
glove.twitter.27B.25d.txt                      2015-12-22 16:04:54    257699930
glove.twitter.27B.50d.txt                      2015-12-22 16:04:54    510889212
Extracting all the files now...
Done!


In [9]:
import torchtext.vocab as vocab
loaded_vectors = torchtext.vocab.Vectors('/content/drive/MyDrive/CNN/glove.twitter.27B.200d.txt')
TEXT.build_vocab(train, vectors=loaded_vectors, max_size=len(loaded_vectors.stoi))
TEXT.vocab.set_vectors(stoi=loaded_vectors.stoi, vectors=loaded_vectors.vectors, dim=loaded_vectors.dim)
LABEL.build_vocab(train)
print("Vocabulary size of TEXT:",len(TEXT.vocab.stoi))
print("Vocabulary size of LABEL:",len(LABEL.vocab.stoi))

100%|█████████▉| 1193516/1193517 [01:25<00:00, 13887.67it/s]


Vocabulary size of TEXT: 50220
Vocabulary size of LABEL: 3


In [521]:
print(len(test))

90


In [10]:
LABEL.vocab.stoi

defaultdict(None, {'0': 2, '1': 0, '2': 1})

In [11]:
import torch
from torchtext.legacy import data
from torchtext.legacy import datasets
import random
import numpy as np
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train, val, test), 
    batch_size = 32,
    sort_key=lambda x: len(x.tweet), 
    sort=True,
    sort_within_batch=True,
    device = device)

In [12]:
class CNN_BiLSTM(nn.Module):

    def __init__(self, args):
        super(CNN_BiLSTM, self).__init__()
        self.args = args
        self.hidden_dim = args.lstm_hidden_dim
        self.num_layers = args.lstm_num_layers
        V = args.embed_num
        D = args.embed_dim
        C = args.class_num
        self.C = C
        Ci = 1
        Co = args.kernel_num
        Ks = args.kernel_sizes
        self.embed = nn.Embedding(V, D, padding_idx=args.paddingId)
        # pretrained  embedding
        if args.word_Embedding:
            self.embed.weight.data.copy_(args.pretrained_weight)

        # CNN
        self.convs1 = [nn.Conv2d(Ci, Co, (K, D), padding=(K//2, 0), stride=1) for K in Ks]
        print(self.convs1)
        # for cnn cuda
        # if self.args.cuda is True:
        #     for conv in self.convs1:
        #         conv = conv.cuda()

        # BiLSTM
        self.bilstm = nn.LSTM(D, self.hidden_dim, num_layers=self.num_layers, dropout=args.dropout, bidirectional=True, bias=True)

        # linear
        L = len(Ks) * Co + self.hidden_dim * 2
        self.hidden2label1 = nn.Linear(L, L // 2)
        self.hidden2label2 = nn.Linear(L // 2, C)

        # dropout
        self.dropout = nn.Dropout(args.dropout)

    def forward(self, x):

        embed = self.embed(x)

        # CNN
        cnn_x = embed
        # print(cnn_x.shape)
        cnn_x = torch.transpose(cnn_x, 0, 1)
        # print(cnn_x.shape)
        cnn_x = cnn_x.unsqueeze(1)
        cnn_x = [F.relu(conv(cnn_x)).squeeze(3) for conv in self.convs1]  # [(N,Co,W), ...]*len(Ks)
        cnn_x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in cnn_x]  # [(N,Co), ...]*len(Ks)
        cnn_x = torch.cat(cnn_x, 1)
        cnn_x = self.dropout(cnn_x)
        # print(cnn_x.shape)
        # BiLSTM
        # print('blstm')
        bilstm_x = embed.view(len(x), embed.size(1), -1)
        # print(bilstm_x.shape)
        bilstm_out, _ = self.bilstm(bilstm_x)
        # print(bilstm_out.shape)
        bilstm_out = torch.transpose(bilstm_out, 0, 1)
        # print(bilstm_out.shape)
        bilstm_out = torch.transpose(bilstm_out, 1, 2)
        # print(bilstm_out.shape)
        bilstm_out = F.max_pool1d(bilstm_out, bilstm_out.size(2)).squeeze(2)
        # print(bilstm_out.shape)
        # bilstm_out = F.tanh(bilstm_out)

        # CNN and BiLSTM CAT
        cnn_x = torch.transpose(cnn_x, 0, 1)
        bilstm_out = torch.transpose(bilstm_out, 0, 1)
        cnn_bilstm_out = torch.cat((cnn_x, bilstm_out), 0)
        cnn_bilstm_out = torch.transpose(cnn_bilstm_out, 0, 1)

        # linear
        cnn_bilstm_out = self.hidden2label1(F.tanh(cnn_bilstm_out))
        cnn_bilstm_out = self.hidden2label2(F.tanh(cnn_bilstm_out))

        # output
        logit = cnn_bilstm_out
        return logit

In [197]:
x = torch.randint(low=0, high=10, size=(7,32))
print(x.shape)
pred = model(x)

torch.Size([7, 32])




In [196]:
# token_tensor = torch.tensor(token_ids, device=device).unsqueeze(0)
# S(model, token_tensor, 6, 1)

In [13]:
vocabulary_size = len(TEXT.vocab.stoi) 
hidden_dim = 700
embedding_dim = 200 
num_layers = 1
kernel_sizes = [4,5,6] 
kernels_num = 32 
dropout = 0.7
output_size = 3
lr = 0.001        
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
num_epoch = 20 

class CNNArgs:
  def __init__(self, hidden_dim, num_layers, vocabulary_size, embedding_dim, output_size, kernels_num, kernel_sizes, dropout, padIdx) -> None:
      self.lstm_hidden_dim = hidden_dim
      self.lstm_num_layers = num_layers
      self.embed_num = vocabulary_size 
      self.embed_dim = embedding_dim
      self.class_num = output_size
      self.kernel_num = kernels_num
      self.kernel_sizes = kernel_sizes
      self.dropout = dropout
      self.paddingId = padIdx
      self.word_Embedding = None
      self.cuda = True


args_for_cnn = CNNArgs(hidden_dim, num_layers, vocabulary_size, embedding_dim, output_size, kernels_num, kernel_sizes, dropout, PAD_IDX)

model = CNN_BiLSTM(args_for_cnn).to(device)
pretrained_embeddings = TEXT.vocab.vectors
print(model)
print(vocabulary_size)
model.embed.weight.data.copy_(pretrained_embeddings)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embed.weight.data[UNK_IDX] = torch.zeros(embedding_dim)
model.embed.weight.data[PAD_IDX] = torch.zeros(embedding_dim)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)  
criterion = nn.CrossEntropyLoss()

[Conv2d(1, 32, kernel_size=(4, 200), stride=(1, 1), padding=(2, 0)), Conv2d(1, 32, kernel_size=(5, 200), stride=(1, 1), padding=(2, 0)), Conv2d(1, 32, kernel_size=(6, 200), stride=(1, 1), padding=(3, 0))]
CNN_BiLSTM(
  (embed): Embedding(50220, 200, padding_idx=1)
  (bilstm): LSTM(200, 700, dropout=0.7, bidirectional=True)
  (hidden2label1): Linear(in_features=1496, out_features=748, bias=True)
  (hidden2label2): Linear(in_features=748, out_features=3, bias=True)
  (dropout): Dropout(p=0.7, inplace=False)
)
50220


  "num_layers={}".format(dropout, num_layers))


In [14]:
def train(model, iterator, optimizer, criterion):
    
    model.train()
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        batch_input, labels = batch.tweet, batch.label
        batch_input = batch_input.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(batch_input)
        loss = criterion(outputs, labels)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.cpu().item()

    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    all_pred=[]
    all_label = []
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            batch_input, labels = batch.tweet, batch.label
            batch_input = batch_input.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            outputs = model(batch_input)

            loss = criterion(outputs, labels)

            epoch_loss += loss.cpu().item()

            # identify the predicted class for each example in the batch
            probabilities, predicted = torch.max(outputs.cpu().data, 1)
            # put all the true labels and predictions to two lists
            all_pred.extend(predicted)
            all_label.extend(labels.cpu())
    
    accuracy = accuracy_score(all_label, all_pred)
    f1score = f1_score(all_label, all_pred, average='macro') 
    return epoch_loss / len(iterator), accuracy, f1score

In [None]:
MAX_EPOCH = 15
total_step = len(train_iter)
loss_list = []
acc_list = []

for epoch in trange(MAX_EPOCH, desc="Epoch"):
    train_loss = train(model, train_iter, optimizer, criterion)  
    val_loss, val_acc, val_f1 = evaluate(model, val_iter, criterion)

    # Create checkpoint at end of each epoch
    state_dict_model = model.state_dict() 
    state = {
        'epoch': epoch,
        'state_dict': state_dict_model,
        'optimizer': optimizer.state_dict()
        }
    print('\n Epoch [{}/{}], Train Loss: {:.4f}, Validation Loss: {:.4f}, Validation Accuracy: {:.4f}, Validation F1: {:.4f}'.format(epoch+1, MAX_EPOCH, train_loss, val_loss, val_acc, val_f1))
    torch.save(state, "./drive/My Drive/CNN/CNN_LSTM_F"+str(epoch+1)+".pt")





 Epoch [1/15], Train Loss: 0.5173, Validation Loss: 0.7103, Validation Accuracy: 0.6865, Validation F1: 0.6682





 Epoch [2/15], Train Loss: 0.3025, Validation Loss: 0.9422, Validation Accuracy: 0.6560, Validation F1: 0.6429





 Epoch [3/15], Train Loss: 0.1742, Validation Loss: 1.3201, Validation Accuracy: 0.6290, Validation F1: 0.6196





 Epoch [4/15], Train Loss: 0.1139, Validation Loss: 1.6470, Validation Accuracy: 0.6485, Validation F1: 0.6230





 Epoch [5/15], Train Loss: 0.0846, Validation Loss: 1.8795, Validation Accuracy: 0.6530, Validation F1: 0.6080





 Epoch [6/15], Train Loss: 0.0766, Validation Loss: 1.9758, Validation Accuracy: 0.6435, Validation F1: 0.6192





 Epoch [7/15], Train Loss: 0.0522, Validation Loss: 2.2490, Validation Accuracy: 0.6425, Validation F1: 0.6274





 Epoch [8/15], Train Loss: 0.0365, Validation Loss: 2.2763, Validation Accuracy: 0.6380, Validation F1: 0.6151





 Epoch [9/15], Train Loss: 0.0333, Validation Loss: 2.7409, Validation Accuracy: 0.6355, Validation F1: 0.5964





 Epoch [10/15], Train Loss: 0.0306, Validation Loss: 2.9121, Validation Accuracy: 0.6250, Validation F1: 0.6012





 Epoch [11/15], Train Loss: 0.0259, Validation Loss: 2.8866, Validation Accuracy: 0.6280, Validation F1: 0.5984





 Epoch [12/15], Train Loss: 0.0260, Validation Loss: 2.9893, Validation Accuracy: 0.6380, Validation F1: 0.6006





 Epoch [13/15], Train Loss: 0.0222, Validation Loss: 2.9984, Validation Accuracy: 0.6190, Validation F1: 0.6048





 Epoch [14/15], Train Loss: 0.0180, Validation Loss: 3.2289, Validation Accuracy: 0.6365, Validation F1: 0.6209





 Epoch [15/15], Train Loss: 0.0203, Validation Loss: 3.3065, Validation Accuracy: 0.6180, Validation F1: 0.6016


Epoch: 100%|██████████| 15/15 [8:42:22<00:00, 2089.49s/it]


In [15]:
args_for_cnn = CNNArgs(hidden_dim, num_layers, vocabulary_size, embedding_dim, output_size, kernels_num, kernel_sizes, dropout, PAD_IDX)

model = CNN_BiLSTM(args_for_cnn).to(device)
pretrained_embeddings = TEXT.vocab.vectors
print(model)
print(vocabulary_size)
model.embed.weight.data.copy_(pretrained_embeddings)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embed.weight.data[UNK_IDX] = torch.zeros(embedding_dim)
model.embed.weight.data[PAD_IDX] = torch.zeros(embedding_dim)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)  
criterion = nn.CrossEntropyLoss()

[Conv2d(1, 32, kernel_size=(4, 200), stride=(1, 1), padding=(2, 0)), Conv2d(1, 32, kernel_size=(5, 200), stride=(1, 1), padding=(2, 0)), Conv2d(1, 32, kernel_size=(6, 200), stride=(1, 1), padding=(3, 0))]
CNN_BiLSTM(
  (embed): Embedding(50220, 200, padding_idx=1)
  (bilstm): LSTM(200, 700, dropout=0.7, bidirectional=True)
  (hidden2label1): Linear(in_features=1496, out_features=748, bias=True)
  (hidden2label2): Linear(in_features=748, out_features=3, bias=True)
  (dropout): Dropout(p=0.7, inplace=False)
)
50220


  "num_layers={}".format(dropout, num_layers))


In [16]:
model.load_state_dict(torch.load('/content/drive/MyDrive/CNN/CNN_LSTM_F1.pt')['state_dict'])

<All keys matched successfully>

In [32]:
# masks
# 0.9200501243273417 0.6333333333333333 0.5588888888888889
# test_loss, test_acc, test_f1 = evaluate(model, test_iter, criterion)
# print(test_loss, test_acc, test_f1)
# vaccines
# 0.8588742166757584 0.5841584158415841 0.6099456099456099
test_loss, test_acc, test_f1 = evaluate(model, test_iter, criterion)
print(test_loss, test_acc, test_f1)



0.917639414469401 0.6222222222222222 0.5515091093908828


In [17]:
model.eval()
model = model.to(device)
print(model)

CNN_BiLSTM(
  (embed): Embedding(50220, 200, padding_idx=1)
  (bilstm): LSTM(200, 700, dropout=0.7, bidirectional=True)
  (hidden2label1): Linear(in_features=1496, out_features=748, bias=True)
  (hidden2label2): Linear(in_features=748, out_features=3, bias=True)
  (dropout): Dropout(p=0.7, inplace=False)
)


In [18]:
LABEL.vocab.stoi
# "negative" - 0
# "neutral" - 1
# "positive" - 2

defaultdict(None, {'0': 2, '1': 0, '2': 1})

In [181]:
# import numpy as np
# sample_sentence = 'it was an amazing performance'
# print(sample_sentence)
# token_ids = [TEXT.vocab.stoi[tok] for tok in tokenize_en(sample_sentence)]
# print(token_ids)
# if len(token_ids) < 7:
#     token_ids += [TEXT.vocab.stoi['pad']] * (7 - len(token_ids))
# print(token_ids)
# token_tensor = torch.tensor(token_ids, device=device).unsqueeze(0).transpose(1,0)
# print(token_tensor)
# print(token_tensor.size())
# probabilities, predicted = torch.max(model(token_tensor).cpu().data, 1)
# print(probabilities)
# print(predicted)

In [180]:
# prediction = model(token_tensor).data
# prediction

In [19]:

def S(trained_model, token_tensor, erase_tokenid, gold_label_id):
    '''
    input:
    i) trained_model - trained CNN model on IMDB dataset
    ii) token_tensor - tensor containing token ids for a test sample 'e'
    iii) erase_tokenid - id corresponding to the token to be erased. -1 if no token needs to be erased (useful in computing S(e,c))
    iv) gold_label_id - gold sentiment label for the test sample 'e'
    
    returns:
    probability by model to the gold label. S(e,c) if erase_tokenid == -1 else S(e,c,~d)
    '''
    
    # print(token_tensor.shape)
    #token_tensor = [1, sent len]  note: batch_size is 1
    embedded = model.embed(token_tensor)
    # print(embedded)
    # print(embedded.shape)
    #embedded = [batch size, sent len, emb dim]
    
    # check if word embedings corresponding to a token has to be erased
    if erase_tokenid != -1:
        embedded[0, erase_tokenid, :].fill_(0.0)
    # print(embedded)
    # print(embedded.shape)
    #embedded = [batch size, sent len, emb dim]
    # cnn_x = torch.transpose(cnn_x, 0, 1)
    # print(0)
        
    cnn_x = embedded
    # print(cnn_x.shape)
    # print(1)
    # cnn_x = torch.transpose(cnn_x, 0, 1)
    # print(cnn_x.shape)
    cnn_x = cnn_x.unsqueeze(1)
    cnn_x = [F.relu(conv(cnn_x)).squeeze(3) for conv in model.convs1]  # [(N,Co,W), ...]*len(Ks)
    cnn_x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in cnn_x]  # [(N,Co), ...]*len(Ks)
    cnn_x = torch.cat(cnn_x, 1)
    cnn_x = model.dropout(cnn_x)
    # print('cnn output:')
    # print(cnn_x.shape)
    # print('embedded blstm')
    bilstm_x = embedded.view(embedded.size(1), len(token_tensor), -1)

    # print(bilstm_x.shape)
    bilstm_out, _ = model.bilstm(bilstm_x)
    # print(bilstm_out.shape)
    bilstm_out = torch.transpose(bilstm_out, 0, 1)
    bilstm_out = torch.transpose(bilstm_out, 1, 2)
    bilstm_out = F.max_pool1d(bilstm_out, bilstm_out.size(2)).squeeze(2)
        # bilstm_out = F.tanh(bilstm_out)
    # print(bilstm_out.shape)
        # CNN and BiLSTM CAT
    cnn_x = torch.transpose(cnn_x, 0, 1)
    bilstm_out = torch.transpose(bilstm_out, 0, 1)
    cnn_bilstm_out = torch.cat((cnn_x, bilstm_out), 0)
    cnn_bilstm_out = torch.transpose(cnn_bilstm_out, 0, 1)

        # linear
    cnn_bilstm_out = model.hidden2label1(F.tanh(cnn_bilstm_out))
    cnn_bilstm_out = model.hidden2label2(F.tanh(cnn_bilstm_out))

        # output
    logit = F.softmax(cnn_bilstm_out)
    probabilities, predicted = torch.max(logit, 1)

    return probabilities.item()

In [20]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import altair as alt

def interpret_sentence(sentence, min_len = 7, true_label = 1):
  '''
  input:
  i) sentence - the sample sentence to be analyzed
  ii) min_len - minimum length of the sentence (alternatively, maximum width of the filter used by the orginal author)
  iii) true_label - gold sentiment label for the sample sentence to be analyzed
  returns:
  None (actually the code plots the heatmap and returns nothing)
  '''
  # tokenize sentence
  token_ids = [TEXT.vocab.stoi[tok] for tok in tokenize_en(sentence)]
  # pad the sentence if necessary
  if len(token_ids) < 7: # where 5 in min_len
    token_ids += [TEXT.vocab.stoi['pad']] * (7 - len(token_ids))
  
  # convert to tensor
  token_tensor = torch.tensor(token_ids, device=device).unsqueeze(0)
    
  # get prediction
  probabilities, prediction = torch.max(model(token_tensor.transpose(1,0)).cpu().data, 1)
  prediction = prediction.item()
  # print('pred')
  # print(prediction)
   
  # get word importance
  tokens, imp_scores = [None], []
  s_ec = S(model, token_tensor, -1, true_label)
  # print("ses")
  # print(s_ec)
  for t_i, token_id in enumerate(token_ids):
    if token_id != TEXT.vocab.stoi['pad']: # we don't need to calculate importance score for pad tokens (seriously?)
      s_e = S(model, token_tensor, t_i, true_label)
      tokens.append(TEXT.vocab.itos[token_id])
      # print((s_ec-s_e)/s_ec)
      imp_scores.append([((s_ec-s_e)/s_ec)])
  
  # plot the heatmap and other values
  # print(tokens)
  # print(imp_scores)
  print('sentence = %s'%sentence)
  print('predicted label = %d; gold label = %d'%(prediction, true_label))
  # print(imp_scores)
  # fig, ax = plt.subplots()
  # im = ax.imshow(np.array(imp_scores).transpose(), cmap=plt.cm.Blues)
  # ax.set_xticklabels(tokens)
  # ax.set_yticklabels(['importance'])
  # plt.colorbar(im)
  # fig.set_size_inches(15, 8)
  # fig.tight_layout()
  # plt.show()
  lst = [score[0] for score in imp_scores]
  source = pd.DataFrame({'tokens': tokens[1:], 'score': lst})
  return source, tokens

In [22]:
def visualisation(df, tokens, w=500):
  viz = alt.Chart(df).mark_rect().encode(
    x=alt.X('tokens', sort=tokens[1:], axis=alt.Axis(labelAngle=-45)),
    color=alt.Color('score', scale=alt.Scale(scheme='blues'))).properties(
    width=w,
    height=70
)
  return viz

In [434]:
df, tokens = interpret_sentence('it was an amazing performance', true_label = 1.0)
visualisation(df, tokens)

sentence = it was an amazing performance
predicted label = 1; gold label = 1




In [436]:
df, tokens = interpret_sentence('368 people died yesterday. RIP those ignored through the boredom of repeated mistakes.', true_label = 2.0)
print(df)
visualisation(df, tokens, w=1100)



sentence = 368 people died yesterday. RIP those ignored through the boredom of repeated mistakes.
predicted label = 2; gold label = 2
       tokens     score
0       <unk>  0.000000
1      people -0.005417
2        died  0.040391
3   yesterday -0.001702
4           . -0.000633
5       <unk>  0.000000
6       those -0.012151
7     ignored  0.027310
8     through  0.003373
9         the  0.001165
10    boredom  0.005314
11         of -0.002616
12   repeated -0.021318
13   mistakes -0.001154
14          .  0.009348


In [437]:
df, tokens = interpret_sentence(' \n\nWhat have we become when our boredom means that 368 people die and no one cares. \n\nBut we are free of the heinous mask. \n\nThe UK is full of selfishness. #CovidIsntOver', true_label = 2.0)
visualisation(df, tokens, w=1100)



sentence =  

What have we become when our boredom means that 368 people die and no one cares. 

But we are free of the heinous mask. 

The UK is full of selfishness. #CovidIsntOver
predicted label = 2; gold label = 2


In [438]:
# wrong prediction
df, tokens = interpret_sentence("Which science are we following now? Today it's $cience.  #covid #vaccines #masks #money #admityouwerewrong #freedom https://t.co/1QihIkgTn6", true_label = 2.0)
visualisation(df, tokens, w=1100)



sentence = Which science are we following now? Today it's $cience.  #covid #vaccines #masks #money #admityouwerewrong #freedom https://t.co/1QihIkgTn6
predicted label = 0; gold label = 2


In [None]:
# "negative" - 0 - 2
# "neutral" - 1 - 0
# "positive" - 2 - 1

In [33]:
# wrong prediction
df, tokens = interpret_sentence("#votefordout #masks #covid #Dougford #ontario I wish I lived in PEI or Quebec where the premier listened to science https://t.co/uU757dQ1Rw", true_label = 2.0)
visualisation(df, tokens, w=1100)



sentence = #votefordout #masks #covid #Dougford #ontario I wish I lived in PEI or Quebec where the premier listened to science https://t.co/uU757dQ1Rw
predicted label = 1; gold label = 2


In [36]:

df, tokens = interpret_sentence("Every single conference I have gone to, I come home very sick. That is until now! I just spent many days around hundreds of people (all who were required to mask throughout the conference) and I am not sick!  #MasksWork #RetreatMigraine2022 #ConferencesDoneRight @CoalitionCHAMP", true_label = 1.0)
visualisation(df, tokens, w=1100)



sentence = Every single conference I have gone to, I come home very sick. That is until now! I just spent many days around hundreds of people (all who were required to mask throughout the conference) and I am not sick!  #MasksWork #RetreatMigraine2022 #ConferencesDoneRight @CoalitionCHAMP
predicted label = 2; gold label = 1


In [27]:
tweet = "Went to the cinema for the first time in over two years last night. The cinema wanted Covid passes and mask wearing. Felt safe snd had a great time #MasksWork  #CovidIsNotOver"
df, tokens = interpret_sentence(tweet, true_label = 1.0)
print(df)
visualisation(df, tokens, w=1100)



sentence = Went to the cinema for the first time in over two years last night. The cinema wanted Covid passes and mask wearing. Felt safe snd had a great time #MasksWork  #CovidIsNotOver
predicted label = 1; gold label = 1
     tokens     score
0      went  0.003533
1        to  0.010863
2       the  0.014964
3    cinema  0.004385
4       for -0.000398
5       the -0.001119
6     first -0.002968
7      time -0.006005
8        in -0.001713
9      over -0.003675
10      two -0.012342
11    years -0.004103
12     last -0.003900
13    night -0.002500
14        . -0.001869
15      the  0.002625
16   cinema -0.000662
17   wanted  0.004856
18    <unk>  0.000000
19   passes  0.000435
20      and -0.001705
21     mask -0.007450
22  wearing -0.007644
23        .  0.016414
24     felt -0.012062
25     safe  0.019762
26      snd -0.034972
27      had  0.017694
28        a  0.035144
29    great  0.271539
30     time  0.034871
31        #  0.018155
32    <unk>  0.000000
33          -0.006120
34     

In [416]:
tweet = "75% of Torontonians are helplessly, hopelessly brainwashing. Please realize how preposterous your obedience to this nonsense is. #CanadaHasFallen #canada #masks #covid #covid19 #insanity https://t.co/3BHoZYZ70t"
df, tokens = interpret_sentence(tweet, true_label = 2.0)
visualisation(df, tokens, w=1100)



sentence = 75% of Torontonians are helplessly, hopelessly brainwashing. Please realize how preposterous your obedience to this nonsense is. #CanadaHasFallen #canada #masks #covid #covid19 #insanity https://t.co/3BHoZYZ70t
predicted label = 2; gold label = 2


In [28]:
tweet = "Good morning friends!  I have a question. No judgement, I promise.  Are you still wearing your mask when indoors shopping, or at the mall?  #Covid #Masks"
df, tokens = interpret_sentence(tweet, true_label = 0.0)
print(df)
visualisation(df, tokens, w=1100)



sentence = Good morning friends!  I have a question. No judgement, I promise.  Are you still wearing your mask when indoors shopping, or at the mall?  #Covid #Masks
predicted label = 1; gold label = 0
       tokens     score
0        good -0.038277
1     morning  0.031962
2     friends  0.012230
3           !  0.256023
4             -0.013876
5           i  0.006017
6        have -0.006208
7           a  0.012426
8    question -0.092443
9           .  0.019244
10         no  0.130987
11  judgement -0.077967
12          ,  0.057872
13          i  0.005470
14    promise -0.002789
15          . -0.010885
16            -0.007511
17        are -0.000476
18        you  0.015591
19      still -0.027247
20    wearing -0.022267
21       your  0.001715
22       mask -0.017866
23       when -0.014147
24    indoors -0.011763
25   shopping -0.009971
26          , -0.011456
27         or -0.058106
28         at -0.033474
29        the -0.021002
30       mall -0.022245
31          ? -0.028404
32     

In [29]:
tweet = "Do what you want. But, I personally believe we should still be wearing #masks in public settings. Especially very crowded indoor public areas. For the time being at least. Who's with me? #Ottawa #Ontario #MaskUp #WearAMask #MaskMandate #COVID19 #COVID #CovidIsNotOver #DougFord"
df, tokens = interpret_sentence(tweet, true_label = 0.0)
print(df)
visualisation(df, tokens, w=1100)



sentence = Do what you want. But, I personally believe we should still be wearing #masks in public settings. Especially very crowded indoor public areas. For the time being at least. Who's with me? #Ottawa #Ontario #MaskUp #WearAMask #MaskMandate #COVID19 #COVID #CovidIsNotOver #DougFord
predicted label = 0; gold label = 0
        tokens     score
0           do  0.023612
1         what  0.006425
2          you  0.010153
3         want  0.008118
4            . -0.007561
5          but  0.025619
6            , -0.018537
7            i -0.096060
8   personally -0.086140
9      believe -0.075975
10          we -0.070131
11      should -0.010968
12       still -0.003058
13          be -0.025509
14     wearing  0.037614
15           # -0.019107
16       masks  0.018792
17          in -0.013538
18      public  0.055958
19    settings  0.175334
20           . -0.103784
21  especially -0.027063
22        very -0.023579
23     crowded  0.010533
24      indoor -0.018714
25      public  0.000236


In [446]:
tweet = "Dozens and dozens and dozens of young sportspeople dying of heart related issues https://t.co/JZDZsYkDAw #VaccineSideEffects #death"
df, tokens = interpret_sentence(tweet, true_label = 2.0)
visualisation(df, tokens, w=1100)



sentence = Dozens and dozens and dozens of young sportspeople dying of heart related issues https://t.co/JZDZsYkDAw #VaccineSideEffects #death
predicted label = 2; gold label = 2


In [447]:
tweet="Because only a healthy you can make the world beautiful and better too. Be healthy, keep well! #worldhealthday #health #life #betterlife #pandemic #covid #globalhealth #lockdown #medicine #vaccine #greatjob #warriors #goodhealth #healthday https://t.co/qn8X8pOJFo"
df, tokens = interpret_sentence(tweet, true_label = 1.0)
visualisation(df, tokens, w=1100)



sentence = Because only a healthy you can make the world beautiful and better too. Be healthy, keep well! #worldhealthday #health #life #betterlife #pandemic #covid #globalhealth #lockdown #medicine #vaccine #greatjob #warriors #goodhealth #healthday https://t.co/qn8X8pOJFo
predicted label = 1; gold label = 1


In [460]:
tweet = "Triple vaxxed. Got COVID. Not in ICU. Thank you, vaccination! #VaccinesWork https://t.co/c0ldCCnn7s"
df, tokens = interpret_sentence(tweet, true_label = 1.0)
visualisation(df, tokens, w=1100)



sentence = Triple vaxxed. Got COVID. Not in ICU. Thank you, vaccination! #VaccinesWork https://t.co/c0ldCCnn7s
predicted label = 1; gold label = 1
