In [172]:
import pandas as pd

import transformers
from transformers import BertModel, BertTokenizer, RobertaConfig, RobertaModel
import torch
import torch.nn as nn
from dont_patronize_me import DontPatronizeMe

from sklearn.model_selection import train_test_split

import datasets

# Seed passed as `random_state` to both train_test_split calls below so the splits are reproducible.
RANDOM_SEED = 42

In [148]:
# Task 1 data, loaded through the DontPatronizeMe helper.
# NOTE(review): the first instance is overwritten on the next line, so only the
# second call's arguments take effect — confirm the first call can be removed.
dpm = DontPatronizeMe('.', '.')
dpm = DontPatronizeMe('.', 'dontpatronizeme_pcl.tsv')

# presumably fills dpm.train_task1_df, which a later cell reads — verify against the loader
dpm.load_task1()

# Task 2 (category) data; same construct-then-overwrite pattern as above.
dpm2 = DontPatronizeMe('.', '.')
dpm2 = DontPatronizeMe('.', 'dontpatronizeme_categories.tsv')

# Prints the label -> numerical label map; presumably also fills dpm2.train_task2_df — verify.
dpm2.load_task2()

Map of label to numerical label:
{'Unbalanced_power_relations': 0, 'Shallow_solution': 1, 'Presupposition': 2, 'Authority_voice': 3, 'Metaphors': 4, 'Compassion': 5, 'The_poorer_the_merrier': 6}


In [152]:
# Short aliases for the two training frames exposed by the loaders.
df = dpm.train_task1_df  # task 1 rows; includes the `text`, `label` and `orig_label` columns
df_cate = dpm2.train_task2_df  # task 2 (category) rows

In [154]:
# Display the task 1 frame (the rendered output elides the middle rows).
df

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0,0
...,...,...,...,...,...,...,...
10464,10465,@@14297363,women,lk,"""Sri Lankan norms and culture inhibit women fr...",0,1
10465,10466,@@70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0,0
10466,10467,@@20282330,in-need,ng,""""""" She has one huge platform , and informatio...",1,3
10467,10468,@@16753236,hopeless,in,""""""" Anja Ringgren Loven I ca n't find a word t...",1,4


In [78]:

# Name of the pretrained checkpoint; used for the tokenizer here and for BertModel in SentimentClassifier.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

### Sample text to visualise tokenisation

In [79]:
sample_txt = 'Hello! I love that you are so poor.'
# Split the sentence into tokens; no [CLS]/[SEP] tokens are added at this step.
tokens_sample = tokenizer.tokenize(sample_txt)
# Map each token to its vocabulary id.
token_ids = tokenizer.convert_tokens_to_ids(tokens_sample)
print(token_ids) 
print(len(token_ids)) 
print(tokens_sample)

[8667, 106, 146, 1567, 1115, 1128, 1132, 1177, 2869, 119]
10
['Hello', '!', 'I', 'love', 'that', 'you', 'are', 'so', 'poor', '.']


In [84]:
# Special tokens of this tokenizer together with their vocabulary ids.
print(tokenizer.sep_token, tokenizer.sep_token_id) # separator token; appended after the last real token when encoding
print(tokenizer.cls_token, tokenizer.cls_token_id) # classification token; placed at the start of the encoded sequence
print(tokenizer.pad_token, tokenizer.pad_token_id) # token used for padding
print(tokenizer.unk_token, tokenizer.unk_token_id) # token used for unknown (out-of-vocabulary) input

[SEP] 102
[CLS] 101
[PAD] 0
[UNK] 100


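Next, we encode the sample text into model inputs: the tokenizer adds the special tokens, pads (or truncates) the sequence to a fixed length, and returns the token ids together with an attention mask. (The embedding lookup itself happens later, inside the BERT model.)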

In [111]:
# Encode the sample sentence into fixed-length model inputs.
# The tokenizer is called directly: `encode_plus` is deprecated in favour of
# `__call__`, which accepts the same arguments used here.
encoding = tokenizer(
  sample_txt,
  max_length=32,  # target length used by both truncation and padding
  truncation=True,  # truncate examples to max length
  add_special_tokens=True,  # add '[CLS]' and '[SEP]'
  return_token_type_ids=False,  # single-sentence input, segment ids are not needed
  padding="max_length",  # pad with the [PAD] token up to max_length
  return_attention_mask=True,  # 1 for real tokens, 0 for padding
  return_tensors='pt',  # return PyTorch tensors
)

print(encoding.keys())  # dict_keys(['input_ids', 'attention_mask'])
print(encoding.input_ids)

dict_keys(['input_ids', 'attention_mask'])
tensor([[ 101, 8667,  106,  146, 1567, 1115, 1128, 1132, 1177, 2869,  119,  102,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]])


In [136]:
# 1 marks a real token (including [CLS]/[SEP]), 0 marks a padding position.
print(encoding['attention_mask'])

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])


In [46]:
class SentimentClassifier(nn.Module):
  """Pretrained BERT encoder followed by dropout and a linear classification head.

  Args:
    n_classes: number of output classes, i.e. the size of the last dimension
      of the tensor returned by ``forward``.
  """

  def __init__(self, n_classes):
    super().__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Compute unnormalised class scores for a batch.

    Args:
      input_ids: token ids, forwarded unchanged to the BERT encoder
        (assumed to be shaped (batch, seq_len) — TODO confirm with the caller).
      attention_mask: mask forwarded unchanged to the BERT encoder
        (assumed to have the same shape as ``input_ids``).

    Returns:
      Output of the linear head applied to the dropped-out pooled BERT output;
      its last dimension has size ``n_classes``.
    """
    bert_outputs = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    # Index instead of tuple-unpacking: recent transformers releases return a
    # ModelOutput, and unpacking that yields its string keys rather than the
    # tensors. Position 1 is the pooled output for both tuple and ModelOutput
    # return types.
    pooled_output = bert_outputs[1]
    output = self.drop(pooled_output)
    return self.out(output)

In [166]:
# 70 % of the rows go to training; the remaining 30 % are held out and then
# halved into the test and validation sets. Both splits use the same seed.
df_train, df_holdout = train_test_split(df, test_size=0.3, random_state=RANDOM_SEED)
df_test, df_val = train_test_split(df_holdout, test_size=0.5, random_state=RANDOM_SEED)

df_train.shape, df_val.shape, df_test.shape

((7328, 7), (1571, 7), (1570, 7))

### Dataloader

In [173]:
# class MyDataset(Dataset):

#   def __init__(self,df):
#     x=df.iloc[:,0:-3].values
#     y=price_df.iloc[:,-2].values

#     self.x_train=torch.tensor(x,dtype=torch.float32)
#     self.y_train=torch.tensor(y,dtype=torch.float32)

#   def __len__(self):
#     return len(self.y_train)
  
#   def __getitem__(self,idx):
#     return self.x_train[idx],self.y_train[idx]

NameError: name 'Dataset' is not defined

In [None]:
class OlidDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each text with its three labels.

    Args:
        tokenizer: tokenizer stored on the instance. The active methods do not
            use it; only the disabled ``collate_fn`` below refers to it.
        input_set: mapping with a ``'texts'`` entry and a ``'labels'`` entry.
            Both are indexed with the integer ``idx`` passed to
            ``__getitem__``, and each ``labels[idx]`` must provide at least
            three values (positions 0, 1 and 2).
    """

    def __init__(self, tokenizer, input_set):
        self.tokenizer = tokenizer
        self.texts = input_set['texts']
        self.labels = input_set['labels']

    # Disabled collate function, kept for reference (it was previously parked
    # inside a class-level string literal).
    #
    # def collate_fn(self, batch):
    #
    #     texts = []
    #     labels_a = []
    #     labels_b = []
    #     labels_c = []
    #
    #     for b in batch:
    #         # collect the texts and labels of the batch;
    #         # these are passed into encodings below
    #         texts.append(b['text'])
    #         labels_a.append(b['label_a'])
    #         labels_b.append(b['label_b'])
    #         labels_c.append(b['label_c'])
    #
    #     # The maximum sequence size for BERT is 512 but here the tokenizer
    #     # truncates sentences longer than 128 tokens; shorter sentences are
    #     # padded to the longest sequence in the batch (padding=True).
    #     encodings = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=128)
    #     encodings['label_a'] = torch.tensor(labels_a)
    #     encodings['label_b'] = torch.tensor(labels_b)
    #     encodings['label_c'] = torch.tensor(labels_c)
    #
    #     return encodings

    def __len__(self):
        """Return the number of examples (length of ``self.texts``)."""
        # The previous implementation read ``self.y_train``, which is never
        # assigned on this class and therefore raised AttributeError.
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict.

        Keys: ``'text'``, ``'label_a'``, ``'label_b'``, ``'label_c'``; the
        three labels are taken from positions 0, 1 and 2 of ``self.labels[idx]``.
        """
        label_a, label_b, label_c = self.labels[idx][0], self.labels[idx][1], self.labels[idx][2]
        return {
            'text': self.texts[idx],
            'label_a': label_a,
            'label_b': label_b,
            'label_c': label_c,
        }