# Using RoBERTa for Characterizing User Decisions

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers



In [3]:
!pip install fastai



In [4]:
from fastai.text import *
from fastai.metrics import *
from transformers import RobertaTokenizer

In [5]:
# Creating a config object to store task specific information
class Config(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)
    
    def set(self, key, val):
        self[key] = val
        setattr(self, key, val)
        
config = Config(
    testing=False,
    seed = 2019,
    roberta_model_name='roberta-base', # can also be exchnaged with roberta-large 
    max_lr=1e-5,
    epochs=1,
    use_fp16=False,
    bs=4, 
    max_seq_len=256, 
    num_labels = 4,
    hidden_dropout_prob=.05,
    hidden_size=768, # 1024 for roberta-large
    start_tok = "<s>",
    end_tok = "</s>",
)

In [6]:
df = pd.read_csv('/content/drive/MyDrive/app_reviews_all_annotated2.csv')

In [7]:
if config.testing: df = df[:5000]
print(df.shape)

(46103, 8)


In [8]:
df = df[['review', 'argument_cat']]
# Remove missing rows
df = df.dropna()
df

Unnamed: 0,review,argument_cat
0,owsm Full Review,
1,Amazing app for edit Full Review,Arg
2,Thank you SO much for ALL of the pointers!! ...,
3,It's awesome Full Review,
4,Great Full Review,
...,...,...
46098,Cool Full Review,
46099,Good Full Review,
46100,Best Full Review,
46101,Nice game Full Review,


In [9]:
feat_cols = "review"
label_cols = "argument_cat"

## Setting Up the Tokenizer

In [10]:
class FastAiRobertaTokenizer(BaseTokenizer):
    """Wrapper around RobertaTokenizer to be compatible with fastai"""
    def __init__(self, tokenizer: RobertaTokenizer, max_seq_len: int=128, **kwargs): 
        self._pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len 
    def __call__(self, *args, **kwargs): 
        return self 
    def tokenizer(self, t:str) -> List[str]: 
        """Adds Roberta bos and eos tokens and limits the maximum sequence length""" 
        return [config.start_tok] + self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len - 2] + [config.end_tok]

In [11]:
# create fastai tokenizer for roberta
roberta_tok = RobertaTokenizer.from_pretrained("roberta-base")

fastai_tokenizer = Tokenizer(tok_func=FastAiRobertaTokenizer(roberta_tok, max_seq_len=config.max_seq_len), 
                             pre_rules=[], post_rules=[])

In [12]:
# create fastai vocabulary for roberta
path = Path()
roberta_tok.save_vocabulary(path)

with open('vocab.json', 'r') as f:
    roberta_vocab_dict = json.load(f)
    
fastai_roberta_vocab = Vocab(list(roberta_vocab_dict.keys()))

In [13]:
# Setting up pre-processors
class RobertaTokenizeProcessor(TokenizeProcessor):
    def __init__(self, tokenizer):
         super().__init__(tokenizer=tokenizer, include_bos=False, include_eos=False)

class RobertaNumericalizeProcessor(NumericalizeProcessor):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)


def get_roberta_processor(tokenizer:Tokenizer=None, vocab:Vocab=None):
    """
    Constructing preprocessors for Roberta
    We remove sos and eos tokens since we add that ourselves in the tokenizer.
    We also use a custom vocabulary to match the numericalization with the original Roberta model.
    """
    return [RobertaTokenizeProcessor(tokenizer=tokenizer), RobertaNumericalizeProcessor(vocab=vocab)]

## Setting up the DataBunch

In [14]:
# Creating a Roberta specific DataBunch class
class RobertaDataBunch(TextDataBunch):
    "Create a `TextDataBunch` suitable for training Roberta"
    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs:int=64, val_bs:int=None, pad_idx=1,
               pad_first=True, device:torch.device=None, no_check:bool=False, backwards:bool=False, 
               dl_tfms:Optional[Collection[Callable]]=None, **dl_kwargs) -> DataBunch:
        "Function that transform the `datasets` in a `DataBunch` for classification. Passes `**dl_kwargs` on to `DataLoader()`"
        datasets = cls._init_ds(train_ds, valid_ds, test_ds)
        val_bs = ifnone(val_bs, bs)
        collate_fn = partial(pad_collate, pad_idx=pad_idx, pad_first=pad_first, backwards=backwards)
        train_sampler = SortishSampler(datasets[0].x, key=lambda t: len(datasets[0][t][0].data), bs=bs)
        train_dl = DataLoader(datasets[0], batch_size=bs, sampler=train_sampler, drop_last=True, **dl_kwargs)
        dataloaders = [train_dl]
        for ds in datasets[1:]:
            lengths = [len(t) for t in ds.x.items]
            sampler = SortSampler(ds.x, key=lengths.__getitem__)
            dataloaders.append(DataLoader(ds, batch_size=val_bs, sampler=sampler, **dl_kwargs))
        return cls(*dataloaders, path=path, device=device, dl_tfms=dl_tfms, collate_fn=collate_fn, no_check=no_check)

In [15]:
class RobertaTextList(TextList):
    _bunch = RobertaDataBunch
    _label_cls = TextList

In [16]:
# loading the tokenizer and vocab processors
processor = get_roberta_processor(tokenizer=fastai_tokenizer, vocab=fastai_roberta_vocab)

# creating our databunch 
data = RobertaTextList.from_df(df, ".", cols=feat_cols, processor=processor) \
    .split_by_rand_pct(seed=config.seed) \
    .label_from_df(cols=label_cols,label_cls=CategoryList) \
    .databunch(bs=config.bs, pad_first=False, pad_idx=0)

  return np.array(a, dtype=dtype, **kwargs)


# Building the Model

In [17]:
import torch
import torch.nn as nn
from transformers import RobertaModel
torch.manual_seed(0)
# defining our model architecture 
class CustomRobertaModel(nn.Module):
    def __init__(self,num_labels=2):
        super(CustomRobertaModel,self).__init__()
        self.num_labels = num_labels
        self.roberta = RobertaModel.from_pretrained(config.roberta_model_name)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels) # defining final output layer
        
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        _ , pooled_output = self.roberta(input_ids, token_type_ids, attention_mask,return_dict=False) # 
        logits = self.classifier(pooled_output)        
        return logits

In [18]:
roberta_model = CustomRobertaModel(num_labels=config.num_labels)

learn = Learner(data, roberta_model, metrics=[accuracy])

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
learn.model.roberta.train() # setting roberta to train as it is in eval mode by default
learn.fit_one_cycle(config.epochs, max_lr=config.max_lr)

epoch,train_loss,valid_loss,accuracy,time
0,0.319165,0.289738,0.887503,14:33


# Getting Predictions

In [20]:
def get_preds_as_nparray(ds_type) -> np.ndarray:
    learn.model.roberta.eval()
    preds = learn.get_preds(ds_type)[0].detach().cpu().numpy()
    sampler = [i for i in data.dl(ds_type).sampler]
    reverse_sampler = np.argsort(sampler)
    ordered_preds = preds[reverse_sampler, :]
    pred_values = np.argmax(ordered_preds, axis=1)
    return ordered_preds, pred_values

In [21]:
preds, pred_values = get_preds_as_nparray(DatasetType.Valid)

In [22]:
# accuracy on valid
(pred_values == data.valid_ds.y.items).mean()

0.8875027120850509

In [23]:
from sklearn.metrics import confusion_matrix, classification_report
print('-------Argument Classification------')

y_test = data.valid_ds.y.items
y_pred = pred_values

print('Confusion matrix:')
print(confusion_matrix(y_test,y_pred,labels=[0,1,2,3]))
print()
print('Classification report:')
print(classification_report(y_test,y_pred,labels=[0,1,2,3],target_names=['Arg','Both','Dec','None']))

-------Argument Classification------
Confusion matrix:
[[3033   95    3  274]
 [ 173  259   10    9]
 [  15   16   34   26]
 [ 391   12   13 4855]]

Classification report:
              precision    recall  f1-score   support

         Arg       0.84      0.89      0.86      3405
        Both       0.68      0.57      0.62       451
         Dec       0.57      0.37      0.45        91
        None       0.94      0.92      0.93      5271

    accuracy                           0.89      9218
   macro avg       0.76      0.69      0.72      9218
weighted avg       0.89      0.89      0.89      9218



In [24]:
np.unique(y_pred)

array([0, 1, 2, 3])

# Optional: Saving/Loading the model weights

Once you are satisfied with training, you can use the following methods to save and load your pretrained weights.

In [25]:
def save_model(learner, file_name):
    st = learner.model.state_dict()
    torch.save(st, file_name) # will save model in current dir # backend is pickle 

def load_model(learner, file_name):
    st = torch.load(file_name)
    learner.model.load_state_dict(st)

# monkey patching Learner methods to save and load model file
Learner.save_model = save_model
Learner.load_model = load_model

In [26]:
# learn.save_model("my_model.pth")

# learn.load_model("my_model.pth")

In [27]:
#####################################DECISION CLASSIFICATION########################
df = pd.read_csv('/content/drive/MyDrive/app_reviews_all_annotated2.csv')
df = df[['review', 'decision_cat']]
# Remove missing rows
df = df.dropna()
df

Unnamed: 0,review,decision_cat
5,Everyone should use it Full Review,Acquiring
37,Improve face detection please !!! Full Rev...,Requesting
192,I think we just need more updates Full Rev...,Requesting
201,if you're looking for an alternative for Pho...,Recommendation
221,it's the best photo editor app i've ever see...,Requesting
...,...,...
45986,OMG IT'S SO AWESOME YOU SHOULD TRY IT Full...,Recommendation
46051,I litterly love this game i use to play this...,Acquiring
46076,"Please make more levels. Other wise, it's go...",Requesting
46077,awesome game worth the money! Full Review,Buying


In [28]:
config = Config(
    testing=False,
    seed = 2019,
    roberta_model_name='roberta-base', # can also be exchnaged with roberta-large 
    max_lr=1e-5,
    epochs=1,
    use_fp16=False,
    bs=4, 
    max_seq_len=256, 
    num_labels = 5,
    hidden_dropout_prob=.05,
    hidden_size=768, # 1024 for roberta-large
    start_tok = "<s>",
    end_tok = "</s>",
)

In [29]:
feat_cols = "review"
label_cols = "decision_cat"

In [30]:
# loading the tokenizer and vocab processors
processor = get_roberta_processor(tokenizer=fastai_tokenizer, vocab=fastai_roberta_vocab)

# creating our databunch 
data = RobertaTextList.from_df(df, ".", cols=feat_cols, processor=processor) \
    .split_by_rand_pct(seed=config.seed) \
    .label_from_df(cols=label_cols,label_cls=CategoryList) \
    .databunch(bs=config.bs, pad_first=False, pad_idx=0)

  return np.array(a, dtype=dtype, **kwargs)


  return array(a, dtype, copy=False, order=order)


In [31]:
roberta_model = CustomRobertaModel(num_labels=config.num_labels)

learn = Learner(data, roberta_model, metrics=[accuracy])

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [32]:
learn.model.roberta.train() # setting roberta to train as it is in eval mode by default
learn.fit_one_cycle(config.epochs, max_lr=config.max_lr)

epoch,train_loss,valid_loss,accuracy,time
0,0.446449,0.343834,0.915162,00:57


  return array(a, dtype, copy=False, order=order)


In [33]:
def get_preds_as_nparray(ds_type) -> np.ndarray:
    learn.model.roberta.eval()
    preds = learn.get_preds(ds_type)[0].detach().cpu().numpy()
    sampler = [i for i in data.dl(ds_type).sampler]
    reverse_sampler = np.argsort(sampler)
    ordered_preds = preds[reverse_sampler, :]
    pred_values = np.argmax(ordered_preds, axis=1)
    return ordered_preds, pred_values

In [34]:
preds, pred_values = get_preds_as_nparray(DatasetType.Valid)

In [35]:
# accuracy on valid
(pred_values == data.valid_ds.y.items).mean()

0.9151624548736462

In [36]:
print('-------Decision Classification------')
y_test = data.valid_ds.y.items
y_pred = pred_values

print('Confusion matrix:')
print(confusion_matrix(y_test,y_pred,labels=[0,1,2,3,4]))
print()
print('Classification report:')
print(classification_report(y_test,y_pred,labels=[0,1,2,3,4],target_names=['Acquiring','Buying','Rating','Recommendation','Requesting']))

-------Decision Classification------
Confusion matrix:
[[244   1   2  10   5]
 [  6   9   1   5   1]
 [  0   0  73   0   0]
 [  5   0   3 128   1]
 [  2   0   3   2  53]]

Classification report:
                precision    recall  f1-score   support

     Acquiring       0.95      0.93      0.94       262
        Buying       0.90      0.41      0.56        22
        Rating       0.89      1.00      0.94        73
Recommendation       0.88      0.93      0.91       137
    Requesting       0.88      0.88      0.88        60

      accuracy                           0.92       554
     macro avg       0.90      0.83      0.85       554
  weighted avg       0.92      0.92      0.91       554



In [37]:
# learn2.save_model("my_model2.pth")
# https://medium.com/@devkosal/using-roberta-with-fastai-for-nlp-7ed3fed21f6c
# learn2.load_model("my_model2.pth")