In [87]:
import torch
from torch.autograd import Variable
from transformers import AutoTokenizer, AutoConfig

import numpy as np
import pandas as pd
from tqdm import tqdm

from TRC.models import ModelForWeightedSequenceClassification
from common_utils import extract_from_dataframe, mask_batch_seq_generator, pad_sequences

In [93]:
ASSIGN_WEIGHT = True
MAX_LENGTH = 128

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model_name = 'roberta-large'

model_path = '/home/cc/rora_tesi_new/log/log_TRC/roberta-large/bertweet-seq/20_epoch/data/True_weight/42_seed/saved-model/pytorch_model.bin'
config_path = '/home/cc/rora_tesi_new/log/log_TRC/roberta-large/bertweet-seq/20_epoch/data/True_weight/42_seed/saved-model/config.json'

cuda


In [89]:
def load_local_model(model_path, config_path, device, model_name):

    config = config = AutoConfig.from_pretrained(config_path)

    model = ModelForWeightedSequenceClassification(model_name=model_name,config=config)
    checkpoint = torch.load(model_path, map_location=device)
    model.load_state_dict(checkpoint)

    return model

def create_weight(Y_train):
    # weight of each class in loss function
    class_weight = [np.array(Y_train).shape[0] / (np.array(Y_train) == i).sum() for i in range(len(set(Y_train)))]
    class_weight = torch.FloatTensor(class_weight)
    return class_weight

def prepare_data(data_path, need_columns):

    data = pd.read_pickle(data_path)
    X, Y = extract_from_dataframe(data, need_columns)
    return X, Y

In [90]:
train_data_path = '/home/cc/rora_tesi_new/data/train.p'
test_data_path = '/home/cc/rora_tesi_new/data/test.p'

need_columns = ['tweet_tokens', 'sentence_class']

_, Y_train = prepare_data(train_data_path, need_columns)
X_test_raw, Y_test = prepare_data(test_data_path, need_columns)

test_batch_size = Y_test.shape[0]


In [91]:
class_weight = None
if ASSIGN_WEIGHT:
    class_weight = create_weight(Y_train)

In [92]:
tokenizer = AutoTokenizer.from_pretrained(model_name, normalization = True)
model = load_local_model(model_path, config_path, device, model_name)
model = model.to(device)
labels = sorted(model.config.label2id, key=model.config.label2id.get)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [78]:
def simple_tokenize(orig_tokens, tokenizer):
    """
    tokenize a array of raw text
    """
    # bert_tokens = ["[CLS]"]
    bert_tokens = [tokenizer.cls_token]
    for x in orig_tokens:
        bert_tokens.extend(tokenizer.tokenize(x))
    # bert_tokens.append("[SEP]")
    bert_tokens.append(tokenizer.sep_token)
    return bert_tokens

def tokenize_with_new_mask(orig_text, max_length, tokenizer):
    """
    tokenize a array of raw text and generate corresponding
    attention labels array and attention masks array
    """
    bert_tokens = [simple_tokenize(t, tokenizer) for t in orig_text]
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in bert_tokens]
    input_ids = pad_sequences(input_ids, maxlen=max_length, dtype="long", truncating="post", padding="post")
    attention_masks = []
    for seq in input_ids:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)
    attention_masks = np.array(attention_masks)
    return input_ids, attention_masks

In [79]:
X_test, masks_test = tokenize_with_new_mask(X_test_raw, MAX_LENGTH, tokenizer)

num_batches = X_test.shape[0] // test_batch_size
test_batch_generator = mask_batch_seq_generator(X_test, Y_test, masks_test, test_batch_size)

In [98]:
def f(X_test_raw, model, device, class_weight):
    
    X_test, masks_test = tokenize_with_new_mask(X_test_raw, MAX_LENGTH, tokenizer)

    num_batches = X_test.shape[0] // test_batch_size
    test_batch_generator = mask_batch_seq_generator(X_test, Y_test, masks_test, test_batch_size)
    
    model.eval()

    with torch.no_grad():
        for b in tqdm(range(num_batches)):
            x_batch, y_batch, masks_batch = next(test_batch_generator)
            if len(x_batch.shape) == 3:
                x_batch = Variable(torch.FloatTensor(x_batch)).to(device)
            else:
                x_batch = Variable(torch.LongTensor(x_batch)).to(device)
            y_batch = y_batch.astype(np.float64)
            y_batch = Variable(torch.LongTensor(y_batch)).to(device)
            masks_batch = Variable(torch.FloatTensor(masks_batch)).to(device)
            class_weight = class_weight.to(device) if class_weight is not None else None
            outputs = model(x_batch, masks_batch, labels=y_batch, class_weight=class_weight)
            print(outputs)
            loss, logits = outputs[:2]

    return logits



In [None]:
import shap
explainer = shap.Explainer(f, tokenizer, output_names=labels)

In [99]:
print(f(X_test_raw[1], model, device, class_weight))

0it [00:00, ?it/s]


UnboundLocalError: local variable 'logits' referenced before assignment

In [47]:
shap_values = explainer(X_test[1])

ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).