In [1]:
import torch
import pytorch_lightning as pl
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
import shap
from tqdm import tqdm
from rouge import Rouge

from transformers_trainer import TransformersTrainer

# Fix the global seed through Lightning so repeated runs of the notebook are comparable.
pl.seed_everything(42)
# Single ROUGE scorer shared by the SHAP evaluation cells further down.
rouge = Rouge()

[rank: 0] Global seed set to 42


In [2]:
# Hub identifier of the backbone; it also names the checkpoint file on disk.
model_name = 'mental/mental-bert-base-uncased'

# Restore the fine-tuned classifier, move it to the first GPU and switch it to
# inference mode. `.to()` and `.eval()` both return the module, so the calls chain.
model = (
    TransformersTrainer
    .load_from_checkpoint(
        f'../classification_models/{model_name}.ckpt',
        model_name=model_name,
    )
    .to('cuda:0')
    .eval()
)

# Tokenizer matching the backbone; it is also handed to SHAP as the text masker.
tokenizer = AutoTokenizer.from_pretrained(model_name)
print('Loaded Model')

Some weights of the model checkpoint at mental/mental-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.p

Loaded Model


In [3]:
def shap_pred(text, output_logit):
    """Prediction function handed to SHAP: run the classifier on a batch of texts.

    Args:
        text: Iterable of raw strings; it is materialised with ``list`` before
            being tokenised.
        output_logit: Index applied to the model output to pick what is returned
            (the notebook passes 0 for "belong" and 1 for "burden").

    Returns:
        ``model(batch)[output_logit]`` as a NumPy array on the CPU.
    """
    # Every text is truncated/padded to exactly 256 tokens.
    encoded = tokenizer(
        list(text),
        truncation=True,
        max_length=256,
        padding="max_length",
        return_tensors="pt",
    )

    # The model was moved to cuda:0, so its inputs must live there as well.
    batch = {}
    for field, tensor in encoded.items():
        batch[field] = tensor.to('cuda:0')

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(batch)

    # NOTE(review): this indexes the FIRST axis of whatever the model returns. That is
    # right if the model yields one tensor per label; if it returns a single
    # (batch, n_labels) tensor this would select a sample instead - confirm.
    selected = outputs[output_logit]
    return selected.detach().cpu().numpy()

In [8]:
# Pre-processed test split with one row per post.
test_df = pd.read_csv('../data/test_data_pre.csv')

# Keep only positive examples that have a reference explanation.
# Rows are dropped BEFORE the index is renumbered: the evaluation cells address
# rows with a 0..n-1 counter through label-based lookups, so the labels must be
# contiguous. Dropping after reset_index would leave gaps in them.
belong_df = (
    test_df[test_df['belong'] == 1]
    .dropna(subset=['belong_exp'])
    .reset_index(drop=True)
)
burden_df = (
    test_df[test_df['burden'] == 1]
    .dropna(subset=['burden_exp'])
    .reset_index(drop=True)
)

In [None]:
# Explain output index 0 of the classifier ("belong") for every belong-positive text.
belong_explainer = shap.Explainer(lambda x: shap_pred(x, 0), tokenizer)
exp = belong_explainer(belong_df['text'].to_list())

# Rows are walked by POSITION. `exp` is ordered like the list passed to the explainer,
# while belong_df's index labels are not guaranteed to be 0..n-1 (rows are filtered
# with dropna when the frame is built). Label lookups with a running counter would
# then hit missing labels or append new rows.
belong_refs = belong_df['belong_exp'].to_list()
belong_rows = []
for pos, reference in enumerate(belong_refs):
    ranked = np.argsort(-exp.values[pos])  # largest SHAP value first
    top_keywords = ' '.join(exp.data[pos][ranked][:10])

    # ROUGE of the 10 highest-ranked tokens (hypothesis) against the reference text.
    scores = rouge.get_scores(top_keywords, reference)[0]
    row = {'top_keywords': top_keywords}
    for metric, stats in scores.items():
        for stat, value in stats.items():
            row[f'{metric}_{stat}'] = value
    belong_rows.append(row)

# Attach all new columns at once, aligned on the frame's own index. `assign`
# overwrites same-named columns, so re-running the cell stays safe.
belong_metrics = pd.DataFrame(belong_rows, index=belong_df.index)
belong_df = belong_df.assign(
    **{column: belong_metrics[column] for column in belong_metrics.columns}
)

belong_df.to_csv('../results/belong_shap.csv', index=False)

In [None]:
# Explain output index 1 of the classifier ("burden") for every burden-positive text.
burden_explainer = shap.Explainer(lambda x: shap_pred(x, 1), tokenizer)
exp = burden_explainer(burden_df['text'].to_list())

# Rows are walked by POSITION. `exp` is ordered like the list passed to the explainer,
# while burden_df's index labels are not guaranteed to be 0..n-1 (rows are filtered
# with dropna when the frame is built). Label lookups with a running counter would
# then hit missing labels or append new rows.
burden_refs = burden_df['burden_exp'].to_list()
burden_rows = []
for pos, reference in enumerate(burden_refs):
    ranked = np.argsort(-exp.values[pos])  # largest SHAP value first
    top_keywords = ' '.join(exp.data[pos][ranked][:10])

    # ROUGE of the 10 highest-ranked tokens (hypothesis) against the reference text.
    scores = rouge.get_scores(top_keywords, reference)[0]
    row = {'top_keywords': top_keywords}
    for metric, stats in scores.items():
        for stat, value in stats.items():
            row[f'{metric}_{stat}'] = value
    burden_rows.append(row)

# Attach all new columns at once, aligned on the frame's own index. `assign`
# overwrites same-named columns, so re-running the cell stays safe.
burden_metrics = pd.DataFrame(burden_rows, index=burden_df.index)
burden_df = burden_df.assign(
    **{column: burden_metrics[column] for column in burden_metrics.columns}
)

burden_df.to_csv('../results/burden_shap.csv', index=False)