In [None]:
# drive mount. colab에 내 구글 드라이브 연결
from google.colab import drive
drive.mount('/content/drive')

# clone git repo
!git clone https://github.com/hila-chefer/Transformer-Explainability.git

# change directory
import os
os.chdir(f'./Transformer-Explainability')


!pip install torch==1.7.0 torchvision==0.8.1 # 일반 GPU/CPU를 사용하는 경우
#!pip install torch==1.7.0+cu110 torchvision==0.8.1+cu110 torchaudio==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html  # GPU A100을 사용하는 경우
!pip install transformers==3.5.1
!pip install captum
!pip install matplotlib==3.2.2 &> /dev/null

Mounted at /content/drive
Cloning into 'Transformer-Explainability'...
remote: Enumerating objects: 377, done.[K
remote: Counting objects: 100% (152/152), done.[K
remote: Compressing objects: 100% (78/78), done.[K
remote: Total 377 (delta 127), reused 74 (delta 74), pack-reused 225[K
Receiving objects: 100% (377/377), 3.83 MiB | 17.22 MiB/s, done.
Resolving deltas: 100% (190/190), done.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch==1.7.0
  Downloading torch-1.7.0-cp37-cp37m-manylinux1_x86_64.whl (776.7 MB)
[K     |████████████████████████████████| 776.7 MB 4.3 kB/s 
[?25hCollecting torchvision==0.8.1
  Downloading torchvision-0.8.1-cp37-cp37m-manylinux1_x86_64.whl (12.7 MB)
[K     |████████████████████████████████| 12.7 MB 59.3 MB/s 
Collecting dataclasses
  Downloading dataclasses-0.6-py3-none-any.whl (14 kB)
Installing collected packages: dataclasses, torch, torchvision
  Attempting uninstall: torch
    Fou

In [None]:
##### 시각화 관련 필수 라이브러리
import torch

from transformers import BertTokenizer
from transformers import AutoTokenizer  # bert 모델에 따라 알맞은 tokenizer를 자동으로 로드

from BERT_explainability.modules.BERT.ExplanationGenerator import Generator
from BERT_explainability.modules.BERT.BertForSequenceClassification import BertForSequenceClassification

from captum.attr import visualization # XAI관련 라이브러리의 시각화 함수
#####


from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup

from sklearn.metrics import accuracy_score

import os
import json
import pickle
import numpy as np
import random
import gzip
from collections import OrderedDict

In [None]:
# Select the GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Sanity check: print the torch version and place a small tensor on the
# selected device. `.to(device)` is used instead of `.cuda()` so the check
# also works on the CPU fallback path.
print(torch.__version__)
print(torch.tensor([1.0, 2.0]).to(device))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4
1.7.0
tensor([1., 2.], device='cuda:0')


In [None]:
def preprocess_amazon_from_gzip(json_filepath, data_num=None):
  """Read review records from a gzip-compressed JSON-lines file.

  Each non-blank line is parsed as one JSON object. A record is kept only if
  its "reviewText", "overall" and "asin" fields can be read and "overall" can
  be converted with int(); other records are skipped.

  Args:
    json_filepath: path to the gzip file, one JSON object per line.
    data_num: optional cap on the number of records collected. A falsy value
      (None or 0) means no cap.

  Returns:
    A tuple (review_texts, ratings, products) of three equally long lists.
  """
  review_texts = []
  ratings = []
  products = []

  with gzip.open(json_filepath, "rb") as f:
    for line in f:
      # A blank line (e.g. a trailing newline) carries no record.
      if not line.strip():
        continue
      info = json.loads(line)

      # Only swallow the errors that a missing or malformed field can raise;
      # a bare `except` would also hide KeyboardInterrupt and real bugs.
      try:
        review_text = info["reviewText"]
        rating = int(info["overall"])
        product = info["asin"]
      except (KeyError, TypeError, ValueError):
        continue

      review_texts.append(review_text)
      ratings.append(rating)
      products.append(product)
      if data_num and len(review_texts) == data_num:
        break

  print(f'collected {len(review_texts)} datas')
  return review_texts, ratings, products


def generate_tokenized_dataset(review_texts, ratings):
  """Tokenize every review with the 'bert-base-uncased' tokenizer.

  Args:
    review_texts: iterable of review strings, encoded one at a time.
    ratings: labels, placed unchanged as the third row of the result.

  Returns:
    The result of np.array([input_ids, attention_masks, ratings]), where the
    first two entries are lists holding one 'pt' tensor per review.
  """
  bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

  # NOTE(review): each review is encoded on its own, so padding=True
  # presumably has nothing longer to pad against and tensors keep differing
  # lengths; confirm whether padding='max_length' was intended.
  encode_options = dict(
      add_special_tokens=True,
      max_length=512,
      truncation=True,
      padding=True,
      return_attention_mask=True,
      return_tensors='pt',
  )

  input_ids, attention_masks = [], []
  for count, text in enumerate(review_texts, start=1):
    encoded = bert_tokenizer.encode_plus(text, **encode_options)
    input_ids.append(encoded['input_ids'])
    attention_masks.append(encoded['attention_mask'])

    # Progress message every 100000 encoded reviews.
    if count % 100000 == 0:
      print(f"tonkenized {count} datas...")

  # NOTE(review): how numpy packs lists of unequal-length tensors depends on
  # the numpy version; verify the resulting array layout before relying on it.
  dataset = np.array([input_ids, attention_masks, ratings])
  print("dataset is gernerated")
  return dataset


def interpret_sentence(model, expl_generator, tokenizer, sentence, target_class=None, padding=False):
  """Produce tokens, class probabilities and an LRP explanation for one sentence.

  Args:
    model: classifier called as model(input_ids=..., attention_mask=...);
      element [0] of its return value is treated as the logits.
    expl_generator: object exposing generate_LRP(input_ids, attention_mask,
      start_layer, index).
    tokenizer: callable tokenizer that also provides convert_ids_to_tokens.
    sentence: input text.
    target_class: class index to explain; defaults to the predicted class.
    padding: if True, pad the encoded input to max_length (512).

  Returns:
    (tokens, output, expl, pred_class, target_class) where `output` is the
    softmax output as a numpy array and `expl` holds the relevance scores
    min-max scaled to [0, 1] (all zeros when every score is identical).

  Relies on the module-level `device` for tensor placement.
  """
  # Tokenize to obtain token ids and the attention mask. Truncation is
  # requested explicitly in both modes so the input never exceeds max_length,
  # independent of what the tokenizer does when only max_length is given.
  tokenizer_kwargs = dict(max_length=512, truncation=True, return_tensors='pt')
  if padding:
    tokenizer_kwargs['padding'] = "max_length"
  encoding = tokenizer(sentence, **tokenizer_kwargs)
  input_ids = encoding['input_ids'].to(device)
  attention_mask = encoding['attention_mask'].to(device)
  tokens = tokenizer.convert_ids_to_tokens(input_ids.flatten())

  # Model output -> class probabilities.
  logits = model(input_ids=input_ids, attention_mask=attention_mask)[0]
  output = torch.nn.functional.softmax(logits, dim=-1)
  pred_class = output.argmax(dim=-1).item()
  output = output.detach().cpu().numpy()
  if target_class is None:
    target_class = pred_class  # by default, explain the predicted label

  # Generate the explanation for the target class.
  expl = expl_generator.generate_LRP(input_ids=input_ids, attention_mask=attention_mask, start_layer=11, index=target_class)[0]
  expl = expl.detach().cpu().numpy()

  # Min-max normalize the scores. When all scores are equal the range is zero
  # and the division would yield NaNs, so return zeros in that case.
  # NOTE: this differs from https://captum.ai/tutorials/IMDB_TorchText_Interpret,
  # which uses L2 normalization; here the maximum is scaled to 1.
  score_range = expl.max() - expl.min()
  if score_range == 0:
    expl = np.zeros_like(expl)
  else:
    expl = (expl - expl.min()) / score_range

  return tokens, output, expl, pred_class, target_class


def interpret_all_sentences(model, expl_generator, tokenizer, sentences, ratings, products):
  """Generate an explanation record for every sentence in the dataset.

  Args:
    model, expl_generator, tokenizer: forwarded to interpret_sentence.
    sentences: sequence of input texts.
    ratings: sequence indexed in parallel with `sentences`; each rating is
      also stored as the true class.
    products: sequence indexed in parallel with `sentences`.

  Returns:
    An object-dtype numpy array of shape (len(sentences), 8). Each row is
    [tokens, rating, product, output, expl, true_class, pred_class, target_class].
  """
  records = []

  for i, sentence in enumerate(sentences):
    rating = ratings[i]
    product = products[i]

    # Explanation for this sentence (explains the predicted class).
    tokens, output, expl, pred_class, target_class = interpret_sentence(model, expl_generator, tokenizer, sentence)

    # The rating is used directly as the true label.
    true_class = rating

    records.append([tokens, rating, product, output, expl, true_class, pred_class, target_class])

  # Rows mix lists, arrays and scalars of varying length. Build the object
  # array explicitly rather than relying on np.array() to infer a ragged
  # layout, which is deprecated and raises on newer numpy versions.
  packed = np.empty((len(records), 8), dtype=object)
  for row_idx, row in enumerate(records):
    for col_idx, value in enumerate(row):
      packed[row_idx, col_idx] = value
  return packed


def visualize_expl(records, visualization_num):
  """Render explanations with captum's text visualization.

  Args:
    records: the output of interpret_all_sentences, or the path of a .npy
      file in which that output was saved.
    visualization_num: number of leading records to display; capped at the
      number of records available.
  """
  # If `records` is a file path, load it first.
  # NOTE(security): allow_pickle=True unpickles arbitrary objects, which can
  # execute code; only load files that you created yourself.
  if isinstance(records, str): records = np.load(records, allow_pickle=True)

  # Never index past the end when fewer records exist than requested.
  visualization_num = min(visualization_num, len(records))

  vis_datas = []
  for i in range(visualization_num):
    tokens, rating, product, output, expl, true_class, pred_class, target_class = records[i]
    # For target classes below 3 the scores are negated so that they are drawn
    # in red. A new array is created instead of `expl *= -1` so the caller's
    # records are not modified and repeated calls give the same picture.
    if target_class < 3:
      expl = expl * -1

    # Build the visualization record.
    vis_datas.append(visualization.VisualizationDataRecord(
                                  expl,
                                  output[0][pred_class],
                                  pred_class,
                                  true_class,
                                  target_class,
                                  expl.sum(),
                                  tokens,
                                  1))

  # visualize
  visualization.visualize_text(vis_datas)

In [None]:
def generate_expl(model_name, data_name, data_num):
  """Load data and a fine-tuned model, generate LRP explanations, save and show them.

  Args:
    model_name: file name of the state dict under <root>/colab/model/.
    data_name: file name of the gzip review file under <root>/dataset/.
    data_num: maximum number of reviews to read.

  Side effects:
    Saves the records to
    <root>/colab/explanation/amazon_book_expl_LRP_only10000.npy (the file name
    is fixed regardless of data_num) and displays up to 500 explanations.
  """
  root = "/content/drive/MyDrive/CS470_team_2in1"

  review_texts, ratings, products = preprocess_amazon_from_gzip(root+'/dataset/'+data_name, data_num)

  # num_labels=6: presumably so ratings 1-5 can be used directly as class
  # indices -- verify against the training setup.
  model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)
  # Use the selected device rather than .cuda() so the CPU fallback works too.
  model.to(device)
  model.load_state_dict(torch.load(root+'/colab/model/'+model_name, map_location=device))
  model.eval()

  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

  expl_generator = Generator(model)

  records = interpret_all_sentences(model, expl_generator, tokenizer, review_texts, ratings, products)

  # Make sure the output directory exists so the computed records are not
  # lost to a failing save.
  out_dir = root+"/colab/explanation/"
  os.makedirs(out_dir, exist_ok=True)
  np.save(out_dir+"amazon_book_expl_LRP_only10000.npy", records)

  # Show at most 500 records, fewer when the dataset is smaller.
  visualize_expl(records, min(500, len(records)))


In [None]:
#generate_expl("test.pt", "Books_5.json.gz", 10000)

collected 10000 datas


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
5.0,5 (0.92),5.0,30.8,"[CLS] the king , the mice and the cheese by nancy gurney is an excellent children ' s book . it is one that i well remember from my own childhood and purchased for my daughter who loves it . it is about a king who has trouble with rude mice eating his cheese . he consult ##s his wise men and they suggest cats to chase away the mice . the cats become a nu ##isance , so the wise men recommend the king bring in dogs to chase the cats away . the cycle goes on until the mice are finally brought back to chase away the elephants , brought in to chase away the lions that ' d chased away the dogs . the story ends in compromise and friendship between the mice and the king . the story also teaches cause and effect relationships . the pictures that accompany the story are humorous and memorable . i was thrilled to discover that it is back in print . i * highly * recommend it for children ages 2 to 7 . [SEP]"
,,,,
5.0,5 (0.97),5.0,3.5,[CLS] the kids loved it ! [SEP]
,,,,
5.0,5 (0.97),5.0,9.5,[CLS] my students ( 3 & 4 year olds ) loved this book ! definitely recommend it to other teachers . [SEP]
,,,,
5.0,5 (0.97),5.0,1.52,[CLS] love it [SEP]
,,,,
5.0,5 (0.98),5.0,1.45,[CLS] great ! [SEP]
,,,,


In [None]:
#records = np.load('/content/drive/MyDrive/CS470_team_2in1/colab/explanation/amazon_book_expl_LRP_only10000.npy', allow_pickle=True)
#print(records[:10])

[[list(['[CLS]', 'the', 'king', ',', 'the', 'mice', 'and', 'the', 'cheese', 'by', 'nancy', 'gurney', 'is', 'an', 'excellent', 'children', "'", 's', 'book', '.', 'it', 'is', 'one', 'that', 'i', 'well', 'remember', 'from', 'my', 'own', 'childhood', 'and', 'purchased', 'for', 'my', 'daughter', 'who', 'loves', 'it', '.', 'it', 'is', 'about', 'a', 'king', 'who', 'has', 'trouble', 'with', 'rude', 'mice', 'eating', 'his', 'cheese', '.', 'he', 'consult', '##s', 'his', 'wise', 'men', 'and', 'they', 'suggest', 'cats', 'to', 'chase', 'away', 'the', 'mice', '.', 'the', 'cats', 'become', 'a', 'nu', '##isance', ',', 'so', 'the', 'wise', 'men', 'recommend', 'the', 'king', 'bring', 'in', 'dogs', 'to', 'chase', 'the', 'cats', 'away', '.', 'the', 'cycle', 'goes', 'on', 'until', 'the', 'mice', 'are', 'finally', 'brought', 'back', 'to', 'chase', 'away', 'the', 'elephants', ',', 'brought', 'in', 'to', 'chase', 'away', 'the', 'lions', 'that', "'", 'd', 'chased', 'away', 'the', 'dogs', '.', 'the', 'story', '