In [None]:
# Mount Google Drive into the Colab runtime so files under /content/drive are accessible.
from google.colab import drive
drive.mount('/content/drive')

# Clone the Transformer-Explainability repository
# (presumably the source of the BERT_explainability imports used later - verify).
!git clone https://github.com/hila-chefer/Transformer-Explainability.git

# Change the working directory into the cloned repository.
# NOTE(review): the path is relative, so this cell assumes it is run once from the
# directory that contains the fresh clone; re-running it from inside the repo would
# look for a nested ./Transformer-Explainability directory.
import os
os.chdir(f'./Transformer-Explainability')

# Install libraries; all installer output is redirected to /dev/null.
!pip install -r requirements.txt &> /dev/null
!pip install captum &> /dev/null
!pip install matplotlib==3.2.2 &> /dev/null

Mounted at /content/drive
Cloning into 'Transformer-Explainability'...
remote: Enumerating objects: 377, done.[K
remote: Counting objects: 100% (152/152), done.[K
remote: Compressing objects: 100% (78/78), done.[K
remote: Total 377 (delta 127), reused 74 (delta 74), pack-reused 225[K
Receiving objects: 100% (377/377), 3.83 MiB | 37.39 MiB/s, done.
Resolving deltas: 100% (190/190), done.


# 새 섹션

In [None]:
### transformer 및 설명 생성을 위한 라이브러리
import torch

from transformers import BertTokenizer
from transformers import AutoTokenizer  # bert 모델에 따라 알맞는 tokenizer를 자동으로 로드

from BERT_explainability.modules.BERT.ExplanationGenerator import Generator
from BERT_explainability.modules.BERT.BertForSequenceClassification import BertForSequenceClassification

from captum.attr import visualization # XAI관련 라이브러리의 시각화 함수


### 아마존 데이터셋 분석을 위해 추가한 라이브러리
import json
import pickle
import numpy as np

In [None]:
# Disabled code kept inside a bare string literal: none of it executes. Because the
# string is the last expression of the cell, the notebook echoes it back as the cell
# output. generate_LRP_amazon performs the same model/tokenizer/generator setup.
"""
model = BertForSequenceClassification.from_pretrained("textattack/bert-base-uncased-SST-2").to("cuda")
tokenizer = AutoTokenizer.from_pretrained("textattack/bert-base-uncased-SST-2")

model.eval()  # model을 evaluation mode로 전환
explanations = Generator(model)  # 설명 생성 객체 초기화
"""

'\nmodel = BertForSequenceClassification.from_pretrained("textattack/bert-base-uncased-SST-2").to("cuda")\ntokenizer = AutoTokenizer.from_pretrained("textattack/bert-base-uncased-SST-2")\n\nmodel.eval()  # model을 evaluation mode로 전환\nexplanations = Generator(model)  # 설명 생성 객체 초기화\n'

In [None]:
def preprocess_amazon(json_filename = 'AMAZON_FASHION_5.json', root = '/content/drive/MyDrive/CS470_team_2in1'):
  """Read review text, rating and product id from a JSON-lines review file.

  The file is read from ``<root>/dataset/<json_filename>``; every non-blank line
  must be one JSON document. A line is skipped when it lacks the ``reviewText``,
  ``overall`` or ``asin`` key, or when ``overall`` cannot be converted to ``int``.

  Args:
    json_filename: Name of the file inside the ``dataset`` directory.
    root: Directory that contains the ``dataset`` directory. Defaults to the
      project folder on the mounted Google Drive.

  Returns:
    A tuple ``(review_texts, ratings, products)`` of three equally long lists:
    the ``reviewText`` values, the ``overall`` values converted to ``int`` and
    the ``asin`` values, in file order.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  review_texts = []
  ratings = []
  products = []

  dataset_path = root + '/dataset/' + json_filename
  # Explicit encoding so the result does not depend on the runtime's locale default.
  with open(dataset_path, "r", encoding="utf-8") as json_file:
    for line in json_file:
      # A blank line (e.g. a trailing newline) is not a record; skip it instead of
      # letting json.loads fail on it.
      if not line.strip():
        continue
      info = json.loads(line)

      try:
        review_text = info["reviewText"]
        rating = int(info["overall"])
        product = info["asin"]
      except (KeyError, TypeError, ValueError, OverflowError):
        # Incomplete or malformed record: skip it. Only the errors that the three
        # lookups/conversion can raise are caught, so interrupts and unrelated
        # failures still propagate.
        continue

      review_texts.append(review_text)
      ratings.append(rating)
      products.append(product)

  print(f'collected {len(review_texts)} datas')
  return review_texts, ratings, products

In [None]:
# Produce the tokens, the prediction and the explanation for a single sentence.
def interpret_sentence(model, expl_generator, tokenizer, sentence, target_class=None):
  """Classify one sentence and compute a per-token relevance explanation for it.

  Args:
    model: Sequence-classification model; called with ``input_ids`` and
      ``attention_mask`` and expected to return the logits as its first element.
    expl_generator: Object exposing ``generate_LRP(input_ids=..., attention_mask=...,
      start_layer=..., index=...)``.
    tokenizer: Callable tokenizer that also provides ``convert_ids_to_tokens``.
    sentence: Text to classify and explain.
    target_class: Class index to explain. ``None`` (default) explains the
      predicted class.

  Returns:
    A tuple ``(tokens, output, expl, pred_class, target_class)``:
    the token strings, the softmax probabilities as a NumPy array, the
    relevance scores min-max scaled to ``[0, 1]`` as a NumPy array, the predicted
    class index, and the class index that was explained.
  """
  # Run on whatever device the model lives on instead of assuming a GPU.
  device = next(model.parameters()).device

  # Tokenize to obtain the token ids and the attention mask. When the model
  # advertises a maximum number of positions, truncate to it so that an over-long
  # text does not overflow the position embeddings.
  max_positions = getattr(getattr(model, "config", None), "max_position_embeddings", None)
  if max_positions is not None:
    encoding = tokenizer(sentence, max_length=max_positions, truncation=True, return_tensors='pt')
  else:
    encoding = tokenizer(sentence, return_tensors='pt')
  input_ids = encoding['input_ids'].to(device)
  attention_mask = encoding['attention_mask'].to(device)
  tokens = tokenizer.convert_ids_to_tokens(input_ids.flatten())

  # Model prediction. Deliberately not wrapped in torch.no_grad():
  # NOTE(review): the explainability model may register gradient hooks during its
  # forward pass, which would fail without autograd - confirm before changing.
  logits = model(input_ids=input_ids, attention_mask=attention_mask)[0]
  output = torch.nn.functional.softmax(logits, dim=-1)
  pred_class = output.argmax(dim=-1).item()
  output = output.detach().cpu().numpy()
  if target_class is None:
    target_class = pred_class  # explain the predicted label by default

  # Generate the explanation for the chosen class.
  expl = expl_generator.generate_LRP(input_ids=input_ids, attention_mask=attention_mask, start_layer=11, index=target_class)[0]
  expl = expl.detach().cpu().numpy()

  # Min-max scale the scores to [0, 1]. When every score is identical the range is
  # zero; return all zeros in that case instead of dividing by zero (NaN scores).
  # Note: this differs from the L2 normalization used in the Captum IMDB tutorial
  # (https://captum.ai/tutorials/IMDB_TorchText_Interpret).
  shifted = expl - expl.min()
  value_range = shifted.max()
  expl = shifted / value_range if value_range > 0 else shifted

  return tokens, output, expl, pred_class, target_class



# Generate explanations for the whole dataset.
def interpret_all_sentences(model, expl_generator, tokenizer, sentences, ratings, products):
  """Run interpret_sentence on every sentence and collect one record per sentence.

  Args:
    model, expl_generator, tokenizer: Passed through to interpret_sentence.
    sentences: Sequence of texts to explain.
    ratings: Ratings aligned with ``sentences``; a rating ``>= 3.0`` is labelled
      as class 1, anything lower as class 0.
    products: Product ids aligned with ``sentences``.

  Returns:
    A NumPy object array of shape ``(len(sentences), 8)``. Each row holds
    ``[tokens, rating, product, output, expl, true_class, pred_class, target_class]``.
  """
  field_count = 8  # tokens, rating, product, output, expl, true_class, pred_class, target_class
  records = []

  for i, sentence in enumerate(sentences):
    rating = ratings[i]
    product = products[i]

    # Explanation for this sentence (explains the predicted class).
    tokens, output, expl, pred_class, target_class = interpret_sentence(model, expl_generator, tokenizer, sentence)

    # Derive the true label from the rating.
    true_class = 1 if rating>=3.0 else 0

    records.append([tokens, rating, product, output, expl, true_class, pred_class, target_class])

  # The rows mix lists, strings, scalars and arrays of different lengths. Letting
  # np.array() infer a shape from such ragged data raises ValueError on NumPy >= 1.24
  # (and only warned before), so fill an object array cell by cell instead.
  record_array = np.empty((len(records), field_count), dtype=object)
  for row, record in enumerate(records):
    for col, value in enumerate(record):
      record_array[row, col] = value

  return record_array

In [None]:
# Visualize explanations from the output of interpret_all_sentences, or from the
# path of a file in which that output was saved.
def visualize_expl(records, visualization_num):
  """Render the first ``visualization_num`` explanation records with Captum.

  Args:
    records: Either the record array produced by interpret_all_sentences, or the
      path (``str``) of a ``.npy`` file containing it.
    visualization_num: Maximum number of records to render. If fewer records are
      available, all of them are rendered.

  Returns:
    None. The visualization is displayed by ``visualization.visualize_text``.
  """
  # When a file path is given, load the records from it.
  if isinstance(records, str):
    # SECURITY: allow_pickle=True unpickles the file contents, which can execute
    # arbitrary code. Only load files that this project produced itself.
    records = np.load(records, allow_pickle=True)

  # Never index past the end when fewer records exist than requested.
  shown = min(visualization_num, len(records))

  vis_datas = []
  for i in range(shown):
    tokens, rating, product, output, expl, true_class, pred_class, target_class = records[i]
    if target_class==0:
      # Negate so that explanations of the negative class are drawn in red.
      # A new array is created on purpose: negating in place would modify the
      # caller's records and flip the sign back on the next call.
      expl = -expl

    # Build the Captum visualization record.
    vis_datas.append(visualization.VisualizationDataRecord(
                                  expl,
                                  output[0][pred_class],
                                  pred_class,
                                  true_class,
                                  target_class,
                                  expl.sum(),
                                  tokens,
                                  1))

  # visualize
  visualization.visualize_text(vis_datas)

In [None]:
def generate_LRP_amazon(record_save_dir, json_filename):
  """Generate, save and preview explanations for an Amazon review file.

  Loads the reviews with preprocess_amazon, explains every review with the
  ``textattack/bert-base-uncased-SST-2`` classifier, saves the resulting record
  array to ``<record_save_dir>/LRP_amazon.npy`` and renders up to ten records.

  Args:
    record_save_dir: Existing directory in which ``LRP_amazon.npy`` is written.
    json_filename: Review file name, passed to preprocess_amazon.
  """
  # preprocess
  review_texts, ratings, products = preprocess_amazon(json_filename)

  # load model, tokenizer, explanation generator
  model = BertForSequenceClassification.from_pretrained("textattack/bert-base-uncased-SST-2").to("cuda")
  model.eval()  # switch the model to evaluation mode
  tokenizer = AutoTokenizer.from_pretrained("textattack/bert-base-uncased-SST-2")
  expl_generator = Generator(model)  # explanation generator

  # generate and save the explanations
  records = interpret_all_sentences(model, expl_generator, tokenizer, review_texts, ratings, products)
  np.save(record_save_dir+"/"+"LRP_amazon.npy", records)

  # visualize: visualize_expl takes exactly (records, visualization_num); the
  # count is capped so a dataset with fewer than ten records still renders.
  visualize_expl(records, min(10, len(records)))

In [None]:
# Generation step, left disabled: it would recompute the explanations and write
# LRP_amazon.npy into the given directory.
#generate_LRP_amazon('/content/drive/MyDrive/CS470_team_2in1/colab', 'AMAZON_FASHION_5.json')
# Render the first 100 records from the previously saved explanation file.
# NOTE(review): the recorded output of this cell is a TypeError; its cause is not
# visible from the notebook text - re-run and inspect the traceback.
visualize_expl('/content/drive/MyDrive/CS470_team_2in1/colab/LRP_amazon.npy',100)

TypeError: ignored

In [None]:
# Load the saved explanation records and print the first ten rows for inspection.
# allow_pickle=True unpickles the file contents (presumably required because the
# file holds an object array); only load files this project produced itself.
records = np.load('/content/drive/MyDrive/CS470_team_2in1/colab/LRP_amazon.npy', allow_pickle=True)
print(records[:10])