# 0. Load required modules.

In [1]:
import pickle

import datasets
import numpy as np
import torch
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm
from transformers import BertTokenizer, BertModel, BertConfig, BertForSequenceClassification

# 1. Load the generated dataset.

In [2]:
def _load_pickle(path):
    """Return the object unpickled from the binary file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load the three pickled objects consumed by the attention-extraction cell.
data_pos = _load_pickle('./data_pos.pickle')
data_set1 = _load_pickle('./data_set1.pickle')
data_set2 = _load_pickle('./data_set2.pickle')

# 2. Load the trained model.

In [9]:
# Checkpoint, config and vocabulary files of the trained model.
# NOTE(review): these are absolute, machine-specific paths; the vocabulary is
# read from `bert/qnli` while the weights and config come from `bert_tf/qnli`
# -- confirm that this mismatch is intended.
model_file = '/home/skhong/WordImportance/bert_tf/qnli/pytorch_model.bin'
config_file = '/home/skhong/WordImportance/bert_tf/qnli/config.json'
vocab_file = '/home/skhong/WordImportance/bert/qnli/vocab.txt'
model_version = 'bert-base-uncased'  # not referenced by the cells visible here

# Build the architecture from the JSON config, then fill in the saved weights.
config = BertConfig.from_json_file(config_file)
model = BertForSequenceClassification(config)

# map_location='cpu' lets a checkpoint that was saved from a GPU load on a
# machine without CUDA. The model is never moved off the CPU in this notebook,
# so the weights end up on the same device either way.
state_dict = torch.load(model_file, map_location='cpu')
model.load_state_dict(state_dict)

tokenizer = BertTokenizer(vocab_file)

# 3. Extract attention values between tokens.
- For models with WI added, include the WI input when running the model and extract the attention values; for models without WI, run the model with the WI input excluded and extract the attention values.

In [10]:
# Build the feature matrix X and the label vector Y from attention weights.
# Every sample contributes two rows:
#   label 1 -> attention from position i_pos to j_pos         (from data_set1)
#   label 0 -> attention from position i_pos to j_random_pos  (from data_set2)
# A row holds that single attention weight for every (layer, head) pair,
# ordered layer-major.
X = []
Y = []

model.eval()

for i, data in tqdm(enumerate(data_pos), total=len(data_pos)):
    input_ids = data[0]
    token_type_ids = data[1]
    tfidf_ids = data[2]

    i_pos = data_set1[i][0]
    j_pos = data_set1[i][1]
    j_random_pos = data_set2[i][1]

    try:
        with torch.no_grad():
            pred = model(input_ids, token_type_ids=token_type_ids, tfidf_ids=tfidf_ids)
    except Exception as exc:
        # Report the failure and skip this sample. Falling through would read
        # `pred` from the previous iteration (or hit a NameError on the first
        # one) and silently add wrong rows to X.
        print(input_ids.shape)
        print(f'Skipping sample {i}: {exc!r}')
        continue

    temp_x1 = []
    temp_x2 = []

    # NOTE(review): `pred.attentions` is only usable when the model is set up
    # to return attentions -- confirm that config.json enables it.
    # One attention tensor per layer; presumably shaped (1, heads, seq, seq).
    for layer_attention in pred.attentions:
        layer_heads = layer_attention.detach().cpu().numpy()
        # Drop the leading singleton batch axis without hard-coding the head
        # count or sequence length (the original reshaped to (12, 512, 512)).
        # Like the original reshape, this raises if the batch size is not 1.
        layer_heads = layer_heads.reshape(layer_heads.shape[-3:])

        for head_attention in layer_heads:
            temp_x1.append(head_attention[i_pos][j_pos])
            temp_x2.append(head_attention[i_pos][j_random_pos])

    Y.append(1)
    X.append(temp_x1)
    Y.append(0)
    X.append(temp_x2)

X = np.array(X)
Y = np.array(Y)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i, data in tqdm_notebook(enumerate(data_pos), total=len(data_pos)):


  0%|          | 0/2000 [00:00<?, ?it/s]

In [10]:
# Sanity check: print the label array Y.
print(Y)

[]


# 4. Performance Evaluation Based on the WI Validation Model

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Shuffled 80/20 split, stratified on the labels, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    shuffle=True,
    stratify=Y,
    random_state=34,
)

# Hyperparameters of the random forest, gathered in one place.
forest_settings = {
    'n_estimators': 50,
    'criterion': 'entropy',
    'max_depth': 5,
    'max_features': 'sqrt',
    'max_samples': 0.9,
    'bootstrap': True,
    'oob_score': True,
    'random_state': 100,
}
clf = RandomForestClassifier(**forest_settings)
clf.fit(x_train, y_train)

## Performance Evaluation
# Score the fitted classifier on the held-out split.
test_accuracy = clf.score(x_test, y_test)
print('ACC : ', test_accuracy)


ACC :  0.81
