In [None]:
import pickle
from pathlib import Path
import numpy as np
import scipy as sp

import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import random
from tqdm import tqdm

# Index of the CUDA device to use when a GPU is available; otherwise run on the CPU.
gpu = "0"
if torch.cuda.is_available():
    device = torch.device(f"cuda:{gpu}")
else:
    device = torch.device("cpu")

In [None]:
# Root directory that is searched recursively for pickled inference outputs.
RESULTS_DIR = Path("/home/ec2-user/SageMaker/halu_code/results/")
# rglob makes no ordering guarantee; sorting keeps the processing order
# (and therefore the order of downstream results) stable between runs.
inference_results = sorted(RESULTS_DIR.rglob("*.pickle"))
print(inference_results)

In [None]:
def check_capitals(response, answer):
    """Return True when the leading part of ``answer`` occurs in ``response``.

    Only the text before the first comma of ``answer`` is matched; an answer
    without a comma is matched in full. The comparison is case-sensitive.
    """
    # partition() returns the whole string as the head when no comma is present.
    leading_part, _, _ = answer.partition(',')
    return leading_part in response

In [None]:
class FFHallucinationClassifier(torch.nn.Module):
    """Feed-forward two-class classifier over a fixed-size feature vector.

    The network is Linear(input_shape, 256) -> 128 -> 64 -> 2, with ReLU and
    dropout after each of the three hidden layers.
    Reference: https://arxiv.org/pdf/2304.13734.pdf

    Args:
        input_shape: number of input features per example.
        dropout: dropout probability used after every hidden ReLU.
    """
    def __init__(self, input_shape, dropout = 0.5):
        super().__init__()
        self.dropout = dropout

        # Use torch.nn directly; the previous ``torch.torch.nn`` spelling only
        # worked because the package happens to expose itself as an attribute.
        self.linear_relu_stack = torch.nn.Sequential(
            torch.nn.Linear(input_shape, 256),
            torch.nn.ReLU(),
            torch.nn.Dropout(self.dropout),
            torch.nn.Linear(256, 128),
            torch.nn.ReLU(),
            torch.nn.Dropout(self.dropout),
            torch.nn.Linear(128, 64),
            torch.nn.ReLU(),
            torch.nn.Dropout(self.dropout),
            torch.nn.Linear(64, 2)
            )

    def forward(self, x):
        """Return the unnormalised two-class logits for ``x``."""
        logits = self.linear_relu_stack(x)
        return logits
    
class RNNHallucinationClassifier(torch.nn.Module):
    """Recurrent two-class classifier over a sequence of scalar features.

    A 4-layer unidirectional GRU (hidden size 128, ``batch_first=True``, one
    input feature per time step) followed by a linear head with two outputs.

    Args:
        dropout: dropout probability between the stacked GRU layers.
    """
    def __init__(self, dropout=0.0):
        super().__init__()
        hidden_dim = 128
        num_layers = 4
        # The attribute is called ``lstm`` although the module is a GRU; the
        # name is kept so existing state_dict keys and callers keep working.
        self.lstm = torch.nn.GRU(1, hidden_dim, num_layers, dropout=dropout, batch_first=True, bidirectional=False)
        self.linear = torch.nn.Linear(hidden_dim, 2)

    def forward(self, seq):
        """Return the two logits for the last time step of the LAST batch row.

        ``seq`` is fed to a ``batch_first`` GRU with input size 1, i.e. it is
        expected as (batch, time, 1). Only one row is scored per call, so
        callers pass a batch of one sequence and stack the results themselves.
        """
        gru_out, _ = self.lstm(seq)
        # Project only the hidden state that is actually returned instead of
        # every time step of every row; the resulting logits are the same.
        return self.linear(gru_out[-1, -1, :])

In [None]:
def gen_classifier_roc(inputs, labels=None, n_steps=1001, batch_size=512, lr=0.00001, weight_decay=0.01):
    """Train a feed-forward probe on ``inputs`` and score it on a held-out split.

    Args:
        inputs: 2-D array of features, one row per example.
        labels: per-example labels, castable to 0/1 ints. When omitted, the
            notebook-level ``correct`` array is used (the original behaviour).
        n_steps: number of optimiser steps.
        batch_size: maximum number of rows drawn at random for each step.
        lr: AdamW learning rate.
        weight_decay: AdamW weight decay.

    Returns:
        Tuple ``(roc_auc, accuracy)`` computed on a random 20% test split,
        where accuracy thresholds the class-1 probability at 0.5.

    Raises:
        ValueError: propagated from ``roc_auc_score`` when it cannot score
            the test labels (for example when they hold a single class).
    """
    if labels is None:
        # Backward compatible fallback on the global set by the processing loop.
        labels = correct
    X_train, X_test, y_train, y_test = train_test_split(inputs, np.asarray(labels).astype(int), test_size = 0.2)
    classifier_model = FFHallucinationClassifier(X_train.shape[1]).to(device)
    # Cast features to float32 so they match the dtype of the model weights
    # regardless of the precision in which the activations were stored.
    X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
    y_train = torch.tensor(y_train).to(torch.long).to(device)
    X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
    y_test = torch.tensor(y_test).to(torch.long).to(device)

    optimizer = torch.optim.AdamW(classifier_model.parameters(), lr=lr, weight_decay=weight_decay)

    classifier_model.train()
    for _ in range(n_steps):
        optimizer.zero_grad()
        # Random mini-batch without replacement; smaller when there are fewer rows.
        sample = torch.randperm(X_train.shape[0])[:batch_size]
        pred = classifier_model(X_train[sample])
        loss = torch.nn.functional.cross_entropy(pred, y_train[sample])
        loss.backward()
        optimizer.step()

    # Disable dropout and gradient tracking for evaluation.
    classifier_model.eval()
    with torch.no_grad():
        pred = torch.nn.functional.softmax(classifier_model(X_test), dim=1)
        prediction_classes = (pred[:,1]>0.5).type(torch.long).cpu()
        return roc_auc_score(y_test.cpu(), pred[:,1].cpu()), (prediction_classes.numpy()==y_test.cpu().numpy()).mean()

In [None]:
# Per-file classifier metrics keyed by results-file path. The processing loop
# skips paths already present here, so only reset this to start from scratch.
all_results = dict()

In [None]:
# For every results file: train small probes on each stored signal and record
# their held-out ROC-AUC / accuracy in all_results[results_file].
for results_file in tqdm(inference_results):
    # Resume support: files already present in all_results are not recomputed.
    if results_file in all_results:
        continue
    # Release the previous file's payload before loading the next one.
    results = None
    try:
        classifier_results = {}
        with open(results_file, "rb") as infile:
            # NOTE(security): unpickling can execute arbitrary code; only
            # point this notebook at trusted result files.
            results = pickle.load(infile)
        # gen_classifier_roc reads ``correct`` from the notebook namespace by
        # default, so it has to be set before any of the calls below.
        correct = np.array(results['correct'])
        if 'capitals' in results_file.stem:
            # Capitals files are re-labelled by substring match on the answer.
            correct = np.array([
                check_capitals(response, answer)
                for response, answer in zip(results['str_response'], results['answers'])
            ])

        # attributes
        X_train, X_test, y_train, y_test = train_test_split(results['attributes_first'], correct.astype(int), test_size = 0.2)
        batch_size = 512

        rnn_model = RNNHallucinationClassifier()
        optimizer = torch.optim.AdamW(rnn_model.parameters(), lr=0.00001, weight_decay=0.01)

        train_pairs = list(zip(X_train, y_train))
        # random.sample raises when asked for more items than are available,
        # which previously made small files disappear from the results.
        n_sampled = min(batch_size, len(train_pairs))
        for _ in range(10):
            x_sub, y_sub = zip(*random.sample(train_pairs, n_sampled))
            y_sub = torch.tensor(y_sub).to(torch.long)
            optimizer.zero_grad()
            # Each sequence is scored on its own as a (1, time, 1) batch.
            preds = torch.stack([rnn_model(torch.tensor(i).view(1, -1, 1).to(torch.float)) for i in x_sub])
            loss = torch.nn.functional.cross_entropy(preds, y_sub)
            loss.backward()
            optimizer.step()

        rnn_model.eval()
        with torch.no_grad():
            preds = torch.stack([rnn_model(torch.tensor(i).view(1, -1, 1).to(torch.float)) for i in X_test])
            preds = torch.nn.functional.softmax(preds, dim=1)
        prediction_classes = (preds[:,1]>0.5).type(torch.long).cpu()
        classifier_results['attribution_rnn_roc'] = roc_auc_score(y_test, preds[:,1].detach().cpu().numpy())
        classifier_results['attribution_rnn_acc'] = (prediction_classes.numpy()==y_test).mean()

        # logits: softmax of the logits row selected by start_pos
        first_logits = np.stack([sp.special.softmax(i[j]) for i,j in zip(results['logits'], results['start_pos'])])
        first_logits_roc, first_logits_acc = gen_classifier_roc(first_logits)
        classifier_results['first_logits_roc'] = first_logits_roc
        classifier_results['first_logits_acc'] = first_logits_acc

        # logits: softmax of the last logits row
        final_logits = np.stack([sp.special.softmax(i[-1]) for i in results['logits']])
        # Fixed: this probe used to be trained on first_logits by mistake.
        final_logits_roc, final_logits_acc = gen_classifier_roc(final_logits)
        classifier_results['final_logits_roc'] = final_logits_roc
        classifier_results['final_logits_acc'] = final_logits_acc

        # fully connected and attention activations: one probe per layer index
        for feature_name in ('first_fully_connected', 'final_fully_connected', 'first_attention', 'final_attention'):
            activations = results[feature_name]
            for layer in range(activations[0].shape[0]):
                layer_roc, layer_acc = gen_classifier_roc(np.stack([i[layer] for i in activations]))
                classifier_results[f'{feature_name}_roc_{layer}'] = layer_roc
                classifier_results[f'{feature_name}_acc_{layer}'] = layer_acc

        all_results[results_file] = classifier_results.copy()
    except Exception as exc:
        # Still best-effort (one bad file must not stop the batch), but report
        # the failure instead of hiding it; KeyboardInterrupt now propagates.
        tqdm.write(f"Skipping {results_file}: {type(exc).__name__}: {exc}")
        continue

In [None]:
# Show which result files produced a full set of classifier metrics.
print(all_results.keys())