In [None]:
import os
import pickle
from pathlib import Path
import numpy as np
import scipy as sp

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from tqdm import tqdm

In [None]:
# Result files for each dataset live in their own directory. Path.glob yields
# entries in arbitrary, filesystem-dependent order, and the loading loop takes
# the first file whose path contains a model name, so the listings are sorted
# to make that pick reproducible from run to run.
birth_place = sorted(Path("/nvme1/results/").glob("*"))
capitals = sorted(Path("/nvme2/results/").glob("*"))
trivia = sorted(Path("/nvme3/results/").glob("*"))
founders = sorted(Path("/nvme4/results/").glob("*"))

In [None]:
# Model identifiers; each one is matched as a substring of a result file's path.
models = [
    'open_llama_7b',
    'open_llama_13b',
    'falcon-7b',
    'falcon-40b',
    'opt-6.7b',
    'opt-30b',
]

# Per-dataset file listings, in the order the loading loop walks them.
datasets = [
    capitals,
    trivia,
    founders,
    birth_place,
]

In [None]:
def _summarise_run(results):
    """Reduce one unpickled results dict to the arrays used by the plotting cells.

    Reads the 'first_fully_connected', 'final_fully_connected',
    'first_attention', 'final_attention', 'attributes_first', 'logits',
    'start_pos' and 'correct' entries of ``results``.

    Returns a dict of NumPy arrays keyed by the name of each derived quantity.
    Raises whatever the underlying lookups / NumPy / SciPy / PCA calls raise
    when an entry is missing or malformed; the caller reports and skips.
    """
    # The leading axis of each stored array is indexed by layer here
    # (presumably one slot per layer -- TODO confirm against the producer).
    # The second-to-last slot is the one analysed.
    num_layers = results['first_fully_connected'][0].shape[0]
    layer_pos = num_layers - 2

    # Per-sample logits at that sample's 'start_pos' index and at its last index.
    first_logits = [logits[start] for logits, start in zip(results['logits'], results['start_pos'])]
    last_logits = [logits[-1] for logits in results['logits']]

    return {
        'first_attribute_entropy': np.array([sp.stats.entropy(attr) for attr in results['attributes_first']]),
        'correct': np.array(results['correct']),
        'first_logits_entropy': np.array([sp.stats.entropy(sp.special.softmax(v)) for v in first_logits]),
        'last_logits_entropy': np.array([sp.stats.entropy(sp.special.softmax(v)) for v in last_logits]),
        # random_state pins the projection when PCA picks a randomized solver.
        'first_logit_decomp': PCA(n_components=2, random_state=0).fit_transform(np.array(first_logits)),
        'last_logit_decomp': PCA(n_components=2, random_state=0).fit_transform(np.array(last_logits)),
        'first_token_layer_activations': np.array([act[layer_pos] for act in results['first_fully_connected']]),
        'final_token_layer_activations': np.array([act[layer_pos] for act in results['final_fully_connected']]),
        'first_token_layer_attention': np.array([att[layer_pos] for att in results['first_attention']]),
        'final_token_layer_attention': np.array([att[layer_pos] for att in results['final_attention']]),
    }


# Maps the stem of each loaded result file to its summary arrays.
results_data = {}
for model_name in models:
    for dataset in tqdm(datasets):
        model_files = [file for file in dataset if model_name in file.as_posix()]
        if not model_files:
            continue
        # Only the first matching file of each dataset is used.
        result_path = model_files[0]
        results = None
        try:
            # NOTE(security): unpickling can execute arbitrary code; only point
            # this notebook at result files from a trusted source.
            with open(result_path, "rb") as infile:
                results = pickle.load(infile)
            results_data[result_path.stem] = _summarise_run(results)
        except Exception as exc:
            # Keep going with the remaining files, but say why this one failed.
            # Unlike a bare `except:`, this lets KeyboardInterrupt stop the loop.
            print(f"Skipping {result_path}: {type(exc).__name__}: {exc}")
        finally:
            # Release the raw payload before the next file is loaded.
            results = None

In [None]:
# Grid layout: one row per dataset, one column per model. Each entry is
# (substring looked for in the run name, label drawn on the figure).
model_panels = [
    ('open_llama_7b', 'OpenLlama 7B'),
    ('open_llama_13b', 'OpenLlama 13B'),
    ('falcon-7b', 'Falcon 7B'),
    ('falcon-40b', 'Falcon 40B'),
    ('opt-6.7b', 'OPT 6.7B'),
    ('opt-30b', 'OPT 30B'),
]
dataset_panels = [
    ('capitals', 'Capitals'),
    ('founder', 'Founders'),
    ('birth', 'Place of Birth'),
    ('trivia', 'General Trivia'),
]

fig, axes = plt.subplots(len(dataset_panels), len(model_panels), sharex=True, figsize=(20,12))
#fig.suptitle('Cumulative Distribution of Entropy of Integrated Gradients of Input Tokens')

for name, results in results_data.items():
    # First matching key wins; None means the run name fits no panel.
    col = next((idx for idx, (key, _) in enumerate(model_panels) if key in name), None)
    row = next((idx for idx, (key, _) in enumerate(dataset_panels) if key in name), None)
    if row is None or col is None:
        # Skip instead of silently reusing the indices of the previous run,
        # which would overplot an unrelated panel.
        print(f"Skipping {name}: no panel for this model/dataset")
        continue
    correct = results['correct']
    first_attribute_entropy = results['first_attribute_entropy']
    hits = np.where(correct == True)[0]
    misses = np.where(correct == False)[0]
    sns.ecdfplot(first_attribute_entropy[hits], ax=axes[row, col], label="Non-Hallucination", linewidth=2)
    # Clear the automatic y label; the first column gets the dataset name below.
    sns.ecdfplot(first_attribute_entropy[misses], ax=axes[row, col], label="Hallucination", linewidth=2).set(ylabel=None)

axes[0][0].legend(loc="lower right")
for col, (_, title) in enumerate(model_panels):
    axes[0][col].set_title(title)
for row, (_, label) in enumerate(dataset_panels):
    axes[row][0].set_ylabel(label, rotation=90, size='large')

# savefig does not create missing directories.
os.makedirs('figs', exist_ok=True)
plt.savefig('figs/ecdf_ig.pdf', bbox_inches='tight')

In [None]:
# Grid layout: one row per dataset, one column per model. Each entry is
# (substring looked for in the run name, label drawn on the figure).
model_panels = [
    ('open_llama_7b', 'OpenLlama 7B'),
    ('open_llama_13b', 'OpenLlama 13B'),
    ('falcon-7b', 'Falcon 7B'),
    ('falcon-40b', 'Falcon 40B'),
    ('opt-6.7b', 'OPT 6.7B'),
    ('opt-30b', 'OPT 30B'),
]
dataset_panels = [
    ('capitals', 'Capitals'),
    ('founder', 'Founders'),
    ('birth', 'Place of Birth'),
    ('trivia', 'General Trivia'),
]

fig, axes = plt.subplots(len(dataset_panels), len(model_panels), sharex=True, figsize=(20,12))
#fig.suptitle('Cumulative Distribution of Entropy of Softmax of first token')

for name, results in results_data.items():
    # First matching key wins; None means the run name fits no panel.
    col = next((idx for idx, (key, _) in enumerate(model_panels) if key in name), None)
    row = next((idx for idx, (key, _) in enumerate(dataset_panels) if key in name), None)
    if row is None or col is None:
        # Skip instead of silently reusing the indices of the previous run.
        print(f"Skipping {name}: no panel for this model/dataset")
        continue
    correct = results['correct']
    entropy = results['first_logits_entropy']
    hits = np.where(correct == True)[0]
    misses = np.where(correct == False)[0]
    sns.ecdfplot(entropy[hits], ax=axes[row, col], label="Non-Hallucination", linewidth=2)
    # Clear the automatic y label; the first column gets the dataset name below.
    sns.ecdfplot(entropy[misses], ax=axes[row, col], label="Hallucination", linewidth=2).set(ylabel=None)

axes[0][0].legend(loc="upper left")
for col, (_, title) in enumerate(model_panels):
    axes[0][col].set_title(title)
for row, (_, label) in enumerate(dataset_panels):
    axes[row][0].set_ylabel(label, rotation=90, size='large')

# savefig does not create missing directories.
os.makedirs('figs', exist_ok=True)
plt.savefig('figs/ecdf_softmax.pdf', bbox_inches='tight')

In [None]:
# Grid layout: one row per dataset, one column per model. Each entry is
# (substring looked for in the run name, label drawn on the figure).
model_panels = [
    ('open_llama_7b', 'OpenLlama 7B'),
    ('open_llama_13b', 'OpenLlama 13B'),
    ('falcon-7b', 'Falcon 7B'),
    ('falcon-40b', 'Falcon 40B'),
    ('opt-6.7b', 'OPT 6.7B'),
    ('opt-30b', 'OPT 30B'),
]
dataset_panels = [
    ('capitals', 'Capitals'),
    ('founder', 'Founders'),
    ('birth', 'Place of Birth'),
    ('trivia', 'General Trivia'),
]

fig, axes = plt.subplots(len(dataset_panels), len(model_panels), sharex=True, figsize=(20,12))
#fig.suptitle('Cumulative Distribution of Entropy of Softmax output for last token')

for name, results in results_data.items():
    # First matching key wins; None means the run name fits no panel.
    col = next((idx for idx, (key, _) in enumerate(model_panels) if key in name), None)
    row = next((idx for idx, (key, _) in enumerate(dataset_panels) if key in name), None)
    if row is None or col is None:
        # Skip instead of silently reusing the indices of the previous run.
        print(f"Skipping {name}: no panel for this model/dataset")
        continue
    correct = results['correct']
    entropy = results['last_logits_entropy']
    hits = np.where(correct == True)[0]
    misses = np.where(correct == False)[0]
    sns.ecdfplot(entropy[hits], ax=axes[row, col], label="Non-Hallucination", linewidth=2)
    # Clear the automatic y label; the first column gets the dataset name below.
    sns.ecdfplot(entropy[misses], ax=axes[row, col], label="Hallucination", linewidth=2).set(ylabel=None)

axes[0][0].legend(loc="upper left")
for col, (_, title) in enumerate(model_panels):
    axes[0][col].set_title(title)
for row, (_, label) in enumerate(dataset_panels):
    axes[row][0].set_ylabel(label, rotation=90, size='large')

In [None]:
# Grid layout: one row per dataset, one column per model. Each entry is
# (substring looked for in the run name, label drawn on the figure).
model_panels = [
    ('open_llama_7b', 'OpenLlama 7B'),
    ('open_llama_13b', 'OpenLlama 13B'),
    ('falcon-7b', 'Falcon 7B'),
    ('falcon-40b', 'Falcon 40B'),
    ('opt-6.7b', 'OPT 6.7B'),
    ('opt-30b', 'OPT 30B'),
]
dataset_panels = [
    ('capitals', 'Capitals'),
    ('founder', 'Founders'),
    ('birth', 'Place of Birth'),
    ('trivia', 'General Trivia'),
]

fig, axes = plt.subplots(len(dataset_panels), len(model_panels), sharex=True, figsize=(20,12))
#fig.suptitle('Cumulative Distribution of Entropy of layer activations for first token')

for name, results in results_data.items():
    # First matching key wins; None means the run name fits no panel.
    col = next((idx for idx, (key, _) in enumerate(model_panels) if key in name), None)
    row = next((idx for idx, (key, _) in enumerate(dataset_panels) if key in name), None)
    if row is None or col is None:
        # Skip instead of silently reusing the indices of the previous run.
        print(f"Skipping {name}: no panel for this model/dataset")
        continue
    correct = results['correct']
    activations = results['first_token_layer_activations']
    # Shift by the minimum along axis 0 so values are non-negative, then take
    # the entropy along axis 1 (entropy normalises its input to sum to 1).
    entropy = sp.stats.entropy(activations - activations.min(axis=0), axis=1)
    # Min-max scale to [0, 1]; skip the division when every value is identical.
    entropy = entropy - entropy.min()
    peak = entropy.max()
    if peak > 0:
        entropy = entropy / peak
    hits = np.where(correct == True)[0]
    misses = np.where(correct == False)[0]
    sns.ecdfplot(entropy[hits], ax=axes[row, col], label="Non-Hallucination", linewidth=2)
    # Clear the automatic y label; the first column gets the dataset name below.
    sns.ecdfplot(entropy[misses], ax=axes[row, col], label="Hallucination", linewidth=2).set(ylabel=None)

axes[0][0].legend(loc="upper left")
for col, (_, title) in enumerate(model_panels):
    axes[0][col].set_title(title)
for row, (_, label) in enumerate(dataset_panels):
    axes[row][0].set_ylabel(label, rotation=90, size='large')

# savefig does not create missing directories.
os.makedirs('figs', exist_ok=True)
plt.savefig('figs/ecdf_hidden.pdf', bbox_inches='tight')

In [None]:
# Grid layout: one row per dataset, one column per model. Each entry is
# (substring looked for in the run name, label drawn on the figure).
model_panels = [
    ('open_llama_7b', 'OpenLlama 7B'),
    ('open_llama_13b', 'OpenLlama 13B'),
    ('falcon-7b', 'Falcon 7B'),
    ('falcon-40b', 'Falcon 40B'),
    ('opt-6.7b', 'OPT 6.7B'),
    ('opt-30b', 'OPT 30B'),
]
dataset_panels = [
    ('capitals', 'Capitals'),
    ('founder', 'Founders'),
    ('birth', 'Place of Birth'),
    ('trivia', 'General Trivia'),
]

fig, axes = plt.subplots(len(dataset_panels), len(model_panels), sharex=True, figsize=(20,12))
#fig.suptitle('Cumulative Distribution of Entropy of layer attentions for first token')

for name, results in results_data.items():
    # First matching key wins; None means the run name fits no panel.
    col = next((idx for idx, (key, _) in enumerate(model_panels) if key in name), None)
    row = next((idx for idx, (key, _) in enumerate(dataset_panels) if key in name), None)
    if row is None or col is None:
        # Skip instead of silently reusing the indices of the previous run.
        print(f"Skipping {name}: no panel for this model/dataset")
        continue
    correct = results['correct']
    attention = results['first_token_layer_attention']
    # Shift by the minimum along axis 0 so values are non-negative, then take
    # the entropy along axis 1 (entropy normalises its input to sum to 1).
    entropy = sp.stats.entropy(attention - attention.min(axis=0), axis=1)
    # Min-max scale to [0, 1]; skip the division when every value is identical.
    entropy = entropy - entropy.min()
    peak = entropy.max()
    if peak > 0:
        entropy = entropy / peak
    hits = np.where(correct == True)[0]
    misses = np.where(correct == False)[0]
    sns.ecdfplot(entropy[hits], ax=axes[row, col], label="Non-Hallucination", linewidth=2)
    # Clear the automatic y label; the first column gets the dataset name below.
    sns.ecdfplot(entropy[misses], ax=axes[row, col], label="Hallucination", linewidth=2).set(ylabel=None)

axes[0][0].legend(loc="upper left")
for col, (_, title) in enumerate(model_panels):
    axes[0][col].set_title(title)
for row, (_, label) in enumerate(dataset_panels):
    axes[row][0].set_ylabel(label, rotation=90, size='large')

# savefig does not create missing directories.
os.makedirs('figs', exist_ok=True)
plt.savefig('figs/ecdf_attention.pdf', bbox_inches='tight')

In [None]:
# Grid layout: one row per dataset, one column per model. Each entry is
# (substring looked for in the run name, label drawn on the figure).
model_panels = [
    ('open_llama_7b', 'OpenLlama 7B'),
    ('open_llama_13b', 'OpenLlama 13B'),
    ('falcon-7b', 'Falcon 7B'),
    ('falcon-40b', 'Falcon 40B'),
    ('opt-6.7b', 'OPT 6.7B'),
    ('opt-30b', 'OPT 30B'),
]
dataset_panels = [
    ('capitals', 'Capitals'),
    ('founder', 'Founders'),
    ('birth', 'Place of Birth'),
    ('trivia', 'General Trivia'),
]

fig, axes = plt.subplots(len(dataset_panels), len(model_panels), sharex=False, figsize=(20,12))
#fig.suptitle('PCA Clustering of Softmax output for first token')

for name, results in results_data.items():
    # First matching key wins; None means the run name fits no panel.
    col = next((idx for idx, (key, _) in enumerate(model_panels) if key in name), None)
    row = next((idx for idx, (key, _) in enumerate(dataset_panels) if key in name), None)
    if row is None or col is None:
        # Skip instead of silently reusing the indices of the previous run.
        print(f"Skipping {name}: no panel for this model/dataset")
        continue
    correct = results['correct']
    # Two-component projection computed when the run was loaded.
    decomp = results['first_logit_decomp']
    sns.scatterplot(x=decomp[:,0], y=decomp[:,1], hue=correct, ax=axes[row, col])

for col, (_, title) in enumerate(model_panels):
    axes[0][col].set_title(title)
for row, (_, label) in enumerate(dataset_panels):
    axes[row][0].set_ylabel(label, rotation=90, size='large')

# savefig does not create missing directories.
os.makedirs('figs', exist_ok=True)
plt.savefig('figs/softmax_pca.png', bbox_inches='tight')

In [None]:
# Grid layout: one row per dataset, one column per model. Each entry is
# (substring looked for in the run name, label drawn on the figure).
model_panels = [
    ('open_llama_7b', 'OpenLlama 7B'),
    ('open_llama_13b', 'OpenLlama 13B'),
    ('falcon-7b', 'Falcon 7B'),
    ('falcon-40b', 'Falcon 40B'),
    ('opt-6.7b', 'OPT 6.7B'),
    ('opt-30b', 'OPT 30B'),
]
dataset_panels = [
    ('capitals', 'Capitals'),
    ('founder', 'Founders'),
    ('birth', 'Place of Birth'),
    ('trivia', 'General Trivia'),
]

fig, axes = plt.subplots(len(dataset_panels), len(model_panels), sharex=False, figsize=(20,12))
#fig.suptitle('PCA Clustering of Softmax output for last token')

for name, results in results_data.items():
    # First matching key wins; None means the run name fits no panel.
    col = next((idx for idx, (key, _) in enumerate(model_panels) if key in name), None)
    row = next((idx for idx, (key, _) in enumerate(dataset_panels) if key in name), None)
    if row is None or col is None:
        # Skip instead of silently reusing the indices of the previous run.
        print(f"Skipping {name}: no panel for this model/dataset")
        continue
    correct = results['correct']
    # Two-component projection computed when the run was loaded.
    decomp = results['last_logit_decomp']
    sns.scatterplot(x=decomp[:,0], y=decomp[:,1], hue=correct, ax=axes[row, col])

# Titles come from the same table as the column indices, so every column gets
# exactly one title (the 'OPT 30B' title used to overwrite column 4's).
for col, (_, title) in enumerate(model_panels):
    axes[0][col].set_title(title)
for row, (_, label) in enumerate(dataset_panels):
    axes[row][0].set_ylabel(label, rotation=90, size='large')

In [None]:
# Grid layout: one row per dataset, one column per model. Each entry is
# (substring looked for in the run name, label drawn on the figure).
model_panels = [
    ('open_llama_7b', 'OpenLlama 7B'),
    ('open_llama_13b', 'OpenLlama 13B'),
    ('falcon-7b', 'Falcon 7B'),
    ('falcon-40b', 'Falcon 40B'),
    ('opt-6.7b', 'OPT 6.7B'),
    ('opt-30b', 'OPT 30B'),
]
dataset_panels = [
    ('capitals', 'Capitals'),
    ('founder', 'Founders'),
    ('birth', 'Place of Birth'),
    ('trivia', 'General Trivia'),
]

fig, axes = plt.subplots(len(dataset_panels), len(model_panels), sharex=False, figsize=(20,12))
fig.suptitle('PCA Clustering of Final Fully Connected Layer Activations for First Token')

for name, results in results_data.items():
    # First matching key wins; None means the run name fits no panel.
    col = next((idx for idx, (key, _) in enumerate(model_panels) if key in name), None)
    row = next((idx for idx, (key, _) in enumerate(dataset_panels) if key in name), None)
    if row is None or col is None:
        # Skip instead of silently reusing the indices of the previous run.
        print(f"Skipping {name}: no panel for this model/dataset")
        continue
    correct = results['correct']
    # The stored array holds the raw layer outputs (one entry per sample along
    # axis 0), not a projection. Project onto two principal components so the
    # scatter matches the figure title instead of showing raw dimensions 0 and 1.
    # Assumes the array is 2-D (samples x features) -- TODO confirm.
    decomp = PCA(n_components=2, random_state=0).fit_transform(results['first_token_layer_activations'])
    sns.scatterplot(x=decomp[:,0], y=decomp[:,1], hue=correct, ax=axes[row, col])

for col, (_, title) in enumerate(model_panels):
    axes[0][col].set_title(title)
for row, (_, label) in enumerate(dataset_panels):
    axes[row][0].set_ylabel(label, rotation=90, size='large')

In [None]:
# Grid layout: one row per dataset, one column per model. Each entry is
# (substring looked for in the run name, label drawn on the figure).
model_panels = [
    ('open_llama_7b', 'OpenLlama 7B'),
    ('open_llama_13b', 'OpenLlama 13B'),
    ('falcon-7b', 'Falcon 7B'),
    ('falcon-40b', 'Falcon 40B'),
    ('opt-6.7b', 'OPT 6.7B'),
    ('opt-30b', 'OPT 30B'),
]
dataset_panels = [
    ('capitals', 'Capitals'),
    ('founder', 'Founders'),
    ('birth', 'Place of Birth'),
    ('trivia', 'General Trivia'),
]

fig, axes = plt.subplots(len(dataset_panels), len(model_panels), sharex=False, figsize=(20,12))
fig.suptitle('PCA Clustering of Final Fully Connected Layer Activations for Final Token')

for name, results in results_data.items():
    # First matching key wins; None means the run name fits no panel.
    col = next((idx for idx, (key, _) in enumerate(model_panels) if key in name), None)
    row = next((idx for idx, (key, _) in enumerate(dataset_panels) if key in name), None)
    if row is None or col is None:
        # Skip instead of silently reusing the indices of the previous run.
        print(f"Skipping {name}: no panel for this model/dataset")
        continue
    correct = results['correct']
    # The stored array holds the raw layer outputs (one entry per sample along
    # axis 0), not a projection. Project onto two principal components so the
    # scatter matches the figure title instead of showing raw dimensions 0 and 1.
    # Assumes the array is 2-D (samples x features) -- TODO confirm.
    decomp = PCA(n_components=2, random_state=0).fit_transform(results['final_token_layer_activations'])
    sns.scatterplot(x=decomp[:,0], y=decomp[:,1], hue=correct, ax=axes[row, col])

for col, (_, title) in enumerate(model_panels):
    axes[0][col].set_title(title)
for row, (_, label) in enumerate(dataset_panels):
    axes[row][0].set_ylabel(label, rotation=90, size='large')

In [None]:
# For every loaded run, print its name and then the mean of its 'correct'
# flags, each on its own line.
for run_name, run_summary in results_data.items():
    print(run_name, run_summary['correct'].mean(), sep="\n")