In [25]:
import pickle
import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import datetime 
import seaborn as sns
from scipy import stats
import xarray as xr
from importlib import reload 
import torch
from transformers import *
from neural_nlp.benchmarks.neural import *
from neural_nlp.models import *
import neural_nlp
from neural_nlp.models.implementations import *
from neural_nlp.stimuli import StimulusSet
from pathlib import Path
import os
# Plot specifications: seaborn "talk" context, white background, no grid lines.
sns.set(context='talk')
sns.set_style("whitegrid", {'axes.grid': False})
# Black axes outline; hide the top and right spines.
plt.rc('axes', edgecolor='black')
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top'] = False

# Subject identifiers and path to a pickled score file
# (none of these names is referenced in the cells visible below).
subjectID = '018'
subjectIDs = ['018', '288', '289', '296', '426']
# NOTE(review): absolute path under /om/user/msch; only resolves where that filesystem is mounted.
score_name = '/om/user/msch/share/neural_nlp/benchmark=Pereira2018-encoding,model=gpt2-xl,subsample=None.pkl'
# score_name = '/om/user/msch/share/neural_nlp/benchmark=Pereira2018-encoding,model=glove,subsample=None.pkl'

In [9]:
import func as f

In [12]:
def get_model_activations(stimuli, group, layers, tokenizer, model, max_num_words=512):
    """Collect the last-token hidden state of every sentence for the requested layers.

    Sentences are processed group by group (one group per unique value of
    ``stimuli[group]``, in order of first appearance). Within a group the
    tokenized sentences are concatenated, and each sentence is fed to the model
    together with the preceding sentences of its group as context, truncated to
    the most recent ``max_num_words`` token ids.

    Args:
        stimuli: table with a ``sentence`` column and a ``group`` column,
            indexed like a pandas DataFrame.
        group: name of the column that defines the context groups.
        layers: indices into the model's tuple of hidden states.
        tokenizer: callable returning a mapping with an ``'input_ids'`` list.
        model: callable invoked as ``model(input_ids, output_hidden_states=True)``
            whose result holds the per-layer hidden states at index 2.
        max_num_words: maximum context length. Despite the name, the limit is
            applied to tokenizer ids, not to whitespace-separated words.

    Returns:
        ``np.ndarray`` built from one list of state vectors per requested
        layer, i.e. shape ``(len(layers), n_sentences, hidden_dim)`` when all
        state vectors have the same size.
    """
    # One independent list per layer. ``[[]] * n`` would repeat a single shared
    # list, so every layer would end up holding every other layer's states.
    states_sentences = [[] for _ in layers]
    for value in stimuli[group].unique():
        df = stimuli[stimuli[group] == value]
        tokenized_sentences = []  # token ids of the whole group, concatenated
        sentence_ends = []  # exclusive end offset of each sentence in that list
        for sent in df.sentence:
            tokenized_sentences.extend(tokenizer(sent)['input_ids'])
            sentence_ends.append(len(tokenized_sentences))
        for end in sentence_ends:
            # Use all preceding context of the group unless it exceeds the limit.
            start = max(0, end - max_num_words)
            input_ids = torch.tensor(tokenized_sentences[start:end])
            # Inference only: no autograd graph is needed for extracting states.
            with torch.no_grad():
                result_model = model(input_ids, output_hidden_states=True)
            hidden_states = result_model[2]
            for layer_pos, layer in enumerate(layers):
                # assumes each hidden state is (n_tokens, hidden_dim) for this
                # unbatched input — TODO confirm for the installed transformers version
                state = hidden_states[layer][-1, :].detach().numpy()  # last token
                states_sentences[layer_pos].append(state)
    return np.array(states_sentences)

In [13]:
# (identifier, constructor) pairs for the available benchmarks.
benchmark_pool = [
    # primary benchmarks
    ('Pereira2018-encoding', PereiraEncoding),
    ('Fedorenko2016v3-encoding', Fedorenko2016V3Encoding),
    ('Blank2014fROI-encoding', Blank2014fROIEncoding),
    # secondary benchmarks
    ('Pereira2018-rdm', PereiraRDM),
    ('Fedorenko2016v3-rdm', Fedorenko2016V3RDM),
    ('Fedorenko2016v3nonlang-encoding', Fedorenko2016V3NonLangEncoding),
    ('Blank2014fROI-rdm', Blank2014fROIRDM),
]
# Re-bind the name to a dict keyed by identifier. Each value wraps a
# zero-argument factory in LazyLoad; the lambda defaults freeze the current
# pair so every entry constructs its own benchmark.
benchmark_pool = dict(
    (name, LazyLoad(lambda identifier=name, ctr=ctor: ctr(identifier=identifier)))
    for name, ctor in benchmark_pool
)

# how to fetch stimulus set
# NOTE(review): reads the private `_target_assembly` attribute of the benchmark.
benchmark_test = benchmark_pool['Pereira2018-encoding']
stimuli = benchmark_test._target_assembly.attrs['stimulus_set']

In [16]:
# Work on a copy so the table attached to the benchmark is not modified.
stimulus_set = stimuli.copy()
# passage_id = experiment name followed by the passage index, e.g. '243sentences1'.
passage_ids = stimulus_set['experiment'] + stimulus_set['passage_index'].astype(str)
stimulus_set.loc[:, 'passage_id'] = passage_ids

In [17]:
# Display the stimulus table, including the newly added passage_id column.
stimulus_set

Unnamed: 0,sentence,sentence_num,stimulus_id,experiment,story,passage_index,passage_label,passage_category,passage_id
0,Beekeeping encourages the conservation of loca...,0,243sentences.0,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping,243sentences1
1,It is in every beekeeper's interest to conserv...,1,243sentences.1,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping,243sentences1
2,"As a passive form of agriculture, it does not ...",2,243sentences.2,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping,243sentences1
3,Beekeepers also discourage the use of pesticid...,3,243sentences.3,243sentences,243sentences.beekeeping,1,beekeeping,beekeeping,243sentences1
4,Artisanal beekeepers go to extremes for their ...,4,243sentences.4,243sentences,243sentences.beekeeping,2,beekeeping,beekeeping,243sentences2
...,...,...,...,...,...,...,...,...,...
622,Some windows have multiple panes to increase i...,379,384sentences.379,384sentences,384sentences.building_part,95,Window,building_part,384sentences95
623,A woman is a female human adult.,380,384sentences.380,384sentences,384sentences.human,96,Woman,human,384sentences96
624,A woman is stereotypically seen as a caregiver.,381,384sentences.381,384sentences,384sentences.human,96,Woman,human,384sentences96
625,A woman can become pregnant and bear children.,382,384sentences.382,384sentences,384sentences.human,96,Woman,human,384sentences96


In [None]:
# Check number of overlapping last words
# pandas is not imported explicitly in the notebook's import cell, so bring it
# into scope here rather than relying on a star-import leaking `pd`.
import pandas as pd

# NOTE(review): absolute local path; only resolves on the original author's machine.
stim = pd.read_csv('/Users/gt/Documents/GitHub/control-neural/data/pereira_stimulus_set.csv')
# Last space-separated token of each sentence, lower-cased (trailing punctuation is kept).
last_words = [x.split(' ')[-1].lower() for x in stim.sentence.values]
# Unique last words together with how many sentences end in each.
np.unique(last_words, return_counts=True)

In [18]:
# Load the tokenizer and the model for the 'gpt2' checkpoint name.
pretrained_weights = 'gpt2'
tokenizer_gpt2 = GPT2Tokenizer.from_pretrained(pretrained_weights)
model_gpt2 = GPT2Model.from_pretrained(pretrained_weights)

Some weights of GPT2Model were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
# Last-token activations at hidden-state index 6, using passage_id groups as context.
t6 = get_model_activations(
    stimuli=stimulus_set,
    group='passage_id',
    layers=[6],
    tokenizer=tokenizer_gpt2,
    model=model_gpt2,
    max_num_words=512,
)

In [61]:
# Shape check; the recorded output (1, 627, 768) has a leading axis of length 1
# for the single requested layer.
np.shape(t6)

(1, 627, 768)

In [62]:
# Last-token activations at hidden-state index 7, using passage_id groups as context.
t7 = get_model_activations(
    stimuli=stimulus_set,
    group='passage_id',
    layers=[7],
    tokenizer=tokenizer_gpt2,
    model=model_gpt2,
    max_num_words=512,
)

In [63]:
# Drop the length-1 axes so t6 becomes 2-D (recorded later as (627, 768)).
t6=np.squeeze(t6)

In [64]:
# Drop the length-1 axes of t7, mirroring what is done for t6.
t7=np.squeeze(t7)

In [52]:
# Last-token activations at hidden-state index 12, squeezed to drop length-1 axes.
t12 = np.squeeze(
    get_model_activations(
        stimuli=stimulus_set,
        group='passage_id',
        layers=[12],
        tokenizer=tokenizer_gpt2,
        model=model_gpt2,
        max_num_words=512,
    )
)

In [66]:
# Persist the t12 activations as a .npy file; the relative name resolves
# against the current working directory.
np.save('GPT2_layer12_act_Pereira2018.npy', t12)

## CKA

In [31]:
# Switch the working directory to the CKA-Centered-Kernel-Alignment directory
# located three levels above the current working directory.
# NOTE(review): the target is derived from the current directory, so running
# this cell a second time resolves to a different location.
parent_of_cwd = Path(os.path.dirname(os.getcwd()))
cka_repo_dir = (parent_of_cwd / '..' / '..' / 'CKA-Centered-Kernel-Alignment').resolve()
os.chdir(cka_repo_dir)

In [32]:
# Show the working directory after the os.chdir call.
os.getcwd()

'/Users/gt/Documents/GitHub/CKA-Centered-Kernel-Alignment'

In [33]:
import pickle
import gzip
import cca_core
from CKA import linear_CKA, kernel_CKA

In [42]:
# Shape of the squeezed activations; recorded output is (627, 768).
np.shape(t6)

(627, 768)

In [56]:
# Correlation matrix over t6 and t7 together. The recorded shape
# (1254, 1254) = 627 + 627 indicates one variable per row of t6 followed by
# one per row of t7.
corrt = np.corrcoef(t6, t7)

In [57]:
# Shape check of the correlation matrix.
np.shape(corrt)

(1254, 1254)

In [58]:
# Display the correlation matrix.
corrt

array([[1.        , 0.92892599, 0.89543713, ..., 0.85815811, 0.83035886,
        0.78656707],
       [0.92892599, 1.        , 0.92689966, ..., 0.87903157, 0.86623215,
        0.83279897],
       [0.89543713, 0.92689966, 1.        , ..., 0.87513774, 0.87777565,
        0.85968498],
       ...,
       [0.85815811, 0.87903157, 0.87513774, ..., 1.        , 0.93713077,
        0.91197734],
       [0.83035886, 0.86623215, 0.87777565, ..., 0.93713077, 1.        ,
        0.93501629],
       [0.78656707, 0.83279897, 0.85968498, ..., 0.91197734, 0.93501629,
        1.        ]])

In [65]:
# Linear CKA between t6 and t7, then t6 against itself as a sanity check.
# NOTE(review): the inputs are transposed here, whereas the kernel_CKA cell
# passes t6/t7 untransposed — confirm which axis CKA treats as samples.
linear_cka_t6_t7 = linear_CKA(t6.T, t7.T)
print('Linear CKA, between X and Y: {}'.format(linear_cka_t6_t7))
linear_cka_t6_t6 = linear_CKA(t6.T, t6.T)
print('Linear CKA, between X and X: {}'.format(linear_cka_t6_t6))


Linear CKA, between X and Y: 0.9568201562019218
Linear CKA, between X and X: 1.0


In [47]:
# RBF-kernel CKA between t6 and t7, then t7 against itself.
# NOTE(review): inputs are untransposed here, unlike the linear_CKA calls; the
# recorded cross-layer value equals the self-similarity value
# (1.0000000000000002) — confirm the orientation and kernel settings.
rbf_cka_t6_t7 = kernel_CKA(t6, t7)
print('RBF Kernel CKA, between X and Y: {}'.format(rbf_cka_t6_t7))
rbf_cka_t7_t7 = kernel_CKA(t7, t7)
print('RBF Kernel CKA, between X and X: {}'.format(rbf_cka_t7_t7))


RBF Kernel CKA, between X and Y: 1.0000000000000002
RBF Kernel CKA, between X and X: 1.0000000000000002


In [53]:
# Linear CKA between the transposed t6 and t12 activations.
linear_CKA(t6.T,t12.T)

0.008682458764857041

In [54]:
# Linear CKA between the transposed t7 and t12 activations.
# NOTE(review): the recorded output is identical to the t6-vs-t12 value
# (0.008682458764857041), and this cell's execution count (54) precedes the
# cells that last computed t6/t7 (60/62) — re-run in order and confirm.
linear_CKA(t7.T,t12.T)

0.008682458764857041