In [16]:
import numpy as np
import pandas as pd
import torch
import sys
import torch
import tqdm
from scipy.spatial.distance import cosine
import transformers as tfm


from sentence_transformers import SentenceTransformer


%config Completer.use_jedi = False

In [2]:
# Hidden-state width produced by each model, keyed by the short model name.
# Insertion order is significant: it defines the order of `models_list`.
models_output_size = {
    'bert': 768,
    'bertLarge': 1024,
    'gpt2': 768,
    'roberta': 768,
    'mpnet': 768,
}

# Short model names, in the same order as the dictionary above.
models_list = list(models_output_size)

In [3]:
# Tokenizer classes, positionally aligned with `models_list`
# (entry k is used to load the model named models_list[k]).
tokenizers_func = [
    tfm.BertTokenizer,     # bert
    tfm.BertTokenizer,     # bertLarge
    tfm.GPT2Tokenizer,     # gpt2
    tfm.RobertaTokenizer,  # roberta
    tfm.MPNetTokenizer,    # mpnet
]

# Model classes, in the same positional order as the tokenizers above.
models_func = [
    tfm.BertModel,     # bert
    tfm.BertModel,     # bertLarge
    tfm.GPT2Model,     # gpt2
    tfm.RobertaModel,  # roberta
    tfm.MPNetModel,    # mpnet
]

In [4]:
# Base location of the locally stored pretrained models. It is joined to a model
# name by plain string concatenation (`path + <model name>`), so the trailing
# slash is required.
path = '/home/bastien/Downloads/LanguageModels/'

In [5]:
# The three lists are paired position by position; make a length mismatch an
# explicit failure instead of an IndexError (or a silently ignored entry).
assert len(models_list) == len(tokenizers_func) == len(models_func), \
    "models_list, tokenizers_func and models_func must have the same length"

# Short model name -> tokenizer loaded from `path + <model name>`.
tokenizers = {
    name: tokenizer_cls.from_pretrained(path + name)
    for name, tokenizer_cls in zip(models_list, tokenizers_func)
}

# Short model name -> model loaded from `path + <model name>`.
models = {
    name: model_cls.from_pretrained(path + name)
    for name, model_cls in zip(models_list, models_func)
}

Some weights of the model checkpoint at /home/bastien/Downloads/LanguageModels/bert were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at /home/bastien/Downloads/LanguageModels/bertLarge were not used when initializing BertMode

In [6]:
# Carrier phrases that are prepended to every place name before encoding.
contexts = [
    'I come from',
    'He lives in',
    'She moved to',
]

# Short label for each context: its verb, i.e. the second word of the phrase.
# These labels are used in progress-bar descriptions and output file names.
short_ctxts = [phrase.split()[1] for phrase in contexts]

In [7]:
# Load the world-cities table, remove rows with any missing value, discard the
# Region column, and keep only cities with more than 100000 inhabitants.
cities = (
    pd.read_csv(
        './csv/worldcitiespop.csv',
        header=0,
        dtype={'AccentCity': 'str', 'Region': 'object'},
    )
    .dropna()  # applied while Region is still present, so rows lacking a Region go too
    .drop(columns='Region')
    .loc[lambda frame: frame.Population > 100000]
    .reset_index(drop=True)
)

In [8]:
# Display the filtered table (cities with Population > 100000).
cities

Unnamed: 0,Country,City,AccentCity,Population,Latitude,Longitude
0,ae,abu dhabi,Abu Dhabi,603687.0,24.466667,54.366667
1,ae,dubai,Dubai,1137376.0,25.258172,55.304717
2,ae,sharjah,Sharjah,543942.0,25.357310,55.403304
3,af,baglan,Baglan,108481.0,36.130684,68.708286
4,af,gardez,Gardez,103732.0,33.597439,69.225922
...,...,...,...,...,...,...
3522,zw,gweru,Gweru,201879.0,-19.450000,29.816667
3523,zw,harare,Harare,2213701.0,-17.817778,31.044722
3524,zw,kadoma,Kadoma,100276.0,-18.350000,29.916667
3525,zw,kwekwe,Kwekwe,116332.0,-18.916667,29.816667


In [9]:
# Load the country-capitals table without the continent name and country code columns.
capitals = pd.read_csv('./csv/country-capitals.csv')
capitals = capitals.drop(columns=['ContinentName', 'CountryCode'])
capitals

Unnamed: 0,CountryName,CapitalName,Latitude,Longitude
0,Somaliland,Hargeisa,9.550000,44.050000
1,South Georgia and South Sandwich Islands,King Edward Point,-54.283333,-36.500000
2,French Southern and Antarctic Lands,Port-aux-Français,-49.350000,70.216667
3,Palestine,Jerusalem,31.766667,35.233333
4,Aland Islands,Mariehamn,60.116667,19.900000
...,...,...,...,...
237,Zimbabwe,Harare,-17.816667,31.033333
238,Northern Cyprus,North Nicosia,35.183333,33.366667
239,Hong Kong,Hong Kong,22.302711,114.177216
240,British Indian Ocean Territory,Diego Garcia,-7.300000,72.400000


In [11]:
# Load the countries table (name, code, coordinates).
countries = pd.read_csv(
    './csv/countries.csv',
    # Do not convert pandas' default NA markers (e.g. the string "NA") to NaN.
    # NOTE(review): presumably so that a country code such as "NA" is kept as
    # text -- confirm against the CSV.
    keep_default_na=False,
)
countries

Unnamed: 0,Name,Code,Latitude,Longitude
0,Andorra,AD,42.5000,1.5000
1,United Arab Emirates,AE,24.0000,54.0000
2,Afghanistan,AF,33.0000,65.0000
3,Antigua and Barbuda,AG,17.0500,-61.8000
4,Anguilla,AI,18.2500,-63.1667
...,...,...,...,...
244,Yemen,YE,15.0000,48.0000
245,Mayotte,YT,-12.8333,45.1667
246,South Africa,ZA,-29.0000,24.0000
247,Zambia,ZM,-15.0000,30.0000


In [13]:
# Select one model and one context for the interactive experiments below.
model_name = 'bert'
ctx = contexts[0]
token, model = tokenizers[model_name], models[model_name]

In [19]:
# Encode one test sentence ("<context> <entry>") and run it through the model,
# asking for the hidden states of every layer as well.
entry = 'Paris'
inputs = token(f"{ctx} {entry}", return_tensors="pt")
outputs = model(output_hidden_states=True, **inputs)

In [21]:
# Inspect the last-layer hidden states of the test sentence.
outputs.last_hidden_state

tensor([[[ 6.6166e-01,  1.5474e-01,  1.1787e-01,  ..., -6.8726e-02,
           3.0570e-01, -2.5763e-02],
         [ 3.9590e-01,  6.6684e-02,  4.5810e-01,  ...,  4.2802e-01,
           3.7462e-02,  1.2796e-02],
         [ 4.4277e-01,  1.5790e-01, -5.7216e-02,  ...,  6.8028e-01,
          -2.0171e-01, -1.1367e-01],
         [ 1.2365e-01, -1.7644e-02,  5.1875e-01,  ..., -8.7626e-02,
           2.6407e-01, -1.8693e-01],
         [ 1.6168e-01,  1.0686e-01, -1.4996e-02,  ..., -3.0191e-01,
           3.8160e-01,  3.8241e-04],
         [ 1.0472e+00,  4.1524e-02, -5.4551e-01,  ...,  4.7855e-01,
           8.1386e-01, -1.5856e-02]]], grad_fn=<NativeLayerNormBackward>)

In [12]:
# Sanity check: for every city name, verify that the span of tokens located
# right after the context inside the full sentence is exactly the tokenization
# of the entry on its own. The embedding cells rely on this to slice the
# entry's hidden states out of the sentence.
i = 0
ctx = contexts[i]

# Loop-invariant: id of the last token of the context string.
last_ctx_token = token(ctx, add_special_tokens=False)['input_ids'][-1]

for entry in cities.AccentCity:
    inputs = token(ctx + " " + entry, return_tensors="pt")
    sentence_ids = inputs['input_ids'][0].tolist()

    # If the entry is split in multiple tokens, we need to aggregate the tensors
    entry_tokens = token([entry], is_split_into_words=True, add_special_tokens=False)['input_ids']
    expected_length = len(entry_tokens)  # number of tokens for the entry

    # First position after the (first occurrence of the) last context token.
    start_index = sentence_ids.index(last_ctx_token) + 1
    found_tokens = sentence_ids[start_index:start_index + expected_length]

    if found_tokens != entry_tokens:
        raise ValueError(
            f"Token mismatch for entry {entry!r}: "
            f"expected {entry_tokens}, found {found_tokens} in the full sentence"
        )


#entry_embedding = outputs.last_hidden_state[0][start_index:start_index+expected_length].mean(axis=0)

#entry_embedding.shape

# Last layer embedding

In [27]:
# Cities
# For every model and every context, embed each city name as the mean of the
# last-layer hidden states of its tokens, and save one array of shape
# (number of entries, hidden size) per (model, context) pair.
series = cities.AccentCity
for model_name in models_list:
    token = tokenizers[model_name]
    model = models[model_name]
    size = models_output_size[model_name]
    for i, ctx in enumerate(contexts):
        buffer_arr = np.empty((len(series), size))
        # Loop-invariant: last token of the context string
        last_ctx_token = token(ctx, add_special_tokens=False)['input_ids'][-1]

        # Inference only: no autograd graph is needed, which saves time and memory.
        with torch.no_grad():
            for ind, entry in tqdm.tqdm(enumerate(series), desc=model_name + ' : ' + short_ctxts[i], unit_scale=True, total=len(series)):
                inputs = token(ctx + " " + entry, return_tensors="pt")
                outputs = model(**inputs)

                #If the entry is split in multiple tokens, we need to aggregate the tensors
                expected_length = len(token([entry], is_split_into_words=True, add_special_tokens=False)['input_ids'])  # number of tokens for the entry
                start_index = inputs['input_ids'][0].tolist().index(last_ctx_token) + 1  # first index of the entry's tensors

                # Mean over the entry's token positions of the single sentence in the batch.
                entry_embedding = outputs.last_hidden_state[0][start_index:start_index + expected_length].mean(axis=0)
                buffer_arr[ind] = entry_embedding.numpy()

        with open('embd_files/' + model_name + '_' + short_ctxts[i] + '_cities.npy', 'wb') as f:
            np.save(file=f, arr=buffer_arr)

bert : come: 100%|██████████| 3.53k/3.53k [05:13<00:00, 11.2it/s]
bert : lives: 100%|██████████| 3.53k/3.53k [04:32<00:00, 12.9it/s]
bert : moved: 100%|██████████| 3.53k/3.53k [04:16<00:00, 13.7it/s]
bertLarge : come: 100%|██████████| 3.53k/3.53k [13:27<00:00, 4.37it/s]  
bertLarge : lives: 100%|██████████| 3.53k/3.53k [13:33<00:00, 4.34it/s]
bertLarge : moved: 100%|██████████| 3.53k/3.53k [13:38<00:00, 4.31it/s]
gpt2 : come: 100%|██████████| 3.53k/3.53k [04:28<00:00, 13.1it/s] 
gpt2 : lives: 100%|██████████| 3.53k/3.53k [04:18<00:00, 13.6it/s]
gpt2 : moved: 100%|██████████| 3.53k/3.53k [04:04<00:00, 14.4it/s]
roberta : come: 100%|██████████| 3.53k/3.53k [04:03<00:00, 14.5it/s] 
roberta : lives: 100%|██████████| 3.53k/3.53k [03:48<00:00, 15.4it/s]
roberta : moved: 100%|██████████| 3.53k/3.53k [03:54<00:00, 15.0it/s]
mpnet : come: 100%|██████████| 3.53k/3.53k [04:13<00:00, 13.9it/s]
mpnet : lives: 100%|██████████| 3.53k/3.53k [04:06<00:00, 14.3it/s]
mpnet : moved: 100%|██████████| 3.53k

In [28]:
# Capitals
# For every model and every context, embed each capital name as the mean of the
# last-layer hidden states of its tokens, and save one array of shape
# (number of entries, hidden size) per (model, context) pair.
series = capitals.CapitalName
for model_name in models_list:
    token = tokenizers[model_name]
    model = models[model_name]
    size = models_output_size[model_name]
    for i, ctx in enumerate(contexts):
        buffer_arr = np.empty((len(series), size))
        # Loop-invariant: last token of the context string
        last_ctx_token = token(ctx, add_special_tokens=False)['input_ids'][-1]

        # Inference only: no autograd graph is needed, which saves time and memory.
        with torch.no_grad():
            for ind, entry in tqdm.tqdm(enumerate(series), desc=model_name + ' : ' + short_ctxts[i], unit_scale=True, total=len(series)):
                inputs = token(ctx + " " + entry, return_tensors="pt")
                outputs = model(**inputs)

                #If the entry is split in multiple tokens, we need to aggregate the tensors
                expected_length = len(token([entry], is_split_into_words=True, add_special_tokens=False)['input_ids'])  # number of tokens for the entry
                start_index = inputs['input_ids'][0].tolist().index(last_ctx_token) + 1  # first index of the entry's tensors

                # Mean over the entry's token positions of the single sentence in the batch.
                entry_embedding = outputs.last_hidden_state[0][start_index:start_index + expected_length].mean(axis=0)
                buffer_arr[ind] = entry_embedding.numpy()

        with open('embd_files/' + model_name + '_' + short_ctxts[i] + '_capitals.npy', 'wb') as f:
            np.save(file=f, arr=buffer_arr)

bert : come: 100%|██████████| 242/242 [00:18<00:00, 12.9it/s] 
bert : lives: 100%|██████████| 242/242 [00:16<00:00, 14.4it/s] 
bert : moved: 100%|██████████| 242/242 [00:18<00:00, 13.1it/s] 
bertLarge : come: 100%|██████████| 242/242 [00:57<00:00, 4.23it/s] 
bertLarge : lives: 100%|██████████| 242/242 [00:55<00:00, 4.36it/s] 
bertLarge : moved: 100%|██████████| 242/242 [00:55<00:00, 4.33it/s] 
gpt2 : come: 100%|██████████| 242/242 [00:17<00:00, 13.6it/s] 
gpt2 : lives: 100%|██████████| 242/242 [00:17<00:00, 13.9it/s] 
gpt2 : moved: 100%|██████████| 242/242 [00:17<00:00, 13.9it/s] 
roberta : come: 100%|██████████| 242/242 [00:17<00:00, 14.0it/s] 
roberta : lives: 100%|██████████| 242/242 [00:20<00:00, 12.0it/s] 
roberta : moved: 100%|██████████| 242/242 [00:17<00:00, 13.6it/s] 
mpnet : come: 100%|██████████| 242/242 [00:17<00:00, 14.1it/s] 
mpnet : lives: 100%|██████████| 242/242 [00:17<00:00, 14.2it/s] 
mpnet : moved: 100%|██████████| 242/242 [00:16<00:00, 14.4it/s] 


In [29]:
# Countries
# For every model and every context, embed each country name as the mean of the
# last-layer hidden states of its tokens, and save one array of shape
# (number of entries, hidden size) per (model, context) pair.
series = countries.Name
for model_name in models_list:
    token = tokenizers[model_name]
    model = models[model_name]
    size = models_output_size[model_name]
    for i, ctx in enumerate(contexts):
        buffer_arr = np.empty((len(series), size))
        # Loop-invariant: last token of the context string
        last_ctx_token = token(ctx, add_special_tokens=False)['input_ids'][-1]

        # Inference only: no autograd graph is needed, which saves time and memory.
        with torch.no_grad():
            for ind, entry in tqdm.tqdm(enumerate(series), desc=model_name + ' : ' + short_ctxts[i], unit_scale=True, total=len(series)):
                inputs = token(ctx + " " + entry, return_tensors="pt")
                outputs = model(**inputs)

                #If the entry is split in multiple tokens, we need to aggregate the tensors
                expected_length = len(token([entry], is_split_into_words=True, add_special_tokens=False)['input_ids'])  # number of tokens for the entry
                start_index = inputs['input_ids'][0].tolist().index(last_ctx_token) + 1  # first index of the entry's tensors

                # Mean over the entry's token positions of the single sentence in the batch.
                entry_embedding = outputs.last_hidden_state[0][start_index:start_index + expected_length].mean(axis=0)
                buffer_arr[ind] = entry_embedding.numpy()

        with open('embd_files/' + model_name + '_' + short_ctxts[i] + '_countries.npy', 'wb') as f:
            np.save(file=f, arr=buffer_arr)

bert : come: 100%|██████████| 249/249 [00:18<00:00, 13.6it/s] 
bert : lives: 100%|██████████| 249/249 [00:18<00:00, 13.8it/s] 
bert : moved: 100%|██████████| 249/249 [00:17<00:00, 13.8it/s] 
bertLarge : come: 100%|██████████| 249/249 [00:57<00:00, 4.34it/s] 
bertLarge : lives: 100%|██████████| 249/249 [00:59<00:00, 4.18it/s] 
bertLarge : moved: 100%|██████████| 249/249 [00:56<00:00, 4.42it/s] 
gpt2 : come: 100%|██████████| 249/249 [00:18<00:00, 13.8it/s] 
gpt2 : lives: 100%|██████████| 249/249 [00:18<00:00, 13.8it/s] 
gpt2 : moved: 100%|██████████| 249/249 [00:18<00:00, 13.4it/s] 
roberta : come: 100%|██████████| 249/249 [00:17<00:00, 13.9it/s] 
roberta : lives: 100%|██████████| 249/249 [00:17<00:00, 14.1it/s] 
roberta : moved: 100%|██████████| 249/249 [00:17<00:00, 14.0it/s] 
mpnet : come: 100%|██████████| 249/249 [00:17<00:00, 14.4it/s] 
mpnet : lives: 100%|██████████| 249/249 [00:17<00:00, 14.1it/s] 
mpnet : moved: 100%|██████████| 249/249 [00:17<00:00, 14.2it/s] 


# Average of last 4 layers

In [45]:
# Cities
# For every model and every context, embed each city name from the average of
# the last four hidden layers: layers are averaged position-wise first, then the
# entry's token positions are averaged. One array of shape
# (number of entries, hidden size) is saved per (model, context) pair.
series = cities.AccentCity
for model_name in models_list:
    token = tokenizers[model_name]
    model = models[model_name]
    size = models_output_size[model_name]
    for i, ctx in enumerate(contexts):
        buffer_arr = np.empty((len(series), size))
        # Loop-invariant: last token of the context string
        last_ctx_token = token(ctx, add_special_tokens=False)['input_ids'][-1]

        # Inference only: no autograd graph is needed, which saves time and memory.
        with torch.no_grad():
            for ind, entry in tqdm.tqdm(enumerate(series), desc=model_name + ' : ' + short_ctxts[i], unit_scale=True, total=len(series)):
                inputs = token(ctx + " " + entry, return_tensors="pt")
                outputs = model(**inputs, output_hidden_states=True)

                #If the entry is split in multiple tokens, we need to aggregate the tensors
                expected_length = len(token([entry], is_split_into_words=True, add_special_tokens=False)['input_ids'])  # number of tokens for the entry
                start_index = inputs['input_ids'][0].tolist().index(last_ctx_token) + 1  # first index of the entry's tensors

                # Stack the last four layers and keep the single sentence of the batch (index 0).
                last_four_layers = torch.stack(outputs.hidden_states[-4:], axis=0)[:, 0, :, :]
                # Average across layers, slice out the entry's tokens, then average across tokens.
                entry_embedding = last_four_layers.mean(axis=0)[start_index:start_index + expected_length].mean(axis=0)
                buffer_arr[ind] = entry_embedding.numpy()

        with open('embd_files/4layers_' + model_name + '_' + short_ctxts[i] + '_cities.npy', 'wb') as f:
            np.save(file=f, arr=buffer_arr)

bert : come: 100%|██████████| 3.53k/3.53k [04:07<00:00, 14.2it/s] 
bert : lives: 100%|██████████| 3.53k/3.53k [04:21<00:00, 13.5it/s]
bert : moved: 100%|██████████| 3.53k/3.53k [04:17<00:00, 13.7it/s]
bertLarge : come: 100%|██████████| 3.53k/3.53k [13:37<00:00, 4.31it/s] 
bertLarge : lives: 100%|██████████| 3.53k/3.53k [13:13<00:00, 4.44it/s]
bertLarge : moved: 100%|██████████| 3.53k/3.53k [13:05<00:00, 4.49it/s]
gpt2 : come: 100%|██████████| 3.53k/3.53k [03:55<00:00, 14.9it/s] 
gpt2 : lives: 100%|██████████| 3.53k/3.53k [03:56<00:00, 14.9it/s]
gpt2 : moved: 100%|██████████| 3.53k/3.53k [03:53<00:00, 15.1it/s]
roberta : come: 100%|██████████| 3.53k/3.53k [03:53<00:00, 15.1it/s] 
roberta : lives: 100%|██████████| 3.53k/3.53k [03:53<00:00, 15.1it/s]
roberta : moved: 100%|██████████| 3.53k/3.53k [03:53<00:00, 15.1it/s]
mpnet : come: 100%|██████████| 3.53k/3.53k [03:49<00:00, 15.3it/s] 
mpnet : lives: 100%|██████████| 3.53k/3.53k [03:49<00:00, 15.4it/s]
mpnet : moved: 100%|██████████| 3.53

In [46]:
# Capitals
# For every model and every context, embed each capital name from the average
# of the last four hidden layers: layers are averaged position-wise first, then
# the entry's token positions are averaged. One array of shape
# (number of entries, hidden size) is saved per (model, context) pair.
series = capitals.CapitalName
for model_name in models_list:
    token = tokenizers[model_name]
    model = models[model_name]
    size = models_output_size[model_name]
    for i, ctx in enumerate(contexts):
        buffer_arr = np.empty((len(series), size))
        # Loop-invariant: last token of the context string
        last_ctx_token = token(ctx, add_special_tokens=False)['input_ids'][-1]

        # Inference only: no autograd graph is needed, which saves time and memory.
        with torch.no_grad():
            for ind, entry in tqdm.tqdm(enumerate(series), desc=model_name + ' : ' + short_ctxts[i], unit_scale=True, total=len(series)):
                inputs = token(ctx + " " + entry, return_tensors="pt")
                outputs = model(**inputs, output_hidden_states=True)

                #If the entry is split in multiple tokens, we need to aggregate the tensors
                expected_length = len(token([entry], is_split_into_words=True, add_special_tokens=False)['input_ids'])  # number of tokens for the entry
                start_index = inputs['input_ids'][0].tolist().index(last_ctx_token) + 1  # first index of the entry's tensors

                # Stack the last four layers and keep the single sentence of the batch (index 0).
                last_four_layers = torch.stack(outputs.hidden_states[-4:], axis=0)[:, 0, :, :]
                # Average across layers, slice out the entry's tokens, then average across tokens.
                entry_embedding = last_four_layers.mean(axis=0)[start_index:start_index + expected_length].mean(axis=0)
                buffer_arr[ind] = entry_embedding.numpy()

        with open('embd_files/4layers_' + model_name + '_' + short_ctxts[i] + '_capitals.npy', 'wb') as f:
            np.save(file=f, arr=buffer_arr)

bert : come: 100%|██████████| 242/242 [00:16<00:00, 14.6it/s] 
bert : lives: 100%|██████████| 242/242 [00:16<00:00, 14.9it/s] 
bert : moved: 100%|██████████| 242/242 [00:16<00:00, 14.8it/s] 
bertLarge : come: 100%|██████████| 242/242 [00:55<00:00, 4.38it/s] 
bertLarge : lives: 100%|██████████| 242/242 [00:52<00:00, 4.62it/s] 
bertLarge : moved: 100%|██████████| 242/242 [00:54<00:00, 4.44it/s] 
gpt2 : come: 100%|██████████| 242/242 [00:15<00:00, 15.2it/s] 
gpt2 : lives: 100%|██████████| 242/242 [00:16<00:00, 15.0it/s] 
gpt2 : moved: 100%|██████████| 242/242 [00:15<00:00, 15.5it/s] 
roberta : come: 100%|██████████| 242/242 [00:16<00:00, 14.9it/s] 
roberta : lives: 100%|██████████| 242/242 [00:15<00:00, 15.4it/s] 
roberta : moved: 100%|██████████| 242/242 [00:16<00:00, 14.9it/s] 
mpnet : come: 100%|██████████| 242/242 [00:15<00:00, 15.5it/s] 
mpnet : lives: 100%|██████████| 242/242 [00:15<00:00, 15.5it/s] 
mpnet : moved: 100%|██████████| 242/242 [00:15<00:00, 15.2it/s] 


In [44]:
# Countries
# For every model and every context, embed each country name from the average
# of the last four hidden layers: layers are averaged position-wise first, then
# the entry's token positions are averaged. One array of shape
# (number of entries, hidden size) is saved per (model, context) pair.
series = countries.Name
for model_name in models_list:
    token = tokenizers[model_name]
    model = models[model_name]
    size = models_output_size[model_name]
    for i, ctx in enumerate(contexts):
        buffer_arr = np.empty((len(series), size))
        # Loop-invariant: last token of the context string
        last_ctx_token = token(ctx, add_special_tokens=False)['input_ids'][-1]

        # Inference only: no autograd graph is needed, which saves time and memory.
        with torch.no_grad():
            for ind, entry in tqdm.tqdm(enumerate(series), desc=model_name + ' : ' + short_ctxts[i], unit_scale=True, total=len(series)):
                inputs = token(ctx + " " + entry, return_tensors="pt")
                outputs = model(**inputs, output_hidden_states=True)

                #If the entry is split in multiple tokens, we need to aggregate the tensors
                expected_length = len(token([entry], is_split_into_words=True, add_special_tokens=False)['input_ids'])  # number of tokens for the entry
                start_index = inputs['input_ids'][0].tolist().index(last_ctx_token) + 1  # first index of the entry's tensors

                # Stack the last four layers and keep the single sentence of the batch (index 0).
                last_four_layers = torch.stack(outputs.hidden_states[-4:], axis=0)[:, 0, :, :]
                # Average across layers, slice out the entry's tokens, then average across tokens.
                entry_embedding = last_four_layers.mean(axis=0)[start_index:start_index + expected_length].mean(axis=0)
                buffer_arr[ind] = entry_embedding.numpy()

        with open('embd_files/4layers_' + model_name + '_' + short_ctxts[i] + '_countries.npy', 'wb') as f:
            np.save(file=f, arr=buffer_arr)

bert : come: 100%|██████████| 249/249 [00:16<00:00, 15.3it/s] 
bert : lives: 100%|██████████| 249/249 [00:15<00:00, 16.0it/s] 
bert : moved: 100%|██████████| 249/249 [00:15<00:00, 15.9it/s] 
bertLarge : come: 100%|██████████| 249/249 [00:53<00:00, 4.63it/s] 
bertLarge : lives: 100%|██████████| 249/249 [00:50<00:00, 4.89it/s] 
bertLarge : moved: 100%|██████████| 249/249 [00:49<00:00, 5.02it/s] 
gpt2 : come: 100%|██████████| 249/249 [00:17<00:00, 14.0it/s] 
gpt2 : lives: 100%|██████████| 249/249 [00:15<00:00, 16.5it/s] 
gpt2 : moved: 100%|██████████| 249/249 [00:15<00:00, 16.1it/s] 
roberta : come: 100%|██████████| 249/249 [00:15<00:00, 15.9it/s] 
roberta : lives: 100%|██████████| 249/249 [00:15<00:00, 16.2it/s] 
roberta : moved: 100%|██████████| 249/249 [00:15<00:00, 16.6it/s] 
mpnet : come: 100%|██████████| 249/249 [00:17<00:00, 14.5it/s] 
mpnet : lives: 100%|██████████| 249/249 [00:16<00:00, 15.5it/s] 
mpnet : moved: 100%|██████████| 249/249 [00:16<00:00, 15.0it/s] 
