In [6]:
import os
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.stats import spearmanr
from collections import defaultdict, Counter


from sklearn.cross_decomposition import PLSRegression

from sklearn.model_selection import cross_val_predict

from sklearn.decomposition import SparsePCA 

In [36]:
# function to estimate corrs 
def estimate_corrs(y_original, y_inferred, labels, corr_type):
    """Compute Spearman correlations between two equally shaped 2-D matrices.

    Parameters:
    y_original: 2-D array-like with the reference values.
    y_inferred: 2-D array-like with the predicted values, same shape as
                ``y_original``.
    labels: sequence of names used as keys for the individual correlations;
            ``labels[i]`` names row ``i`` ("byrow") or column ``i`` ("bycolumn").
    corr_type: "byrow" to correlate matching rows of the two matrices,
               "bycolumn" to correlate matching columns.

    Returns:
    (corrs, average_rho): a Counter mapping each label to its Spearman rho, and
    the average of all the rho values (NaN when no correlation was computed or
    when any individual rho is NaN).

    Raises:
    ValueError: if ``corr_type`` is neither "byrow" nor "bycolumn".
    """
    # Fail loudly on a misspelled mode instead of silently returning an empty
    # Counter and a NaN average.
    if corr_type not in ("byrow", "bycolumn"):
        raise ValueError(
            f"corr_type must be 'byrow' or 'bycolumn', got {corr_type!r}"
        )

    y_original = np.asarray(y_original)
    y_inferred = np.asarray(y_inferred)

    corrs = Counter()

    if corr_type == "byrow":
        for i in range(len(y_original)):
            rho, _ = spearmanr(y_original[i], y_inferred[i])
            corrs[labels[i]] = rho
    else:
        for i in range(y_original.shape[1]):
            rho, _ = spearmanr(y_original[:, i], y_inferred[:, i])
            corrs[labels[i]] = rho

    rho_values = list(corrs.values())
    # Avoid NumPy's "mean of empty slice" warning when there is nothing to average.
    average_rho = np.average(rho_values) if rho_values else np.nan

    return corrs, average_rho

#helper function to load the embeddings and reshape them along with the sspace
def _min_max_scale(frame):
    """Rescale every column of ``frame`` to [0, 1]; constant columns become 0."""
    col_min = frame.min()
    col_range = frame.max() - col_min
    # A constant column has a zero range: divide by 1 instead so it maps to 0
    # rather than to NaN (0/0).
    col_range = col_range.where(col_range != 0, 1)
    return (frame - col_min) / col_range


def embeddings_preparation(model_emb_path, sspace_nsubj, sspace_dobj, model_name="",
                           return_new_sspace=False, dir_path=None):
    """Load a model's embeddings and align them with the two semantic spaces.

    Parameters:
    model_emb_path: file name under which the pickled embeddings are saved.
    sspace_nsubj: the semantic space (DataFrame) loaded for the nsubj argument.
    sspace_dobj: the semantic space (DataFrame) loaded for the dobj argument.
    model_name: name of the model; used as prefix for the keys of the returned
                dictionaries.
    return_new_sspace: if True, also return the semantic spaces restricted to
                       the rows found in the embeddings.
    dir_path: directory containing the embeddings file. Defaults to
              ``../Data/Extracted_Embeddings`` relative to the working directory.

    Returns:
    (spaces_dict_nsubj, spaces_dict_dobj), each holding the min-max scaled
    embedding matrix under ``<model_name>_<arg>`` and a uniform random matrix
    of the same shape under ``<model_name>_<arg>_baseline``. When
    ``return_new_sspace`` is True the two restricted semantic spaces are
    appended to the returned tuple.
    """
    # Build the default location with os.path.join so it also resolves on
    # non-Windows systems.
    if dir_path is None:
        dir_path = os.path.join("..", "Data", "Extracted_Embeddings")

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(os.path.join(dir_path, model_emb_path), "rb") as infile:
        embeddings_dict = pickle.load(infile)

    # One row per key of the embeddings dictionary.
    emb_df = pd.DataFrame(embeddings_dict).T

    # Keep only the embeddings whose key appears in the semantic space index,
    # then min-max scale each embedding dimension.
    model_nsubj = _min_max_scale(emb_df.filter(items=sspace_nsubj.index.tolist(), axis=0))
    model_dobj = _min_max_scale(emb_df.filter(items=sspace_dobj.index.tolist(), axis=0))

    # Restrict the semantic spaces to the rows that survived the filter above.
    sspace_nsubj_final = sspace_nsubj.filter(model_nsubj.index.tolist(), axis=0)
    sspace_dobj_final = sspace_dobj.filter(model_dobj.index.tolist(), axis=0)

    # Store each space together with a random baseline of the same shape.
    # The nsubj baseline is drawn before the dobj one (order matters for a
    # seeded global RNG).
    spaces_dict_nsubj = {
        model_name + "_nsubj": model_nsubj.values,
        model_name + "_nsubj_baseline": np.random.random_sample(model_nsubj.shape),
    }
    spaces_dict_dobj = {
        model_name + "_dobj": model_dobj.values,
        model_name + "_dobj_baseline": np.random.random_sample(model_dobj.shape),
    }

    if return_new_sspace:
        return spaces_dict_nsubj, spaces_dict_dobj, sspace_nsubj_final, sspace_dobj_final
    return spaces_dict_nsubj, spaces_dict_dobj


#helper function to automate correlations estimation
def get_correlations_values(model_space: dict, semantic_space: pd.DataFrame, sPca=True,
                            n_pls_components=10, n_spca_components=14, cv=10):
    """Map each space in ``model_space`` onto the semantic space and score it.

    Parameters:
    model_space: a dictionary of feature matrices (e.g. the model embedding
                 space for a grammatical argument and a random baseline of the
                 same shape), one row per row of ``semantic_space``.
    semantic_space: DataFrame with the target semantic space. Used as y; its
                    index provides the row labels and, when sPCA is off, its
                    columns provide the column labels.
    sPca: if True (default) the target space is transformed with SparsePCA
          before the mapping.
    n_pls_components: number of components of the PLS regressor.
    n_spca_components: number of SparsePCA components (used only if ``sPca``).
    cv: cross-validation setting forwarded to ``cross_val_predict``.

    Returns:
    (all_corr, all_avg_rhos): two dictionaries keyed by space name and then by
    'byrow' / 'bycolumn', holding respectively the per-label correlations and
    their average.
    """
    # Row labels for the by-row correlations.
    verbs = semantic_space.index.tolist()

    if sPca:
        print("Sparse PCA activated\n")
        spca = SparsePCA(n_components=n_spca_components)
        y = spca.fit_transform(semantic_space.values)
        # After the transform the columns are sPCA components, not the original
        # properties, so they get their own labels instead of borrowing the
        # names of the first few properties.
        column_labels = [f"sPCA_component_{i}" for i in range(y.shape[1])]
    else:
        print("Not using Sparse PCA")
        y = semantic_space.values
        column_labels = semantic_space.columns.tolist()

    pls = PLSRegression(n_components=n_pls_components)
    all_corr = defaultdict(dict)
    all_avg_rhos = defaultdict(dict)

    for space_name, X in model_space.items():
        # Out-of-fold predictions for every row.
        y_pred = cross_val_predict(pls, X, y, cv=cv)

        # Store and print correlation by row.
        corrs, avg_rho = estimate_corrs(y, y_pred, verbs, 'byrow')
        print(f"Average row correlation for the {space_name} space: {avg_rho}")
        all_corr[space_name]['byrow'] = corrs
        all_avg_rhos[space_name]['byrow'] = avg_rho

        # Store and print correlation by column.
        corrs, avg_rho = estimate_corrs(y, y_pred, column_labels, 'bycolumn')
        print(f"Average column correlation for the {space_name} space: {avg_rho}\n")
        all_corr[space_name]['bycolumn'] = corrs
        all_avg_rhos[space_name]['bycolumn'] = avg_rho

    return all_corr, all_avg_rhos

def write_corrs(all_avg_rhos):
    """Placeholder, not implemented: ignores ``all_avg_rhos`` and returns None.

    Presumably meant to write the average correlations somewhere -- TODO confirm.
    """
    return None

In [28]:
# Seed NumPy's global RNG, which embeddings_preparation uses
# (np.random.random_sample) to draw the random baseline spaces.
np.random.seed(42)

In [29]:
# Load the semantic spaces for both arguments, indexed by 'Token.Sent'.
# The paths are built with os.path.join so they resolve on any OS, not only Windows.
sspace_dir = os.path.join("..", "Data", "Semantic_Spaces")
sspace_nsubj = pd.read_csv(os.path.join(sspace_dir, "sspace_spr2_nsubj.csv"), index_col='Token.Sent')
sspace_dobj = pd.read_csv(os.path.join(sspace_dir, "sspace_spr2_dobj.csv"), index_col='Token.Sent')

## Mapping BabyBERTa

In [27]:
# BabyBERTa
model_emb_path = "target_embeddings_BabyBERTa-2.pkl"

# return_new_sspace=True: besides the two dictionaries of spaces, also get the
# semantic spaces restricted to the rows found in the embeddings.
(
    bb_dict_nsubj,
    bb_dict_dobj,
    final_sspace_nsubj,
    final_sspace_dobj,
) = embeddings_preparation(
    model_emb_path,
    sspace_nsubj,
    sspace_dobj,
    model_name="BabyBERTa",
    return_new_sspace=True,
)

# Map the embeddings (and the random baseline) onto each target space.
bb_corrs_nsubj, bb_avgr_nsubj = get_correlations_values(
    bb_dict_nsubj, final_sspace_nsubj
)
bb_corrs_dobj, bb_avgr_dobj = get_correlations_values(
    bb_dict_dobj, final_sspace_dobj
)

Sparse PCA activated

Average row correlation for the BabyBERTa_nsubj space: 0.3454790023254753
Average column correlation for the BabyBERTa_nsubj space: 0.3570435498045595

Average row correlation for the BabyBERTa_nsubj_baseline space: -0.004751265332178196
Average column correlation for the BabyBERTa_nsubj_baseline space: -0.018592378507054187

Sparse PCA activated

Average row correlation for the BabyBERTa_dobj space: 0.1994090416625628
Average column correlation for the BabyBERTa_dobj space: 0.20121971805471586

Average row correlation for the BabyBERTa_dobj_baseline space: -0.03594996552743032
Average column correlation for the BabyBERTa_dobj_baseline space: -0.03244091461116801



In [39]:
# BabyBERTa again, this time on the raw semantic spaces (sPCA switched off).
# NOTE: this overwrites the results stored under the same names by the sPCA run.
# NOTE(review): in the recorded output the random baseline also reaches a high
# average row correlation (~0.71 for nsubj), so the row-wise scores on the raw
# space should be read relative to that baseline.
bb_corrs_nsubj, bb_avgr_nsubj = get_correlations_values(
    bb_dict_nsubj,
    final_sspace_nsubj,
    sPca=False,
)
bb_corrs_dobj, bb_avgr_dobj = get_correlations_values(
    bb_dict_dobj,
    final_sspace_dobj,
    sPca=False,
)

Not using Sparse PCA
Average row correlation for the BabyBERTa_nsubj space: 0.7710238419553743
Average column correlation for the BabyBERTa_nsubj space: 0.3570464734001003

Average row correlation for the BabyBERTa_nsubj_baseline space: 0.7079004224299861
Average column correlation for the BabyBERTa_nsubj_baseline space: -0.018589531823358273

Not using Sparse PCA
Average row correlation for the BabyBERTa_dobj space: 0.6575879560911262
Average column correlation for the BabyBERTa_dobj space: 0.20120690436413316

Average row correlation for the BabyBERTa_dobj_baseline space: 0.5645505268416493
Average column correlation for the BabyBERTa_dobj_baseline space: -0.03241292901561521



## Mapping Pythia

In [41]:
# Pythia
model_emb_path = "target_embeddings_pythia-70m-deduped.pkl"

# Request the semantic spaces aligned to *this* model's embeddings instead of
# reusing final_sspace_* (aligned to BabyBERTa in an earlier cell): if the two
# models do not cover exactly the same rows, the reused targets would not match
# the rows of the Pythia feature matrices.
(
    pythia_dict_nsubj,
    pythia_dict_dobj,
    pythia_sspace_nsubj,
    pythia_sspace_dobj,
) = embeddings_preparation(
    model_emb_path,
    sspace_nsubj,
    sspace_dobj,
    model_name="Pythia70m",
    return_new_sspace=True,
)

pythia_corrs_nsubj, pythia_avgr_nsubj = get_correlations_values(pythia_dict_nsubj, pythia_sspace_nsubj)
pythia_corrs_dobj, pythia_avgr_dobj = get_correlations_values(pythia_dict_dobj, pythia_sspace_dobj)



Sparse PCA activated

Average row correlation for the Pythia70m_nsubj space: 0.40719985408782094
Average column correlation for the Pythia70m_nsubj space: 0.3987736007744081

Average row correlation for the Pythia70m_nsubj_baseline space: -0.02140577265058593
Average column correlation for the Pythia70m_nsubj_baseline space: -0.01697139281683269

Sparse PCA activated

Average row correlation for the Pythia70m_dobj space: 0.21492591915127127
Average column correlation for the Pythia70m_dobj space: 0.21103843645134115

Average row correlation for the Pythia70m_dobj_baseline space: -0.012480477269209658
Average column correlation for the Pythia70m_dobj_baseline space: -0.00791572584138237



In [42]:
# Pythia again, this time on the raw semantic spaces (sPCA switched off).
# NOTE: final_sspace_* are the targets aligned in the BabyBERTa cell, so this
# assumes the Pythia embeddings cover the same rows; it also overwrites the
# results stored under the same names by the sPCA run.
# NOTE(review): in the recorded output the random baseline also reaches a high
# average row correlation (~0.69 for nsubj); read row-wise scores relative to it.
pythia_corrs_nsubj, pythia_avgr_nsubj = get_correlations_values(
    pythia_dict_nsubj,
    final_sspace_nsubj,
    sPca=False,
)
pythia_corrs_dobj, pythia_avgr_dobj = get_correlations_values(
    pythia_dict_dobj,
    final_sspace_dobj,
    sPca=False,
)

Not using Sparse PCA
Average row correlation for the Pythia70m_nsubj space: 0.7816389431104404
Average column correlation for the Pythia70m_nsubj space: 0.39878595139884104

Average row correlation for the Pythia70m_nsubj_baseline space: 0.6869785933114928
Average column correlation for the Pythia70m_nsubj_baseline space: -0.016999105149870172

Not using Sparse PCA
Average row correlation for the Pythia70m_dobj space: 0.6682005261153099
Average column correlation for the Pythia70m_dobj space: 0.21107765574626722

Average row correlation for the Pythia70m_dobj_baseline space: 0.5479877990161518
Average column correlation for the Pythia70m_dobj_baseline space: -0.008249606870172932



## Mapping GPT2-XL

In [33]:
# GPT2-XL
model_emb_path = "target_embeddings_gpt2-xl.pkl"

# Request the semantic spaces aligned to *this* model's embeddings instead of
# reusing final_sspace_* (aligned to BabyBERTa in an earlier cell): if the two
# models do not cover exactly the same rows, the reused targets would not match
# the rows of the GPT2-XL feature matrices.
(
    gpt2xl_dict_nsubj,
    gpt2xl_dict_dobj,
    gpt2xl_sspace_nsubj,
    gpt2xl_sspace_dobj,
) = embeddings_preparation(
    model_emb_path,
    sspace_nsubj,
    sspace_dobj,
    model_name="GPT2-XL",
    return_new_sspace=True,
)

gpt2xl_corrs_nsubj, gpt2xl_avgr_nsubj = get_correlations_values(gpt2xl_dict_nsubj, gpt2xl_sspace_nsubj)
gpt2xl_corrs_dobj, gpt2xl_avgr_dobj = get_correlations_values(gpt2xl_dict_dobj, gpt2xl_sspace_dobj)

Sparse PCA activated

Average row correlation for the GPT2-XL_nsubj space: 0.4234804614472664
Average column correlation for the GPT2-XL_nsubj space: 0.4076927006612456

Average row correlation for the GPT2-XL_nsubj_baseline space: -0.00731840773334549
Average column correlation for the GPT2-XL_nsubj_baseline space: -0.0008493025021799626

Sparse PCA activated

Average row correlation for the GPT2-XL_dobj space: 0.2559327995947714
Average column correlation for the GPT2-XL_dobj space: 0.26106065176527615

Average row correlation for the GPT2-XL_dobj_baseline space: -0.026044378157054213
Average column correlation for the GPT2-XL_dobj_baseline space: -0.020931897352724566



In [40]:
# GPT2-XL again, this time on the raw semantic spaces (sPCA switched off).
# NOTE: final_sspace_* are the targets aligned in the BabyBERTa cell, so this
# assumes the GPT2-XL embeddings cover the same rows; it also overwrites the
# results stored under the same names by the sPCA run.
# NOTE(review): in the recorded output the random baseline also reaches a high
# average row correlation (~0.65 for nsubj); read row-wise scores relative to it.
gpt2xl_corrs_nsubj, gpt2xl_avgr_nsubj = get_correlations_values(
    gpt2xl_dict_nsubj,
    final_sspace_nsubj,
    sPca=False,
)
gpt2xl_corrs_dobj, gpt2xl_avgr_dobj = get_correlations_values(
    gpt2xl_dict_dobj,
    final_sspace_dobj,
    sPca=False,
)

Not using Sparse PCA
Average row correlation for the GPT2-XL_nsubj space: 0.7870254165356915
Average column correlation for the GPT2-XL_nsubj space: 0.4077370644255654

Average row correlation for the GPT2-XL_nsubj_baseline space: 0.6539906945969608
Average column correlation for the GPT2-XL_nsubj_baseline space: -0.0008460060457187347

Not using Sparse PCA
Average row correlation for the GPT2-XL_dobj space: 0.6824411759289376
Average column correlation for the GPT2-XL_dobj space: 0.26109680628558146

Average row correlation for the GPT2-XL_dobj_baseline space: 0.5649349274801263
Average column correlation for the GPT2-XL_dobj_baseline space: -0.020835481530775905

