# Phrase Similarity task

In this notebook, we demonstrate how to use a trained syntactic rand-walk model for the phrase similarity task introduced by Mitchell and Lapata in [this paper](https://onlinelibrary.wiley.com/doi/pdf/10.1111/j.1551-6709.2010.01106.x). The dataset is available [here](http://homepages.inf.ed.ac.uk/s0453356/share). We compare various composition techniques based on the syntactic rand-walk model. 

In [2]:
import numpy as np
import pandas as pd
import tensor_operations as to
from collections import defaultdict
from scipy import linalg as la
from scipy.stats import spearmanr, pearsonr

In [3]:
# set paths to important files
vocab_file = "../../datasets/rw_vocab_no_stopwords.txt"
embedding_file = "../../datasets/rw_vectors.txt"
param_file = "/usr/xtmp/abef/learned_params_dep_an_rw.npz"
phrase_file = "../../datasets/phrase_sim.txt"

In [4]:
# load in the vocab, create mapping from word to index
vocab = []
with open(vocab_file,"r") as f:
    for line in f:
        vocab.append(line.strip("\n"))
vocab_dict = defaultdict(lambda : -1) # this will return index -1 if key not found
for i, w in enumerate(vocab):
    vocab_dict[w] = i
    
# load in the word embeddings, compute norm of each embedding
vectors = np.loadtxt(embedding_file)
norms = la.norm(vectors,axis=1)

# load in the learned composition tensor
params = np.load(param_file)
T = params["arr_0"]

In [11]:
# load in phrase similarity dataset
phrases_all = pd.read_csv(phrase_file, sep=" ")
phrases = phrases_all.loc[phrases_all["type"]=="adjectivenouns"] # only use adjective-noun phrases

# get word frequencies -- used in sif composition
freqs = {}
tot = 0
with open("../../datasets/semvec_2015/enwiki_vocab.txt", "r") as f:
    for line in f:
        toks = line.strip().split(" ")
        freqs[toks[0]] = int(toks[1])
        tot += int(toks[1])
for item in freqs.items():
    freqs[item[0]] /= tot 

In [21]:
def cosine(v1, v2):
    """
    Return the cosine similarity of two vectors
    """
    return np.dot(v1,v2)/np.sqrt(np.dot(v1,v1)*np.dot(v2,v2))

def get_additive_score(vectors, phrase_df, vocab_dict, b):
    """
    Compute the spearman and pearson correlations between phrases
    according to the additive composition with weight parameter b,
    i.e. if v_a and v_n are the embeddings for the adjective and 
    noun, respectively, then use the composition b*v_a + v_n.
    
    Parameters
    ----------
    vectors : ndarray of shape (N,d)
       The word embeddings (one embedding per row)
    phrase_df : pandas DataFrame containing the phrase pairs and human similarities
    vocab_dict : dictionary that maps word to index
    b : nonnegative float
       The weighting parameter for the adjective embedding in the composition
    
    Returns
    -------
    (spear, pears) : the two similarity scores computed by the embeddings
    """
    score = []
    sim = []
    for i in range(phrase_df.shape[0]):
        a1 = phrase_df.iloc[i,3]
        n1 = phrase_df.iloc[i,4]
        a2 = phrase_df.iloc[i,5]
        n2 = phrase_df.iloc[i,6]

        a1_ind = vocab_dict[a1]
        n1_ind = vocab_dict[n1]
        a2_ind = vocab_dict[a2]
        n2_ind = vocab_dict[n2]

        p1 = b*vectors[a1_ind]+vectors[n1_ind]
        p2 = b*vectors[a2_ind]+vectors[n2_ind]
        
        sim.append(phrase_df.iloc[i,7])
        score.append(cosine(p1,p2))
    return spearmanr(score,sim)[0],pearsonr(score,sim)[0]

def get_tensor_score(T,vectors, phrase_df, vocab_dict, b, vectors_T = None, a=1):
    """
    Compute the spearman and pearson correlations between phrases
    according to the tensor composition with weight parameters a and b,
    i.e. if v_a and v_n are the embeddings for the adjective and 
    noun, respectively, then use the composition a*v_a + v_n + b*T(v_a,v_n,.).
    
    Parameters
    ----------
    T : ndarray of shape (d,d,d)
       The composition tensor
    vectors : ndarray of shape (N,d)
       The word embeddings (one embedding per row)
    phrase_df : pandas DataFrame containing the phrase pairs and human similarities
    vocab_dict : dictionary that maps word to index
    b : nonnegative float
       The weighting parameter for the tensor component
    a : nonnegative float (optional, defaults to 1)
       The weighting parameter for the adjective embedding
    vectors_T : ndarray of shape (N,d) (optional, defaults to None)
       Gives an optional set of embeddings used specifically in the tensor component,
       If None, the input vectors is used for tensor component as well. 
    
    Returns
    -------
    (spear, pears) : the two similarity scores computed by the embeddings
    """
    score = []
    sim = []
    if not vectors_T:
        vectors_T = vectors
        
    for i in range(phrase_df.shape[0]):
        a1 = phrase_df.iloc[i,3]
        n1 = phrase_df.iloc[i,4]
        a2 = phrase_df.iloc[i,5]
        n2 = phrase_df.iloc[i,6]

        a1_ind = vocab_dict[a1]
        n1_ind = vocab_dict[n1]
        a2_ind = vocab_dict[a2]
        n2_ind = vocab_dict[n2]

        p1 = a*vectors[a1_ind]+vectors[n1_ind] 
        t1 = b*to.bilinear_lowrank_batch_np(T,vectors_T[a1_ind],vectors_T[n1_ind])
        p2 = a*vectors[a2_ind]+vectors[n2_ind] 
        t2 = b*to.bilinear_lowrank_batch_np(T,vectors_T[a2_ind],vectors_T[n2_ind])

        sim.append(phrase_df.iloc[i,7])
        score.append(cosine(p1+t1,p2+t2))
    return spearmanr(score,sim)[0],pearsonr(score,sim)[0]

def get_sif_score(vectors, phrase_df, vocab_dict, freqs, A=1e-3): 
    """
    Compute the spearman and pearson correlations between phrases
    according to the sif composition with parameter A.
    See the paper "A simple but tough-to-beat baseline for sentence embeddings"
    by Arora et al. 
    
    Parameters
    ----------
    vectors : ndarray of shape (N,d)
       The word embeddings (one embedding per row)
    phrase_df : pandas DataFrame containing the phrase pairs and human similarities
    vocab_dict : dictionary that maps word to index
    freqs : a dictionary that maps each word to its relative frequency
    A : nonnegative float (optional, defaults to 1e-3)
       The smoothing parameter
    
    Returns
    -------
    (spear, pears) : the two similarity scores computed by the embeddings
    """
    score = []
    sim = []
    sif_embeddings = []
    X = np.zeros((300,2*phrase_df.shape[0]))
    for i in range(phrase_df.shape[0]):
        a1 = phrase_df.iloc[i,3]
        n1 = phrase_df.iloc[i,4]
        a2 = phrase_df.iloc[i,5]
        n2 = phrase_df.iloc[i,6]

        a1_ind = vocab_dict[a1]
        n1_ind = vocab_dict[n1]
        a2_ind = vocab_dict[a2]
        n2_ind = vocab_dict[n2]

        p1 = (A/(A+freqs[a1]))*vectors[a1_ind]+(A/(A+freqs[n1]))*vectors[n1_ind]
        p2 = (A/(A+freqs[a2]))*vectors[a2_ind]+(A/(A+freqs[n2]))*vectors[n2_ind]
        
        X[:,i] = p1
        X[:,phrase_df.shape[0]+i] = p2
        sif_embeddings.append([p1,p2])

        sim.append(phrase_df.iloc[i,7])
    u = la.svd(X)[0][:,0] # get top left singular vector of X

    for pair in sif_embeddings:
        for i in range(2):
            pair[i] = pair[i] - np.dot(pair[i],u)*u

    score=[]
    for pair in sif_embeddings:
        score.append(cosine(pair[0],pair[1]))
    return spearmanr(score,sim)[0],pearsonr(score,sim)[0]

def get_sif_tensor_score(T,vectors, phrase_df, vocab_dict, b, freqs, a=1, A=1e-3, vectors_T=None):
    """
    Compute the spearman and pearson correlations between phrases
    according to the sif composition with tensor component.
    
    Parameters
    ----------
    T : ndarray of shape (d,d,d)
       The composition tensor
    vectors : ndarray of shape (N,d)
       The word embeddings (one embedding per row)
    phrase_df : pandas DataFrame containing the phrase pairs and human similarities
    vocab_dict : dictionary that maps word to index
    b : nonnegative float
       The weighting parameter for the tensor component
    freqs : a dictionary that maps each word to its relative frequency
    a : nonnegative float (optional, defaults to 1)
       The weighting parameter for the sif embedding
    A : nonnegative float (optional, defaults to 1e-3)
       The smoothing parameter
    vectors_T : ndarray of shape (N,d) (optional, defaults to None)
       Gives an optional set of embeddings used specifically in the tensor component,
       If None, the input vectors is used for tensor component as well. 
    
    Returns
    -------
    (spear, pears) : the two similarity scores computed by the embeddings
    """
    score = []
    sim = []
    sif_embeddings = []
    tensor_comp = []
    X = np.zeros((300,2*phrase_df.shape[0]))
    if vectors_T is None:
        vectors_T=vectors
    for i in range(phrase_df.shape[0]):
        a1 = phrase_df.iloc[i,3]
        n1 = phrase_df.iloc[i,4]
        a2 = phrase_df.iloc[i,5]
        n2 = phrase_df.iloc[i,6]

        a1_ind = vocab_dict[a1]
        n1_ind = vocab_dict[n1]
        a2_ind = vocab_dict[a2]
        n2_ind = vocab_dict[n2]

        p1 = a*(A/(A+freqs[a1]))*vectors[a1_ind]+(A/(A+freqs[n1]))*vectors[n1_ind] 
        t1 = (A/(A+freqs[n1]))*(A/(A+freqs[a1]))*b \
            *to.bilinear_lowrank_batch_np(T,vectors_T[a1_ind],vectors_T[n1_ind])
        p2 = a*(A/(A+freqs[a2]))*vectors[a2_ind]+(A/(A+freqs[n2]))*vectors[n2_ind] 
        t2 = (A/(A+freqs[n2]))*(A/(A+freqs[a2]))*b \
            *to.bilinear_lowrank_batch_np(T,vectors_T[a2_ind],vectors_T[n2_ind])

        X[:,i] = p1
        X[:,phrase_df.shape[0]+i] = p2
        sif_embeddings.append([p1,p2])
        tensor_comp.append([t1,t2])

        sim.append(phrase_df.iloc[i,7])
        score.append(cosine(p1,p2))
    u = la.svd(X)[0][:,0] # get top left singular vector of X

    for pair in sif_embeddings:
        for i in range(2):
            pair[i] = pair[i] - np.dot(pair[i],u)*u

    score=[]
    for p,t in zip(sif_embeddings,tensor_comp):
        score.append(cosine(p[0]+t[0],p[1]+t[1]))
        
    return spearmanr(score,sim)[0],pearsonr(score,sim)[0]

def get_best_additive_param(vectors, phrase_df, vocab_dict):
    """
    Range over several parameter values for additive composition,
    return the results for each one.
    """
    b_params = np.linspace(0,2,21)
    b_results = np.zeros((2,len(b_params)))
    for k,b in enumerate(b_params):
        s,p = get_additive_score(vectors,phrase_df,vocab_dict,b)
        b_results[0,k] = s
        b_results[1,k] = p
    return b_results,b_params

def get_best_tensor_param(T,vectors, phrase_df, vocab_dict, a=1, vectors_T=None):
    """
    Range over several parameter values for tensor composition,
    return the results for each one.
    """
    b_params = np.linspace(0,1,11)
    b_results = np.zeros((2,len(b_params)))
    for k,b in enumerate(b_params):
        s,p = get_tensor_score(T,vectors,phrase_df,vocab_dict,b,a=a,vectors_T=vectors_T)
        b_results[0,k] = s
        b_results[1,k] = p
    return b_results,b_params

def get_best_sif_tensor_param(T,vectors, phrase_df, vocab_dict, freqs, a=1, vectors_T=None):
    """
    Range over several parameter values for sif+tensor composition,
    return the results for each one.
    """
    b_params = np.linspace(0,.5,11)
    b_results = np.zeros((2,len(b_params)))
    for k,b in enumerate(b_params):
        s,p = get_sif_tensor_score(T,vectors,phrase_df,vocab_dict,b,freqs,a=a,vectors_T=vectors_T)
        b_results[0,k] = s
        b_results[1,k] = p
    return b_results,b_params

In [22]:
# split the participants randomly into 3 disjoint groups, use two groups as development set
# on development set, select best composition params
# on test set, use the selected composition params to evaluate the composition techniques

participants = list(set(phrases["participant"]))
perm = np.random.permutation(len(participants))
results = np.zeros((3,6,2))

for fold in range(3):
    print("\nFold {}".format(fold))
    # split into dev and test sets
    dev_participants = [participants[i] for i in perm[18*fold:18*(fold+1)]]
    phrases_dev = phrases.loc[phrases.apply(lambda x: x[0] in dev_participants, axis=1)]
    phrases_test = phrases.loc[~phrases.apply(lambda x: x[0] in dev_participants,axis=1)]

    # compute score for plain additive composition (no weighting)
    add_scores = get_additive_score(vectors, phrases_test,vocab_dict,1)
    results[fold,0,:] = np.array(add_scores)

    # compute score for weighted additive composition, selecting parameter on dev set
    a_results,a_params = get_best_additive_param(vectors, phrases_dev, vocab_dict)
    a_spear,a_pears = [a_params[j] for j in a_results.argmax(axis=1)]
    res_spear,res_pears = get_additive_score(vectors,phrases_test,vocab_dict,a_spear)
    if a_spear != a_pears:
        res_pears = get_additive_score(vectors,phrases_test,vocab_dict,a_pears)[1]
    results[fold,1,:] = np.array([res_spear,res_pears])
    print("best weighted1 params: {}/{}".format(round(a_spear,2),round(a_pears,2)))

    # compute score for weighted additive composition, selecting parameter on test set
    a_results,a_params = get_best_additive_param(vectors, phrases_test, vocab_dict)
    a_spear,a_pears = [a_params[j] for j in a_results.argmax(axis=1)]
    res_spear,res_pears = get_additive_score(vectors,phrases_test,vocab_dict,a_spear)
    if a_spear != a_pears:
        res_pears = get_additive_score(vectors,phrases_test,vocab_dict,a_pears)[1]
    results[fold,2,:] = np.array([res_spear,res_pears])
    print("best weighted2 params: {}/{}".format(round(a_spear,2),round(a_pears,2)))

    # compute score for tensor composition, selecting parameter on dev set
    # use .6 for default weighting parameter a since this is a good weighted additive param
    b_results, b_params = get_best_tensor_param(T,vectors,phrases_dev,vocab_dict,a=.6)
    b_spear,b_pears = [b_params[j] for j in b_results.argmax(axis=1)]
    res_spear, res_pears = get_tensor_score(T,vectors,phrases_test,vocab_dict,b_spear,a=.6)
    if b_pears != b_spear:
        res_pears = get_tensor_score(T,vectors,phrases_test,vocab_dict,b_pears,a=.6)[1]
    results[fold,3,:] = np.array([res_spear,res_pears])
    print("best tensor params: {}/{}".format(round(b_spear,2),round(b_pears,2)))

    # compute score for sif embedding
    sif_scores = get_sif_score(vectors, phrases_test,vocab_dict,freqs)
    results[fold,4,:] = np.array(sif_scores)

    # compute score for sif+tensor embeddings, selecting parameter on dev set
    c_results, c_params = get_best_sif_tensor_param(T,vectors, phrases_dev, vocab_dict, freqs)
    c_spear,c_pears = [c_params[j] for j in c_results.argmax(axis=1)]
    res_spear, res_pears = get_sif_tensor_score(T,vectors,phrases_test,vocab_dict,c_spear,freqs)
    if c_spear != c_pears:
        res_pears = get_sif_tensor_score(T,vectors,phrases_test,vocab_dict,c_pears,freqs)[1]
    results[fold,5,:] = np.array([res_spear,res_pears])
    print("best sif tensor params: {}/{}".format(round(c_spear,2),round(c_pears,2)))

print(results.mean(axis=0))


Fold 0
best weighted1 params: 0.7/0.5
best weighted2 params: 0.7/0.6
best tensor params: 0.3/0.3
best sif tensor params: 0.05/0.25

Fold 1
best weighted1 params: 0.7/0.7
best weighted2 params: 0.7/0.5
best tensor params: 0.3/0.3
best sif tensor params: 0.05/0.1

Fold 2
best weighted1 params: 0.7/0.5
best weighted2 params: 0.7/0.6
best tensor params: 0.4/0.3
best sif tensor params: 0.05/0.2
[[0.44687226 0.43851155]
 [0.45194553 0.45118495]
 [0.45194553 0.45367238]
 [0.46171781 0.47070609]
 [0.48296984 0.47827045]
 [0.48432109 0.47940004]]
