# Import

In [1]:
import numpy as np
import random
from scipy.stats import spearmanr, pearsonr
from scipy.optimize import fsolve

#import the custom functions
import MFunctions as mf

In [2]:
#Load verb_symilarity from which we extract the pairs
simdict = mf.verb_similarity("SimVerb-3500.txt")

# Define some functions for the precision computation

## Here the precision is computed by averaging

In [None]:
# We decided to compute the precision by averaging over the precision in each category, which scales well
# for cases where the sample sizes are very different for syn/non/ant (we have ~300 syn ~100 ant ~2000 none)

In [3]:
def synvsant1(file_name):
    #Using observable vectors with cosine distance

    #Compute the mean and the standard deviation for the SYNONYMS
    syn1 = mf.averaged_product(simdict,'SYNONYMS',file_name,"std_dev")

    #Compute the mean and the standard deviation for the ANTONYMS
    ant1 = mf.averaged_product(simdict,'ANTONYMS',file_name,"std_dev")
    
    #Use the means and deviations to set the divide between synonyms and antonyms
    divide1 = ant1[0] + ant1[1]/syn1[1] * (syn1[0]-ant1[0])/2
    
    # Use the earlier defined function to compute the list of scalar products of the verbs and compare with divide
    prodlist1 = mf.averaged_product_list(simdict,'SYNONYMS',file_name,"std_dev")

    prodlist5 = mf.averaged_product_list(simdict,'ANTONYMS',file_name,"std_dev")
    
    # Compare and count the number of synonyms we get right
    syncomparison1 = [a > divide1 for a in list(prodlist1.values())]

    # Compare and count the number of antonyms we get right
    antcomparison1 = [a < divide1 for a in list(prodlist5.values())]

    #Compute the average precision
    precision1 = 1/2*(syncomparison1.count(True)/len(syncomparison1) + antcomparison1.count(True)/len(antcomparison1))
    
    return precision1

In [4]:
def synvsnonevsant1(file_name):
    #Using observable vectors with cosine distance

    #Compute the mean and the standard deviation for the SYNONYMS
    syn1 = mf.averaged_product(simdict,'SYNONYMS',file_name,"std_dev")
    
    #Compute the mean and the standard deviation for the NONE
    non1 = mf.averaged_product(simdict,'NONE',file_name,"std_dev")

    #Compute the mean and the standard deviation for the ANTONYMS
    ant1 = mf.averaged_product(simdict,'ANTONYMS',file_name,"std_dev")

    #Use the means and deviations to set the divide between synonyms and none
    divide1 = non1[0] + non1[1]/syn1[1] * (syn1[0]-non1[0])/2

    #Use the means and deviations to set the divide between none and antonyms
    divide5 = ant1[0] + ant1[1]/non1[1] * (non1[0]-ant1[0])/2
    
    # Use the earlier defined function to compute the list of scalar products of the verbs and compare with divide
    prodlist1 = mf.averaged_product_list(simdict,'SYNONYMS',file_name,"std_dev")

    prodlist5 = mf.averaged_product_list(simdict,'ANTONYMS',file_name,"std_dev")

    prodlist9 = mf.averaged_product_list(simdict,'NONE',file_name,"std_dev")

    # Compare and count the number of synonyms we get right
    syncomparison1 = [a > divide1 for a in list(prodlist1.values())]

    # Compare and count the number of none we get right
    noncomparison1 = [a > divide5 and a < divide1 for a in list(prodlist9.values())]

    # Compare and count the number of antonyms we get right
    antcomparison1 = [a < divide5 for a in list(prodlist5.values())]

    #Compute the average precision
    precision1 = 1/3*(syncomparison1.count(True)/len(syncomparison1) + antcomparison1.count(True)/len(antcomparison1) + noncomparison1.count(True)/len(noncomparison1))

    return precision1

In [5]:
def synvsant2(file_name,myseed1,myseed2,samplesize1,samplesize2):
    #Since here we use just a subset of the pairs of synonyms and antonyms to determine the means and deviation, we have to do this by hand

    #Compute the products for all the observable deviation vectors associated to the SYNONYMS pairs
    all1 = mf.averaged_product_list(simdict,'SYNONYMS',file_name,"std_dev")

    #The subset from which we copute the means is randomly selected, we seed the random generator for repeatable results. Chosen seed 7
    random.seed(myseed1)

    mysample = random.sample(list(all1),samplesize1)

    #Find complementary sample where we will predict and test
    mycomplsample = set(all1.keys()) - set(mysample)

    #Compute the means and standard deviations
    syn1 = [np.mean(np.array(list(map(all1.get,mysample)))),np.std(np.array(list(map(all1.get,mysample))))]

    #Now the same thing for the antonyms

    #Compute the products for all the observable deviation vectors associated to the SYNONYMS pairs
    all5 = mf.averaged_product_list(simdict,'ANTONYMS',file_name,"std_dev")

    #The subset from which we copute the means is randomly selected, we seed the random generator for repeatable results. Chosen seed 13
    #random.seed(117) #Interesting values come out for this seed
    random.seed(myseed2)

    mysample2 = random.sample(list(all5),samplesize2)

    #Find complementary sample where we will predict and test
    mycomplsample2 = set(all5.keys()) - set(mysample2)

    #Compute the means and standard deviations
    ant1 = [np.mean(np.array(list(map(all5.get,mysample2)))),np.std(np.array(list(map(all5.get,mysample2))))]

    #Use the means and deviations to set the divide between synonyms and antonyms
    divide1 = ant1[0] + ant1[1]/syn1[1] * (syn1[0]-ant1[0])/2
    
    # Next we test on the complementary sample
    # Compare and count the number of synonyms we get right
    syncomparison1 = [a > divide1 for a in list(map(all1.get,mycomplsample))]

    # Compare and count the number of antonyms we get right
    antcomparison1 = [a < divide1 for a in list(map(all5.get,mycomplsample2))]

    #Compute the average precision
    precision1 = 1/2*(syncomparison1.count(True)/len(syncomparison1) + antcomparison1.count(True)/len(antcomparison1))

    return precision1

In [6]:
def synvsnonevsant2(file_name,myseed1,myseed2,myseed3,samplesize1,samplesize2,samplesize3):
    
    #Since here we use just a subset of the pairs of synonyms and antonyms to determine the means and deviation, we have to do this by hand

    #Compute the products for all the observable deviation vectors associated to the SYNONYMS pairs
    all1 = mf.averaged_product_list(simdict,'SYNONYMS',file_name,"std_dev")

    #The subset from which we copute the means is randomly selected, we seed the random generator for repeatable results. Chosen seed 7
    random.seed(myseed1)

    mysample = random.sample(list(all1),samplesize1)

    #Find complementary sample where we will predict and test
    mycomplsample = set(all1.keys()) - set(mysample)

    #Compute the means and standard deviations
    syn1 = [np.mean(np.array(list(map(all1.get,mysample)))),np.std(np.array(list(map(all1.get,mysample))))]

    #Now the same thing for the antonyms

    #Compute the products for all the observable deviation vectors associated to the SYNONYMS pairs
    all5 = mf.averaged_product_list(simdict,'ANTONYMS',file_name,"std_dev")
    
    #The subset from which we copute the means is randomly selected, we seed the random generator for repeatable results. Chosen seed 13
    random.seed(myseed2)

    mysample2 = random.sample(list(all5),samplesize2)

    #Find complementary sample where we will predict and test
    mycomplsample2 = set(all5.keys()) - set(mysample2)

    #Compute the means and standard deviations
    ant1 = [np.mean(np.array(list(map(all5.get,mysample2)))),np.std(np.array(list(map(all5.get,mysample2))))]
    
    #Finally the same for None

    #Compute the products for all the observable deviation vectors associated to the SYNONYMS pairs
    all9 = mf.averaged_product_list(simdict,'NONE',file_name,"std_dev")
    
    #The subset from which we copute the means is randomly selected, we seed the random generator for repeatable results. Chosen seed 13
    random.seed(myseed3)

    mysample3 = random.sample(list(all9),samplesize3)

    #Find complementary sample where we will predict and test
    mycomplsample3 = set(all9.keys()) - set(mysample3)

    #Compute the means and standard deviations
    non1 = [np.mean(np.array(list(map(all9.get,mysample3)))),np.std(np.array(list(map(all9.get,mysample3))))]
    
    #Use the means and deviations to set the divide between synonyms and none
    divide1 = non1[0] + non1[1]/syn1[1] * (syn1[0]-non1[0])/2

    #Use the means and deviations to set the divide between none and antonyms
    divide5 = ant1[0] + ant1[1]/non1[1] * (non1[0]-ant1[0])/2
    
    # Next we test on the complementary sample
    # Compare and count the number of synonyms we get right
    syncomparison1 = [a > divide1 for a in list(map(all1.get,mycomplsample))]


    # Compare and count the number of none we get right
    noncomparison1 = [a > divide5 and a < divide1 for a in list(map(all9.get,mycomplsample3))]
    
    # Compare and count the number of antonyms we get right
    antcomparison1 = [a < divide5 for a in list(map(all5.get,mycomplsample2))]
    
    #Compute the average precision
    precision1 = 1/3*(syncomparison1.count(True)/len(syncomparison1) + antcomparison1.count(True)/len(antcomparison1) + noncomparison1.count(True)/len(noncomparison1))
    
    return precision1

In [27]:
def hypohyper1(file_name):
    #Using observable vectors with cosine distance

    #Compute the mean and the standard deviation for the SYNONYMS
    syn1 = mf.averaged_product(simdict,'COHYPONYMS',file_name,"std_dev")

    #Compute the mean and the standard deviation for the ANTONYMS
    ant1 = mf.averaged_product(simdict,'HYPER/HYPONYMS',file_name,"std_dev")
    
    #Use the means and deviations to set the divide between synonyms and antonyms
    divide1 = ant1[0] + ant1[1]/syn1[1] * (syn1[0]-ant1[0])/2
    
    # Use the earlier defined function to compute the list of scalar products of the verbs and compare with divide
    prodlist1 = mf.averaged_product_list(simdict,'COHYPONYMS',file_name,"std_dev")

    prodlist5 = mf.averaged_product_list(simdict,'HYPER/HYPONYMS',file_name,"std_dev")
    
    # Compare and count the number of synonyms we get right
    syncomparison1 = [a > divide1 for a in list(prodlist1.values())]

    # Compare and count the number of antonyms we get right
    antcomparison1 = [a < divide1 for a in list(prodlist5.values())]

    #Compute the average precision
    precision1 = 1/2*(syncomparison1.count(True)/len(syncomparison1) + antcomparison1.count(True)/len(antcomparison1))
    
    return precision1

In [25]:
def hypohyper2(file_name,myseed1,myseed2,samplesize1,samplesize2):
    #Since here we use just a subset of the pairs of synonyms and antonyms to determine the means and deviation, we have to do this by hand

    #Compute the products for all the observable deviation vectors associated to the SYNONYMS pairs
    all1 = mf.averaged_product_list(simdict,'COHYPONYMS',file_name,"std_dev")

    #The subset from which we copute the means is randomly selected, we seed the random generator for repeatable results. Chosen seed 7
    random.seed(myseed1)

    mysample = random.sample(list(all1),samplesize1)

    #Find complementary sample where we will predict and test
    mycomplsample = set(all1.keys()) - set(mysample)

    #Compute the means and standard deviations
    syn1 = [np.mean(np.array(list(map(all1.get,mysample)))),np.std(np.array(list(map(all1.get,mysample))))]

    #Now the same thing for the antonyms

    #Compute the products for all the observable deviation vectors associated to the SYNONYMS pairs
    all5 = mf.averaged_product_list(simdict,'HYPER/HYPONYMS',file_name,"std_dev")

    #The subset from which we copute the means is randomly selected, we seed the random generator for repeatable results. Chosen seed 13
    #random.seed(117) #Interesting values come out for this seed
    random.seed(myseed2)

    mysample2 = random.sample(list(all5),samplesize2)

    #Find complementary sample where we will predict and test
    mycomplsample2 = set(all5.keys()) - set(mysample2)

    #Compute the means and standard deviations
    ant1 = [np.mean(np.array(list(map(all5.get,mysample2)))),np.std(np.array(list(map(all5.get,mysample2))))]

    #Use the means and deviations to set the divide between synonyms and antonyms
    divide1 = ant1[0] + ant1[1]/syn1[1] * (syn1[0]-ant1[0])/2
    
    # Next we test on the complementary sample
    # Compare and count the number of synonyms we get right
    syncomparison1 = [a > divide1 for a in list(map(all1.get,mycomplsample))]

    # Compare and count the number of antonyms we get right
    antcomparison1 = [a < divide1 for a in list(map(all5.get,mycomplsample2))]

    #Compute the average precision
    precision1 = 1/2*(syncomparison1.count(True)/len(syncomparison1) + antcomparison1.count(True)/len(antcomparison1))

    return precision1

## Choosing the seeds for the various samples

In [7]:
#Experiment 2 requires averaging over a number of different random samples. To generate these in a way 
#which is random but repeatable we need a list of seeds for the random generator. We decided to pick 20 samples

#seed the random generator before genearting the integer seeds for the random samples
random.seed(7)

seed_list = [(random.randint(0,5000),random.randint(0,5000)) for a in range(20)]

#Make a second seed_list for when we need 3 seeds instead of 2

seed_list2 = [(random.randint(0,5000),random.randint(0,5000),random.randint(0,5000)) for a in range(20)]

## 13 obs defs

In [8]:
# Make observables object
set1 = mf.observables(["Linear.txt","Quadratic.txt"])
# Load matrices and make vector dictionaries
objsubset1 = mf.vector_dictionary("matrices_1160_arg_obj_context_subj.txt",set1)
subobjset1 = mf.vector_dictionary("matrices_1160_arg_subj_context_obj.txt",set1,[0])
obj08sub02set1 = mf.vector_dictionary([["matrices_1160_arg_obj_context_subj.txt","matrices_1160_arg_subj_context_obj.txt"],[0.8,0.2]],set1,[1])
obj09sub01set1 = mf.vector_dictionary([["matrices_1160_arg_obj_context_subj.txt","matrices_1160_arg_subj_context_obj.txt"],[0.9,0.1]],set1,[1])

file_list1 = [objsubset1,obj09sub01set1,obj08sub02set1,subobjset1]

## 15 obs defs

In [9]:
# Make observables object
set2 = mf.observables(["Cubic1.txt","Quartic1.txt"])
# Load matrices and make vector dictionaries
objsubset2 = mf.vector_dictionary("matrices_1160_arg_obj_context_subj.txt",set2)
subobjset2 = mf.vector_dictionary("matrices_1160_arg_subj_context_obj.txt",set2,[0])
obj08sub02set2 = mf.vector_dictionary([["matrices_1160_arg_obj_context_subj.txt","matrices_1160_arg_subj_context_obj.txt"],[0.8,0.2]],set2,[1])
obj09sub01set2 = mf.vector_dictionary([["matrices_1160_arg_obj_context_subj.txt","matrices_1160_arg_subj_context_obj.txt"],[0.9,0.1]],set2,[1])

file_list2 = [objsubset2,obj09sub01set2,obj08sub02set2,subobjset2]

## 28 obs defs

In [10]:
# Make observables object
set3 = mf.observables(["Linear.txt","Quadratic.txt","Cubic1.txt","Quartic1.txt","Additional1.txt"])
# Load matrices and make vector dictionaries
objsubset3 = mf.vector_dictionary("matrices_1160_arg_obj_context_subj.txt",set3)
subobjset3 = mf.vector_dictionary("matrices_1160_arg_subj_context_obj.txt",set3,[0])
obj08sub02set3 = mf.vector_dictionary([["matrices_1160_arg_obj_context_subj.txt","matrices_1160_arg_subj_context_obj.txt"],[0.8,0.2]],set3,[1])
obj09sub01set3 = mf.vector_dictionary([["matrices_1160_arg_obj_context_subj.txt","matrices_1160_arg_subj_context_obj.txt"],[0.9,0.1]],set3,[1])

file_list3 = [objsubset3,obj09sub01set3,obj08sub02set3,subobjset3]

## Experiment 1 syn vs ant

In [20]:
# Experiment 1 synonym vs antonym
[synvsant1(a) for a in file_list1]

[0.5602808691043986,
 0.5545398339515986,
 0.5565712771595125,
 0.5234499205087441]

In [22]:
# Experiment 1 synonym vs antonym
[synvsant1(a) for a in file_list2]

[0.5787405052110934,
 0.5684949655537891,
 0.5754725313548843,
 0.5537007595831125]

In [23]:
# Experiment 1 synonym vs antonym
[synvsant1(a) for a in file_list3]

[0.5562179826885709,
 0.5607224871930754,
 0.5779014308426074,
 0.5635930047694753]

In [57]:
# Experiment 1 synonym vs antonym
[[synvsant1(a) for a in localf] for localf in [file_list1,file_list2,file_list3]]

[[0.5602808691043986,
  0.5545398339515986,
  0.5565712771595125,
  0.5234499205087441],
 [0.5787405052110934,
  0.5684949655537891,
  0.5754725313548843,
  0.5537007595831125],
 [0.5562179826885709,
  0.5607224871930754,
  0.5779014308426074,
  0.5635930047694753]]

## Experiment 1 hyper/hypo vs cohypo

In [28]:
# Experiment 1 synonym vs antonym
[hypohyper1(a) for a in file_list1]

[0.5248026315789474,
 0.5194078947368421,
 0.5286842105263159,
 0.5505921052631579]

In [29]:
# Experiment 1 synonym vs antonym
[hypohyper1(a) for a in file_list2]

[0.5328947368421053,
 0.5220065789473685,
 0.5242763157894736,
 0.5445065789473684]

In [30]:
# Experiment 1 synonym vs antonym
[hypohyper1(a) for a in file_list3]

[0.5319078947368421,
 0.5445394736842105,
 0.5438157894736843,
 0.5655921052631578]

In [58]:
# Experiment 1 synonym vs antonym
[[hypohyper1(a) for a in localf] for localf in [file_list1,file_list2,file_list3]]

[[0.5248026315789474,
  0.5194078947368421,
  0.5286842105263159,
  0.5505921052631579],
 [0.5328947368421053,
  0.5220065789473685,
  0.5242763157894736,
  0.5445065789473684],
 [0.5319078947368421,
  0.5445394736842105,
  0.5438157894736843,
  0.5655921052631578]]

## Experiment 1 syn vs ant vs none

In [31]:
# Experiment 1 synonym vs antonym
[synvsnonevsant1(a) for a in file_list1]

[0.3597136103529964,
 0.37129490582176006,
 0.3766332730527103,
 0.34676898871272266]

In [32]:
# Experiment 1 synonym vs antonym
[synvsnonevsant1(a) for a in file_list2]

[0.3824331906173338,
 0.3728928473173997,
 0.3738713700350529,
 0.37076496411534776]

In [33]:
# Experiment 1 synonym vs antonym
[synvsnonevsant1(a) for a in file_list3]

[0.3661160387503098,
 0.379667109846138,
 0.38460790123194216,
 0.37446110847645364]

In [59]:
# Experiment 1 synonym vs antonym
[[synvsnonevsant1(a) for a in localf] for localf in [file_list1,file_list2,file_list3]]

[[0.3597136103529964,
  0.37129490582176006,
  0.3766332730527103,
  0.34676898871272266],
 [0.3824331906173338,
  0.3728928473173997,
  0.3738713700350529,
  0.37076496411534776],
 [0.3661160387503098,
  0.379667109846138,
  0.38460790123194216,
  0.37446110847645364]]

## Experiment 2 syn vs ant

In [41]:
# Apply the function to every file with the given seed and average. The averaging is done using pure functions
# (lambda) so to compute all the precisions only once

# Synonyms vs Antonyms, sample size roughly 60%

[(lambda x1 : np.array([np.mean(x1,axis = 0),np.std(x1,axis=0)]).T)([[synvsant2(myfile,*a,200,70) for myfile in localf] for a in seed_list ]) for localf in [file_list1,file_list2,file_list3]]

[array([[0.55718477, 0.0286868 ],
        [0.56323631, 0.03947724],
        [0.56193626, 0.03038054],
        [0.53352508, 0.02666175]]),
 array([[0.58256443, 0.02190353],
        [0.5679878 , 0.01723463],
        [0.56153935, 0.04105102],
        [0.56240796, 0.03135284]]),
 array([[0.55715601, 0.03037697],
        [0.57337782, 0.03124492],
        [0.58143695, 0.02554353],
        [0.57532789, 0.03366879]])]

In [40]:
# Synonyms vs Antonyms, sample size roughly 10%

[(lambda x1 : np.array([np.mean(x1,axis = 0),np.std(x1,axis=0)]).T)([[synvsant2(myfile,*a,30,12) for myfile in localf] for a in seed_list ]) for localf in [file_list1,file_list2,file_list3]]

[array([[0.55313735, 0.01489417],
        [0.56247255, 0.0131478 ],
        [0.56048803, 0.01326686],
        [0.53536726, 0.01787474]]),
 array([[0.58037989, 0.01292189],
        [0.57164855, 0.01582666],
        [0.56485782, 0.01662805],
        [0.5471289 , 0.00887981]]),
 array([[0.55912659, 0.01588535],
        [0.57385815, 0.01515517],
        [0.57766524, 0.01564467],
        [0.55278601, 0.02270612]])]

In [42]:
# Synonyms vs Antonyms, sample size roughly 5%

[(lambda x1 : np.array([np.mean(x1,axis = 0),np.std(x1,axis=0)]).T)([[synvsant2(myfile,*a,15,6) for myfile in localf] for a in seed_list ]) for localf in [file_list1,file_list2,file_list3]]

[array([[0.5546809 , 0.01317812],
        [0.56127639, 0.01222263],
        [0.56027491, 0.0118855 ],
        [0.54448208, 0.02198171]]),
 array([[0.5769784 , 0.01177556],
        [0.57148012, 0.0158921 ],
        [0.56489445, 0.02028405],
        [0.55553019, 0.01720257]]),
 array([[0.56086892, 0.01719457],
        [0.57201522, 0.01965759],
        [0.57795533, 0.01903933],
        [0.56073638, 0.02140626]])]

In [43]:
# Synonyms vs Antonyms, sample size 5 pairs

[(lambda x1 : np.array([np.mean(x1,axis = 0),np.std(x1,axis=0)]).T)([[synvsant2(myfile,*a,5,5) for myfile in localf] for a in seed_list ]) for localf in [file_list1,file_list2,file_list3]]

[array([[0.5577266 , 0.01389964],
        [0.56288629, 0.00989336],
        [0.56156131, 0.0092551 ],
        [0.55015436, 0.02242509]]),
 array([[0.57596063, 0.02098345],
        [0.57226697, 0.02336295],
        [0.56572667, 0.02486098],
        [0.55169404, 0.02936456]]),
 array([[0.56332978, 0.01718888],
        [0.57100389, 0.01619591],
        [0.57198489, 0.01973936],
        [0.55780731, 0.02383717]])]

## Experiment 2 hyper/hypo vs cohypo

In [47]:
# Apply the function to every file with the given seed and average. The averaging is done using pure functions
# (lambda) so to compute all the precisions only once

# Synonyms vs Antonyms, sample size roughly 60%

[(lambda x1 : np.array([np.mean(x1,axis = 0),np.std(x1,axis=0)]).T)([[hypohyper2(myfile,*a,114,480) for myfile in localf] for a in seed_list ]) for localf in [file_list1,file_list2,file_list3]]

[array([[0.50963816, 0.02529933],
        [0.5069778 , 0.03434824],
        [0.51860197, 0.02783025],
        [0.54222451, 0.0201456 ]]),
 array([[0.52762336, 0.01957085],
        [0.51708882, 0.01936838],
        [0.52254934, 0.02614483],
        [0.53748355, 0.02324084]]),
 array([[0.52259457, 0.02712426],
        [0.52703947, 0.0302006 ],
        [0.53069901, 0.02741803],
        [0.56026316, 0.02164468]])]

In [49]:
# Synonyms vs Antonyms, sample size roughly 10%

[(lambda x1 : np.array([np.mean(x1,axis = 0),np.std(x1,axis=0)]).T)([[hypohyper2(myfile,*a,19,80) for myfile in localf] for a in seed_list ]) for localf in [file_list1,file_list2,file_list3]]

[array([[0.52357273, 0.01338966],
        [0.52379751, 0.01891544],
        [0.52577668, 0.01569851],
        [0.55484284, 0.01587564]]),
 array([[0.52902778, 0.00848917],
        [0.52448099, 0.00567904],
        [0.53323282, 0.00684499],
        [0.54699379, 0.00603373]]),
 array([[0.53230629, 0.00869123],
        [0.53177083, 0.01318194],
        [0.53851425, 0.0116434 ],
        [0.56395833, 0.01211284]])]

In [50]:
# Synonyms vs Antonyms, sample size roughly 5%

[(lambda x1 : np.array([np.mean(x1,axis = 0),np.std(x1,axis=0)]).T)([[hypohyper2(myfile,*a,9,40) for myfile in localf] for a in seed_list ]) for localf in [file_list1,file_list2,file_list3]]

[array([[0.52189608, 0.01030549],
        [0.52167309, 0.01442212],
        [0.52569788, 0.01094266],
        [0.55201821, 0.00989789]]),
 array([[0.52883051, 0.01047954],
        [0.52650207, 0.00691944],
        [0.53402806, 0.00578395],
        [0.54659167, 0.00435978]]),
 array([[0.53271245, 0.00648235],
        [0.53048997, 0.01184247],
        [0.5345342 , 0.00833232],
        [0.56251872, 0.01063493]])]

In [51]:
# Synonyms vs Antonyms, sample size 5 pairs

[(lambda x1 : np.array([np.mean(x1,axis = 0),np.std(x1,axis=0)]).T)([[hypohyper2(myfile,*a,5,5) for myfile in localf] for a in seed_list ]) for localf in [file_list1,file_list2,file_list3]]

[array([[0.5168409 , 0.01241783],
        [0.51499235, 0.01568301],
        [0.51913989, 0.01263556],
        [0.54879143, 0.01070788]]),
 array([[0.52899286, 0.01248835],
        [0.52774095, 0.0097494 ],
        [0.53356026, 0.00652065],
        [0.54519208, 0.01666236]]),
 array([[0.52602329, 0.00781675],
        [0.52432857, 0.01223066],
        [0.53190719, 0.01100075],
        [0.55731939, 0.00973795]])]

## Experiment 2 syn s none vs ant

In [53]:
# Synonyms vs Antonyms, sample size 60%

[(lambda x1 : np.array([np.mean(x1,axis = 0),np.std(x1,axis=0)]).T)([[synvsnonevsant2(myfile,*a,200,70,1360) for myfile in localf] for a in seed_list2 ]) for localf in [file_list1,file_list2,file_list3]]

[array([[0.36039578, 0.02427392],
        [0.3705463 , 0.02603444],
        [0.37172346, 0.02522594],
        [0.35021321, 0.02605001]]),
 array([[0.3847989 , 0.02716517],
        [0.38569239, 0.02671016],
        [0.38778735, 0.02861508],
        [0.37448965, 0.0212687 ]]),
 array([[0.3692036 , 0.02766112],
        [0.37458247, 0.02750455],
        [0.3810474 , 0.02523456],
        [0.37123831, 0.0240733 ]])]

In [54]:
# Synonyms vs Antonyms, sample size 10%

[(lambda x1 : np.array([np.mean(x1,axis = 0),np.std(x1,axis=0)]).T)([[synvsnonevsant2(myfile,*a,30,12,225) for myfile in localf] for a in seed_list2 ]) for localf in [file_list1,file_list2,file_list3]]

[array([[0.36658493, 0.01510189],
        [0.37007301, 0.01209757],
        [0.3705702 , 0.01027956],
        [0.35227173, 0.00974955]]),
 array([[0.38692058, 0.01271578],
        [0.38058816, 0.01765156],
        [0.38414715, 0.01606983],
        [0.36916266, 0.00980572]]),
 array([[0.36776551, 0.01318403],
        [0.37679871, 0.01156412],
        [0.38452368, 0.01081359],
        [0.36400991, 0.0150756 ]])]

In [55]:
# Synonyms vs Antonyms, sample size 5%

[(lambda x1 : np.array([np.mean(x1,axis = 0),np.std(x1,axis=0)]).T)([[synvsnonevsant2(myfile,*a,15,6,112) for myfile in localf] for a in seed_list2 ]) for localf in [file_list1,file_list2,file_list3]]

[array([[0.38210355, 0.02758579],
        [0.38353394, 0.02454306],
        [0.38295199, 0.01871915],
        [0.36219333, 0.02644928]]),
 array([[0.38847314, 0.01290714],
        [0.38332315, 0.02182457],
        [0.3821228 , 0.01898871],
        [0.3709394 , 0.00898255]]),
 array([[0.38031931, 0.02803658],
        [0.38773164, 0.02059527],
        [0.39052781, 0.0174915 ],
        [0.36314847, 0.02351678]])]

In [56]:
# Synonyms vs Antonyms, sample size 5 pairs

[(lambda x1 : np.array([np.mean(x1,axis = 0),np.std(x1,axis=0)]).T)([[synvsnonevsant2(myfile,*a,5,5,5) for myfile in localf] for a in seed_list2 ]) for localf in [file_list1,file_list2,file_list3]]

[array([[0.37902192, 0.03132735],
        [0.38369504, 0.03117066],
        [0.38337985, 0.03059548],
        [0.36826069, 0.03048114]]),
 array([[0.39106474, 0.02427315],
        [0.3903828 , 0.03170344],
        [0.38823387, 0.03175568],
        [0.39318451, 0.03814151]]),
 array([[0.38331792, 0.03750307],
        [0.39103096, 0.0460884 ],
        [0.39138439, 0.04303463],
        [0.36960936, 0.03221026]])]

## Precision through sorting over same sample sizes

### Syn vs Ant

In [124]:
# we want to take all synonyms and antonyms, sort them, set the divide to the number of antonyms n and check
# how many of the first n are actually antonyms
def precision_by_sorting_synant(file_name):
    
    ants = mf.averaged_product_list(simdict,'ANTONYMS',file_name,"std_dev")
    syns = {}
    test2 = mf.averaged_product_list(simdict,'SYNONYMS',file_name,"std_dev")
    for a in random.sample(list(test2.keys()),len(ants)):
        syns.update({a : test2[a]})
    
    #allverbs = syns.update(ants)
    allverbs = syns | ants
    
    # Take the list of values from this dictionary and sort according to length
    alllengths = allverbs.values()
    # Sort by length
    alllengths = sorted(alllengths)
    
    #Check how many of the first n (length of the antonyms) are actually antonyms and how many of the following
    # are none
    allantvals = ants.values()
    allsynvals = syns.values()
    #print([set(allantvals).intersection(set(allnonvals)),set(allantvals).intersection(set(allsynvals)),set(allsynvals).intersection(set(allnonvals))])
    mycounts = []
    
    for a in alllengths[:len(allantvals)]:
        mycounts.append(a in allantvals)
    
    local = mycounts.count(True)
    #print(local/len(allantvals))
        
    for a in alllengths[-len(allsynvals):]:
        mycounts.append(a in allsynvals)
    
    local = mycounts.count(True)-local
    #print(local/len(allsynvals))
    
    # Count the trues
    mycounts = mycounts.count(True)
    
    # Compute precision, and recycle variables which I do not need anymore
    
    allantvals = mycounts/(len(allsynvals) + len(allantvals))
    
    return allantvals

In [125]:
def precision_by_sorting_synant_average(file_name,iterations,myseed):
    
    random.seed(myseed)
    
    mylist = []
    for a in range(iterations):
        mylist.append(precision_by_sorting_synant(file_name))
    
    return [np.mean(np.array(mylist)),np.std(np.array(mylist))]
        

In [126]:
precision_by_sorting_synant_average(objsubset1,20,33)

[0.5554054054054053, 0.01762522301487733]

In [127]:
[precision_by_sorting_synant_average(a,20,33) for a in file_list1]

[[0.5554054054054053, 0.01762522301487733],
 [0.572072072072072, 0.016241221961549496],
 [0.5657657657657656, 0.014693249036306387],
 [0.549099099099099, 0.019421676700279365]]

In [128]:
[precision_by_sorting_synant_average(a,20,33) for a in file_list2]

[[0.5734234234234235, 0.023540076811628684],
 [0.572072072072072, 0.016974273586861967],
 [0.5819819819819819, 0.01718809371922426],
 [0.568018018018018, 0.017899384221896866]]

In [129]:
[precision_by_sorting_synant_average(a,20,33) for a in file_list3]

[[0.5554054054054054, 0.01618490470560511],
 [0.5680180180180182, 0.020833028972697624],
 [0.5774774774774775, 0.02130016291448088],
 [0.5698198198198198, 0.021295399375806875]]

### Hyper/hypo vs cohypo

In [144]:
# we want to take all synonyms and antonyms, sort them, set the divide to the number of antonyms n and check
# how many of the first n are actually antonyms
def precision_by_sorting_hyper(file_name):
    
    ants = mf.averaged_product_list(simdict,'COHYPONYMS',file_name,"std_dev")
    syns = {}
    test2 = mf.averaged_product_list(simdict,'HYPER/HYPONYMS',file_name,"std_dev")
    for a in random.sample(list(test2.keys()),len(ants)):
        syns.update({a : test2[a]})
        
    (ants,syns) = (syns,ants)
    
    #allverbs = syns.update(ants)
    allverbs = syns | ants
    
    # Take the list of values from this dictionary and sort according to length
    alllengths = allverbs.values()
    # Sort by length
    alllengths = sorted(alllengths)
    
    #Check how many of the first n (length of the antonyms) are actually antonyms and how many of the following
    # are none
    allantvals = ants.values()
    allsynvals = syns.values()
    #print([set(allantvals).intersection(set(allnonvals)),set(allantvals).intersection(set(allsynvals)),set(allsynvals).intersection(set(allnonvals))])
    mycounts = []
    
    for a in alllengths[:len(allantvals)]:
        mycounts.append(a in allantvals)
    
    local = mycounts.count(True)
    #print(local/len(allantvals))
        
    for a in alllengths[-len(allsynvals):]:
        mycounts.append(a in allsynvals)
    
    local = mycounts.count(True)-local
    #print(local/len(allsynvals))
    
    # Count the trues
    mycounts = mycounts.count(True)
    
    # Compute precision, and recycle variables which I do not need anymore
    
    allantvals = mycounts/(len(allsynvals) + len(allantvals))
    
    return allantvals

In [145]:
def precision_by_sorting_hyper_average(file_name,iterations,myseed):
    
    random.seed(myseed)
    
    mylist = []
    for a in range(iterations):
        mylist.append(precision_by_sorting_hyper(file_name))
    
    return [np.mean(np.array(mylist)),np.std(np.array(mylist))]
        

In [146]:
precision_by_sorting_hyper_average(objsubset1,20,33)

[0.5239473684210527, 0.01600510652582628]

In [147]:
[precision_by_sorting_hyper_average(a,20,33) for a in file_list1]

[[0.5239473684210527, 0.01600510652582628],
 [0.5192105263157895, 0.019500337378997756],
 [0.5276315789473685, 0.018967460107676214],
 [0.551578947368421, 0.019664780728704628]]

In [148]:
[precision_by_sorting_hyper_average(a,20,33) for a in file_list2]

[[0.5228947368421053, 0.014051251657065784],
 [0.5315789473684212, 0.015163010832513635],
 [0.5326315789473685, 0.017184029176120535],
 [0.5594736842105263, 0.022211524048430725]]

In [149]:
[precision_by_sorting_hyper_average(a,20,33) for a in file_list3]

[[0.5331578947368422, 0.014708619592086455],
 [0.5371052631578948, 0.016848271897378028],
 [0.5436842105263158, 0.020927258265337947],
 [0.5652631578947369, 0.020000000000000004]]

### Syn vs Ant vs None

In [162]:
# we want to take all synonyms and antonyms, sort them, set the divide to the number of antonyms n and check
# how many of the first n are actually antonyms
def precision_by_sorting_synnonant(file_name):
    
    ants = mf.averaged_product_list(simdict,'ANTONYMS',file_name,"std_dev")
    syns = {}
    test2 = mf.averaged_product_list(simdict,'SYNONYMS',file_name,"std_dev")
    for a in random.sample(list(test2.keys()),len(ants)):
        syns.update({a : test2[a]})
    nons = {}
    test2 = mf.averaged_product_list(simdict,'NONE',file_name,"std_dev")
    for a in random.sample(list(test2.keys()),len(ants)):
        nons.update({a : test2[a]})
    
    #allverbs = syns.update(ants)
    allverbs = syns | nons | ants
    
    # Take the list of values from this dictionary and sort according to length
    alllengths = allverbs.values()
    # Sort by length
    alllengths = sorted(alllengths)
    
    #Check how many of the first n (length of the antonyms) are actually antonyms and how many of the following
    # are none
    allantvals = ants.values()
    allnonvals = list(nons.values())[:]
    allsynvals = syns.values()
    #print([set(allantvals).intersection(set(allnonvals)),set(allantvals).intersection(set(allsynvals)),set(allsynvals).intersection(set(allnonvals))])
    mycounts = []
    
    for a in alllengths[:len(ants)]:
        mycounts.append(a in allantvals)
    local = mycounts.count(True)
    #print(local/len(ants))
        
    for a in alllengths[len(ants) + 1:len(ants) + len(nons)]:
        mycounts.append(a in allnonvals)
    local = mycounts.count(True)-local
    #print(local/len(nons))
        
    for a in alllengths[-len(allsynvals):]:
        mycounts.append(a in allsynvals)
    local = mycounts.count(True)-local
    #print(local/len(syns))
    
    # Count the trues
    mycounts = mycounts.count(True)
    
    # Compute precision, and recycle variables which I do not need anymore
    
    allantvals = mycounts/(len(allsynvals) + len(allantvals) + len(allnonvals))
    
    return allantvals

In [155]:
def precision_by_sorting_synnonant_average(file_name,iterations,myseed):
    
    random.seed(myseed)
    
    mylist = []
    for a in range(iterations):
        mylist.append(precision_by_sorting_synnonant(file_name))
    
    return [np.mean(np.array(mylist)),np.std(np.array(mylist))]

In [161]:
precision_by_sorting_synnonant(objsubset1)

0.3333333333333333
0.27927927927927926
0.7297297297297297


0.33633633633633636

In [158]:
precision_by_sorting_synnonant_average(objsubset1,20,33)

[0.3546546546546547, 0.019108634558478117]

In [163]:
[precision_by_sorting_synnonant_average(a,20,33) for a in file_list1]

[[0.3546546546546547, 0.019108634558478117],
 [0.36396396396396397, 0.0217712540064767],
 [0.36036036036036034, 0.02175882394322738],
 [0.3355855855855856, 0.020715823941562014]]

In [164]:
[precision_by_sorting_synnonant_average(a,20,33) for a in file_list2]

[[0.3536036036036036, 0.021210535678306143],
 [0.35900900900900906, 0.018498961618164403],
 [0.35435435435435436, 0.018992658619629902],
 [0.3256756756756756, 0.0216648508455183]]

In [165]:
[precision_by_sorting_synnonant_average(a,20,33) for a in file_list3]

[[0.3454954954954955, 0.01667883655997623],
 [0.34894894894894896, 0.014730028190307174],
 [0.35015015015015016, 0.01697161699813188],
 [0.32387387387387384, 0.022677643986932578]]

## Precision on human judgment

In [80]:
#First extract the largest value and use it to normalise the rest
allvals=np.array([])
for a in simdict.keys():
    allvals = np.concatenate((allvals,np.array(simdict[a]).T[2]))
allvals = np.array([float(a) for a in allvals])
highest = max(allvals)
highest

9.96

In [81]:
#Extract all values for the synonym pairs and normalise
allsyn =np.array([float(a) for a in np.array(simdict['SYNONYMS']).T[2]])
allsyn = allsyn / highest
#Same for none
allnon =np.array([float(a) for a in np.array(simdict['NONE']).T[2]])
allnon = allnon / highest
#Same for anotnysm
allant =np.array([float(a) for a in np.array(simdict['ANTONYMS']).T[2]])
allant = allant / highest

In [82]:
# Compute mean and deviations
syn1 = [np.mean(allsyn),np.std(allsyn)]
non1 = [np.mean(allnon),np.std(allnon)]
ant1 = [np.mean(allant),np.std(allant)]
print(syn1,non1,ant1)

[0.6816415990760427, 0.210948684438062] [0.3445055904458733, 0.23515413813688968] [0.09816744455298675, 0.1073676682566156]


In [83]:
#Use the means and deviations to set the divide between synonyms and antonyms
divide1 = ant1[0] + ant1[1]/syn1[1] * (syn1[0]-ant1[0])/2

# Compare and count the number of synonyms we get right
syncomparison1 = [abs(a) > divide1 for a in allsyn]

# Compare and count the number of antonyms we get right
antcomparison1 = [abs(a) < divide1 for a in allant]

precision1 = (syncomparison1.count(True) + antcomparison1.count(True))/(len(syncomparison1) + len(antcomparison1))

print(divide1)
print(precision1)

0.24665440863627003
0.947242206235012


In [84]:
#Use the means and deviations to set the divide between synonyms and none
divide1 = non1[0] + non1[1]/syn1[1] * (syn1[0]-non1[0])/2

#Use the means and deviations to set the divide between none and antonyms
divide2 = ant1[0] + ant1[1]/non1[1] * (non1[0]-ant1[0])/2

# Compare and count the number of synonyms we get right
syncomparison1 = [abs(a) > divide1 for a in allsyn]

# Compare and count the number of none we get right
noncomparison1 = [abs(a) > divide2 and abs(a) < divide1 for a in allnon]

# Compare and count the number of antonyms we get right
antcomparison1 = [abs(a) < divide2 for a in allant]

precision1 = (syncomparison1.count(True) + antcomparison1.count(True) + noncomparison1.count(True))/(len(syncomparison1) + len(antcomparison1) + len(noncomparison1))

print(divide1)
print(divide2)
print(precision1)

0.5324160478055069
0.15440449939864861
0.49721115537848604


In [106]:
# We define similar functions as previously but we needed some tweeked input

allsyndict = {}
for a in simdict['SYNONYMS']:
    allsyndict.update({(a[0],a[1]):a[2]/highest})
    
allnondict = {}
for a in simdict['NONE']:
    allnondict.update({(a[0],a[1]):a[2]/highest})
    
allantdict = {}
for a in simdict['ANTONYMS']:
    allantdict.update({(a[0],a[1]):a[2]/highest})

In [107]:
def human1(myseed1,myseed2,samplesize1,samplesize2):
    
    #Compute the products for all the observable deviation vectors associated to the SYNONYMS pairs
    all1 = allsyndict

    #The subset from which we copute the means is randomly selected, we seed the random generator for repeatable results. Chosen seed 7
    random.seed(myseed1)

    mysample = random.sample(list(all1),samplesize1)

    #Find complementary sample where we will predict and test
    mycomplsample = set(all1.keys()) - set(mysample)

    #Compute the means and standard deviations
    syn1 = [np.mean(np.array(list(map(all1.get,mysample)))),np.std(np.array(list(map(all1.get,mysample))))]

    #Now the same thing for the antonyms

    #Compute the products for all the observable deviation vectors associated to the SYNONYMS pairs
    all5 = allantdict

    #The subset from which we copute the means is randomly selected, we seed the random generator for repeatable results. Chosen seed 13
    #random.seed(117) #Interesting values come out for this seed
    random.seed(myseed2)

    mysample2 = random.sample(list(all5),samplesize2)

    #Find complementary sample where we will predict and test
    mycomplsample2 = set(all5.keys()) - set(mysample2)

    #Compute the means and standard deviations
    ant1 = [np.mean(np.array(list(map(all5.get,mysample2)))),np.std(np.array(list(map(all5.get,mysample2))))]

    #Use the means and deviations to set the divide between synonyms and antonyms
    divide1 = ant1[0] + ant1[1]/syn1[1] * (syn1[0]-ant1[0])/2
    
    # Next we test on the complementary sample
    # Compare and count the number of synonyms we get right
    syncomparison1 = [a > divide1 for a in list(map(all1.get,mycomplsample))]

    # Compare and count the number of antonyms we get right
    antcomparison1 = [a < divide1 for a in list(map(all5.get,mycomplsample2))]

    #Compute the average precision
    precision1 = (syncomparison1.count(True) + antcomparison1.count(True))/(len(syncomparison1) + len(antcomparison1))

    return precision1

In [109]:
def human2(myseed1,myseed2,myseed3,samplesize1,samplesize2,samplesize3):
    
    #Since here we use just a subset of the pairs of synonyms and antonyms to determine the means and deviation, we have to do this by hand

    #Compute the products for all the observable deviation vectors associated to the SYNONYMS pairs
    all1 = allsyndict

    #The subset from which we copute the means is randomly selected, we seed the random generator for repeatable results. Chosen seed 7
    random.seed(myseed1)

    mysample = random.sample(list(all1),samplesize1)

    #Find complementary sample where we will predict and test
    mycomplsample = set(all1.keys()) - set(mysample)

    #Compute the means and standard deviations
    syn1 = [np.mean(np.array(list(map(all1.get,mysample)))),np.std(np.array(list(map(all1.get,mysample))))]

    #Now the same thing for the antonyms

    #Compute the products for all the observable deviation vectors associated to the SYNONYMS pairs
    all5 = allantdict
    
    #The subset from which we copute the means is randomly selected, we seed the random generator for repeatable results. Chosen seed 13
    random.seed(myseed2)

    mysample2 = random.sample(list(all5),samplesize2)

    #Find complementary sample where we will predict and test
    mycomplsample2 = set(all5.keys()) - set(mysample2)

    #Compute the means and standard deviations
    ant1 = [np.mean(np.array(list(map(all5.get,mysample2)))),np.std(np.array(list(map(all5.get,mysample2))))]
    
    #Finally the same for None

    #Compute the products for all the observable deviation vectors associated to the SYNONYMS pairs
    all9 = allnondict
    
    #The subset from which we copute the means is randomly selected, we seed the random generator for repeatable results. Chosen seed 13
    random.seed(myseed3)

    mysample3 = random.sample(list(all9),samplesize3)

    #Find complementary sample where we will predict and test
    mycomplsample3 = set(all9.keys()) - set(mysample3)

    #Compute the means and standard deviations
    non1 = [np.mean(np.array(list(map(all9.get,mysample3)))),np.std(np.array(list(map(all9.get,mysample3))))]
    
    #Use the means and deviations to set the divide between synonyms and none
    divide1 = non1[0] + non1[1]/syn1[1] * (syn1[0]-non1[0])/2

    #Use the means and deviations to set the divide between none and antonyms
    divide5 = ant1[0] + ant1[1]/non1[1] * (non1[0]-ant1[0])/2
    
    # Next we test on the complementary sample
    # Compare and count the number of synonyms we get right
    syncomparison1 = [a > divide1 for a in list(map(all1.get,mycomplsample))]


    # Compare and count the number of none we get right
    noncomparison1 = [a > divide5 and a < divide1 for a in list(map(all9.get,mycomplsample3))]
    
    # Compare and count the number of antonyms we get right
    antcomparison1 = [a < divide5 for a in list(map(all5.get,mycomplsample2))]
    
    #Compute the average precision
    precision1 = (syncomparison1.count(True) + antcomparison1.count(True) + noncomparison1.count(True))/(len(syncomparison1) + len(antcomparison1) + len(noncomparison1))
    
    return precision1

In [110]:
# Apply the function to every file with the given seed and average. The averaging is done using pure functions
# (lambda) so to compute all the precisions only once

# Synonyms vs Antonyms, sample size roughly 60%

(lambda x1 : [np.mean(x1,axis = 0),np.std(x1,axis=0)])([human1(*a,200,70) for a in seed_list ])

[0.9452380952380952, 0.013515861555309856]

In [111]:
# Synonyms vs None vs Antonyms, sample size roughly 60%

(lambda x1 : [np.mean(x1,axis = 0),np.std(x1,axis=0)])([human2(*a,200,70,1360) for a in seed_list2 ])

[0.5046590909090909, 0.017094248343998072]