In [1]:
# Import packages
import nltk
import csv
import numpy as np
import pandas as pd

In [2]:
# Import WordNet dataset
from nltk.corpus import wordnet as wn

In [3]:
# Start with three independent empty lists: target words, their WordNet
# sense names, and the resolved synsets.
objects, objectStrings, objectSets = [], [], []

In [4]:
# Array of Target Objects
objects = ['baby', 'book', 'bottle', 'cat', 'dog', 'hand', 'shoe', 'spoon']

# WordNet Senses Corresponding to the target objects (hand coded by Ananya Mittal)
objectStrings = ['baby.n.01', 'book.n.01', 'bottle.n.01', 'cat.n.01', 
                 'dog.n.01',  'hand.n.01', 'shoe.n.01',   'spoon.n.01']

# Get synsets for target objects.
# The list is rebuilt from scratch (rather than appended to) so that re-running
# this cell always leaves exactly one synset per entry of objectStrings, in the
# same order, instead of accumulating duplicates.
objectSets = [wn.synset(sense_name) for sense_name in objectStrings]

In [5]:
# Create DataFrame of all MCDI words, including hand coded senses for each word.
# The CSV's own header row is replaced by the five names below.
MCDI_COLUMNS = ['words', 'entry', 'pos', 'sense', 'string']

mcdi_df = pd.read_csv('MCDI.csv')
mcdi_df.columns = MCDI_COLUMNS

# Two-column view: the word and its WordNet sense name
data = mcdi_df.loc[:, ['words', 'string']]

In [6]:
# Check DF: display the frame to confirm the CSV loaded and the columns were renamed
mcdi_df

Unnamed: 0,words,entry,pos,sense,string
0,airplane,airplane,.n.,1,airplane.n.01
1,animal,animal,.n.,1,animal.n.01
2,apple,apple,.n.,1,apple.n.01
3,arm,arm,.n.,1,arm.n.01
4,aunt,aunt,.n.,1,aunt.n.01
5,baby,baby,.n.,1,baby.n.01
6,babysitter,babysitter,.n.,1,babysitter.n.01
7,backyard,backyard,.n.,1,backyard.n.01
8,ball,ball,.n.,1,ball.n.01
9,balloon,balloon,.n.,2,balloon.n.02


In [7]:
# Get synsets for each word
def find_synset(st):
    """Look up the WordNet synset named by ``st`` (e.g. ``'dog.n.01'``).

    Returns the synset, or None when the lookup cannot succeed for that
    particular name: ``st`` is not a string (for instance the NaN pandas
    uses for an empty CSV cell), is not in ``lemma.pos.NN`` form, or names
    a sense WordNet does not contain.

    Any other error is left to propagate.  In particular, the LookupError
    NLTK raises when the WordNet corpus has not been downloaded is not
    swallowed, so a broken setup cannot masquerade as "no word has a synset".
    """
    if not isinstance(st, str):
        return None
    try:
        return wn.synset(st)
    except (nltk.corpus.reader.wordnet.WordNetError, ValueError):
        # WordNetError: unknown lemma, part of speech or sense number.
        # ValueError: the name does not split into lemma.pos.NN, or NN is
        # not an integer.
        return None
# Resolve every sense name to its synset (None where the lookup fails)
mcdi_df['synset'] = mcdi_df['string'].map(find_synset)

In [8]:
# Check DF: display the frame again to confirm the new 'synset' column was filled in
mcdi_df

Unnamed: 0,words,entry,pos,sense,string,synset
0,airplane,airplane,.n.,1,airplane.n.01,Synset('airplane.n.01')
1,animal,animal,.n.,1,animal.n.01,Synset('animal.n.01')
2,apple,apple,.n.,1,apple.n.01,Synset('apple.n.01')
3,arm,arm,.n.,1,arm.n.01,Synset('arm.n.01')
4,aunt,aunt,.n.,1,aunt.n.01,Synset('aunt.n.01')
5,baby,baby,.n.,1,baby.n.01,Synset('baby.n.01')
6,babysitter,babysitter,.n.,1,babysitter.n.01,Synset('babysitter.n.01')
7,backyard,backyard,.n.,1,backyard.n.01,Synset('backyard.n.01')
8,ball,ball,.n.,1,ball.n.01,Synset('ball.n.01')
9,balloon,balloon,.n.,2,balloon.n.02,Synset('balloon.n.02')


In [9]:
# Create DFs to store similarity ratings (OBJECTS x MCDI):
# one row per MCDI word, one column per target object, every cell starting at 0.0.
n_words = len(mcdi_df['words'])
n_objects = len(objects)

similarity_df_PL, similarity_df_LCH, similarity_df_WUP = (
    pd.DataFrame(0.0, index=range(n_words), columns=range(n_objects))
    for _ in range(3)
)

In [10]:
## Create DFs to store similarity ratings (OBJECTS x OBJECTS)
#similarity_df_PL = pd.DataFrame(0.0, index=range(len(objects)), columns=range(len(objects)))
#similarity_df_LCH = pd.DataFrame(0.0, index=range(len(objects)), columns=range(len(objects)))
#similarity_df_WUP = pd.DataFrame(0.0, index=range(len(objects)), columns=range(len(objects)))

In [11]:
## Create DFs to store similarity ratings (MCDI x MCDI)
#similarity_df_PL = pd.DataFrame(0.0, index=range(len(mcdi_df['words'])), columns=range(len(mcdi_df['words'])))
#similarity_df_LCH = pd.DataFrame(0.0, index=range(len(mcdi_df['words'])), columns=range(len(mcdi_df['words'])))
#similarity_df_WUP = pd.DataFrame(0.0, index=range(len(mcdi_df['words'])), columns=range(len(mcdi_df['words'])))

In [12]:
# Helper function: check if a given word has synsets
def has_synset(st):
    """Return False when ``st`` is None, True for any other value."""
    return st is not None

In [13]:
# For each word in the MCDI, check that they all have synsets
synset_column = mcdi_df['synset']

# Row positions whose synset lookup came back empty; each one is printed
# so the offending words can be found in mcdi_df.
missing_rows = [row for row, synset in enumerate(synset_column) if not has_synset(synset)]
for row in missing_rows:
    print(row)

n_missing = len(missing_rows)
n_found = len(synset_column) - n_missing

print("Number of words that do not have any synsets: ", n_missing)
print("Number of words that do have synsets        : ", n_found)

Number of words that do not have any synsets:  0
Number of words that do have synsets        :  278


In [14]:
# Test: Path length similarity between BABY and babysitter
# (row 6 of mcdi_df is 'babysitter'; column 0 is the first target object, 'baby').
# .at[row, column] writes directly into the frame.  The chained form
# similarity_df_PL[0][6] = ... assigns through an intermediate Series, which
# pandas warns about and which leaves the frame unchanged under Copy-on-Write.
if has_synset(mcdi_df['synset'][6]):
    similarity_df_PL.at[6, 0] = objectSets[0].path_similarity(mcdi_df['synset'][6])
print("similarity = ", round(similarity_df_PL.at[6, 0], 3))

similarity =  0.111


In [15]:
# Populate similarity DFs using different measures of similarity.
# Rows are MCDI word positions (w), columns are target-object positions (o).
# For the OBJECTS x OBJECTS or MCDI x MCDI variants, both loops would range
# over the same collection (objects, or mcdi_df['words']) instead.

def _score_or_nan(score):
    """Return ``score`` unchanged, or NaN when WordNet reported no similarity (None)."""
    return np.nan if score is None else score


for w in range(len(mcdi_df['words'])):
    word_synset = mcdi_df['synset'][w]
    for o in range(len(objects)):
        # .at[row, column] writes straight into the frame.  The chained form
        # df[o][w] = ... assigns through an intermediate Series, which pandas
        # warns about and which leaves the frame unchanged under Copy-on-Write.
        if has_synset(word_synset):
            target_synset = objectSets[o]
            similarity_df_PL.at[w, o] = _score_or_nan(target_synset.path_similarity(word_synset))
            similarity_df_LCH.at[w, o] = _score_or_nan(target_synset.lch_similarity(word_synset))
            similarity_df_WUP.at[w, o] = _score_or_nan(target_synset.wup_similarity(word_synset))
        else:
            # No synset for this word: mark the cell as missing in every measure
            similarity_df_PL.at[w, o] = np.nan
            similarity_df_LCH.at[w, o] = np.nan
            similarity_df_WUP.at[w, o] = np.nan

In [16]:
# Display the path-similarity scores (one row per MCDI word, one column per target object)
similarity_df_PL

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.062500,0.076923,0.100000,0.052632,0.071429,0.058824,0.090909,0.111111
1,0.142857,0.100000,0.111111,0.125000,0.333333,0.083333,0.125000,0.125000
2,0.076923,0.071429,0.076923,0.055556,0.076923,0.076923,0.083333,0.083333
3,0.071429,0.062500,0.066667,0.050000,0.066667,0.166667,0.071429,0.071429
4,0.166667,0.076923,0.083333,0.076923,0.125000,0.083333,0.090909,0.090909
5,1.000000,0.071429,0.076923,0.071429,0.111111,0.076923,0.083333,0.083333
6,0.111111,0.071429,0.076923,0.071429,0.111111,0.076923,0.083333,0.083333
7,0.066667,0.066667,0.071429,0.052632,0.071429,0.066667,0.076923,0.076923
8,0.076923,0.100000,0.142857,0.062500,0.090909,0.071429,0.125000,0.166667
9,0.090909,0.125000,0.142857,0.071429,0.111111,0.083333,0.166667,0.166667


In [17]:
# Format DFs
# Column labels are the target objects in upper case ('BABY', 'BOOK', ...),
# derived from `objects` so they always match the column order used when the
# frames were filled.
object_labels = [name.upper() for name in objects]

for sim_df in (similarity_df_PL, similarity_df_LCH, similarity_df_WUP):
    sim_df.columns = object_labels
    # Keep the integer row index and expose the MCDI word as an ordinary
    # column.  (The earlier sim_df.rename(index=...) calls returned a new frame
    # that was thrown away, so they never changed anything and were removed.)
    sim_df['words'] = mcdi_df['words']

similarity_df_WUP

Unnamed: 0,BABY,BOOK,BOTTLE,CAT,DOG,HAND,SHOE,SPOON,words
0,0.347826,0.454545,0.571429,0.307692,0.380952,0.200000,0.500000,0.600000,airplane
1,0.666667,0.470588,0.500000,0.666667,0.875000,0.266667,0.533333,0.533333,animal
2,0.363636,0.380952,0.400000,0.320000,0.400000,0.250000,0.421053,0.421053,apple
3,0.235294,0.210526,0.222222,0.173913,0.222222,0.705882,0.235294,0.235294,arm
4,0.571429,0.400000,0.421053,0.500000,0.631579,0.266667,0.444444,0.444444,aunt
5,1.000000,0.380952,0.400000,0.480000,0.600000,0.250000,0.421053,0.421053,baby
6,0.545455,0.380952,0.400000,0.480000,0.600000,0.250000,0.421053,0.421053,babysitter
7,0.300000,0.300000,0.315789,0.250000,0.315789,0.222222,0.333333,0.333333,backyard
8,0.400000,0.526316,0.666667,0.347826,0.444444,0.235294,0.588235,0.705882,ball
9,0.444444,0.588235,0.625000,0.380952,0.500000,0.266667,0.666667,0.666667,balloon


In [18]:
# Sanity Check: Test BABY
# Rank every MCDI word by its similarity to BABY under each measure and keep
# the first 12 rows of each ranking.
ranked_by_baby = [
    sim_df.sort_values(by=['BABY'], axis=0, ascending=False)
    for sim_df in (similarity_df_PL, similarity_df_LCH, similarity_df_WUP)
]
simA_ind_BABY, simB_ind_BABY, simC_ind_BABY = (
    ranked['BABY'].iloc[0:12] for ranked in ranked_by_baby
)

for top_rows in (simA_ind_BABY, simB_ind_BABY, simC_ind_BABY):
    print(top_rows)

# Hand-written legend (row label = MCDI word).
# NOTE(review): this legend is not generated from the data and looks partly
# wrong: row 1 is 'animal' in mcdi_df, and 'child' and 'lady' are each listed
# for two different rows. Verify against mcdi_df['words'] before relying on it.
# 62  = child
# 182 = person
# 263 = plant
# 4   = aunt
# 188 = animal
# 1   = dog
# 35  = man
# 39  = boy
# 110 = sister
# 61  = child
# 142 = brother
# 81  = bird
# 195 = lady
# 109 = lady



5      1.000000
62     0.500000
182    0.200000
263    0.166667
4      0.166667
188    0.142857
1      0.142857
35     0.142857
39     0.142857
110    0.142857
61     0.142857
142    0.142857
Name: BABY, dtype: float64
5      3.637586
62     2.944439
182    2.028148
263    1.845827
4      1.845827
188    1.691676
1      1.691676
35     1.691676
39     1.691676
110    1.691676
61     1.691676
142    1.691676
Name: BABY, dtype: float64
5      1.000000
62     0.952381
1      0.666667
188    0.666667
182    0.666667
142    0.600000
110    0.600000
81     0.600000
35     0.600000
61     0.600000
195    0.571429
109    0.571429
Name: BABY, dtype: float64


In [19]:
# Sanity Check: Test BOOK
# Rank every MCDI word by its similarity to BOOK under each measure and keep
# rows 2-12 of each ranking. The top-ranked row is skipped (presumably 'book'
# itself -- confirm).
ranked_by_book = [
    sim_df.sort_values(by=['BOOK'], axis=0, ascending=False)
    for sim_df in (similarity_df_PL, similarity_df_LCH, similarity_df_WUP)
]
simA_ind_BOOK, simB_ind_BOOK, simC_ind_BOOK = (
    ranked['BOOK'].iloc[1:12] for ranked in ranked_by_book
)

for top_rows in (simA_ind_BOOK, simB_ind_BOOK, simC_ind_BOOK):
    print(top_rows)

# Hand-written legend (row label = MCDI word).
# NOTE(review): apart from 183, none of these row labels appear in the output
# printed by this cell, so the legend is probably left over from an earlier
# version of the data. Verify against mcdi_df['words'] before relying on it.
# 215 = toy
# 161 = picture
# 25  = block
# 75  = doll
# 45  = cake
# 169 = pool
# 234 = zoo
# 78  = drawer
# 170 = potty
# 73  = dish
# 196 = stairs
# 30  = box
# 183 = shoe

27     0.200000
252    0.142857
184    0.142857
26     0.142857
183    0.142857
241    0.125000
238    0.125000
222    0.125000
82     0.125000
192    0.125000
277    0.125000
Name: BOOK, dtype: float64
27     2.028148
252    1.691676
184    1.691676
26     1.691676
183    1.691676
241    1.558145
238    1.558145
222    1.558145
82     1.558145
192    1.558145
277    1.558145
Name: BOOK, dtype: float64
27     0.777778
184    0.666667
183    0.666667
252    0.625000
26     0.625000
222    0.588235
192    0.588235
238    0.588235
82     0.588235
241    0.588235
277    0.588235
Name: BOOK, dtype: float64


In [None]:
## Save Object x Words Models to CSVs
#similarity_df_PL.to_csv('path_similarity_OxW.csv')
#similarity_df_LCH.to_csv('lch_similarity_OxW.csv')
#similarity_df_WUP.to_csv('wup_similarity_OxW.csv')

In [None]:
## Save Object x Objects Models to CSVs
#similarity_df_PL.to_csv('path_similarity_OxO.csv')
#similarity_df_LCH.to_csv('lch_similarity_OxO.csv')
#similarity_df_WUP.to_csv('wup_similarity_OxO.csv')

In [None]:
## Save Words x Words Models to CSVs
#similarity_df_PL.to_csv('path_similarity_WxW.csv')
#similarity_df_LCH.to_csv('lch_similarity_WxW.csv')
#similarity_df_WUP.to_csv('wup_similarity_WxW.csv')