In [None]:
# Import packages
import csv

import nltk
import numpy as np
import pandas as pd
from nltk.corpus.reader.wordnet import WordNetError

In [None]:
# Import WordNet dataset
from nltk.corpus import wordnet as wn

In [None]:
# Start with three separate, empty containers: the target words, their
# WordNet sense names, and the resolved synsets.
objects, objectStrings, objectSets = [], [], []

In [None]:
# Array of Target Objects
objects = ['baby', 'book', 'bottle', 'cat', 'dog', 'hand', 'shoe', 'spoon']

# WordNet Senses Corresponding to the target objects (hand coded by Ananya Mittal)
objectStrings = ['baby.n.01', 'book.n.01', 'bottle.n.01', 'cat.n.01', 
                 'dog.n.01',  'hand.n.01', 'shoe.n.01',   'spoon.n.01']

# The two lists are parallel (objects[k] is described by objectStrings[k]),
# so they must stay the same length.
assert len(objects) == len(objectStrings), "each target object needs exactly one sense"

# Get synsets for target objects.
# The list is rebuilt from scratch rather than appended to, so re-running
# this cell cannot leave duplicate synsets behind.
objectSets = [wn.synset(sense) for sense in objectStrings]

In [None]:
# Load every MCDI word together with its hand-coded sense information,
# then give the five columns short, fixed names.
mcdi_column_names = ['words', 'entry', 'pos', 'sense', 'string']
mcdi_df = pd.read_csv('MCDI.csv')
mcdi_df.columns = mcdi_column_names

# Two-column view: the word and its WordNet sense string.
data = mcdi_df[['words', 'string']]

In [None]:
# Check DF: display the MCDI frame to confirm it loaded and that the
# column names assigned above ('words', 'entry', 'pos', 'sense', 'string') are in place.
mcdi_df

In [None]:
# Get synsets for each word
def find_synset(st):
    """Resolve a WordNet sense name (e.g. ``'dog.n.01'``) to its synset.

    Parameters
    ----------
    st : object
        The sense name to look up. Anything that is not a string (for
        example ``None``, or the NaN pandas uses for a missing value) is
        treated as "no sense available".

    Returns
    -------
    The synset returned by ``wn.synset(st)``, or ``None`` when ``st`` is not
    a string, is not shaped like a sense name, or names a sense WordNet does
    not contain.

    Only lookup failures are turned into ``None``; any other error (such as
    a missing corpus) is allowed to propagate instead of being hidden.
    """
    # Missing values are not strings, so there is nothing to look up.
    if not isinstance(st, str):
        return None
    try:
        return wn.synset(st)
    except (WordNetError, ValueError):
        # WordNetError: no such lemma/sense; ValueError: malformed sense name.
        return None
# Store the looked-up synset (or None) next to each sense string.
mcdi_df['synset'] = mcdi_df['string'].map(find_synset)

In [None]:
# Check DF: display the frame again to inspect the 'synset' column
# (entries are None where find_synset could not resolve the sense string).
mcdi_df

In [None]:
# One zero-filled table per similarity measure. Rows are numbered per MCDI
# word, columns are numbered per target object.
word_rows = range(len(mcdi_df['words']))
object_cols = range(len(objects))

similarity_df_PL, similarity_df_LCH, similarity_df_WUP = (
    pd.DataFrame(0.0, index=word_rows, columns=object_cols)
    for _ in range(3)
)

In [None]:
# Helper: tell whether a synset lookup produced a result
def has_synset(st):
    """Return False when ``st`` is None and True for any other value."""
    return st is not None

In [None]:
# Coverage report: print the position of every MCDI word whose synset is
# missing, then print how many words fall into each group.
unresolved_positions = []
resolved_total = 0

for position, synset in enumerate(mcdi_df['synset']):
    if has_synset(synset):
        resolved_total += 1
    else:
        unresolved_positions.append(position)
        print(position)

print("Number of words that do not have any synsets: ", len(unresolved_positions))
print("Number of words that do have synsets        : ", resolved_total)

In [None]:
# Test: Path length similarity between BABY and babysitter
# Column 0 is the first target object; row 6 is the word this test was
# written for ("babysitter" per the note above - confirm against MCDI.csv).
# `.at[row, column]` writes straight into the frame. The previous chained
# form `df[column][row] = value` assigns into an intermediate Series, which
# pandas does not guarantee to write back (and never does under Copy-on-Write).
if has_synset(mcdi_df.at[6, 'synset']):
    similarity_df_PL.at[6, 0] = objectSets[0].path_similarity(mcdi_df.at[6, 'synset'])
print("similarity = ", round(similarity_df_PL.at[6, 0], 3))

In [None]:
# Populate similarity DFs using different measures of similarity.
#
# Every (word, object) cell is written with `.at[row, column]`. The previous
# chained form `df[column][row] = value` assigns into an intermediate Series,
# which pandas does not guarantee to write back (and never does under
# Copy-on-Write), so the frames could silently stay at their initial 0.0.
#
# Cells are set to NaN when the word has no synset, or when a measure
# returns None instead of a score.
for w in range(len(mcdi_df['words'])):
    word_synset = mcdi_df.at[w, 'synset']
    word_known = has_synset(word_synset)  # same answer for every object, so check once per word

    for o in range(len(objects)):
        if word_known:
            target = objectSets[o]
            scores = (
                target.path_similarity(word_synset),
                target.lch_similarity(word_synset),
                target.wup_similarity(word_synset),
            )
        else:
            scores = (None, None, None)

        path_score, lch_score, wup_score = (
            np.nan if score is None else score for score in scores
        )
        similarity_df_PL.at[w, o] = path_score
        similarity_df_LCH.at[w, o] = lch_score
        similarity_df_WUP.at[w, o] = wup_score

In [None]:
# Display the path-similarity frame to inspect the values written by the population loop.
similarity_df_PL

In [None]:
# Format DFs
# Label each object column with the upper-cased target name
# ('BABY', 'BOOK', ..., 'SPOON'), derived from `objects` so the labels cannot
# drift from the target list. Renaming through a mapping, instead of
# overwriting `.columns`, keeps this cell re-runnable: on a second run the
# integer labels are already gone and the rename simply does nothing, whereas
# overwriting `.columns` would fail once the extra 'words' column exists.
object_labels = {position: name.upper() for position, name in enumerate(objects)}

similarity_df_PL = similarity_df_PL.rename(columns=object_labels)
similarity_df_LCH = similarity_df_LCH.rename(columns=object_labels)
similarity_df_WUP = similarity_df_WUP.rename(columns=object_labels)

# The rows keep their integer index. The earlier `rename(index=...)` calls
# were removed: their return value was discarded, so they never changed the
# frames. Each word is attached as a regular column instead, matched to its
# row by that shared integer index.
similarity_df_PL['words'] = mcdi_df['words']
similarity_df_LCH['words'] = mcdi_df['words']
similarity_df_WUP['words'] = mcdi_df['words']

similarity_df_WUP

In [None]:
# Sanity Check: Test BABY
# For each similarity measure, order the words from most to least similar to
# BABY and keep the BABY scores of the first 12 rows.
simA_ind_BABY, simB_ind_BABY, simC_ind_BABY = (
    frame.sort_values(by='BABY', ascending=False).head(12)['BABY']
    for frame in (similarity_df_PL, similarity_df_LCH, similarity_df_WUP)
)

for baby_ranking in (simA_ind_BABY, simB_ind_BABY, simC_ind_BABY):
    print(baby_ranking)

# Row index -> MCDI word, as noted by the author for the rows printed above
# (not re-verified here):
# 62  = child
# 182 = person
# 263 = plant
# 4   = aunt
# 188 = animal
# 1   = dog
# 35  = man
# 39  = boy
# 110 = sister
# 61  = child
# 142 = brother
# 81  = bird
# 195 = lady
# 109 = lady



In [None]:
# Sanity Check: Test BOOK
# For each similarity measure, order the words from most to least similar to
# BOOK, skip the top-ranked row and keep the BOOK scores of the next 11 rows.
simA_ind_BOOK, simB_ind_BOOK, simC_ind_BOOK = (
    frame.sort_values(by='BOOK', ascending=False).iloc[1:12]['BOOK']
    for frame in (similarity_df_PL, similarity_df_LCH, similarity_df_WUP)
)

for book_ranking in (simA_ind_BOOK, simB_ind_BOOK, simC_ind_BOOK):
    print(book_ranking)

# Row index -> MCDI word, as noted by the author for the rows printed above
# (not re-verified here):
# 215 = toy
# 161 = picture
# 25  = block
# 75  = doll
# 45  = cake
# 169 = pool
# 234 = zoo
# 78  = drawer
# 170 = potty
# 73  = dish
# 196 = stairs
# 30  = box
# 183 = shoe

In [None]:
# Write each Object x Words similarity table to its own CSV file.
csv_exports = (
    ('path_similarity_OxW.csv', similarity_df_PL),
    ('lch_similarity_OxW.csv', similarity_df_LCH),
    ('wup_similarity_OxW.csv', similarity_df_WUP),
)
for csv_name, similarity_frame in csv_exports:
    similarity_frame.to_csv(csv_name)