In [162]:
from nltk.corpus import wordnet as wn
import pandas as pd
from collections import defaultdict
from itertools import chain

In [4]:
wn.synsets('dog')[0]

Synset('dog.n.01')

In [252]:
objects =pd.read_csv("../../data/unconstrained_objects/all_objects.csv")
input_object_list = [lemma.replace(" ", "_") for lemma in list(objects['object'])]
superordinates_to_include = ["entity", "tool", "writing_implement", "instrument", "electronic_equipment","conveyance", "shape", "musical_instrument", "furniture", "surface", "clothing", "footwear", "home_appliance", "plaything", "sports_equipment", "body_part", "device", "container", "sport", "game"]
# words to not include that may ruin our chain
not_included = ["structure", "sticker"]
full_object_list = input_object_list + superordinates_to_include
# this will change to include synsets..
all_words = set(full_object_list)
word_maps = {}

In [253]:
import networkx as nx
import matplotlib.pyplot as plt
def word_from_syn(syn):
    return syn.lower().split('.')[0]

def get_best_synset(word):
    """Get the best synset for a word which is the tree that has the most overlap with our word set
    - prefer the most common/general sense"""
    global all_words
    global word_maps
    synsets = wn.synsets(word, pos=wn.NOUN)
    if word =="banana":
        print(synsets)
    filtered_words = [curr_word.lower() for curr_word in all_words if curr_word != word.lower()]
    if not synsets:
        return None
    num_present = {}
    for synset in synsets:
        all_related = get_syn_hypernym_chain(synset) + synset.hyponyms()
        lemma_names = [word_from_syn(lemma.name()) for lemma in all_related]
        num_present[synset] = len([word for word in filtered_words if word in lemma_names])
    best_synset = max(num_present, key=num_present.get)
    if word == "glass":
        best_synset = synsets[6]
    elif word == "hoop":
        best_synset = synsets[3]
    elif word in ["ice", "camera", "notebook", "rock", "hand", "sponge", "button", "game", "sun", "trash", "tiger", "toast", "eye", "envelope", "ghost", "gum", "head", "monkey", "lion"]:
        best_synset = synsets[0]
    elif word in ["banana", "tummy", "frame"]:
        best_synset = synsets[1]
    elif word == "kite":
        best_synset = synsets[2]
    best_word = word_from_syn(best_synset.name())
    all_words.add(best_word)
    word_maps[word] = best_synset.name()
    return best_synset

def get_syn_hypernym_chain(syn):
    chain = [syn]
    while True:
        hypers = syn.hypernyms()
        if not hypers:
            break
        syn = hypers[0]
        chain.append(syn)
    return chain

def get_hypernym_chain(word, words=[]):
    synsets = wn.synsets(word, pos=wn.NOUN)
    if not synsets:
        return []
    if word in words:
        syn = get_best_synset(word)
        if word == "banana":
            print(syn)
    else:
        syn = synsets[0]  # pick the most common sense
    chain = [syn]
    while True:
        hypers = syn.hypernyms()
        if not hypers:
            break
        syn = hypers[0]  # again, pick most common hypernym
        chain.append(syn)
    output_chain = []
    output_chain = [syn.name() for syn in chain]
    return output_chain

# setting popularity_min arbritarily high since we aren't using it, the idea being to include superordinates that are used often
# toggle this value to get a more representative larger tree, setting to 20 is nice
def build_wordnet_tree(object_list, popularity_min=20000):
    global all_words
    global word_maps
    G = nx.DiGraph()
    syn_count = {}
    syn_chains = {}
    for word in object_list:
        syn_chain = get_hypernym_chain(word, object_list)
        syn_count.update({syn: syn_count.get(syn, 0) + 1 for syn in syn_chain})
        syn_chains[word] = syn_chain
    print(syn_chains)
    for word in object_list:
        output_chain = []
        curr_chain = syn_chains[word]
        for syn in curr_chain:
            if (syn in word_maps.values() or syn_count[syn] > popularity_min) and not word_from_syn(syn) in not_included:
                output_chain.append(syn)
        for i in range(len(output_chain) - 1):
            parent = output_chain[i+1]
            child = output_chain[i]
            G.add_edge(parent, child)
    return G

G = build_wordnet_tree(full_object_list)

[Synset('banana.n.01'), Synset('banana.n.02')]
Synset('banana.n.02')
{'adult': ['adult.n.01', 'person.n.01', 'organism.n.01', 'living_thing.n.01', 'whole.n.02', 'object.n.01', 'physical_entity.n.01', 'entity.n.01'], 'adult_foot': [], 'adult_hand': [], 'adult_man': [], 'adult_woman': [], 'air_conditioner': ['air_conditioner.n.01', 'cooling_system.n.02', 'mechanism.n.05', 'device.n.01', 'instrumentality.n.03', 'artifact.n.01', 'whole.n.02', 'object.n.01', 'physical_entity.n.01', 'entity.n.01'], 'air_purifier': [], 'airplane_toy': [], 'alarm_clock': ['alarm_clock.n.01', 'clock.n.01', 'timepiece.n.01', 'measuring_instrument.n.01', 'instrument.n.01', 'device.n.01', 'instrumentality.n.03', 'artifact.n.01', 'whole.n.02', 'object.n.01', 'physical_entity.n.01', 'entity.n.01'], 'alphabet_block': [], 'animal': ['animal.n.01', 'organism.n.01', 'living_thing.n.01', 'whole.n.02', 'object.n.01', 'physical_entity.n.01', 'entity.n.01'], 'animal_picture': [], 'animal_sticker': [], 'apple': ['apple.n.02'

Saving

In [254]:
from networkx.readwrite import json_graph

# Assuming G is your NetworkX graph
json_data = json_graph.node_link_data(G)
# Now you can use the json_data object, for example, write it to a file
import json
with open("../../data/unconstrained_objects/wordnet.json", "w") as f:
    json.dump(json_data, f)
pd.DataFrame(word_maps.items()).to_csv("../../data/unconstrained_objects/wordnet_word_map.csv")

The default value will be `edges="edges" in NetworkX 3.6.


  nx.node_link_data(G, edges="links") to preserve current behavior, or
  nx.node_link_data(G, edges="edges") for forward compatibility.


In [248]:
get_syn_hypernym_chain(wn.synsets("frame")[2])

[Synset('frame.n.02'),
 Synset('photograph.n.01'),
 Synset('representation.n.02'),
 Synset('creation.n.02'),
 Synset('artifact.n.01'),
 Synset('whole.n.02'),
 Synset('object.n.01'),
 Synset('physical_entity.n.01'),
 Synset('entity.n.01')]