In [1]:
import sys
import time

import pandas as pd
from tqdm import tqdm

from wn import WordNet

from pywsd.utils import lemmatize
from pywsd.lesk import synset_signatures

# Shared WordNet handle; the signature-building loop below enumerates its
# synsets through wn.all_synsets().
wn = WordNet()

Warming up PyWSD (takes ~10 secs)... took 7.696908950805664 secs.


In [2]:
# Build one record per WordNet synset holding its three signature variants
# ('simple', 'adapted', 'original') as returned by pywsd's synset_signatures().

# Pre-processing switches shared by every synset_signatures() call below; kept
# in one place so the three variants cannot drift apart.
signature_options = dict(remove_stopwords=True, to_lemmatize=True,
                         remove_numbers=True, lowercase=True)

all_signatures = []

# perf_counter() is a monotonic, high-resolution clock, so the reported
# duration cannot be skewed by system clock adjustments (unlike time.time()).
start = time.perf_counter()
for ss in tqdm(wn.all_synsets()):
    # Identifier of the form '<offset zero-padded to 8 chars>-<pos>',
    # e.g. '00001740-a'.
    idx = str(ss.offset()).zfill(8) + '-' + ss.pos()

    # Signatures are computed in the same order as before:
    # simple, adapted, original.
    simple = synset_signatures(ss, hyperhypo=True, adapted=False,
                               **signature_options)
    adapted = synset_signatures(ss, hyperhypo=True, adapted=True,
                                **signature_options)
    original = synset_signatures(ss, original_lesk=True,
                                 **signature_options)

    # Key order is preserved from the original record layout.
    all_signatures.append({'name': ss.name(), 'offset-pos': idx,
                           'original': original,
                           'simple': simple,
                           'adapted': adapted})

# Report the elapsed seconds on stderr so it sits next to the tqdm output.
print('took {}'.format(time.perf_counter() - start), file=sys.stderr)

106966it [00:23, 4483.62it/s]
took 23.86339020729065


In [3]:
# Tabulate the collected records: one row per synset, one column per record
# key ('name', 'offset-pos', 'original', 'simple', 'adapted').
df = pd.DataFrame(data=all_signatures)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,adapted,name,offset-pos,original,simple
0,"{skill, able, mean, usually, authority, grant,...",able.a.01,00001740-a,"{skill, followed, having, to, authority, ', us...","{skill, able, mean, usually, authority, grant,..."
1,"{skill, obtain, without, mean, usually, fund, ...",unable.a.01,00002098-a,"{skill, followed, having, to, usually, ', (, n...","{skill, obtain, without, mean, usually, fund, ..."
2,"{stem, underside, face, dorsal, organ, side, l...",abaxial.a.01,00002312-a,"{from, facing, of, or, organ, an, axis, away, ...","{stem, underside, face, dorsal, organ, side, l..."
3,"{toward, know, upper, face, organ, adaxial, si...",adaxial.a.01,00002527-a,"{toward, to, nearest, facing, of, or, organ, a...","{toward, know, upper, face, organ, adaxial, si..."
4,"{toward, acroscopic, apex, face, side}",acroscopic.a.01,00002730-a,"{toward, apex, facing, or, side, on, the}","{toward, acroscopic, apex, face, side}"


In [4]:
# Re-key the table by synset name and flip it, so that each synset becomes a
# column and the remaining fields (offset-pos and the three signature
# variants) become the rows.
pywsd_signatures = df.set_index(keys='name').transpose()
# Preview the transposed table.
pywsd_signatures.head(n=5)

name,able.a.01,unable.a.01,abaxial.a.01,adaxial.a.01,acroscopic.a.01,basiscopic.a.01,abducent.a.01,adducent.a.01,nascent.a.01,dying.a.01,...,overcast.v.01,overcloud.v.01,clear_up.v.04,blight.v.01,swamp.v.01,run_dry.v.01,fog_up.v.01,char.v.01,haze.v.01,deflagrate.v.01
adapted,"{skill, able, mean, usually, authority, grant,...","{skill, obtain, without, mean, usually, fund, ...","{stem, underside, face, dorsal, organ, side, l...","{toward, know, upper, face, organ, adaxial, si...","{toward, acroscopic, apex, face, side}","{toward, basiscopic, face, side, base}","{muscle, abduct, draw, especially, part, abduc...","{toward, muscle, adduct, especially, draw, add...","{dissilient, emerge, begin, born, parturient, ...","{fire, wish, associate, man, passing, moribund...",...,"{fall, haze, weather, often, cloudy, fog_up, o...","{overcloud, cloud_over, darken, become, cloud,...","{storm, brighten, clear, become, light_up, cle...","{afflict, cause, blight, suffer, smite, plague...","{tsunami, drench, every, submerge, swamp, boat...","{dry_out, run, dry, river, run_dry, empty, bec...","{fog, get, windshield, fog_up, foggy, cloud, o...","{without, fire, drench, char, charcoal, everyt...","{haze, dull, cloudy, become, cloud, overcast, ...","{must, care, cause, rapidly, deflagrate, inten..."
offset-pos,00001740-a,00002098-a,00002312-a,00002527-a,00002730-a,00002843-a,00002956-a,00003131-a,00003356-a,00003939-a,...,02770717-v,02771020-v,02771169-v,02771320-v,02771564-v,02771756-v,02771888-v,02771997-v,02772202-v,02772310-v
original,"{skill, followed, having, to, authority, ', us...","{skill, followed, having, to, usually, ', (, n...","{from, facing, of, or, organ, an, axis, away, ...","{toward, to, nearest, facing, of, or, organ, a...","{toward, apex, facing, or, side, on, the}","{toward, facing, base, or, side, on, the}","{from, especially, muscles, of, or, part, draw...","{toward, especially, muscles, of, or, part, dr...","{being, or, beginning, born}","{with, from, in, to, ceasing, of, passing, or,...",...,"{overcast, cloudy, or, make}","{covered, become, clouds, with}","{become, clear}","{to, cause, a, blight, suffer}","{drench, or, drenched, submerge, submerged, be}","{empty, become, of, water}","{foggy, get}","{charcoal, to, burn}","{,, or, dull, cloudy, become, hazy}","{with, to, cause, rapidly, intensity, and, bur..."
simple,"{skill, able, mean, usually, authority, grant,...","{skill, obtain, without, mean, usually, fund, ...","{stem, underside, face, dorsal, organ, side, l...","{toward, know, upper, face, organ, adaxial, si...","{toward, acroscopic, apex, face, side}","{toward, basiscopic, face, side, base}","{muscle, abduct, draw, especially, part, abduc...","{toward, muscle, adduct, especially, draw, add...","{begin, born, nascent, insurgency, chick}","{fire, wish, associate, man, passing, process,...",...,"{fall, haze, weather, often, cloudy, fog_up, o...","{overcloud, cloud_over, darken, become, cloud,...","{storm, brighten, clear, become, light_up, cle...","{afflict, cause, blight, suffer, smite, plague...","{tsunami, drench, every, submerge, swamp, boat...","{dry_out, run, dry, river, run_dry, empty, bec...","{fog, get, windshield, fog_up, foggy, cloud, o...","{without, fire, drench, char, charcoal, everyt...","{haze, dull, cloudy, become, cloud, overcast, ...","{must, care, cause, rapidly, deflagrate, inten..."


In [5]:
# Persist the transposed signature table to disk.
# NOTE(review): pickle protocol 2 is the newest protocol Python 2 can read;
# presumably chosen for backward compatibility -- confirm with consumers.
pywsd_signatures.to_pickle(
    'signatures-wordnet-3.0.pkl',
    protocol=2,
)