In [1]:
import sys
import time

import pandas as pd

from nltk.corpus import wordnet as wn

from pywsd.utils import lemmatize
from pywsd.lesk import synset_signatures

Warming up PyWSD (takes ~10 secs)... took 6.695136785507202 secs.


In [2]:
# Build one record per WordNet synset holding three PyWSD signature variants
# ('simple', 'adapted', 'original'), timing the whole pass.

# Cleaning options passed unchanged to every synset_signatures call below.
cleaning_opts = dict(remove_stopwords=True, to_lemmatize=True,
                     remove_numbers=True, lowercase=True)

all_signatures = []

start = time.time()
for synset in wn.all_synsets():
    # Identifier of the form '<offset zero-padded to 8 digits>-<pos>',
    # e.g. '00001740-a'.
    offset_pos = '{}-{}'.format(str(synset.offset()).zfill(8), synset.pos())

    # Same three calls, in the same order as before: simple, adapted, original.
    simple_sig = synset_signatures(synset, hyperhypo=True, adapted=False,
                                   **cleaning_opts)
    adapted_sig = synset_signatures(synset, hyperhypo=True, adapted=True,
                                    **cleaning_opts)
    # NOTE(review): the 'original' signatures shown in the DataFrame preview
    # further down still contain stopwords and punctuation, so the cleaning
    # flags may be ignored when original_lesk=True -- confirm against pywsd.
    original_sig = synset_signatures(synset, original_lesk=True,
                                     **cleaning_opts)

    # Key insertion order is kept as-is because it can determine the column
    # order of the DataFrame built from these records.
    all_signatures.append({
        'name': synset.name(),
        'offset-pos': offset_pos,
        'original': original_sig,
        'simple': simple_sig,
        'adapted': adapted_sig,
    })

# Elapsed wall-clock seconds go to stderr rather than stdout.
print('took {}'.format(time.time() - start), file=sys.stderr)

took 38.222004890441895


In [3]:
# Turn the list of per-synset records into a table: one row per synset, with
# one column per record key ('name', 'offset-pos', 'original', 'simple', 'adapted').
df = pd.DataFrame(all_signatures)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,adapted,name,offset-pos,original,simple
0,"{know-how, project, able, computer, car, somet...",able.a.01,00001740-a,"{having, know-how, by, to, (, something, neces...","{know-how, project, able, computer, car, somet..."
1,"{know-how, without, car, necessary, usually, t...",unable.a.01,00002098-a,"{having, know-how, not, by, to, (, necessary, ...","{know-how, without, car, necessary, usually, t..."
2,"{abaxial, side, leaf, away, axis, face, stem, ...",abaxial.a.01,00002312-a,"{an, facing, from, away, axis, organism, or, t...","{abaxial, side, leaf, away, axis, face, stem, ..."
3,"{upper, know, side, leaf, axis, face, adaxial,...",adaxial.a.01,00002527-a,"{an, facing, axis, to, nearest, organism, towa...","{upper, know, side, leaf, axis, face, adaxial,..."
4,"{side, acroscopic, face, apex, toward}",acroscopic.a.01,00002730-a,"{facing, side, apex, toward, on, or, the}","{side, acroscopic, face, apex, toward}"


In [4]:
# Index the table by synset name, then flip it so that every synset name
# becomes a column and the remaining fields ('adapted', 'offset-pos',
# 'original', 'simple') become the rows.
pywsd_signatures = df.set_index('name').transpose()
# Preview the transposed table as the cell output.
pywsd_signatures.head()

name,able.a.01,unable.a.01,abaxial.a.01,adaxial.a.01,acroscopic.a.01,basiscopic.a.01,abducent.a.01,adducent.a.01,nascent.a.01,emergent.s.02,...,overcast.v.01,overcloud.v.01,clear_up.v.04,blight.v.01,swamp.v.01,run_dry.v.01,fog_up.v.01,char.v.01,haze.v.01,deflagrate.v.01
adapted,"{know-how, project, able, computer, car, somet...","{know-how, without, car, necessary, usually, t...","{abaxial, side, leaf, away, axis, face, stem, ...","{upper, know, side, leaf, axis, face, adaxial,...","{side, acroscopic, face, apex, toward}","{side, face, basiscopic, toward, base}","{away, draw, abduct, midline, muscle, part, bo...","{together, bring, draw, midline, toward, muscl...","{born, emergent, begin, emerge, chick, insurge...","{emergent, emerge, existence, republic, nascen...",...,"{make, darken, often, fog_up, fall, weather, h...","{cloud_up, overcloud, cloud_over, darken, cove...","{storm, brighten, sky, light_up, become, clear...","{rain, may, smite, afflict, cause, garden, bli...","{submerge, swamp, every, tsunami, harbor, boat...","{dry_out, run_dry, summer, river, empty, water...","{overcast, fog_up, foggy, fog, windshield, clo...","{coal, charcoal, forest, without, fire, everyt...","{dull, haze, cloud, become, overcast, hazy, cl...","{exercise, deflagrate, burn, substance, rapidl..."
offset-pos,00001740-a,00002098-a,00002312-a,00002527-a,00002730-a,00002843-a,00002956-a,00003131-a,00003356-a,00003553-s,...,02770717-v,02771020-v,02771169-v,02771320-v,02771564-v,02771756-v,02771888-v,02771997-v,02772202-v,02772310-v
original,"{having, know-how, by, to, (, something, neces...","{having, know-how, not, by, to, (, necessary, ...","{an, facing, from, away, axis, organism, or, t...","{an, facing, axis, to, nearest, organism, towa...","{facing, side, apex, toward, on, or, the}","{facing, side, toward, on, base, or, the}","{an, from, away, drawing, ;, the, midline, or,...","{an, together, drawing, ;, the, midline, bring...","{being, beginning, or, born}","{into, existence, coming}",...,"{overcast, cloudy, make, or}","{covered, become, with, clouds}","{become, clear}","{a, to, cause, blight, suffer}","{submerged, submerge, drenched, drench, or, be}","{empty, water, become, of}","{foggy, get}","{charcoal, burn, to}","{dull, or, become, hazy, cloudy, ,}","{to, and, with, burn, great, rapidly, intensit..."
simple,"{know-how, project, able, computer, car, somet...","{know-how, without, car, necessary, usually, t...","{abaxial, side, leaf, away, axis, face, stem, ...","{upper, know, side, leaf, axis, face, adaxial,...","{side, acroscopic, face, apex, toward}","{side, face, basiscopic, toward, base}","{away, draw, abduct, midline, muscle, part, bo...","{together, bring, draw, midline, toward, muscl...","{begin, born, chick, insurgency, nascent}","{emergent, emerge, existence, republic, come}",...,"{make, darken, often, fog_up, fall, weather, h...","{cloud_up, overcloud, cloud_over, darken, cove...","{storm, brighten, sky, light_up, become, clear...","{rain, may, smite, afflict, cause, garden, bli...","{submerge, swamp, every, tsunami, harbor, boat...","{dry_out, run_dry, summer, river, empty, water...","{overcast, fog_up, foggy, fog, windshield, clo...","{coal, charcoal, forest, without, fire, everyt...","{dull, haze, cloud, become, overcast, hazy, cl...","{exercise, deflagrate, burn, substance, rapidl..."


In [5]:
# Persist the transposed signature table to 'signatures.pkl' in the current
# working directory, using pickle protocol 2.
# NOTE(review): protocol 2 is presumably chosen so the file can also be loaded
# from Python 2 -- confirm with whoever consumes this pickle.
pywsd_signatures.to_pickle('signatures.pkl',protocol=2)