In [12]:
# convert SemEval data to the standard format

from glob import glob 
import codecs
from xml.etree import ElementTree as et
from traceback import format_exc
from collections import defaultdict
from nltk.corpus import wordnet as wn
from traceback import format_exc


def get_related_by_sensekey(sense_key, verbose=False):
    """Return the set of words related to a WordNet sense key.

    The related words are the lemma names of the key's synset together with
    the lemma names of its direct hyponyms and hypernyms, lower-cased and
    with underscores replaced by spaces.

    Args:
        sense_key: sense key such as 'window%1:06:00::'. Anything after the
            first "/" is ignored (gold keys look like 'add%2:32:01::/4';
            presumably the suffix is a rating/weight -- TODO confirm).
        verbose: if true, print the synset lemmas, definition, examples,
            hypernyms, hyponyms and the resulting related words.

    Returns:
        A set of related words. The set is empty when the lookup fails; in
        that case "Bad key: <key>" and the traceback are printed instead of
        raising, so one unknown key does not abort a whole conversion run.
    """
    related = []
    try:
        # Keep only the key itself, dropping any "/..." suffix.
        sense_key = sense_key.split("/")[0]

        synset = wn.lemma_from_key(sense_key).synset()
        lemmas = synset.lemma_names()

        hypernyms = [lemma.name()
                     for hypernym in synset.hypernyms()
                     for lemma in hypernym.lemmas()]
        hyponyms = [lemma.name()
                    for hyponym in synset.hyponyms()
                    for lemma in hyponym.lemmas()]

        related = [r.lower().replace("_", " ")
                   for r in lemmas + hyponyms + hypernyms]

        if verbose:
            # Single-argument print(...) gives the same text on Python 2 and 3.
            print("synset: %s" % (lemmas,))
            print("definition: %s" % (synset.definition(),))
            print("examples: %s" % (synset.examples(),))
            print("hypernyms: %s" % (hypernyms,))
            print("hyponyms: %s" % (hyponyms,))
            print("related: %s" % (related,))

    except Exception:
        # Best effort: report the key and continue with an empty result.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long run can still be interrupted.
        print("Bad key: %s" % (sense_key,))
        print(format_exc())

    return set(related)


def _load_gold_sense_ids(keys_fpath):
    """Read a gold key file into a dict: context id -> list of gold sense ids.

    Each non-blank line is split on whitespace: the first field is the target,
    the second the context id, and every remaining field a gold sense id.
    Blank lines are skipped; lines with fewer than two fields are reported
    and skipped.
    """
    context_id2sense_ids = {}
    with codecs.open(keys_fpath, "r", "utf-8") as keys:
        for line in keys:
            fields = line.split()
            if not fields:
                continue
            if len(fields) < 2:
                print("bad line: '%s'" % line.strip())
                continue
            context_id2sense_ids[fields[1]] = fields[2:]
    return context_id2sense_ids


def semeval_xml2csv(contexts_fpaths, keys_fpath, output_fpath):
    """Convert SemEval XML contexts plus a gold key file into one TSV file.

    Args:
        contexts_fpaths: glob pattern matching the XML context files.
        keys_fpath: path to the gold key file (see _load_gold_sense_ids).
        output_fpath: path of the tab-separated file to write (UTF-8).

    Every direct child element of an XML root whose tag is "instance" yields
    one row, e.g.:
        <instance id="appear.v.1" lemma="appear" partOfSpeech="v"
                  token="appear" tokenEnd="65" tokenStart="59">...</instance>
    The predict_sense_ids and predict_related columns are left empty.
    The output path is printed when the file has been written.
    """
    context_id2sense_ids = _load_gold_sense_ids(keys_fpath)

    header = "context_id\ttarget\ttarget_pos\ttarget_position\tgold_sense_ids\tpredict_sense_ids\tgolden_related\tpredict_related\tcontext"

    with codecs.open(output_fpath, "w", "utf-8") as out:
        out.write(header + "\n")

        # glob() order is arbitrary; sort so repeated runs give the same file.
        for word_fpath in sorted(glob(contexts_fpaths)):
            root = et.parse(word_fpath).getroot()

            for child in root:
                if child.tag != "instance":
                    continue

                context_id = child.attrib["id"]
                # An instance without a gold key gets empty gold columns
                # instead of aborting the whole export with a KeyError.
                gold_sense_ids = context_id2sense_ids.get(context_id, [])

                golden_related = set()
                for sense_key in gold_sense_ids:
                    golden_related.update(get_related_by_sensekey(sense_key))

                # Tabs/newlines inside the context would break the row layout.
                # Replace them one-for-one with spaces so the character offsets
                # in tokenStart/tokenEnd still point at the same positions.
                context = child.text or ""
                for separator in ("\t", "\r", "\n"):
                    context = context.replace(separator, " ")

                row = [
                    context_id,
                    child.attrib["lemma"],
                    child.attrib["partOfSpeech"],
                    child.attrib["tokenStart"] + "," + child.attrib["tokenEnd"],
                    ",".join(gold_sense_ids),
                    "",
                    # Sorted: set iteration order is not stable across runs.
                    ",".join(sorted(golden_related)),
                    "",
                    context,
                ]
                out.write("\t".join(row) + "\n")

    print(output_fpath)
        
    
# Run the SemEval-2013 conversion: XML contexts + gold keys -> one tab-separated file.
# NOTE(review): these are absolute, machine-specific paths; adjust them before re-running elsewhere.
contexts_fpaths = "/Users/alex/work/joint/eval/contextualization-eval/semeval_2013_13/contexts/xml-format/*.xml"
keys_fpath = "/Users/alex/work/joint/eval/contextualization-eval/semeval_2013_13/keys/gold/all.key"
output_fpath = "/Users/alex/Desktop/output-semeval-2013.csv"
semeval_xml2csv(
    contexts_fpaths=contexts_fpaths,
    keys_fpath=keys_fpath,
    output_fpath=output_fpath,
)


Bad key: lose%2:30:05::
Traceback (most recent call last):
  File "<ipython-input-12-9102932fc733>", line 20, in get_related_by_sensekey
    synset = wn.lemma_from_key(sense_key).synset()
  File "/usr/local/lib/python2.7/site-packages/nltk/corpus/reader/wordnet.py", line 1202, in lemma_from_key
    raise WordNetError("No synset found for key %r" % key)
WordNetError: No synset found for key u'lose%2:30:05::'

Bad key: number%1:10:07::
Traceback (most recent call last):
  File "<ipython-input-12-9102932fc733>", line 20, in get_related_by_sensekey
    synset = wn.lemma_from_key(sense_key).synset()
  File "/usr/local/lib/python2.7/site-packages/nltk/corpus/reader/wordnet.py", line 1202, in lemma_from_key
    raise WordNetError("No synset found for key %r" % key)
WordNetError: No synset found for key u'number%1:10:07::'

Bad key: part%1:06:01::
Traceback (most recent call last):
  File "<ipython-input-12-9102932fc733>", line 20, in get_related_by_sensekey
    synset = wn.lemma_from_key(sens

In [19]:
import codecs
from pandas import read_csv

# Path of the CSV dataset for this cell (presumably loaded with the read_csv
# imported above -- the loading code is not visible here; TODO confirm).
dataset_fpath = "/Users/alex/work/joint/eval/contextualization-eval/data/Dataset-SemEval-2013-13-adagram-ukwac-wacky-raw.csv" 
# Destination ".key" file. NOTE(review): this rebinds output_fpath, which the
# SemEval conversion cell also assigns, so the value depends on cell run order.
output_fpath = "/Users/alex/Desktop/adagram.key"


/Users/alex/Desktop/adagram.key


In [16]:
# Display the DataFrame `df` as the cell output.
# NOTE(review): `df` is not assigned in any visible cell and this cell's
# execution count is out of order -- confirm where it is loaded before relying
# on a fresh-kernel run.
df 

Unnamed: 0,context_id,target,target_pos,target_position,gold_sense_ids,predict_sense_ids,golden_related,predict_related,context
0,add.v.1,add,v,5157,add%2:32:01::/4,2,"insert,sneak in,supply,slip in,stick in,toss i...","lend,compare,contribute,relate,imitate,represe...","Lewinsky wrote ""Return to Sender"" on the envel..."
1,add.v.2,add,v,121125,add%2:32:01::/4,2,"insert,sneak in,supply,slip in,stick in,toss i...","lend,compare,contribute,relate,imitate,represe...","For instance, the Post also has the story abou..."
2,add.v.3,add,v,2733,add%2:30:00::/4,1,"concatenate,gild the lily,string,mix in,add on...","install,configure,custom,users,support,render,...",Preventing developers from adding features is ...
3,add.v.4,add,v,710,add%2:32:00::/4,2,"count,tote up,sum,tally,add together,tot,numbe...","lend,compare,contribute,relate,imitate,represe...",if you add the um uh people of various sexual ...
4,add.v.5,add,v,38,add%2:40:00::/4,3,"instill,tinsel,lend,contribute,modify,bestow,b...","preserve,give,combine,adding,obtain,apply,desi...","An added benefit of a warm winter, of course, ..."
5,add.v.6,add,v,3538,add%2:30:00::/4,3,"concatenate,gild the lily,string,mix in,add on...","preserve,give,combine,adding,obtain,apply,desi...","To prepare a 10% solution of acid, add 10 mL o..."
6,add.v.7,add,v,145148,add%2:31:00::/4,3,"cypher,compute,calculate,figure,reckon,foot up...","preserve,give,combine,adding,obtain,apply,desi...",To find the day of the week on which that date...
7,add.v.8,add,v,510,add%2:31:00::/4,2,"cypher,compute,calculate,figure,reckon,foot up...","lend,compare,contribute,relate,imitate,represe...",uh i added up all the taxes that we were going...
8,add.v.9,add,v,8287,add%2:30:00::/4,3,"concatenate,gild the lily,string,mix in,add on...","preserve,give,combine,adding,obtain,apply,desi...",The tripe with onions and garlic is cooked for...
9,add.v.10,add,v,5054,add%2:31:00::/4,3,"cypher,compute,calculate,figure,reckon,foot up...","preserve,give,combine,adding,obtain,apply,desi...",and they give you a half percent of that and t...
