adding support for xml preprocessing format
Juicechuan committed Jan 5, 2017
1 parent 1a52d91 commit a816c9f
Showing 10 changed files with 307 additions and 56 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -58,6 +58,7 @@ data/
 models/
 stanfordnlp/stanford-corenlp-full-2013-06-20*
 stanfordnlp/stanford-parser-full-2014-01-04*
+stanfordnlp/stanford-corenlp-full-2015-*
 smatch*/
 
 # cache
15 changes: 8 additions & 7 deletions amr_parsing.py
@@ -146,6 +146,7 @@ def main():
     arg_parser.add_argument('--feat',help='feature template file')
     arg_parser.add_argument('-iter','--iterations',default=1,type=int,help='training iterations')
     arg_parser.add_argument('amr_file',nargs='?',help='amr annotation file/input sentence file for parsing')
+    arg_parser.add_argument('--prpfmt',choices=['xml','plain'],default='xml',help='preprocessed file format')
     arg_parser.add_argument('--amrfmt',choices=['sent','amr','amreval'],default='sent',help='specifying the input file format')
     arg_parser.add_argument('--smatcheval',action='store_true',help='give evaluation score using smatch')
     arg_parser.add_argument('-e','--eval',nargs=2,help='Error Analysis: give parsed AMR file and gold AMR file')
@@ -165,11 +166,11 @@ def main():
 
     # using corenlp to preprocess the sentences
     if args.mode == 'preprocess':
-        instances = preprocess(amr_file,START_SNLP=True,INPUT_AMR=args.amrfmt)
+        instances = preprocess(amr_file,START_SNLP=True,INPUT_AMR=args.amrfmt, PRP_FORMAT=args.prpfmt)
         print "Done preprocessing!"
     # preprocess the JAMR aligned amr
     elif args.mode == 'test_gold_graph':
-        instances = preprocess(amr_file,False)
+        instances = preprocess(amr_file,START_SNLP=False,INPUT_AMR=args.amrfmt, PRP_FORMAT=args.prpfmt)
         #instances = pickle.load(open('data/gold_edge_graph.pkl','rb'))
         gold_amr = []
         for inst in instances:
@@ -243,7 +244,7 @@ def main():
     # test deterministic oracle
     elif args.mode == 'oracleGuide':
 
-        train_instances = preprocess(amr_file,START_SNLP=False)
+        train_instances = preprocess(amr_file,START_SNLP=False,INPUT_AMR=args.amrfmt, PRP_FORMAT=args.prpfmt)
         try:
             hand_alignments = load_hand_alignments(amr_file+str('.hand_aligned'))
         except IOError:
@@ -315,9 +316,9 @@ def main():
         print "Using verbalization list: %s"%(constants.FLAG_VERB)
         print "Using charniak parser trained on ontonotes: %s"%(constants.FLAG_ONTO)
         print "Dependency parser used: %s"%(constants.FLAG_DEPPARSER)
-        train_instances = preprocess(amr_file,START_SNLP=False)
-        if args.add: train_instances = train_instances + preprocess(args.add,START_SNLP=False)
-        if args.dev: dev_instances = preprocess(args.dev,START_SNLP=False)
+        train_instances = preprocess(amr_file,START_SNLP=False,INPUT_AMR=args.amrfmt,PRP_FORMAT=args.prpfmt)
+        if args.add: train_instances = train_instances + preprocess(args.add,START_SNLP=True,INPUT_AMR=args.amrfmt,PRP_FORMAT=args.prpfmt)
+        if args.dev: dev_instances = preprocess(args.dev,START_SNLP=False,INPUT_AMR=args.amrfmt,PRP_FORMAT=args.prpfmt)
 
 
         if args.section != 'all':
@@ -382,7 +383,7 @@ def main():
         print >> experiment_log ,"DONE TRAINING!"
 
     elif args.mode == 'parse': # actual parsing
-        test_instances = preprocess(amr_file,START_SNLP=False,INPUT_AMR=args.amrfmt)
+        test_instances = preprocess(amr_file,START_SNLP=False,INPUT_AMR=args.amrfmt,PRP_FORMAT=args.prpfmt)
         if args.section != 'all':
             print "Choosing corpus section: %s"%(args.section)
             tcr = constants.get_corpus_range(args.section,'test')
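Note on the amr_parsing.py changes: the new --prpfmt flag is only threaded through to preprocess() as the PRP_FORMAT keyword, and every mode that preprocesses input now passes it explicitly. A minimal sketch of the resulting call, using the names from preprocessing.py in this commit (the file path and argument values are illustrative):

from preprocessing import preprocess

# Sketch only: mirrors how amr_parsing.py forwards the new flag.
# In xml mode the corresponding CoreNLP .prp.xml cache must already exist.
instances = preprocess('data/semeval/xml-data/training.txt',
                       START_SNLP=False,   # do not launch the CoreNLP wrapper
                       INPUT_AMR='amr',    # input is an AMR-annotated file
                       PRP_FORMAT='xml')   # read the CoreNLP XML dump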
6 changes: 5 additions & 1 deletion depparser.py
@@ -37,7 +37,11 @@ def parse(self,sent_filename):
        for l in f:
            lineno += 1
            print >> logs, 'lineno %s, %s'% (lineno, l)
-            parsed_trees = rrp.simple_parse(l.strip().split())
+            try:
+                parsed_trees = rrp.simple_parse(l.strip().split())
+            except IndexError:
+                parsed_trees = rrp.simple_parse(l.strip().split()[:64])
+
            parsed_trees += '\n'
            of.write(parsed_trees)
 
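The try/except added above guards rrp.simple_parse() against inputs that make the reranking parser raise IndexError; on failure the sentence is re-parsed using only its first 64 tokens. A standalone sketch of the same fallback, assuming the bllipparser package (the model name is illustrative):

from bllipparser import RerankingParser

# Download and load a reranking parser model; 'WSJ-PTB3' is a standard
# BLLIP model name, used here for illustration.
rrp = RerankingParser.fetch_and_load('WSJ-PTB3')

def safe_simple_parse(line, max_tokens=64):
    # Retry with a truncated token list if the reranker fails on a long input.
    tokens = line.strip().split()
    try:
        return rrp.simple_parse(tokens)
    except IndexError:
        return rrp.simple_parse(tokens[:max_tokens])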
2 changes: 1 addition & 1 deletion parser.py
@@ -503,7 +503,7 @@ def testOracleGuide(self,instance,start_step=0):
         while not state.is_terminal():
 
             if self.verbose > 0:
-                print >> sys.stderr, state.print_config()
+                #print >> sys.stderr, state.print_config()
                 #print state.A.print_tuples()
             if DRAW_GRAPH:
                 fname = "graph"+str(state.sentID)+"_s"+str(step)
116 changes: 93 additions & 23 deletions preprocessing.py
@@ -9,6 +9,7 @@
 from depparser import CharniakParser,StanfordDepParser,ClearDepParser,TurboDepParser, MateDepParser
 from collections import OrderedDict
 import constants
+import xml.etree.ElementTree as ET
 
 log = sys.stdout
 
@@ -81,6 +82,7 @@ def _write_sentences(file_path,sentences):
     """
     write out the sentences to file
     """
+    print >> log, "Writing sentence file to %s" % file_path
     output = codecs.open(file_path,'w',encoding='utf-8')
     for sent in sentences:
         output.write(sent+'\n')
@@ -92,7 +94,8 @@ def _write_tok_sentences(file_path,instances,comments=None):
         if comments:
             output_tok.write("%s %s\n" % (comments[i]['id'],' '.join(inst.get_tokenized_sent())))
         else:
-            output_tok.write("%s\n" % (' '.join(inst.get_tokenized_sent())))
+            sent = ' '.join(inst.get_tokenized_sent())
+            output_tok.write("%s\n" % sent)
     output_tok.close()

def _write_tok_amr(file_path,amr_file,instances):
@@ -263,7 +266,7 @@ def _add_dependency(instances,result,FORMAT="stanford"):
             m = re.match(r'(?P<lemma>.+)-(?P<index>[^-]+)', l_lemma)
             l_lemma, l_index = m.group('lemma'), m.group('index')
             # some string may start with @; change the segmenter
-            m = re.match(r'(?P<lemma>[^\^]+|\^(?=-))(\^(?P<trace>[^-]+))?-(?P<index>[^-]+)', r_lemma)
+            m = re.match(r'(?P<lemma>[^\^]+|\^*(?=-))(\^(?P<trace>[^-]+))?-(?P<index>[^-]+)', r_lemma)
             try:
                 r_lemma,r_trace, r_index = m.group('lemma'), m.group('trace'), m.group('index')
             except AttributeError:
@@ -285,7 +288,35 @@ def _add_dependency(instances,result,FORMAT="stanford"):
     else:
         raise ValueError("Unknown dependency format!")
 
-def preprocess(input_file,START_SNLP=True,INPUT_AMR='amr'):
+def load_xml_instances(input_xml):
+    tree = ET.parse(input_xml)
+    root = tree.getroot()
+    instances = []
+    nb_sent = 0
+    nb_tok = 0
+    for doc in root.iter('document'):
+        for sentences in root.iter('sentences'):
+            for sentence in sentences.iter('sentence'):
+                if nb_sent % 1000 == 0:
+                    print >> log, "%d ...." % nb_sent ,
+                    sys.stdout.flush()
+                data = Data()
+                text = ''
+                data.newSen()
+                for tokens in sentence.iter('tokens'):
+                    for tok in tokens.iter('token'):
+                        nb_tok += 1
+                        data.addToken(tok.find('word').text, tok.find('CharacterOffsetBegin').text,
+                                      tok.find('CharacterOffsetEnd').text, tok.find('lemma').text, tok.find('POS').text, tok.find('NER').text)
+                instances.append(data)
+                nb_sent+=1
+
+    print >> log, '\n'
+    print >> log, "Total number of sentences: %d, number of tokens: %s" % (nb_sent, nb_tok)
+
+    return instances
+
+def preprocess(input_file,START_SNLP=True,INPUT_AMR='amr',PRP_FORMAT='plain'):
     '''nasty function'''
     tmp_sent_filename = None
     instances = None
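For reference, the CoreNLP XML layout that load_xml_instances() above traverses can be sketched and parsed in isolation. A minimal, self-contained example (the sentence content is made up; the element names match the find() calls in this commit):

import xml.etree.ElementTree as ET

SAMPLE = """<root><document><sentences>
  <sentence id="1"><tokens><token id="1">
    <word>Hello</word><lemma>hello</lemma>
    <CharacterOffsetBegin>0</CharacterOffsetBegin>
    <CharacterOffsetEnd>5</CharacterOffsetEnd>
    <POS>UH</POS><NER>O</NER>
  </token></tokens></sentence>
</sentences></document></root>"""

# Same traversal order as load_xml_instances: sentence -> tokens -> token.
root = ET.fromstring(SAMPLE)
for sentence in root.iter('sentence'):
    for tok in sentence.iter('token'):
        print tok.find('word').text, tok.find('lemma').text, tok.find('POS').text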
@@ -306,19 +337,32 @@ def preprocess(input_file,START_SNLP=True,INPUT_AMR='amr'):
         if not os.path.exists(tmp_sent_filename): # no cache found
             _write_sentences(tmp_sent_filename,sentences)
 
-        tmp_prp_filename = tmp_sent_filename+'.prp'
-
-        proc1 = StanfordCoreNLP()
+        tmp_prp_filename = None
+        instances = None
+        if PRP_FORMAT == 'plain':
+            tmp_prp_filename = tmp_sent_filename+'.prp'
+
+            proc1 = StanfordCoreNLP()
 
-        # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
+            # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
 
-        if START_SNLP and not os.path.exists(tmp_prp_filename):
-            print >> log, "Start Stanford CoreNLP..."
-            proc1.setup()
+            if START_SNLP and not os.path.exists(tmp_prp_filename):
+                print >> log, "Start Stanford CoreNLP..."
+                proc1.setup()
 
-        print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
-        instances = proc1.parse(tmp_sent_filename)
+            print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
+            instances = proc1.parse(tmp_sent_filename)
 
+        elif PRP_FORMAT == 'xml': # use the xml output format instead of the corenlp plain format; the corenlp wrapper is no longer used
+            tmp_prp_filename = tmp_sent_filename+'.prp.xml'
+            if not os.path.exists(tmp_prp_filename):
+                raise Exception("No preprocessed xml file found: %s" % tmp_prp_filename)
+            print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
+            instances = load_xml_instances(tmp_prp_filename)
+        else:
+            raise Exception('Unknown preprocessed file format %s' % PRP_FORMAT)
 
         tok_sent_filename = tmp_sent_filename+'.tok' # write tokenized sentence file
         if not os.path.exists(tok_sent_filename):
             _write_tok_sentences(tok_sent_filename,instances)
@@ -376,21 +420,47 @@ def preprocess(input_file,START_SNLP=True,INPUT_AMR='amr'):
             instances[i].addComment(comments[i])
 
     else: # input file is sentence
-        tmp_sent_filename = input_file
-        tmp_prp_filename = tmp_sent_filename+'.prp'
-
-        proc1 = StanfordCoreNLP()
+        tmp_sent_filename = input_file
 
-        # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
-        if START_SNLP and not os.path.exists(tmp_prp_filename):
-            print >> log, "Start Stanford CoreNLP ..."
-            proc1.setup()
+        tmp_prp_filename = None
+        instances = None
+        if PRP_FORMAT == 'plain':
+            tmp_prp_filename = tmp_sent_filename+'.prp'
+
+            proc1 = StanfordCoreNLP()
+
+            # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
+
+            if START_SNLP and not os.path.exists(tmp_prp_filename):
+                print >> log, "Start Stanford CoreNLP..."
+                proc1.setup()
+
+            print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
             instances = proc1.parse(tmp_sent_filename)
-        elif os.path.exists(tmp_prp_filename): # found cache file
 
+        elif PRP_FORMAT == 'xml': # use the xml output format instead of the corenlp plain format; the corenlp wrapper is no longer used
+            tmp_prp_filename = tmp_sent_filename+'.xml'
+            if not os.path.exists(tmp_prp_filename):
+                raise Exception("No preprocessed xml file found: %s" % tmp_prp_filename)
             print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
-            instances = proc1.parse(tmp_sent_filename)
+            instances = load_xml_instances(tmp_prp_filename)
         else:
-            raise Exception('No cache file %s has been found. set START_SNLP=True to start corenlp.' % (tmp_prp_filename))
+            raise Exception('Unknown preprocessed file format %s' % PRP_FORMAT)
+
+
+        # tmp_prp_filename = tmp_sent_filename+'.prp'
+        # proc1 = StanfordCoreNLP()
+
+        # # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
+        # if START_SNLP and not os.path.exists(tmp_prp_filename):
+        #     print >> log, "Start Stanford CoreNLP ..."
+        #     proc1.setup()
+        #     instances = proc1.parse(tmp_sent_filename)
+        # elif os.path.exists(tmp_prp_filename): # found cache file
+        #     print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
+        #     instances = proc1.parse(tmp_sent_filename)
+        # else:
+        #     raise Exception('No cache file %s has been found. set START_SNLP=True to start corenlp.' % (tmp_prp_filename))
 
 
         tok_sent_filename = tmp_sent_filename+'.tok' # write tokenized sentence file
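Note that the PRP_FORMAT dispatch is now duplicated between the AMR-input branch and the plain-sentence branch, and the two branches expect different cache suffixes (.prp.xml versus .xml). A possible consolidation, sketched here as a hypothetical helper rather than anything in this commit (os, log, StanfordCoreNLP and load_xml_instances are the module's existing names):

def _load_preprocessed(tmp_sent_filename, prp_format, start_snlp, xml_suffix='.prp.xml'):
    # Hypothetical helper, not part of this commit: one place for the
    # plain/xml dispatch that preprocess() currently repeats twice.
    if prp_format == 'plain':
        tmp_prp_filename = tmp_sent_filename + '.prp'
        proc1 = StanfordCoreNLP()
        if start_snlp and not os.path.exists(tmp_prp_filename):
            print >> log, "Start Stanford CoreNLP..."
            proc1.setup()
        print >> log, 'Read token,lemma,name entity file %s...' % tmp_prp_filename
        return proc1.parse(tmp_sent_filename)
    elif prp_format == 'xml':
        tmp_prp_filename = tmp_sent_filename + xml_suffix
        if not os.path.exists(tmp_prp_filename):
            raise Exception("No preprocessed xml file found: %s" % tmp_prp_filename)
        print >> log, 'Read token,lemma,name entity file %s...' % tmp_prp_filename
        return load_xml_instances(tmp_prp_filename)
    else:
        raise Exception('Unknown preprocessed file format %s' % prp_format)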
9 changes: 9 additions & 0 deletions scripts/create_prp_xml.sh
@@ -0,0 +1,9 @@
+
+DATA_PATH=$1
+DATADIR="$( cd "$( dirname "${1}" )" && pwd )"
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+CORENLP_PATH="/home/j/llc/cwang24/Tools/stanford-corenlp-full-2015-04-20"
+
+java -Xmx25000m -cp "${CORENLP_PATH}/*" edu.stanford.nlp.pipeline.StanfordCoreNLP -props "${CORENLP_PATH}/default.properties" -file $DATA_PATH -outputDirectory $DATADIR
+
+mv $DATA_PATH.xml $DATA_PATH.prp.xml
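The script above runs the CoreNLP pipeline over a sentence file and renames the resulting .xml to the .prp.xml that preprocess() looks for in xml mode, so it has to be run before scripts/preprocess-xml.sh below. A quick sanity check that the generated file is well-formed and non-empty (the path is a placeholder):

import xml.etree.ElementTree as ET

# Placeholder path: point this at the .prp.xml produced by create_prp_xml.sh.
root = ET.parse('data/semeval/xml-data/training.txt.sent.prp.xml').getroot()
n_sent = sum(1 for _ in root.iter('sentence'))
print "sentences in prp.xml:", n_sent
assert n_sent > 0, "CoreNLP produced no sentences"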
1 change: 1 addition & 0 deletions scripts/preprocess-xml.sh
@@ -0,0 +1 @@
+python amr_parsing.py -m preprocess --amrfmt amr data/semeval/xml-data/training.txt
2 changes: 1 addition & 1 deletion scripts/stdconvert.sh
@@ -1,4 +1,4 @@
 #CORENLP_PATH='/home/j/llc/cwang24/Tools/CoreNLP-mod-convert.jar'
 #CORENLP_PATH='/home/j/llc/cwang24/Tools/CoreNLP-mod-convert-collapse.jar'
 CORENLP_PATH='./lib'
-java -Xmx1800m -cp $CORENLP_PATH/CoreNLP-mod-convert-collapse.jar edu.stanford.nlp.trees.EnglishGrammaticalStructure -basic -treeFile $1 > $1.dep
+java -Xmx1800m -cp $CORENLP_PATH/CoreNLP-mod-convert-collapse.jar edu.stanford.nlp.trees.EnglishGrammaticalStructure -basic -treeFile $1 > $1.dep
