adding support for xml preprocessing format
Juicechuan committed Jan 5, 2017
1 parent 1a52d91 commit a816c9f
Showing 10 changed files with 307 additions and 56 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -58,6 +58,7 @@ data/
 models/
 stanfordnlp/stanford-corenlp-full-2013-06-20*
 stanfordnlp/stanford-parser-full-2014-01-04*
+stanfordnlp/stanford-corenlp-full-2015-*
 smatch*/
 
 # cache
15 changes: 8 additions & 7 deletions amr_parsing.py
@@ -146,6 +146,7 @@ def main():
     arg_parser.add_argument('--feat',help='feature template file')
     arg_parser.add_argument('-iter','--iterations',default=1,type=int,help='training iterations')
     arg_parser.add_argument('amr_file',nargs='?',help='amr annotation file/input sentence file for parsing')
+    arg_parser.add_argument('--prpfmt',choices=['xml','plain'],default='xml',help='preprocessed file format')
     arg_parser.add_argument('--amrfmt',choices=['sent','amr','amreval'],default='sent',help='specifying the input file format')
     arg_parser.add_argument('--smatcheval',action='store_true',help='give evaluation score using smatch')
     arg_parser.add_argument('-e','--eval',nargs=2,help='Error Analysis: give parsed AMR file and gold AMR file')
@@ -165,11 +166,11 @@ def main():
 
     # using corenlp to preprocess the sentences
     if args.mode == 'preprocess':
-        instances = preprocess(amr_file,START_SNLP=True,INPUT_AMR=args.amrfmt)
+        instances = preprocess(amr_file,START_SNLP=True,INPUT_AMR=args.amrfmt, PRP_FORMAT=args.prpfmt)
         print "Done preprocessing!"
     # preprocess the JAMR aligned amr
     elif args.mode == 'test_gold_graph':
-        instances = preprocess(amr_file,False)
+        instances = preprocess(amr_file,START_SNLP=False,INPUT_AMR=args.amrfmt, PRP_FORMAT=args.prpfmt)
         #instances = pickle.load(open('data/gold_edge_graph.pkl','rb'))
         gold_amr = []
         for inst in instances:
@@ -243,7 +244,7 @@ def main():
     # test deterministic oracle
     elif args.mode == 'oracleGuide':
 
-        train_instances = preprocess(amr_file,START_SNLP=False)
+        train_instances = preprocess(amr_file,START_SNLP=False,INPUT_AMR=args.amrfmt, PRP_FORMAT=args.prpfmt)
         try:
             hand_alignments = load_hand_alignments(amr_file+str('.hand_aligned'))
         except IOError:
@@ -315,9 +316,9 @@ def main():
         print "Using verbalization list: %s"%(constants.FLAG_VERB)
         print "Using charniak parser trained on ontonotes: %s"%(constants.FLAG_ONTO)
         print "Dependency parser used: %s"%(constants.FLAG_DEPPARSER)
-        train_instances = preprocess(amr_file,START_SNLP=False)
-        if args.add: train_instances = train_instances + preprocess(args.add,START_SNLP=False)
-        if args.dev: dev_instances = preprocess(args.dev,START_SNLP=False)
+        train_instances = preprocess(amr_file,START_SNLP=False,INPUT_AMR=args.amrfmt,PRP_FORMAT=args.prpfmt)
+        if args.add: train_instances = train_instances + preprocess(args.add,START_SNLP=True,INPUT_AMR=args.amrfmt,PRP_FORMAT=args.prpfmt)
+        if args.dev: dev_instances = preprocess(args.dev,START_SNLP=False,INPUT_AMR=args.amrfmt,PRP_FORMAT=args.prpfmt)
 
 
         if args.section != 'all':
@@ -382,7 +383,7 @@ def main():
         print >> experiment_log ,"DONE TRAINING!"
 
     elif args.mode == 'parse': # actual parsing
-        test_instances = preprocess(amr_file,START_SNLP=False,INPUT_AMR=args.amrfmt)
+        test_instances = preprocess(amr_file,START_SNLP=False,INPUT_AMR=args.amrfmt,PRP_FORMAT=args.prpfmt)
         if args.section != 'all':
             print "Choosing corpus section: %s"%(args.section)
             tcr = constants.get_corpus_range(args.section,'test')
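Note on the amr_parsing.py changes: the new --prpfmt flag is only threaded through to preprocess() as the PRP_FORMAT keyword, and every mode that preprocesses input now passes it explicitly. A minimal sketch of the resulting call, using the names from preprocessing.py in this commit (the file path and argument values are illustrative):

from preprocessing import preprocess

# Sketch only: mirrors how amr_parsing.py forwards the new flag.
# In xml mode the corresponding CoreNLP .prp.xml cache must already exist.
instances = preprocess('data/semeval/xml-data/training.txt',
                       START_SNLP=False,   # do not launch the CoreNLP wrapper
                       INPUT_AMR='amr',    # input is an AMR-annotated file
                       PRP_FORMAT='xml')   # read the CoreNLP XML dump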
6 changes: 5 additions & 1 deletion depparser.py
@@ -37,7 +37,11 @@ def parse(self,sent_filename):
        for l in f:
            lineno += 1
            print >> logs, 'lineno %s, %s'% (lineno, l)
-            parsed_trees = rrp.simple_parse(l.strip().split())
+            try:
+                parsed_trees = rrp.simple_parse(l.strip().split())
+            except IndexError:
+                parsed_trees = rrp.simple_parse(l.strip().split()[:64])
+
            parsed_trees += '\n'
            of.write(parsed_trees)
 
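The try/except added above guards rrp.simple_parse() against inputs that make the reranking parser raise IndexError; on failure the sentence is re-parsed using only its first 64 tokens. A standalone sketch of the same fallback, assuming the bllipparser package (the model name is illustrative):

from bllipparser import RerankingParser

# Download and load a reranking parser model; 'WSJ-PTB3' is a standard
# BLLIP model name, used here for illustration.
rrp = RerankingParser.fetch_and_load('WSJ-PTB3')

def safe_simple_parse(line, max_tokens=64):
    # Retry with a truncated token list if the reranker fails on a long input.
    tokens = line.strip().split()
    try:
        return rrp.simple_parse(tokens)
    except IndexError:
        return rrp.simple_parse(tokens[:max_tokens])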
2 changes: 1 addition & 1 deletion parser.py
@@ -503,7 +503,7 @@ def testOracleGuide(self,instance,start_step=0):
         while not state.is_terminal():
 
             if self.verbose > 0:
-                print >> sys.stderr, state.print_config()
+                #print >> sys.stderr, state.print_config()
                 #print state.A.print_tuples()
             if DRAW_GRAPH:
                 fname = "graph"+str(state.sentID)+"_s"+str(step)
116 changes: 93 additions & 23 deletions preprocessing.py
@@ -9,6 +9,7 @@
 from depparser import CharniakParser,StanfordDepParser,ClearDepParser,TurboDepParser, MateDepParser
 from collections import OrderedDict
 import constants
+import xml.etree.ElementTree as ET
 
 log = sys.stdout
 
@@ -81,6 +82,7 @@ def _write_sentences(file_path,sentences):
     """
     write out the sentences to file
     """
+    print >> log, "Writing sentence file to %s" % file_path
     output = codecs.open(file_path,'w',encoding='utf-8')
     for sent in sentences:
         output.write(sent+'\n')
@@ -92,7 +94,8 @@ def _write_tok_sentences(file_path,instances,comments=None):
         if comments:
             output_tok.write("%s %s\n" % (comments[i]['id'],' '.join(inst.get_tokenized_sent())))
         else:
-            output_tok.write("%s\n" % (' '.join(inst.get_tokenized_sent())))
+            sent = ' '.join(inst.get_tokenized_sent())
+            output_tok.write("%s\n" % sent)
     output_tok.close()

def _write_tok_amr(file_path,amr_file,instances):
@@ -263,7 +266,7 @@ def _add_dependency(instances,result,FORMAT="stanford"):
             m = re.match(r'(?P<lemma>.+)-(?P<index>[^-]+)', l_lemma)
             l_lemma, l_index = m.group('lemma'), m.group('index')
             # some string may start with @; change the segmenter
-            m = re.match(r'(?P<lemma>[^\^]+|\^(?=-))(\^(?P<trace>[^-]+))?-(?P<index>[^-]+)', r_lemma)
+            m = re.match(r'(?P<lemma>[^\^]+|\^*(?=-))(\^(?P<trace>[^-]+))?-(?P<index>[^-]+)', r_lemma)
             try:
                 r_lemma,r_trace, r_index = m.group('lemma'), m.group('trace'), m.group('index')
             except AttributeError:
@@ -285,7 +288,35 @@ def _add_dependency(instances,result,FORMAT="stanford"):
     else:
         raise ValueError("Unknown dependency format!")
 
-def preprocess(input_file,START_SNLP=True,INPUT_AMR='amr'):
+def load_xml_instances(input_xml):
+    tree = ET.parse(input_xml)
+    root = tree.getroot()
+    instances = []
+    nb_sent = 0
+    nb_tok = 0
+    for doc in root.iter('document'):
+        for sentences in root.iter('sentences'):
+            for sentence in sentences.iter('sentence'):
+                if nb_sent % 1000 == 0:
+                    print >> log, "%d ...." % nb_sent ,
+                    sys.stdout.flush()
+                data = Data()
+                text = ''
+                data.newSen()
+                for tokens in sentence.iter('tokens'):
+                    for tok in tokens.iter('token'):
+                        nb_tok += 1
+                        data.addToken(tok.find('word').text, tok.find('CharacterOffsetBegin').text,
+                                      tok.find('CharacterOffsetEnd').text, tok.find('lemma').text, tok.find('POS').text, tok.find('NER').text)
+                instances.append(data)
+                nb_sent+=1
+
+    print >> log, '\n'
+    print >> log, "Total number of sentences: %d, number of tokens: %s" % (nb_sent, nb_tok)
+
+    return instances
+
+def preprocess(input_file,START_SNLP=True,INPUT_AMR='amr',PRP_FORMAT='plain'):
     '''nasty function'''
     tmp_sent_filename = None
     instances = None
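For reference, the CoreNLP XML layout that load_xml_instances() above traverses can be sketched and parsed in isolation. A minimal, self-contained example (the sentence content is made up; the element names match the find() calls in this commit):

import xml.etree.ElementTree as ET

SAMPLE = """<root><document><sentences>
  <sentence id="1"><tokens><token id="1">
    <word>Hello</word><lemma>hello</lemma>
    <CharacterOffsetBegin>0</CharacterOffsetBegin>
    <CharacterOffsetEnd>5</CharacterOffsetEnd>
    <POS>UH</POS><NER>O</NER>
  </token></tokens></sentence>
</sentences></document></root>"""

# Same traversal order as load_xml_instances: sentence -> tokens -> token.
root = ET.fromstring(SAMPLE)
for sentence in root.iter('sentence'):
    for tok in sentence.iter('token'):
        print tok.find('word').text, tok.find('lemma').text, tok.find('POS').text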
@@ -306,19 +337,32 @@ def preprocess(input_file,START_SNLP=True,INPUT_AMR='amr'):
         if not os.path.exists(tmp_sent_filename): # no cache found
             _write_sentences(tmp_sent_filename,sentences)
 
-        tmp_prp_filename = tmp_sent_filename+'.prp'
-
-        proc1 = StanfordCoreNLP()
+        tmp_prp_filename = None
+        instances = None
+        if PRP_FORMAT == 'plain':
+            tmp_prp_filename = tmp_sent_filename+'.prp'
+
+            proc1 = StanfordCoreNLP()
 
-        # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
+            # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
 
-        if START_SNLP and not os.path.exists(tmp_prp_filename):
-            print >> log, "Start Stanford CoreNLP..."
-            proc1.setup()
+            if START_SNLP and not os.path.exists(tmp_prp_filename):
+                print >> log, "Start Stanford CoreNLP..."
+                proc1.setup()
 
-        print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
-        instances = proc1.parse(tmp_sent_filename)
+            print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
+            instances = proc1.parse(tmp_sent_filename)
 
+        elif PRP_FORMAT == 'xml': # use the xml output format instead of the corenlp plain format; the corenlp wrapper is no longer used
+            tmp_prp_filename = tmp_sent_filename+'.prp.xml'
+            if not os.path.exists(tmp_prp_filename):
+                raise Exception("No preprocessed xml file found: %s" % tmp_prp_filename)
+            print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
+            instances = load_xml_instances(tmp_prp_filename)
+        else:
+            raise Exception('Unknown preprocessed file format %s' % PRP_FORMAT)
 
         tok_sent_filename = tmp_sent_filename+'.tok' # write tokenized sentence file
         if not os.path.exists(tok_sent_filename):
             _write_tok_sentences(tok_sent_filename,instances)
@@ -376,21 +420,47 @@ def preprocess(input_file,START_SNLP=True,INPUT_AMR='amr'):
             instances[i].addComment(comments[i])
 
     else: # input file is sentence
-        tmp_sent_filename = input_file
-        tmp_prp_filename = tmp_sent_filename+'.prp'
-
-        proc1 = StanfordCoreNLP()
+        tmp_sent_filename = input_file
 
-        # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
-        if START_SNLP and not os.path.exists(tmp_prp_filename):
-            print >> log, "Start Stanford CoreNLP ..."
-            proc1.setup()
+        tmp_prp_filename = None
+        instances = None
+        if PRP_FORMAT == 'plain':
+            tmp_prp_filename = tmp_sent_filename+'.prp'
+
+            proc1 = StanfordCoreNLP()
+
+            # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
+
+            if START_SNLP and not os.path.exists(tmp_prp_filename):
+                print >> log, "Start Stanford CoreNLP..."
+                proc1.setup()
+
+            print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
             instances = proc1.parse(tmp_sent_filename)
-        elif os.path.exists(tmp_prp_filename): # found cache file
 
+        elif PRP_FORMAT == 'xml': # use the xml output format instead of the corenlp plain format; the corenlp wrapper is no longer used
+            tmp_prp_filename = tmp_sent_filename+'.xml'
+            if not os.path.exists(tmp_prp_filename):
+                raise Exception("No preprocessed xml file found: %s" % tmp_prp_filename)
             print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
-            instances = proc1.parse(tmp_sent_filename)
+            instances = load_xml_instances(tmp_prp_filename)
         else:
-            raise Exception('No cache file %s has been found. set START_SNLP=True to start corenlp.' % (tmp_prp_filename))
+            raise Exception('Unknown preprocessed file format %s' % PRP_FORMAT)
+
+
+        # tmp_prp_filename = tmp_sent_filename+'.prp'
+        # proc1 = StanfordCoreNLP()
+
+        # # preprocess 1: tokenization, POS tagging and name entity using Stanford CoreNLP
+        # if START_SNLP and not os.path.exists(tmp_prp_filename):
+        #     print >> log, "Start Stanford CoreNLP ..."
+        #     proc1.setup()
+        #     instances = proc1.parse(tmp_sent_filename)
+        # elif os.path.exists(tmp_prp_filename): # found cache file
+        #     print >> log, 'Read token,lemma,name entity file %s...' % (tmp_prp_filename)
+        #     instances = proc1.parse(tmp_sent_filename)
+        # else:
+        #     raise Exception('No cache file %s has been found. set START_SNLP=True to start corenlp.' % (tmp_prp_filename))
 
 
         tok_sent_filename = tmp_sent_filename+'.tok' # write tokenized sentence file
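Note that the PRP_FORMAT dispatch is now duplicated between the AMR-input branch and the plain-sentence branch, and the two branches expect different cache suffixes (.prp.xml versus .xml). A possible consolidation, sketched here as a hypothetical helper rather than anything in this commit (os, log, StanfordCoreNLP and load_xml_instances are the module's existing names):

def _load_preprocessed(tmp_sent_filename, prp_format, start_snlp, xml_suffix='.prp.xml'):
    # Hypothetical helper, not part of this commit: one place for the
    # plain/xml dispatch that preprocess() currently repeats twice.
    if prp_format == 'plain':
        tmp_prp_filename = tmp_sent_filename + '.prp'
        proc1 = StanfordCoreNLP()
        if start_snlp and not os.path.exists(tmp_prp_filename):
            print >> log, "Start Stanford CoreNLP..."
            proc1.setup()
        print >> log, 'Read token,lemma,name entity file %s...' % tmp_prp_filename
        return proc1.parse(tmp_sent_filename)
    elif prp_format == 'xml':
        tmp_prp_filename = tmp_sent_filename + xml_suffix
        if not os.path.exists(tmp_prp_filename):
            raise Exception("No preprocessed xml file found: %s" % tmp_prp_filename)
        print >> log, 'Read token,lemma,name entity file %s...' % tmp_prp_filename
        return load_xml_instances(tmp_prp_filename)
    else:
        raise Exception('Unknown preprocessed file format %s' % prp_format)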
9 changes: 9 additions & 0 deletions scripts/create_prp_xml.sh
@@ -0,0 +1,9 @@
+
+DATA_PATH=$1
+DATADIR="$( cd "$( dirname "${1}" )" && pwd )"
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+CORENLP_PATH="/home/j/llc/cwang24/Tools/stanford-corenlp-full-2015-04-20"
+
+java -Xmx25000m -cp "${CORENLP_PATH}/*" edu.stanford.nlp.pipeline.StanfordCoreNLP -props "${CORENLP_PATH}/default.properties" -file $DATA_PATH -outputDirectory $DATADIR
+
+mv $DATA_PATH.xml $DATA_PATH.prp.xml
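The script above runs the CoreNLP pipeline over a sentence file and renames the resulting .xml to the .prp.xml that preprocess() looks for in xml mode, so it has to be run before scripts/preprocess-xml.sh below. A quick sanity check that the generated file is well-formed and non-empty (the path is a placeholder):

import xml.etree.ElementTree as ET

# Placeholder path: point this at the .prp.xml produced by create_prp_xml.sh.
root = ET.parse('data/semeval/xml-data/training.txt.sent.prp.xml').getroot()
n_sent = sum(1 for _ in root.iter('sentence'))
print "sentences in prp.xml:", n_sent
assert n_sent > 0, "CoreNLP produced no sentences"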
1 change: 1 addition & 0 deletions scripts/preprocess-xml.sh
@@ -0,0 +1 @@
+python amr_parsing.py -m preprocess --amrfmt amr data/semeval/xml-data/training.txt
2 changes: 1 addition & 1 deletion scripts/stdconvert.sh
@@ -1,4 +1,4 @@
 #CORENLP_PATH='/home/j/llc/cwang24/Tools/CoreNLP-mod-convert.jar'
 #CORENLP_PATH='/home/j/llc/cwang24/Tools/CoreNLP-mod-convert-collapse.jar'
 CORENLP_PATH='./lib'
-java -Xmx1800m -cp $CORENLP_PATH/CoreNLP-mod-convert-collapse.jar edu.stanford.nlp.trees.EnglishGrammaticalStructure -basic -treeFile $1 > $1.dep
+java -Xmx1800m -cp $CORENLP_PATH/CoreNLP-mod-convert-collapse.jar edu.stanford.nlp.trees.EnglishGrammaticalStructure -basic -treeFile $1 > $1.dep
