Permalink
Browse files

Initial version of web annotator

  • Loading branch information...
andersjo committed May 2, 2014
1 parent b3ee73f commit fb4f2a44fc44f49a35a7554029a0a73fbfc13e2e
View
@@ -0,0 +1,2 @@
+.idea
+*pyc
View
@@ -43,7 +43,7 @@ Install the dependencies via `pip`:
pip install -r requirements.txt
```
-The tool also uses the Framenet data distributed by NLTK. It looks for the data in `$HOME/nltk_data/corpora/framenet_v15`,
+The tool uses the Framenet data distributed by NLTK. It looks for the data in `$HOME/nltk_data/corpora/framenet_v15`,
which is the default install location used by NLTK. If the data is not found, the tool will attempt to download it using NLTK.
Unfortunately, this will fail if your NLTK data is in a non-standard location.
View
@@ -0,0 +1,16 @@
+But CONJ
+in ADP
+any DET
+case NOUN Instance Reasoning Containers Trial
+I PRON
+suppose VERB Opinion
+you PRON
+will VERB Giving Desiring
+not ADV
+let VERB Grant_permission Make_possible_to_do
+it PRON
+away ADV
+for ADP
+some DET
+days NOUN Calendric_unit Timespan Measure_duration
+? .
View
@@ -0,0 +1,15 @@
+JV NOUN
+soccer NOUN
+leads VERB Leadership Cotheme
+NUMBER ADJ
+after ADP
+Benitez NOUN
+scores VERB Damaging Getting
+again ADV
+, .
+less ADJ
+than ADP
+NUMBER NUM
+mins NOUN
+to PRT
+play VERB Cause_to_make_noise Performers_and_roles Competition
View
@@ -0,0 +1,16 @@
+RT X
+@USER X
+: .
+Yay X
+for ADP
+video NOUN
+schedules NOUN
+! .
+New ADJ Age Familiarity
+music NOUN Performing_arts
+video NOUN
+again ADV
+today NOUN Temporal_collocation Calendric_unit
+. .
+URL X
+#MundayMonday X
View
@@ -0,0 +1,2 @@
+flask
+nltk
View
@@ -0,0 +1,124 @@
+import argparse
+import codecs
+from collections import defaultdict
+import json
+from os.path import expanduser
+from flask import Flask, render_template, request
+import sys
+import nltk
+import framenet
+import os
+
# WSGI application object; the annotation routes below are registered on it
# via @app.route and it is started by app.run() in the __main__ block.
app = Flask(__name__)
+
def color_map(index):
    """Map an arbitrary non-negative index onto a small fixed palette.

    Indices beyond the palette size wrap around, so consecutive tokens
    get distinct (cycling) highlight colors.
    """
    palette = ('red', 'green', 'blue', 'yellow', 'orange')
    return palette[index % len(palette)]
+
+
def read_sentence(id):
    """Read one sentence file from args.in_dir and return it as a token list.

    Each returned element is a dict with keys 'word', 'pos', 'frames'
    (list of candidate frame names, possibly empty) and 'token_i'
    (1-based token index).

    The column layout is auto-detected from the first line: files with
    14+ tab-separated columns are treated as CoNLL-style (word in column
    1, POS in column 4), otherwise a simple word/POS layout is assumed.
    The last column always holds the space-separated frame candidates.
    """
    filename = os.path.join(args.in_dir, id)

    sentence = []
    word_col = None
    pos_col = None

    # Use a context manager so the file handle is closed even on error
    # (the original left it to the garbage collector).
    with codecs.open(filename, encoding='utf-8') as infile:
        for token_i, line in enumerate(infile, 1):
            parts = [part.strip() for part in line.split("\t")]

            # Autodetect file format from the first data line.
            if word_col is None:
                if len(parts) >= 14:
                    word_col = 1
                    pos_col = 4
                else:
                    word_col = 0
                    pos_col = 1

            sentence.append({'word': parts[word_col],
                             'pos': parts[pos_col],
                             # List comprehension instead of filter(None, ...)
                             # so 'frames' is a real list under Python 3 too
                             # (callers apply len() to it).
                             'frames': [f for f in parts[-1].split(" ") if f],
                             'token_i': token_i
                             })

    return sentence
+
@app.route("/annotate/save/<id>", methods=['POST'])
def save_sentence(id=None):
    """Persist the submitted annotation for sentence <id> to args.out_dir.

    Re-reads the original sentence file, merges in the frame selection and
    frame-element arguments posted from the annotation form, and writes one
    tab-separated line per token: token index, word, POS, chosen frame name
    (empty if none), and the arguments as a JSON object.

    Returns the literal string "OK" on success.
    """
    sentence = read_sentence(id)

    with codecs.open(os.path.join(args.out_dir, id), 'w', encoding='utf-8') as out:

        # NOTE(review): form field names are assumed to follow the scheme
        # produced by the annotate.html template: a per-token selector named
        # 'select-<i>' whose value embeds the frame name at dash-position 1,
        # and argument fields prefixed by that value with the frame-element
        # name at dash-position 2 — confirm against the template. Frame
        # names containing '-' would break this split.
        for i, token in enumerate(sentence, 1):
            frame_name = None
            arguments = {}

            if len(token['frames']) > 0:
                selected_frame = request.form['select-{}'.format(i)]
                if selected_frame:
                    frame_name = selected_frame.split("-")[1]
                    # Grab arguments
                    for key, val in request.form.items():
                        if key.startswith(selected_frame) and val:
                            arguments[key.split("-")[2]] = val


            parts = map(unicode, [token['token_i'], token['word'], token['pos'], frame_name or '', json.dumps(arguments)])
            print >>out, u"\t".join(parts)

    return "OK"
+
+
@app.route("/annotate/<id>")
def annotate_sentence(id=None):
    """Render the annotation page for sentence <id>.

    For every token that has candidate frames, build a pre-annotation
    entry holding the token, a highlight color, and — for each candidate
    frame — its name, definition, and frame elements grouped by core type.
    Also resolves the id of the next sentence so the template can link to it.
    """
    sentence = read_sentence(id)

    # Id of the sentence that follows this one, or None for the last one.
    following = sentence_ids.index(id) + 1
    next_id = sentence_ids[following] if following < len(sentence_ids) else None

    pre_annotations = []
    for token_index, token in enumerate(sentence):
        if not token['frames']:
            continue

        entry = {'token': token,
                 'frames': [],
                 'i': token['token_i'],
                 'color': color_map(token_index)}

        for frame_name in token['frames']:
            frame = fnet.frame_by_name(frame_name)

            # Group the frame elements by core type, defaulting a missing
            # abbreviation to the first three letters of the element name.
            grouped = defaultdict(list)
            for fe in frame.FE.values():
                if not fe.abbrev:
                    fe.abbrev = fe.name[0:3]
                grouped[fe.coreType].append(fe)

            entry['frames'].append({'name': frame.name,
                                    'definition': frame.definition,
                                    'fe_by_type': grouped})

        pre_annotations.append(entry)

    return render_template('annotate.html', sentence=sentence, pre_annotations=pre_annotations, id=id, next_id=next_id)
+
+
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="""A web interface for annotating Framenet across languages""")
    # Note the trailing space before the closing quote: without it the two
    # concatenated help fragments run together ("format.The last ...").
    parser.add_argument('in_dir', help="Directory with input files, each containing a single sentence in tab separated format. "
                                       "The last column contains a space-separated list of possible frames evoked by that token")
    parser.add_argument('out_dir', help="Directory for finished annotations")

    args = parser.parse_args()

    # sys.stderr.write is used instead of Python-2-only `print >>sys.stderr`
    # so the script parses under both Python 2 and 3; output is identical.
    sys.stderr.write("Reading Framenet\n")
    framenet_data = expanduser("~/nltk_data/corpora/framenet_v15/")
    if not os.path.isdir(framenet_data):
        # Fall back to an NLTK download when the corpus is not already
        # installed in the default location.
        nltk.download('framenet_v15')

    fnet = framenet.FramenetCorpusReader(framenet_data, [])

    sentence_ids = [fname for fname in os.listdir(args.in_dir)
                    if os.path.isfile(os.path.join(args.in_dir, fname))]

    # Guard against an empty input directory: the original crashed with
    # IndexError on sentence_ids[0] below.
    if not sentence_ids:
        sys.stderr.write("No sentence files found in {}\n".format(args.in_dir))
        sys.exit(1)

    sys.stderr.write("Found {} sentences for annotation\n".format(len(sentence_ids)))
    # 5000 is Flask's default port (the original message said 0000).
    sys.stderr.write("Web server started. Navigate to http://127.0.0.1:5000/annotate/{} to get started\n".format(sentence_ids[0]))

    # Start the web server
    app.run()
Oops, something went wrong.

0 comments on commit fb4f2a4

Please sign in to comment.