check in stuff

Root commit 395b50ffe620fd0fdba088f775ae19597417868b by @brendano, committed Mar 4, 2012
Showing with 386 additions and 0 deletions.
  1. +18 −0 README
  2. +4 −0 depper.sh
  3. BIN lib/json-simple-1.1.1.jar
  4. +196 −0 src/Depper.java
  5. +168 −0 src/StanfordParserServer.java
18 README
@@ -0,0 +1,18 @@
+Little programs that run routines from inside the Stanford NLP system:
+
+ * Depper: runs the Stanford Dependency converter on PTB-style phrase-structure
+ trees from standard input. Runs the lemmatizer too. Works with Stanford
+ CoreNLP version 2012-01-08. Outputs either CoNLL format or a JSON-based
+ format (see the examples at the end of this README). If using PTB
+ gold-standard parses, make sure to remove traces first, e.g. with
+ wsj-removetraces.pl from srlconll-1.1.
+
+ * StanfordParserServer: a socket (telnet) server that returns a
+ phrase-structure parse for a sentence. It requires an old version of the
+ Stanford parser (circa 2008-09); this should be updated.
+
+No guarantee of consistency or usability is given; please read the code
+before using.
+
+Stanford NLP: http://nlp.stanford.edu/software/
+srlconll-1.1: http://www.lsi.upc.edu/~srlconll/soft.html#srlconll
+
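+Examples (sketches, not tested commands; exact classpaths, grammar paths, and
+ports depend on your setup):
+
+  # Depper: one PTB tree per line on stdin; TSV output on stdout
+  echo "(S (NP (DT The) (NN dog)) (VP (VBZ barks)) (. .))" | ./depper.sh
+
+  # StanfordParserServer: start with a serialized grammar, then telnet in,
+  # type a sentence, and read back the one-line parse and its PCFG score
+  java StanfordParserServer -port 5556 englishPCFG.ser.gz
+  telnet localhost 5556
+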
4 depper.sh
@@ -0,0 +1,4 @@
+#!/bin/zsh
+# Run Depper on standard input; adjust the CoreNLP path for your checkout.
+h=$(dirname "$0")
+java -cp "$h/out/production/Stanford-mystuff:$h/../stanford-corenlp-2012-01-08/stanford-corenlp-2012-01-08.jar:$h/lib/json-simple-1.1.1.jar" Depper
+
BIN lib/json-simple-1.1.1.jar
Binary file not shown.
196 src/Depper.java
@@ -0,0 +1,196 @@
+import java.util.*;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.StringReader;
+
+import org.json.simple.*;
+import edu.stanford.nlp.trees.*;
+import edu.stanford.nlp.process.Morphology;
+
+
+public class Depper {
+ public static HeadFinder headfinder;
+ public static TreeFactory tree_factory;
+
+ public static Tree readTreeFromString(String parseStr){
+ //read in the input into a Tree data structure
+ TreeReader treeReader = new PennTreeReader(new StringReader(parseStr), tree_factory);
+ Tree inputTree = null;
+ try{
+ inputTree = treeReader.readTree();
+
+ }catch(IOException e){
+ e.printStackTrace();
+ }
+ return inputTree;
+ }
+ // Read an entire file into a String ("\Z" = end of input).
+ public static String readFile(String filename) throws FileNotFoundException {
+ File file = new File(filename);
+ return new Scanner(file).useDelimiter("\\Z").next();
+ }
+
+
+
+ /**
+ * Builds the per-sentence JSON object. Needs to match sem/lib/core2jsent.py.
+ *
+ * @param parseStr the original PTB parse string
+ * @param deps typed dependencies from the Stanford converter
+ * @param parse the parse tree
+ * @param leaves the tree's leaves, in token order
+ * @return a JSONObject with "deps", "tokens", and "parse" fields
+ */
+ public static JSONObject makeJSent(String parseStr, List<TypedDependency> deps, Tree parse, ArrayList<Tree> leaves) {
+
+ JSONArray jDeps = new JSONArray();
+
+ for (TypedDependency d : deps) {
+ JSONArray jDep = new JSONArray();
+ int di = d.dep().index()-1; // convert 1-based indices to 0-based
+ int gi = d.gov().index()-1;
+ jDep.add(d.reln().getShortName());
+ jDep.add(di);
+ jDep.add(gi);
+ jDeps.add(jDep);
+ }
+
+ JSONArray jToks = new JSONArray();
+ for (Tree leaf : leaves) {
+ JSONArray tok = new JSONArray();
+ String surface = leaf.label().value();
+ String posTag = leaf.parent(parse).label().value();
+ String lemma = Morphology.lemmaStatic(surface, posTag, true);
+
+ tok.add(surface); // surface word form
+ tok.add(lemma); // supposed to be lemma
+ tok.add(posTag); // POS tag
+// tok.add(null); // NER tag
+ jToks.add(tok);
+ }
+
+ JSONObject jsent = new JSONObject();
+ jsent.put("deps", jDeps);
+ jsent.put("tokens", jToks);
+ jsent.put("parse", parseStr);
+ return jsent;
+ }
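+
+ // Example of the JSON this produces for "The dog barks ." (illustrative;
+ // exact relations and lemmas depend on the converter version):
+ // {"deps": [["det",0,1],["nsubj",1,2]],
+ // "tokens": [["The","the","DT"],["dog","dog","NN"],["barks","bark","VBZ"],[".",".","."]],
+ // "parse": "(S (NP (DT The) (NN dog)) (VP (VBZ barks)) (. .))"}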
+
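+ // Prints one CoNLL-style row per token:
+ // ID FORM _ POSTAG _ _ HEAD DEPREL _ _ (1-based HEAD; 0 = no governor).
+ // E.g. a row for "dog" might be: 2 dog _ NN _ _ 3 nsubj _ _
+ // (illustrative; exact relations depend on the converter version).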
+ public static void printCoNLL(List<TypedDependency> deps, Tree parse, List<Tree> leaves) {
+ HashMap<Integer,List<TypedDependency>> map = new HashMap<Integer,List<TypedDependency>>();
+ for (TypedDependency d : deps) {
+ int i = d.dep().index();
+ if (!map.containsKey(i)) {
+ map.put(i, new ArrayList<TypedDependency>());
+ }
+ map.get(i).add(d);
+ }
+ int i = 0;
+ for (Tree L : leaves) {
+ i++;
+ if (map.containsKey(i)) {
+ for (TypedDependency d : map.get(i)) {
+ System.out.printf("%d\t%s\t_\t%s\t_\t_", i, L.yield(), L.parent(parse).label().value());
+ System.out.printf("\t%s\t%s\t_\t_\n", d.gov().index(), d.reln());
+ }
+ } else {
+ // no incoming dependency (e.g. collapsed away by CC-processing)
+ System.out.printf("%d\t%s\t_\t%s\t_\t_", i, L.yield(), L.parent(parse).label().value());
+ System.out.printf("\t0\t_\t_\t_\n");
+ }
+ }
+ }
+
+
+
+ public static void main(String args[]) throws Exception {
+
+// headfinder = new CollinsHeadFinder();
+ tree_factory = new LabeledScoredTreeFactory();
+
+ TreebankLanguagePack tlp = new PennTreebankLanguagePack();
+ GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
+
+ // Input: one tree per line; if the line has tab-separated fields, the
+ // parse must be the last one. Leading fields are passed through.
+ String line;
+ BufferedReader brIn = new BufferedReader(new InputStreamReader(System.in));
+ while ( (line=brIn.readLine()) != null) {
+ if (line.trim().equals("")) continue;
+ String[] parts = line.trim().split("\t");
+ String parseStr = parts[parts.length-1];
+ Tree parse = readTreeFromString(parseStr);
+ GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
+
+ Collection<TypedDependency> deps1 = gs.typedDependenciesCCprocessed(true);
+ ArrayList<TypedDependency> deps = new ArrayList<TypedDependency>(deps1);
+
+ // For CoNLL output instead of JSON, uncomment:
+ // printCoNLL(deps, parse, parse.getLeaves());
+
+
+ ArrayList<Tree> leaves = new ArrayList<Tree>(parse.getLeaves());
+
+ JSONObject jsent = makeJSent(parseStr, deps, parse, leaves);
+
+ ArrayList<String> tokens = new ArrayList<String>();
+ for (Tree l : leaves) {
+ tokens.add( l.label().value());
+ }
+ String tokensStr = join(tokens, " ");
+
+ ArrayList<String> fields = new ArrayList<String>();
+ for (int i=0; i < parts.length-1; i++) {
+ fields.add(parts[i]);
+ }
+ fields.add(tokensStr);
+ fields.add(jsent.toJSONString());
+ System.out.println(join(fields, "\t"));
+ }
+ }
+
+ public static String join(AbstractCollection<String> s, String delimiter) {
+ if (s == null || s.isEmpty()) return "";
+ Iterator<String> iter = s.iterator();
+ StringBuilder builder = new StringBuilder(iter.next());
+ while (iter.hasNext()) {
+ builder.append(delimiter).append(iter.next());
+ }
+ return builder.toString();
+ }
+
+
+
+}
168 src/StanfordParserServer.java
@@ -0,0 +1,168 @@
+
+
+
+import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
+import edu.stanford.nlp.parser.lexparser.Options;
+import edu.stanford.nlp.trees.*;
+
+import java.io.*;
+
+import java.net.*;
+import java.util.Properties;
+
+/**
+ * Wrapper class to run the Stanford Parser as a socket server so the grammar
+ * need not be loaded for every new sentence.
+ *
+ * Protocol: a client connects (default port 5556), sends a sentence, and gets
+ * two lines back: the best parse as a one-line Penn tree, then the PCFG
+ * score. If parsing fails, the server sends "(ROOT (. .))" and -999999999.0.
+ *
+ * @author mheilman@cmu.edu
+ */
+public class StanfordParserServer {
+
+ //@SuppressWarnings("unchecked")
+ public static void main(String[] args) {
+
+ //INITIALIZE PARSER
+ String serializedInputFileOrUrl = null;
+ int port = 5556;
+ int maxLength = 40;
+ boolean markHeadNodes = false;
+
+ Properties properties = new Properties();
+ try {
+ // optional config file; the defaults above are used if it is missing
+ properties.load(new FileInputStream("config/arkref.properties"));
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ maxLength = Integer.parseInt(properties.getProperty("parserMaxLength", "60"));
+
+ // variables needed to process the input to be parsed
+ String sentenceDelimiter = null;
+ int argIndex = 0;
+ if (args.length < 1) {
+ System.err.println("usage: java edu.stanford.nlp.parser.lexparser." + "LexicalizedParser parserFileOrUrl\nOptions: -port, -maxLength, -markHeadNodes");
+ System.exit(1);
+ }
+
+ Options op = new Options();
+ // while loop through option arguments
+ while (argIndex < args.length && args[argIndex].charAt(0) == '-') {
+ if (args[argIndex].equalsIgnoreCase("-sentences")) {
+ sentenceDelimiter = args[argIndex + 1];
+ if (sentenceDelimiter.equalsIgnoreCase("newline")) {
+ sentenceDelimiter = "\n";
+ }
+ argIndex += 2;
+ } else if (args[argIndex].equalsIgnoreCase("-loadFromSerializedFile")) {
+ // load the parser from a binary serialized file
+ // the next argument must be the path to the parser file
+ serializedInputFileOrUrl = args[argIndex + 1];
+ argIndex += 2;
+ } else if (args[argIndex].equalsIgnoreCase("-maxLength")) {
+ maxLength = new Integer(args[argIndex + 1]);
+ argIndex += 2;
+ } else if (args[argIndex].equalsIgnoreCase("-port")) {
+ port = new Integer(args[argIndex + 1]);
+ argIndex += 2;
+ } else if (args[argIndex].equalsIgnoreCase("-markHeadNodes")) {
+ markHeadNodes = true;
+ argIndex++;
+ } else {
+ argIndex = op.setOptionOrWarn(args, argIndex);
+ }
+ } // end while loop through arguments
+
+ LexicalizedParser lp = null;
+ // if -loadFromSerializedFile was not given, treat the next argument as the
+ // path to the serialized parser
+ if (serializedInputFileOrUrl == null && argIndex < args.length) {
+ serializedInputFileOrUrl = args[argIndex];
+ argIndex++;
+ }
+ if (serializedInputFileOrUrl == null) {
+ System.err.println("No grammar specified, exiting...");
+ System.exit(1);
+ }
+ try {
+ lp = new LexicalizedParser(serializedInputFileOrUrl, op);
+ } catch (IllegalArgumentException e) {
+ System.err.println("Error loading parser, exiting...");
+ System.exit(1);
+ }
+ lp.setMaxLength(maxLength);
+ lp.setOptionFlags("-outputFormat", "oneline");
+
+ TreePrint tp;
+
+
+
+ // declare a server socket and a client socket for the server
+ // declare an input and an output stream
+ ServerSocket parseServer = null;
+ BufferedReader br;
+ PrintWriter outputWriter;
+ Socket clientSocket = null;
+ try {
+ parseServer = new ServerSocket(port);
+ } catch (IOException e) {
+ // without a listening socket the accept() loop below would NPE
+ System.err.println("Could not open server socket on port " + port);
+ e.printStackTrace();
+ System.exit(1);
+ }
+
+ // Create a socket object from the ServerSocket to listen and accept
+ // connections.
+ // Open input and output streams
+
+ while (true) {
+ System.err.println("Waiting for Connection on Port: "+port);
+ try {
+ clientSocket = parseServer.accept();
+ System.err.println("Connection Accepted From: "+clientSocket.getInetAddress());
+ br = new BufferedReader(new InputStreamReader(new DataInputStream(clientSocket.getInputStream())));
+ outputWriter = new PrintWriter(new PrintStream(clientSocket.getOutputStream()));
+ ByteArrayOutputStream buf = new ByteArrayOutputStream();
+ PrintWriter bufWriter = new PrintWriter(new PrintStream(buf));
+
+ // read everything the client has sent so far (lines are concatenated as-is)
+ StringBuilder docBuf = new StringBuilder();
+ do {
+ String inLine = br.readLine();
+ if (inLine == null) break; // client closed the connection
+ docBuf.append(inLine);
+ } while (br.ready());
+ String doc = docBuf.toString();
+ System.err.println("received: " + doc);
+
+ //PARSE
+ try{
+ lp.parse(doc);
+
+
+
+ //OUTPUT RESULT
+ Tree bestParse = lp.getBestParse();
+ if(markHeadNodes){
+ tp = new TreePrint("penn","markHeadNodes",new PennTreebankLanguagePack());
+ }else{
+ tp = new TreePrint("penn","",new PennTreebankLanguagePack());
+ }
+ tp.printTree(bestParse, bufWriter);
+ outputWriter.println(buf.toString().replaceAll("\\s+", " "));
+ outputWriter.println(lp.getPCFGScore());
+ //String output = bestParse.toString();
+ //outputWriter.println(output);
+ //System.err.println("sent: " + output);
+
+ }catch(Exception e){
+ outputWriter.println("(ROOT (. .))");
+ outputWriter.println("-999999999.0");
+ e.printStackTrace();
+ }
+
+ outputWriter.flush();
+ outputWriter.close();
+ clientSocket.close();
+
+ }catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+
+}
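+
+// Minimal client sketch (not part of the original commit; for illustration):
+//
+// Socket sock = new Socket("localhost", 5556);
+// PrintWriter out = new PrintWriter(sock.getOutputStream(), true);
+// BufferedReader in = new BufferedReader(
+// new InputStreamReader(sock.getInputStream()));
+// out.println("The dog barks .");
+// String parse = in.readLine(); // one-line Penn tree
+// String score = in.readLine(); // PCFG score string
+// sock.close();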
+
