diff --git a/java/src/org/antlr/codebuff/CollectFeatures.java b/java/src/org/antlr/codebuff/CollectFeatures.java index eabed86..6f264bb 100644 --- a/java/src/org/antlr/codebuff/CollectFeatures.java +++ b/java/src/org/antlr/codebuff/CollectFeatures.java @@ -17,16 +17,23 @@ import java.io.File; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; public class CollectFeatures { - public static final double MAX_CONTEXT_DIFF_THRESHOLD = 0.20; + public static final double MAX_CONTEXT_DIFF_THRESHOLD = 0.13; public static final double MAX_CONTEXT_DIFF_THRESHOLD2 = 0.50; + /** When computing child indexes, we use this value for any child list + * element other than the first one. If a parent has just one X child, + * we use the actual child index. If parent has two or more X children, + * and we are not the first X, use CHILD_INDEX_LIST_ELEMENT. If first + * of two or more X children, use actual child index. + */ + public static final int CHILD_INDEX_LIST_ELEMENT = 1_111_111_111; + // Feature values for pair on diff lines feature public static final int NOT_PAIR = -1; public static final int PAIR_ON_SAME_LINE = 0; @@ -68,77 +75,43 @@ public class CollectFeatures { // indexes into feature vector - public static final int INDEX_PREV2_TYPE = 0; - public static final int INDEX_PREV_TYPE = 1; - public static final int INDEX_PREV_RULE = 2; // what rule is prev token in? - public static final int INDEX_PREV_END_COLUMN = 3; - public static final int INDEX_PREV_EARLIEST_ANCESTOR = 4; - public static final int INDEX_TYPE = 5; - public static final int INDEX_MATCHING_TOKEN_DIFF_LINE = 6; - public static final int INDEX_FIRST_ON_LINE = 7; // a \n right before this token? - public static final int INDEX_RULE = 8; // what rule are we in? - public static final int INDEX_EARLIEST_RIGHT_ANCESTOR = 9; - public static final int INDEX_EARLIEST_LEFT_ANCESTOR = 10; - public static final int INDEX_ANCESTORS_PARENT5_RULE = 11; - public static final int INDEX_ANCESTORS_PARENT4_RULE = 12; - public static final int INDEX_ANCESTORS_PARENT3_RULE = 13; - public static final int INDEX_ANCESTORS_PARENT3_WID = 14; - public static final int INDEX_ANCESTORS_PARENT2_RULE = 15; - public static final int INDEX_ANCESTORS_PARENT2_WID = 16; - public static final int INDEX_ANCESTORS_PARENT_RULE = 17; - public static final int INDEX_ANCESTORS_PARENT_WID = 18; - public static final int INDEX_NEXT_TYPE = 19; - public static final int INDEX_INFO_FILE = 20; - public static final int INDEX_INFO_LINE = 21; - public static final int INDEX_INFO_CHARPOS = 22; - - public static final int NUM_FEATURES = 23; + public static final int INDEX_PREV_TYPE = 0; + public static final int INDEX_PREV_EARLIEST_RIGHT_ANCESTOR = 1; + public static final int INDEX_CUR_TYPE = 2; + public static final int INDEX_MATCHING_TOKEN_DIFF_LINE = 3; + public static final int INDEX_FIRST_ON_LINE = 4; // a \n right before this token? + public static final int INDEX_EARLIEST_LEFT_ANCESTOR = 5; + public static final int INDEX_ANCESTORS_CHILD_INDEX = 6; + public static final int INDEX_ANCESTORS_PARENT_RULE = 7; + public static final int INDEX_ANCESTORS_PARENT_CHILD_INDEX = 8; + public static final int INDEX_ANCESTORS_PARENT2_RULE = 9; + public static final int INDEX_ANCESTORS_PARENT2_CHILD_INDEX = 10; + public static final int INDEX_ANCESTORS_PARENT3_RULE = 11; + public static final int INDEX_ANCESTORS_PARENT3_CHILD_INDEX = 12; + public static final int INDEX_ANCESTORS_PARENT4_RULE = 13; + public static final int INDEX_ANCESTORS_PARENT4_CHILD_INDEX = 14; + + public static final int INDEX_INFO_FILE = 15; + public static final int INDEX_INFO_LINE = 16; + public static final int INDEX_INFO_CHARPOS = 17; + + public static final int NUM_FEATURES = 18; public static FeatureMetaData[] FEATURES_INJECT_WS = { // inject ws or nl - new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-2)"}, 1), - new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-1)"}, 2), - new FeatureMetaData(FeatureType.RULE, new String[] {"LT(-1)", "rule"}, 2), - new FeatureMetaData(FeatureType.INT, new String[] {"LT(-1)", "end col"}, 0), - new FeatureMetaData(FeatureType.RULE, new String[] {"LT(-1)", "right ancestor"}, 3), - new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(1)"}, 2), - new FeatureMetaData(FeatureType.BOOL, new String[]{"Pair", "dif\\n"}, 3), + new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-1)"}, 1), + new FeatureMetaData(FeatureType.RULE, new String[] {"LT(-1)", "right ancestor"}, 1), + new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(1)"}, 1), + new FeatureMetaData(FeatureType.BOOL, new String[]{"Pair", "dif\\n"}, 1), + FeatureMetaData.UNUSED, + new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "left ancestor"}, 1), + new FeatureMetaData(FeatureType.INT, new String[] {"ancestor", "child index"}, 1), + // these previous 6 features seem to predict newline really well. whitespace ok too FeatureMetaData.UNUSED, - new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "rule"}, 2), - new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "right ancestor"}, 3), - new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "left ancestor"}, 3), - new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^5"}, 1), - new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^4"}, 1), - new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^3"}, 1), - new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent^3 wid"}, 1), - new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^2"}, 1), - new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent^2 wid"}, 1), - new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent"}, 1), - new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent wid"}, 1), FeatureMetaData.UNUSED, - new FeatureMetaData(FeatureType.INFO_FILE, new String[] {"", "file"}, 0), - new FeatureMetaData(FeatureType.INFO_LINE, new String[] {"", "line"}, 0), - new FeatureMetaData(FeatureType.INFO_CHARPOS, new String[] {"char", "pos"}, 0) - }; - - public static FeatureMetaData[] FEATURES_ALIGN = { FeatureMetaData.UNUSED, - new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-1)"}, 2), - new FeatureMetaData(FeatureType.RULE, new String[] {"LT(-1)", "rule"}, 2), FeatureMetaData.UNUSED, FeatureMetaData.UNUSED, - new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(1)"}, 2), FeatureMetaData.UNUSED, - new FeatureMetaData(FeatureType.BOOL, new String[]{"Strt", "line"}, 3), - new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "rule"}, 2), - new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "right ancestor"}, 3), - new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "left ancestor"}, 3), - new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^5"}, 2), - new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^4"}, 2), - new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^3"}, 7), - new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "par^3 wid"}, 7), - new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^2"}, 7), - new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "par^2 wid"}, 7), - new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent"}, 3), FeatureMetaData.UNUSED, FeatureMetaData.UNUSED, new FeatureMetaData(FeatureType.INFO_FILE, new String[] {"", "file"}, 0), @@ -146,35 +119,47 @@ public class CollectFeatures { new FeatureMetaData(FeatureType.INFO_CHARPOS, new String[] {"char", "pos"}, 0) }; - public static FeatureMetaData[] FEATURES_ALL = { - new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-2)"}, 1), - new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-1)"}, 2), - new FeatureMetaData(FeatureType.RULE, new String[] {"LT(-1)", "rule"}, 2), - new FeatureMetaData(FeatureType.INT, new String[] {"LT(-1)", "end col"}, 0), - new FeatureMetaData(FeatureType.RULE, new String[] {"LT(-1)", "right ancestor"}, 3), - new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(1)"}, 2), - new FeatureMetaData(FeatureType.BOOL, new String[]{"Pair", "dif\\n"}, 3), - new FeatureMetaData(FeatureType.BOOL, new String[]{"Strt", "line"}, 3), - new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "rule"}, 2), - new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "right ancestor"}, 3), - new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "left ancestor"}, 3), - new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^5"}, 1), - new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^4"}, 1), - new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^3"}, 1), - new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent^3 wid"}, 1), - new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^2"}, 1), - new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent^2 wid"}, 1), - new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent"}, 1), - new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent wid"}, 1), - new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(2)"}, 1), + public static FeatureMetaData[] FEATURES_ALIGN = { + new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-1)"}, 1), + new FeatureMetaData(FeatureType.RULE, new String[] {"LT(-1)", "right ancestor"}, 1), + new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(1)"}, 1), + new FeatureMetaData(FeatureType.BOOL, new String[] {"Pair", "dif\\n"}, 1), + new FeatureMetaData(FeatureType.BOOL, new String[] {"Strt", "line"}, 1), + new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "left ancestor"}, 1), + new FeatureMetaData(FeatureType.INT, new String[] {"ancestor", "child index"}, 1), + new FeatureMetaData(FeatureType.RULE, new String[] {"", "parent"}, 1), + new FeatureMetaData(FeatureType.INT, new String[] {"parent", "child index"}, 1), + new FeatureMetaData(FeatureType.RULE, new String[] {"", "parent^2"}, 1), + new FeatureMetaData(FeatureType.INT, new String[] {"parent^2", "child index"}, 1), + new FeatureMetaData(FeatureType.RULE, new String[] {"", "parent^3"}, 1), + new FeatureMetaData(FeatureType.INT, new String[] {"parent^3", "child index"}, 1), + new FeatureMetaData(FeatureType.RULE, new String[] {"", "parent^4"}, 1), + new FeatureMetaData(FeatureType.INT, new String[] {"parent^4", "child index"}, 1), new FeatureMetaData(FeatureType.INFO_FILE, new String[] {"", "file"}, 0), new FeatureMetaData(FeatureType.INFO_LINE, new String[] {"", "line"}, 0), new FeatureMetaData(FeatureType.INFO_CHARPOS, new String[] {"char", "pos"}, 0) }; - public static int getCurrentTokenType(int[] features) { return features[INDEX_TYPE]; } - public static int getInfoLine(int[] features) { return features[INDEX_INFO_LINE]; } - public static int getInfoCharPos(int[] features) { return features[INDEX_INFO_CHARPOS]; } + public static FeatureMetaData[] FEATURES_ALL = { + new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-1)"}, 1), + new FeatureMetaData(FeatureType.RULE, new String[] {"LT(-1)", "right ancestor"}, 1), + new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(1)"}, 1), + new FeatureMetaData(FeatureType.BOOL, new String[] {"Pair", "dif\\n"}, 1), + new FeatureMetaData(FeatureType.BOOL, new String[] {"Strt", "line"}, 1), + new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "left ancestor"}, 1), + new FeatureMetaData(FeatureType.INT, new String[] {"ancestor", "child index"}, 1), + new FeatureMetaData(FeatureType.RULE, new String[] {"", "parent"}, 1), + new FeatureMetaData(FeatureType.INT, new String[] {"parent", "child index"}, 1), + new FeatureMetaData(FeatureType.RULE, new String[] {"", "parent^2"}, 1), + new FeatureMetaData(FeatureType.INT, new String[] {"parent^2", "child index"}, 1), + new FeatureMetaData(FeatureType.RULE, new String[] {"", "parent^3"}, 1), + new FeatureMetaData(FeatureType.INT, new String[] {"parent^3", "child index"}, 1), + new FeatureMetaData(FeatureType.RULE, new String[] {"", "parent^4"}, 1), + new FeatureMetaData(FeatureType.INT, new String[] {"parent^4", "child index"}, 1), + new FeatureMetaData(FeatureType.INFO_FILE, new String[] {"", "file"}, 0), + new FeatureMetaData(FeatureType.INFO_LINE, new String[] {"", "line"}, 0), + new FeatureMetaData(FeatureType.INFO_CHARPOS, new String[] {"char", "pos"}, 0) + }; protected InputDocument doc; protected ParserRuleContext root; @@ -261,12 +246,9 @@ public int getAlignmentCategory(TerminalNode node, Token curToken, int columnDel ParserRuleContext parent = (ParserRuleContext)node.getParent(); // at a newline, are we aligned with a prior sibling (in a list) etc... - ParserRuleContext earliestLeftAncestor = earliestAncestorStartingWithToken(parent, curToken); - if ( earliestLeftAncestor==null ) { - earliestLeftAncestor = parent; - } + ParserRuleContext earliestLeftAncestor = earliestAncestorStartingWithToken(node, curToken); Pair pair = - earliestAncestorWithChildStartingAtCharPos(earliestLeftAncestor.getParent(), curToken); + earliestAncestorWithChildStartingAtCharPos(earliestLeftAncestor, curToken); if ( pair!=null ) { int deltaFromLeftAncestor = getDeltaToAncestor(earliestLeftAncestor, pair.a); aligned = aligncat(deltaFromLeftAncestor, pair.b); @@ -305,26 +287,41 @@ public static int getPrecedingNL(CommonTokenStream tokens, int i) { /** Walk upwards from node while p.start == token; return null if there is * no ancestor starting at token. */ - public static ParserRuleContext earliestAncestorStartingWithToken(ParserRuleContext node, Token token) { - ParserRuleContext p = node; + /** Walk upwards from node while p.start == token; return immediate parent + * if there is no ancestor starting at token. This is the earliest + * left ancestor. E.g, for '{' of a block, return parent up the chain from + * block starting with '{'. For '}' of block, return just block as nothing + * starts with '}'. (block stops with it). + */ + public static ParserRuleContext earliestAncestorStartingWithToken(TerminalNode node, Token token) { + ParserRuleContext p = (ParserRuleContext)node.getParent(); ParserRuleContext prev = null; while (p!=null && p.getStart()==token) { prev = p; p = p.getParent(); } + if ( prev==null ) { + return (ParserRuleContext)node.getParent(); + } return prev; } - /** Walk upwards from node while p.stop == token; return null if there is - * no ancestor stopping at token. + /** Walk upwards from node while p.stop == token; return immediate parent + * if there is no ancestor stopping at token. This is the earliest + * right ancestor. E.g, for '}' of a block, return parent up the chain from + * block stopping with '}'. For '{' of block, return just block as nothing + * stops with '{'. (block starts with it). */ - public static ParserRuleContext earliestAncestorEndingWithToken(ParserRuleContext node, Token token) { - ParserRuleContext p = node; + public static ParserRuleContext earliestAncestorEndingWithToken(TerminalNode node, Token token) { + ParserRuleContext p = (ParserRuleContext)node.getParent(); ParserRuleContext prev = null; while (p!=null && p.getStop()==token) { prev = p; p = p.getParent(); } + if ( prev==null ) { + return (ParserRuleContext)node.getParent(); + } return prev; } @@ -419,88 +416,45 @@ public static int[] getNodeFeatures(Map tokenToNodeMap, tokens.seek(i); // seek so that LT(1) is tokens.get(i); - // Get a 4-gram of tokens with current token in 3rd position - List window = - Arrays.asList(tokens.LT(-2), tokens.LT(-1), tokens.LT(1), tokens.LT(2)); - // Get context information for previous token Token prevToken = tokens.LT(-1); TerminalNode prevTerminalNode = tokenToNodeMap.get(prevToken); ParserRuleContext parent = (ParserRuleContext)prevTerminalNode.getParent(); int prevTokenRuleIndex = parent.getRuleIndex(); - ParserRuleContext prevEarliestRightAncestor = earliestAncestorEndingWithToken(parent, prevToken); - int prevEarliestAncestorRuleIndex = -1; - int prevEarliestAncestorRuleAltNum = 0; - if ( prevEarliestRightAncestor!=null ) { - prevEarliestAncestorRuleIndex = prevEarliestRightAncestor.getRuleIndex(); - prevEarliestAncestorRuleAltNum = prevEarliestRightAncestor.getAltNumber(); - } + ParserRuleContext prevEarliestRightAncestor = earliestAncestorEndingWithToken(node, prevToken); + int prevEarliestAncestorRuleIndex = prevEarliestRightAncestor.getRuleIndex(); + int prevEarliestAncestorRuleAltNum = prevEarliestRightAncestor.getAltNumber(); // Get context information for current token parent = (ParserRuleContext)node.getParent(); - int curTokensParentRuleIndex = parent.getRuleIndex(); - ParserRuleContext earliestLeftAncestor = earliestAncestorStartingWithToken(parent, curToken); - int earliestLeftAncestorRuleIndex = -1; - int earliestLeftAncestorRuleAlt = 0; - if ( earliestLeftAncestor!=null ) { - earliestLeftAncestorRuleIndex = earliestLeftAncestor.getRuleIndex(); - earliestLeftAncestorRuleAlt = earliestLeftAncestor.getAltNumber(); - } + ParserRuleContext earliestLeftAncestor = earliestAncestorStartingWithToken(node, curToken); + ParserRuleContext earliestLeftAncestorParent = earliestLeftAncestor.getParent(); - ParserRuleContext earliestRightAncestor = earliestAncestorEndingWithToken(parent, curToken); - int earliestRightAncestorRuleIndex = -1; - int earliestRightAncestorRuleAlt = 0; - if ( earliestRightAncestor!=null ) { - earliestRightAncestorRuleIndex = earliestRightAncestor.getRuleIndex(); - earliestRightAncestorRuleAlt = earliestRightAncestor.getAltNumber(); - } - int prevTokenEndCharPos = window.get(1).getCharPositionInLine() + window.get(1).getText().length(); + ParserRuleContext earliestLeftAncestorParent2 = earliestLeftAncestorParent!=null ? earliestLeftAncestorParent.getParent() : null; + ParserRuleContext earliestLeftAncestorParent3 = earliestLeftAncestorParent2!=null ? earliestLeftAncestorParent2.getParent() : null; + ParserRuleContext earliestLeftAncestorParent4 = earliestLeftAncestorParent3!=null ? earliestLeftAncestorParent3.getParent() : null; - int matchingSymbolOnDiffLine = getMatchingSymbolOnDiffLine(doc, node, line); + ParserRuleContext earliestRightAncestor = earliestAncestorEndingWithToken(node, curToken); - // Get some context from parse tree - ParserRuleContext ancestorParent = null; - ParserRuleContext ancestorParent2 = null; - if ( earliestLeftAncestor==null ) { // just use regular parent then - ancestorParent = getParent(node); - if ( ancestorParent!=null ) { - ancestorParent2 = ancestorParent.getParent(); // get immediate parent for context - } - } - else { - ancestorParent = getParent(earliestLeftAncestor); // get parent but skip chain rules - if ( ancestorParent!=null ) { - ancestorParent2 = ancestorParent.getParent(); // get immediate parent for context - } - } - ParserRuleContext ancestorParent3 = ancestorParent2!=null ? ancestorParent2.getParent() : null; - ParserRuleContext ancestorParent4 = ancestorParent3!=null ? ancestorParent3.getParent() : null; - ParserRuleContext ancestorParent5 = ancestorParent4!=null ? ancestorParent4.getParent() : null; + int matchingSymbolOnDiffLine = getMatchingSymbolOnDiffLine(doc, node, line); - boolean curTokenStartsNewLine = window.get(2).getLine()>window.get(1).getLine(); + boolean curTokenStartsNewLine = tokens.LT(1).getLine()>tokens.LT(-1).getLine(); int[] features = { - window.get(0).getType(), - window.get(1).getType(), - rulealt(prevTokenRuleIndex,ATN.INVALID_ALT_NUMBER), // only match rule index - prevTokenEndCharPos, + tokens.LT(-1).getType(), rulealt(prevEarliestAncestorRuleIndex,prevEarliestAncestorRuleAltNum), - - window.get(2).getType(), // LT(1) + tokens.LT(1).getType(), matchingSymbolOnDiffLine, curTokenStartsNewLine ? 1 : 0, - rulealt(curTokensParentRuleIndex,ATN.INVALID_ALT_NUMBER), // we care what kind of thing but not more specifically here - rulealt(earliestRightAncestorRuleIndex,earliestRightAncestorRuleAlt), - rulealt(earliestLeftAncestorRuleIndex,earliestLeftAncestorRuleAlt), - ancestorParent5!=null ? rulealt(ancestorParent5.getRuleIndex(),ancestorParent5.getAltNumber()) : -1, - ancestorParent4!=null ? rulealt(ancestorParent4.getRuleIndex(),ancestorParent4.getAltNumber()) : -1, - ancestorParent3!=null ? rulealt(ancestorParent3.getRuleIndex(),ancestorParent3.getAltNumber()) : -1, - ancestorParent3!=null ? ancestorParent3.getChildCount() : 0, - ancestorParent2!=null ? rulealt(ancestorParent2.getRuleIndex(),ancestorParent2.getAltNumber()) : -1, - ancestorParent2!=null ? ancestorParent2.getChildCount() : 0, - rulealt(ancestorParent.getRuleIndex(),ancestorParent.getAltNumber()), // always at least token's parent exists - ancestorParent.getChildCount(), - - window.get(3).getType(), + rulealt(earliestLeftAncestor.getRuleIndex(),earliestLeftAncestor.getAltNumber()), + getChildIndex(node), + earliestLeftAncestorParent!=null ? rulealt(earliestLeftAncestorParent.getRuleIndex(), earliestLeftAncestorParent.getAltNumber()) : -1, + getChildIndex(earliestLeftAncestor), + earliestLeftAncestorParent2!=null ? rulealt(earliestLeftAncestorParent2.getRuleIndex(), earliestLeftAncestorParent2.getAltNumber()) : -1, + getChildIndex(earliestLeftAncestorParent), + earliestLeftAncestorParent3!=null ? rulealt(earliestLeftAncestorParent3.getRuleIndex(), earliestLeftAncestorParent3.getAltNumber()) : -1, + getChildIndex(earliestLeftAncestorParent2), + earliestLeftAncestorParent4!=null ? rulealt(earliestLeftAncestorParent4.getRuleIndex(), earliestLeftAncestorParent4.getAltNumber()) : -1, + getChildIndex(earliestLeftAncestorParent3), // info 0, // dummy; we don't store file index into feature vector @@ -508,7 +462,6 @@ public static int[] getNodeFeatures(Map tokenToNodeMap, curToken.getCharPositionInLine() }; assert features.length == NUM_FEATURES; -// System.out.print(curToken+": "+CodekNNClassifier._toString(features)); return features; } @@ -518,6 +471,7 @@ public static int getMatchingSymbolOnDiffLine(InputDocument doc, { TerminalNode matchingLeftNode = getMatchingLeftSymbol(doc, node); if (matchingLeftNode != null) { +// System.out.println(node.getPayload()+" matches with "+matchingLeftNode.getSymbol()); int matchingLeftTokenLine = matchingLeftNode.getSymbol().getLine(); return matchingLeftTokenLine != line ? PAIR_ON_DIFF_LINE : PAIR_ON_SAME_LINE; } @@ -570,15 +524,6 @@ public static List viableLeftTokenTypes(ParserRuleContext node, return newPairs; } - public static Token findAlignedToken(List tokens, Token leftEdgeToken) { - for (Token t : tokens) { - if ( t.getCharPositionInLine() == leftEdgeToken.getCharPositionInLine() ) { - return t; - } - } - return null; - } - /** Search backwards from tokIndex into 'tokens' stream and get all on-channel * tokens on previous line with respect to token at tokIndex. * return empty list if none found. First token in returned list is @@ -628,7 +573,7 @@ public static String _toString(FeatureMetaData[] FEATURES, InputDocument doc, in for (int i=0; i0 ) buf.append(" "); - if ( i==INDEX_TYPE ) { + if ( i==INDEX_CUR_TYPE ) { buf.append("| "); // separate prev from current tokens } int displayWidth = FEATURES[i].type.displayWidth; @@ -656,7 +601,7 @@ public static String _toString(FeatureMetaData[] FEATURES, InputDocument doc, in case INFO_LINE: case INFO_CHARPOS: if ( features[i]>=0 ) { - buf.append(String.format("%"+displayWidth+"s", String.valueOf(features[i]))); + buf.append(String.format("%"+displayWidth+"s", StringUtils.center(String.valueOf(features[i]),displayWidth))); } else { buf.append(Tool.sequence(displayWidth, " ")); @@ -687,7 +632,7 @@ public static String featureNameHeader(FeatureMetaData[] FEATURES) { for (int i=0; i0 ) buf.append(" "); - if ( i==INDEX_TYPE ) { + if ( i==INDEX_CUR_TYPE ) { buf.append("| "); // separate prev from current tokens } int displayWidth = FEATURES[i].type.displayWidth; @@ -697,7 +642,7 @@ public static String featureNameHeader(FeatureMetaData[] FEATURES) { for (int i=0; i0 ) buf.append(" "); - if ( i==INDEX_TYPE ) { + if ( i==INDEX_CUR_TYPE ) { buf.append("| "); // separate prev from current tokens } int displayWidth = FEATURES[i].type.displayWidth; @@ -707,7 +652,7 @@ public static String featureNameHeader(FeatureMetaData[] FEATURES) { for (int i=0; i0 ) buf.append(" "); - if ( i==INDEX_TYPE ) { + if ( i==INDEX_CUR_TYPE ) { buf.append("| "); // separate prev from current tokens } int displayWidth = FEATURES[i].type.displayWidth; @@ -717,7 +662,7 @@ public static String featureNameHeader(FeatureMetaData[] FEATURES) { for (int i=0; i0 ) buf.append(" "); - if ( i==INDEX_TYPE ) { + if ( i==INDEX_CUR_TYPE ) { buf.append("| "); // separate prev from current tokens } int displayWidth = FEATURES[i].type.displayWidth; @@ -768,28 +713,28 @@ public static List getRealTokens(CommonTokenStream tokens) { return real; } - public static ParserRuleContext getParent(TerminalNode p) { - return parentClosure((ParserRuleContext)p.getParent()); - } - - /** Same as p.getParent() except we scan through chain rule nodes */ - public static ParserRuleContext getParent(ParserRuleContext p) { - if ( p==null ) return null; - ParserRuleContext lastValidParent = p.getParent(); - if ( lastValidParent==null ) return null; // must have hit the root - - return parentClosure(p.getParent()); - } - - // try to walk chain rules starting with the parent of the usual parent - public static ParserRuleContext parentClosure(ParserRuleContext p) { - ParserRuleContext lastValidParent = p; - ParserRuleContext q = lastValidParent.getParent(); - while ( q!=null && q.getChildCount()==1 ) { // while is a chain rule - lastValidParent = q; - q = q.getParent(); + public static int getChildIndex(ParseTree t) { + if ( t==null ) return -1; + ParseTree parent = t.getParent(); + if ( parent==null ) { + return -1; + } + // we know we have a parent now + // check to see if we are 2nd or beyond element in a sibling list + if ( t instanceof ParserRuleContext ) { + List siblings = ((ParserRuleContext)parent).getRuleContexts(((ParserRuleContext)t).getClass()); + if ( siblings.size()>1 && siblings.indexOf(t)>0 ) { + return CHILD_INDEX_LIST_ELEMENT; + } + } + // Either first of sibling list or not in a list. + // Figure out which child index t is of parent + for (int i = 0; i(); for (int i=0; i key = new Pair<>(pr, cr); diff --git a/java/src/org/antlr/codebuff/FeatureType.java b/java/src/org/antlr/codebuff/FeatureType.java index 6988c25..c1a8ee8 100644 --- a/java/src/org/antlr/codebuff/FeatureType.java +++ b/java/src/org/antlr/codebuff/FeatureType.java @@ -1,7 +1,7 @@ package org.antlr.codebuff; public enum FeatureType { - TOKEN(12), RULE(14), INT(7), BOOL(5), COL(7), + TOKEN(12), RULE(14), INT(12), BOOL(5), COL(7), INFO_FILE(15), INFO_LINE(4), INFO_CHARPOS(4), UNUSED(0); public int displayWidth; diff --git a/java/src/org/antlr/codebuff/Formatter.java b/java/src/org/antlr/codebuff/Formatter.java index 51eea68..983c47a 100644 --- a/java/src/org/antlr/codebuff/Formatter.java +++ b/java/src/org/antlr/codebuff/Formatter.java @@ -19,12 +19,14 @@ import static org.antlr.codebuff.CollectFeatures.CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN; import static org.antlr.codebuff.CollectFeatures.CAT_INJECT_NL; import static org.antlr.codebuff.CollectFeatures.CAT_INJECT_WS; +import static org.antlr.codebuff.CollectFeatures.CAT_NO_ALIGNMENT; import static org.antlr.codebuff.CollectFeatures.FEATURES_ALIGN; import static org.antlr.codebuff.CollectFeatures.FEATURES_INJECT_WS; import static org.antlr.codebuff.CollectFeatures.INDEX_FIRST_ON_LINE; -import static org.antlr.codebuff.CollectFeatures.INDEX_PREV_END_COLUMN; +import static org.antlr.codebuff.CollectFeatures.INDEX_MATCHING_TOKEN_DIFF_LINE; import static org.antlr.codebuff.CollectFeatures.MAX_CONTEXT_DIFF_THRESHOLD; import static org.antlr.codebuff.CollectFeatures.earliestAncestorStartingWithToken; +import static org.antlr.codebuff.CollectFeatures.getMatchingSymbolOnDiffLine; import static org.antlr.codebuff.CollectFeatures.getNodeFeatures; import static org.antlr.codebuff.CollectFeatures.getRealTokens; import static org.antlr.codebuff.CollectFeatures.getTokensOnPreviousLine; @@ -46,7 +48,6 @@ public class Formatter { protected Vector analysis = new Vector<>(); protected CodekNNClassifier nlwsClassifier; - protected CodekNNClassifier wsClassifier; protected CodekNNClassifier alignClassifier; protected int k; @@ -112,13 +113,14 @@ public String format() { public void processToken(int indexIntoRealTokens, int tokenIndexInStream) { CommonToken curToken = (CommonToken)tokens.get(tokenIndexInStream); String tokText = curToken.getText(); + TerminalNode node = tokenToNodeMap.get(curToken); emitCommentsToTheLeft(tokenIndexInStream); int[] features = getNodeFeatures(tokenToNodeMap, doc, tokenIndexInStream, line, tabSize); // must set "prev end column" value as token stream doesn't have it; // we're tracking it as we emit tokens - features[INDEX_PREV_END_COLUMN] = charPosInLine; +// features[INDEX_PREV_END_COLUMN] = charPosInLine; int injectNL_WS = nlwsClassifier.classify(k, features, corpus.injectWhitespace, MAX_CONTEXT_DIFF_THRESHOLD); int newlines = 0; @@ -130,23 +132,15 @@ else if ( (injectNL_WS&0xFF)==CAT_INJECT_WS ) { ws = CollectFeatures.unwscat(injectNL_WS); } - // getNodeFeatures() also doesn't know what line curToken is on. If \n, we need to find exemplars that start a line - features[INDEX_FIRST_ON_LINE] = newlines; // use \n prediction to match exemplars for alignment - - int align = alignClassifier.classify(k, features, corpus.align, MAX_CONTEXT_DIFF_THRESHOLD); - - TokenPositionAnalysis tokenPositionAnalysis = - getTokenAnalysis(features, indexIntoRealTokens, tokenIndexInStream, newlines, align, ws); - analysis.setSize(tokenIndexInStream+1); - analysis.set(tokenIndexInStream, tokenPositionAnalysis); - if ( ws==0 && cannotJoin(realTokens.get(indexIntoRealTokens-1), curToken) ) { // failsafe! ws = 1; } + int align = CAT_NO_ALIGNMENT; + if ( newlines>0 ) { output.append(Tool.newlines(newlines)); - line++; + line+=newlines; charPosInLine = 0; List tokensOnPreviousLine = getTokensOnPreviousLine(tokens, tokenIndexInStream, line); @@ -155,9 +149,15 @@ else if ( (injectNL_WS&0xFF)==CAT_INJECT_WS ) { firstTokenOnPrevLine = tokensOnPreviousLine.get(0); } - TerminalNode node = tokenToNodeMap.get(curToken); ParserRuleContext parent = (ParserRuleContext)node.getParent(); + // getNodeFeatures() doesn't know what line curToken is on. If \n, we need to find exemplars that start a line + features[INDEX_FIRST_ON_LINE] = newlines>0 ? 1 : 0; // use \n prediction to match exemplars for alignment + // if we decide to inject a newline, we better recompute this value before classifying alignment + features[INDEX_MATCHING_TOKEN_DIFF_LINE] = getMatchingSymbolOnDiffLine(doc, node, line); + + align = alignClassifier.classify(k, features, corpus.align, MAX_CONTEXT_DIFF_THRESHOLD); + if ( align==CAT_INDENT ) { if ( firstTokenOnPrevLine!=null ) { // if not on first line, we cannot indent int indentedCol = firstTokenOnPrevLine.getCharPositionInLine()+INDENT_LEVEL; @@ -169,10 +169,7 @@ else if ( (align&0xFF)==CAT_ALIGN_WITH_ANCESTOR_CHILD ) { int[] deltaChild = CollectFeatures.unaligncat(align); int deltaFromAncestor = deltaChild[0]; int childIndex = deltaChild[1]; - ParserRuleContext earliestLeftAncestor = earliestAncestorStartingWithToken(parent, curToken); - if ( earliestLeftAncestor==null ) { - earliestLeftAncestor = parent; - } + ParserRuleContext earliestLeftAncestor = earliestAncestorStartingWithToken(node, curToken); ParserRuleContext ancestor = CollectFeatures.getAncestor(earliestLeftAncestor, deltaFromAncestor); ParseTree child = ancestor.getChild(childIndex); Token start = null; @@ -194,10 +191,7 @@ else if ( child instanceof TerminalNode ){ } else if ( (align&0xFF)==CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN ) { int deltaFromAncestor = CollectFeatures.unindentcat(align); - ParserRuleContext earliestLeftAncestor = earliestAncestorStartingWithToken(parent, curToken); - if ( earliestLeftAncestor==null ) { - earliestLeftAncestor = parent; - } + ParserRuleContext earliestLeftAncestor = earliestAncestorStartingWithToken(node, curToken); ParserRuleContext ancestor = CollectFeatures.getAncestor(earliestLeftAncestor, deltaFromAncestor); Token start = ancestor.getStart(); int indentCol = start.getCharPositionInLine() + INDENT_LEVEL; @@ -211,6 +205,11 @@ else if ( (align&0xFF)==CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN ) { charPosInLine += ws; } + TokenPositionAnalysis tokenPositionAnalysis = + getTokenAnalysis(features, indexIntoRealTokens, tokenIndexInStream, newlines, align, ws); + analysis.setSize(tokenIndexInStream+1); + analysis.set(tokenIndexInStream, tokenPositionAnalysis); + // update Token object with position information now that we are about // to emit it. curToken.setLine(line); @@ -227,6 +226,10 @@ else if ( (align&0xFF)==CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN ) { /** Look into the token stream to get the comments to the left of current * token. Emit all whitespace and comments except for whitespace at the * end as we'll inject that per newline prediction. + * + * This assumes we are grooming not totally reformatting. + * We able to see original input stream for comment purposes. With all + * whitespace removed, we can't emit this stuff properly at moment. */ public void emitCommentsToTheLeft(int tokenIndexInStream) { List hiddenTokensToLeft = tokens.getHiddenTokensToLeft(tokenIndexInStream); @@ -270,7 +273,7 @@ public void emitCommentsToTheLeft(int tokenIndexInStream) { public TokenPositionAnalysis getTokenAnalysis(int[] features, int indexIntoRealTokens, int tokenIndexInStream, int injectNewline, - int alignWithPrevious, + int align, int ws) { CommonToken curToken = (CommonToken)tokens.get(tokenIndexInStream); @@ -286,12 +289,11 @@ public TokenPositionAnalysis getTokenAnalysis(int[] features, int indexIntoRealT boolean prevIsWS = prevToken.getChannel()==Token.HIDDEN_CHANNEL; // assume this means whitespace int actualNL = Tool.count(prevToken.getText(), '\n'); - int actualWS = Tool.count(prevToken.getText(), ' '); - String newlinePredictionString = String.format("### line %d: predicted %d \\n actual %s", + String newlinePredictionString = String.format("### line %d: predicted %d \\n actual ?", originalCurToken.getLine(), injectNewline, prevIsWS ? actualNL : "none"); - String alignPredictionString = String.format("### line %d: predicted %s actual %s", + String alignPredictionString = String.format("### line %d: predicted %d actual %s", originalCurToken.getLine(), - alignWithPrevious==1?"align":"unaligned", + align, "?"); String newlineAnalysis = newlinePredictionString+"\n"+ diff --git a/java/src/org/antlr/codebuff/Neighbor.java b/java/src/org/antlr/codebuff/Neighbor.java index da99415..fb04c9f 100644 --- a/java/src/org/antlr/codebuff/Neighbor.java +++ b/java/src/org/antlr/codebuff/Neighbor.java @@ -17,7 +17,7 @@ public String toString(FeatureMetaData[] FEATURES, List Y) { int[] X = corpus.X.get(corpusVectorIndex); InputDocument doc = corpus.documents.get(corpusVectorIndex); String features = CollectFeatures._toString(FEATURES, doc, X); - int line = CollectFeatures.getInfoLine(X); + int line = X[CollectFeatures.INDEX_INFO_LINE]; String lineText = doc.getLine(line); int col = X[CollectFeatures.INDEX_INFO_CHARPOS]; // insert a dot right before char position diff --git a/java/src/org/antlr/codebuff/kNNClassifier.java b/java/src/org/antlr/codebuff/kNNClassifier.java index 4fc890b..e298795 100644 --- a/java/src/org/antlr/codebuff/kNNClassifier.java +++ b/java/src/org/antlr/codebuff/kNNClassifier.java @@ -125,8 +125,8 @@ public Neighbor[] kNN(int[] unknown, int k, double distanceThreshold) { } public Neighbor[] distances(int[] unknown, double distanceThreshold) { - int curTokenRuleIndex = unknown[CollectFeatures.INDEX_RULE]; - int prevTokenRuleIndex = unknown[CollectFeatures.INDEX_PREV_RULE]; + int curTokenRuleIndex = unknown[CollectFeatures.INDEX_PREV_EARLIEST_RIGHT_ANCESTOR]; + int prevTokenRuleIndex = unknown[CollectFeatures.INDEX_EARLIEST_LEFT_ANCESTOR]; int pr = CollectFeatures.unrulealt(prevTokenRuleIndex)[0]; int cr = CollectFeatures.unrulealt(curTokenRuleIndex)[0]; Pair key = new Pair<>(pr, cr);