diff --git a/java/codebuff.ipr b/java/codebuff.ipr index 362a0ed..973d476 100644 --- a/java/codebuff.ipr +++ b/java/codebuff.ipr @@ -37,6 +37,24 @@ + + + + + + + + + + + + + + + + + + diff --git a/java/src/org/antlr/codebuff/CollectFeatures.java b/java/src/org/antlr/codebuff/CollectFeatures.java index 6b79c08..b8787f0 100644 --- a/java/src/org/antlr/codebuff/CollectFeatures.java +++ b/java/src/org/antlr/codebuff/CollectFeatures.java @@ -9,11 +9,10 @@ import org.antlr.v4.runtime.atn.ATN; import org.antlr.v4.runtime.misc.Pair; import org.antlr.v4.runtime.tree.ErrorNode; +import org.antlr.v4.runtime.tree.ParseTree; import org.antlr.v4.runtime.tree.ParseTreeListener; import org.antlr.v4.runtime.tree.ParseTreeWalker; import org.antlr.v4.runtime.tree.TerminalNode; -import org.antlr.v4.runtime.tree.Tree; -import org.antlr.v4.runtime.tree.Trees; import org.apache.commons.lang3.StringUtils; import java.io.File; @@ -25,7 +24,7 @@ import java.util.Map; public class CollectFeatures { - public static final double MAX_CONTEXT_DIFF_THRESHOLD = 0.6; + public static final double MAX_CONTEXT_DIFF_THRESHOLD = 0.20; // Feature values for pair on diff lines feature public static final int NOT_PAIR = -1; @@ -34,24 +33,32 @@ public class CollectFeatures { // Categories for alignment/indentation public static final int CAT_NO_ALIGNMENT = 0; - public static final int CAT_ALIGN_WITH_ANCESTOR_FIRST_TOKEN = 1; - public static final int CAT_ALIGN_WITH_ANCESTORS_PARENT_FIRST_TOKEN = 2; - public static final int CAT_ALIGN_WITH_LIST_FIRST_ELEMENT = 3; - public static final int CAT_ALIGN_WITH_PAIR = 4; + + /* We want to identify alignment with a child's start token of some parent + but that parent could be a number of levels up the tree. The next category + values indicate alignment from the current token's left ancestor's + parent then it's parent and so on. For category value: + + CAT_ALIGN_WITH_ANCESTOR_CHILD | delta<<8 | childindex<<16 + + current token is aligned with start token of child childindex, + delta levels up from ancestor. + */ + public static final int CAT_ALIGN_WITH_ANCESTOR_CHILD = 10; /* We want to identify indentation from a parent's start token but that parent could be a number of levels up the tree. The next category values indicate indentation from the current token's left ancestor's parent then it's parent and so on. For category value: - CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN + i + CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN | delta<<8 current token is indented from start token of node i levels up from ancestor. */ - public static final int CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN = 100; // left ancestor's first token is really current token + public static final int CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN = 20; // left ancestor's first token is really current token - public static final int CAT_INDENT = 200; + public static final int CAT_INDENT = 30; // indexes into feature vector @@ -61,22 +68,25 @@ public class CollectFeatures { public static final int INDEX_PREV_END_COLUMN = 3; public static final int INDEX_PREV_EARLIEST_ANCESTOR = 4; public static final int INDEX_TYPE = 5; - public static final int INDEX_FIRST_EL_OF_LIST = 6; // TODO: I don't think we can detect first element of list - public static final int INDEX_MATCHING_TOKEN_DIFF_LINE = 7; - public static final int INDEX_FIRST_ON_LINE = 8; // a \n right before this token? - public static final int INDEX_RULE = 9; // what rule are we in? - public static final int INDEX_EARLIEST_RIGHT_ANCESTOR = 10; - public static final int INDEX_EARLIEST_LEFT_ANCESTOR = 11; + public static final int INDEX_MATCHING_TOKEN_DIFF_LINE = 6; + public static final int INDEX_FIRST_ON_LINE = 7; // a \n right before this token? + public static final int INDEX_RULE = 8; // what rule are we in? + public static final int INDEX_EARLIEST_RIGHT_ANCESTOR = 9; + public static final int INDEX_EARLIEST_LEFT_ANCESTOR = 10; + public static final int INDEX_ANCESTORS_PARENT5_RULE = 11; public static final int INDEX_ANCESTORS_PARENT4_RULE = 12; public static final int INDEX_ANCESTORS_PARENT3_RULE = 13; - public static final int INDEX_ANCESTORS_PARENT2_RULE = 14; - public static final int INDEX_ANCESTORS_PARENT_RULE = 15; - public static final int INDEX_NEXT_TYPE = 16; - public static final int INDEX_INFO_FILE = 17; - public static final int INDEX_INFO_LINE = 18; - public static final int INDEX_INFO_CHARPOS = 19; - - public static final int NUM_FEATURES = 20; + public static final int INDEX_ANCESTORS_PARENT3_WID = 14; + public static final int INDEX_ANCESTORS_PARENT2_RULE = 15; + public static final int INDEX_ANCESTORS_PARENT2_WID = 16; + public static final int INDEX_ANCESTORS_PARENT_RULE = 17; + public static final int INDEX_ANCESTORS_PARENT_WID = 18; + public static final int INDEX_NEXT_TYPE = 19; + public static final int INDEX_INFO_FILE = 20; + public static final int INDEX_INFO_LINE = 21; + public static final int INDEX_INFO_CHARPOS = 22; + + public static final int NUM_FEATURES = 23; public static FeatureMetaData[] FEATURES_INJECT_NL = { new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-2)"}, 1), @@ -85,16 +95,19 @@ public class CollectFeatures { new FeatureMetaData(FeatureType.INT, new String[] {"LT(-1)", "end col"}, 0), new FeatureMetaData(FeatureType.RULE, new String[] {"LT(-1)", "right ancestor"}, 3), new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(1)"}, 2), - new FeatureMetaData(FeatureType.BOOL, new String[]{"Strt", "list"}, 3), new FeatureMetaData(FeatureType.BOOL, new String[]{"Pair", "dif\\n"}, 3), FeatureMetaData.UNUSED, new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "rule"}, 2), new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "right ancestor"}, 3), new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "left ancestor"}, 3), + new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^5"}, 1), new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^4"}, 1), new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^3"}, 1), + new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent^3 wid"}, 1), new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^2"}, 1), + new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent^2 wid"}, 1), new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent"}, 1), + new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent wid"}, 1), FeatureMetaData.UNUSED, new FeatureMetaData(FeatureType.INFO_FILE, new String[] {"", "file"}, 0), new FeatureMetaData(FeatureType.INFO_LINE, new String[] {"", "line"}, 0), @@ -102,24 +115,26 @@ public class CollectFeatures { }; public static FeatureMetaData[] FEATURES_ALIGN = { - new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-2)"}, 1), + FeatureMetaData.UNUSED, new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-1)"}, 2), new FeatureMetaData(FeatureType.RULE, new String[] {"LT(-1)", "rule"}, 2), FeatureMetaData.UNUSED, FeatureMetaData.UNUSED, new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(1)"}, 2), -// new FeatureMetaData(FeatureType.BOOL, new String[]{"Strt", "list"}, 3), - FeatureMetaData.UNUSED, FeatureMetaData.UNUSED, new FeatureMetaData(FeatureType.BOOL, new String[]{"Strt", "line"}, 3), new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "rule"}, 2), new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "right ancestor"}, 3), new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "left ancestor"}, 3), - new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^4"}, 1), - new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^3"}, 1), - new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^2"}, 1), - new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent"}, 1), - new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(2)"}, 1), + new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^5"}, 2), + new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^4"}, 2), + new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^3"}, 7), + new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "par^3 wid"}, 7), + new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^2"}, 7), + new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "par^2 wid"}, 7), + new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent"}, 3), + FeatureMetaData.UNUSED, + FeatureMetaData.UNUSED, new FeatureMetaData(FeatureType.INFO_FILE, new String[] {"", "file"}, 0), new FeatureMetaData(FeatureType.INFO_LINE, new String[] {"", "line"}, 0), new FeatureMetaData(FeatureType.INFO_CHARPOS, new String[] {"char", "pos"}, 0) @@ -133,15 +148,18 @@ public class CollectFeatures { new FeatureMetaData(FeatureType.RULE, new String[] {"LT(-1)", "right ancestor"}, 3), new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(1)"}, 3), FeatureMetaData.UNUSED, - FeatureMetaData.UNUSED, new FeatureMetaData(FeatureType.BOOL, new String[]{"Strt", "line"}, 3), new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "rule"}, 2), new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "right ancestor"}, 3), new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "left ancestor"}, 3), + new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^5"}, 1), new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^4"}, 1), new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^3"}, 1), + new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent^3 wid"}, 1), new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^2"}, 1), + new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent^2 wid"}, 1), new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent"}, 1), + new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent wid"}, 1), new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(2)"}, 1), new FeatureMetaData(FeatureType.INFO_FILE, new String[] {"", "file"}, 0), new FeatureMetaData(FeatureType.INFO_LINE, new String[] {"", "line"}, 0), @@ -155,16 +173,19 @@ public class CollectFeatures { new FeatureMetaData(FeatureType.INT, new String[] {"LT(-1)", "end col"}, 0), new FeatureMetaData(FeatureType.RULE, new String[] {"LT(-1)", "right ancestor"}, 3), new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(1)"}, 2), - new FeatureMetaData(FeatureType.BOOL, new String[]{"Strt", "list"}, 3), new FeatureMetaData(FeatureType.BOOL, new String[]{"Pair", "dif\\n"}, 3), new FeatureMetaData(FeatureType.BOOL, new String[]{"Strt", "line"}, 3), new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "rule"}, 2), new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "right ancestor"}, 3), new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "left ancestor"}, 3), + new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^5"}, 1), new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^4"}, 1), new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^3"}, 1), + new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent^3 wid"}, 1), new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^2"}, 1), + new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent^2 wid"}, 1), new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent"}, 1), + new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent wid"}, 1), new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(2)"}, 1), new FeatureMetaData(FeatureType.INFO_FILE, new String[] {"", "file"}, 0), new FeatureMetaData(FeatureType.INFO_LINE, new String[] {"", "line"}, 0), @@ -184,8 +205,6 @@ public class CollectFeatures { protected List indent = new ArrayList<>(); protected List align = new ArrayList<>(); - protected Token firstTokenOnLine = null; - protected int currentIndent = 0; protected Map tokenToNodeMap = null; @@ -204,7 +223,6 @@ public CollectFeatures(InputDocument doc, int tabSize, Map realTokens = getRealTokens(tokens); - firstTokenOnLine = realTokens.get(0); // init to first token of file for (int i = 2; i pair = + earliestAncestorWithChildStartingAtCharPos(earliestLeftAncestor.getParent(), curToken); + if ( pair!=null ) { + int deltaFromLeftAncestor = getDeltaToAncestor(earliestLeftAncestor, pair.a); + aligned = aligncat(deltaFromLeftAncestor, pair.b); +// System.out.printf("ALIGN %s %d %x\n", JavaParser.ruleNames[pair.a.getRuleIndex()], pair.b, aligned); } else if ( columnDelta!=0 ) { - ParserRuleContext earliestLeftAncestor = earliestAncestorStartingWithToken(parent, curToken); - ParserRuleContext ancestorParent = getParent(earliestLeftAncestor); int indentedFromPos = curToken.getCharPositionInLine()-Formatter.INDENT_LEVEL; ParserRuleContext indentParent = - earliestAncestorStartingAtCharPos(ancestorParent, indentedFromPos); + earliestAncestorStartingAtCharPos(earliestLeftAncestor.getParent(), indentedFromPos); if ( indentParent!=null ) { int deltaFromLeftAncestor = getDeltaToAncestor(earliestLeftAncestor, indentParent); - aligned = CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN+deltaFromLeftAncestor; + aligned = indentcat(deltaFromLeftAncestor); +// System.out.printf("INDENT %s %x\n", JavaParser.ruleNames[indentParent.getRuleIndex()], aligned); } else { aligned = CAT_INDENT; // indent standard amount @@ -346,85 +342,6 @@ public static boolean isAlignedWithFirstSiblingOfList(Map t return aligned; } - /** Return list of sibling if curToken's ancestor is in a list. - * Return null if curToken has not ancestor starting with curToken or - * if the ancestor has no siblings (same node type like StatementContext). - */ - public static List getListSiblings(Map tokenToNodeMap, - Token curToken) - { - TerminalNode node = tokenToNodeMap.get(curToken); - ParserRuleContext parent = (ParserRuleContext)node.getParent(); - ParserRuleContext earliestAncestor = earliestAncestorStartingWithToken(parent, curToken); - - if ( earliestAncestor!=null ) { - ParserRuleContext commonAncestor = earliestAncestor.getParent(); - List siblings = commonAncestor.getRuleContexts(earliestAncestor.getClass()); - if ( siblings.size()>1 ) { - return siblings; - } - } - return null; - } - - /** Is curToken the first statement of an slist, first arg of arglist, etc... */ - public static boolean isFirstSiblingOfList(Map tokenToNodeMap, - Token curToken) - { - TerminalNode node = tokenToNodeMap.get(curToken); - ParserRuleContext parent = (ParserRuleContext)node.getParent(); - ParserRuleContext earliestAncestor = earliestAncestorStartingWithToken(parent, curToken); - - if ( earliestAncestor!=null ) { - ParserRuleContext commonAncestor = earliestAncestor.getParent(); - List siblings = commonAncestor.getRuleContexts(earliestAncestor.getClass()); - if ( siblings.size()>1 ) { - ParserRuleContext firstSibling = siblings.get(0); - Token firstSiblingStartToken = firstSibling.getStart(); - return firstSiblingStartToken==curToken; - } - } - return false; - } - - /** Return number of steps to common ancestor whose first token is alignment anchor. - * Return null if no such common ancestor. - */ - public static ParserRuleContext getFirstTokenOfCommonAncestor( - ParserRuleContext root, - CommonTokenStream tokens, - int tokIndex, - int tabSize) - { - List tokensOnPreviousLine = getTokensOnPreviousLine(tokens, tokIndex, tokens.get(tokIndex).getLine()); - // look for alignment - if ( tokensOnPreviousLine.size()>0 ) { - Token curToken = tokens.get(tokIndex); - Token alignedToken = findAlignedToken(tokensOnPreviousLine, curToken); - tokens.seek(tokIndex); // seek so that LT(1) is tokens.get(i); - Token prevToken = tokens.LT(-1); - int prevIndent = tokensOnPreviousLine.get(0).getCharPositionInLine(); - int curIndent = curToken.getCharPositionInLine(); - boolean tabbed = curIndent>prevIndent && curIndent%tabSize==0; - boolean precedingNL = curToken.getLine()>prevToken.getLine(); - if ( precedingNL && - alignedToken!=null && - alignedToken!=tokensOnPreviousLine.get(0) && - !tabbed ) { - // if cur token is on new line and it lines up and it's not left edge, - // it's alignment not 0 indent -// printAlignment(tokens, curToken, tokensOnPreviousLine, alignedToken); - ParserRuleContext commonAncestor = Trees.getRootOfSubtreeEnclosingRegion(root, alignedToken.getTokenIndex(), curToken.getTokenIndex()); -// System.out.println("common ancestor: "+JavaParser.ruleNames[commonAncestor.getRuleIndex()]); - if ( commonAncestor.getStart()==alignedToken ) { - // aligned with first token of common ancestor - return commonAncestor; - } - } - } - return null; - } - /** Walk upwards from node while p.start == token; return null if there is * no ancestor starting at token. */ @@ -465,6 +382,36 @@ public ParserRuleContext earliestAncestorStartingAtCharPos(ParserRuleContext nod return null; } + /** Walk upwards from node until we find a child of p at char position and + * that child (token or start token) is first token on a line; + * return null if there is no such ancestor p. + */ + public Pair earliestAncestorWithChildStartingAtCharPos(ParserRuleContext node, Token t) { + int charpos = t.getCharPositionInLine(); + ParserRuleContext p = node; + while ( p!=null ) { + // check all children of p to see if one of them starts at charpos + for (int i = 0; i(p,i); + } + } + else { // must be token + TerminalNode c = (TerminalNode)child; + // check that we aren't aligned with self or element *after* us + if ( c.getSymbol().getTokenIndex()(p,i); + } + } + } + p = p.getParent(); + } + return null; + } + /** Return the number of hops to get to ancestor from node or -1 if we * don't find ancestor on path to root. */ @@ -498,20 +445,6 @@ public boolean isFirstOnLine(Token t) { return t.getLine()>prevToken.getLine(); } - public static ParserRuleContext deepestCommonAncestor(ParserRuleContext t1, ParserRuleContext t2) { - if ( t1==t2 ) return t1; - List extends Tree> t1_ancestors = Trees.getAncestors(t1); - List extends Tree> t2_ancestors = Trees.getAncestors(t2); - // first ancestor of t2 that matches an ancestor of t1 is the deepest common ancestor - for (Tree t : t1_ancestors) { - int i = t2_ancestors.indexOf(t); - if ( i>=0 ) { - return (ParserRuleContext)t2_ancestors.get(i); - } - } - return null; - } - public static int[] getNodeFeatures(Map tokenToNodeMap, InputDocument doc, int i, @@ -522,6 +455,7 @@ public static int[] getNodeFeatures(Map tokenToNodeMap, TerminalNode node = tokenToNodeMap.get(tokens.get(i)); if ( node==null ) { System.err.println("### No node associated with token "+tokens.get(i)); + return null; } Token curToken = node.getSymbol(); @@ -536,29 +470,23 @@ public static int[] getNodeFeatures(Map tokenToNodeMap, TerminalNode prevTerminalNode = tokenToNodeMap.get(prevToken); ParserRuleContext parent = (ParserRuleContext)prevTerminalNode.getParent(); int prevTokenRuleIndex = parent.getRuleIndex(); - int prevTokenRuleAltNum = parent.getAltNumber(); ParserRuleContext prevEarliestRightAncestor = earliestAncestorEndingWithToken(parent, prevToken); int prevEarliestAncestorRuleIndex = -1; int prevEarliestAncestorRuleAltNum = 0; - int prevEarliestAncestorWidth = -1; if ( prevEarliestRightAncestor!=null ) { prevEarliestAncestorRuleIndex = prevEarliestRightAncestor.getRuleIndex(); prevEarliestAncestorRuleAltNum = prevEarliestRightAncestor.getAltNumber(); - prevEarliestAncestorWidth = prevEarliestRightAncestor.stop.getStopIndex()-prevEarliestRightAncestor.start.getStartIndex()+1; } // Get context information for current token parent = (ParserRuleContext)node.getParent(); int curTokensParentRuleIndex = parent.getRuleIndex(); - int curTokensParentRuleAltNumber = parent.getAltNumber(); ParserRuleContext earliestLeftAncestor = earliestAncestorStartingWithToken(parent, curToken); - int earliestAncestorWidth = -1; int earliestLeftAncestorRuleIndex = -1; int earliestLeftAncestorRuleAlt = 0; if ( earliestLeftAncestor!=null ) { earliestLeftAncestorRuleIndex = earliestLeftAncestor.getRuleIndex(); earliestLeftAncestorRuleAlt = earliestLeftAncestor.getAltNumber(); - earliestAncestorWidth = earliestLeftAncestor.stop.getStopIndex()-earliestLeftAncestor.start.getStartIndex()+1; } ParserRuleContext earliestRightAncestor = earliestAncestorEndingWithToken(parent, curToken); @@ -572,14 +500,6 @@ public static int[] getNodeFeatures(Map tokenToNodeMap, int matchingSymbolOnDiffLine = getMatchingSymbolOnDiffLine(doc, node, line); - int sumEndColAndAncestorWidth = -1; - if ( earliestAncestorWidth>=0 ) { - sumEndColAndAncestorWidth = prevTokenEndCharPos+earliestAncestorWidth; - } - - // TODO: I don't think we can detect first element of list - boolean startOfList = isFirstSiblingOfList(tokenToNodeMap, curToken); - // Get some context from parse tree ParserRuleContext ancestorParent = null; ParserRuleContext ancestorParent2 = null; @@ -593,26 +513,30 @@ public static int[] getNodeFeatures(Map tokenToNodeMap, } ParserRuleContext ancestorParent3 = ancestorParent2!=null ? ancestorParent2.getParent() : null; ParserRuleContext ancestorParent4 = ancestorParent3!=null ? ancestorParent3.getParent() : null; + ParserRuleContext ancestorParent5 = ancestorParent4!=null ? ancestorParent4.getParent() : null; boolean curTokenStartsNewLine = window.get(2).getLine()>window.get(1).getLine(); int[] features = { window.get(0).getType(), window.get(1).getType(), - rulealt(prevTokenRuleIndex,prevTokenRuleAltNum), + rulealt(prevTokenRuleIndex,ATN.INVALID_ALT_NUMBER), // only match rule index prevTokenEndCharPos, rulealt(prevEarliestAncestorRuleIndex,prevEarliestAncestorRuleAltNum), window.get(2).getType(), // LT(1) - startOfList ? 1 : 0, matchingSymbolOnDiffLine, curTokenStartsNewLine ? 1 : 0, - rulealt(curTokensParentRuleIndex,curTokensParentRuleAltNumber), + rulealt(curTokensParentRuleIndex,ATN.INVALID_ALT_NUMBER), // we care what kind of thing but not more specifically here rulealt(earliestRightAncestorRuleIndex,earliestRightAncestorRuleAlt), rulealt(earliestLeftAncestorRuleIndex,earliestLeftAncestorRuleAlt), + ancestorParent5!=null ? rulealt(ancestorParent5.getRuleIndex(),ancestorParent5.getAltNumber()) : -1, ancestorParent4!=null ? rulealt(ancestorParent4.getRuleIndex(),ancestorParent4.getAltNumber()) : -1, ancestorParent3!=null ? rulealt(ancestorParent3.getRuleIndex(),ancestorParent3.getAltNumber()) : -1, + ancestorParent3!=null ? ancestorParent3.getChildCount() : 0, ancestorParent2!=null ? rulealt(ancestorParent2.getRuleIndex(),ancestorParent2.getAltNumber()) : -1, + ancestorParent2!=null ? ancestorParent2.getChildCount() : 0, rulealt(ancestorParent.getRuleIndex(),ancestorParent.getAltNumber()), // always at least token's parent exists + ancestorParent.getChildCount(), window.get(3).getType(), @@ -723,19 +647,6 @@ public static List getTokensOnPreviousLine(CommonTokenStream tokens, int return online; } - public static void printAlignment(CommonTokenStream tokens, Token curToken, List tokensOnPreviousLine, Token alignedToken) { - int alignedCol = alignedToken.getCharPositionInLine(); - int indent = tokensOnPreviousLine.get(0).getCharPositionInLine(); - int first = tokensOnPreviousLine.get(0).getTokenIndex(); - int last = tokensOnPreviousLine.get(tokensOnPreviousLine.size()-1).getTokenIndex(); - System.out.println(Tool.spaces(alignedCol-indent)+"\u2193"); - for (int j=first; j<=last; j++) { - System.out.print(tokens.get(j).getText()); - } - System.out.println(); - System.out.println(Tool.spaces(alignedCol-indent)+curToken.getText()); - } - public List getFeatures() { return features; } @@ -752,10 +663,6 @@ public List getAlign() { return align; } - public List getIndent() { - return indent; - } - public static String _toString(FeatureMetaData[] FEATURES, InputDocument doc, int[] features) { Vocabulary v = doc.parser.getVocabulary(); String[] ruleNames = doc.parser.getRuleNames(); @@ -891,7 +798,7 @@ public void exitEveryRule(ParserRuleContext ctx) { } public static List getRealTokens(CommonTokenStream tokens) { - List real = new ArrayList(); + List real = new ArrayList<>(); for (int i=0; i>16)&0xFFFF,ra&0xFFFF}; } + + public static int indentcat(int deltaFromLeftAncestor) { + return CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN | (deltaFromLeftAncestor<<8); + } + + public static int unindentcat(int v) { + return v >> 8 & 0xFFFF; + } + + public static int aligncat(int deltaFromLeftAncestor, int child) { + return CAT_ALIGN_WITH_ANCESTOR_CHILD | (deltaFromLeftAncestor<<8) | (child << 16); + } + + public static int[] unaligncat(int v) { + int deltaFromLeftAncestor = (v>>8)&0xFF; + int child = (v>>16)&0xFFFF; + return new int[] { deltaFromLeftAncestor, child }; + } } diff --git a/java/src/org/antlr/codebuff/Corpus.java b/java/src/org/antlr/codebuff/Corpus.java index 3ed73d0..3c54da4 100644 --- a/java/src/org/antlr/codebuff/Corpus.java +++ b/java/src/org/antlr/codebuff/Corpus.java @@ -86,7 +86,9 @@ public void buildTokenContextIndex() { for (int i=0; i key = new Pair<>(prevTokenRuleIndex, curTokenRuleIndex); + int pr = CollectFeatures.unrulealt(prevTokenRuleIndex)[0]; + int cr = CollectFeatures.unrulealt(curTokenRuleIndex)[0]; + Pair key = new Pair<>(pr, cr); List vectorIndexes = curAndPrevTokenRuleIndexToVectorsMap.get(key); if ( vectorIndexes==null ) { vectorIndexes = new ArrayList<>(); diff --git a/java/src/org/antlr/codebuff/Formatter.java b/java/src/org/antlr/codebuff/Formatter.java index 0ecb16a..726ad45 100644 --- a/java/src/org/antlr/codebuff/Formatter.java +++ b/java/src/org/antlr/codebuff/Formatter.java @@ -6,16 +6,14 @@ import org.antlr.v4.runtime.Token; import org.antlr.v4.runtime.WritableToken; import org.antlr.v4.runtime.misc.Interval; +import org.antlr.v4.runtime.tree.ParseTree; import org.antlr.v4.runtime.tree.TerminalNode; import java.util.List; import java.util.Map; import java.util.Vector; -import static org.antlr.codebuff.CollectFeatures.CAT_ALIGN_WITH_ANCESTORS_PARENT_FIRST_TOKEN; -import static org.antlr.codebuff.CollectFeatures.CAT_ALIGN_WITH_ANCESTOR_FIRST_TOKEN; -import static org.antlr.codebuff.CollectFeatures.CAT_ALIGN_WITH_LIST_FIRST_ELEMENT; -import static org.antlr.codebuff.CollectFeatures.CAT_ALIGN_WITH_PAIR; +import static org.antlr.codebuff.CollectFeatures.CAT_ALIGN_WITH_ANCESTOR_CHILD; import static org.antlr.codebuff.CollectFeatures.CAT_INDENT; import static org.antlr.codebuff.CollectFeatures.CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN; import static org.antlr.codebuff.CollectFeatures.CAT_NO_ALIGNMENT; @@ -25,15 +23,11 @@ import static org.antlr.codebuff.CollectFeatures.INDEX_FIRST_ON_LINE; import static org.antlr.codebuff.CollectFeatures.INDEX_PREV_END_COLUMN; import static org.antlr.codebuff.CollectFeatures.MAX_CONTEXT_DIFF_THRESHOLD; -import static org.antlr.codebuff.CollectFeatures.earliestAncestorEndingWithToken; import static org.antlr.codebuff.CollectFeatures.earliestAncestorStartingWithToken; -import static org.antlr.codebuff.CollectFeatures.getListSiblings; -import static org.antlr.codebuff.CollectFeatures.getMatchingLeftSymbol; import static org.antlr.codebuff.CollectFeatures.getNodeFeatures; import static org.antlr.codebuff.CollectFeatures.getRealTokens; import static org.antlr.codebuff.CollectFeatures.getTokensOnPreviousLine; import static org.antlr.codebuff.CollectFeatures.indexTree; -import static org.antlr.codebuff.CollectFeatures.isAlignedWithFirstSiblingOfList; public class Formatter { public static final int INDENT_LEVEL = 4; @@ -75,6 +69,7 @@ public Formatter(Corpus corpus, InputDocument doc, int tabSize) { wsClassifier = new CodekNNClassifier(corpus, FEATURES_INJECT_WS); alignClassifier = new CodekNNClassifier(corpus, FEATURES_ALIGN); // k = (int)Math.sqrt(corpus.X.size()); +// k = 7; k = 11; this.tabSize = tabSize; } @@ -154,7 +149,6 @@ public void processToken(int indexIntoRealTokens, int tokenIndexInStream) { TerminalNode node = tokenToNodeMap.get(curToken); ParserRuleContext parent = (ParserRuleContext)node.getParent(); - ParserRuleContext earliestRightAncestor = earliestAncestorEndingWithToken(parent, curToken); switch ( align ) { case CAT_INDENT : @@ -164,47 +158,43 @@ public void processToken(int indexIntoRealTokens, int tokenIndexInStream) { output.append(Tool.spaces(indentedCol)); } break; - case CAT_ALIGN_WITH_ANCESTOR_FIRST_TOKEN : - if ( earliestRightAncestor!=null ) { - Token earliestRightAncestorStart = earliestRightAncestor.getStart(); - int linedUpCol = earliestRightAncestorStart.getCharPositionInLine(); - charPosInLine = linedUpCol; - output.append(Tool.spaces(linedUpCol)); - } - break; - case CAT_ALIGN_WITH_ANCESTORS_PARENT_FIRST_TOKEN : - if ( earliestRightAncestor!=null ) { - ParserRuleContext earliestAncestorParent = earliestRightAncestor.getParent(); - if ( earliestAncestorParent!=null ) { - Token earliestAncestorParentStart = earliestAncestorParent.getStart(); - int linedUpCol = earliestAncestorParentStart.getCharPositionInLine(); - charPosInLine = linedUpCol; - output.append(Tool.spaces(linedUpCol)); - } - } - break; - case CAT_ALIGN_WITH_LIST_FIRST_ELEMENT : - List listSiblings = getListSiblings(tokenToNodeMap, curToken); - if ( listSiblings!=null ) { - ParserRuleContext firstSibling = listSiblings.get(0); - int linedUpCol = firstSibling.getStart().getCharPositionInLine(); - charPosInLine = linedUpCol; - output.append(Tool.spaces(linedUpCol)); - } - break; - case CAT_ALIGN_WITH_PAIR : - TerminalNode matchingLeftSymbol = getMatchingLeftSymbol(doc, node); - int linedUpCol = matchingLeftSymbol.getSymbol().getCharPositionInLine(); - charPosInLine = linedUpCol; - output.append(Tool.spaces(linedUpCol)); - break; case CAT_NO_ALIGNMENT : break; default : - if ( align>=CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN ) { - int deltaFromAncestor = align - CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN; + if ( (align&0xFF)==CAT_ALIGN_WITH_ANCESTOR_CHILD ) { + int[] deltaChild = CollectFeatures.unaligncat(align); + int deltaFromAncestor = deltaChild[0]; + int childIndex = deltaChild[1]; ParserRuleContext earliestLeftAncestor = earliestAncestorStartingWithToken(parent, curToken); + if ( earliestLeftAncestor==null ) { + earliestLeftAncestor = parent; + } + ParserRuleContext ancestor = CollectFeatures.getAncestor(earliestLeftAncestor, deltaFromAncestor); + ParseTree child = ancestor.getChild(childIndex); + Token start = null; + if ( child instanceof ParserRuleContext ) { + start = ((ParserRuleContext) child).getStart(); + } + else if ( child instanceof TerminalNode ){ + start = ((TerminalNode)child).getSymbol(); + } + else { + // uh oh. + System.err.println("Whoops. Tried access invalid child"); + } + if ( start!=null ) { + int indentCol = start.getCharPositionInLine(); + charPosInLine = indentCol; + output.append(Tool.spaces(indentCol)); + } + } + else if ( (align&0xFF)==CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN ) { + int deltaFromAncestor = CollectFeatures.unindentcat(align); + ParserRuleContext earliestLeftAncestor = earliestAncestorStartingWithToken(parent, curToken); + if ( earliestLeftAncestor==null ) { + earliestLeftAncestor = parent; + } ParserRuleContext ancestor = CollectFeatures.getAncestor(earliestLeftAncestor, deltaFromAncestor); Token start = ancestor.getStart(); int indentCol = start.getCharPositionInLine() + INDENT_LEVEL; @@ -251,13 +241,12 @@ public TokenPositionAnalysis getTokenAnalysis(int[] features, int indexIntoRealT boolean prevIsWS = prevToken.getType()==JavaLexer.WS; int actualNL = Tool.count(prevToken.getText(), '\n'); int actualWS = Tool.count(prevToken.getText(), ' '); - boolean actualAlign = isAlignedWithFirstSiblingOfList(tokenToNodeMap, tokens, curToken); String newlinePredictionString = String.format("### line %d: predicted %d \\n actual %s", originalCurToken.getLine(), injectNewline, prevIsWS ? actualNL : "none"); String alignPredictionString = String.format("### line %d: predicted %s actual %s", originalCurToken.getLine(), alignWithPrevious==1?"align":"unaligned", - actualAlign?"align":"unaligned"); + "?"); String wsPredictionString = String.format("### line %d: predicted %d ' ' actual %s", originalCurToken.getLine(), ws, prevIsWS ? actualWS : "none"); if ( failsafeTriggered ) { diff --git a/java/src/org/antlr/codebuff/InputDocument.java b/java/src/org/antlr/codebuff/InputDocument.java index 5224052..61952ff 100644 --- a/java/src/org/antlr/codebuff/InputDocument.java +++ b/java/src/org/antlr/codebuff/InputDocument.java @@ -41,7 +41,10 @@ public String getLine(int line) { if ( lines==null ) { lines = Arrays.asList(content.split("\n")); } - return lines.get(line-1); + if ( line>0 ) { + return lines.get(line-1); + } + return null; } public double getIncorrectWSRate() { diff --git a/java/src/org/antlr/codebuff/Neighbor.java b/java/src/org/antlr/codebuff/Neighbor.java index 7ca3358..da99415 100644 --- a/java/src/org/antlr/codebuff/Neighbor.java +++ b/java/src/org/antlr/codebuff/Neighbor.java @@ -21,7 +21,12 @@ public String toString(FeatureMetaData[] FEATURES, List Y) { String lineText = doc.getLine(line); int col = X[CollectFeatures.INDEX_INFO_CHARPOS]; // insert a dot right before char position - lineText = lineText.substring(0,col) + '\u00B7' + lineText.substring(col,lineText.length()); - return String.format("%s (cat=%d,d=%1.3f): %s", features, Y.get(corpusVectorIndex), distance, lineText); + if ( lineText!=null ) { + lineText = lineText.substring(0, col)+'\u00B7'+lineText.substring(col, lineText.length()); + } + int cat = Y.get(corpusVectorIndex); + int[] elements = CollectFeatures.unaligncat(cat); + String display = String.format("%d|%d|%d", cat&0xFF, elements[0], elements[1]); + return String.format("%s (cat=%s,d=%1.3f): %s", features, display, distance, lineText); } } diff --git a/java/src/org/antlr/codebuff/TokenContext.java b/java/src/org/antlr/codebuff/TokenContext.java deleted file mode 100644 index 8d7390a..0000000 --- a/java/src/org/antlr/codebuff/TokenContext.java +++ /dev/null @@ -1,34 +0,0 @@ -package org.antlr.codebuff; - -import java.util.Arrays; - -public class TokenContext { - public final int[] tokens; - - public TokenContext(int[] tokens) { - this.tokens = tokens; - } - - public TokenContext(int t1, int t2, int t3, int t4) { - this(new int[]{t1,t2,t3,t4}); - } - - @Override - public int hashCode() { - int h = tokens[0]; - h = h << 7 + tokens[1]; - h = h << 7 + tokens[2]; - h = h << 7 + tokens[3]; - return h; - } - - @Override - public boolean equals(Object obj) { - if ( obj==this ) return true; - if ( obj.hashCode()!=this.hashCode() ) return false; - if ( obj.getClass()!=TokenContext.class ) return false; - TokenContext other = (TokenContext)obj; - return Arrays.equals(this.tokens, other.tokens); - } - -} diff --git a/java/src/org/antlr/codebuff/TokenPositionAnalysis.java b/java/src/org/antlr/codebuff/TokenPositionAnalysis.java index 6f5d034..4826b20 100644 --- a/java/src/org/antlr/codebuff/TokenPositionAnalysis.java +++ b/java/src/org/antlr/codebuff/TokenPositionAnalysis.java @@ -7,9 +7,6 @@ public class TokenPositionAnalysis { public String ws = "n/a"; public String align = "n/a"; - public TokenPositionAnalysis() { - } - public TokenPositionAnalysis(String newline, String align, String ws) { this.align = align; this.newline = newline; diff --git a/java/src/org/antlr/codebuff/Tool.java b/java/src/org/antlr/codebuff/Tool.java index b49c13b..b25b710 100644 --- a/java/src/org/antlr/codebuff/Tool.java +++ b/java/src/org/antlr/codebuff/Tool.java @@ -14,10 +14,7 @@ import org.antlr.v4.runtime.misc.Pair; import org.antlr.v4.runtime.tree.ParseTreeWalker; -import java.io.BufferedWriter; import java.io.File; -import java.io.FileWriter; -import java.io.IOException; import java.lang.reflect.Constructor; import java.lang.reflect.Method; import java.nio.file.FileSystems; @@ -119,21 +116,6 @@ public static Corpus train(String rootDir, return corpus; } - public void saveCSV(List documents, String dir) throws IOException { - FileWriter fw = new FileWriter(dir+"/style.csv"); - BufferedWriter bw = new BufferedWriter(fw); -// bw.write(Utils.join(CollectFeatures.FEATURE_NAMES, ", ")); - bw.write("\n"); - for (InputDocument doc : documents) { - for (int[] record : doc.featureVectors) { - String r = join(record, ", "); - bw.write(r); - bw.write('\n'); - } - } - bw.close(); - } - public static Corpus processSampleDocs(List docs, Class extends Lexer> lexerClass, Class extends Parser> parserClass, @@ -164,9 +146,7 @@ public static Corpus processSampleDocs(List docs, } /** Parse document, save feature vectors to the doc but return it also */ - public static void process(InputDocument doc, int tabSize, Map>> ruleToPairsBag) - throws Exception - { + public static void process(InputDocument doc, int tabSize, Map>> ruleToPairsBag) { CollectFeatures collector = new CollectFeatures(doc, tabSize, ruleToPairsBag); collector.computeFeatureVectors(); @@ -234,7 +214,7 @@ public static List load(List fileNames, int tabSize) throws Exception { - List input = new ArrayList(fileNames.size()); + List input = new ArrayList<>(fileNames.size()); int i = 0; for (String f : fileNames) { InputDocument doc = load(f, lexerClass, tabSize); @@ -281,17 +261,17 @@ public static InputDocument load(String fileName, } public static List getFilenames(File f, String inputFilePattern) throws Exception { - List files = new ArrayList(); + List files = new ArrayList<>(); getFilenames_(f, inputFilePattern, files); return files; } - public static void getFilenames_(File f, String inputFilePattern, List files) throws Exception { + public static void getFilenames_(File f, String inputFilePattern, List files) { // If this is a directory, walk each file/dir in that directory if (f.isDirectory()) { String flist[] = f.list(); - for (int i=0; i < flist.length; i++) { - getFilenames_(new File(f, flist[i]), inputFilePattern, files); + for (String aFlist : flist) { + getFilenames_(new File(f, aFlist), inputFilePattern, files); } } @@ -338,7 +318,6 @@ public static List copy(CommonTokenStream tokens) { List copy = new ArrayList<>(); tokens.fill(); for (Token t : tokens.getTokens()) { - CommonToken ct = (CommonToken)t; copy.add(new CommonToken(t)); } return copy; @@ -364,6 +343,7 @@ public static double weightedL0_Distance(FeatureMetaData[] featureTypes, int[] A for (int i=0; i getVotesBag(Neighbor[] kNN, int k, int[] unknown, List"+votes); kNN = Arrays.copyOfRange(kNN, 0, Math.min(k, kNN.length)); StringBuilder buf = new StringBuilder(); - for (int i = 0; i0 ) { kNN = Arrays.copyOfRange(kNN, 0, Math.min(k, kNN.length)); - for (int i = 0; i getCategoryDistanceMap(Neighbor[] kNN, int k, int[] unknown, List Y) { - Map catToDist = new HashMap<>(); - for (int i = 0; i key = new Pair<>(prevTokenRuleIndex, curTokenRuleIndex); + int pr = CollectFeatures.unrulealt(prevTokenRuleIndex)[0]; + int cr = CollectFeatures.unrulealt(curTokenRuleIndex)[0]; + Pair key = new Pair<>(pr, cr); List vectorIndexesMatchingTokenContext = corpus.curAndPrevTokenRuleIndexToVectorsMap.get(key); List distances = new ArrayList<>(); if ( vectorIndexesMatchingTokenContext==null ) { @@ -154,8 +135,7 @@ public Neighbor[] distances(int[] unknown, double distanceThreshold) { } else { int n = vectorIndexesMatchingTokenContext.size(); // num training samples - for (int i = 0; i