diff --git a/java/src/org/antlr/codebuff/CollectFeatures.java b/java/src/org/antlr/codebuff/CollectFeatures.java index 3e99abb..eabed86 100644 --- a/java/src/org/antlr/codebuff/CollectFeatures.java +++ b/java/src/org/antlr/codebuff/CollectFeatures.java @@ -32,6 +32,11 @@ public class CollectFeatures { public static final int PAIR_ON_SAME_LINE = 0; public static final int PAIR_ON_DIFF_LINE = 1; + // Categories for newline, whitespace. CAT_INJECT_NL|(n<<8) or CAT_INJECT_WS|(n<<8) + public static final int CAT_NO_WS = 0; + public static final int CAT_INJECT_NL = 100; + public static final int CAT_INJECT_WS = 200; + // Categories for alignment/indentation public static final int CAT_NO_ALIGNMENT = 0; @@ -89,7 +94,7 @@ public class CollectFeatures { public static final int NUM_FEATURES = 23; - public static FeatureMetaData[] FEATURES_INJECT_NL = { + public static FeatureMetaData[] FEATURES_INJECT_WS = { // inject ws or nl new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-2)"}, 1), new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-1)"}, 2), new FeatureMetaData(FeatureType.RULE, new String[] {"LT(-1)", "rule"}, 2), @@ -141,32 +146,6 @@ public class CollectFeatures { new FeatureMetaData(FeatureType.INFO_CHARPOS, new String[] {"char", "pos"}, 0) }; - public static FeatureMetaData[] FEATURES_INJECT_WS = { - new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-2)"}, 1), - new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-1)"}, 2), - new FeatureMetaData(FeatureType.RULE, new String[] {"LT(-1)", "rule"}, 2), - FeatureMetaData.UNUSED, - new FeatureMetaData(FeatureType.RULE, new String[] {"LT(-1)", "right ancestor"}, 3), - new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(1)"}, 3), - FeatureMetaData.UNUSED, - new FeatureMetaData(FeatureType.BOOL, new String[]{"Strt", "line"}, 3), - new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "rule"}, 2), - new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "right 
ancestor"}, 3), - new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "left ancestor"}, 3), - new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^5"}, 1), - new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^4"}, 1), - new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^3"}, 1), - new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent^3 wid"}, 1), - new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^2"}, 1), - new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent^2 wid"}, 1), - new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent"}, 1), - new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent wid"}, 1), - new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(2)"}, 1), - new FeatureMetaData(FeatureType.INFO_FILE, new String[] {"", "file"}, 0), - new FeatureMetaData(FeatureType.INFO_LINE, new String[] {"", "line"}, 0), - new FeatureMetaData(FeatureType.INFO_CHARPOS, new String[] {"char", "pos"}, 0) - }; - public static FeatureMetaData[] FEATURES_ALL = { new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-2)"}, 1), new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-1)"}, 2), @@ -201,9 +180,7 @@ public class CollectFeatures { protected ParserRuleContext root; protected CommonTokenStream tokens; // track stream so we can examine previous tokens protected List features = new ArrayList<>(); - protected List injectNewlines = new ArrayList<>(); - protected List injectWS = new ArrayList<>(); - protected List indent = new ArrayList<>(); + protected List injectWhitespace = new ArrayList<>(); protected List align = new ArrayList<>(); protected int currentIndent = 0; @@ -246,7 +223,20 @@ public void computeFeatureVectorForToken(int i) { int precedingNL = getPrecedingNL(tokens, i); // how many lines to inject - this.injectNewlines.add(precedingNL); + int ws = 0; + if ( 
precedingNL==0 ) { + ws = curToken.getCharPositionInLine() - + (prevToken.getCharPositionInLine()+prevToken.getText().length()); + } + + int injectNL_WS = CAT_NO_WS; + if ( precedingNL>0 ) { + injectNL_WS = nlcat(precedingNL); + } + else if ( ws>0 ) { + injectNL_WS = wscat(ws); + } + this.injectWhitespace.add(injectNL_WS); int columnDelta = 0; if ( precedingNL>0 ) { // && aligned!=1 ) { @@ -259,14 +249,6 @@ public void computeFeatureVectorForToken(int i) { aligned = getAlignmentCategory(node, curToken, columnDelta); } - int ws = 0; - if ( precedingNL==0 ) { - ws = curToken.getCharPositionInLine() - - (prevToken.getCharPositionInLine()+prevToken.getText().length()); - } - - this.injectWS.add(ws); // likely negative if precedingNL - this.align.add(aligned); this.features.add(features); @@ -631,12 +613,8 @@ public List getFeatures() { return features; } - public List getInjectNewlines() { - return injectNewlines; - } - - public List getInjectWS() { - return injectWS; + public List getInjectWhitespace() { + return injectWhitespace; } public List getAlign() { @@ -843,4 +821,20 @@ public static int[] unaligncat(int v) { int child = (v>>16)&0xFFFF; return new int[] { deltaFromLeftAncestor, child }; } + + public static int wscat(int n) { + return CAT_INJECT_WS | (n<<8); + } + + public static int nlcat(int n) { + return CAT_INJECT_NL | (n<<8); + } + + public static int unwscat(int v) { + return v >> 8 & 0xFFFF; + } + + public static int unnlcat(int v) { + return v >> 8 & 0xFFFF; + } } diff --git a/java/src/org/antlr/codebuff/Corpus.java b/java/src/org/antlr/codebuff/Corpus.java index 3c54da4..9953a8b 100644 --- a/java/src/org/antlr/codebuff/Corpus.java +++ b/java/src/org/antlr/codebuff/Corpus.java @@ -18,9 +18,8 @@ public class Corpus { List documents; // an entry for each X List X; - List injectNewlines; + List injectWhitespace; List align; // steps to common ancestor whose first token is alignment anchor - List injectWS; /** an index to narrow down the number of vectors 
we compute distance() on each classification. * The key is (previous token's rule index, current token's rule index). It yields @@ -30,14 +29,12 @@ public class Corpus { public Corpus(List documents, List X, - List injectNewlines, - List align, - List injectWS) + List injectWhitespace, + List align) { this.documents = documents; this.X = X; - this.injectNewlines = injectNewlines; - this.injectWS = injectWS; + this.injectWhitespace = injectWhitespace; this.align = align; } @@ -65,15 +62,12 @@ public void randomShuffleInPlace() { X.set(i, X.get(j)); X.set(j, tmp); // And now swap all prediction lists - Integer tmpI = injectNewlines.get(i); - injectNewlines.set(i, injectNewlines.get(j)); - injectNewlines.set(j, tmpI); + Integer tmpI = injectWhitespace.get(i); + injectWhitespace.set(i, injectWhitespace.get(j)); + injectWhitespace.set(j, tmpI); tmpI = align.get(i); align.set(i, align.get(j)); align.set(j, tmpI); - tmpI = injectWS.get(i); - injectWS.set(i, injectWS.get(j)); - injectWS.set(j, tmpI); // Finally, swap documents InputDocument tmpD = documents.get(i); documents.set(i, documents.get(j)); diff --git a/java/src/org/antlr/codebuff/Formatter.java b/java/src/org/antlr/codebuff/Formatter.java index aa3ea42..51eea68 100644 --- a/java/src/org/antlr/codebuff/Formatter.java +++ b/java/src/org/antlr/codebuff/Formatter.java @@ -17,8 +17,9 @@ import static org.antlr.codebuff.CollectFeatures.CAT_ALIGN_WITH_ANCESTOR_CHILD; import static org.antlr.codebuff.CollectFeatures.CAT_INDENT; import static org.antlr.codebuff.CollectFeatures.CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN; +import static org.antlr.codebuff.CollectFeatures.CAT_INJECT_NL; +import static org.antlr.codebuff.CollectFeatures.CAT_INJECT_WS; import static org.antlr.codebuff.CollectFeatures.FEATURES_ALIGN; -import static org.antlr.codebuff.CollectFeatures.FEATURES_INJECT_NL; import static org.antlr.codebuff.CollectFeatures.FEATURES_INJECT_WS; import static org.antlr.codebuff.CollectFeatures.INDEX_FIRST_ON_LINE; import 
static org.antlr.codebuff.CollectFeatures.INDEX_PREV_END_COLUMN; @@ -44,7 +45,7 @@ public class Formatter { protected Vector analysis = new Vector<>(); - protected CodekNNClassifier newlineClassifier; + protected CodekNNClassifier nlwsClassifier; protected CodekNNClassifier wsClassifier; protected CodekNNClassifier alignClassifier; protected int k; @@ -65,8 +66,7 @@ public Formatter(Corpus corpus, InputDocument doc, int tabSize) { this.tokens = doc.tokens; this.originalTokens = Tool.copy(tokens); Tool.wipeLineAndPositionInfo(tokens); - newlineClassifier = new CodekNNClassifier(corpus, FEATURES_INJECT_NL); - wsClassifier = new CodekNNClassifier(corpus, FEATURES_INJECT_WS); + nlwsClassifier = new CodekNNClassifier(corpus, FEATURES_INJECT_WS); alignClassifier = new CodekNNClassifier(corpus, FEATURES_ALIGN); // k = (int)Math.sqrt(corpus.X.size()); // k = 7; @@ -120,17 +120,23 @@ public void processToken(int indexIntoRealTokens, int tokenIndexInStream) { // we're tracking it as we emit tokens features[INDEX_PREV_END_COLUMN] = charPosInLine; - int injectNewline = newlineClassifier.classify(k, features, corpus.injectNewlines, MAX_CONTEXT_DIFF_THRESHOLD); + int injectNL_WS = nlwsClassifier.classify(k, features, corpus.injectWhitespace, MAX_CONTEXT_DIFF_THRESHOLD); + int newlines = 0; + int ws = 0; + if ( (injectNL_WS&0xFF)==CAT_INJECT_NL ) { + newlines = CollectFeatures.unnlcat(injectNL_WS); + } + else if ( (injectNL_WS&0xFF)==CAT_INJECT_WS ) { + ws = CollectFeatures.unwscat(injectNL_WS); + } // getNodeFeatures() also doesn't know what line curToken is on. 
If \n, we need to find exemplars that start a line - features[INDEX_FIRST_ON_LINE] = injectNewline; // use \n prediction to match exemplars for alignment + features[INDEX_FIRST_ON_LINE] = newlines; // use \n prediction to match exemplars for alignment int align = alignClassifier.classify(k, features, corpus.align, MAX_CONTEXT_DIFF_THRESHOLD); - int ws = wsClassifier.classify(k, features, corpus.injectWS, MAX_CONTEXT_DIFF_THRESHOLD); - TokenPositionAnalysis tokenPositionAnalysis = - getTokenAnalysis(features, indexIntoRealTokens, tokenIndexInStream, injectNewline, align, ws); + getTokenAnalysis(features, indexIntoRealTokens, tokenIndexInStream, newlines, align, ws); analysis.setSize(tokenIndexInStream+1); analysis.set(tokenIndexInStream, tokenPositionAnalysis); @@ -138,8 +144,8 @@ public void processToken(int indexIntoRealTokens, int tokenIndexInStream) { ws = 1; } - if ( injectNewline>0 ) { - output.append(Tool.newlines(injectNewline)); + if ( newlines>0 ) { + output.append(Tool.newlines(newlines)); line++; charPosInLine = 0; @@ -153,7 +159,7 @@ public void processToken(int indexIntoRealTokens, int tokenIndexInStream) { ParserRuleContext parent = (ParserRuleContext)node.getParent(); if ( align==CAT_INDENT ) { - if ( firstTokenOnPrevLine!=null ) { // if not on first line, we can indent indent + if ( firstTokenOnPrevLine!=null ) { // if not on first line, we can indent int indentedCol = firstTokenOnPrevLine.getCharPositionInLine()+INDENT_LEVEL; charPosInLine = indentedCol; output.append(Tool.spaces(indentedCol)); @@ -287,23 +293,14 @@ public TokenPositionAnalysis getTokenAnalysis(int[] features, int indexIntoRealT originalCurToken.getLine(), alignWithPrevious==1?"align":"unaligned", "?"); - String wsPredictionString = String.format("### line %d: predicted %d ' ' actual %s", - originalCurToken.getLine(), ws, prevIsWS ? 
actualWS : "none"); - if ( failsafeTriggered ) { - wsPredictionString += " (failsafe triggered)"; - } - String newlineAnalysis = newlinePredictionString+"\n"+ - newlineClassifier.getPredictionAnalysis(doc, k, features, corpus.injectNewlines, - MAX_CONTEXT_DIFF_THRESHOLD); + nlwsClassifier.getPredictionAnalysis(doc, k, features, corpus.injectWhitespace, + MAX_CONTEXT_DIFF_THRESHOLD); String alignAnalysis =alignPredictionString+"\n"+ alignClassifier.getPredictionAnalysis(doc, k, features, corpus.align, MAX_CONTEXT_DIFF_THRESHOLD); - String wsAnalysis =wsPredictionString+"\n"+ - wsClassifier.getPredictionAnalysis(doc, k, features, corpus.injectWS, - MAX_CONTEXT_DIFF_THRESHOLD); - return new TokenPositionAnalysis(newlineAnalysis, alignAnalysis, wsAnalysis); + return new TokenPositionAnalysis(newlineAnalysis, alignAnalysis, "n/a"); } /** Do not join two words like "finaldouble" or numbers like "3double", diff --git a/java/src/org/antlr/codebuff/InputDocument.java b/java/src/org/antlr/codebuff/InputDocument.java index 61952ff..9a4c2c1 100644 --- a/java/src/org/antlr/codebuff/InputDocument.java +++ b/java/src/org/antlr/codebuff/InputDocument.java @@ -16,9 +16,8 @@ public class InputDocument { public Parser parser; public CommonTokenStream tokens; public List featureVectors; - public List injectNewlines; - public List injectWS; - public List alignWithPrevious; + public List injectWhitespace; + public List align; public int allWhiteSpaceCount = 0; public int incorrectWhiteSpaceCount = 0; public int misclassifiedNewLineCount = 0; diff --git a/java/src/org/antlr/codebuff/Optimizer.java b/java/src/org/antlr/codebuff/Optimizer.java index b060732..1be5aec 100644 --- a/java/src/org/antlr/codebuff/Optimizer.java +++ b/java/src/org/antlr/codebuff/Optimizer.java @@ -167,7 +167,7 @@ public static void main(String[] args) throws Exception { List allFiles = Tool.getFilenames(new File(testFileDir), ".*\\.java"); ArrayList documents = (ArrayList) Tool.load(allFiles, JavaLexer.class, 
tabSize); - Tester t = new Tester(CollectFeatures.FEATURES_INJECT_NL, corpus, documents, tabSize); + Tester t = new Tester(CollectFeatures.FEATURES_INJECT_WS, corpus, documents, tabSize); // sorry, had to comment this out // multiRoundMinimize(Tester::test, LEARNING_RATE, h, PRECISION, CollectFeatures.FEATURES_INJECT_NL, 5); } diff --git a/java/src/org/antlr/codebuff/Tool.java b/java/src/org/antlr/codebuff/Tool.java index 046c748..235ca28 100644 --- a/java/src/org/antlr/codebuff/Tool.java +++ b/java/src/org/antlr/codebuff/Tool.java @@ -158,7 +158,6 @@ public static Corpus processSampleDocs(List docs, List documents = new ArrayList<>(); List featureVectors = new ArrayList<>(); List injectNewlines = new ArrayList<>(); - List injectWS = new ArrayList<>(); List alignWithPrevious = new ArrayList<>(); for (InputDocument doc : docs) { if ( showFileNames ) System.out.println(doc); @@ -167,14 +166,13 @@ public static Corpus processSampleDocs(List docs, for (int i=0; i lexerClass) diff --git a/java/src/org/antlr/codebuff/gui/BuffScope.form b/java/src/org/antlr/codebuff/gui/BuffScope.form index de68af2..17a2e37 100644 --- a/java/src/org/antlr/codebuff/gui/BuffScope.form +++ b/java/src/org/antlr/codebuff/gui/BuffScope.form @@ -144,28 +144,6 @@ - - - - - - - - - - - - - - - - - - - - - - diff --git a/java/src/org/antlr/codebuff/gui/BuffScope.java b/java/src/org/antlr/codebuff/gui/BuffScope.java index 3c34f6c..782c2a5 100644 --- a/java/src/org/antlr/codebuff/gui/BuffScope.java +++ b/java/src/org/antlr/codebuff/gui/BuffScope.java @@ -14,8 +14,6 @@ public class BuffScope { public JTabbedPane analysisTabbedPane; public JPanel injectNLTab; public JPanel alignTab; - public JTextArea injectWSConsole; - public JPanel injectWSTab; public JTextArea getInjectNLConsole() { return injectNLConsole; @@ -107,14 +105,6 @@ private void createUIComponents() { alignTab.add(scrollPane4, BorderLayout.CENTER); alignConsole = new JTextArea(); scrollPane4.setViewportView(alignConsole); - injectWSTab = 
new JPanel(); - injectWSTab.setLayout(new BorderLayout(0, 0)); - analysisTabbedPane.addTab("Inject whitespace", injectWSTab); - final JScrollPane scrollPane5 = new JScrollPane(); - injectWSTab.add(scrollPane5, BorderLayout.CENTER); - injectWSConsole = new JTextArea(); - injectWSConsole.setEditable(false); - scrollPane5.setViewportView(injectWSConsole); } /** diff --git a/java/src/org/antlr/codebuff/gui/GUIController.java b/java/src/org/antlr/codebuff/gui/GUIController.java index e0b9cef..b136990 100644 --- a/java/src/org/antlr/codebuff/gui/GUIController.java +++ b/java/src/org/antlr/codebuff/gui/GUIController.java @@ -66,8 +66,6 @@ public void show() throws Exception { scope.injectNLConsole.setFont(docFont); scope.alignConsole.putClientProperty(JEditorPane.HONOR_DISPLAY_PROPERTIES, Boolean.TRUE); scope.alignConsole.setFont(docFont); - scope.injectWSConsole.putClientProperty(JEditorPane.HONOR_DISPLAY_PROPERTIES, Boolean.TRUE); - scope.injectWSConsole.setFont(docFont); JFrame frame = new JFrame("CodeBuff Scope"); frame.setDefaultCloseOperation(WindowConstants.EXIT_ON_CLOSE); @@ -113,10 +111,8 @@ public void caretUpdate(CaretEvent e) { } scope.injectNLConsole.setText(analysis!=null ? analysis.newline : ""); scope.alignConsole.setText(analysis!=null ? analysis.align : ""); - scope.injectWSConsole.setText(analysis!=null ? 
analysis.ws : ""); scope.injectNLConsole.setCaretPosition(0); scope.alignConsole.setCaretPosition(0); - scope.injectWSConsole.setCaretPosition(0); } catch (Exception ex) { ex.printStackTrace(System.err); diff --git a/java/src/org/antlr/codebuff/kNNClassifier.java b/java/src/org/antlr/codebuff/kNNClassifier.java index 192205a..4fc890b 100644 --- a/java/src/org/antlr/codebuff/kNNClassifier.java +++ b/java/src/org/antlr/codebuff/kNNClassifier.java @@ -33,12 +33,9 @@ public int[] classify(int k, int[] unknown, double distanceThreshold) { int[] categories = new int[Corpus.NUM_DEPENDENT_VARS]; Neighbor[] kNN = kNN(unknown, k, distanceThreshold); - HashBag votesBag = getVotesBag(kNN, k, unknown, corpus.injectNewlines); + HashBag votesBag = getVotesBag(kNN, k, unknown, corpus.injectWhitespace); categories[Corpus.INDEX_FEATURE_NEWLINES] = getCategoryWithMostVotes(votesBag); - votesBag = getVotesBag(kNN, k, unknown, corpus.injectWS); - categories[Corpus.INDEX_FEATURE_WS] = getCategoryWithMostVotes(votesBag); - votesBag = getVotesBag(kNN, k, unknown, corpus.align); categories[Corpus.INDEX_FEATURE_ALIGN_WITH_PREVIOUS] = getCategoryWithMostVotes(votesBag);