Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 39 additions & 45 deletions java/src/org/antlr/codebuff/CollectFeatures.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ public class CollectFeatures {
public static final int PAIR_ON_SAME_LINE = 0;
public static final int PAIR_ON_DIFF_LINE = 1;

// Categories for newline, whitespace. Encoded as CAT_INJECT_NL|(n<<8) or CAT_INJECT_WS|(n<<8)
public static final int CAT_NO_WS = 0;
public static final int CAT_INJECT_NL = 100;
public static final int CAT_INJECT_WS = 200;

// Categories for alignment/indentation
public static final int CAT_NO_ALIGNMENT = 0;

Expand Down Expand Up @@ -89,7 +94,7 @@ public class CollectFeatures {

public static final int NUM_FEATURES = 23;

public static FeatureMetaData[] FEATURES_INJECT_NL = {
public static FeatureMetaData[] FEATURES_INJECT_WS = { // inject ws or nl
new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-2)"}, 1),
new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-1)"}, 2),
new FeatureMetaData(FeatureType.RULE, new String[] {"LT(-1)", "rule"}, 2),
Expand Down Expand Up @@ -141,32 +146,6 @@ public class CollectFeatures {
new FeatureMetaData(FeatureType.INFO_CHARPOS, new String[] {"char", "pos"}, 0)
};

public static FeatureMetaData[] FEATURES_INJECT_WS = {
new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-2)"}, 1),
new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-1)"}, 2),
new FeatureMetaData(FeatureType.RULE, new String[] {"LT(-1)", "rule"}, 2),
FeatureMetaData.UNUSED,
new FeatureMetaData(FeatureType.RULE, new String[] {"LT(-1)", "right ancestor"}, 3),
new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(1)"}, 3),
FeatureMetaData.UNUSED,
new FeatureMetaData(FeatureType.BOOL, new String[]{"Strt", "line"}, 3),
new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "rule"}, 2),
new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "right ancestor"}, 3),
new FeatureMetaData(FeatureType.RULE, new String[] {"LT(1)", "left ancestor"}, 3),
new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^5"}, 1),
new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^4"}, 1),
new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^3"}, 1),
new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent^3 wid"}, 1),
new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent^2"}, 1),
new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent^2 wid"}, 1),
new FeatureMetaData(FeatureType.RULE, new String[] {"ancestor's", "parent"}, 1),
new FeatureMetaData(FeatureType.INT, new String[] {"ancestor's", "parent wid"}, 1),
new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(2)"}, 1),
new FeatureMetaData(FeatureType.INFO_FILE, new String[] {"", "file"}, 0),
new FeatureMetaData(FeatureType.INFO_LINE, new String[] {"", "line"}, 0),
new FeatureMetaData(FeatureType.INFO_CHARPOS, new String[] {"char", "pos"}, 0)
};

public static FeatureMetaData[] FEATURES_ALL = {
new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-2)"}, 1),
new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-1)"}, 2),
Expand Down Expand Up @@ -201,9 +180,7 @@ public class CollectFeatures {
protected ParserRuleContext root;
protected CommonTokenStream tokens; // track stream so we can examine previous tokens
protected List<int[]> features = new ArrayList<>();
protected List<Integer> injectNewlines = new ArrayList<>();
protected List<Integer> injectWS = new ArrayList<>();
protected List<Integer> indent = new ArrayList<>();
protected List<Integer> injectWhitespace = new ArrayList<>();
protected List<Integer> align = new ArrayList<>();

protected int currentIndent = 0;
Expand Down Expand Up @@ -246,7 +223,20 @@ public void computeFeatureVectorForToken(int i) {

int precedingNL = getPrecedingNL(tokens, i); // how many lines to inject

this.injectNewlines.add(precedingNL);
int ws = 0;
if ( precedingNL==0 ) {
ws = curToken.getCharPositionInLine() -
(prevToken.getCharPositionInLine()+prevToken.getText().length());
}

int injectNL_WS = CAT_NO_WS;
if ( precedingNL>0 ) {
injectNL_WS = nlcat(precedingNL);
}
else if ( ws>0 ) {
injectNL_WS = wscat(ws);
}
this.injectWhitespace.add(injectNL_WS);

int columnDelta = 0;
if ( precedingNL>0 ) { // && aligned!=1 ) {
Expand All @@ -259,14 +249,6 @@ public void computeFeatureVectorForToken(int i) {
aligned = getAlignmentCategory(node, curToken, columnDelta);
}

int ws = 0;
if ( precedingNL==0 ) {
ws = curToken.getCharPositionInLine() -
(prevToken.getCharPositionInLine()+prevToken.getText().length());
}

this.injectWS.add(ws); // likely negative if precedingNL

this.align.add(aligned);

this.features.add(features);
Expand Down Expand Up @@ -631,12 +613,8 @@ public List<int[]> getFeatures() {
return features;
}

public List<Integer> getInjectNewlines() {
return injectNewlines;
}

public List<Integer> getInjectWS() {
return injectWS;
public List<Integer> getInjectWhitespace() {
return injectWhitespace;
}

public List<Integer> getAlign() {
Expand Down Expand Up @@ -843,4 +821,20 @@ public static int[] unaligncat(int v) {
int child = (v>>16)&0xFFFF;
return new int[] { deltaFromLeftAncestor, child };
}

/** Encode "inject n spaces": the count lives in the high bits, the WS tag in the low byte. */
public static int wscat(int n) {
	int encoded = (n << 8) | CAT_INJECT_WS;
	return encoded;
}

/** Encode "inject n newlines": the count lives in the high bits, the NL tag in the low byte. */
public static int nlcat(int n) {
	int encoded = (n << 8) | CAT_INJECT_NL;
	return encoded;
}

/** Decode the whitespace count n from a value produced by {@link #wscat(int)}. */
public static int unwscat(int v) {
	int count = (v >> 8) & 0xFFFF;
	return count;
}

/** Decode the newline count n from a value produced by {@link #nlcat(int)}. */
public static int unnlcat(int v) {
	int count = (v >> 8) & 0xFFFF;
	return count;
}
}
20 changes: 7 additions & 13 deletions java/src/org/antlr/codebuff/Corpus.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,8 @@ public class Corpus {

List<InputDocument> documents; // an entry for each X
List<int[]> X;
List<Integer> injectNewlines;
List<Integer> injectWhitespace;
List<Integer> align; // steps to common ancestor whose first token is alignment anchor
List<Integer> injectWS;

/** an index to narrow down the number of vectors we compute distance() on each classification.
* The key is (previous token's rule index, current token's rule index). It yields
Expand All @@ -30,14 +29,12 @@ public class Corpus {

public Corpus(List<InputDocument> documents,
List<int[]> X,
List<Integer> injectNewlines,
List<Integer> align,
List<Integer> injectWS)
List<Integer> injectWhitespace,
List<Integer> align)
{
this.documents = documents;
this.X = X;
this.injectNewlines = injectNewlines;
this.injectWS = injectWS;
this.injectWhitespace = injectWhitespace;
this.align = align;
}

Expand Down Expand Up @@ -65,15 +62,12 @@ public void randomShuffleInPlace() {
X.set(i, X.get(j));
X.set(j, tmp);
// And now swap all prediction lists
Integer tmpI = injectNewlines.get(i);
injectNewlines.set(i, injectNewlines.get(j));
injectNewlines.set(j, tmpI);
Integer tmpI = injectWhitespace.get(i);
injectWhitespace.set(i, injectWhitespace.get(j));
injectWhitespace.set(j, tmpI);
tmpI = align.get(i);
align.set(i, align.get(j));
align.set(j, tmpI);
tmpI = injectWS.get(i);
injectWS.set(i, injectWS.get(j));
injectWS.set(j, tmpI);
// Finally, swap documents
InputDocument tmpD = documents.get(i);
documents.set(i, documents.get(j));
Expand Down
45 changes: 21 additions & 24 deletions java/src/org/antlr/codebuff/Formatter.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@
import static org.antlr.codebuff.CollectFeatures.CAT_ALIGN_WITH_ANCESTOR_CHILD;
import static org.antlr.codebuff.CollectFeatures.CAT_INDENT;
import static org.antlr.codebuff.CollectFeatures.CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN;
import static org.antlr.codebuff.CollectFeatures.CAT_INJECT_NL;
import static org.antlr.codebuff.CollectFeatures.CAT_INJECT_WS;
import static org.antlr.codebuff.CollectFeatures.FEATURES_ALIGN;
import static org.antlr.codebuff.CollectFeatures.FEATURES_INJECT_NL;
import static org.antlr.codebuff.CollectFeatures.FEATURES_INJECT_WS;
import static org.antlr.codebuff.CollectFeatures.INDEX_FIRST_ON_LINE;
import static org.antlr.codebuff.CollectFeatures.INDEX_PREV_END_COLUMN;
Expand All @@ -44,7 +45,7 @@ public class Formatter {

protected Vector<TokenPositionAnalysis> analysis = new Vector<>();

protected CodekNNClassifier newlineClassifier;
protected CodekNNClassifier nlwsClassifier;
protected CodekNNClassifier wsClassifier;
protected CodekNNClassifier alignClassifier;
protected int k;
Expand All @@ -65,8 +66,7 @@ public Formatter(Corpus corpus, InputDocument doc, int tabSize) {
this.tokens = doc.tokens;
this.originalTokens = Tool.copy(tokens);
Tool.wipeLineAndPositionInfo(tokens);
newlineClassifier = new CodekNNClassifier(corpus, FEATURES_INJECT_NL);
wsClassifier = new CodekNNClassifier(corpus, FEATURES_INJECT_WS);
nlwsClassifier = new CodekNNClassifier(corpus, FEATURES_INJECT_WS);
alignClassifier = new CodekNNClassifier(corpus, FEATURES_ALIGN);
// k = (int)Math.sqrt(corpus.X.size());
// k = 7;
Expand Down Expand Up @@ -120,26 +120,32 @@ public void processToken(int indexIntoRealTokens, int tokenIndexInStream) {
// we're tracking it as we emit tokens
features[INDEX_PREV_END_COLUMN] = charPosInLine;

int injectNewline = newlineClassifier.classify(k, features, corpus.injectNewlines, MAX_CONTEXT_DIFF_THRESHOLD);
int injectNL_WS = nlwsClassifier.classify(k, features, corpus.injectWhitespace, MAX_CONTEXT_DIFF_THRESHOLD);
int newlines = 0;
int ws = 0;
if ( (injectNL_WS&0xFF)==CAT_INJECT_NL ) {
newlines = CollectFeatures.unnlcat(injectNL_WS);
}
else if ( (injectNL_WS&0xFF)==CAT_INJECT_WS ) {
ws = CollectFeatures.unwscat(injectNL_WS);
}

// getNodeFeatures() also doesn't know what line curToken is on. If \n, we need to find exemplars that start a line
features[INDEX_FIRST_ON_LINE] = injectNewline; // use \n prediction to match exemplars for alignment
features[INDEX_FIRST_ON_LINE] = newlines; // use \n prediction to match exemplars for alignment

int align = alignClassifier.classify(k, features, corpus.align, MAX_CONTEXT_DIFF_THRESHOLD);

int ws = wsClassifier.classify(k, features, corpus.injectWS, MAX_CONTEXT_DIFF_THRESHOLD);

TokenPositionAnalysis tokenPositionAnalysis =
getTokenAnalysis(features, indexIntoRealTokens, tokenIndexInStream, injectNewline, align, ws);
getTokenAnalysis(features, indexIntoRealTokens, tokenIndexInStream, newlines, align, ws);
analysis.setSize(tokenIndexInStream+1);
analysis.set(tokenIndexInStream, tokenPositionAnalysis);

if ( ws==0 && cannotJoin(realTokens.get(indexIntoRealTokens-1), curToken) ) { // failsafe!
ws = 1;
}

if ( injectNewline>0 ) {
output.append(Tool.newlines(injectNewline));
if ( newlines>0 ) {
output.append(Tool.newlines(newlines));
line++;
charPosInLine = 0;

Expand All @@ -153,7 +159,7 @@ public void processToken(int indexIntoRealTokens, int tokenIndexInStream) {
ParserRuleContext parent = (ParserRuleContext)node.getParent();

if ( align==CAT_INDENT ) {
if ( firstTokenOnPrevLine!=null ) { // if not on first line, we can indent indent
if ( firstTokenOnPrevLine!=null ) { // if not on first line, we cannot indent
int indentedCol = firstTokenOnPrevLine.getCharPositionInLine()+INDENT_LEVEL;
charPosInLine = indentedCol;
output.append(Tool.spaces(indentedCol));
Expand Down Expand Up @@ -287,23 +293,14 @@ public TokenPositionAnalysis getTokenAnalysis(int[] features, int indexIntoRealT
originalCurToken.getLine(),
alignWithPrevious==1?"align":"unaligned",
"?");
String wsPredictionString = String.format("### line %d: predicted %d ' ' actual %s",
originalCurToken.getLine(), ws, prevIsWS ? actualWS : "none");
if ( failsafeTriggered ) {
wsPredictionString += " (failsafe triggered)";
}


String newlineAnalysis = newlinePredictionString+"\n"+
newlineClassifier.getPredictionAnalysis(doc, k, features, corpus.injectNewlines,
MAX_CONTEXT_DIFF_THRESHOLD);
nlwsClassifier.getPredictionAnalysis(doc, k, features, corpus.injectWhitespace,
MAX_CONTEXT_DIFF_THRESHOLD);
String alignAnalysis =alignPredictionString+"\n"+
alignClassifier.getPredictionAnalysis(doc, k, features, corpus.align,
MAX_CONTEXT_DIFF_THRESHOLD);
String wsAnalysis =wsPredictionString+"\n"+
wsClassifier.getPredictionAnalysis(doc, k, features, corpus.injectWS,
MAX_CONTEXT_DIFF_THRESHOLD);
return new TokenPositionAnalysis(newlineAnalysis, alignAnalysis, wsAnalysis);
return new TokenPositionAnalysis(newlineAnalysis, alignAnalysis, "n/a");
}

/** Do not join two words like "finaldouble" or numbers like "3double",
Expand Down
5 changes: 2 additions & 3 deletions java/src/org/antlr/codebuff/InputDocument.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,8 @@ public class InputDocument {
public Parser parser;
public CommonTokenStream tokens;
public List<int[]> featureVectors;
public List<Integer> injectNewlines;
public List<Integer> injectWS;
public List<Integer> alignWithPrevious;
public List<Integer> injectWhitespace;
public List<Integer> align;
public int allWhiteSpaceCount = 0;
public int incorrectWhiteSpaceCount = 0;
public int misclassifiedNewLineCount = 0;
Expand Down
2 changes: 1 addition & 1 deletion java/src/org/antlr/codebuff/Optimizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ public static void main(String[] args) throws Exception {
List<String> allFiles = Tool.getFilenames(new File(testFileDir), ".*\\.java");
ArrayList<InputDocument> documents = (ArrayList<InputDocument>) Tool.load(allFiles, JavaLexer.class, tabSize);

Tester t = new Tester(CollectFeatures.FEATURES_INJECT_NL, corpus, documents, tabSize);
Tester t = new Tester(CollectFeatures.FEATURES_INJECT_WS, corpus, documents, tabSize);
// sorry, had to comment this out
// multiRoundMinimize(Tester::test, LEARNING_RATE, h, PRECISION, CollectFeatures.FEATURES_INJECT_NL, 5);
}
Expand Down
13 changes: 5 additions & 8 deletions java/src/org/antlr/codebuff/Tool.java
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,6 @@ public static Corpus processSampleDocs(List<InputDocument> docs,
List<InputDocument> documents = new ArrayList<>();
List<int[]> featureVectors = new ArrayList<>();
List<Integer> injectNewlines = new ArrayList<>();
List<Integer> injectWS = new ArrayList<>();
List<Integer> alignWithPrevious = new ArrayList<>();
for (InputDocument doc : docs) {
if ( showFileNames ) System.out.println(doc);
Expand All @@ -167,14 +166,13 @@ public static Corpus processSampleDocs(List<InputDocument> docs,
for (int i=0; i<doc.featureVectors.size(); i++) {
documents.add(doc);
int[] featureVec = doc.featureVectors.get(i);
injectNewlines.add(doc.injectNewlines.get(i));
injectWS.add(doc.injectWS.get(i));
alignWithPrevious.add(doc.alignWithPrevious.get(i));
injectNewlines.add(doc.injectWhitespace.get(i));
alignWithPrevious.add(doc.align.get(i));
featureVectors.add(featureVec);
}
}
System.out.printf("%d feature vectors\n", featureVectors.size());
return new Corpus(documents, featureVectors, injectNewlines, alignWithPrevious, injectWS);
return new Corpus(documents, featureVectors, injectNewlines, alignWithPrevious);
}

/** Parse document, save feature vectors to the doc but return it also */
Expand All @@ -183,9 +181,8 @@ public static void process(InputDocument doc, int tabSize, Map<String, List<Pair
collector.computeFeatureVectors();

doc.featureVectors = collector.getFeatures();
doc.injectNewlines = collector.getInjectNewlines();
doc.injectWS = collector.getInjectWS();
doc.alignWithPrevious = collector.getAlign();
doc.injectWhitespace = collector.getInjectWhitespace();
doc.align = collector.getAlign();
}

public static CommonTokenStream tokenize(String doc, Class<? extends Lexer> lexerClass)
Expand Down
22 changes: 0 additions & 22 deletions java/src/org/antlr/codebuff/gui/BuffScope.form
Original file line number Diff line number Diff line change
Expand Up @@ -144,28 +144,6 @@
</scrollpane>
</children>
</grid>
<grid id="883ac" binding="injectWSTab" layout-manager="BorderLayout" hgap="0" vgap="0">
<constraints>
<tabbedpane title="Inject whitespace"/>
</constraints>
<properties/>
<border type="none"/>
<children>
<scrollpane id="fb4f9">
<constraints border-constraint="Center"/>
<properties/>
<border type="none"/>
<children>
<component id="93e0e" class="javax.swing.JTextArea" binding="injectWSConsole">
<constraints/>
<properties>
<editable value="false"/>
</properties>
</component>
</children>
</scrollpane>
</children>
</grid>
</children>
</tabbedpane>
</children>
Expand Down
Loading