From ea33545dad9759133c7f4d26a674175cd6bb6d75 Mon Sep 17 00:00:00 2001
From: breandan
Date: Mon, 12 Jun 2023 02:30:51 -0400
Subject: [PATCH] compare lexical patches

---
 .../experiments/repair/PythonSnippetRepair.kt | 25 ++++++++++++++------
 .../edu/mcgill/cstk/utils/ParseUtils.kt       | 22 ++++++++++++++++
 2 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/src/main/kotlin/edu/mcgill/cstk/experiments/repair/PythonSnippetRepair.kt b/src/main/kotlin/edu/mcgill/cstk/experiments/repair/PythonSnippetRepair.kt
index 8399ba49..05561174 100644
--- a/src/main/kotlin/edu/mcgill/cstk/experiments/repair/PythonSnippetRepair.kt
+++ b/src/main/kotlin/edu/mcgill/cstk/experiments/repair/PythonSnippetRepair.kt
@@ -76,27 +76,40 @@ fun stackOverflowEval() {
     .minimizeFix { tokenizeAsPython(true) }
     .filter { (broke, fixed, minfix) ->
       val (brokeTokens, fixedTokens) =
-        broke.tokenizeAsPython() to fixed.tokenizeAsPython()
+        broke.lexToIntTypesAsPython() to fixed.lexToIntTypesAsPython()
 //      (brokeTokens.size - fixedTokens.size).absoluteValue < 10 &&
-      broke != fixed && multisetManhattanDistance(brokeTokens, fixedTokens).let { it in 1..5 }
+      multisetManhattanDistance(brokeTokens, fixedTokens).let { it in 1..5 }
     }
 //    .forEach { (broke, fixed, minfix) ->
 //      broke.tokenizeAsPython()
 //    }
     .filter { (broke, fixed, minfix) ->
       val (brokeVis, fixedVis, minfixVis) = broke.visibleChars() to fixed.visibleChars() to minfix.visibleChars()
-      brokeVis != fixedVis && brokeVis != minfixVis && fixedVis != minfixVis
+      brokeVis != fixedVis && brokeVis != minfixVis// && fixedVis != minfixVis
     }
     .map { (broke, fixed, minfix) ->
       prettyDiffs(listOf(broke, fixed), listOf("original snippet", "human patch")).let { origDiff ->
         prettyDiffs(listOf(broke, minfix), listOf("original snippet", "minimized patch")).let { minDiff ->
           // Compare ASCII characters for a visible difference, if same do not print two
 //          if (corrected.visibleChars() == minfix.visibleChars()) origDiff to "" else
-          origDiff to minDiff
+          origDiff to minDiff to broke to minfix
         }
       }
-    }.filter { (a, b) -> b.isNotEmpty() && 2 < (a.count { it == '\u001B' } - b.count { it == '\u001B' }).absoluteValue }
-    .forEach { (a, b) -> println("$a\n$b") }
+    }
+//    .filter { (a, b) -> b.isNotEmpty() && 2 < (a.count { it == '\u001B' } - b.count { it == '\u001B' }).absoluteValue }
+    .forEach { (a, b, c, d) ->
+//      println("$a\n$b")
+      val coarseBroke = c.lexToStrTypesAsPython().joinToString(" ") + " | " + c.lexToIntTypesAsPython().joinToString(" ")
+      val coarseFixed = d.lexToStrTypesAsPython().joinToString(" ") + " | " + d.lexToIntTypesAsPython().joinToString(" ")
+//      val diff1 = prettyDiffNoFrills(coarseBroke, coarseFixed)
+//      val diff2 = prettyDiffNoFrills(coarseFixed, coarseBroke)
+//      val diff3 = prettyDiffNoFrills(c.lexToIntTypesAsPython().joinToString("\n"), d.lexToIntTypesAsPython().joinToString("\n"))
+//      val maxlen = max(diff1.visibleLen(), diff2.visibleLen())
+
+      println("Broke tokens: ${prettyDiffNoFrills(coarseFixed, coarseBroke)}")
+      println("Fixed tokens: ${prettyDiffNoFrills(coarseBroke, coarseFixed)}")
+      println("\n\n")
+    }
 }
 
 fun seq2parseEval() {
diff --git a/src/main/kotlin/edu/mcgill/cstk/utils/ParseUtils.kt b/src/main/kotlin/edu/mcgill/cstk/utils/ParseUtils.kt
index 98828e2e..b592a0f5 100644
--- a/src/main/kotlin/edu/mcgill/cstk/utils/ParseUtils.kt
+++ b/src/main/kotlin/edu/mcgill/cstk/utils/ParseUtils.kt
@@ -32,6 +32,28 @@ fun Σᐩ.tokenizeAsPython(exhaustive: Boolean = false): List<Σᐩ> =
     else throw Exception("Could not find token $t in ${toSplit.map { it.code }}").also {
       println("\n\n$this\n\n")
     }
   }
+fun IntArray.isValidPython(): Boolean {
+  val tokenSource = ListTokenSource(map { CommonToken(it) })
+  val tokens = CommonTokenStream(tokenSource)
+  return try {
+    Python3Parser(tokens)
+      .apply { removeErrorListeners(); addErrorListener(errorListener) }
+      .file_input()
+    true
+  } catch (e: Exception) {
+    false
+  }
+}
+
+fun Σᐩ.lexToIntTypesAsPython(
+  lexer: Lexer = Python3Lexer(CharStreams.fromString(this + "\n"))
+) = lexer.allTokens.map { it.type }
+
+fun Σᐩ.lexToStrTypesAsPython(
+  lexer: Lexer = Python3Lexer(CharStreams.fromString(this)),
+  vocabulary: Vocabulary = lexer.vocabulary
+) = lexer.allTokens.map { vocabulary.getDisplayName(it.type) }
+
 fun Σᐩ.lexAsPython(): Python3Lexer =
   Python3Lexer(CharStreams.fromStream(byteInputStream()))
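
Reviewer note on the PythonSnippetRepair.kt hunk: the new filter keeps broke/fixed pairs whose lexical distance lies in 1..5. multisetManhattanDistance is defined elsewhere in this repo, not in this patch; the following is only a sketch of the distance it is assumed to compute (treat each token-type list as a multiset and sum the per-type count differences), written as a hypothetical standalone function:

  import kotlin.math.abs

  // Sketch only, not the repo's implementation: Manhattan distance between
  // the multiset (bag-of-tokens) representations of two token sequences.
  fun <T> multisetManhattanDistance(a: List<T>, b: List<T>): Int {
    val countsA = a.groupingBy { it }.eachCount()
    val countsB = b.groupingBy { it }.eachCount()
    return (countsA.keys + countsB.keys)
      .sumOf { t -> abs((countsA[t] ?: 0) - (countsB[t] ?: 0)) }
  }

Under this reading, multisetManhattanDistance(listOf(1, 2, 2), listOf(1, 2, 3)) == 2, so such a pair would pass the `it in 1..5` filter, while identical token sequences (distance 0) are dropped.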
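
Reviewer note on the ParseUtils.kt hunk: Σᐩ is assumed to alias String, as elsewhere in this codebase, so the new extensions apply to ordinary snippets. A minimal usage sketch, assuming the ANTLR-generated Python3Lexer/Python3Parser and the repo's error-raising errorListener are on the classpath; the commented outputs are expectations, not verified results:

  // Sketch only: lex two snippets coarsely, then replay the bare token
  // types through Python3Parser.file_input() to check syntactic validity
  // (ANTLR parses on token types, so the original text is not needed).
  fun main() {
    val broke = "def f(x) return x"   // missing colon
    val fixed = "def f(x): return x"
    println(broke.lexToStrTypesAsPython().joinToString(" ")) // display names
    println(fixed.lexToIntTypesAsPython().joinToString(" ")) // integer type ids
    println(broke.lexToIntTypesAsPython().toIntArray().isValidPython()) // expect false
    println(fixed.lexToIntTypesAsPython().toIntArray().isValidPython()) // expect true
  }

This is also why the printed diffs above pair each snippet's display names with its integer token types: the two views are the string- and int-typed outputs of the same lexer pass.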