From ea33545dad9759133c7f4d26a674175cd6bb6d75 Mon Sep 17 00:00:00 2001
From: breandan
Date: Mon, 12 Jun 2023 02:30:51 -0400
Subject: [PATCH] compare lexical patches

---
 .../experiments/repair/PythonSnippetRepair.kt | 25 ++++++++++++++------
 .../edu/mcgill/cstk/utils/ParseUtils.kt       | 22 ++++++++++++++++
 2 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/src/main/kotlin/edu/mcgill/cstk/experiments/repair/PythonSnippetRepair.kt b/src/main/kotlin/edu/mcgill/cstk/experiments/repair/PythonSnippetRepair.kt
index 8399ba49..05561174 100644
--- a/src/main/kotlin/edu/mcgill/cstk/experiments/repair/PythonSnippetRepair.kt
+++ b/src/main/kotlin/edu/mcgill/cstk/experiments/repair/PythonSnippetRepair.kt
@@ -76,27 +76,40 @@ fun stackOverflowEval() {
     .minimizeFix { tokenizeAsPython(true) }
     .filter { (broke, fixed, minfix) ->
       val (brokeTokens, fixedTokens) =
-        broke.tokenizeAsPython() to fixed.tokenizeAsPython()
+        broke.lexToIntTypesAsPython() to fixed.lexToIntTypesAsPython()
 //      (brokeTokens.size - fixedTokens.size).absoluteValue < 10 &&
-      broke != fixed && multisetManhattanDistance(brokeTokens, fixedTokens).let { it in 1..5 }
+      multisetManhattanDistance(brokeTokens, fixedTokens).let { it in 1..5 }
     }
 //    .forEach { (broke, fixed, minfix) ->
 //      broke.tokenizeAsPython()
 //    }
     .filter { (broke, fixed, minfix) ->
       val (brokeVis, fixedVis, minfixVis) = broke.visibleChars() to fixed.visibleChars() to minfix.visibleChars()
-      brokeVis != fixedVis && brokeVis != minfixVis && fixedVis != minfixVis
+      brokeVis != fixedVis && brokeVis != minfixVis// && fixedVis != minfixVis
     }
     .map { (broke, fixed, minfix) ->
       prettyDiffs(listOf(broke, fixed), listOf("original snippet", "human patch")).let { origDiff ->
         prettyDiffs(listOf(broke, minfix), listOf("original snippet", "minimized patch")).let { minDiff ->
           // Compare ASCII characters for a visible difference, if same do not print two
 //          if (corrected.visibleChars() == minfix.visibleChars()) origDiff to "" else
-          origDiff to minDiff
+          origDiff to minDiff to broke to minfix
         }
       }
-    }.filter { (a, b) -> b.isNotEmpty() && 2 < (a.count { it == '\u001B' } - b.count { it == '\u001B' }).absoluteValue }
-    .forEach { (a, b) -> println("$a\n$b") }
+    }
+//    .filter { (a, b) -> b.isNotEmpty() && 2 < (a.count { it == '\u001B' } - b.count { it == '\u001B' }).absoluteValue }
+    .forEach { (a, b, c, d) ->
+//      println("$a\n$b")
+      val coarseBroke = c.lexToStrTypesAsPython().joinToString(" ") + " | " + c.lexToIntTypesAsPython().joinToString(" ")
+      val coarseFixed = d.lexToStrTypesAsPython().joinToString(" ") + " | " + d.lexToIntTypesAsPython().joinToString(" ")
+//      val diff1 = prettyDiffNoFrills(coarseBroke, coarseFixed)
+//      val diff2 = prettyDiffNoFrills(coarseFixed, coarseBroke)
+//      val diff3 = prettyDiffNoFrills(c.lexToIntTypesAsPython().joinToString("\n"), d.lexToIntTypesAsPython().joinToString("\n"))
+//      val maxlen = max(diff1.visibleLen(), diff2.visibleLen())
+
+      println("Broke tokens: ${prettyDiffNoFrills(coarseFixed, coarseBroke)}")
+      println("Fixed tokens: ${prettyDiffNoFrills(coarseBroke, coarseFixed)}")
+      println("\n\n")
+    }
 }
 
 fun seq2parseEval() {
diff --git a/src/main/kotlin/edu/mcgill/cstk/utils/ParseUtils.kt b/src/main/kotlin/edu/mcgill/cstk/utils/ParseUtils.kt
index 98828e2e..b592a0f5 100644
--- a/src/main/kotlin/edu/mcgill/cstk/utils/ParseUtils.kt
+++ b/src/main/kotlin/edu/mcgill/cstk/utils/ParseUtils.kt
@@ -32,6 +32,28 @@ fun Σᐩ.tokenizeAsPython(exhaustive: Boolean = false): List<Σᐩ> =
     else throw Exception("Could not find token $t in ${toSplit.map { it.code }}").also {
       println("\n\n$this\n\n")
     }
   }
+fun IntArray.isValidPython(): Boolean {
+  val tokenSource = ListTokenSource(map { CommonToken(it) })
+  val tokens = CommonTokenStream(tokenSource)
+  return try {
+    Python3Parser(tokens)
+      .apply { removeErrorListeners(); addErrorListener(errorListener) }
+      .file_input()
+    true
+  } catch (e: Exception) {
+    false
+  }
+}
+
+fun Σᐩ.lexToIntTypesAsPython(
+  lexer: Lexer = Python3Lexer(CharStreams.fromString(this + "\n"))
+) = lexer.allTokens.map { it.type }
+
+fun Σᐩ.lexToStrTypesAsPython(
+  lexer: Lexer = Python3Lexer(CharStreams.fromString(this)),
+  vocabulary: Vocabulary = lexer.vocabulary
+) = lexer.allTokens.map { vocabulary.getDisplayName(it.type) }
+
 fun Σᐩ.lexAsPython(): Python3Lexer =
   Python3Lexer(CharStreams.fromStream(byteInputStream()))
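
Reviewer note on the PythonSnippetRepair.kt hunk: the new filter keeps broke/fixed pairs whose lexical distance lies in 1..5. multisetManhattanDistance is defined elsewhere in this repo, not in this patch; the following is only a sketch of the distance it is assumed to compute (treat each token-type list as a multiset and sum the per-type count differences), written as a hypothetical standalone function:

  import kotlin.math.abs

  // Sketch only, not the repo's implementation: Manhattan distance between
  // the multiset (bag-of-tokens) representations of two token sequences.
  fun <T> multisetManhattanDistance(a: List<T>, b: List<T>): Int {
    val countsA = a.groupingBy { it }.eachCount()
    val countsB = b.groupingBy { it }.eachCount()
    return (countsA.keys + countsB.keys)
      .sumOf { t -> abs((countsA[t] ?: 0) - (countsB[t] ?: 0)) }
  }

Under this reading, multisetManhattanDistance(listOf(1, 2, 2), listOf(1, 2, 3)) == 2, so such a pair would pass the `it in 1..5` filter, while identical token sequences (distance 0) are dropped.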
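
Reviewer note on the ParseUtils.kt hunk: Σᐩ is assumed to alias String, as elsewhere in this codebase, so the new extensions apply to ordinary snippets. A minimal usage sketch, assuming the ANTLR-generated Python3Lexer/Python3Parser and the repo's error-raising errorListener are on the classpath; the commented outputs are expectations, not verified results:

  // Sketch only: lex two snippets coarsely, then replay the bare token
  // types through Python3Parser.file_input() to check syntactic validity
  // (ANTLR parses on token types, so the original text is not needed).
  fun main() {
    val broke = "def f(x) return x"   // missing colon
    val fixed = "def f(x): return x"
    println(broke.lexToStrTypesAsPython().joinToString(" ")) // display names
    println(fixed.lexToIntTypesAsPython().joinToString(" ")) // integer type ids
    println(broke.lexToIntTypesAsPython().toIntArray().isValidPython()) // expect false
    println(fixed.lexToIntTypesAsPython().toIntArray().isValidPython()) // expect true
  }

This is also why the printed diffs above pair each snippet's display names with its integer token types: the two views are the string- and int-typed outputs of the same lexer pass.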