diff --git a/src/main/kotlin/edu/mcgill/cstk/experiments/repair/CollectSummaryStatistics.kt b/src/main/kotlin/edu/mcgill/cstk/experiments/repair/CollectSummaryStatistics.kt index 1fde67a..435bb91 100644 --- a/src/main/kotlin/edu/mcgill/cstk/experiments/repair/CollectSummaryStatistics.kt +++ b/src/main/kotlin/edu/mcgill/cstk/experiments/repair/CollectSummaryStatistics.kt @@ -37,7 +37,8 @@ fun main() { // computeLevDistDistribution() // fetchLevenshteinAlignment() // collectPCFGQuintuples() - collectNaturallySmallRepairs() +// collectNaturallySmallRepairs() + collectSyntheticRepairs() // collectPairwisePythonRepairs() // println(naturallySmallRepairs.map { it.second }.joinToString("\n").parseAndCountActiveSymbols().alsoCopy()) // estimateLevenshteinDistanceDistribution() @@ -172,6 +173,27 @@ fun estimateLevenshteinDistanceDistribution() { } } +fun String.addNewLineIfMissing() = if (endsWith("NEWLINE")) this else "$this NEWLINE" + +fun collectSyntheticRepairs() { + preprocessStackOverflowStreaming(MAX_PATCH_SIZE, 3..MAX_TOKENS) + .map { it.first } +// .map { println("CODE: $it"); it } + .map { it to it.mapToUnquotedPythonTokens() } + .filter { it.second.tokenizeByWhitespace().size in 3..MAX_TOKENS } + .flatMap { (goodCode, goodCodeTks) -> +// println("GOOD CODE: $goodCode") + val goodCodeNewline = goodCodeTks.addNewLineIfMissing() + (0..100).map { goodCodeTks.syntheticallyCorrupt() }.filter { !it.isValidPython() } +// .onEach { println("BAD CODE: ${levenshteinAlign(goodCodeTks, it.mapToUnquotedPythonTokens()).paintANSIColors()}") } + .map { it.mapToUnquotedPythonTokens().addNewLineIfMissing() } + .filter { levenshtein(goodCodeTks, it) in 1..MAX_PATCH_SIZE } + .distinct().shuffled().take(10).map { it.addNewLineIfMissing() to goodCodeNewline }.toList() +// .also { println("Size: ${it.size}") } + .stream() + }.forEach { (a, c) -> println("$a\n$c\n") } +} + // Takes ~1.5 hrs to run on M1 serially, ~17 mins w/ parallel streaming fun collectNaturallySmallRepairs() { MAX_PATCH_SIZE = 6 diff --git a/src/main/kotlin/edu/mcgill/cstk/experiments/repair/PythonBarHillelRepair.kt b/src/main/kotlin/edu/mcgill/cstk/experiments/repair/PythonBarHillelRepair.kt index ffd24b5..98fd9fe 100644 --- a/src/main/kotlin/edu/mcgill/cstk/experiments/repair/PythonBarHillelRepair.kt +++ b/src/main/kotlin/edu/mcgill/cstk/experiments/repair/PythonBarHillelRepair.kt @@ -25,11 +25,13 @@ fun main() { MAX_TOKENS = 79 // MAX_RADIUS = 3 CFG_THRESH = 10_000 -// evaluateBarHillelRepairOnStackOverflow() + evaluateBarHillelRepairOnStackOverflow() // evaluateSeq2ParseRepair() - evaluateBIFIRepair() +// evaluateBIFIRepair() } +val LEN_BUCKET_INTERVAL = 5 + fun readPCFG3() = File(File("").absolutePath + "/src/main/resources/models/pcfg3_BIFI.csv").readText() .lines().map { it.split(" ::: ") }.associate { Pair(it[0].split(" ").let { it[0] to it[1] to it[2] }, it[1].toInt()) } @@ -42,8 +44,8 @@ fun readPCFG5(s2pg: CFG) = .let { hash(it[0], it[1], it[2], it[3], it[4]) }, it[1].toInt()) } fun evaluateBarHillelRepairOnStackOverflow() { - val dataset = sizeAndDistBalancedRepairsUnminimized.toList() - // corruptedBIFIGoodCode // balancedSmallRepairsUnminimized.toList() // naturallySmallRepairs //pairwiseUniformAll + val dataset = corruptedBIFIGoodCode//sizeAndDistBalancedRepairsUnminimized.toList() +// corruptedBIFIGoodCode // balancedSmallRepairsUnminimized.toList() // naturallySmallRepairs //pairwiseUniformAll val allRate = LBHMetrics() val levRates = mutableMapOf() val sampleTimeByLevDist = (1..MAX_RADIUS).associateWith { 0.0 }.toMutableMap() @@ -72,6 +74,7 @@ fun evaluateBarHillelRepairOnStackOverflow() { println() val P_1ByLevDist = mutableMapOf, S2PMetrics>() + val P_AllByLevDist = mutableMapOf, S2PMetrics>() dataset.forEach { (invalid, valid) -> val allTime = TimeSource.Monotonic.markNow() @@ -81,8 +84,9 @@ fun evaluateBarHillelRepairOnStackOverflow() { val source = toRepair.joinToString(" ").also { println("Source: $it") } val levAlign = levenshteinAlign(toRepair, humanRepair) val levDist = levAlign.patchSize() - val lenBucket = (toRepair.size / 10) * 10 + val lenBucket = (toRepair.size / LEN_BUCKET_INTERVAL) * LEN_BUCKET_INTERVAL P_1ByLevDist.getOrPut(lenBucket to levDist) { S2PMetrics() }.total++ + P_AllByLevDist.getOrPut(lenBucket to levDist) { S2PMetrics() }.total++ var levBallSize = 1 val humanRepairANSI = levenshteinAlign(toRepair, humanRepair).paintANSIColors() @@ -143,8 +147,10 @@ fun evaluateBarHillelRepairOnStackOverflow() { } val rankedResults = results.mostLikely.entries.map { it.value } - val indexOfTarget = rankedResults.indexOf(target) - .also { if (it == 0) P_1ByLevDist.getOrPut(lenBucket to levDist) { S2PMetrics() }.top1++ } + val indexOfTarget = rankedResults.indexOf(target).also { + if (it == 0) P_1ByLevDist.getOrPut(lenBucket to levDist) { S2PMetrics() }.top1++ + if (matchFound) P_AllByLevDist.getOrPut(lenBucket to levDist) { S2PMetrics() }.top1++ + } println("Top1 scoring repair: ${levenshteinAlign(toRepair, rankedResults.first().tokenizeByWhitespace()).paintANSIColors()}") if (indexOfTarget < 0) { @@ -169,9 +175,10 @@ fun evaluateBarHillelRepairOnStackOverflow() { println("Found length-$levDist repair in $elapsed ms, $allElapsed ms," + " $totalSamples samples, ${intGram.size} prods, $langSize trees, $indexOfTarget rank")//, rank: ${rankedResults.indexOf(target) + 1} / ${rankedResults.size}") allRate.run { println("Lev(*): $allRate") }; println(levRates.summarize()) - sampleTimeByLevDist[levDist] = sampleTimeByLevDist[levDist]!! + elapsed +// sampleTimeByLevDist[levDist] = sampleTimeByLevDist[levDist]!! + elapsed + sampleTimeByLevDist[levDist] = (sampleTimeByLevDist[levDist] ?: 0.0) + elapsed println("Draw timings (ms): ${sampleTimeByLevDist.mapValues { it.value / allRate.recall }}") - allTimeByLevDist[levDist] = allTimeByLevDist[levDist]!! + allElapsed + allTimeByLevDist[levDist] = (allTimeByLevDist[levDist] ?: 0.0) + allElapsed println("Full timings (ms): ${allTimeByLevDist.mapValues { it.value / allRate.recall }}") samplesBeforeMatchByLevDist[levDist] = samplesBeforeMatchByLevDist[levDist]!! + totalSamples.get() println("Avg samples drawn: ${samplesBeforeMatchByLevDist.mapValues { it.value / allRate.recall }}") @@ -179,7 +186,11 @@ fun evaluateBarHillelRepairOnStackOverflow() { "$totalSamples, ${levBallSize}, ${intGram.size}, $langSize, $indexOfTarget, ${levAlign.summarize()}\n") } + println() + println("Precision@1\n===========") println(P_1ByLevDist.summarizeLenAndDist()) + println("Precision@All\n=============") + println(P_AllByLevDist.summarizeLenAndDist()) println() } } @@ -262,9 +273,11 @@ val corruptedBIFIGoodCode by lazy { .filter { it.tokenizeByWhitespace().size in 3..MAX_TOKENS } .flatMap { goodCodeTks -> val goodCode = "$goodCodeTks NEWLINE" - goodCode.corruptPythonSnippet() + goodCode.corruptPythonSnippet().distinct() .filter { - it.tokenizeByWhitespace().all { it in vanillaS2PCFG.terminals } && + val tks = it.tokenizeByWhitespace() + levenshtein(goodCode, it) <= MAX_RADIUS && + tks.all { it in vanillaS2PCFG.terminals } && it !in vanillaS2PCFG.language } .take(10).map { it to goodCode } diff --git a/src/main/kotlin/edu/mcgill/cstk/experiments/repair/PythonSnippetRepair.kt b/src/main/kotlin/edu/mcgill/cstk/experiments/repair/PythonSnippetRepair.kt index 4b52561..166df60 100644 --- a/src/main/kotlin/edu/mcgill/cstk/experiments/repair/PythonSnippetRepair.kt +++ b/src/main/kotlin/edu/mcgill/cstk/experiments/repair/PythonSnippetRepair.kt @@ -471,8 +471,8 @@ fun preprocessStackOverflowStreaming( val mftks = minfix.mapToUnquotedPythonTokens() val bktks = broke.mapToUnquotedPythonTokens() - levenshtein(bktks, mftks) <= maxPatchSize && minfix.isValidPython() && - "$mftks NEWLINE" in seq2parsePythonCFG.language + levenshtein(bktks, mftks) <= maxPatchSize && minfix.isValidPython() +// "$mftks NEWLINE" in seq2parsePythonCFG.language } .filter { (broke, minfix) -> @@ -542,6 +542,16 @@ fun bifiFix( } catch (e: Exception) { "ERROR (${e.message}):\n$brokenCode" } .let { if (k == 1) listOf(it) else it.split("\n") } +fun bifiBreak( + brokenCode: String, + k: Int = 1, + prefix: String = "http://127.0.0.1:5000/api/text?k=$k&bifi_break=" +): List = + try { + URL("$prefix${URLEncoder.encode(brokenCode,"UTF-8")}").readText() + } catch (e: Exception) { "ERROR (${e.message}):\n$brokenCode" } + .let { if (k == 1) listOf(it) else it.split("\n") } + fun bifiTokenize( code: String, prefix: String = "http://127.0.0.1:5000/api/text?bifi_tokenize="