Skip to content

Commit

Permalink
synthetic corruption experiments
Browse files Browse the repository at this point in the history
  • Loading branch information
breandan committed Mar 29, 2024
1 parent a3d9e11 commit fa3a258
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ fun main() {
// computeLevDistDistribution()
// fetchLevenshteinAlignment()
// collectPCFGQuintuples()
collectNaturallySmallRepairs()
// collectNaturallySmallRepairs()
collectSyntheticRepairs()
// collectPairwisePythonRepairs()
// println(naturallySmallRepairs.map { it.second }.joinToString("\n").parseAndCountActiveSymbols().alsoCopy())
// estimateLevenshteinDistanceDistribution()
Expand Down Expand Up @@ -172,6 +173,27 @@ fun estimateLevenshteinDistanceDistribution() {
}
}

/**
 * Returns the receiver unchanged when it already ends with the literal text
 * `NEWLINE`; otherwise returns the receiver followed by a single space and `NEWLINE`.
 */
fun String.addNewLineIfMissing() = when {
  endsWith("NEWLINE") -> this
  else -> "$this NEWLINE"
}

/**
 * Prints synthetic (corrupted, original) token-string pairs to standard output.
 *
 * For every snippet streamed from [preprocessStackOverflowStreaming] the first pair
 * component is mapped to its unquoted Python token string. Snippets whose
 * whitespace-separated token count falls outside `3..MAX_TOKENS` are dropped.
 * Each remaining snippet is corrupted 101 times (`0..100` is inclusive), and a
 * corruption is kept only if it is not valid Python and its Levenshtein distance
 * to the original is in `1..MAX_PATCH_SIZE`. Up to 10 distinct corruptions per
 * snippet, chosen in random order, are printed as the corrupted line, the original
 * line, and a blank line.
 */
fun collectSyntheticRepairs() {
  preprocessStackOverflowStreaming(MAX_PATCH_SIZE, 3..MAX_TOKENS)
    .map { it.first }
    .map { it to it.mapToUnquotedPythonTokens() }
    .filter { it.second.tokenizeByWhitespace().size in 3..MAX_TOKENS }
    .flatMap { (_, goodCodeTks) ->
      val goodCodeNewline = goodCodeTks.addNewLineIfMissing()
      (0..100).map { goodCodeTks.syntheticallyCorrupt() }
        .filter { !it.isValidPython() }
        .map { it.mapToUnquotedPythonTokens().addNewLineIfMissing() }
        // Measure the distance between the two NEWLINE-normalised forms: comparing
        // against the raw goodCodeTks would count the appended NEWLINE as an edit.
        .filter { levenshtein(goodCodeNewline, it) in 1..MAX_PATCH_SIZE }
        .distinct().shuffled().take(10)
        // Each corruption was already NEWLINE-terminated above, so it is paired as-is.
        .map { it to goodCodeNewline }
        .stream()
    }.forEach { (a, c) -> println("$a\n$c\n") }
}

// Takes ~1.5 hrs to run on M1 serially, ~17 mins w/ parallel streaming
fun collectNaturallySmallRepairs() {
MAX_PATCH_SIZE = 6
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,13 @@ fun main() {
MAX_TOKENS = 79
// MAX_RADIUS = 3
CFG_THRESH = 10_000
// evaluateBarHillelRepairOnStackOverflow()
evaluateBarHillelRepairOnStackOverflow()
// evaluateSeq2ParseRepair()
evaluateBIFIRepair()
// evaluateBIFIRepair()
}

// Width of the length buckets used when grouping metrics: a snippet's bucket is
// computed as (toRepair.size / LEN_BUCKET_INTERVAL) * LEN_BUCKET_INTERVAL.
val LEN_BUCKET_INTERVAL = 5

/**
 * Reads `src/main/resources/models/pcfg3_BIFI.csv`, resolved against the absolute
 * path of `File("")`, and builds a map from it.
 *
 * Each line is split on `" ::: "`. The first field is split on single spaces and
 * its first three parts become the key, nested as `(a to b) to c`. The second
 * field is parsed with `toInt()` and becomes the value.
 *
 * NOTE(review): a line without the `" ::: "` separator or with fewer than three
 * space-separated parts (for example a trailing blank line) would fail on the
 * indexed accesses — confirm the resource file never contains one.
 */
fun readPCFG3() =
File(File("").absolutePath + "/src/main/resources/models/pcfg3_BIFI.csv").readText()
.lines().map { it.split(" ::: ") }.associate { Pair(it[0].split(" ").let { it[0] to it[1] to it[2] }, it[1].toInt()) }
Expand All @@ -42,8 +44,8 @@ fun readPCFG5(s2pg: CFG) =
.let { hash(it[0], it[1], it[2], it[3], it[4]) }, it[1].toInt()) }

fun evaluateBarHillelRepairOnStackOverflow() {
val dataset = sizeAndDistBalancedRepairsUnminimized.toList()
// corruptedBIFIGoodCode // balancedSmallRepairsUnminimized.toList() // naturallySmallRepairs //pairwiseUniformAll
val dataset = corruptedBIFIGoodCode//sizeAndDistBalancedRepairsUnminimized.toList()
// corruptedBIFIGoodCode // balancedSmallRepairsUnminimized.toList() // naturallySmallRepairs //pairwiseUniformAll
val allRate = LBHMetrics()
val levRates = mutableMapOf<Int, LBHMetrics>()
val sampleTimeByLevDist = (1..MAX_RADIUS).associateWith { 0.0 }.toMutableMap()
Expand Down Expand Up @@ -72,6 +74,7 @@ fun evaluateBarHillelRepairOnStackOverflow() {
println()

val P_1ByLevDist = mutableMapOf<Pair<Int, Int>, S2PMetrics>()
val P_AllByLevDist = mutableMapOf<Pair<Int, Int>, S2PMetrics>()

dataset.forEach { (invalid, valid) ->
val allTime = TimeSource.Monotonic.markNow()
Expand All @@ -81,8 +84,9 @@ fun evaluateBarHillelRepairOnStackOverflow() {
val source = toRepair.joinToString(" ").also { println("Source: $it") }
val levAlign = levenshteinAlign(toRepair, humanRepair)
val levDist = levAlign.patchSize()
val lenBucket = (toRepair.size / 10) * 10
val lenBucket = (toRepair.size / LEN_BUCKET_INTERVAL) * LEN_BUCKET_INTERVAL
P_1ByLevDist.getOrPut(lenBucket to levDist) { S2PMetrics() }.total++
P_AllByLevDist.getOrPut(lenBucket to levDist) { S2PMetrics() }.total++

var levBallSize = 1
val humanRepairANSI = levenshteinAlign(toRepair, humanRepair).paintANSIColors()
Expand Down Expand Up @@ -143,8 +147,10 @@ fun evaluateBarHillelRepairOnStackOverflow() {
}

val rankedResults = results.mostLikely.entries.map { it.value }
val indexOfTarget = rankedResults.indexOf(target)
.also { if (it == 0) P_1ByLevDist.getOrPut(lenBucket to levDist) { S2PMetrics() }.top1++ }
val indexOfTarget = rankedResults.indexOf(target).also {
if (it == 0) P_1ByLevDist.getOrPut(lenBucket to levDist) { S2PMetrics() }.top1++
if (matchFound) P_AllByLevDist.getOrPut(lenBucket to levDist) { S2PMetrics() }.top1++
}
println("Top1 scoring repair: ${levenshteinAlign(toRepair, rankedResults.first().tokenizeByWhitespace()).paintANSIColors()}")

if (indexOfTarget < 0) {
Expand All @@ -169,17 +175,22 @@ fun evaluateBarHillelRepairOnStackOverflow() {
println("Found length-$levDist repair in $elapsed ms, $allElapsed ms," +
" $totalSamples samples, ${intGram.size} prods, $langSize trees, $indexOfTarget rank")//, rank: ${rankedResults.indexOf(target) + 1} / ${rankedResults.size}")
allRate.run { println("Lev(*): $allRate") }; println(levRates.summarize())
sampleTimeByLevDist[levDist] = sampleTimeByLevDist[levDist]!! + elapsed
// sampleTimeByLevDist[levDist] = sampleTimeByLevDist[levDist]!! + elapsed
sampleTimeByLevDist[levDist] = (sampleTimeByLevDist[levDist] ?: 0.0) + elapsed
println("Draw timings (ms): ${sampleTimeByLevDist.mapValues { it.value / allRate.recall }}")
allTimeByLevDist[levDist] = allTimeByLevDist[levDist]!! + allElapsed
allTimeByLevDist[levDist] = (allTimeByLevDist[levDist] ?: 0.0) + allElapsed
println("Full timings (ms): ${allTimeByLevDist.mapValues { it.value / allRate.recall }}")
samplesBeforeMatchByLevDist[levDist] = samplesBeforeMatchByLevDist[levDist]!! + totalSamples.get()
println("Avg samples drawn: ${samplesBeforeMatchByLevDist.mapValues { it.value / allRate.recall }}")
positive.appendText("${toRepair.size}, $levDist, $elapsed, $allElapsed, " +
"$totalSamples, ${levBallSize}, ${intGram.size}, $langSize, $indexOfTarget, ${levAlign.summarize()}\n")
}

println()
println("Precision@1\n===========")
println(P_1ByLevDist.summarizeLenAndDist())
println("Precision@All\n=============")
println(P_AllByLevDist.summarizeLenAndDist())
println()
}
}
Expand Down Expand Up @@ -262,9 +273,11 @@ val corruptedBIFIGoodCode by lazy {
.filter { it.tokenizeByWhitespace().size in 3..MAX_TOKENS }
.flatMap { goodCodeTks ->
val goodCode = "$goodCodeTks NEWLINE"
goodCode.corruptPythonSnippet()
goodCode.corruptPythonSnippet().distinct()
.filter {
it.tokenizeByWhitespace().all { it in vanillaS2PCFG.terminals } &&
val tks = it.tokenizeByWhitespace()
levenshtein(goodCode, it) <= MAX_RADIUS &&
tks.all { it in vanillaS2PCFG.terminals } &&
it !in vanillaS2PCFG.language
}
.take(10).map { it to goodCode }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -471,8 +471,8 @@ fun preprocessStackOverflowStreaming(
val mftks = minfix.mapToUnquotedPythonTokens()
val bktks = broke.mapToUnquotedPythonTokens()

levenshtein(bktks, mftks) <= maxPatchSize && minfix.isValidPython() &&
"$mftks NEWLINE" in seq2parsePythonCFG.language
levenshtein(bktks, mftks) <= maxPatchSize && minfix.isValidPython()
// "$mftks NEWLINE" in seq2parsePythonCFG.language
}
.filter { (broke, minfix) ->

Expand Down Expand Up @@ -542,6 +542,16 @@ fun bifiFix(
} catch (e: Exception) { "ERROR (${e.message}):\n$brokenCode" }
.let { if (k == 1) listOf(it) else it.split("\n") }

/**
 * Sends [brokenCode], URL-encoded as UTF-8, to the endpoint formed by appending it
 * to [prefix], and returns the response text.
 *
 * @param brokenCode the text appended (after encoding) to [prefix].
 * @param k substituted into the default [prefix] as the `k` query parameter; also
 *   selects the return shape.
 * @param prefix the URL up to and including the `bifi_break=` query key.
 * @return a single-element list holding the whole response when [k] is 1, otherwise
 *   the response split on `"\n"`. If the request throws, the response is replaced by
 *   `"ERROR (<message>):\n<brokenCode>"` before the same shaping is applied.
 */
fun bifiBreak(
  brokenCode: String,
  k: Int = 1,
  prefix: String = "http://127.0.0.1:5000/api/text?k=$k&bifi_break="
): List<String> {
  val encodedQuery = URLEncoder.encode(brokenCode, "UTF-8")
  val responseText = try {
    URL("$prefix$encodedQuery").readText()
  } catch (e: Exception) {
    // Best-effort: report the failure inline instead of propagating it.
    "ERROR (${e.message}):\n$brokenCode"
  }
  return if (k == 1) listOf(responseText) else responseText.split("\n")
}

fun bifiTokenize(
code: String,
prefix: String = "http://127.0.0.1:5000/api/text?bifi_tokenize="
Expand Down

0 comments on commit fa3a258

Please sign in to comment.