Skip to content

Commit

Permalink
train markov chain on BIFI good code
Browse files Browse the repository at this point in the history
  • Loading branch information
breandan committed Jun 25, 2023
1 parent dfeab03 commit ed7cd1b
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import ai.hypergraph.kaliningraph.hasBalancedBrackets
import com.beust.klaxon.Klaxon
import edu.mcgill.cstk.utils.*
import java.io.File
import java.util.regex.Pattern

/*
./gradlew extractRepairSamples
Expand Down Expand Up @@ -34,4 +35,16 @@ fun main() {
}

/**
 * Selection predicate for a candidate code string: delegates to
 * [isANontrivialStatementWithBalancedBrackets], passing `1` as the first
 * argument and a `statementCriteria` lambda that accepts every statement.
 *
 * NOTE(review): the expression line below appears twice in this view. It looks
 * like the removed/added pair of a diff hunk (whitespace-only change) — confirm
 * that only one copy exists in the actual file.
 */
private fun selectionCriteria(it: String) =
it.isANontrivialStatementWithBalancedBrackets(1, statementCriteria = { true })
it.isANontrivialStatementWithBalancedBrackets(1, statementCriteria = { true })

/**
 * Reads the `code_string` entries from a line-oriented JSON dump.
 *
 * Every line of [file] is loaded up front; lines that start with the
 * `"code_string": "` key prefix are then lazily wrapped in braces, parsed as a
 * one-field JSON object, and the value stored under `code_string` is emitted.
 * Lines whose parsed object yields no string for that key are dropped.
 *
 * @param filename path used to build the default [file].
 * @param file the file to read; defaults to `File(filename)`.
 * @return a sequence of the extracted `code_string` values, in file order.
 */
fun readBIFIContents(
  filename: String = "bifi/data/orig_good_code/orig.good.json",
  file: File = File(filename)
): Sequence<String> {
  // NOTE(review): the prefix has to match the dump's leading whitespace exactly — confirm
  // against the data file, since whitespace may not have survived in this view.
  val codeStringPrefix = " \"code_string\": \""

  // readLines() runs eagerly here, so I/O failures surface at call time; only the
  // filtering and JSON parsing below are deferred until the sequence is consumed.
  val allLines = file.readLines()

  return allLines.asSequence()
    .filter { line -> line.startsWith(codeStringPrefix) }
    .mapNotNull { line ->
      // Wrap the bare `"key": "value"` line in braces so it parses as a JSON object.
      Klaxon().parseJsonObject("{$line}".reader()).string("code_string")
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ val P_seq2parse: MarkovChain<Σᐩ> by lazy {

val P_stackoverflow: MarkovChain<Σᐩ> by lazy {
measureTimedValue {
readContents("parse_fixes.json").asStream().parallel()
readBIFIContents().take(100_000).asStream().parallel()
.map { "\n$it\n".lexToStrTypesAsPython().asSequence().toMarkovChain(4) }
.reduce { t, u -> t + u }.get()
}.let { println("Trained Markov chain on ${it.value.counter.total.get()} tokens StackOverflow in ${it.duration.inWholeMilliseconds}ms"); it.value }
Expand Down

0 comments on commit ed7cd1b

Please sign in to comment.