Skip to content

Commit

Permalink
add BOS/EOS and laplace smoothing
Browse files (browse the repository at this point in the history)
  • Loading branch information
breandan committed Jun 26, 2023
1 parent 0dcf320 commit 58aca74
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 4 deletions.
2 changes: 1 addition & 1 deletion galoisenne
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ val P_seq2parse: MarkovChain<Σᐩ> by lazy {
val P_BIFI: MarkovChain<Σᐩ> by lazy {
measureTimedValue {
readBIFIContents().take(100_000).asStream().parallel()
- .map { "\n$it\n".lexToStrTypesAsPython().asSequence().toMarkovChain(4) }
+ .map { "\n$it\n".lexToStrTypesAsPython().let { listOf("BOS") + it + "EOS" }.asSequence().toMarkovChain(4) }
.reduce { t, u -> t + u }.get()
}.let { println("Trained Markov chain on ${it.value.counter.total.get()} tokens StackOverflow in ${it.duration.inWholeMilliseconds}ms"); it.value }
}
Expand Down Expand Up @@ -225,7 +225,7 @@ class MultiRankStats {

fun evaluateTidyparseOnStackoverflow() {
// val errDeck = pythonErrProbs.expandByFrequency(10)
- val topTokens = P_BIFI.topK(200).map { it.first } + "ε" // + errDeck
+ val topTokens = P_BIFI.topK(200).map { it.first } + "ε" - "BOS" - "EOS"// + errDeck
println("Top tokens: $topTokens")

val multiRankStats = MultiRankStats()
Expand Down Expand Up @@ -260,7 +260,7 @@ fun evaluateTidyparseOnStackoverflow() {
admissibilityFilter = { map { pythonVocabBindex.getUnsafe(it) ?: it.toInt() }.isValidPython() },
// TODO: incorporate parseable segmentations into scoring mechanism to prioritize chokepoint repairs
// TODO: only score the locations that are actually being modified to avoid redundant work
- scoreEdit = { P_BIFI.score(it) }
+ scoreEdit = { P_BIFI.score(listOf("BOS") + it + "EOS") }
).also { repairs ->
repairs.take(20).apply { println("\nTop $size repairs:\n") }.forEach {
println("Δ=${it.scoreStr()} repair (${it.elapsed()}): ${prettyDiffNoFrills(coarseBrokeStr, it.resToStr())}")
Expand Down

0 comments on commit 58aca74

Please sign in to comment.