Skip to content

Commit

Permalink
use string convolution to compute markov chain context
Browse files Browse the repository at this point in the history
  • Loading branch information
breandan committed May 21, 2023
1 parent 1db1d6d commit f80a916
Showing 1 changed file with 5 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,13 @@ fun main() {
}

// Builds a scoring function that maps a string (Σᐩ) to a Double by scoring its
// whitespace-delimited tokens, wrapped in BOS/EOS markers, against a Markov chain P.
//
// NOTE(review): this span is a rendered diff without +/- markers, so the pre- and
// post-commit lines are interleaved. Taken literally, it declares `val P` twice and
// has a second `return` after the first. The old/new labels below are inferred from
// the "5 additions and 5 deletions" header and the commit title — confirm against
// the raw diff.
private fun constructScoringFunction(): (Σᐩ) -> Double {
// (apparently the removed version) One chain per line of coarsenedKotlinLines, built
// directly from the token sequence and merged with `+` into a MarkovChain<Σᐩ>.
val P = coarsenedKotlinLines.lines().map { "BOS $it EOS" }
.map { it.tokenizeByWhitespace().asSequence().toMarkovChain(3) }
.fold(MarkovChain<Σᐩ>()) { a, b -> a + b }
// (apparently the added version) Examples now come from fetchKotlinExamples(), and the
// token sequence is passed through windowed(3) before toMarkovChain(3), so the chain's
// element type becomes List<Σᐩ> instead of Σᐩ.
val P = fetchKotlinExamples().map { "BOS $it EOS" }
.map { it.tokenizeByWhitespace().asSequence().windowed(3).toMarkovChain(3) }
.fold(MarkovChain<List<Σᐩ>>()) { a, b -> a + b }

// Prints the result of P.topK(10).
// NOTE(review): with the List<Σᐩ> chain these entries are presumably token windows
// rather than single tokens, so the "tokens" label may be stale — confirm.
println("Top 10 most common tokens: ${P.topK(10)}\n\n")

// (apparently the removed version) Scores the BOS/EOS-wrapped token list directly.
// coarsenAsKotlin(false) selects its tokenizeByWhitespace() branch rather than lexAsKotlin().
return { P.score("BOS ${it.coarsenAsKotlin(false)} EOS".tokenizeByWhitespace()) }
// (apparently the added version) Applies the same windowed(3) step to the query tokens
// before scoring, matching how the List<Σᐩ> chain above was built.
return { P.score("BOS ${it.coarsenAsKotlin(false)} EOS".tokenizeByWhitespace().windowed(3)) }
}

// Get top level directory and all Kotlin files in all subdirectories
Expand All @@ -76,7 +76,7 @@ fun fetchKotlinExamples() =
it.coarsenAsKotlin().let { str ->
dropKeywords.none { it in str } && str.split(" ").size in 10..40
}
}.map { it.trim() }.distinct().forEach { println(it) }
}.map { it.trim() }.distinct()

fun Σᐩ.coarsenAsKotlin(lex: Boolean = true): Σᐩ =
(if(lex) lexAsKotlin() else tokenizeByWhitespace()).joinToString(" ") {
Expand Down

0 comments on commit f80a916

Please sign in to comment.