Skip to content

Commit

Permalink
fix deserialization
Browse files Browse the repository at this point in the history
  • Loading branch information
breandan committed Feb 20, 2024
1 parent 1ee4a6e commit 8d8e4ce
Show file tree
Hide file tree
Showing 11 changed files with 71,020 additions and 37 deletions.
2 changes: 1 addition & 1 deletion galoisenne
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import ai.hypergraph.kaliningraph.repair.*
import ai.hypergraph.kaliningraph.repair.Edit
import ai.hypergraph.kaliningraph.tokenizeByWhitespace
import ai.hypergraph.kaliningraph.visualization.alsoCopy
import ai.hypergraph.markovian.mcmc.toMarkovChain
import com.google.common.util.concurrent.AtomicLongMap
import edu.mcgill.cstk.utils.*
import java.io.File
Expand Down Expand Up @@ -38,7 +39,6 @@ fun main() {
// testContextEditIssue()
}


fun paperExample() {
val broken = "f = f.f(1:, 1:)"
val brokeLexed = broken.mapToUnquotedPythonTokens() + " NEWLINE"
Expand Down Expand Up @@ -588,4 +588,16 @@ fun String.reformatCSVIntoPrettyColumns(): String {

// Reassemble the lines and then the entire string
return linesByColumns.joinToString("\n") { it.joinToString(" , ") }
}

// Trains a MARKOV_MEMORY-order Markov chain over a sequence of code snippets and
// serializes the resulting n-gram counts to [csv].
// Receiver: each Σᐩ element is treated as one snippet to be lexed and counted.
fun Sequence<Σᐩ>.train(csv: File) {
measureTimedValue {
println("Training $MARKOV_MEMORY Markov chain")
// Fan the snippets out over a parallel Java stream; each snippet is lexed into
// whitespace-separated tokens (presumably Python tokens — helper is project-local,
// TODO confirm) wrapped in BOS/EOS sentinels, then folded into a per-snippet chain.
asStream().parallel().map {
"\n$it\n".mapToUnquotedPythonTokens().let { "BOS $it EOS" }
.tokenizeByWhitespace().asSequence().toMarkovChain(MARKOV_MEMORY)
// Merge all per-snippet chains with MarkovChain's `+`, block on the Optional result,
// then persist the combined model as CSV.
}.reduce { t, u -> t + u }.get()
.also { csv.also { println("Writing CSV to ${it.absolutePath}") }.writeText(it.toCSV()) }
// NOTE(review): the message hard-codes "PY150 tokens" but this extension is
// corpus-agnostic (likely copy-pasted from the PY150 trainer) — confirm intent.
}.let { println("Trained $MARKOV_MEMORY-gram Markov chain on ${it.value.counter.total.get()} " +
"PY150 tokens in ${it.duration.inWholeSeconds}s"); it.value }
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import kotlin.to
*/
fun main() {
// MAX_TOKENS = 20
// MAX_RADIUS = 2
evaluateBarHillelRepair()
// evaluateSeq2ParseRepair()
// println(balancedSmallRepairs.toList().size)
Expand All @@ -34,13 +35,11 @@ fun evaluateBarHillelRepair() {
val sampleTimeByLevDist = (1..MAX_RADIUS).associateWith { 0.0 }.toMutableMap()
val allTimeByLevDist = (1..MAX_RADIUS).associateWith { 0.0 }.toMutableMap()
val samplesBeforeMatchByLevDist = (1..MAX_RADIUS).associateWith { 0.0 }.toMutableMap()
// val s2pg = vanillaS2PCFG // Original grammar, including all productions
val s2pg = vanillaS2PCFG // Minimized grammar, with rare productions removed
val s2pg = vanillaS2PCFG
val parikhMap = s2pg.parikhMap
// assert(validLexedPythonStatements.lines().all { it in s2pg.language })

val dataset = balancedSmallRepairs.toList() // naturallySmallRepairs //pairwiseUniformAll
.also { println("Evaluating Bar-Hillel repair on ${it.size} repairs...") }
println("Running Bar-Hillel repair on Python snippets with $NUM_CORES cores")
dataset.first().second.let { P_BIFI_PY150.score("BOS NEWLINE $it EOS".tokenizeByWhitespace()) }

Expand Down Expand Up @@ -98,9 +97,8 @@ fun evaluateBarHillelRepair() {
val timeout = (TIMEOUT_MS / 1000).seconds
// val results = mutableListOf<Σᐩ>()
var elapsed = clock.elapsedNow().inWholeMilliseconds
val results = ConcurrentRankedProbabilisticSet<Σᐩ>(100_000)
intGram
.sampleDirectlyWOR(stoppingCriterion = { clock.elapsedNow() < timeout })
val results = ConcurrentRankedProbabilisticSet<Σᐩ>(10_000)
intGram.sampleDirectlyWOR(stoppingCriterion = { clock.elapsedNow() < timeout })
.distinct().forEach {
totalSamples.incrementAndGet()
if (it == target) { matchFound = true; elapsed = clock.elapsedNow().inWholeMilliseconds }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,47 +47,22 @@ val P_seq2parse: MarkovChain<Σᐩ> by lazy {
}.let { println("Trained Markov chain on ${it.value.counter.total.get()} Seq2Parse tokens in ${it.duration.inWholeMilliseconds}ms"); it.value }
}

const val bifi_filename = "src/main/resources/datasets/python/bifi/data/orig_good_code/orig.good.json"
val home_prefix = if (NUM_CORES < 20) "/Users/breandan/IdeaProjects/gym-fs" else "/scratch/b/bengioy/breandan"
val bifi_filenameCC = "$home_prefix/bifi/data/orig_good_code/orig.good.cc.json"
const val MARKOV_MEMORY = 4

// Python3 snippets
// https://github.com/michiyasunaga/BIFI?tab=readme-ov-file#about-the-github-python-dataset
val P_BIFI: MarkovChain<Σᐩ> by lazy {
val csv = File("$home_prefix/ngrams_BIFI_$MARKOV_MEMORY.csv")
if (csv.exists()) MarkovChain.deserialize(csv.readText())
val csv = File(File("").absolutePath + "/src/main/resources/models/ngrams_BIFI_$MARKOV_MEMORY.csv")
MarkovChain.deserialize(csv.readText())
.also { println("Loaded ${it.counter.total} BIFI $MARKOV_MEMORY-grams from ${csv.absolutePath}") }
else measureTimedValue {
println("Training BIFI-$MARKOV_MEMORY Markov chain...")
val numToks = 100_000.let { if (NUM_CORES < 20) it else Int.MAX_VALUE }
// If running on Compute Canada, use the larger dataset
val file: File = File(bifi_filenameCC).let { if (it.exists()) it else File(bifi_filename) }
readBIFIContents(file = file).take(numToks).asStream().parallel().map {
"\n$it\n".mapToUnquotedPythonTokens().let { "BOS $it EOS" }
.tokenizeByWhitespace().asSequence().toMarkovChain(MARKOV_MEMORY)
}.reduce { t, u -> t + u }.get()
.also { if (20 < NUM_CORES) { csv.also { println("Writing CSV to ${it.absolutePath}") }.writeText(it.toCSV()) } }
}.let { println("Trained $MARKOV_MEMORY-gram Markov chain on ${it.value.counter.total.get()} " +
"BIFI tokens in ${it.duration.inWholeSeconds}s"); it.value }
}

// Python2 snippets, about ~20x longer on average than BIFI
// https://www.sri.inf.ethz.ch/py150
val P_PY150: MarkovChain<Σᐩ> by lazy {
val csv = File("$home_prefix/ngrams_PY150_$MARKOV_MEMORY.csv")
if (csv.exists()) MarkovChain.deserialize(csv.readText())
val csv = File(File("").absolutePath + "/src/main/resources/models/ngrams_PY150_$MARKOV_MEMORY.csv")
MarkovChain.deserialize(csv.readText())
.also { println("Loaded ${it.counter.total} PY150 $MARKOV_MEMORY-grams from ${csv.absolutePath}") }
else measureTimedValue {
println("Training PY150-$MARKOV_MEMORY Markov chain...")
val numToks = 5_000.let { if (NUM_CORES < 20) it else Int.MAX_VALUE }
readPY150Contents().take(numToks).asStream().parallel().map {
"\n$it\n".mapToUnquotedPythonTokens().let { "BOS $it EOS" }
.tokenizeByWhitespace().asSequence().toMarkovChain(MARKOV_MEMORY)
}.reduce { t, u -> t + u }.get()
.also { if (20 < NUM_CORES) { csv.also { println("Writing CSV to ${it.absolutePath}") }.writeText(it.toCSV()) } }
}.let { println("Trained $MARKOV_MEMORY-gram Markov chain on ${it.value.counter.total.get()} " +
"PY150 tokens in ${it.duration.inWholeSeconds}s"); it.value }
}

val P_BIFI_PY150: MarkovChain<Σᐩ> by lazy { P_BIFI + P_PY150 }
Expand Down
Binary file removed src/main/resources/model/dqn-trained-0000.params
Binary file not shown.
Loading

0 comments on commit 8d8e4ce

Please sign in to comment.