In [1]:
%use dataframe
%use kandy

In [24]:
// Load the sequences CSV into a DataFrame.
// NOTE(review): the path is an absolute, machine-specific Windows path, so this cell only
// runs where that file exists — adjust it when running the notebook elsewhere.
val data = DataFrame.readCSV("C:\\Users\\Winggar\\Downloads\\PhonotacticonSequences.csv")
// NOTE(review): class declarations follow in this cell, so this is not the cell's last
// expression and its result is presumably not rendered — confirm whether it is still needed.
data.columnNames()

/**
 * Category a sequence row is listed under.
 *
 * The constant names have to match the upper-cased text of the CSV `Category`
 * column, because rows are parsed with `SequenceCategory.valueOf(...)`.
 */
enum class SequenceCategory {
    ONSET,
    NUCLEUS,
    CODA,
}
/**
 * One row of the sequences CSV, built from its `Lect`, `Category`, `Sequence`,
 * `Order` and `Segment` columns.
 *
 * Declared as a `data class` so rows get structural `equals`/`hashCode` and a
 * readable `toString` when inspected in the notebook. The constructor is unchanged.
 *
 * Note: inside this notebook the name shadows `kotlin.sequences.Sequence`.
 *
 * @property lect name of the lect this row belongs to; rows are grouped by this value.
 * @property category category the sequence is listed under.
 * @property sequence text of the sequence; the analyses compare it with literals such as "h"
 *   and test its length.
 * @property order value of the `Order` column (presumably the position of [segment] within
 *   [sequence] — TODO confirm).
 * @property segment value of the `Segment` column (presumably a single segment of
 *   [sequence] — TODO confirm).
 */
data class Sequence(
    val lect: String,
    val category: SequenceCategory,
    val sequence: String,
    val order: Int,
    val segment: String,
)
/**
 * A lect together with all sequence rows recorded for it.
 *
 * Declared as a `data class` so instances compare structurally and print readably
 * in the notebook. The constructor is unchanged.
 *
 * @property name name of the lect, as found in the rows' `lect` field.
 * @property sequences every row whose `lect` equals [name].
 */
data class Lect(
    val name: String,
    val sequences: List<Sequence>,
)


In [25]:
// Turn every DataFrame row into a typed Sequence. The category text is upper-cased so it
// matches the SequenceCategory constant names; valueOf throws IllegalArgumentException
// for any value that is not one of those constants.
val rows = data.map { row ->
    Sequence(
        lect = row.Lect,
        category = SequenceCategory.valueOf(row.Category.uppercase()),
        sequence = row.Sequence,
        order = row.Order,
        segment = row.Segment,
    )
}

// Index the rows by lect name: one Lect per distinct name, holding that lect's rows
// in their original order.
val lects = rows
    .groupBy { row: Sequence -> row.lect }
    .mapValues { (name, sequences) -> Lect(name, sequences) }

In [30]:
// Count lects by where the sequence "h" is listed:
//   "H_H" = as an onset and as a coda, "H_" = onset only, "_H" = coda only, "N/A" = neither.
val hClassification = lects.values
    .groupingBy { lect ->
        // Categories under which this lect lists exactly "h".
        val hCategories = lect.sequences
            .filter { it.sequence == "h" }
            .map { it.category }
            .toSet()
        val inOnset = SequenceCategory.ONSET in hCategories
        val inCoda = SequenceCategory.CODA in hCategories
        if (inOnset) {
            if (inCoda) "H_H" else "H_"
        } else {
            if (inCoda) "_H" else "N/A"
        }
    }
    .eachCount()

// Pie chart of the "h" classification: one slice per label, sized by the number of lects
// that received that label.
plot {
    pie {
        slice(hClassification.values)
        // keys and values come from the same map, so they are passed in matching order
        fillColor(hClassification.keys)
    }
}

In [42]:
// For each lect: (number of ONSET rows whose sequence is one character long) minus
// (number of CODA rows whose sequence is one character long). The result maps each
// difference to the number of lects showing it.
val onsetCodaSingleConsonantDiff = lects.values
    .groupingBy { lect ->
        fun singleCharCount(category: SequenceCategory) =
            lect.sequences.count { it.category == category && it.sequence.length == 1 }
        singleCharCount(SequenceCategory.ONSET) - singleCharCount(SequenceCategory.CODA)
    }
    .eachCount()

// Bar chart: x = onset-minus-coda difference, y = number of lects with that difference.
plot {
    layout.title = "Difference in number of single-consonant onsets vs. codas"
    bars {
        // keys and values come from the same map, so they are passed in matching order
        x(onsetCodaSingleConsonantDiff.keys, name = "Difference: Onsets - Codas")
        y(onsetCodaSingleConsonantDiff.values, name = "Count")
    }
}

In [94]:
// Every distinct one-character sequence text found in any row, in order of first
// appearance, minus the symbols "C", "R", "N" and "V"
// (presumably cover symbols rather than actual phonemes — TODO confirm).
val allSinglePhonemes = rows
    .map { it.sequence }
    .filter { it.length == 1 }
    .toSet()
    .minus(setOf("C", "R", "N", "V"))
    .toList()
// For every lect with more than 3 CODA rows, label each phoneme of allSinglePhonemes by
// where it occurs as a substring of that lect's sequences:
//   "both", "onset-only", "coda-only", or "N/A" when it occurs in neither.
// NOTE(review): the "> 3" cut-off presumably drops lects with too little coda data — confirm.
val lectsToPhonemeOnsetVsCoda = lects
    .filterValues { lect -> lect.sequences.count { it.category == SequenceCategory.CODA } > 3 }
    .mapValues { (_, lect) ->
        fun textsOf(category: SequenceCategory) =
            lect.sequences.filter { it.category == category }.map { it.sequence }

        val onsets = textsOf(SequenceCategory.ONSET)
        val codas = textsOf(SequenceCategory.CODA)
        allSinglePhonemes.associateWith { phoneme ->
            // Substring match: a phoneme also counts when it is part of a longer sequence.
            val inOnset = onsets.any { it.contains(phoneme) }
            val inCoda = codas.any { it.contains(phoneme) }
            if (inOnset) {
                if (inCoda) "both" else "onset-only"
            } else {
                if (inCoda) "coda-only" else "N/A"
            }
        }
    }

// For each phoneme: how many lects received each label
// ("both", "onset-only", "coda-only", "N/A") in lectsToPhonemeOnsetVsCoda.
val phonemesToOnsetCodaLectResults = allSinglePhonemes.associateWith { phoneme ->
    lectsToPhonemeOnsetVsCoda.values
        // Every per-lect map is keyed by allSinglePhonemes, so the lookup always succeeds;
        // getValue states that expectation without `!!` and fails with a descriptive
        // NoSuchElementException if it is ever violated.
        .groupingBy { lectResults -> lectResults.getValue(phoneme) }
        .eachCount()
}

// Long-format table of absolute lect counts per phoneme and category.
// Phonemes that no lect has in an onset or coda are dropped; the rest are ordered by
// their total (non-"N/A") lect count, highest first.
val dataToDisplay = phonemesToOnsetCodaLectResults
    .mapValues { (_, results) -> results.filterKeys { it != "N/A" } }
    .entries
    .filter { (_, attested) -> attested.values.sum() != 0 }
    .sortedByDescending { (_, attested) -> attested.values.sum() }
    .let { ranked ->
        // Count column for one category; a missing label means zero lects.
        fun countsFor(category: String) = ranked.map { (_, attested) -> attested[category] ?: 0 }

        dataFrameOf(
            "phoneme" to ranked.map { (phoneme, _) -> phoneme },
            "both" to countsFor("both"),
            "onset-only" to countsFor("onset-only"),
            "coda-only" to countsFor("coda-only"),
        ).gather("both", "onset-only", "coda-only").into("categories", "lect count")
    }

// Stacked bar chart of absolute counts: one bar per phoneme, split by category.
dataToDisplay.plot {
    layout.title = "Phoneme Presence in Lect Onsets and Codas"
    layout.size = 1500 to 900

    bars {
        x("phoneme")
        y("lect count")
        // Fixed colour per category, so this chart and the normalized one use the same legend.
        // (The hex values appear to be taken from the Okabe-Ito colour-blind-safe palette.)
        fillColor("categories") {
            scale = categorical(
                "both" to Color.hex("#0072B2"),
                "onset-only" to Color.hex("#D55E00"),
                "coda-only" to Color.hex("#009E73"),
            )
        }
        // Stack the category segments on top of each other within each phoneme's bar.
        position = Position.stack()
    }
}

In [93]:
// Long-format table of per-phoneme proportions: each category's lect count divided by the
// phoneme's total (non-"N/A") lect count. Phonemes with a zero total are dropped before
// dividing; the rest are ordered by that total, highest first.
val dataToDisplay2 = phonemesToOnsetCodaLectResults
    .map { (phoneme, results) -> phoneme to results.filterKeys { it != "N/A" } }
    .filter { (_, attested) -> attested.values.sum() != 0 }
    .sortedByDescending { (_, attested) -> attested.values.sum() }
    .let { ranked ->
        // Proportion column for one category; a missing label contributes 0.0.
        fun shares(category: String) = ranked.map { (_, attested) ->
            (attested[category]?.toDouble() ?: 0.0) / attested.values.sum()
        }

        dataFrameOf(
            "phoneme" to ranked.map { (phoneme, _) -> phoneme },
            "both" to shares("both"),
            "onset-only" to shares("onset-only"),
            "coda-only" to shares("coda-only"),
        ).gather("both", "onset-only", "coda-only").into("categories", "height")
    }

// Stacked bar chart of proportions: one bar per phoneme, split by category.
dataToDisplay2.plot {
    layout.title = "Phoneme Presence in Lect Onsets and Codas Normalized"
    layout.size = 1500 to 900

    bars {
        x("phoneme")
        y("height")
        // Same category-to-colour mapping as the absolute-count chart.
        fillColor("categories") {
            scale = categorical(
                "both" to Color.hex("#0072B2"),
                "onset-only" to Color.hex("#D55E00"),
                "coda-only" to Color.hex("#009E73"),
            )
        }
        // Stack the category segments on top of each other within each phoneme's bar.
        position = Position.stack()
    }
}