In [3]:
%use dataframe

In [4]:
// Absolute path of the phoible.csv file on the machine this notebook was run on.
val phoibleCsvPath = "C:\\Users\\Winggar\\Downloads\\phoible.csv"

// Load the CSV into a DataFrame, then show its column names as the cell result.
val data = DataFrame.readCSV(phoibleCsvPath)
data.columnNames()


[InventoryID, Glottocode, ISO6393, LanguageName, SpecificDialect, GlyphID, Phoneme, Allophones, Marginal, SegmentClass, Source, tone, stress, syllabic, short, long, consonantal, sonorant, continuant, delayedRelease, approximant, tap, trill, nasal, lateral, labial, round, labiodental, coronal, anterior, distributed, strident, dorsal, high, low, front, back, tense, retractedTongueRoot, advancedTongueRoot, periodicGlottalSource, epilaryngealSource, spreadGlottis, constrictedGlottis, fortis, raisedLarynxEjective, loweredLarynxImplosive, click]

In [71]:
/** Three-way marker value: plus, minus, or neither of the two. */
enum class Signal {
    PLUS,
    MINUS,
    NONE
}

/**
 * Converts a textual marker into a [Signal].
 *
 * `"+"` yields [Signal.PLUS] and `"-"` yields [Signal.MINUS]; every other
 * string (including the empty string) yields [Signal.NONE].
 */
fun String.toSignal(): Signal = when (this) {
    "+" -> Signal.PLUS
    "-" -> Signal.MINUS
    else -> Signal.NONE
}

/**
 * Converts a single-character marker into a [Signal].
 *
 * `'+'` yields [Signal.PLUS] and `'-'` yields [Signal.MINUS]; every other
 * character yields [Signal.NONE].
 */
fun Char.toSignal(): Signal = when (this) {
    '+' -> Signal.PLUS
    '-' -> Signal.MINUS
    else -> Signal.NONE
}

/**
 * One CSV record converted into typed fields.
 *
 * Property order matters: instances are created with positional constructor
 * arguments, so this list must stay in sync with the call that builds them.
 * The [Signal] properties carry the same names as the CSV's feature columns.
 */
data class Row(
    // Language-level identification; all of these are nullable.
    val glottocode: String?,
    val ISO6393: String?,
    val name: String?,
    val specificDialect: String?,

    // Kept as text; the creating code converts the GlyphID column with toString().
    val glyphId: String,

    // The segment itself plus its nullable descriptive attributes.
    val phoneme: String,
    val allophones: String?,
    val marginal: Boolean?,
    val segmentClass: String?,
    val source: String?,

    // Feature columns, each reduced to PLUS / MINUS / NONE via toSignal().
    val tone: Signal,
    val stress: Signal,
    val syllabic: Signal,
    val short: Signal,
    val long: Signal,
    val consonantal: Signal,
    val sonorant: Signal,
    val continuant: Signal,
    val delayedRelease: Signal,
    val approximant: Signal,
    val tap: Signal,
    val trill: Signal,
    val nasal: Signal,
    val lateral: Signal,
    val labial: Signal,
    val round: Signal,
    val labiodental: Signal,
    val coronal: Signal,
    val anterior: Signal,
    val distributed: Signal,
    val strident: Signal,
    val dorsal: Signal,
    val high: Signal,
    val low: Signal,
    val front: Signal,
    val back: Signal,
    val tense: Signal,
    val retractedTongueRoot: Signal,
    val advancedTongueRoot: Signal,
    val periodicGlottalSource: Signal,
    val epilaryngealSource: Signal,
    val spreadGlottis: Signal,
    val constrictedGlottis: Signal,
    val fortis: Signal,
    val raisedLarynxEjective: Signal,
    val loweredLarynxImplosive: Signal,
    val click: Signal,
)

// Convert every DataFrame row into a typed Row. Arguments are passed by name so
// that each CSV column is visibly paired with the property it fills.
val rows = data.map { record ->
    Row(
        glottocode = record.Glottocode,
        ISO6393 = record.ISO6393,
        name = record.LanguageName,
        specificDialect = record.SpecificDialect,

        glyphId = record.GlyphID.toString(),

        phoneme = record.Phoneme,
        allophones = record.Allophones,
        marginal = record.Marginal,
        segmentClass = record.SegmentClass,
        source = record.Source,

        tone = record.tone.toSignal(),
        stress = record.stress.toSignal(),
        syllabic = record.syllabic.toSignal(),
        short = record.short.toSignal(),
        long = record.long.toSignal(),
        consonantal = record.consonantal.toSignal(),
        sonorant = record.sonorant.toSignal(),
        continuant = record.continuant.toSignal(),
        delayedRelease = record.delayedRelease.toSignal(),
        approximant = record.approximant.toSignal(),
        tap = record.tap.toSignal(),
        trill = record.trill.toSignal(),
        nasal = record.nasal.toSignal(),
        lateral = record.lateral.toSignal(),
        labial = record.labial.toSignal(),
        round = record.round.toSignal(),
        labiodental = record.labiodental.toSignal(),
        coronal = record.coronal.toSignal(),
        anterior = record.anterior.toSignal(),
        distributed = record.distributed.toSignal(),
        strident = record.strident.toSignal(),
        dorsal = record.dorsal.toSignal(),
        high = record.high.toSignal(),
        low = record.low.toSignal(),
        front = record.front.toSignal(),
        back = record.back.toSignal(),
        tense = record.tense.toSignal(),
        retractedTongueRoot = record.retractedTongueRoot.toSignal(),
        advancedTongueRoot = record.advancedTongueRoot.toSignal(),
        periodicGlottalSource = record.periodicGlottalSource.toSignal(),
        epilaryngealSource = record.epilaryngealSource.toSignal(),
        spreadGlottis = record.spreadGlottis.toSignal(),
        constrictedGlottis = record.constrictedGlottis.toSignal(),
        fortis = record.fortis.toSignal(),
        raisedLarynxEjective = record.raisedLarynxEjective.toSignal(),
        loweredLarynxImplosive = record.loweredLarynxImplosive.toSignal(),
        click = record.click.toSignal(),
    )
}
// All parsed rows grouped by their ISO6393 value, in first-seen key order.
// Rows from every source are kept together in one group; an earlier draft
// narrowed each group to the rows whose `source` equals that of the group's
// first row, but that filter is not applied.
// NOTE(review): ISO6393 is nullable, so rows without a code all land in a single
// `null` group that is counted as one language downstream - confirm this is intended.
val languages = rows.groupBy { it.ISO6393 }



In [42]:
/**
 * Fraction (0.0 to 1.0) of the groups in `languages` that contain at least one
 * row accepted by [predicate].
 *
 * The result is NaN when `languages` is empty (0 divided by 0.0).
 */
fun shareOfLanguagesWith(predicate: (Row) -> Boolean): Double =
    languages.count { (_, inventory) -> inventory.any(predicate) } / languages.size.toDouble()

// NOTE(review): the labels start with '%' but the printed values are fractions, not percentages.

// Unlike the checks below, this one does not exclude marginal segments or phonemes containing "|".
// NOTE(review): strident+ with continuant+ looks more like a test for sibilant
// fricatives than for affricates - confirm the intended feature combination.
println(
    "%affricates: ${
        shareOfLanguagesWith { it.strident == Signal.PLUS && it.continuant == Signal.PLUS }
    }"
)

println(
    "%plosives: ${
        shareOfLanguagesWith {
            it.strident == Signal.MINUS &&
                it.continuant == Signal.MINUS &&
                it.nasal == Signal.MINUS &&
                it.click == Signal.MINUS &&
                it.marginal == false &&
                !it.phoneme.contains("|")
        }
    }"
)

println(
    "%fricatives: ${
        shareOfLanguagesWith {
            it.continuant == Signal.PLUS &&
                it.nasal == Signal.MINUS &&
                it.click == Signal.MINUS &&
                it.sonorant == Signal.MINUS &&
                it.marginal == false &&
                !it.phoneme.contains("|")
        }
    }"
)

println(
    "%nasal: ${
        shareOfLanguagesWith {
            it.continuant == Signal.MINUS &&
                it.nasal == Signal.PLUS &&
                it.click == Signal.MINUS &&
                it.sonorant == Signal.PLUS &&
                it.marginal == false &&
                !it.phoneme.contains("|")
        }
    }"
)

println(
    "%semivowels: ${
        shareOfLanguagesWith {
            it.continuant == Signal.PLUS &&
                it.nasal == Signal.MINUS &&
                it.click == Signal.MINUS &&
                it.sonorant == Signal.PLUS &&
                it.approximant == Signal.PLUS &&
                it.consonantal == Signal.MINUS &&
                it.marginal == false &&
                !it.phoneme.contains("|")
        }
    }"
)

// Same test as "semivowels" except that consonantal must be PLUS.
// NOTE(review): confirm that "glides" is the intended label for consonantal+ approximants.
println(
    "%glides: ${
        shareOfLanguagesWith {
            it.continuant == Signal.PLUS &&
                it.nasal == Signal.MINUS &&
                it.click == Signal.MINUS &&
                it.sonorant == Signal.PLUS &&
                it.approximant == Signal.PLUS &&
                it.consonantal == Signal.PLUS &&
                it.marginal == false &&
                !it.phoneme.contains("|")
        }
    }"
)

%affricates: 0.8128571428571428
%plosives: 0.7219047619047619
%fricatives: 0.6290476190476191
%nasal: 0.7495238095238095
%semivowels: 0.7214285714285714
%glides: 0.6961904761904761


In [74]:
// For every distinct character that appears in any phoneme string, the keys of
// `languages` whose rows contain that character. Characters are individual
// `Char`s, so a combining diacritic or length mark is counted on its own.
// Key order is first appearance in `rows`; each value follows the order of `languages`.
val phonemeOccurrences = run {
    // Collect each language's characters once so that the per-symbol lookup below
    // is a set membership test instead of a rescan of every phoneme string.
    val symbolsByLanguage = languages.mapValues { (_, inventory) ->
        inventory.flatMapTo(HashSet<Char>()) { it.phoneme.asIterable() }
    }

    rows
        .flatMapTo(LinkedHashSet<Char>()) { it.phoneme.asIterable() }
        .associateWith { symbol ->
            symbolsByLanguage.entries
                .filter { symbol in it.value }
                .map { it.key }
        }
}

In [75]:
// Print "symbol - number of languages - share of all languages", most widespread first.
phonemeOccurrences
    .map { (symbol, codes) -> symbol to codes.size }
    .sortedByDescending { (_, languageCount) -> languageCount }
    .forEach { (symbol, languageCount) ->
        val ratio = languageCount.toDouble() / languages.size.toDouble()
        val share = "%.2f".format(ratio)
        println("$symbol - $languageCount - $share")
    }

t - 2093 - 1.00
k - 2083 - 0.99
i - 2055 - 0.98
m - 2046 - 0.97
n - 2045 - 0.97
a - 1992 - 0.95
p - 1990 - 0.95
u - 1971 - 0.94
j - 1925 - 0.92
w - 1816 - 0.86
l - 1734 - 0.83
s - 1676 - 0.80
e - 1647 - 0.78
o - 1617 - 0.77
b - 1466 - 0.70
d - 1463 - 0.70
ŋ - 1443 - 0.69
ɡ - 1346 - 0.64
h - 1254 - 0.60
r - 1175 - 0.56
ʃ - 1137 - 0.54
̠ - 1071 - 0.51
ɲ - 993 - 0.47
f - 968 - 0.46
ː - 905 - 0.43
ɛ - 896 - 0.43
ɔ - 869 - 0.41
z - 844 - 0.40
ʔ - 839 - 0.40
̪ - 791 - 0.38
ʒ - 786 - 0.37
ɾ - 635 - 0.30
v - 625 - 0.30
ə - 576 - 0.27
̃ - 517 - 0.25
˦ - 514 - 0.24
˨ - 505 - 0.24
c - 454 - 0.22
ʰ - 444 - 0.21
ʈ - 444 - 0.21
x - 433 - 0.21
ɨ - 425 - 0.20
ɟ - 393 - 0.19
ɪ - 382 - 0.18
ʷ - 380 - 0.18
̞ - 370 - 0.18
ʊ - 359 - 0.17
ɣ - 313 - 0.15
ɳ - 297 - 0.14
˧ - 286 - 0.14
ȵ - 276 - 0.13
ȶ - 276 - 0.13
β - 273 - 0.13
ɭ - 257 - 0.12
ɓ - 249 - 0.12
ɻ - 230 - 0.11
ɑ - 229 - 0.11
ʲ - 228 - 0.11
ɗ - 222 - 0.11
ɖ - 211 - 0.10
æ - 209 - 0.10
| 

In [62]:
// Name recorded on the first row of the "haw" group. The key is looked up
// directly; a missing key raises NoSuchElementException.
languages.getValue("haw").first().name

Hawaiian

In [76]:
// Names (taken from each group's first row) of the languages in which the
// character 't' never occurs. The lookup is done once and turned into a set;
// if 't' is absent from the map altogether, every language is listed.
phonemeOccurrences['t'].orEmpty().toSet().let { languagesWithT ->
    languages
        .filterKeys { it !in languagesWithT }
        .values
        .map { it.first().name }
}

[Hawaiian, CHEROKEE, JOMANG, BERTA, Abau, Talasa, Karajá]