In [2]:
%use dataframe

In [3]:
// Location of the phoneme CSV on the author's machine (absolute Windows path).
val phoibleCsvPath = "C:\\Users\\Winggar\\Downloads\\phoible.csv"

// Load the whole CSV into a DataFrame; the cell result lists its column names.
val data = DataFrame.readCSV(phoibleCsvPath)
data.columnNames()


[InventoryID, Glottocode, ISO6393, LanguageName, SpecificDialect, GlyphID, Phoneme, Allophones, Marginal, SegmentClass, Source, tone, stress, syllabic, short, long, consonantal, sonorant, continuant, delayedRelease, approximant, tap, trill, nasal, lateral, labial, round, labiodental, coronal, anterior, distributed, strident, dorsal, high, low, front, back, tense, retractedTongueRoot, advancedTongueRoot, periodicGlottalSource, epilaryngealSource, spreadGlottis, constrictedGlottis, fortis, raisedLarynxEjective, loweredLarynxImplosive, click]

In [4]:
/**
 * Three-way value of a feature column: marked `+`, marked `-`, or anything else.
 */
enum class Signal {
    PLUS,
    MINUS,
    NONE,
}

/**
 * Converts a textual feature value to a [Signal].
 *
 * Exactly `"+"` maps to [Signal.PLUS] and exactly `"-"` to [Signal.MINUS];
 * every other string (including the empty string) maps to [Signal.NONE].
 */
fun String.toSignal(): Signal {
    if (this == "+") return Signal.PLUS
    if (this == "-") return Signal.MINUS
    return Signal.NONE
}

/**
 * Converts a single-character feature value to a [Signal].
 *
 * `'+'` maps to [Signal.PLUS], `'-'` to [Signal.MINUS], any other character
 * to [Signal.NONE].
 */
fun Char.toSignal(): Signal =
    if (this == '+') Signal.PLUS
    else if (this == '-') Signal.MINUS
    else Signal.NONE

/**
 * Typed view of one row of the CSV loaded into `data`.
 *
 * The parameters follow the CSV column order shown by `data.columnNames()`,
 * starting at `Glottocode` (`InventoryID` is not carried over). Keep that order
 * stable: any positional constructor call depends on it.
 *
 * All feature columns (`tone` .. `click`) are stored as [Signal], i.e. the raw
 * `+` / `-` / other value collapsed to three states.
 */
data class Row(
    // Language identification; nullable in the source data.
    val glottocode: String?,
    val ISO6393: String?,
    // Filled from the `LanguageName` column.
    val name: String?,
    val specificDialect: String?,

    // Filled from the `GlyphID` column, converted with toString().
    val glyphId: String,

    // The segment itself plus its descriptive metadata.
    val phoneme: String,
    val allophones: String?,
    val marginal: Boolean?,
    val segmentClass: String?,
    val source: String?,

    // Feature columns, one Signal per CSV feature column, in CSV order.
    val tone: Signal,
    val stress: Signal,
    val syllabic: Signal,
    val short: Signal,
    val long: Signal,
    val consonantal: Signal,
    val sonorant: Signal,
    val continuant: Signal,
    val delayedRelease: Signal,
    val approximant: Signal,
    val tap: Signal,
    val trill: Signal,
    val nasal: Signal,
    val lateral: Signal,
    val labial: Signal,
    val round: Signal,
    val labiodental: Signal,
    val coronal: Signal,
    val anterior: Signal,
    val distributed: Signal,
    val strident: Signal,
    val dorsal: Signal,
    val high: Signal,
    val low: Signal,
    val front: Signal,
    val back: Signal,
    val tense: Signal,
    val retractedTongueRoot: Signal,
    val advancedTongueRoot: Signal,
    val periodicGlottalSource: Signal,
    val epilaryngealSource: Signal,
    val spreadGlottis: Signal,
    val constrictedGlottis: Signal,
    val fortis: Signal,
    val raisedLarynxEjective: Signal,
    val loweredLarynxImplosive: Signal,
    val click: Signal,
)

// Convert every DataFrame row into a typed Row.
//
// Named arguments are used on purpose: Row has 37 consecutive parameters of the
// same type (Signal), so a positional call would still compile if two columns
// were swapped or the constructor were reordered, silently mislabelling features.
val rows = data.map { record ->
    Row(
        glottocode = record.Glottocode,
        ISO6393 = record.ISO6393,
        name = record.LanguageName,
        specificDialect = record.SpecificDialect,

        glyphId = record.GlyphID.toString(),

        phoneme = record.Phoneme,
        allophones = record.Allophones,
        marginal = record.Marginal,
        segmentClass = record.SegmentClass,
        source = record.Source,

        // Feature columns: raw "+" / "-" / other values collapsed to Signal.
        tone = record.tone.toSignal(),
        stress = record.stress.toSignal(),
        syllabic = record.syllabic.toSignal(),
        short = record.short.toSignal(),
        long = record.long.toSignal(),
        consonantal = record.consonantal.toSignal(),
        sonorant = record.sonorant.toSignal(),
        continuant = record.continuant.toSignal(),
        delayedRelease = record.delayedRelease.toSignal(),
        approximant = record.approximant.toSignal(),
        tap = record.tap.toSignal(),
        trill = record.trill.toSignal(),
        nasal = record.nasal.toSignal(),
        lateral = record.lateral.toSignal(),
        labial = record.labial.toSignal(),
        round = record.round.toSignal(),
        labiodental = record.labiodental.toSignal(),
        coronal = record.coronal.toSignal(),
        anterior = record.anterior.toSignal(),
        distributed = record.distributed.toSignal(),
        strident = record.strident.toSignal(),
        dorsal = record.dorsal.toSignal(),
        high = record.high.toSignal(),
        low = record.low.toSignal(),
        front = record.front.toSignal(),
        back = record.back.toSignal(),
        tense = record.tense.toSignal(),
        retractedTongueRoot = record.retractedTongueRoot.toSignal(),
        advancedTongueRoot = record.advancedTongueRoot.toSignal(),
        periodicGlottalSource = record.periodicGlottalSource.toSignal(),
        epilaryngealSource = record.epilaryngealSource.toSignal(),
        spreadGlottis = record.spreadGlottis.toSignal(),
        constrictedGlottis = record.constrictedGlottis.toSignal(),
        fortis = record.fortis.toSignal(),
        raisedLarynxEjective = record.raisedLarynxEjective.toSignal(),
        loweredLarynxImplosive = record.loweredLarynxImplosive.toSignal(),
        click = record.click.toSignal(),
    )
}
// All rows grouped by ISO 639-3 code, in order of first appearance.
// The key is nullable: rows without a code all end up under the `null` key.
//
// Every row of a code is kept, whatever its `source`. To restrict a language to
// the source of its first row, append:
//     .mapValues { (_, group) -> group.filter { it.source == group.first().source } }
val languages = rows.groupBy { it.ISO6393 }


In [5]:
/**
 * Fraction of the entries in `languages` whose rows contain at least one row
 * accepted by [matches]. Yields NaN when `languages` is empty (0 / 0.0).
 */
fun shareOfLanguages(matches: (Row) -> Boolean): Double =
    languages.count { (_, inventory) -> inventory.any(matches) } / languages.size.toDouble()

/**
 * Conditions shared by every class below except "affricates": not a click,
 * explicitly non-marginal (`marginal == false`, so rows with a null flag are
 * excluded too), and no `|` in the phoneme string.
 */
fun isCountedSegment(row: Row): Boolean =
    row.click == Signal.MINUS && row.marginal == false && !row.phoneme.contains("|")

// Label -> feature test, in print order.
// NOTE(review): the labels are kept exactly as the author wrote them. Whether each
// feature bundle really selects the named class (e.g. "affricates" = +strident
// +continuant, "glides" = +consonantal approximants) should be confirmed against
// the feature system of the data set.
val segmentClasses: List<Pair<String, (Row) -> Boolean>> = listOf(
    "affricates" to { row: Row ->
        row.strident == Signal.PLUS && row.continuant == Signal.PLUS
    },
    "plosives" to { row: Row ->
        row.strident == Signal.MINUS &&
            row.continuant == Signal.MINUS &&
            row.nasal == Signal.MINUS &&
            isCountedSegment(row)
    },
    "fricatives" to { row: Row ->
        row.continuant == Signal.PLUS &&
            row.nasal == Signal.MINUS &&
            row.sonorant == Signal.MINUS &&
            isCountedSegment(row)
    },
    "nasal" to { row: Row ->
        row.continuant == Signal.MINUS &&
            row.nasal == Signal.PLUS &&
            row.sonorant == Signal.PLUS &&
            isCountedSegment(row)
    },
    "semivowels" to { row: Row ->
        row.continuant == Signal.PLUS &&
            row.nasal == Signal.MINUS &&
            row.sonorant == Signal.PLUS &&
            row.approximant == Signal.PLUS &&
            row.consonantal == Signal.MINUS &&
            isCountedSegment(row)
    },
    "glides" to { row: Row ->
        row.continuant == Signal.PLUS &&
            row.nasal == Signal.MINUS &&
            row.sonorant == Signal.PLUS &&
            row.approximant == Signal.PLUS &&
            row.consonantal == Signal.PLUS &&
            isCountedSegment(row)
    },
)

// One line per class: "%<label>: <share of languages as a 0..1 fraction>".
segmentClasses.forEach { (label, matches) ->
    println("%$label: ${shareOfLanguages(matches)}")
}

%affricates: 0.8157142857142857
%plosives: 0.8104761904761905
%fricatives: 0.7219047619047619
%nasal: 0.840952380952381
%semivowels: 0.8128571428571428
%glides: 0.780952380952381


In [6]:
// For every single character (UTF-16 `Char`) that appears in any phoneme string,
// the list of language keys whose rows contain that character somewhere in a
// phoneme. Keys are characters, not whole phonemes, so letters inside longer
// phoneme strings and combining diacritics are counted as well.
//
// Map keys keep the order in which characters first appear in `rows`; each value
// keeps the iteration order of `languages`.
val phonemeOccurrences = run {
    // Collect each language's characters once, so the per-symbol pass below is a
    // set lookup instead of re-scanning every phoneme string of every language.
    val symbolsByLanguage = languages.mapValues { (_, inventory) ->
        inventory.flatMapTo(HashSet<Char>()) { it.phoneme.asIterable() }
    }

    rows
        .flatMap { it.phoneme.asIterable() }
        .distinct()
        .associateWith { symbol ->
            symbolsByLanguage.filterValues { symbol in it }.keys.toList()
        }
}

In [7]:
// Print "<symbol> - <number of languages> - <share of all languages>" for every
// symbol, most widespread first. The sort is stable, so ties keep map order.
phonemeOccurrences
    .map { (symbol, languagesWithSymbol) -> symbol to languagesWithSymbol.size }
    .sortedByDescending { (_, count) -> count }
    .forEach { (symbol, count) ->
        val ratio = count.toDouble() / languages.size.toDouble()
        println("$symbol - $count - ${"%.2f".format(ratio)}")
    }

t - 2093 - 1.00
k - 2083 - 0.99
i - 2055 - 0.98
m - 2046 - 0.97
n - 2045 - 0.97
a - 1992 - 0.95
p - 1990 - 0.95
u - 1971 - 0.94
j - 1925 - 0.92
w - 1816 - 0.86
l - 1734 - 0.83
s - 1676 - 0.80
e - 1647 - 0.78
o - 1617 - 0.77
b - 1466 - 0.70
d - 1463 - 0.70
ŋ - 1443 - 0.69
ɡ - 1346 - 0.64
h - 1254 - 0.60
r - 1175 - 0.56
ʃ - 1137 - 0.54
̠ - 1071 - 0.51
ɲ - 993 - 0.47
f - 968 - 0.46
ː - 905 - 0.43
ɛ - 896 - 0.43
ɔ - 869 - 0.41
z - 844 - 0.40
ʔ - 839 - 0.40
̪ - 791 - 0.38
ʒ - 786 - 0.37
ɾ - 635 - 0.30
v - 625 - 0.30
ə - 576 - 0.27
̃ - 517 - 0.25
˦ - 514 - 0.24
˨ - 505 - 0.24
c - 454 - 0.22
ʰ - 444 - 0.21
ʈ - 444 - 0.21
x - 433 - 0.21
ɨ - 425 - 0.20
ɟ - 393 - 0.19
ɪ - 382 - 0.18
ʷ - 380 - 0.18
̞ - 370 - 0.18
ʊ - 359 - 0.17
ɣ - 313 - 0.15
ɳ - 297 - 0.14
˧ - 286 - 0.14
ȵ - 276 - 0.13
ȶ - 276 - 0.13
β - 273 - 0.13
ɭ - 257 - 0.12
ɓ - 249 - 0.12
ɻ - 230 - 0.11
ɑ - 229 - 0.11
ʲ - 228 - 0.11
ɗ - 222 - 0.11
ɖ - 211 - 0.10
æ - 209 - 0.10
| 

In [8]:
// Name recorded on the first row of the language keyed "haw".
// getValue throws NoSuchElementException when the key is missing.
languages.getValue("haw").first().name

Hawaiian

In [9]:
// Names (taken from the first row) of all languages in which the character 't'
// never occurs in any phoneme string.
run {
    // Looked up once and turned into a set: membership tests are O(1), and a
    // missing 't' entry simply means "no language has it" instead of a crash.
    val languagesWithT = phonemeOccurrences['t'].orEmpty().toSet()

    languages
        .filterKeys { it !in languagesWithT }
        .values
        .map { inventory -> inventory.first().name }
}

[Hawaiian, CHEROKEE, JOMANG, BERTA, Abau, Talasa, Karajá]

In [10]:
/**
 * Prints how the sets of languages containing the characters [a] and [b] overlap:
 * the size of each set, their intersection, both one-sided differences, the
 * symmetric difference, and the total number of languages.
 *
 * A character that is not a key of `phonemeOccurrences` is treated as occurring
 * in no language (all of its counts are 0).
 */
fun chart(a: Char, b: Char) {
    val withA = phonemeOccurrences[a].orEmpty().toSet()
    val withB = phonemeOccurrences[b].orEmpty().toSet()
    val onlyA = withA - withB
    val onlyB = withB - withA

    println("$a: ${withA.size}")
    println("$b: ${withB.size}")
    println("$a ∧ $b: ${(withA intersect withB).size}")
    println("$a - $b: ${onlyA.size}")
    println("$b - $a: ${onlyB.size}")
    // onlyA and onlyB are disjoint, so the symmetric difference is their sum.
    println("$a ⊕ $b: ${onlyA.size + onlyB.size}")
    println("all: ${languages.size}")
}

// Overlap summaries for two symbol pairs.
listOf('m' to 'n', 'p' to 'b').forEach { (first, second) -> chart(first, second) }

// Symbols to include in the co-occurrence analysis: the first character of each
// comma-separated entry, keeping only those that are keys of phonemeOccurrences.
val analyze = "m,n,ŋ,ɲ,p,t,k,b,d,ɡ,f,θ,ʃ,ʒ,v,z,h,j,w,ɹ,r,ɾ,l"
    .split(',')
    .map { entry -> entry[0] }
    .filter { symbol -> phonemeOccurrences.containsKey(symbol) }

// Square matrix (row and column order = `analyze`) of phi coefficients between
// "language has symbol A" and "language has symbol B":
//
//     phi = (n11 * n00 - n10 * n01) / sqrt(n1. * n0. * n.1 * n.0)
//
// where n11 = languages with both, n10 = only A, n01 = only B, n00 = neither.
// A zero denominator (a symbol present in all or in no languages) yields 0.0.
val cooccurrenceData = run {
    val total = languages.size.toDouble()

    // Built once per symbol rather than once per (A, B) pair. Every symbol in
    // `analyze` is a key of phonemeOccurrences, so getValue cannot fail here.
    val languageSets = analyze.associateWith { symbol ->
        phonemeOccurrences.getValue(symbol).toSet()
    }

    analyze.map { symbolA ->
        val withA = languageSets.getValue(symbolA)

        analyze.map { symbolB ->
            val withB = languageSets.getValue(symbolB)

            val n11 = withA.count { it in withB }.toDouble()
            val n10 = withA.size - n11
            val n01 = withB.size - n11
            val n00 = total - n11 - n10 - n01

            val numerator = (n11 * n00) - (n10 * n01)
            val denominator =
                kotlin.math.sqrt((n11 + n10) * (n01 + n00) * (n11 + n01) * (n10 + n00))

            if (denominator > 0) numerator / denominator else 0.0
        }
    }
}

// The correlation matrix as a DataFrame: a leading "symbol" column with the row
// labels, followed by one column per analysed symbol.
val cooccurrenceDF = run {
    val labelColumn = "symbol" to analyze.map { symbol -> symbol.toString() }
    val correlationColumns = analyze.indices.map { index ->
        analyze[index].toString() to cooccurrenceData[index]
    }

    dataFrameOf(labelColumn, *correlationColumns.toTypedArray())
}

// Pairs are reported when the magnitude of their correlation exceeds this value,
// so strongly negative pairs are listed too.
val correlationThreshold = 0.3

// The header names the absolute value because that is what the filter tests.
println("High correlation pairs (|correlation| > $correlationThreshold):")

// Every unordered pair (i < j) of analysed symbols above the threshold.
// Kept as a MutableList so its declared type is unchanged for later cells.
val highCorrelationPairs =
    analyze.indices.flatMapTo(mutableListOf<Triple<Char, Char, Double>>()) { i ->
        ((i + 1) until analyze.size)
            .map { j -> Triple(analyze[i], analyze[j], cooccurrenceData[i][j]) }
            .filter { (_, _, correlation) -> correlation.absoluteValue > correlationThreshold }
    }

// Strongest positive correlation first, negative ones last.
highCorrelationPairs
    .sortedByDescending { it.third }
    .forEach { (a, b, correlation) ->
        println("$a - $b: ${"%.3f".format(correlation)}")
    }

// Cell result: display the full correlation table.
cooccurrenceDF


m: 2046
n: 2045
m ∧ n: 2034
m - n: 12
n - m: 11
m ⊕ n: 23
all: 2100
p: 1990
b: 1466
p ∧ b: 1370
p - b: 620
b - p: 96
p ⊕ b: 716
all: 2100
High correlation pairs (correlation > 0.3):
b - d: 0.886
d - ɡ: 0.815
b - ɡ: 0.801
m - n: 0.783
ʃ - ʒ: 0.615
f - v: 0.587
ɡ - z: 0.516
v - z: 0.514
d - z: 0.478
ɡ - ʒ: 0.468
d - ʒ: 0.463
b - f: 0.454
b - z: 0.452
d - f: 0.452
ɡ - f: 0.451
f - z: 0.450
ʒ - z: 0.418
b - ʒ: 0.416
j - w: 0.415
f - ʒ: 0.386
r - l: 0.361
ɡ - v: 0.355
d - v: 0.334
b - v: 0.321
ʒ - v: 0.321
ʃ - h: 0.314
ɲ - d: 0.314
ɲ - ɡ: 0.313
ɲ - b: 0.305
ŋ - l: 0.305
ɾ - l: -0.326
r - ɾ: -0.339


symbol,m,n,ŋ,ɲ,p,t,k,b,d,ɡ,f,θ,ʃ,ʒ,v,z,h,j,w,ɹ,r,ɾ,l
m,1.0,0.783404,0.208322,0.123737,0.096839,0.042801,0.018897,-0.054411,-0.054843,-0.027522,0.07177,-0.011068,0.043696,0.051048,0.059691,0.04113,-0.059832,0.048983,-0.029056,0.002087,0.134621,-0.069901,0.242585
n,0.783404,1.0,0.210889,0.131434,0.055127,0.042248,0.01846,-0.081868,-0.075782,-0.035725,0.04398,0.004535,0.034579,0.046739,0.048058,0.043209,-0.05567,0.080014,-0.012539,0.01551,0.124772,-0.054333,0.223338
ŋ,0.208322,0.210889,1.0,0.225596,0.136384,0.032251,0.030733,0.010399,0.012757,0.079439,0.069734,-0.017779,-0.229368,0.004044,0.113518,-0.004082,-0.179405,0.052952,0.117572,0.111448,0.272266,-0.240017,0.304557
ɲ,0.123737,0.131434,0.225596,1.0,0.017185,0.021677,0.000411,0.304972,0.313732,0.313212,0.277971,0.01613,0.216992,0.243078,0.186631,0.161289,0.073968,0.033647,0.031492,-0.085953,-0.0223,-0.025469,0.113306
p,0.096839,0.055127,0.136384,0.017185,1.0,0.097649,0.074169,-0.089434,-0.094694,-0.069038,-0.05272,-0.0575,-0.010479,-0.038992,0.101621,-0.020884,-0.049305,0.037377,-0.011726,0.049541,0.058327,-0.017396,0.089181
t,0.042801,0.042248,0.032251,0.021677,0.097649,1.0,0.271366,0.015956,-0.002217,0.008381,0.036905,-0.070369,0.013098,0.027656,0.037645,0.030556,-0.013812,0.042347,-0.02287,0.014425,0.031896,0.002099,-0.004791
k,0.018897,0.01846,0.030733,0.000411,0.074169,0.271366,1.0,-0.024683,-0.036491,-0.012227,0.051558,0.018893,0.002179,0.003985,0.035561,0.019861,0.055814,0.04967,0.026432,0.022534,0.026888,0.013196,0.028537
b,-0.054411,-0.081868,0.010399,0.304972,-0.089434,0.015956,-0.024683,1.0,0.886058,0.800807,0.454139,0.049529,0.233701,0.416452,0.321449,0.45234,0.267712,-0.018139,-0.050777,-0.176097,-0.033977,-0.025498,0.072466
d,-0.054843,-0.075782,0.012757,0.313732,-0.094694,-0.002217,-0.036491,0.886058,1.0,0.814687,0.45225,0.034603,0.238845,0.46325,0.334371,0.477524,0.277474,-0.019052,-0.036793,-0.179498,-0.028343,-0.030185,0.084595
ɡ,-0.027522,-0.035725,0.079439,0.313212,-0.069038,0.008381,-0.012227,0.800807,0.814687,1.0,0.451159,0.032678,0.237536,0.468101,0.354764,0.516353,0.192763,0.00419,-0.049257,-0.168401,0.049749,-0.058364,0.140223


In [11]:
// Prints, for x = 1..letter, the weight (x + offset * letter)^power as a
// percentage of the sum of all weights, with two decimals.
val letter = 26
val offset = 15.0 / 26.0
val power = letter / 5.0

// Weight of position x.
val eval: (Int) -> Double = { x -> (x + offset * letter).pow(power) }

// Normalising constant: the sum of all weights.
val sum = (1..letter).sumOf { eval(it) }

for (position in 1..letter) {
    val percent = eval(position) / sum * 100
    println("%.2f".format(percent))
}


0.11
0.14
0.19
0.26
0.34
0.43
0.55
0.70
0.87
1.07
1.32
1.60
1.94
2.32
2.77
3.29
3.88
4.55
5.31
6.18
7.15
8.25
9.48
10.85
12.37
14.07
