# Glyph Frequencies and Word Counts

## Letter Frequencies and Word Count in Takahashi’s transliteration (Table 1)

In [17]:
import pandas as pd

with open('takahashi_clean.txt') as voy:
    data = voy.read()

print(pd.Series(list(data)).value_counts(),
    len(data))

print('total words: ' + str(len(data.split())))

      37919
o     25468
e     20070
h     17856
y     17655
a     14281
c     13314
d     12973
i     11660
l     10518
k      9996
r      7456
n      6141
t      5968
q      5423
\n     5212
S      4501
s      2886
p      1406
m      1116
T       976
K       938
f       425
*       280
P       224
g        96
F        80
I        72
x        35
v         9
z         2
dtype: int64 234956
total words: 37919


# Glyph frequencies in Takahashi’s transliteration for several c combinations (Table 2)

In [18]:
import re

pattern_ch  = '(ch)'
pattern_sh  = '(Sh)'
pattern_cth = '(cTh)'
pattern_ckh = '(cKh)'
pattern_cph = '(cPh)'
pattern_cfh = '(cFh)'
pattern_c   = '(c)+[^hTKPFoy]'
pattern_co  = '(co)'
pattern_cy  = '(cy)'
pattern_cxo = '(c)+(T|K|P|F)+(o)'
pattern_cxy = '(c)+(T|K|P|F)+(y)'

matches_ch  = re.findall(pattern_ch, data)
matches_sh  = re.findall(pattern_sh, data)
matches_cth = re.findall(pattern_cth, data)
matches_ckh = re.findall(pattern_ckh, data)
matches_cph = re.findall(pattern_cph, data)
matches_cfh = re.findall(pattern_cfh, data)
matches_c   = re.findall(pattern_c, data)
matches_co  = re.findall(pattern_co, data)
matches_cy  = re.findall(pattern_cy, data)
matches_cxo = re.findall(pattern_cxo, data)
matches_cxy = re.findall(pattern_cxy, data)

print('ch '  + str(len(matches_ch)))
print('sh '  + str(len(matches_sh)))
print('cth ' + str(len(matches_cth)))
print('ckh ' + str(len(matches_ckh)))
print('cph ' + str(len(matches_cph)))
print('cfh ' + str(len(matches_cfh)))
print('c '   + str(len(matches_c)))
print('co '  + str(len(matches_co)))
print('cy '  + str(len(matches_cy)))
print('cxo ' + str(len(matches_cxo)))
print('cxy ' + str(len(matches_cxy)))

len(matches_ch) + len(matches_sh) + len(matches_cth) + len(matches_ckh) + len(matches_cph) + len(matches_cfh) + len(matches_c) + len(matches_co) + len(matches_cy) + len(matches_cxo) + len(matches_cxy)

ch 11008
sh 4501
cth 950
ckh 906
cph 216
cfh 74
c 143
co 9
cy 7
cxo 0
cxy 0


17814

## Glyph frequencies in Takahashi’s transliteration for several i combinations (Table 3)

In [19]:
pattern_i     = '([^i]+(i))+[^in]'
pattern_ii    = '([^i]+(ii))+[^in]'
pattern_iii   = '([^i]+(iii))+[^in]'
pattern_iiii  = '([^i]+(iiii))+[^in]'
pattern_n     = '[^i]+(n)'
pattern_in    = '[^i]+(in)'
pattern_iin   = '[^i]+(iin)'
pattern_iiin  = '[^i]+(iiin)'
pattern_iiiin = '[^i]+(iiiin)'

matches_i     = re.findall(pattern_i, data)
matches_ii    = re.findall(pattern_ii, data)
matches_iii   = re.findall(pattern_iii, data)
matches_iiii  = re.findall(pattern_iiii, data)
matches_n     = re.findall(pattern_n, data)
matches_in    = re.findall(pattern_in, data)
matches_iin   = re.findall(pattern_iin, data)
matches_iiin  = re.findall(pattern_iiin, data)
matches_iiiin = re.findall(pattern_iiiin, data)

print('i '     + str(len(matches_i)))
print('ii '    + str(len(matches_ii)))
print('iii '   + str(len(matches_iii)))
print('iiii '  + str(len(matches_iiii)))
print('n '     + str(len(matches_n)))
print('in '    + str(len(matches_in)))
print('iin '   + str(len(matches_iin)))
print('iiin '  + str(len(matches_iiin)))
print('iiiin ' + str(len(matches_iiiin)))

len(matches_i) + len(matches_ii) + len(matches_iii) + len(matches_iiii) + len(matches_n) + len(matches_in) + len(matches_iin) + len(matches_iiin) + len(matches_iiiin)

i 590
ii 195
iii 10
iiii 0
n 148
in 1752
iin 4076
iiin 154
iiiin 2


6927

## Glyph frequencies in Takahashi’s transliteration (Table 4)

In [20]:
pattern_t = '(t)|(T)'
pattern_k = '(k)|(K)'
pattern_p = '(p)|(P)'
pattern_f = '(f)|(F)'

matches_t = re.findall(pattern_t, data)
matches_k = re.findall(pattern_k, data)
matches_p = re.findall(pattern_p, data)
matches_f = re.findall(pattern_f, data)

print('sh '  + str(len(matches_sh)))
print('ch '  + str(len(matches_ch) + len(matches_cth) + len(matches_ckh) + len(matches_cph) + len(matches_cfh)))
print('t '   + str(len(matches_t)))
print('k '   + str(len(matches_k)))
print('p '   + str(len(matches_p)))
print('f '   + str(len(matches_f)))
print('i '   + str(len(matches_i) + 2 * len(matches_ii) + 3 * len(matches_iii) + 4 * len(matches_iiii) + len(matches_iiin) + 2 * len(matches_iiiin)))
print('in '  + str(len(matches_in)))
print('iin ' + str(len(matches_iin) + len(matches_iiin) + len(matches_iiiin)))

sh 4501
ch 13154
t 6944
k 10934
p 1630
f 505
i 1168
in 1752
iin 4232


# Entropy (Table 5)

## Entropy for Takahashi’s transliteration

In [21]:
import collections
import math

def estimate_shannon_entropy(data):
    m = len(data)
    bases = collections.Counter([tmp_base for tmp_base in data])

    shannon_entropy_value = 0
    for base in bases:
        n_i = bases[base]
        p_i = n_i / float(m)
        entropy_i = p_i * (math.log(p_i, 2))
        shannon_entropy_value += entropy_i

    return shannon_entropy_value * -1

print(estimate_shannon_entropy(data))

4.021349339992516


## Entropy for the converted transliteration

In [22]:
data_new = data.replace("Sh", "1").replace("ch", "2").replace("cTh", "2t").replace("cKh", "2k").replace("cPh", "2p").replace("cFh", "2f").replace("z", "k").replace("iin", "3").replace("in", "4").replace("I", "i")

print(pd.Series(list(data_new)).value_counts(),
    len(data_new))

def estimate_shannon_entropy(data_new):
    m = len(data_new)
    bases = collections.Counter([tmp_base for tmp_base in data_new])

    shannon_entropy_value = 0
    for base in bases:
        n_i = bases[base]
        p_i = n_i / float(m)
        entropy_i = p_i * (math.log(p_i, 2))
        shannon_entropy_value += entropy_i

    return shannon_entropy_value * -1

print(estimate_shannon_entropy(data_new))

      37919
o     25468
e     20070
y     17655
a     14281
2     13154
d     12973
k     10904
l     10518
r      7456
t      6918
q      5423
\n     5212
1      4501
3      4232
s      2886
4      1752
p      1622
i      1516
m      1116
f       499
*       280
h       201
c       160
n       157
g        96
x        35
K        32
T        26
v         9
P         8
F         6
dtype: int64 207085
3.8664432973586886


## Entropy for a random string

In [23]:
import random
letters = 'abcdefghijklmnopqrstuv'

def random_characters(length):
    return ''.join([random.choice(letters) for x in range(0,length)])

data_rand = random_characters(234956)

print(pd.Series(list(data_rand)).value_counts(),
    len(data_rand))

def estimate_shannon_entropy(data_rand):
    m = len(data_rand)
    bases = collections.Counter([tmp_base for tmp_base in data_rand])

    shannon_entropy_value = 0
    for base in bases:
        n_i = bases[base]
        p_i = n_i / float(m)
        entropy_i = p_i * (math.log(p_i, 2))
        shannon_entropy_value += entropy_i

    return shannon_entropy_value * -1

print(estimate_shannon_entropy(data_rand))

o    10901
m    10842
i    10778
n    10778
r    10763
e    10748
h    10742
s    10739
d    10710
u    10690
p    10688
f    10677
k    10666
c    10655
g    10645
l    10638
a    10609
v    10597
t    10567
q    10552
b    10518
j    10453
dtype: int64 234956
4.459363503358211
