In [2]:
import collections
import os
from glob import glob

import numpy as np

In [3]:
# Symbol -> index lookup shared by the initial-distribution vectors and the
# transition matrices (the seven symbols, listed in ASCII-sorted order).
cMap = {symbol: index for index, symbol in enumerate("Aegkopt")}

# File paths: one directory of training files per language, plus the test files.
language_A, language_B, language_C, test = (
    glob(pattern)
    for pattern in ('language-A/*', 'language-B/*', 'language-C/*', 'language-test/*')
)

# Containers for the file contents that are read in later in the notebook.
language_A_texts, language_B_texts, language_C_texts, language_test = [], [], [], []


In [4]:
def _read_text(path):
    """Return the full contents of the file at ``path``, opened in text mode."""
    with open(path, 'r') as f:
        return f.read()


# Each collection is rebuilt from scratch rather than appended to, so
# re-running this cell does not duplicate the texts.
language_A_texts = [_read_text(path) for path in language_A]
language_B_texts = [_read_text(path) for path in language_B]
language_C_texts = [_read_text(path) for path in language_C]

# Test entries keep the originating path next to the text: (text, path).
language_test = [(_read_text(path), path) for path in test]

In [5]:
def findInitialDistribution(textSet, cMap=None):
    """Count how often each symbol occurs as the first symbol of a text.

    Parameters
    ----------
    textSet : iterable of sequences (e.g. strings)
        The texts to inspect. Empty texts have no first symbol and are skipped.
    cMap : dict, optional
        Mapping symbol -> index. When given, the result has ``len(cMap)``
        entries and position ``cMap[s]`` holds the count for symbol ``s``;
        symbols that never start a text get a count of 0, so the vector stays
        aligned with the indices used elsewhere. Assumes the indices are
        ``0 .. len(cMap) - 1``.

    Returns
    -------
    numpy.ndarray
        Un-normalised counts. Without ``cMap`` the entries cover only the
        symbols actually observed, ordered by sorted symbol.

    Raises
    ------
    KeyError
        If ``cMap`` is given and a text starts with a symbol not in it.
    """
    first_symbols = collections.Counter(
        text[0] for text in textSet if len(text) > 0
    )

    if cMap is None:
        # Legacy layout: one entry per observed symbol, in sorted symbol order.
        return np.array([first_symbols[symbol] for symbol in sorted(first_symbols)])

    counts = np.zeros(len(cMap), dtype=int)
    for symbol, count in first_symbols.items():
        counts[cMap[symbol]] = count
    return counts

In [6]:
def transition_matrix(transitions, cMap):
    """Estimate a first-order transition matrix from example sequences.

    Parameters
    ----------
    transitions : iterable of iterables of symbols (e.g. strings)
        The training sequences; every consecutive pair of symbols counts as
        one observed transition.
    cMap : dict
        Mapping symbol -> index. Assumes the indices are ``0 .. len(cMap) - 1``.

    Returns
    -------
    numpy.ndarray of shape (len(cMap), len(cMap))
        Entry ``[j, i]`` is the relative frequency of moving to symbol ``j``
        given the current symbol ``i``, i.e. each column with at least one
        observation sums to 1. Columns for symbols that were never followed
        by anything are left at 0 instead of becoming NaN.

    Raises
    ------
    KeyError
        If a sequence contains a symbol that is not in ``cMap``.
    """
    # Size the matrix from the alphabet instead of a hard-coded 7.
    n = len(cMap)
    M = np.zeros((n, n))

    for t in transitions:
        symbols = list(t)
        for current, following in zip(symbols, symbols[1:]):
            M[cMap[following], cMap[current]] += 1

    # Normalise every column by its total; skip empty columns to avoid 0/0.
    column_totals = M.sum(axis=0)
    M_prob = np.divide(M, column_totals, out=np.zeros_like(M), where=column_totals > 0)

    return M_prob

In [7]:
def find_probability(text, transition_matrix, initial_distribution, cMap, log=False):
    """Likelihood of ``text`` under a first-order Markov chain.

    Parameters
    ----------
    text : sequence of symbols (e.g. a string)
        The sequence to score; must be non-empty.
    transition_matrix : numpy.ndarray
        Indexed as ``[next_index, previous_index]``.
    initial_distribution : sequence of float
        Probability of each symbol being the first one, indexed via ``cMap``.
    cMap : dict
        Mapping symbol -> index.
    log : bool, optional
        If False (default) return the plain product of the probabilities.
        If True return the sum of their natural logarithms, which does not
        underflow to 0 on long texts; an impossible text yields ``-inf``.

    Returns
    -------
    float
        The probability, or the log-probability when ``log=True``.

    Raises
    ------
    IndexError
        If ``text`` is empty.
    KeyError
        If ``text`` contains a symbol that is not in ``cMap``.
    """
    indices = [cMap[c] for c in text]

    # First factor: initial distribution; then one factor per transition.
    factors = [initial_distribution[indices[0]]]
    factors.extend(
        transition_matrix[current, previous]
        for previous, current in zip(indices, indices[1:])
    )

    if log:
        # log(0) is a legitimate -inf here, so silence the divide warning.
        with np.errstate(divide="ignore"):
            return float(np.sum(np.log(factors)))

    probability = factors[0]
    for factor in factors[1:]:
        probability *= factor

    return probability

In [8]:
# First-symbol counts for each language's training texts.
initDistA, initDistB, initDistC = (
    findInitialDistribution(texts)
    for texts in (language_A_texts, language_B_texts, language_C_texts)
)

In [9]:
# Rescale the first-symbol counts so that each vector sums to 1.
initDistA_norm = initDistA / initDistA.sum()
initDistB_norm = initDistB / initDistB.sum()
initDistC_norm = initDistC / initDistC.sum()

# Show the three initial distributions rounded to two decimals.
for label, dist in (("A: ", initDistA_norm), ("B: ", initDistB_norm), ("C: ", initDistC_norm)):
    print(label, np.around(dist, 2))

A:  [0.07 0.03 0.17 0.2  0.17 0.13 0.23]
B:  [0.17 0.13 0.2  0.1  0.2  0.1  0.1 ]
C:  [0.23 0.1  0.17 0.07 0.17 0.1  0.17]


In [10]:
# One transition matrix per language, estimated from its training texts.
transA, transB, transC = (
    transition_matrix(texts, cMap)
    for texts in (language_A_texts, language_B_texts, language_C_texts)
)

In [11]:
# Print each language's transition matrix rounded to three decimals.
for label, matrix in (("A: ", transA), ("B: ", transB), ("C: ", transC)):
    print(label)
    print(np.around(matrix, 3))

A: 
[[0.018 0.027 0.264 0.319 0.017 0.272 0.02 ]
 [0.03  0.017 0.278 0.285 0.026 0.306 0.029]
 [0.232 0.242 0.029 0.037 0.243 0.036 0.227]
 [0.212 0.208 0.025 0.042 0.216 0.018 0.236]
 [0.02  0.029 0.33  0.246 0.017 0.295 0.024]
 [0.222 0.258 0.05  0.029 0.233 0.027 0.23 ]
 [0.265 0.22  0.025 0.042 0.248 0.045 0.234]]
B: 
[[0.024 0.02  0.453 0.023 0.416 0.038 0.402]
 [0.041 0.03  0.408 0.023 0.433 0.058 0.472]
 [0.283 0.253 0.038 0.279 0.042 0.308 0.033]
 [0.035 0.023 0.    0.023 0.    0.019 0.   ]
 [0.289 0.314 0.051 0.256 0.057 0.308 0.057]
 [0.029 0.037 0.    0.093 0.    0.019 0.   ]
 [0.301 0.325 0.051 0.302 0.051 0.25  0.036]]
C: 
[[0.621 0.058 0.081 0.052 0.067 0.042 0.068]
 [0.054 0.607 0.07  0.057 0.053 0.089 0.071]
 [0.071 0.058 0.601 0.062 0.051 0.087 0.08 ]
 [0.054 0.072 0.065 0.59  0.06  0.04  0.059]
 [0.068 0.056 0.058 0.079 0.616 0.071 0.059]
 [0.085 0.075 0.051 0.076 0.083 0.598 0.064]
 [0.047 0.072 0.074 0.084 0.069 0.071 0.598]]


In [12]:
# Position in the posterior vector -> language label.
returnMap = {
    0: "A",
    1: "B",
    2: "C"
}

# Bayes' rule with a uniform prior over the three languages:
#   P(Language | String) = P(String | Language) * P(Language) / P(String)
# where find_probability(...) = P(String | Language), P(Language) = 1/3 and
# P(String) is the sum of the three joint terms.
# The loop variables are not named `test`, so the list of test paths keeps
# that name and the loading cell can still be re-run afterwards.
for text, path in language_test:
    probA = find_probability(text, transA, initDistA_norm, cMap) / 3
    probB = find_probability(text, transB, initDistB_norm, cMap) / 3
    probC = find_probability(text, transC, initDistC_norm, cMap) / 3

    total = probA + probB + probC
    if total > 0:
        res = np.array([probA, probB, probC]) / total
        label = returnMap[int(np.argmax(res))]
    else:
        # Every likelihood is 0 (impossible string or numeric underflow):
        # the posterior is undefined, so do not let argmax default to "A".
        res = np.full(3, np.nan)
        label = "undetermined"

    # basename works regardless of the platform's path separator.
    testName = os.path.basename(path)
    print(f"{testName}:", label, "|", res)

language-test-3: B | [1.34443778e-46 1.00000000e+00 8.64666350e-59]
language-test-4: A | [1.00000000e+00 0.00000000e+00 9.91152197e-47]
language-test-5: A | [1.0000000e+00 0.0000000e+00 1.1527412e-39]
language-test-2: A | [1.00000000e+00 0.00000000e+00 3.03765345e-34]
language-test-7: A | [1.00000000e+00 0.00000000e+00 5.34293923e-42]
language-test-0: C | [7.0087509e-61 0.0000000e+00 1.0000000e+00]
language-test-9: A | [1.00000000e+00 0.00000000e+00 3.58533799e-46]
language-test-8: C | [2.21983772e-81 0.00000000e+00 1.00000000e+00]
language-test-1: C | [4.62774258e-68 0.00000000e+00 1.00000000e+00]
language-test-6: B | [5.29263175e-63 1.00000000e+00 2.69984205e-62]
