In [1]:
import os
import re
from collections import Counter

import pandas as pd
import requests

try:
    import xml.etree.cElementTree as cET
except ImportError:
    # xml.etree.cElementTree was removed in Python 3.9; ElementTree exposes
    # the same API, so bind it to the same alias.
    import xml.etree.ElementTree as cET

In [4]:
# read_html returns every table found on the page; the last one is used
# here, with its first column as the index.
page_tables = pd.read_html(
    'https://en.wikipedia.org/wiki/Letter_frequency',
    index_col=0
)
# Keep only the first 26 rows (presumably the plain letters a-z; confirm
# against the live page).
freq_rows = page_tables[-1].iloc[:26]
# Cells are strings; strip the '~', '%' and '*' decorations and convert
# percentages to fractions.
df = freq_rows.applymap(lambda cell: float(cell.strip('~%*')) / 100)
# Column headers may carry a '[...]' suffix; keep only the text before it.
langs = []
for col in df.columns:
    langs.append(col.split('[')[0])
df.columns = langs
df

Unnamed: 0_level_0,English,French,German,Spanish,Portuguese,Esperanto,Italian,Turkish,Swedish,Polish,Dutch,Danish,Icelandic,Finnish,Czech
Letter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
a,0.08167,0.07636,0.06516,0.11525,0.14634,0.12117,0.11745,0.1192,0.09383,0.0891,0.07486,0.06025,0.1011,0.12217,0.08421
b,0.01492,0.00901,0.01886,0.02215,0.01043,0.0098,0.00927,0.02844,0.01535,0.0147,0.01584,0.02,0.01043,0.00281,0.00822
c,0.02782,0.0326,0.02732,0.04019,0.03882,0.00776,0.04501,0.00963,0.01486,0.0396,0.01242,0.00565,0.0,0.00281,0.0074
d,0.04253,0.03669,0.05076,0.0501,0.04992,0.03044,0.03736,0.04706,0.04702,0.0325,0.05933,0.05858,0.01575,0.01043,0.03475
e,0.12702,0.14715,0.16396,0.12181,0.1257,0.08995,0.11792,0.08912,0.10149,0.0766,0.1891,0.15453,0.06418,0.07968,0.07562
f,0.02228,0.01066,0.01656,0.00692,0.01023,0.01037,0.01153,0.00461,0.02027,0.003,0.00805,0.02406,0.03013,0.00194,0.00084
g,0.02015,0.00866,0.03009,0.01768,0.01303,0.01171,0.01644,0.01253,0.02862,0.0142,0.03403,0.04077,0.04241,0.00392,0.00092
h,0.06094,0.00737,0.04577,0.00703,0.00781,0.00384,0.00636,0.01212,0.0209,0.0108,0.0238,0.01621,0.01871,0.01851,0.01356
i,0.06966,0.07529,0.0655,0.06247,0.06186,0.10012,0.10143,0.086,0.05817,0.0821,0.06499,0.06,0.07578,0.10817,0.06073
j,0.00153,0.00613,0.00268,0.00493,0.00397,0.03501,0.00011,0.00034,0.00614,0.0228,0.0146,0.0073,0.01144,0.02042,0.01433


In [64]:
# Download one XML file per language into ./xml/.
os.makedirs('xml', exist_ok=True)
for lang in langs:
    print(lang)
    url = f"https://raw.githubusercontent.com/christos-c/bible-corpus/master/bibles/{lang}.xml"
    # Fetch *before* opening the output file: opening in "wb" truncates it,
    # so a failed request would otherwise leave an empty file behind.
    # The timeout keeps a stalled connection from hanging the cell.
    response = requests.get(url, timeout=60)
    # Fail loudly on 4xx/5xx instead of saving the error page as XML.
    response.raise_for_status()
    with open(f"xml/{lang}.xml", "wb") as file:
        file.write(response.content)

English
French
German
Spanish
Portuguese
Esperanto
Italian
Turkish
Swedish
Polish
Dutch
Danish
Icelandic
Finnish
Czech


In [5]:
# Convert each downloaded XML file into a plain-text file in ./txt/ with one
# normalised line per <seg> element.
os.makedirs('txt', exist_ok=True)
# Raw string: '\W' in a plain literal is an invalid escape sequence and
# triggers a warning on recent Python versions. Compiled once, outside the
# loops. Note that \W leaves digits and underscores in place.
non_word = re.compile(r'\W')
for lang in langs:
    print(lang)
    root = cET.parse(f'xml/{lang}.xml')
    with open(f'txt/{lang}.txt', 'w', encoding='utf-8') as out:
        for seg in root.findall('.//seg'):
            # Concatenate all text inside the segment and lowercase it.
            line = "".join(seg.itertext()).lower()
            # Drop literal '&quot;' sequences first; otherwise the letters
            # 'quot' would survive the non-word stripping below.
            line = line.replace('&quot;', '')
            line = non_word.sub('', line)
            out.write(line + '\n')

English
French
German
Spanish
Portuguese
Esperanto
Italian
Turkish
Swedish
Polish
Dutch
Danish
Icelandic
Finnish
Czech


In [33]:
# For each language, compute the relative character frequencies of its text
# file and pick the reference column in `df` with the smallest sum of squared
# differences.
for lang in langs:
    # The files are written as UTF-8 elsewhere in this notebook; read them
    # back the same way instead of relying on the platform default encoding,
    # which would mis-decode non-ASCII letters on non-UTF-8 locales.
    with open(f'txt/{lang}.txt', 'r', encoding='utf-8') as f:
        # rstrip('\n') drops only the line terminator, so a final line
        # without a trailing newline does not lose a real character.
        lang_counter = Counter(
            char for line in f for char in line.rstrip('\n')
        )
    count = sum(lang_counter.values())
    # Normalise counts to fractions of all characters seen; sorted() only
    # puts the keys in character order.
    lang_counter = {k: v / count for k, v in sorted(lang_counter.items())}
    prediction = min(
        df.columns,
        # Squared distance over the letters present in the reference table;
        # letters absent from the text count as frequency 0.
        key=lambda col: sum(
            (lang_counter.get(letter, 0) - freq) ** 2
            for letter, freq in df[col].items()
        )
    )
    print(f'True: {lang}, Predicted: {prediction}')

True: English, Predicted: English
True: French, Predicted: French
True: German, Predicted: German
True: Spanish, Predicted: Spanish
True: Portuguese, Predicted: Portuguese
True: Esperanto, Predicted: Esperanto
True: Italian, Predicted: Italian
True: Turkish, Predicted: Turkish
True: Swedish, Predicted: Swedish
True: Polish, Predicted: Polish
True: Dutch, Predicted: Dutch
True: Danish, Predicted: Danish
True: Icelandic, Predicted: Icelandic
True: Finnish, Predicted: Finnish
True: Czech, Predicted: Czech
