In [33]:
import csv
from collections import Counter
import json
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import math
import numpy as np
import bisect
import pandas as pd
import unicodedata
from scipy.cluster import hierarchy

In [5]:
def memoize(f):
    """
    memoization decorator for a function taking ONLY a single argument
    src: http://code.activestate.com/recipes/578231-probably-the-fastest-memoization-decorator-in-the-/
    """

    class MemoDict(dict):
        def __missing__(self, key):
            ret = self[key] = f(key)
            return ret

    return MemoDict().__getitem__

@memoize
def is_text_char(char: str) -> bool:
    """
    Return True if ``char`` belongs to a "text" Unicode general category.

    Accepted categories are every letter (L*), mark (M*) and number (N*)
    category, plus ``Co`` (private use). The L, M and N major classes consist
    of exactly Lu/Ll/Lt/Lm/Lo, Mn/Mc/Me and Nd/Nl/No, so testing the first
    letter of the category code covers the same set.
    """
    category = unicodedata.category(char)
    if category == 'Co':  # private use char class
        return True
    return category[0] in ('L', 'N', 'M')  # letters, numbers, diacritics etc



In [20]:
# Load the per-language character counts; the loops further down iterate it
# as {language name: {character: count}}.
with open('MASTER.LINGUISTIC.1GRAM.big.char-count-uniq.json', mode='rt', encoding='utf8') as f:
    char_count_per_lang = json.loads(f.read())

In [3]:
# Load the "len-count-n" table; it is consumed below as
# {language name: {length: count}} with both sides passed through int().
with open('MASTER.LINGUISTIC.1GRAM.big.len-count-n.json', mode='rt', encoding='utf8') as f:
    len_uniq_per_lang = json.loads(f.read())

In [5]:
# Load the "len-count-sum" table; it is consumed below as
# {language name: {length: count}} with both sides passed through int().
with open('MASTER.LINGUISTIC.1GRAM.big.len-count-sum.json', mode='rt', encoding='utf8') as f:
    len_count_per_lang = json.loads(f.read())

In [10]:
# Log-log scatter plot, one figure per language, for the first ten entries of
# len_uniq_per_lang (presumably token length vs. number of distinct tokens of
# that length -- inferred from the file name, TODO confirm).
for lang_name, uniq_len_counts in list(len_uniq_per_lang.items())[:10]:
    # Both sides go through int() so the points sort numerically.
    points = [(int(length), int(n)) for length, n in uniq_len_counts.items()]
    points.sort()
    xs, ys = zip(*points)
    fig = px.scatter(x=xs, y=ys, title=lang_name, log_x=True, log_y=True)
    fig.show()

In [11]:
# Log-log scatter plot, one figure per language, for the first ten entries of
# len_count_per_lang (presumably token length vs. total occurrences of tokens
# of that length -- inferred from the file name, TODO confirm).
for lang_name, len_counts in list(len_count_per_lang.items())[:10]:
    # Both sides go through int() so the points sort numerically.
    points = [(int(length), int(total)) for length, total in len_counts.items()]
    points.sort()
    xs, ys = zip(*points)
    fig = px.scatter(x=xs, y=ys, title=lang_name, log_x=True, log_y=True)
    fig.show()

In [25]:
# Build the character-count table: one record per language that keeps only the
# characters accepted by is_text_char, then flip it so rows are characters and
# columns are languages. Characters absent from a language end up as NaN.
rows = [
    {
        'language': lang_name,
        **{char: count for char, count in char_counts.items() if is_text_char(char)},
    }
    for lang_name, char_counts in char_count_per_lang.items()
]
df = pd.DataFrame(rows).set_index('language').transpose()
df

language,Korean,ENGLISH,POLISH,Chinese,SPANISH,SWEDISH,INDONESIAN,Japanese,ChineseT,ARABIC,...,SESELWA,SAMOAN,LINGALA,XHOSA,TIBETAN,OCCITAN,NEPALI,GUARANI,HMONG,TATAR
야,50275.0,306.0,56.0,65.0,4.0,71.0,,2.0,4.0,14.0,...,,,,,,,,,,
는,230558.0,350.0,160.0,232.0,52.0,10.0,,9.0,15.0,193.0,...,,,,,,,,,,
여,84973.0,269.0,219.0,296.0,8.0,28.0,,2.0,12.0,20.0,...,,,,,,,,,,
하,174944.0,665.0,239.0,236.0,28.0,11.0,,6.0,17.0,101.0,...,,,,,,,,,,
놔,510.0,1.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ཿ,,,,,,,,,,,...,,,,,1.0,,,,,
བྷ,,,,,,,,,,,...,,,,,8.0,,,,,
ཎ,,,,,,,,,,,...,,,,,5.0,,,,,
ཾ,,,,,,,,,,,...,,,,,1.0,,,,,


In [26]:
# Persist the character-count table (rows: characters, columns: languages) as CSV.
df.to_csv('MASTER.LINGUISTIC.1GRAM.big.char-count.csv')

In [27]:
# Same character-count table as the CSV export, written as an Excel workbook.
df.to_excel('MASTER.LINGUISTIC.1GRAM.big.char-count.xlsx')

In [8]:
def dot_product(xs, ys):
    """
    Return the sum of the element-wise products of two iterables.

    Elements are paired positionally and pairing stops at the shorter input,
    so any surplus elements of the longer one are ignored. Two empty inputs
    give 0.
    """
    return sum(map(lambda x, y: x * y, xs, ys))

In [28]:
# Scale every language column to unit Euclidean (L2) length.
#
# ``DataFrame.sum`` skips NaN by default, so each column's norm is taken over
# the characters that language actually has -- the same values the previous
# per-column ``notna()`` filter selected. Missing characters stay NaN.
# Doing this as one vectorised expression avoids building the mask twice per
# column and iterating every Series element in Python.
column_norms = (df ** 2).sum(axis=0) ** 0.5
df = df / column_norms
df

language,Korean,ENGLISH,POLISH,Chinese,SPANISH,SWEDISH,INDONESIAN,Japanese,ChineseT,ARABIC,...,SESELWA,SAMOAN,LINGALA,XHOSA,TIBETAN,OCCITAN,NEPALI,GUARANI,HMONG,TATAR
야,0.026642,1.615248e-05,0.000020,0.000005,8.166126e-07,0.000023,,5.932717e-07,9.676738e-07,0.000005,...,,,,,,,,,,
는,0.122180,1.847506e-05,0.000056,0.000017,1.061596e-05,0.000003,,2.669723e-06,3.628777e-06,0.000072,...,,,,,,,,,,
여,0.045030,1.419940e-05,0.000077,0.000022,1.633225e-06,0.000009,,5.932717e-07,2.903021e-06,0.000007,...,,,,,,,,,,
하,0.092708,3.510261e-05,0.000084,0.000017,5.716288e-06,0.000004,,1.779815e-06,4.112614e-06,0.000037,...,,,,,,,,,,
놔,0.000270,5.278588e-08,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ཿ,,,,,,,,,,,...,,,,,0.000920,,,,,
བྷ,,,,,,,,,,,...,,,,,0.007359,,,,,
ཎ,,,,,,,,,,,...,,,,,0.004599,,,,,
ཾ,,,,,,,,,,,...,,,,,0.000920,,,,,


In [29]:
# Report the smallest non-missing value of every language column and track the
# smallest one overall (starting from 1 as the upper bound).
minimum = 1
for lang_name in df.columns:
    smallest = min(df[lang_name].dropna())
    print(lang_name, smallest)
    minimum = min(minimum, smallest)
print(minimum)

Korean 5.29929846851086e-07
ENGLISH 5.2785875965882116e-08
POLISH 3.494192887982482e-07
Chinese 7.299444185446964e-08
SPANISH 2.0415314475117103e-07
SWEDISH 3.2743418237228516e-07
INDONESIAN 2.0672459754299818e-06
Japanese 2.9663584228822074e-07
ChineseT 2.4191845358569346e-07
ARABIC 3.705104863821065e-07
MALAGASY 1.5613056699281075e-05
FRENCH 2.20495991464508e-07
RUSSIAN 8.699298132128091e-08
BELARUSIAN 5.498843110088079e-06
TAMIL 2.1504920443610558e-06
TURKISH 2.3186425421398674e-07
DANISH 5.313500439341142e-07
CROATIAN 9.682587508355126e-07
ITALIAN 2.3915964962239e-07
MACEDONIAN 2.646778909526762e-06
DUTCH 4.595999300875374e-07
CZECH 5.278058423436257e-07
GREEK 5.837580869446021e-07
GERMAN 1.1304224305696652e-07
TELUGU 4.331962069636273e-06
SLOVENIAN 1.2570453804072842e-06
FINNISH 4.457366867324399e-07
HUNGARIAN 5.446719778597291e-07
BIHARI 8.312561247876778e-06
PORTUGUESE 3.258859473272091e-07
VIETNAMESE 3.8428824916644095e-06
MALAYALAM 1.0292496515049712e-06
ALBANIAN 1.98892453407

In [12]:
# Write df to the 'char-freq' workbook. In file order df holds the
# unit-normalised columns here.
# NOTE(review): this cell's execution counter (In [12]) is lower than the
# normalisation cell's (In [28]) -- confirm the saved file matches the
# normalised table.
df.to_excel('MASTER.LINGUISTIC.1GRAM.big.char-freq.xlsx')

In [38]:
# Replace every missing entry with a tiny positive placeholder (1e-15) so the
# table contains no NaN.
# NOTE(review): the following cell assigns df2 again, so in a top-to-bottom
# run this value is overwritten before anything reads it.
df2 = df.mask(df.isna(), 1e-15)
df2

language,Korean,ENGLISH,POLISH,Chinese,SPANISH,SWEDISH,INDONESIAN,Japanese,ChineseT,ARABIC,...,SESELWA,SAMOAN,LINGALA,XHOSA,TIBETAN,OCCITAN,NEPALI,GUARANI,HMONG,TATAR
야,2.664222e-02,1.615248e-05,1.956748e-05,4.744639e-06,8.166126e-07,2.324783e-05,1.000000e-15,5.932717e-07,9.676738e-07,5.187147e-06,...,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15
는,1.221796e-01,1.847506e-05,5.590709e-05,1.693471e-05,1.061596e-05,3.274342e-06,1.000000e-15,2.669723e-06,3.628777e-06,7.150852e-05,...,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15
여,4.502973e-02,1.419940e-05,7.652282e-05,2.160635e-05,1.633225e-06,9.168157e-06,1.000000e-15,5.932717e-07,2.903021e-06,7.410210e-06,...,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15
하,9.270805e-02,3.510261e-05,8.351121e-05,1.722669e-05,5.716288e-06,3.601776e-06,1.000000e-15,1.779815e-06,4.112614e-06,3.742156e-05,...,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15
놔,2.702642e-04,5.278588e-08,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,...,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ཿ,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,...,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,9.198425e-04,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15
བྷ,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,...,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,7.358740e-03,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15
ཎ,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,...,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,4.599213e-03,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15
ཾ,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,...,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,9.198425e-04,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15,1.000000e-15


In [14]:
# Keep only weights strictly greater than 1e-6; everything else becomes 0.0.
# NaN fails the ``>`` comparison, so missing entries are zeroed as well, which
# is exactly what the former element-wise lambda did.
# ``DataFrame.where`` replaces ``DataFrame.applymap``, which is deprecated
# since pandas 2.1 and evaluates a Python callback per cell.
# NOTE(review): this rebinds df2, replacing the fillna(1e-15) variant assigned
# in the preceding cell.
df2 = df.where(df > 1e-6, 0.0)
df2

language,Korean,ENGLISH,POLISH,Chinese,SPANISH,SWEDISH,INDONESIAN,Japanese,ChineseT,ARABIC,...,SESELWA,SAMOAN,LINGALA,XHOSA,TIBETAN,OCCITAN,NEPALI,GUARANI,HMONG,TATAR
쟁,0.005987,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
여,0.044000,0.0,0.0,0.000002,0.0,0.0,0.0,0.0,0.000002,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
놔,0.000113,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
야,0.023785,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
하,0.080850,0.0,0.0,0.000003,0.0,0.0,0.0,0.0,0.000002,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ཿ,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000100,0.0,0.0,0.0,0.0,0.0
བྷ,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000314,0.0,0.0,0.0,0.0,0.0
ཎ,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.001050,0.0,0.0,0.0,0.0,0.0
ཾ,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000135,0.0,0.0,0.0,0.0,0.0


In [31]:
# Pairwise similarity between languages: entry (a, b) is the dot product of
# the df2 columns for languages a and b.
#
# The whole table is one matrix product (columns^T @ columns) rather than a
# nested Python loop over every pair of columns, so the per-language progress
# prints that loop needed are gone as well.
# The result keeps the original layout: index named 'language', unnamed columns.
weights = df2.to_numpy()
df3 = pd.DataFrame(
    weights.T @ weights,
    index=pd.Index(df2.columns, name='language'),
    columns=list(df2.columns),
)
df3

Korean
ENGLISH
POLISH
Chinese
SPANISH
SWEDISH
INDONESIAN
Japanese
ChineseT
ARABIC
MALAGASY
FRENCH
RUSSIAN
BELARUSIAN
TAMIL
TURKISH
DANISH
CROATIAN
ITALIAN
MACEDONIAN
DUTCH
CZECH
GREEK
GERMAN
TELUGU
SLOVENIAN
FINNISH
HUNGARIAN
BIHARI
PORTUGUESE
VIETNAMESE
MALAYALAM
ALBANIAN
MALTESE
Unknown
GALICIAN
ESTONIAN
ARMENIAN
NORWEGIAN
SERBIAN
HEBREW
SLOVAK
THAI
UKRAINIAN
ROMANIAN
GEORGIAN
LITHUANIAN
HINDI
BENGALI
ICELANDIC
LATVIAN
PERSIAN
SWAHILI
TAGALOG
UZBEK
CATALAN
BOSNIAN
FRISIAN
ORIYA
SINHALESE
MARATHI
TURKMEN
AZERBAIJANI
BASQUE
NORWEGIAN_N
BURMESE
BULGARIAN
TIGRINYA
SOMALI
LUXEMBOURGISH
MALAY
URDU
GUJARATI
KAZAKH
TAJIK
KANNADA
HAUSA
DHIVEHI
PASHTO
MONGOLIAN
KURDISH
PUNJABI
KINYARWANDA
WELSH
RHAETO_ROMANCE
AYMARA
WARAY_PHILIPPINES
OROMO
KYRGYZ
IRISH
SCOTS_GAELIC
GANDA
AFRIKAANS
FAROESE
YORUBA
NYANJA
AMHARIC
LATIN
CORSICAN
IGBO
SESELWA
SAMOAN
LINGALA
XHOSA
TIBETAN
OCCITAN
NEPALI
GUARANI
HMONG
TATAR


Unnamed: 0_level_0,Korean,ENGLISH,POLISH,Chinese,SPANISH,SWEDISH,INDONESIAN,Japanese,ChineseT,ARABIC,...,SESELWA,SAMOAN,LINGALA,XHOSA,TIBETAN,OCCITAN,NEPALI,GUARANI,HMONG,TATAR
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Korean,1.000000,0.216318,0.182441,0.147054,0.200121,0.165137,0.174530,0.085225,0.162381,0.067133,...,0.172251,0.280851,0.147573,0.242620,0.042780,0.016871,0.049233,0.151245,0.273001,0.134709
ENGLISH,0.216318,1.000000,0.944212,0.261769,0.980731,0.941520,0.948352,0.214695,0.333525,0.184937,...,0.968692,0.854101,0.929923,0.911321,0.233207,0.047327,0.301242,0.930023,0.849700,0.149078
POLISH,0.182441,0.944212,1.000000,0.203160,0.937582,0.898497,0.935064,0.184812,0.271614,0.158906,...,0.924498,0.745985,0.918228,0.849509,0.226790,0.055938,0.293411,0.917665,0.746501,0.081695
Chinese,0.147054,0.261769,0.203160,1.000000,0.236156,0.169820,0.188863,0.325618,0.772165,0.100806,...,0.183163,0.410442,0.141277,0.325720,0.047085,0.025083,0.048724,0.150806,0.393083,0.237708
SPANISH,0.200121,0.980731,0.937582,0.236156,1.000000,0.933553,0.947659,0.200675,0.307153,0.169813,...,0.970896,0.823827,0.953706,0.913954,0.233086,0.062113,0.297655,0.973688,0.812573,0.114654
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OCCITAN,0.016871,0.047327,0.055938,0.025083,0.062113,0.066267,0.041741,0.015293,0.029388,0.012890,...,0.052077,0.050755,0.069368,0.050502,0.010324,1.000000,0.012793,0.061471,0.052410,0.021332
NEPALI,0.049233,0.301242,0.293411,0.048724,0.297655,0.300902,0.314052,0.051169,0.070257,0.043853,...,0.298956,0.221883,0.301591,0.263187,0.074079,0.012793,1.000000,0.299968,0.225324,0.008353
GUARANI,0.151245,0.930023,0.917665,0.150806,0.973688,0.930520,0.946213,0.159961,0.217620,0.132738,...,0.967751,0.704404,0.976912,0.849404,0.232170,0.061471,0.299968,1.000000,0.695087,0.019561
HMONG,0.273001,0.849700,0.746501,0.393083,0.812573,0.736619,0.767015,0.255385,0.464062,0.216057,...,0.752204,0.955357,0.693933,0.880423,0.182500,0.052410,0.225324,0.695087,1.000000,0.364925


In [32]:
# Save the language-by-language similarity table (df3) as an Excel workbook.
df3.to_excel('MASTER.LINGUISTIC.1GRAM.big.char-freq-similarity.xlsx')

In [36]:
# Language names in df2's column order; used to label the clustered vectors.
labels = df2.columns.tolist()
labels

['Korean',
 'ENGLISH',
 'POLISH',
 'Chinese',
 'SPANISH',
 'SWEDISH',
 'INDONESIAN',
 'Japanese',
 'ChineseT',
 'ARABIC',
 'MALAGASY',
 'FRENCH',
 'RUSSIAN',
 'BELARUSIAN',
 'TAMIL',
 'TURKISH',
 'DANISH',
 'CROATIAN',
 'ITALIAN',
 'MACEDONIAN',
 'DUTCH',
 'CZECH',
 'GREEK',
 'GERMAN',
 'TELUGU',
 'SLOVENIAN',
 'FINNISH',
 'HUNGARIAN',
 'BIHARI',
 'PORTUGUESE',
 'VIETNAMESE',
 'MALAYALAM',
 'ALBANIAN',
 'MALTESE',
 'Unknown',
 'GALICIAN',
 'ESTONIAN',
 'ARMENIAN',
 'NORWEGIAN',
 'SERBIAN',
 'HEBREW',
 'SLOVAK',
 'THAI',
 'UKRAINIAN',
 'ROMANIAN',
 'GEORGIAN',
 'LITHUANIAN',
 'HINDI',
 'BENGALI',
 'ICELANDIC',
 'LATVIAN',
 'PERSIAN',
 'SWAHILI',
 'TAGALOG',
 'UZBEK',
 'CATALAN',
 'BOSNIAN',
 'FRISIAN',
 'ORIYA',
 'SINHALESE',
 'MARATHI',
 'TURKMEN',
 'AZERBAIJANI',
 'BASQUE',
 'NORWEGIAN_N',
 'BURMESE',
 'BULGARIAN',
 'TIGRINYA',
 'SOMALI',
 'LUXEMBOURGISH',
 'MALAY',
 'URDU',
 'GUJARATI',
 'KAZAKH',
 'TAJIK',
 'KANNADA',
 'HAUSA',
 'DHIVEHI',
 'PASHTO',
 'MONGOLIAN',
 'KURDISH',
 'PUNJ

In [44]:
# One plain list per language, in the same order as `labels`, holding that
# language's column of df2.
vecs = [list(column) for _name, column in df2[labels].items()]
len(vecs)

110

In [45]:
# Agglomerative clustering of the language vectors: single linkage on
# Euclidean distance, without optimal leaf ordering.
linkages = hierarchy.linkage(
    vecs,
    method='single',
    metric='euclidean',
    optimal_ordering=False,
)
len(linkages)

109

In [46]:
# Show the raw linkage matrix. Per SciPy's `linkage` documentation each row
# describes one merge: [cluster index, cluster index, distance between them,
# number of original observations in the merged cluster].
linkages

array([[3.50000000e+01, 1.07000000e+02, 7.96099175e-02, 2.00000000e+00],
       [3.80000000e+01, 6.40000000e+01, 8.93920436e-02, 2.00000000e+00],
       [4.00000000e+00, 2.90000000e+01, 1.08156595e-01, 2.00000000e+00],
       [5.50000000e+01, 1.10000000e+02, 1.10591166e-01, 3.00000000e+00],
       [8.50000000e+01, 1.00000000e+02, 1.19524848e-01, 2.00000000e+00],
       [2.10000000e+01, 4.10000000e+01, 1.31108468e-01, 2.00000000e+00],
       [1.80000000e+01, 1.12000000e+02, 1.34726023e-01, 3.00000000e+00],
       [9.20000000e+01, 1.14000000e+02, 1.36448563e-01, 3.00000000e+00],
       [6.90000000e+01, 8.40000000e+01, 1.40377907e-01, 2.00000000e+00],
       [4.90000000e+01, 9.30000000e+01, 1.40616460e-01, 2.00000000e+00],
       [1.10000000e+01, 1.16000000e+02, 1.46876851e-01, 4.00000000e+00],
       [1.00000000e+00, 1.20000000e+02, 1.48603755e-01, 5.00000000e+00],
       [1.17000000e+02, 1.18000000e+02, 1.48737480e-01, 5.00000000e+00],
       [9.80000000e+01, 1.02000000e+02, 1.58945677e