In [1]:
import pandas as pd

In [20]:
from collections import defaultdict
import random
import time
from multiprocessing.pool import ThreadPool

import fasttext
import numpy as np
import requests
from bs4 import BeautifulSoup
from nltk import sent_tokenize
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tqdm.auto import tqdm, trange

https://meta.wikimedia.org/wiki/List_of_Wikipedias

# Download samples of Wikipedia in multiple languages

In [2]:
tables = pd.read_html('https://meta.wikimedia.org/wiki/List_of_Wikipedias')

In [3]:
len(tables)

2

In [4]:
# Keep only the Wikipedias that actually contain articles.
# The index is reset so that row labels coincide with row positions: later cells
# pick languages by position (aligned with the sampling weights), and gaps left
# by the filter would otherwise make label-based lookups fail or hit the wrong row.
all_langs = tables[0]
all_langs = all_langs[all_langs.Articles > 0].reset_index(drop=True)

In [5]:
all_langs

Unnamed: 0,№,Language,Language (local),Wiki,Articles,All pages,Edits,Admins,Users,Active users,Files,Depth
0,1,English,English,en,6947529,62423458,1268332783,849,48638764,125817,932156,1295
1,2,Cebuano,Cebuano,ceb,6116829,11229541,35055778,5,123890,146,1,2
2,3,German,Deutsch,de,2983285,8206709,251127588,174,4513992,19214,129877,93
3,4,French,français,fr,2661853,13367484,222099832,147,5103325,18663,72889,268
4,5,Swedish,svenska,sv,2602633,6288042,56694452,66,937231,2204,0,18
...,...,...,...,...,...,...,...,...,...,...,...,...
335,336,Kalaallisut,kalaallisut,kl,242,2259,75528,3,14589,22,0,2322
336,337,West Coast Bajau,Bajau Sama,bdr,235,1345,5322,0,610,8,0,88
337,338,South Ndebele,isiNdebele seSewula,nr,129,385,2165,0,362,8,0,22
338,339,Tigre,ትግሬ,tig,37,464,6108,1,297,16,0,1753


In [7]:
all_langs[all_langs['Wiki'] == 'mdf']

Unnamed: 0,№,Language,Language (local),Wiki,Articles,All pages,Edits,Admins,Users,Active users,Files,Depth
194,195,Moksha,мокшень,mdf,7129,21915,94797,3,10527,22,0,18


## Scrape the texts

Sample the languages with temperature sampling. 

In [21]:
# Temperature-sampling weights: taking the 5th root of the article counts
# flattens the distribution, so small Wikipedias are sampled more often than
# their raw size would suggest. The weights are then normalised to sum to 1.

w = np.power(all_langs.Articles.values, 1/5)
w = w / sum(w)
sum(w)

0.9999999999999998

In [18]:
pages = []

In [14]:
def get_page(lang, timeout=0.1, max_retries=None, request_timeout=30):
    """Download one random article from the Wikipedia of language ``lang``.

    When the server answers with a "Wikimedia Error" page, the request is
    repeated after sleeping, doubling the sleep after every failed attempt.

    Args:
        lang: Wikipedia subdomain code, e.g. ``'en'``.
        timeout: initial back-off sleep in seconds between retries (this is
            NOT the network timeout).
        max_retries: maximum number of retries after an error page;
            ``None`` (default) retries indefinitely.
        request_timeout: network timeout in seconds passed to ``requests.get``
            so that a stalled connection cannot block forever.

    Returns:
        A two-element list ``[final_url, html_text]``.

    Raises:
        RuntimeError: if ``max_retries`` is set and every attempt returned an
            error page.
        requests.exceptions.RequestException: on network failures/timeouts.
    """
    url = f'https://{lang}.wikipedia.org/wiki/Special:Random'
    delay = timeout
    retries = 0
    # Iterative retry instead of recursion: a long outage can no longer
    # exhaust the recursion limit.
    while True:
        res = requests.get(url, timeout=request_timeout)
        if '<title>Wikimedia Error</title>' not in res.text:
            return [res.url, res.text]
        if max_retries is not None and retries >= max_retries:
            raise RuntimeError(
                f'Wikimedia error page for {lang!r} after {retries} retries'
            )
        time.sleep(delay)
        delay *= 2
        retries += 1

In [None]:
# example with async mining


# pool = ThreadPool(processes=100)

# for i in trange(2):
#     if random.random() < 0.2:
#         # uniform
#         langs = all_langs.Wiki.tolist()
#     else:
#         # heated distribution
#         langs = all_langs.Wiki.loc[random.choices(range(len(all_langs)), weights=w, k=len(all_langs))].tolist()
#     async_result = pool.map(get_page, langs)
#     pages.extend(async_result)

In [None]:
# example with separate mining

# The iteration count is effectively unbounded: interrupt the kernel once
# enough pages have been collected in `pages`.
for i in trange(100_000_000):
    if random.random() < 0.2:
        # uniform over languages
        idx = random.choice(range(len(all_langs)))
    else:
        # heated (temperature-flattened) distribution over languages
        idx = random.choices(range(len(all_langs)), weights=w)[0]
    # `idx` is a row position (it is aligned with `w`), so it must be looked up
    # with .iloc; .loc would interpret it as an index label.
    lang = all_langs.Wiki.iloc[idx]
    # get_page retries when Wikimedia serves an error page instead of an article
    pages.append(get_page(lang))

In [53]:
len(pages)

339

## put texts by language together

In [55]:
def get_paragraphs(html):
    """Extract the non-empty paragraph texts from a Wikipedia article page.

    Only ``<p>`` elements inside ``<div id="bodyContent">`` are considered.
    ``<sup>``, ``<style>`` and ``<script>`` elements nested in a paragraph are
    removed before the text is read, and non-breaking spaces are replaced by
    regular spaces.

    Args:
        html: the HTML source of the page.

    Returns:
        A list of paragraph strings (whitespace-only paragraphs are skipped);
        an empty list if the page has no ``bodyContent`` container.
    """
    # An explicit parser keeps the result independent of which optional
    # parsers happen to be installed (and avoids the bs4 warning).
    soup = BeautifulSoup(html, 'html.parser')
    body = soup.find('div', {'id': 'bodyContent'})
    if not body:
        return []
    result = []
    for p in body.find_all('p'):
        # drop footnote markers and embedded styles/scripts
        for unwanted in p.find_all(['sup', 'style', 'script']):
            unwanted.extract()
        text = p.text.replace('\xa0', ' ')
        if text.strip():
            result.append(text)
    return result

In [56]:
url, html = random.choice(pages)
print(url)

print(get_paragraphs(html))

https://blk.wikipedia.org/wiki/%E1%80%9D%E1%80%B1%E1%80%84%E1%80%BA%EA%A9%BB%E1%80%94%E1%80%9A%E1%80%BA%E1%82%8F%E1%80%9C%E1%80%94%E1%80%BA%E1%80%B8%E1%80%99%E1%80%92%E1%80%B1%E1%80%AB%E1%80%BA
['ဝေင်ꩻနယ်ႏလန်းမဒေါ် (မန်း: လမ်းမတော်မြို့နယ်) (အိန်းကလေတ်: Lanmadaw Township) နဝ်ꩻ အဝ်ႏဒျာႏ မျန်မာခမ်းထီ အခဝ်ထာႏဝ ရန်ႏတကုင်ႏတွိုင်ꩻဒေႏသတန် စျောက်တဒါးခရဲင်ႏ ကို ကပါဒါႏ ဝေင်ꩻနယ်ႏတဖြုံႏဒျာႏသွူ။ အဝ်ႏပယ်ဒျာႏ ဝေင်ꩻတန်ရန်ႏတကုင်ႏ ဝေင်ꩻကဲဉ်မော့တွို့ꩻကို ထွာဒျာႏ ဝေင်ꩻတန်ရန်ႏတကုင်ႏ တခူးတခဝ်ဟံႏနယ်ႏဒျာႏသွူ။\n']


In [None]:
def url2lang(url):
    """Return the language code of a Wikipedia URL.

    The code is the first dot-separated label of whatever follows the first
    ``//`` in ``url`` (e.g. ``'en'`` for ``https://en.wikipedia.org/...``).
    """
    after_scheme = url.split('//')[1]
    subdomain, _, _ = after_scheme.partition('.')
    return subdomain

url2lang('https://en.wikipedia.org/wiki/Dunbar_Duncan')

In [59]:
lang2texts = defaultdict(list)

In [60]:
for url, html in tqdm(pages):
    lang2texts[url2lang(url)].extend(get_paragraphs(html))

  0%|          | 0/339 [00:00<?, ?it/s]

# get stats for pages

In [65]:
# Per-language corpus statistics: number of paragraphs, total characters of
# the space-joined paragraphs, and the number of distinct characters in them.
stats = pd.DataFrame([
    {
        'lang': lang_code,
        'pars': len(paragraphs),
        'chars': len(joined),
        'unique_chars': len(set(joined)),
    }
    for lang_code, paragraphs in lang2texts.items()
    for joined in [' '.join(paragraphs)]
])

In [66]:
stats.sort_values('chars', ascending=False)

Unnamed: 0,lang,pars,chars,unique_chars
194,mt,38,16849,86
111,sco,41,14840,78
133,sd,16,11852,111
47,el,29,11399,123
103,tl,13,11337,64
...,...,...,...,...
148,fo,0,0,0
250,pi,0,0,0
253,krc,0,0,0
85,jv,0,0,0


In [67]:
stats.sort_values('unique_chars', ascending=False)

Unnamed: 0,lang,pars,chars,unique_chars
11,zh,37,3953,696
12,ja,13,2175,384
106,wuu,9,960,377
47,el,29,11399,123
133,sd,16,11852,111
...,...,...,...,...
250,pi,0,0,0
253,krc,0,0,0
85,jv,0,0,0
117,avk,0,0,0


In [68]:
stats[stats.lang.apply(lambda x: x in ['ru', 'mdf', 'myv'])]

Unnamed: 0,lang,pars,chars,unique_chars
6,ru,5,2598,88
187,myv,2,120,33
196,mdf,4,312,67


# select train, test sets

In [69]:
train_texts, train_labels = [], []
test_texts, test_labels = [], []

for lang, texts in lang2texts.items():
    # Deduplicate the paragraphs; sorting makes the fixed-seed split reproducible.
    unique_texts = sorted(set(texts))
    # A language needs at least two distinct paragraphs to appear in both sets.
    if len(unique_texts) >= 2:
        train_samples, test_samples = train_test_split(unique_texts, test_size=0.2, random_state=1)

        train_texts += train_samples
        train_labels += [lang] * len(train_samples)

        test_texts += test_samples
        test_labels += [lang] * len(test_samples)
print(len(train_texts), len(test_texts))

856 327


In [70]:
print(pd.Series(train_labels).value_counts()['mdf'])
pd.Series(train_labels).value_counts()

1


skr        35
sco        32
bg         30
mt         30
zh         29
           ..
bug         1
cdo         1
sah         1
bat-smg     1
cr          1
Name: count, Length: 211, dtype: int64

There are too few mdf samples, so we add Bible texts for mdf and myv.

In [755]:
# Load the Moksha (mdf) Bible table; the shape is printed after each cleaning step.
mdf_bible = pd.read_csv('../parsing_aligned/results/mdf_bible.tsv', sep='\t')
print(mdf_bible.shape)

# drop rows with any missing value
mdf_bible = mdf_bible.dropna()
print(mdf_bible.shape)

# drop chapter headings ("Глава " is Russian for "Chapter ")
mdf_bible = mdf_bible[~mdf_bible['mdf'].str.startswith('Глава ')]
print(mdf_bible.shape)
print(mdf_bible.columns)

(12517, 4)
(12500, 4)
(12344, 4)
Index(['Unnamed: 0', 'mdf', 'ru', 'source'], dtype='object')


In [474]:
# Load the Erzya (myv) Bible table; the shape is printed after each cleaning step.
myv_bible = pd.read_csv('../parsing_aligned/results/myv_bible.tsv', sep='\t')
print(myv_bible.shape)

# drop rows with any missing value (the result must be assigned back to myv_bible)
myv_bible = myv_bible.dropna()
print(myv_bible.shape)

# drop chapter headings ("Глава " is Russian for "Chapter ")
myv_bible = myv_bible[~myv_bible['myv'].str.startswith('Глава ')]
print(myv_bible.shape)
print(myv_bible.columns)


(12926, 4)
(12899, 4)
(12483, 4)
Index(['Unnamed: 0', 'myv', 'ru', 'source'], dtype='object')


# augmenting data

In [80]:
def split_by_newline(text):
    """Split ``text`` at every newline character.

    Empty pieces (from leading, trailing or consecutive newlines) are kept.
    """
    newline = '\n'
    pieces = text.split(newline)
    return pieces


def split_overlapping_chunks(text, step=100, chunk_size=300):
    """Cut ``text`` into overlapping windows.

    A window of up to ``chunk_size`` characters starts at every multiple of
    ``step``; windows near the end of the text are shorter. An empty text
    yields an empty list.
    """
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), step)
    ]


def try_split(text, max_length=500):
    """Augment ``text`` with progressively smaller fragments of itself.

    The text is split by newlines, then each piece by sentences, then each
    sentence into overlapping character chunks. After every stage, a fragment
    is kept as an extra sample if it is at most half as long as the piece it
    came from and no longer than ``max_length``. The original text and all
    fragments of the final stage are always kept.

    Args:
        text: the source text.
        max_length: maximum length of an intermediate fragment to keep.

    Returns:
        The distinct texts, sorted by length and then alphabetically, so the
        order is reproducible across runs (set iteration order of strings is
        randomised per process).
    """
    results = [text]
    parts = [text]

    for splitter in [split_by_newline, sent_tokenize, split_overlapping_chunks]:
        new_parts = []
        for part in parts:
            # loop-invariant for this part: fragments must be clearly shorter
            # than their parent to count as new samples
            length_limit = min(len(part) * 0.5, max_length)
            for small_part in (p.strip() for p in splitter(part)):
                if len(small_part) < 3:  # the text is too short
                    continue
                if len(small_part) <= length_limit:
                    results.append(small_part)
                # every usable fragment is split further by the next splitter
                new_parts.append(small_part)
        parts = new_parts
    results.extend(parts)
    return sorted(set(results), key=lambda x: (len(x), x))

In [72]:
# Expand every training text into itself plus its shorter fragments,
# repeating the language label once per produced text.
train_texts_aug, train_labels_aug = [], []
for text, label in zip(tqdm(train_texts), train_labels):
    augmented = try_split(text)
    train_texts_aug += augmented
    train_labels_aug += [label] * len(augmented)

  0%|          | 0/856 [00:00<?, ?it/s]

In [73]:
print(len(train_texts), len(train_texts_aug))

856 4407


In [83]:
# Number of augmented training samples per language; the Moksha ('mdf') count
# is printed separately. Counts are computed once and reused.
aug_label_counts = pd.Series(train_labels_aug).value_counts()
print(aug_label_counts['mdf'])
aug_label_counts

3


mt     212
sco    186
el     186
sd     183
bg     124
      ... 
ia       1
glk      1
su       1
cu       1
ik       1
Name: count, Length: 211, dtype: int64

In [579]:
# Add the Bible verses as extra training data. Texts and labels are extended
# in the same order so that they stay aligned.
train_texts_aug.extend(mdf_bible.mdf.tolist())
train_texts_aug.extend(myv_bible.myv.tolist())  # Erzya text lives in the 'myv' column
train_texts_aug.extend(mdf_bible.ru.tolist())

train_labels_aug.extend(['mdf']*len(mdf_bible))
train_labels_aug.extend(['myv']*len(myv_bible))
train_labels_aug.extend(['ru']*len(mdf_bible))

In [757]:
print(pd.Series(train_labels_aug).value_counts()['mdf'])
print(pd.Series(train_labels_aug).value_counts()['myv'])
pd.Series(train_labels_aug).value_counts()

15610
13603


de     38694
ru     31283
es     25238
en     24849
fr     21234
       ...  
cho       27
ii        18
mus       16
kj         9
ho         4
Length: 323, dtype: int64

# Sklearn pipeline

In [611]:
# Baseline classifier: hashed character n-grams (lengths 1-4, built inside word
# boundaries) fed into a logistic regression with very strong regularisation
# (C is the inverse regularisation strength).
char_ngram_hasher = HashingVectorizer(analyzer='char_wb', ngram_range=(1,4), n_features=100_000)
logreg = LogisticRegression(C=1e-4, max_iter=1_000, solver='saga')
pipe = make_pipeline(char_ngram_hasher, logreg)

In [613]:
%%time
pipe.fit(train_texts_aug, train_labels_aug)

Wall time: 6h 16min 31s


Pipeline(steps=[('hashingvectorizer',
                 HashingVectorizer(analyzer='char_wb', n_features=100000,
                                   ngram_range=(1, 4))),
                ('logisticregression',
                 LogisticRegression(C=0.0001, max_iter=1000, solver='saga'))])

In [622]:
%%time

preds = pipe.predict_proba(test_texts)

Wall time: 50 s


In [624]:
preds.argmax(1)

array([63, 63, 63, ..., 63, 63, 63], dtype=int64)

In [629]:
(pipe.classes_[preds.argmax(1)] == test_labels).mean()

0.028913795159174912

In [631]:
pd.Series(pipe.classes_[preds.argmax(1)]).value_counts()

de    56051
ru    11045
dtype: int64

# FastText

In [84]:
# Write the training set in fastText format: one "__label__<lang> <text>" per line.
# The encoding is set explicitly because the texts span many scripts and the
# platform default encoding may be unable to represent them.
with open('ft_train.txt', 'w', encoding='utf-8') as f:
    for label, text in zip(train_labels_aug, train_texts_aug):
        # each example must fit on one line, so inner newlines become spaces
        single_line = text.replace('\n', ' ')
        f.write(f'__label__{label} {single_line}\n')

In [765]:
# Train the fastText language-ID classifier on the file written above.
# The percentages in the comments are test accuracies observed in earlier runs.
model = fasttext.train_supervised(
    input="ft_train.txt", 
    lr=0.05, # 0.1 gives 38.8% / 0.5 gives 34.8% (overfitting?)
    epoch=100, # 5 gives 38.8%  / 10 gives 49.5% / 
    wordNgrams=0, 
    bucket=200_000,  # fastText's default is 2M; 100K gives 38% acc, and 200K as well.
    dim=64, # FB uses 16, but 32 is much better with me, and 64 seems +- the same
    loss='softmax',
    minn=1,  # character n-grams of length minn..maxn are used as features
    maxn=4, # if I decrease this to 3, the quality is no worse.
    minCount=100, # a larger number is required  # 5 gets 39% accuracy with 650K words; 50 gets 38% acc with 50K words; 
    # 300 gets acc 38% with 6K words
)

In [766]:
model.predict('пек вадря', k=5)

(('__label__myv', '__label__ru', '__label__lez', '__label__tg', '__label__be'),
 array([0.74181491, 0.20231327, 0.0135121 , 0.00760135, 0.00718429]))

In [767]:
model.predict(['пек вадря', 'привет'], k=1)

([['__label__myv'], ['__label__ru']],
 [array([0.7418149], dtype=float32), array([0.9943341], dtype=float32)])

In [768]:
%%time
ft_preds = model.predict([t.replace('\n', ' ') for t in test_texts])

Wall time: 9.3 s


In [769]:
# Keep the top predicted label of every text and strip its '__label__' prefix.
ft_labels = [top_labels[0][len('__label__'):] for top_labels in ft_preds[0]]

Accuracy log across runs: 77% at first, which is not bad! / 0.69618 with a smaller model / 0.76831 with a mid-sized one / 87% with longer training.

In [770]:
(np.array(ft_labels) == test_labels).mean()

0.8764009777035889

In [None]:
print(classification_report(test_labels, ft_labels))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          ab       0.93      0.95      0.94        44
         ace       0.98      0.91      0.95        68
         ady       0.77      0.68      0.72        79
          af       0.95      0.90      0.93       304
          ak       0.70      0.30      0.42       102
         als       0.73      0.89      0.80       417
         alt       0.93      0.95      0.94       319
          am       0.98      0.98      0.98       151
         ami       0.86      0.90      0.88       159
          an       0.79      0.95      0.87       186
         ang       0.97      0.94      0.95       108
          ar       0.93      0.99      0.96       408
         arc       1.00      1.00      1.00        40
         ary       0.93      0.90      0.91       117
         arz       0.97      0.91      0.94       300
          as       0.98      0.96      0.97       302
         ast       0.87      0.96      0.91       628
         atj       0.95    

  _warn_prf(average, modifier, msg_start, len(result))


## save and quantize

In [772]:
model.save_model('../langid/lid.323.bin')

In [787]:
model.quantize(retrain=True, input="ft_train.txt", qnorm=True, cutoff=50_000)

In [788]:
len(model.words)

2282

In [789]:
model.save_model('../langid/lid.323.ftz')

In [790]:
%%time
ft_preds = model.predict([t.replace('\n', ' ') for t in test_texts])

Wall time: 25.1 s


After compression, the model retains 89% accuracy, which is even higher than the uncompressed model. This is probably because quantization with `retrain=True` trains the model further.

In [792]:
# Keep the top predicted label of every text, strip its '__label__' prefix,
# and report the share of test texts whose prediction matches the true label.
ft_labels = [top_labels[0][len('__label__'):] for top_labels in ft_preds[0]]
np.mean(np.array(ft_labels) == test_labels)

0.8900828663407655

In [15]:
print(classification_report(test_labels, ft_labels))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          ab       0.93      0.95      0.94        44
         ace       0.98      0.91      0.95        68
         ady       0.77      0.68      0.72        79
          af       0.95      0.90      0.93       304
          ak       0.70      0.30      0.42       102
         als       0.73      0.89      0.80       417
         alt       0.93      0.95      0.94       319
          am       0.98      0.98      0.98       151
         ami       0.86      0.90      0.88       159
          an       0.79      0.95      0.87       186
         ang       0.97      0.94      0.95       108
          ar       0.93      0.99      0.96       408
         arc       1.00      1.00      1.00        40
         ary       0.93      0.90      0.91       117
         arz       0.97      0.91      0.94       300
          as       0.98      0.96      0.97       302
         ast       0.87      0.96      0.91       628
         atj       0.95    

  _warn_prf(average, modifier, msg_start, len(result))
