In [4]:
from collections import Counter
import random
import re

import pandas as pd
import razdel
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

In [2]:
from LID import LanguageDetector
    
LD = LanguageDetector()

## get and parse pages

In [3]:
def get_content(soup):
    body = soup.find('div', {'id': 'bodyContent'})
    return [paragraph.text for paragraph in body.findAll('p')]

In [5]:
good_counter = 0

def parse_by_id(idx, lang='mdf'):
    url = f'https://{lang}.wikipedia.org/?curid={idx}'
    soup = BeautifulSoup(requests.get(url).text)

    raw_content = '\n\n'.join(get_content(soup))
    if LD.predict_lang(raw_content).most_common(1)[0][0] != lang:
        return None

    results = {
        'url': url,
        'content': raw_content.split('\n\n')
    }
    ru_button = soup.find('li', {'class': 'interlanguage-link interwiki-ru mw-list-item'})
    if ru_button:
        ru_url = ru_button.find('a')['href']
        
        ru_soup = BeautifulSoup(requests.get(ru_url).text)
        results['ru_url'] = ru_url
        results['ru_content'] = get_content(ru_soup)
    return results

In [16]:
all_results = []

In [17]:
for i in range(10):
    page = parse_by_id(i)
    if page is None:
        continue
    all_results.append(page)

In [18]:
len(all_results)

5

## Sort pages by useful content

In [20]:
good_results = {}
for item in tqdm(all_results):
    url = item['url']
    text = '\n\n'.join(item['content']).strip()
    text = re.sub('\[\d+\]', '', text)

    pars = []
    mdf_scores = []
    ru_scores = []
    top_langs = []
    lens = []
    for p in text.split('\n\n'):
        p = p.strip()
        if len(p.strip()) < 3:
            continue
        if (p.count('•') + p.count('·') + p.count('|')) / len(p) > 0.05:
            continue
        if not re.match('.*[а-яё].*', p.lower(), re.DOTALL):
            continue
        pars.append(p)
        langs = LD.predict_lang(p)
        mdf_scores.append(langs['mdf'])
        ru_scores.append(langs['ru'])
        top_langs.append(langs.most_common(1)[0][0])
        lens.append(len(p))
    
    good_pars = '\n\n'.join([p for i, p in enumerate(pars) if top_langs[i] == 'mdf'])
    
    if good_pars.startswith('Тя категориес') or good_pars.startswith('Няфтьфт'):
        continue
    
    if good_pars:
        good_results[url] = good_pars
        
print(len(good_results))

  0%|          | 0/5 [00:00<?, ?it/s]

5


In [22]:
url = random.choice(list(good_results.keys()))
print(url)
text = good_results[url]
print(text)


https://mdf.wikipedia.org/?curid=0
Вешф-анаф лопать пряфкслемоц шава или эсонза ули аньцек лемонди ётка васта.

Пря лопа лопанди мрдамс.


In [24]:
Counter(good_results.values()).most_common(5)

[('Вешф-анаф лопать пряфкслемоц шава или эсонза ули аньцек лемонди ётка васта.\n\nПря лопа лопанди мрдамс.',
  3),
 ('Арьсезь корхтама лопат — васта, коса ломаттне арьсихть, кода цебярьгофтомс уликсть Википедиесь. Ушедода од арьсезь корхнема, штоба кирдемс соткс и и арьсемс-ладямс мезть-бди мархта Az1568. Тяса сёрматфть тонь кядялот ули кода моафтомс и лиятненди.',
  1),
 ('тяштьфкс: Улема, сёрматфксть лисемда меле полафтоматнень няеманкса тееть сави аруяфтомс-шамдомс эсь браузерцень кэш-паргонянц.',
  1)]

In [25]:
len(good_results)

5

## get candidates with `ru_content`

In [26]:
url2item = {item['url']: item for item in all_results}

In [27]:
candidates = list({k for k, v in good_results.items() if 'ru_content' in url2item[k]})
len(candidates)

0

## get df with data

In [119]:
def get_good_text(paragraphs, target_language='ru'):
    text = '\n\n'.join(paragraphs).strip()
    text = re.sub('\[\d+\]', '', text)

    pars = []
    top_langs = []
    for p in text.split('\n\n'):
        p = p.strip().replace('\xa0', ' ')
        if len(p.strip()) < 3:
            return ''
        if (p.count('•') + p.count('·') + p.count('|')) / len(p) > 0.05:
            return ''
        if not re.match('.*[а-яё].*', p.lower(), re.DOTALL):
            return ''
        pars.append(p)
        langs = LD.predict_lang(p)
        top_langs.append(langs.most_common(1)[0][0])
    
    good_pars = '\n\n'.join([p for i, p in enumerate(pars) if top_langs[i] == target_language])
    
    if good_pars.startswith('Тя категориес') or good_pars.startswith('Няфтьфт'):
        return ''
    
    return good_pars

In [140]:
all_ru_sentences = []
all_mdf_sentences = []

for url in tqdm(candidates):
    ru_text = get_good_text(url2item[url]['ru_content'])
    if not ru_text: 
        continue
    sents_mdf = [s.text for p in good_results[url].split('\n') for s in razdel.sentenize(p) if s.text]
    sents_ru = [s.text for p in ru_text.split('\n') for s in razdel.sentenize(p)  if s.text]
    
    all_ru_sentences.append(sents_ru)
    all_mdf_sentences.append(sents_mdf)

  0%|          | 0/5975 [00:00<?, ?it/s]

In [153]:
df = pd.DataFrame({'ru_text': ' '.join(all_ru_sentences), 'mdf_text': ' '.join(all_mdf_sentences)})

In [6]:
df = df[df['mdf_text'].apply(len) > 20]

In [7]:
df.shape

(3164, 2)

In [13]:
df.reset_index(drop=True).to_pickle('results/wikipedia_dump.pkl')