```
🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦
🟦🟦😁🟦🟦🟦😁🟦🟦🟦😁🟦🟦🟦😁😁😁🟦😁😁😁🟦😁😁😁😁🟦😁🟦🟦🟦😁🟦🟦
🟦🟦😁😁🟦😁😁🟦🟦😁🟦😁🟦🟦🟦🟦😁🟦🟦😁🟦🟦😁🟦🟦🟦🟦🟦😁🟦😁🟦🟦🟦
🟦🟦😁🟦😁🟦😁🟦😁🟦🙄🟦😁🟦🟦🟦😁🟦🟦😁🟦🟦😁😁😁🟦🟦🟦🟦😁🟦🟦🟦🟦
🟦🟦😁🟦🟦🟦😁🟦🟦😁🟦😁🟦🟦😁🟦😁🟦🟦😁🟦🟦😁🟦🟦🟦🟦🟦🟦😁🟦🟦🟦🟦
🟦🟦😁🟦🟦🟦😁🟦🟦🟦😁🟦🟦🟦🟦😁🟦🟦😁😁😁🟦😁🟦🟦🟦🟦🟦🟦😁🟦🟦🟦🟦
🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦
```

# Mojification

**Mojification** is a Jupyter Notebook-based helper for aligning two texts in different languages. Based on [Lingtrain Aligner](https://github.com/averkij/lingtrain-aligner) (version 0.8.7), it does not change the texts themselves but “mojifies” them.

>🤓 **to mojify** *verb* /ˈmɒdʒɪfaɪ/ to wrap corresponding sentences of two texts in different languages in emoji

>🤓 **to de-mojify** *verb* /ˌdiːˈmɒdʒɪfaɪ/ to delete emojis from two aligned texts

[Github](https://github.com/bilinguator/mojification/) | Created by 📚 [Bilinguator.com](https://bilinguator.com/ru/)

## 1. Load packages

In [None]:
# !pip install -U lingtrain-aligner==0.8.7
# !pip install razdel dateparser sentence_transformers
# !pip install translate
import os
import re
from tqdm import tqdm
from translate import Translator
from lingtrain_aligner import preprocessor, splitter, aligner, resolver, reader, helper, vis_helper

## 2. Load two texts

Name files as follows: `<FOLDER>/<BOOKID>_<LANG>.txt`

In [None]:
# Enter the information about two TXT files to be aligned
folder = '../b-editor/books/'
book_id = 'le_petit_prince'
lang1 = 'ru'
lang2 = 'eo'
# Keep the code of language 2 as entered: `lang2` itself may be renamed below,
# while the translator and the emoji list file name rely on the real code.
lang2_real = lang2
# Defaults; both may be overridden in the "Intermediate translation options" cell
lang3 = None
lang2_split_method = 'splitter'

# File paths are built from the language codes as entered, before any renaming
text1_path  = f'{folder.rstrip("/")}/{book_id}_{lang1}.txt'
text2_path  = f'{folder.rstrip("/")}/{book_id}_{lang2}.txt'

# Codes that are remapped before the supported-language check.
# The same remapping is applied to both languages so that a code behaves
# identically no matter which side of the pair it is on.
splitter_aliases = {'be': 'bu', 'cs': 'cz', 'sv': 'sw'}
supported_languages = splitter.get_supported_languages()

lang1 = splitter_aliases.get(lang1, lang1)
if lang1 not in supported_languages:
    print(f'"{lang1}" is not supported! Renamed to "xx".')
    lang1 = 'xx'

lang2 = splitter_aliases.get(lang2, lang2)
if lang2 not in supported_languages:
    print(f'"{lang2}" is not supported! Renamed to "xx".')
    lang2 = 'xx'

# Read both texts entirely into memory
with open(text1_path, "r", encoding="utf8") as handle1:
    text1 = handle1.read()
with open(text2_path, "r", encoding="utf8") as handle2:
    text2 = handle2.read()

## Intermediate translation options

Intermediate translation means translating the text in the badly aligned language 2 (`lang2`) into a well aligned language (`lang3`). The translation is then used in the alignment process; once aligned, it is replaced with the original text in language 2. If no intermediate translation is needed, set `lang3` to `False`.

Badly aligned languages may also be badly split by the `splitter.split_by_sentences_wrapper` function. Choose the `standard_split` method to split the texts with the built-in Python `split('\n')` instead.

In [None]:
# Language used for the intermediate translation; a falsy value disables it
lang3 = 'ru'
# Available splitting methods: plain split('\n') or the aligner's sentence splitter
lang2_split_methods = ['standard_split', 'splitter']
# Select the method by its index in the list above
lang2_split_method = lang2_split_methods[0]

## 3. Align

In [None]:
# Prepare texts and database
# The alignment database lives in a local "db" directory
os.makedirs('db', exist_ok=True)

db_path = f'db/{book_id}_{lang1}_{lang2}.db'

# Choose model:
# _sentence_transformer_multilingual_ is faster, supports 50+ languages;
# _sentence_transformer_multilingual_labse_ supports 100+ languages.
models = ['sentence_transformer_multilingual', 'sentence_transformer_multilingual_labse']
model_name = models[0]

# Split by sentences
splitted = {}

if lang2_split_method == 'splitter':
    splitted['from'] = splitter.split_by_sentences_wrapper(text1.split('\n'), lang1)
    splitted['to'] = splitter.split_by_sentences_wrapper(text2.split('\n'), lang2)
elif lang2_split_method == 'standard_split':
    # Both texts are split line by line in this mode
    splitted['from'] = text1.split('\n')
    splitted['to'] = text2.split('\n')
else:
    # Fail early with a clear message instead of a KeyError further down
    raise ValueError(
        f'Unknown split method "{lang2_split_method}". '
        'Use "splitter" or "standard_split".'
    )

# Create and fill database
# Start from a clean database: remove the one left from a previous run
if os.path.isfile(db_path):
    os.unlink(db_path)

# `route` names the key of `splitted` that is aligned against the "from" text
route = 'to'
if lang3:
    print(f'Translation from {lang2_real} to {lang3}.')
    route = 'translation'
    translator = Translator(from_lang=lang2_real, to_lang=lang3)
    # Translate line by line so that every translated line keeps the index of its original
    splitted[route] = [translator.translate(line) for line in tqdm(splitted['to'])]

aligner.fill_db(db_path, lang1, lang2, splitted['from'], splitted[route])

# Align

# Adjust batch_size or batch_count if needed
batch_size = 100
batch_count = len(text1.split('\n')) // batch_size + 4
# NOTE(review): batch ids start at 1 here; confirm this matches the batch
# numbering expected by lingtrain-aligner's align_db.
batch_ids = list(range(1, batch_count + 1))

print(f'Batch count: {batch_count}.')

aligner.align_db(
    db_path,
    model_name,
    batch_size=batch_size,
    window=40,
    batch_ids=batch_ids,
    save_pic=False,
    embed_batch_size=10,
    normalize_embeddings=True,
    show_progress_bar=True,
)

# Visualize alignments
os.makedirs('img', exist_ok=True)
output_path = f'img/alignment_{book_id}_{lang1}_{lang2}.png'
vis_helper.visualize_alignment_by_db(db_path, output_path=output_path,
                                     lang_name_from=lang1,
                                     lang_name_to=lang2,
                                     batch_size=400,
                                     size=(800,800),
                                     plt_show=True)

# Resolve conflicts

# Determine all conflicts and print statistics
conflicts_to_solve, rest = resolver.get_all_conflicts(db_path, min_chain_length=2,
                                                      max_conflicts_len=6, batch_id=-1)

resolver.get_statistics(conflicts_to_solve)
resolver.get_statistics(rest)

# Resolve conflicts
steps = 3
batch_id = -1 # align all available batches

for i in range(steps):
    # Every step widens the search: longer minimal chains and longer conflicts
    conflicts, rest = resolver.get_all_conflicts(db_path, min_chain_length=2+i,
                                                 max_conflicts_len=6*(i+1), batch_id=batch_id,
                                                 handle_start=True, handle_finish=True)
    resolver.resolve_all_conflicts(db_path, conflicts, model_name, show_logs=False)
    # The picture is overwritten on every step, so the file shows the latest state
    output_path = f'img/conflicts_{book_id}_{lang1}_{lang2}.png'
    vis_helper.visualize_alignment_by_db(db_path, output_path=output_path,
                                         lang_name_from=lang1, lang_name_to=lang2,
                                         batch_size=400, size=(600,600), plt_show=True)

    # Nothing left to resolve: stop early
    if len(rest) == 0:
        break

## 4. Mojify texts

In [None]:
# Get sentences
paragraphs, delimeters, metas, sent_counter = reader.get_paragraphs(
    db_path, direction='to'
)

for direction in ('from', 'to'):
    print(f'{len(paragraphs[direction])} "{direction}" paragraphs totally.')
print()

# Fill the lists of aligned sentences
sentences = {'from': [], 'to': [], 'translation': []}

for direction in ('from', 'to'):
    # With intermediate translation the aligned "to" side holds the translated
    # lines, so they are collected under the 'translation' key.
    if direction == 'to' and route == 'translation':
        target = 'translation'
    else:
        target = direction
    for paragraph in paragraphs[direction]:
        sentences[target].extend(paragraph)
    print(f'{len(sentences[target])} "{target}" sentences totally.')

# Fill the "to" sentences from the "translation" sentences
if route == 'translation':
    # Map every translated line to its original line. On duplicates the first
    # occurrence wins, which is what a list.index() lookup would return.
    original_by_translation = {}
    for original, translated in zip(splitted['to'], splitted['translation']):
        original_by_translation.setdefault(translated, original)

    sentences['to'] = [''] * len(sentences['translation'])
    for i, translated in enumerate(sentences['translation']):
        # Service messages returned instead of a translation
        if 'QUERY LENGTH LIMIT EXCEEDED' in translated:
            break
        if 'MYMEMORY WARNING' in translated:
            continue
        # Sentences without an exact counterpart stay empty and are skipped later
        sentences['to'][i] = original_by_translation.get(translated, '')
    print(f'{len(sentences["to"])} "to" sentences totally.')


# Get emojis
with open('emojis/emojis.txt', 'r', encoding='utf-8') as handle:
    emojis = handle.read()

# Keep only the characters that occur in neither text, so that every marker can
# be removed later without touching the original content. The order of the file
# is preserved (and duplicates dropped) to make runs reproducible; whitespace
# is never used as a marker.
used_chars = set(text1) | set(text2)
emojis = ''.join(
    char for char in dict.fromkeys(emojis)
    if char not in used_chars and not char.isspace()
)

if not emojis:
    raise ValueError('No usable emojis: every character of "emojis/emojis.txt" '
                     'already occurs in the texts or the file is empty.')

# Save the emojis available for this book
emojis_path = f'emojis/emojis_{book_id}_{lang1}_{lang2_real}.txt'
with open(emojis_path, 'w', encoding='utf-8') as handle:
    handle.write(emojis)

emoji_index = 0
emoji_counter = 0

last_emoji = None

# Mojify texts
for sentence1, sentence2 in zip(sentences['from'], sentences['to']):
    if sentence1 == '' or sentence2 == '':
        continue

    emoji = emojis[emoji_index]

    # Only sentences found exactly once in both texts can be wrapped unambiguously
    if text1.count(sentence1) == text2.count(sentence2) == 1:
        text1 = text1.replace(sentence1, f'{emoji}{sentence1}{emoji}')
        text2 = text2.replace(sentence2, f'{emoji}{sentence2}{emoji}')

        # Emojis are reused cyclically once the list is exhausted
        emoji_index = (emoji_index + 1) % len(emojis)
        emoji_counter += 1
        last_emoji = emoji

# Progress is the position of the last inserted emoji relative to the text length
if last_emoji is None:
    # Nothing was mojified, so there is no position to look for
    progress1 = progress2 = 0.0
else:
    progress1 = round((text1.rfind(last_emoji) + 1) / len(text1) * 100, 1)
    progress2 = round((text2.rfind(last_emoji) + 1) / len(text2) * 100, 1)
print(f'{progress1}% of the text 1 mojified.')
print(f'{progress2}% of the text 2 mojified.')
print(f'{emoji_counter} emojis used.')

## 5. Save mojified texts

In [None]:
# Overwrite both source files with their mojified versions
for target_path, mojified_text in ((text1_path, text1), (text2_path, text2)):
    with open(target_path, 'w', encoding='utf-8') as out_file:
        out_file.write(mojified_text)

## 6. De-mojify texts

In [None]:
# The mojification step saves the emojis available for this book under the
# real code of language 2 (`lang2_real`), so the same name is used here.
book_emojis_path = f'emojis/emojis_{book_id}_{lang1}_{lang2_real}.txt'

# Prefer the per-book list: it holds only characters absent from the original
# texts, so removing them cannot damage the original content. Fall back to the
# full list when the per-book file does not exist.
emojis_path = book_emojis_path
if not os.path.isfile(emojis_path):
    emojis_path = 'emojis/emojis.txt'

with open(emojis_path, 'r', encoding='utf-8') as handle:
    emojis = handle.read()

# Delete every listed character in a single pass over each text.
# Whitespace (e.g. a trailing newline in the list file) is never removed.
removal_table = {ord(char): None for char in emojis if not char.isspace()}
text1 = text1.translate(removal_table)
text2 = text2.translate(removal_table)

# The per-book list is not needed once the texts are clean
if os.path.isfile(book_emojis_path):
    os.remove(book_emojis_path)

# Save de-mojified texts
with open(text1_path, 'w', encoding='utf-8') as handle1:
    handle1.write(text1)

with open(text2_path, 'w', encoding='utf-8') as handle2:
    handle2.write(text2)