# Предобработка
Код для первого подхода к улучшению классификации - предобработка данных для автоматического исправления маскировок  

**Задачи:**
* имплементировать правила замены букв и буквосочетаний
* скомпилировать словари матерных, грубых и ругательных слов
* имплементировать замену по расстоянию Левенштейна и словарю
* собрать оба подхода в двухступенчатый алгоритм

Ссылки на использованные словари:  
https://gist.github.com/nestyme/8531fe4ec34cd2c8e9b306513cb8b59a (Zueva et al.) 89 слов  
https://github.com/bohdan1/AbusiveLanguageDataset/blob/master/bad_words.txt (Andrusyak et al.) 623 слова  
Из первого были удалены слова, не относящиеся к ругательным, из второго - повторы, имеющиеся в первом  
Удалены искажённые формы, чтобы не сбивать Левенштейна  
Добавлены слова: *хули, лахтодырка, пиздуй, школота, рашка, хуйло*

#### Правила замены:
йо → ё  
^мл → бл  
ип → еб  
п → б  
к → х  
т → д  
а → о  
с → з  
и → е  

к → г  
ш → ж  
ф → в  
3.14 → пи  
3,14 → пи

In [52]:
import re
import json
import itertools
import pymorphy2
import pandas as pd
from string import punctuation
from Levenshtein import distance
# Single shared analyzer instance; the functions below use it for dictionary
# lookups (`word_is_known`) and for lemmatization (`parse(...)[0].normal_form`).
morph = pymorphy2.MorphAnalyzer()

In [18]:
# Substitution rules: masked spelling -> dictionary spelling.
# Insertion order is significant, because correct_by_letters walks the rules
# in exactly this order (multi-letter rules such as 'ип' come before 'и').
replace_dict = dict([
    ('йо', 'е'),
    ('ип', 'еб'),
    ('мл', 'бл'),
    ('ау', 'ов'),
    ('и', 'е'),
    ('п', 'б'),
    ('т', 'д'),
    ('к', 'х'),
    ('а', 'о'),
    ('с', 'з'),
    ('ш', 'ж'),
    ('ф', 'в'),
    ('у', 'в'),
    ('ц', 'с'),
    ('цц', 'тс'),
    ('3.14', 'пи'),
    ('3,14', 'пи'),
])

# Persist the rules so that they can be reloaded from disk later on.
with open('replacement.json', mode='w', encoding='UTF-8') as f:
    f.write(json.dumps(replace_dict))

In [10]:
# Reload the substitution rules stored in `replacement.json`.
with open('replacement.json', encoding='UTF-8') as f:
    replace_dict = json.loads(f.read())

In [11]:
# One dictionary entry per line; only the trailing newline is stripped.
with open('bad_wordlist.txt', mode='r', encoding='UTF-8') as f:
    bad_wordlist = [entry.rstrip('\n') for entry in f]

In [12]:
# Sanity check: number of entries loaded into the dictionary.
len(bad_wordlist)

598

#### Собирание замены букв по правилам

In [13]:
def replace_latin(text):
    """
    Swap lowercase Latin look-alike letters for their Cyrillic counterparts.

    Only the 17 lowercase letters listed below are translated; every other
    character (uppercase Latin letters included) is returned unchanged.
    """
    latin = 'wertyuiopahkxcbnm'
    cyrillic = 'шертуииоранкхсвпм'
    lookalikes = str.maketrans(dict(zip(latin, cyrillic)))
    return text.translate(lookalikes)

In [14]:
nonletter_pat = re.compile('[^а-яё]')
pi_pat = re.compile('^3[.,]14.+')

def contains_nonletters(word):
    """
    Tell whether ``word`` has characters other than lowercase Cyrillic letters.

    Words that start with "3.14" / "3,14" followed by at least one more
    character are exempt and always yield False, whatever else they contain.
    """
    if pi_pat.search(word) is not None:
        return False
    return nonletter_pat.search(word) is not None

In [15]:
# Leading reply link of the form "[<id>|<name>], ".
# The "|" must be escaped: unescaped it is a regex alternation, which made
# re.sub delete every later chunk of text that ends in "], " as well and strip
# a lone leading "[" from texts that contain no link at all.
link_pat = re.compile(r'^\[[^\]|]*\|[^\]]*\], ')

def remove_link(text):
    """
    Remove the reply link at the very start of a comment.

    :param text: comment text, possibly starting with "[<id>|<name>], "
    :return: the text without that leading link; texts that do not start
        with such a link are returned unchanged
    """
    return link_pat.sub('', text)

In [16]:
def check(text):
    """
    Tell whether a comment is a reply, i.e. whether it starts with "[".
    """
    return text.startswith('[')

In [17]:
def word_exists(word):
    """
    Report whether PyMorphy2 knows the word.

    Punctuation is stripped from both ends of the word before the lookup.
    """
    bare_word = word.strip(punctuation)
    return bool(morph.word_is_known(bare_word))

In [18]:
def closest_word_levestein(word):
    """
    Pick the entry of ``bad_wordlist`` with the smallest edit distance to
    ``word``; on ties the entry that comes first in the list wins.
    """
    return min(bad_wordlist, key=lambda candidate: distance(word, candidate))

In [19]:
def correct_by_letters(word, non_found_return):
    """
    Undo letter substitutions in ``word`` using the rules in ``replace_dict``.

    Every rule is first tried on its own; after that the unchanged word and
    every combination of two or more rules (applied in ``replace_dict`` order)
    are tried. The first candidate that is in ``bad_wordlist``, or whose
    PyMorphy2 normal form is, gets returned.

    :param word: token to correct
    :param non_found_return: fallback when no candidate matches:
        ``'initial'`` returns ``word`` unchanged, ``'levenstein'`` returns
        ``closest_word_levestein(word)``
    :return: the corrected word, or the selected fallback; ``None`` if
        ``non_found_return`` is neither of the two values above
    """
    bad_words = set(bad_wordlist)  # constant-time membership tests
    rejected = set()  # candidates already checked, so each is analysed once

    def is_bad(candidate):
        if candidate in rejected:
            return False
        if (candidate in bad_words
                or morph.parse(candidate)[0].normal_form in bad_words):
            return True
        rejected.add(candidate)
        return False

    # Step 1: one rule at a time.
    for old, new in replace_dict.items():
        if old in word:
            new_word = word.replace(old, new)
            if is_bad(new_word):
                return new_word

    # Step 2: no rule at all (this is what a combination of rules that do not
    # apply to the word boils down to).
    if is_bad(word):
        return word

    # Step 3: combinations of rules. A rule can only ever match if all of its
    # characters occur in the word or in some replacement value, because no
    # other character can show up in an intermediate string. Dropping the
    # remaining rules leaves the set of candidates untouched and keeps the
    # number of combinations manageable.
    alphabet = set(word).union(*replace_dict.values())
    usable_keys = [key for key in replace_dict if alphabet.issuperset(key)]
    for size in range(2, len(usable_keys) + 1):
        for keys in itertools.combinations(usable_keys, size):
            new_word = word
            for key in keys:
                new_word = new_word.replace(key, replace_dict[key])
            # The check belongs inside this loop: placed after it, only the
            # last combination of each size was ever tested.
            if is_bad(new_word):
                return new_word

    # Nothing matched: return the initial input or the closest word by
    # Levenshtein distance.
    if non_found_return == 'initial':
        return word
    if non_found_return == 'levenstein':
        return closest_word_levestein(word)
    return None

#### Собирание Левенштейна

In [3]:
# NOTE(review): this rebinds the module-level name `nonletter_pat` that
# contains_nonletters reads as well; from here on that function treats
# uppercase Cyrillic letters as letters too.
nonletter_pat = re.compile('[^а-яёА-ЯЁ]')

def count_nonletters(word):
    """
    Return how many characters of ``word`` are not Cyrillic letters.

    Latin letters count as non-letters here; the pipeline is expected to have
    replaced the look-alike ones before this is called.
    """
    return sum(1 for _ in nonletter_pat.finditer(word))

In [21]:
def count_distances(word):
    """
    Compute the edit distance from ``word`` to every entry of
    ``bad_wordlist``; the result list follows the order of ``bad_wordlist``.
    """
    dists = []
    for candidate in bad_wordlist:
        dists.append(distance(word, candidate))
    return dists

In [22]:
def correct_by_levenstein(word):
    """
    Guess the masked word by comparing edit distances with the number of
    masking characters.

    The dictionary word whose distance to ``word`` equals the number of
    non-letter characters is preferred; if there is none, the word whose
    distance is closest to that number is taken. On ties the entry that comes
    first in ``bad_wordlist`` wins.
    """
    masked_count = count_nonletters(word)
    dists = count_distances(word)

    # An exact hit has a difference of 0, so a single minimum search covers
    # both the "distance == count" case and the "closest distance" case.
    best_idx = min(range(len(dists)),
                   key=lambda i: abs(dists[i] - masked_count))
    # TODO: might need a threshold above which no replacement happens.
    return bad_wordlist[best_idx]

In [25]:
# Demo on a token with three masking characters.
correct_by_levenstein('еб***того')

'ебать'

# Финальная функция:

In [51]:
def preprocess_distortion(text, debug=False):
    """
    Run all checks and corrections for every token of a comment.

    Latin look-alike letters are replaced in the whole text first. Then each
    whitespace-separated token (a leading reply link is not tokenized) that is
    not an existing word, a number or pure punctuation is corrected: tokens
    made of Cyrillic letters only go through the replacement rules, all other
    tokens go through the Levenshtein lookup.

    :param text: raw comment
    :param debug: if True, print which approach was used and its result
    :return: the text with every corrected token replaced
    """
    # Done for the whole text at once because it is fast, needs no iteration
    # and cannot produce non-existent words.
    text = replace_latin(text)
    if text.startswith('['):
        tokens = remove_link(text).split()
    else:
        tokens = text.split()

    handled = set()
    for token in tokens:
        if token in handled:
            # All occurrences were already replaced the first time round.
            continue
        handled.add(token)

        token_low = token.lower()
        if (word_exists(token_low) or token_low.isdigit()
                or all(char in punctuation for char in token_low)):
            continue  # existing word, number or piece of punctuation

        # The correction is computed once and reused for the debug output.
        if not contains_nonletters(token_low):
            if debug:
                print('using replacement rules')
            corrected = correct_by_letters(token_low, 'levenstein')
        else:  # non-Cyrillic characters present, so use Levenshtein
            if debug:
                print('using Levenstein')
            corrected = correct_by_levenstein(token_low)
        if debug:
            print(corrected)

        # Replace whole whitespace-delimited tokens only. A plain
        # str.replace would also rewrite the token wherever it happens to be
        # a substring of another (possibly already corrected) word.
        whole_token = re.compile(r'(?<!\S)' + re.escape(token) + r'(?!\S)')
        text, hits = whole_token.subn(lambda _match: corrected, text)
        if not hits:
            # The token is glued to text that was dropped during
            # tokenization; fall back to the plain substring replacement.
            text = text.replace(token, corrected)

    return text

In [53]:
# Smoke test of the whole pipeline with debug output switched on.
test_string = 'Уeище тупое'
print(test_string)
preprocess_distortion(test_string, debug=True)

Уeище тупое
using replacement rules
уебище


'уебище тупое'

## Тестирование

In [55]:
# Evaluation data: inputs come from the `comments` column of the first file,
# expected outputs from the `corrected` column of the second one.
uncorrected, corrected = (
    pd.read_csv(path, sep='\t', encoding='UTF-8')
    for path in ('uncorrected_data_NEW.tsv', 'corrected_data_NEW.tsv')
)
test_x, test_y = uncorrected['comments'], corrected['corrected']

In [56]:
def evaluate_preprocessing(test_x, test_y):
    """
    Print the preprocessing results for visual assessment.

    For every pair the raw input, the preprocessed text and the expected text
    are printed, headed by a verdict saying whether the preprocessed text
    equals the expected one.

    :param test_x: iterable of raw comments
    :param test_y: iterable of expected (corrected) comments, aligned with
        ``test_x``
    """
    for x, y in zip(test_x, test_y):
        edited = preprocess_distortion(x)
        # The verdict has to judge the pipeline output; comparing the raw
        # input with the expected text says nothing about the preprocessing.
        verdict = 'сошлось' if edited == y else 'не сошлось'
        print(f'--------\n{verdict}')
        print(f'\nx: {x}')
        print(f'\nedited: {edited}')
        print(f'\ny: {y}')

In [None]:
# Visual check on the first five pairs only.
evaluate_preprocessing(test_x[:5], test_y[:5])

In [61]:
# Run the full pipeline over every comment and store the result in a new
# column; the last expression displays the first five rows.
uncorrected['preprocessed'] = uncorrected['comments'].apply(preprocess_distortion)
uncorrected.head(5)

Unnamed: 0.1,Unnamed: 0,comments,toxicity,preprocessed
0,25,"[id695904995|Ярослав], как же обо%рался ОКР ко...",1,"[иd695904995|Ярослав], как же обосранец очко к..."
1,31,"[id483002399|Максим], дол..еб, ты под каждым п...",1,"[иd483002399|Максим], долбоеб ты под каждым по..."
2,33,"[id483002399|Максим], лахтодырка-это твоя мама...",1,"[иd483002399|Максим], лахтодырка твоя мама) А ..."
3,34,"[id238092031|?лег], от школоты иного ответа, к...",1,"[иd238092031|?лег], от школота иного ответа, к..."
4,36,"[id238092031|?лег], нет, лахтодырка ??, грамот...",1,"[иd238092031|?лег], нет, лахтодырка ??, грамот..."


In [63]:
# Save the frame, including the new `preprocessed` column, as TSV.
# NOTE(review): the index is written as an extra column (no index=False); the
# "Unnamed: 0" / "Unnamed: 0.1" columns in the preview above look like the
# result of such round trips - confirm whether that is intended.
uncorrected.to_csv('preprocessed_data_NEW.tsv', sep='\t')

In [140]:
# Lift the pandas display limits on column width and on the number of rows.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)