In [1]:
import pandas as pd
from alphabet_detector import AlphabetDetector
import string
import re
import nltk
import numpy as np
import operator
from nltk.stem.snowball import SnowballStemmer
import xml.etree.ElementTree

# Data cleaning

In [171]:
path_to_data = '../data/comments1.csv'

In [172]:
df = pd.read_csv(path_to_data)
df = df.dropna()

In [173]:
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)

In [174]:
def clean(x):
    ad = AlphabetDetector()
    res = x
    for ch in string.punctuation:                                                                                                     
        res = res.replace(ch, ' ')
    res = ''.join([i for i in res if not i.isdigit()])
    res = res.lower()
    res = emoji_pattern.sub(r' ', res)
    res = res.replace('\n', ' ')
    res = res.replace('\t', ' ')
    res = res.replace('\ufeff', ' ')
    res = res.replace('\r\n', '  ')
    res = res.replace('\xa0', ' ')
    res = res.replace('«', ' ')
    res = res.replace('»', ' ')
    res = res.replace('—', ' ')
    res = res.replace('ё', 'е')
    res = re.sub(' +',' ', res)
    if  not ad.only_alphabet_chars(res, "CYRILLIC"): 
        res = ''
    return res

In [175]:
df.text = df.text.apply(clean)
df = df[df.text != '']

In [124]:
#df.to_csv('../data/comments_clean1.csv', index=False)

In [176]:
#remove comments that were manualy labeld
manual = pd.read_csv('../data/manual.csv')
df = df[~df.id.isin(manual.id)]

# Stemming bad words

In [177]:
def stem(word):
    stemmer = SnowballStemmer("russian", ignore_stopwords=True) 
    stemmed_word = stemmer.stem(word)
    if len(stemmed_word) <= 2:
        return word
    return stemmed_word

In [178]:
path_to_bad_words = '../data/bad_words.txt'

In [179]:
bad_words = open(path_to_bad_words).read().split('\n')[:-1]

In [180]:
report_dictionary = [] # array that will contain all bad words
stemmed_dictionary = [] # array that will contain actual stems used for finding bad words

In [181]:
for word in bad_words:
    report_dictionary.append(stem(word))
    stemmed_dictionary.append(stem(word))

In [182]:
stemmed_dictionary = [i for i in stemmed_dictionary if len(i) > 2] 

In [183]:
stemmed_dictionary.append('хуй') # nltk stemmer can not corectly stem word хуй
stemmed_dictionary.append('хуе')

In [184]:
stemmed_dictionary = list(set(stemmed_dictionary))

In [185]:
stemmed_dictionary

['пиздец',
 'заеба',
 'ипа',
 'хуякнут',
 'пердун',
 'припизден',
 'отъеб',
 'пи3д',
 'проеба',
 'говнян',
 'бздех',
 'проебанк',
 'е6ут',
 'хуищ',
 'залуп',
 'придурок',
 'xуе',
 'чмыр',
 'въебыва',
 'похуист',
 'распроет',
 'елдач',
 'ебаньк',
 'сран',
 'писюн',
 'съеба',
 'шлюшк',
 'мандавошк',
 'нехир',
 'долбаеб',
 'падонк',
 'архипиздр',
 'ебанут',
 'хуйл',
 'педерас',
 'секел',
 'шалав',
 'приеба',
 'нахр',
 'заебист',
 'выеба',
 'ебн',
 'припиздюлин',
 'говняк',
 'минет',
 'сцук',
 'припизднут',
 'еблищ',
 'мудаг',
 'хуесоск',
 'охует',
 'стерв',
 'злоеб',
 'нех',
 'наговня',
 'поскуд',
 'курвятник',
 'ушлепок',
 'подонок',
 'нихер',
 'ибанамат',
 'хyё',
 'сговня',
 'ебут',
 'наебыва',
 'пезд',
 'спиздет',
 'ебальник',
 'напизд',
 'бляб',
 'охуел',
 'пердунец',
 'захуяч',
 'нехр',
 'уебк',
 'хуеньк',
 'пиздищ',
 'вафел',
 'проеб',
 'ахует',
 'хуюл',
 'усра',
 'заеб',
 'хуек',
 'пиздун',
 'хуи',
 'хуевин',
 'пердильник',
 'мандищ',
 'хуяс',
 'хитрожоп',
 'отпиздяч',
 'бздюх',
 '

# label coments with bad words

In [186]:
def label(x):
    global stemmed_dictionary
    tokens = nltk.word_tokenize(x)
    for bad_word in stemmed_dictionary:
        for token in tokens:
            if bad_word in token:
                return True
    return False

In [187]:
df['label'] = df.text.apply(label)

# Iterative process of finding new bad words

## Word stemming and counting

In [166]:
all_stemmed = dict()
for sentence in df.text:
    for token in nltk.word_tokenize(sentence):
        if token not in all_stemmed:
            all_stemmed[token] = stem(token)

In [56]:
all_counts = dict()
for sentence in df.text:
    tokens = nltk.word_tokenize(sentence)
    for token in tokens:
        word = stem(token)
        if word in all_counts:
            all_counts[word] += 1
        else:
            all_counts[word] = 1

In [57]:
word_to_id = dict()
for index, row in df.iterrows():
    tokens = nltk.word_tokenize(row.text)
    for token in tokens:
        word = stem(token)
        if word in word_to_id:
            word_to_id[word].append(row.video_id)
        else:
            word_to_id[word] = [row.video_id]


In [58]:
for key in word_to_id:
    word_to_id[key] = len(list(set(word_to_id[key])))

# Likelihood of words being in comments labeld as bad or labeld as not bad

In [27]:
def likelihood(label):
    labeld_part = dict()
    part = df[df.label == label]
    for index, row in part.iterrows():
        sentence = row.text
        tokens = nltk.word_tokenize(sentence)
        for token in tokens:
            stemed = stem(token)
            if stemed != '':
                if all_counts[stemed] >= 20 and word_to_id[stemed] > 30:
                    if stemed in labeld_part:
                        labeld_part[stemed] += 1.0/len(part)
                    else:
                        labeld_part[stemed] = 1.0/len(part)
    return labeld_part

## Criteria functions for decision if word is bad or not bad

In [28]:
def relative_distance(p_good, p_bad):
    difference = dict()
    for key in p_good:
        if key in p_bad:
            difference[key] =(p_bad[key] - p_good[key])/ np.maximum( p_bad[key], p_good[key]) 
    return difference

In [29]:
def logg_odds(p_good, p_bad):
    ratio = dict()
    for key in p_good:
        if key in p_bad:
            odds_good =  p_good[key]/(1 -  p_good[key])
            odds_bad = p_bad[key]/(1- p_bad[key])
            ratio[key] = np.log(odds_bad/odds_good)
    return ratio

## Pointwise mutual information

In [30]:
def make_pairs(df_bad):
    pairs = dict()
    for sentence in df_bad.text:
        selection = []
        for token in nltk.word_tokenize(sentence):
            word = stem(token)
            if word in stemmed_dictionary and all_counts[word] >= 20:
                selection.append(word)
        for bad_word in selection:
            for token in nltk.word_tokenize(sentence):
                word = stem(token)
                if bad_word != word and word != '' and all_counts[word] >= 30:
                    key = bad_word + '-' + word
                    if key in pairs:
                        pairs[key] += 1
                    else:
                        pairs[key] = 1
    return pairs

In [31]:
def pmi():
    bad_likelihood = likelihood(True)
    pairs = make_pairs(df[df.label == True])
    result = dict()
    for key in pairs:
        words = key.split('-')
        pxy = float(pairs[key]/len(df[df.label == True]))
        px = bad_likelihood[words[0]]
        py = bad_likelihood[words[1]]
        result[key] = np.log(pxy/(px*py))
    return result
        
        
        

## Refining with corpus

In [29]:
corpus = xml.etree.ElementTree.parse('../corpus/opcorpora.xml').getroot()

KeyboardInterrupt: 

In [30]:
def in_rus_corpus(corpus, word):
    for lemma in corpus.iter('lemma'):
        found = False
        for forms in lemma.getchildren():
            if forms.attrib['t'] == word:
                return True
    return False

## Detecting new bad words

In [32]:
def unstem(stem):
    res = []
    for key in all_stemmed:
        if all_stemmed[key] == stem:
            res.append(key)
    return res    

In [33]:
def sort(x, rev = True):
    return sorted(x.items(), key=operator.itemgetter(1), reverse=rev)

In [34]:
def update_dictionary(word):
    global stemmed_dictionary
    if len(word) <= 2:
        return False
    #for key in stemmed_dictionary:
        #if word in key:
            #return False
    stemmed_dictionary.append(word)
    return True

In [192]:
def iterate(df, report_dictionary, stemmed_dictionary):
    iter = True
    while(iter):
        print('-----------------------------------')
        print('labeld as abusive: ', len(df[df.label == True]))
        #print('List of bad_words')
        #print(report_dictionary)
        p_bad = likelihood(True)
        p_good = likelihood(False)
        prob = logg_odds(p_good, p_bad)
        print(sort(prob)[:10])
        print(prob['майдаун'])
        print('New bad words')
        new_words = []
        for key in prob:
            if prob[key] > 1.8:
                if update_dictionary(key):
                    new_words += unstem(key)
                    print(key)
        report_dictionary += new_words
        if len(new_words) == 0:
            iter = False
            break
        #print('\nNew dictionary')
        #print(report_dictionary)
        df['label'] =  df.text.apply(label)
        print('relabeld as abusive: ', len(df[df.label == True]))

In [193]:
iterate(df, report_dictionary, stemmed_dictionary)

-----------------------------------
labeld as abusive:  13244


  import sys


[('пошел', 1.8002145672591727), ('член', 1.7052524290952167), ('рот', 1.6989349048094933), ('жрат', 1.6911634139418525), ('что', 1.6063504920715515), ('принцип', 1.5957057586437078), ('безмозгл', 1.544462932425848), ('гнил', 1.531003667241796), ('кита', 1.5298315805916034), ('горет', 1.5173615301984738)]
0.5310309305592511
New bad words
пошел
relabeld as abusive:  13278
-----------------------------------
labeld as abusive:  13278
[('член', 1.701576453176969), ('рот', 1.6952443248243876), ('жрат', 1.6874910001853927), ('принцип', 1.6292852399810964), ('что', 1.6033833571059022), ('безмозгл', 1.5407904119005673), ('гнил', 1.5273262350983232), ('горет', 1.5136904781436038), ('тупорыл', 1.4678495851522604), ('квартир', 1.4609205015603532)]
0.5273457831878136
New bad words


In [167]:
word_to_id['майдаун']

61

# Evaluation

In [194]:
manual['evaluation'] = manual.text.apply(label)

In [195]:
tp = 0
tn = 0
fp = 0
fn = 0
for index, row in manual.iterrows():
    if row.label == True and row.evaluation == True:
        tp += 1
    if row.label == False and row.evaluation == False:
        tn += 1
    if row.label == False and row.evaluation == True:
        fp += 1
    if row.label == True and row.evaluation == False:
        #print(row.text)
        fn += 1
accuracy = (tp + fp)/(tp + fp + fn + fp)
precision = tp/(tp + fp)
recall = tp/(tp + fn)
f1  = 2 * (precision * recall)/(precision + recall)
print('accuracy: ', accuracy)
print('precision: ', precision)
print('recall: ', recall)
print('f1: ', f1)

accuracy:  0.5963488843813387
precision:  0.717687074829932
recall:  0.6452599388379205
f1:  0.679549114331723
