In [1]:
import pandas as pd
from alphabet_detector import AlphabetDetector
import string
import re
import nltk
import numpy as np
import operator
from nltk.stem.snowball import SnowballStemmer
import xml.etree.ElementTree

# Data cleaning

In [2]:
path_to_data = '../data/comments1.csv'

In [3]:
df = pd.read_csv(path_to_data)
df = df.dropna()

In [4]:
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)

In [43]:
def clean(x):
    ad = AlphabetDetector()
    res = x
    for ch in string.punctuation:                                                                                                     
        res = res.replace(ch, ' ')
    res = ''.join([i for i in res if not i.isdigit()])
    res = res.lower()
    res = emoji_pattern.sub(r' ', res)
    res = res.replace('\n', ' ')
    res = res.replace('\t', ' ')
    res = res.replace('\ufeff', ' ')
    res = res.replace('\r\n', '  ')
    res = res.replace('\xa0', ' ')
    res = res.replace('«', ' ')
    res = res.replace('»', ' ')
    res = res.replace('—', ' ')
    res = res.replace('“', ' ')
    res = res.replace('ё', 'е')
    res = re.sub(' +',' ', res)
    if  not ad.only_alphabet_chars(res, "CYRILLIC"): 
        res = ''
    return res

In [6]:
df.text = df.text.apply(clean)
df = df[df.text != '']

In [7]:
#df.to_csv('../data/comments_clean1.csv', index=False)

In [8]:
#remove comments that were manualy labeld
manual = pd.read_csv('../data/manual.csv')
df = df[~df.id.isin(manual.id)]

In [9]:
def stem(word):
    stemmer = SnowballStemmer("russian", ignore_stopwords=True) 
    stemmed_word = stemmer.stem(word)
    if len(stemmed_word) <= 2:
        return word
    return stemmed_word

In [10]:
word_treshold = [10,15,20,25,30]
treshold = 0.95



In [11]:
def get_bad_words():
    path_to_bad_words = '../data/bad_words.txt'
    bad_words = open(path_to_bad_words).read().split('\n')[:-1]
    stemmed_dictionary = []
    for word in bad_words:
        stemmed_dictionary.append(stem(word))
    stemmed_dictionary = [i for i in stemmed_dictionary if len(i) > 2] 
    stemmed_dictionary.append('хуй') # nltk stemmer can not corectly stem word хуй
    stemmed_dictionary.append('хуе')
    stemmed_dictionary = list(set(stemmed_dictionary))
    return stemmed_dictionary

In [12]:
def label(x):
    global stemmed_dictionary
    tokens = nltk.word_tokenize(x)
    for bad_word in stemmed_dictionary:
        for token in tokens:
            if bad_word in token:
                return True
    return False

In [13]:
stemmed_dictionary = get_bad_words()

In [16]:
all_stemmed = dict()
for sentence in df.text:
    for token in nltk.word_tokenize(sentence):
        if token not in all_stemmed:
            all_stemmed[token] = stem(token)

In [17]:
word_to_id = dict()
all_counts = dict()
for index, row in df.iterrows():
    tokens = nltk.word_tokenize(row.text)
    for token in tokens:
        word = stem(token)
        if word in word_to_id:
            word_to_id[word].append(row.video_id)
        else:
            word_to_id[word] = [row.video_id]
        if word in all_counts:
            all_counts[word] += 1
        else:
            all_counts[word] = 1
for key in word_to_id:
    word_to_id[key] = len(list(set(word_to_id[key])))

In [18]:
def likelihood(label, treshold):
    labeld_part = dict()
    part = df[df.label == label]
    for index, row in part.iterrows():
        sentence = row.text
        tokens = nltk.word_tokenize(sentence)
        for token in tokens:
            stemed = stem(token)
            if stemed != '':
                if word_to_id[stemed] > treshold:
                    if stemed in labeld_part:
                        labeld_part[stemed] += 1.0/len(part)
                    else:
                        labeld_part[stemed] = 1.0/len(part)
    return labeld_part

In [19]:
def relative_distance(p_good, p_bad):
    difference = dict()
    for key in p_good:
        if key in p_bad:
            difference[key] =(p_bad[key] - p_good[key])/ np.maximum( p_bad[key], p_good[key]) 
    return difference

In [52]:
def log_odds(p_good, p_bad):
    ratio = dict()
    for key in p_good:
        if key in p_bad:
            odds_good =  p_good[key]/(1 -  p_good[key])
            odds_bad = p_bad[key]/(1- p_bad[key])
            ratio[key] = np.log(odds_bad/odds_good)
    log_max = max(ratio.items(), key=operator.itemgetter(1))[1]
    #for key in ratio:
       # ratio[key] = np.true_divide(ratio[key],log_max)
    return ratio

In [50]:
prob = log_odds(p_good, p_bad)

  import sys


In [51]:
sort(prob)

[('аэропорт', 3.5977743840432588),
 ('ремонт', 2.9045952663356176),
 ('звон', 2.722987618645223),
 ('вставля', 2.681218497000372),
 ('отбит', 2.681218497000372),
 ('менеджер', 2.681218497000372),
 ('хохлятск', 2.624416871916947),
 ('наркоман', 2.613374266382351),
 ('драл', 2.4994550984825135),
 ('тип', 2.487619461885677),
 ('чувствова', 2.4480840284775796),
 ('западэнц', 2.4300276031888846),
 ('москальск', 2.355841891695131),
 ('лишен', 2.336702860277377),
 ('алкогол', 2.3174586291009818),
 ('“', 2.2339214144664488),
 ('людишк', 2.1886323464797663),
 ('спят', 2.1886323464797663),
 ('засун', 2.113527368755234),
 ('нын', 2.113527368755234),
 ('пастух', 2.093926108938273),
 ('быстреньк', 2.093244445461397),
 ('мифическ', 2.093244445461397),
 ('рівен', 2.068127880055561),
 ('запуга', 2.0528249894643227),
 ('десятилет', 2.0528249894643227),
 ('медлен', 2.0352092360511804),
 ('куск', 2.025459830445752),
 ('погон', 2.022083572253978),
 ('истреб', 1.9882087168980298),
 ('земн', 1.9880074390849

In [21]:
def sort(x, rev = True):
    return sorted(x.items(), key=operator.itemgetter(1), reverse=rev)

In [22]:
def update_dictionary(word):
    global stemmed_dictionary
    if len(word) <= 2:
        return False
    #for key in stemmed_dictionary:
        #if word in key:
            #return False
    stemmed_dictionary.append(word)
    return True

In [23]:
def evaluate():
    manual['evaluation'] = manual.text.apply(label)
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    for index, row in manual.iterrows():
        if row.label == True and row.evaluation == True:
            tp += 1
        if row.label == False and row.evaluation == False:
            tn += 1
        if row.label == False and row.evaluation == True:
            fp += 1
        if row.label == True and row.evaluation == False:
            #print(row.text)
            fn += 1
    accuracy = (tp + fp)/(tp + fp + fn + fp)
    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    f1  = 2 * (precision * recall)/(precision + recall)
    print('accuracy: ', accuracy)
    print('precision: ', precision)
    print('recall: ', recall)
    print('f1: ', f1)
    return f1
    

In [27]:
p_bad = likelihood(True, amount_treshold)
p_good = likelihood(False, amount_treshold)


NameError: name 'log_odds' is not defined

  import sys


TypeError: ufunc 'true_divide' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [55]:
amount_tresholds = [30]
result = []
for amount_treshold in amount_tresholds:
    stemmed_dictionary = get_bad_words()
    df['label'] = df.text.apply(label)
    score0 = evaluate()
    treshold = 2
    while(True):
        print('----------------------------------')
        old_len = len(stemmed_dictionary)
        print('treshold', treshold)
        print('length of dict before ', old_len)
        p_bad = likelihood(True, amount_treshold)
        p_good = likelihood(False, amount_treshold)
        prob = log_odds(p_good, p_bad)
        sort(prob)[:15]
        new_words = []
        for key in prob:
            if prob[key] > treshold:
                if update_dictionary(key):
                    new_words.append(key)

        print('length of dict after ',len(stemmed_dictionary))
        print('new words ', new_words)
        score1 = evaluate()
        if score1 < score0:
            print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
            result.append([score0, treshold, amount_treshold])
            break
        else:
            score0 = score1

        if len(stemmed_dictionary) == old_len:
            treshold -= 0.1
        df.label = df.text.apply(label)
    
    
    
    
    


accuracy:  0.5885416666666666
precision:  0.7292035398230089
recall:  0.6299694189602446
f1:  0.6759639048400328
----------------------------------
treshold 2
length of dict before  476


  import sys


length of dict after  479
new words  ['тип', 'западенск', 'треб']
accuracy:  0.5936863543788188
precision:  0.7186963979416809
recall:  0.6406727828746177
f1:  0.677445432497979
----------------------------------
treshold 2
length of dict before  479
length of dict after  479
new words  []
accuracy:  0.5936863543788188
precision:  0.7186963979416809
recall:  0.6406727828746177
f1:  0.677445432497979
----------------------------------
treshold 1.9
length of dict before  479
length of dict after  479
new words  []
accuracy:  0.5936863543788188
precision:  0.7186963979416809
recall:  0.6406727828746177
f1:  0.677445432497979
----------------------------------
treshold 1.7999999999999998
length of dict before  479
length of dict after  481
new words  ['вонюч', 'пошел']
accuracy:  0.5945121951219512
precision:  0.717948717948718
recall:  0.6422018348623854
f1:  0.6779661016949153
----------------------------------
treshold 1.7999999999999998
length of dict before  481
length of dict after  

In [43]:
evaluate()

accuracy:  0.5943983402489627
precision:  0.7294938917975567
recall:  0.6391437308868502
f1:  0.6813365933170334


0.6813365933170334