In [3]:
import pandas as pd
from alphabet_detector import AlphabetDetector
import string
import re
import nltk
import numpy as np
import operator
from nltk.stem.snowball import SnowballStemmer
import xml.etree.ElementTree

In [4]:
path_to_data = '../data/comments1.csv'

In [5]:
df = pd.read_csv(path_to_data)
df = df.dropna()

In [6]:
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)

In [7]:
def clean(x):
    ad = AlphabetDetector()
    res = x
    for ch in string.punctuation:                                                                                                     
        res = res.replace(ch, ' ')
    res = ''.join([i for i in res if not i.isdigit()])
    res = res.lower()
    res = emoji_pattern.sub(r' ', res)
    res = res.replace('\n', ' ')
    res = res.replace('\t', ' ')
    res = res.replace('\ufeff', ' ')
    res = res.replace('\r\n', '  ')
    res = res.replace('\xa0', ' ')
    res = res.replace('«', ' ')
    res = res.replace('»', ' ')
    res = res.replace('—', ' ')
    res = res.replace('ё', 'е')
    res = re.sub(' +',' ', res)
    if  not ad.only_alphabet_chars(res, "CYRILLIC"): 
        res = ''
    return res

In [8]:
df.text = df.text.apply(clean)
df = df[df.text != '']

In [9]:
manual = pd.read_csv('../data/manual.csv')
df = df[~df.id.isin(manual.id)]

In [13]:
def stem(word):
    stemmer = SnowballStemmer("russian", ignore_stopwords=True) 
    stemmed_word = stemmer.stem(word)
    if len(stemmed_word) <= 2:
        return word
    return stemmed_word

In [28]:
def get_bad_words(path):
    path_to_bad_words = path 
    bad_words = open(path_to_bad_words).read().split('\n')[:-1]
    stemmed_dictionary = []
    for word in bad_words:
        stemmed_dictionary.append(stem(word))
    stemmed_dictionary = [i for i in stemmed_dictionary if len(i) > 2] 
    stemmed_dictionary.append('хуй') # nltk stemmer can not corectly stem word хуй
    stemmed_dictionary.append('хуе')
    stemmed_dictionary = list(set(stemmed_dictionary))
    return stemmed_dictionary

In [58]:
def get_unstem_bad_words(path):
    path_to_bad_words = path 
    bad_words = open(path_to_bad_words).read().split('\n')[:-1]
    stemmed_dictionary = []
    for word in bad_words:
        stemmed_dictionary.append(word)
    stemmed_dictionary.append('хуй')
    stemmed_dictionary = list(set(stemmed_dictionary))
    return stemmed_dictionary

In [15]:
def label(x):
    global stemmed_dictionary
    tokens = nltk.word_tokenize(x)
    for bad_word in stemmed_dictionary:
        for token in tokens:
            if bad_word in token:
                return True
    return False

In [39]:
def unstem_label(x):
    global stemmed_dictionary
    tokens = nltk.word_tokenize(x)
    for bad_word in stemmed_dictionary:
        for token in tokens:
            if bad_word == token:
                return True
    return False

In [40]:
all_words = dict()
for sentence in df.text:
    for token in nltk.word_tokenize(sentence):
        if token not in all_stemmed:
            all_words[token] = token

In [16]:
all_stemmed = dict()
for sentence in df.text:
    for token in nltk.word_tokenize(sentence):
        if token not in all_stemmed:
            all_stemmed[token] = stem(token)

In [17]:
word_to_id = dict()
all_counts = dict()
for index, row in df.iterrows():
    tokens = nltk.word_tokenize(row.text)
    for token in tokens:
        word = stem(token)
        if word in word_to_id:
            word_to_id[word].append(row.video_id)
        else:
            word_to_id[word] = [row.video_id]
        if word in all_counts:
            all_counts[word] += 1
        else:
            all_counts[word] = 1
for key in word_to_id:
    word_to_id[key] = len(list(set(word_to_id[key])))

In [43]:
unstem_word_to_id = dict()
unstem_all_counts = dict()
for index, row in df.iterrows():
    tokens = nltk.word_tokenize(row.text)
    for token in tokens:
        word = token
        if word in unstem_word_to_id:
            unstem_word_to_id[word].append(row.video_id)
        else:
            unstem_word_to_id[word] = [row.video_id]
        if word in unstem_all_counts:
            unstem_all_counts[word] += 1
        else:
            unstem_all_counts[word] = 1
for key in unstem_word_to_id:
    unstem_word_to_id[key] = len(list(set(unstem_word_to_id[key])))

In [18]:
def likelihood(label, treshold):
    labeld_part = dict()
    part = df[df.label == label]
    for index, row in part.iterrows():
        sentence = row.text
        tokens = nltk.word_tokenize(sentence)
        for token in tokens:
            stemed = stem(token)
            if stemed != '':
                if word_to_id[stemed] > treshold:
                    if stemed in labeld_part:
                        labeld_part[stemed] += 1.0/len(part)
                    else:
                        labeld_part[stemed] = 1.0/len(part)
    return labeld_part

In [51]:
def unstem_likelihood(label, treshold):
    labeld_part = dict()
    part = df[df.label == label]
    for index, row in part.iterrows():
        sentence = row.text
        tokens = nltk.word_tokenize(sentence)
        for token in tokens:
            stemed = token
            if stemed != '':
                if unstem_word_to_id[stemed] > treshold:
                    if stemed  in labeld_part:
                        labeld_part[stemed] += 1.0/len(part)
                    else:
                        labeld_part[stemed] = 1.0/len(part)
    return labeld_part

In [19]:
def relative_distance(p_good, p_bad):
    difference = dict()
    for key in p_good:
        if key in p_bad:
            difference[key] =(p_bad[key] - p_good[key])/ np.maximum( p_bad[key], p_good[key]) 
    return difference

In [33]:
def log_odds(p_good, p_bad):
    ratio = dict()
    for key in p_good:
        if key in p_bad:
            odds_good =  p_good[key]/(1 -  p_good[key])
            odds_bad = p_bad[key]/(1- p_bad[key])
            ratio[key] = np.log(odds_bad/odds_good)
    return ratio

In [21]:
def sort(x, rev = True):
    return sorted(x.items(), key=operator.itemgetter(1), reverse=rev)

In [22]:
def update_dictionary(word):
    global stemmed_dictionary
    if len(word) <= 2:
        return False
    #for key in stemmed_dictionary:
        #if word in key:
            #return False
    stemmed_dictionary.append(word)
    return True

In [23]:
def evaluate():
    manual['evaluation'] = manual.text.apply(label)
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    for index, row in manual.iterrows():
        if row.label == True and row.evaluation == True:
            tp += 1
        if row.label == False and row.evaluation == False:
            tn += 1
        if row.label == False and row.evaluation == True:
            fp += 1
        if row.label == True and row.evaluation == False:
            #print(row.text)
            fn += 1
    accuracy = (tp + fp)/(tp + fp + fn + fp)
    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    f1  = 2 * (precision * recall)/(precision + recall)
    print('accuracy: ', accuracy)
    print('precision: ', precision)
    print('recall: ', recall)
    print('f1: ', f1)
    return f1

# Resultd for relative distance with unstemed words

In [54]:
stemmed_dictionary = get_unstem_bad_words('../data/bad_words.txt')
df['label'] = df.text.apply(unstem_label)
score0 = evaluate()
treshold = 0.83
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = unstem_likelihood(True, amount_treshold)
    p_good = unstem_likelihood(False, amount_treshold)
    prob = relative_distance(p_good, p_bad)
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            stemmed_dictionary.append(key)
            new_words.append(key)

    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate()
    if score1 < score0:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        #result.append([score0, treshold, amount_treshold])
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.01
    df.label = df.text.apply(unstem_label)

accuracy:  0.5117493472584856
precision:  0.8571428571428571
recall:  0.5137614678899083
f1:  0.6424474187380497
----------------------------------
treshold 0.83
length of dict before  622
length of dict after  630
new words  ['очко', 'тупое', 'иди', 'пошел', 'сосать', 'хули', 'рот', 'ебаные']
accuracy:  0.5594989561586639
precision:  0.7164179104477612
recall:  0.5871559633027523
f1:  0.6453781512605042
----------------------------------
treshold 0.83
length of dict before  630
length of dict after  630
new words  []
accuracy:  0.5594989561586639
precision:  0.7164179104477612
recall:  0.5871559633027523
f1:  0.6453781512605042
----------------------------------
treshold 0.82
length of dict before  630
length of dict after  633
new words  ['твое', 'долбоебы', 'вали']
accuracy:  0.5662878787878788
precision:  0.6638795986622074
recall:  0.6070336391437309
f1:  0.6341853035143771
f1  0.6453781512605042 treshold  0.82 amount treshold  30


# Results for relative distance unstem small dictionary

In [59]:
stemmed_dictionary = get_unstem_bad_words('../data/bad_words_seed.txt')
df['label'] = df.text.apply(unstem_label)
score0 = evaluate()
treshold = 0.83
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = unstem_likelihood(True, amount_treshold)
    p_good = unstem_likelihood(False, amount_treshold)
    prob = relative_distance(p_good, p_bad)
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            stemmed_dictionary.append(key)
            new_words.append(key)

    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate()
    if score1 < score0:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        #result.append([score0, treshold, amount_treshold])
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.01
    df.label = df.text.apply(unstem_label)

accuracy:  0.16715116279069767
precision:  0.8521739130434782
recall:  0.14984709480122324
f1:  0.25487646293888166
----------------------------------
treshold 0.83
length of dict before  5
length of dict after  29
new words  ['соси', 'гореть', 'бараны', 'пиздец', 'хуя', 'тупая', 'ебаные', 'пидор', 'долбоебы', 'нихуя', 'хули', 'пизды', 'долбоеб', 'тварь', 'пидарасы', 'бандеры', 'блядь', 'рот', 'похуй', 'иди', 'блять', 'хуйня', 'нахуй', 'сосать']
accuracy:  0.3780487804878049
precision:  0.6363636363636364
recall:  0.3318042813455658
f1:  0.43618090452261304
----------------------------------
treshold 0.83
length of dict before  29
length of dict after  31
new words  ['аду', 'пошел']
accuracy:  0.39552238805970147
precision:  0.6172506738544474
recall:  0.35015290519877673
f1:  0.4468292682926829
----------------------------------
treshold 0.83
length of dict before  31
length of dict after  31
new words  []
accuracy:  0.39552238805970147
precision:  0.6172506738544474
recall:  0.350152

# Results for log odds unstem 

In [56]:
stemmed_dictionary = get_unstem_bad_words('../data/bad_words.txt')
df['label'] = df.text.apply(unstem_label)
score0 = evaluate()
treshold = 2
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = unstem_likelihood(True, amount_treshold)
    p_good = unstem_likelihood(False, amount_treshold)
    prob = relative_distance(p_good, p_bad)
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            stemmed_dictionary.append(key)
            new_words.append(key)

    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate()
    if score1 < score0:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        #result.append([score0, treshold, amount_treshold])
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.1
    df.label = df.text.apply(unstem_label)

accuracy:  0.5117493472584856
precision:  0.8571428571428571
recall:  0.5137614678899083
f1:  0.6424474187380497
----------------------------------
treshold 2
length of dict before  622
length of dict after  622
new words  []
accuracy:  0.5117493472584856
precision:  0.8571428571428571
recall:  0.5137614678899083
f1:  0.6424474187380497
----------------------------------
treshold 1.9
length of dict before  622
length of dict after  622
new words  []
accuracy:  0.5117493472584856
precision:  0.8571428571428571
recall:  0.5137614678899083
f1:  0.6424474187380497
----------------------------------
treshold 1.7999999999999998
length of dict before  622
length of dict after  622
new words  []
accuracy:  0.5117493472584856
precision:  0.8571428571428571
recall:  0.5137614678899083
f1:  0.6424474187380497
----------------------------------
treshold 1.6999999999999997
length of dict before  622
length of dict after  622
new words  []
accuracy:  0.5117493472584856
precision:  0.8571428571428571

# Results for log odds unstem small dictionary

In [60]:
stemmed_dictionary = get_unstem_bad_words('../data/bad_words_seed.txt')
df['label'] = df.text.apply(unstem_label)
score0 = evaluate()
treshold = 2
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = unstem_likelihood(True, amount_treshold)
    p_good = unstem_likelihood(False, amount_treshold)
    prob = relative_distance(p_good, p_bad)
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            stemmed_dictionary.append(key)
            new_words.append(key)

    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate()
    if score1 < score0:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        #result.append([score0, treshold, amount_treshold])
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.1
    df.label = df.text.apply(unstem_label)

accuracy:  0.16715116279069767
precision:  0.8521739130434782
recall:  0.14984709480122324
f1:  0.25487646293888166
----------------------------------
treshold 2
length of dict before  5
length of dict after  5
new words  []
accuracy:  0.16715116279069767
precision:  0.8521739130434782
recall:  0.14984709480122324
f1:  0.25487646293888166
----------------------------------
treshold 1.9
length of dict before  5
length of dict after  5
new words  []
accuracy:  0.16715116279069767
precision:  0.8521739130434782
recall:  0.14984709480122324
f1:  0.25487646293888166
----------------------------------
treshold 1.7999999999999998
length of dict before  5
length of dict after  5
new words  []
accuracy:  0.16715116279069767
precision:  0.8521739130434782
recall:  0.14984709480122324
f1:  0.25487646293888166
----------------------------------
treshold 1.6999999999999997
length of dict before  5
length of dict after  5
new words  []
accuracy:  0.16715116279069767
precision:  0.8521739130434782
re

accuracy:  0.5986682808716707
precision:  0.3301314459049545
recall:  0.9984709480122325
f1:  0.49620060790273557
f1  0.5345794392523364 treshold  0.6999999999999993 amount treshold  30


# Results for relative distance

In [27]:
stemmed_dictionary = get_bad_words('../data/bad_words.txt')
df['label'] = df.text.apply(label)
score0 = evaluate()
treshold = 0.83
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = likelihood(True, amount_treshold)
    p_good = likelihood(False, amount_treshold)
    prob = relative_distance(p_good, p_bad)
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            if update_dictionary(key):
                new_words.append(key)

    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate()
    if score1 < score0:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        #result.append([score0, treshold, amount_treshold])
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.01
    df.label = df.text.apply(label)

accuracy:  0.5885416666666666
precision:  0.7292035398230089
recall:  0.6299694189602446
f1:  0.6759639048400328
----------------------------------
treshold 0.83
length of dict before  476
length of dict after  482
new words  ['кацапск', 'западенск', 'бендеровск', 'треб', 'тип', 'вонюч']
accuracy:  0.5965447154471545
precision:  0.7189097103918228
recall:  0.6452599388379205
f1:  0.6800966962127317
----------------------------------
treshold 0.83
length of dict before  482
length of dict after  483
new words  ['пошел']
accuracy:  0.5963488843813387
precision:  0.717687074829932
recall:  0.6452599388379205
f1:  0.679549114331723
f1  0.6800966962127317 treshold  0.83 amount treshold  30


# Results for relative distance with small seed

In [30]:
stemmed_dictionary = get_bad_words('../data/bad_words_seed.txt')
df['label'] = df.text.apply(label)
score0 = evaluate()
treshold = 0.90
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = likelihood(True, amount_treshold)
    p_good = likelihood(False, amount_treshold)
    prob = relative_distance(p_good, p_bad)
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            if update_dictionary(key):
                new_words.append(key)

    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate()
    if score1 < score0:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        #result.append([score0, treshold, amount_treshold])
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.01
    df.label = df.text.apply(label)

accuracy:  0.2741477272727273
precision:  0.8704663212435233
recall:  0.25688073394495414
f1:  0.39669421487603307
----------------------------------
treshold 0.9
length of dict before  6
length of dict after  12
new words  ['ниху', 'сос', 'нах', 'треб', 'ебуч', 'пох']
accuracy:  0.3737980769230769
precision:  0.7138263665594855
recall:  0.3394495412844037
f1:  0.4601036269430052
----------------------------------
treshold 0.9
length of dict before  12
length of dict after  12
new words  []
accuracy:  0.3737980769230769
precision:  0.7138263665594855
recall:  0.3394495412844037
f1:  0.4601036269430052
----------------------------------
treshold 0.89
length of dict before  12
length of dict after  13
new words  ['хуя']
accuracy:  0.3786057692307692
precision:  0.7174603174603175
recall:  0.345565749235474
f1:  0.46646026831785353
----------------------------------
treshold 0.89
length of dict before  13
length of dict after  13
new words  []
accuracy:  0.3786057692307692
precision:  0.7

# Results for log odds

In [34]:
stemmed_dictionary = get_bad_words('../data/bad_words.txt')
df['label'] = df.text.apply(label)
score0 = evaluate()
treshold = 2
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = likelihood(True, amount_treshold)
    p_good = likelihood(False, amount_treshold)
    prob = log_odds(p_good, p_bad)
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            if update_dictionary(key):
                new_words.append(key)

    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate()
    if score1 < score0:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.1
    df.label = df.text.apply(label)
    

accuracy:  0.5885416666666666
precision:  0.7292035398230089
recall:  0.6299694189602446
f1:  0.6759639048400328
----------------------------------
treshold 2
length of dict before  476


  import sys


length of dict after  479
new words  ['западенск', 'треб', 'тип']
accuracy:  0.5936863543788188
precision:  0.7186963979416809
recall:  0.6406727828746177
f1:  0.677445432497979
----------------------------------
treshold 2
length of dict before  479
length of dict after  479
new words  []
accuracy:  0.5936863543788188
precision:  0.7186963979416809
recall:  0.6406727828746177
f1:  0.677445432497979
----------------------------------
treshold 1.9
length of dict before  479
length of dict after  479
new words  []
accuracy:  0.5936863543788188
precision:  0.7186963979416809
recall:  0.6406727828746177
f1:  0.677445432497979
----------------------------------
treshold 1.7999999999999998
length of dict before  479
length of dict after  481
new words  ['пошел', 'вонюч']
accuracy:  0.5945121951219512
precision:  0.717948717948718
recall:  0.6422018348623854
f1:  0.6779661016949153
----------------------------------
treshold 1.7999999999999998
length of dict before  481
length of dict after  

NameError: name 'result' is not defined

# Resulst for log odds small dicitonary

In [37]:
stemmed_dictionary = get_bad_words('../data/bad_words_seed.txt')
df['label'] = df.text.apply(label)
score0 = evaluate()
treshold = 2.3
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = likelihood(True, amount_treshold)
    p_good = likelihood(False, amount_treshold)
    prob = log_odds(p_good, p_bad)
    print(sort(prob)[:15])
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            if update_dictionary(key):
                new_words.append(key)

    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate()
    if score1 < score0:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.1
    df.label = df.text.apply(label)

accuracy:  0.2741477272727273
precision:  0.8704663212435233
recall:  0.25688073394495414
f1:  0.39669421487603307
----------------------------------
treshold 2.3
length of dict before  6


  import sys


[('не', 4.106855420729722), ('нах', 3.70784457636469), ('треб', 3.130972245837058), ('пох', 2.9925091867538334), ('ебуч', 2.5867022325109823), ('сос', 2.4087789763755096), ('ниху', 2.391941342456374), ('хуя', 2.2910793289705316), ('уебк', 2.204398317969881), ('хул', 2.159901165863682), ('пидорас', 2.153054683788713), ('тим', 2.0576438167942306), ('в', 1.9640348340170493), ('пидарас', 1.9164883759960185), ('хлеб', 1.8709229875901296)]
length of dict after  12
new words  ['ниху', 'сос', 'нах', 'треб', 'ебуч', 'пох']
accuracy:  0.3737980769230769
precision:  0.7138263665594855
recall:  0.3394495412844037
f1:  0.4601036269430052
----------------------------------
treshold 2.3
length of dict before  12
[('хуя', 2.2191863212399485), ('на', 2.054430564474757), ('хлеб', 1.9964683011553512), ('промышлен', 1.9390555920404462), ('уебк', 1.8737686232393402), ('что', 1.8618225508866049), ('хул', 1.8528522566965775), ('срок', 1.7144092241872495), ('млн', 1.698130165528625), ('пидарас', 1.67121757329

----------------------------------
treshold 1.7999999999999994
length of dict before  151
[('не', 5.51299824669465), ('поэт', 4.257229802911231), ('активн', 4.0229942915180334), ('минимум', 4.0229942915180334), ('будто', 4.003121454294141), ('покупа', 3.8974099629202312), ('хаос', 3.87486692670461), ('в', 3.859921281232788), ('результат', 3.782022251679524), ('относ', 3.7757174614607534), ('попытк', 3.7538881109584255), ('итог', 3.7036386653964564), ('беспорядк', 3.6453939525429853), ('пострада', 3.6163362845244627), ('отсутств', 3.5711106617471833)]
length of dict after  742
new words  ['нескольк', 'ник', 'солдат', 'видел', 'хочет', 'гряз', 'дан', 'братск', 'нужн', 'ваш', 'евромайдан', 'выход', 'начнет', 'было', 'дешев', 'аргумент', 'итог', 'налог', 'зарплат', 'ибо', 'представител', 'сто', 'однак', 'друз', 'недел', 'хрен', 'таможен', 'когда', 'забира', 'определен', 'бывш', 'окол', 'умеют', 'снача', 'бесплатн', 'сраз', 'держа', 'войн', 'нельзя', 'если', 'друг', 'ихн', 'отсутств', 'дейс

# Result for mix of relative distance and log odds

In [63]:
stemmed_dictionary = get_bad_words('../data/bad_words.txt')
df['label'] = df.text.apply(label)
score0 = evaluate()
treshold = 0.9
log_treshold = 2
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = likelihood(True, amount_treshold)
    p_good = likelihood(False, amount_treshold)
    prob = relative_distance(p_good, p_bad)
    log_prob =log_odds(p_good, p_bad)
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            if update_dictionary(key):
                new_words.append(key)
    for key in log_prob:
        if log_prob[key] > log_treshold:
            if update_dictionary(key):
                new_words.append(key)


    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate()
    if score1 < score0:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.01
        log_treshold -= 0.1
    df.label = df.text.apply(label)

accuracy:  0.5885416666666666
precision:  0.7292035398230089
recall:  0.6299694189602446
f1:  0.6759639048400328
----------------------------------
treshold 0.9
length of dict before  476


  import sys


length of dict after  480
new words  ['тип', 'западенск', 'треб', 'тип']
accuracy:  0.5936863543788188
precision:  0.7186963979416809
recall:  0.6406727828746177
f1:  0.677445432497979
----------------------------------
treshold 0.9
length of dict before  480
length of dict after  480
new words  []
accuracy:  0.5936863543788188
precision:  0.7186963979416809
recall:  0.6406727828746177
f1:  0.677445432497979
----------------------------------
treshold 0.89
length of dict before  480
length of dict after  480
new words  []
accuracy:  0.5936863543788188
precision:  0.7186963979416809
recall:  0.6406727828746177
f1:  0.677445432497979
----------------------------------
treshold 0.88
length of dict before  480
length of dict after  482
new words  ['пошел', 'вонюч']
accuracy:  0.5945121951219512
precision:  0.717948717948718
recall:  0.6422018348623854
f1:  0.6779661016949153
----------------------------------
treshold 0.88
length of dict before  482
length of dict after  482
new words  []
