In [61]:
import pandas as pd
from alphabet_detector import AlphabetDetector
import string
import re
import nltk
import numpy as np
import operator
from nltk.stem.snowball import SnowballStemmer
import xml.etree.ElementTree

In [62]:
path_to_data = '../data/comments1.csv'

In [63]:
df = pd.read_csv(path_to_data)
df = df.dropna()

In [64]:
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)

In [65]:
def clean(x):
    ad = AlphabetDetector()
    res = x
    for ch in string.punctuation:                                                                                                     
        res = res.replace(ch, ' ')
    res = ''.join([i for i in res if not i.isdigit()])
    res = res.lower()
    res = emoji_pattern.sub(r' ', res)
    res = res.replace('\n', ' ')
    res = res.replace('\t', ' ')
    res = res.replace('\ufeff', ' ')
    res = res.replace('\r\n', '  ')
    res = res.replace('\xa0', ' ')
    res = res.replace('«', ' ')
    res = res.replace('»', ' ')
    res = res.replace('—', ' ')
    res = res.replace('ё', 'е')
    res = re.sub(' +',' ', res)
    if  not ad.only_alphabet_chars(res, "CYRILLIC"): 
        res = ''
    return res

In [66]:
df.text = df.text.apply(clean)
df = df[df.text != '']

In [67]:
manual_total = pd.read_csv('../data/manual.csv')

# Validation set
manual = manual_total.sample(1500)
testing_set = manual_total[~manual_total.id.isin(manual.id)]

df = df[~df.id.isin(manual_total.id)]

In [68]:
def stem(word):
    stemmer = SnowballStemmer("russian", ignore_stopwords=True) 
    stemmed_word = stemmer.stem(word)
    if len(stemmed_word) <= 2:
        return word
    return stemmed_word

In [69]:
def get_bad_words(path):
    path_to_bad_words = path 
    bad_words = open(path_to_bad_words).read().split('\n')[:-1]
    stemmed_dictionary = []
    for word in bad_words:
        stemmed_dictionary.append(stem(word))
    stemmed_dictionary = [i for i in stemmed_dictionary if len(i) > 2] 
    stemmed_dictionary.append('хуй') # nltk stemmer can not corectly stem word хуй
    stemmed_dictionary.append('хуе')
    stemmed_dictionary = list(set(stemmed_dictionary))
    return stemmed_dictionary

In [70]:
def get_unstem_bad_words(path):
    path_to_bad_words = path 
    bad_words = open(path_to_bad_words).read().split('\n')[:-1]
    stemmed_dictionary = []
    for word in bad_words:
        stemmed_dictionary.append(word)
    stemmed_dictionary.append('хуй')
    stemmed_dictionary = list(set(stemmed_dictionary))
    return stemmed_dictionary

In [71]:
def label(x):
    global stemmed_dictionary
    tokens = nltk.word_tokenize(x)
    for bad_word in stemmed_dictionary:
        for token in tokens:
            if bad_word in token:
                return True
    return False

In [72]:
def unstem_label(x):
    global stemmed_dictionary
    tokens = nltk.word_tokenize(x)
    for bad_word in stemmed_dictionary:
        for token in tokens:
            if bad_word == token:
                return True
    return False

In [73]:
all_words = dict()
for sentence in df.text:
    for token in nltk.word_tokenize(sentence):
        if token not in all_words:
            all_words[token] = token

In [74]:
all_stemmed = dict()
for sentence in df.text:
    for token in nltk.word_tokenize(sentence):
        if token not in all_stemmed:
            all_stemmed[token] = stem(token)

In [75]:
word_to_id = dict()
all_counts = dict()
for index, row in df.iterrows():
    tokens = nltk.word_tokenize(row.text)
    for token in tokens:
        word = stem(token)
        if word in word_to_id:
            word_to_id[word].append(row.video_id)
        else:
            word_to_id[word] = [row.video_id]
        if word in all_counts:
            all_counts[word] += 1
        else:
            all_counts[word] = 1
for key in word_to_id:
    word_to_id[key] = len(list(set(word_to_id[key])))

In [76]:
unstem_word_to_id = dict()
unstem_all_counts = dict()
for index, row in df.iterrows():
    tokens = nltk.word_tokenize(row.text)
    for token in tokens:
        word = token
        if word in unstem_word_to_id:
            unstem_word_to_id[word].append(row.video_id)
        else:
            unstem_word_to_id[word] = [row.video_id]
        if word in unstem_all_counts:
            unstem_all_counts[word] += 1
        else:
            unstem_all_counts[word] = 1
for key in unstem_word_to_id:
    unstem_word_to_id[key] = len(list(set(unstem_word_to_id[key])))

In [77]:
def likelihood(label, treshold):
    labeld_part = dict()
    part = df[df.label == label]
    for index, row in part.iterrows():
        sentence = row.text
        tokens = nltk.word_tokenize(sentence)
        for token in tokens:
            stemed = stem(token)
            if stemed != '':
                if word_to_id[stemed] > treshold:
                    if stemed in labeld_part:
                        labeld_part[stemed] += 1.0/len(part)
                    else:
                        labeld_part[stemed] = 1.0/len(part)
    return labeld_part

In [78]:
def unstem_likelihood(label, treshold):
    labeld_part = dict()
    part = df[df.label == label]
    for index, row in part.iterrows():
        sentence = row.text
        tokens = nltk.word_tokenize(sentence)
        for token in tokens:
            stemed = token
            if stemed != '':
                if unstem_word_to_id[stemed] > treshold:
                    if stemed  in labeld_part:
                        labeld_part[stemed] += 1.0/len(part)
                    else:
                        labeld_part[stemed] = 1.0/len(part)
    return labeld_part

In [79]:
def relative_distance(p_good, p_bad):
    difference = dict()
    for key in p_good:
        if key in p_bad:
            difference[key] =(p_bad[key] - p_good[key])/ np.maximum( p_bad[key], p_good[key]) 
    return difference

In [80]:
def log_odds(p_good, p_bad):
    ratio = dict()
    for key in p_good:
        if key in p_bad:
            odds_good =  p_good[key]/(1 -  p_good[key])
            odds_bad = p_bad[key]/(1- p_bad[key])
            ratio[key] = np.log(odds_bad/odds_good)
    return ratio

In [81]:
def sort(x, rev = True):
    return sorted(x.items(), key=operator.itemgetter(1), reverse=rev)

In [82]:
def update_dictionary(word):
    global stemmed_dictionary
    if len(word) <= 2:
        return False
    #for key in stemmed_dictionary:
        #if word in key:
            #return False
    stemmed_dictionary.append(word)
    return True

In [83]:
def evaluate(manual):
    manual['evaluation'] = manual.text.apply(label)
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    for index, row in manual.iterrows():
        if row.label == True and row.evaluation == True:
            tp += 1
        if row.label == False and row.evaluation == False:
            tn += 1
        if row.label == False and row.evaluation == True:
            fp += 1
        if row.label == True and row.evaluation == False:
            #print(row.text)
            fn += 1
    accuracy = (tp + fp)/(tp + fp + fn + fp)
    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    f1  = 2 * (precision * recall)/(precision + recall)
    print('accuracy: ', accuracy)
    print('precision: ', precision)
    print('recall: ', recall)
    print('f1: ', f1)
    return f1

In [84]:
eps = 0.001

# Resultd for relative distance with unstemed words

In [85]:
stemmed_dictionary = get_unstem_bad_words('../data/bad_words.txt')
df['label'] = df.text.apply(unstem_label)
print("-------------- TESTING SET RESULT 0 -----------")
evaluate(testing_set)
print('------------------------------------------------')
score0 = evaluate(manual)
treshold = 0.83
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_stemmed_dictionary = stemmed_dictionary
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = unstem_likelihood(True, amount_treshold)
    p_good = unstem_likelihood(False, amount_treshold)
    prob = relative_distance(p_good, p_bad)
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            stemmed_dictionary.append(key)
            new_words.append(key)

    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate(manual)
    if score0 - score1 > eps:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        stemmed_dictionary = old_stemmed_dictionary
        #result.append([score0, treshold, amount_treshold])
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.01
    df.label = df.text.apply(unstem_label)
print("-------------- TESTING SET RESULTS -----------")
evaluate(testing_set)

-------------- TESTING SET RESULT 0 -----------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


accuracy:  0.5086705202312138
precision:  0.875
recall:  0.5099337748344371
f1:  0.6443514644351465
------------------------------------------------
accuracy:  0.5126475548060708
precision:  0.8519736842105263
recall:  0.5149105367793241
f1:  0.6418835192069393
----------------------------------
treshold 0.83
length of dict before  622
length of dict after  630
new words  ['очко', 'рот', 'тупое', 'пошел', 'хули', 'сосать', 'ебаные', 'иди']
accuracy:  0.550744248985115
precision:  0.7100737100737101
recall:  0.5745526838966203
f1:  0.6351648351648352
f1  0.6418835192069393 treshold  0.83 amount treshold  30
-------------- TESTING SET RESULTS -----------
accuracy:  0.589041095890411
precision:  0.7364341085271318
recall:  0.6291390728476821
f1:  0.6785714285714286


0.6785714285714286

# Results for relative distance unstem small dictionary

In [86]:
stemmed_dictionary = get_unstem_bad_words('../data/bad_words_seed.txt')
df['label'] = df.text.apply(unstem_label)
print("-------------- TESTING SET RESULT 0 -----------")
evaluate(testing_set)
print('------------------------------------------------')
score0 = evaluate(manual)
treshold = 0.83
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = unstem_likelihood(True, amount_treshold)
    p_good = unstem_likelihood(False, amount_treshold)
    prob = relative_distance(p_good, p_bad)
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            stemmed_dictionary.append(key)
            new_words.append(key)

    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate(manual)
    if score0 - score1 > eps:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        #result.append([score0, treshold, amount_treshold])
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.01
    df.label = df.text.apply(unstem_label)
print("-------------- TESTING SET RESULTS -----------")
evaluate(testing_set)

-------------- TESTING SET RESULT 0 -----------
accuracy:  0.1761006289308176
precision:  0.8571428571428571
recall:  0.15894039735099338
f1:  0.2681564245810056
------------------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


accuracy:  0.16446124763705103
precision:  0.8505747126436781
recall:  0.147117296222664
f1:  0.25084745762711863
----------------------------------
treshold 0.83
length of dict before  5
length of dict after  29
new words  ['тупая', 'похуй', 'пидарасы', 'гореть', 'хуйня', 'соси', 'тварь', 'рот', 'нахуй', 'бараны', 'долбоебы', 'пизды', 'пиздец', 'пидор', 'хуя', 'ебаные', 'иди', 'бандеры', 'хули', 'блять', 'блядь', 'сосать', 'долбоеб', 'нихуя']
accuracy:  0.3708513708513709
precision:  0.6303501945525292
recall:  0.3220675944333996
f1:  0.4263157894736842
----------------------------------
treshold 0.83
length of dict before  29
length of dict after  31
new words  ['пошел', 'аду']
accuracy:  0.38589211618257263
precision:  0.6057347670250897
recall:  0.3359840954274354
f1:  0.432225063938619
----------------------------------
treshold 0.83
length of dict before  31
length of dict after  31
new words  []
accuracy:  0.38589211618257263
precision:  0.6057347670250897
recall:  0.33598409542

0.5185185185185185

# Results for log odds unstem 

In [87]:
stemmed_dictionary = get_unstem_bad_words('../data/bad_words.txt')
df['label'] = df.text.apply(unstem_label)
print("-------------- TESTING SET RESULT 0 -----------")
evaluate(testing_set)
print('------------------------------------------------')
score0 = evaluate(manual)
treshold = 2
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_stemmed_dictionary = stemmed_dictionary
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = unstem_likelihood(True, amount_treshold)
    p_good = unstem_likelihood(False, amount_treshold)
    prob = relative_distance(p_good, p_bad)
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            stemmed_dictionary.append(key)
            new_words.append(key)

    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate(manual)
    if score0 - score1 > eps:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        stemmed_dictionary = old_stemmed_dictionary
        #result.append([score0, treshold, amount_treshold])
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.1
    df.label = df.text.apply(unstem_label)
print("-------------- TESTING SET RESULTS -----------")
evaluate(testing_set)

-------------- TESTING SET RESULT 0 -----------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


accuracy:  0.5086705202312138
precision:  0.875
recall:  0.5099337748344371
f1:  0.6443514644351465
------------------------------------------------
accuracy:  0.5126475548060708
precision:  0.8519736842105263
recall:  0.5149105367793241
f1:  0.6418835192069393
----------------------------------
treshold 2
length of dict before  622
length of dict after  622
new words  []
accuracy:  0.5126475548060708
precision:  0.8519736842105263
recall:  0.5149105367793241
f1:  0.6418835192069393
----------------------------------
treshold 1.9
length of dict before  622
length of dict after  622
new words  []
accuracy:  0.5126475548060708
precision:  0.8519736842105263
recall:  0.5149105367793241
f1:  0.6418835192069393
----------------------------------
treshold 1.7999999999999998
length of dict before  622
length of dict after  622
new words  []
accuracy:  0.5126475548060708
precision:  0.8519736842105263
recall:  0.5149105367793241
f1:  0.6418835192069393
----------------------------------
tresho

0.6529209621993127

# Results for log odds unstem small dictionary

In [88]:
stemmed_dictionary = get_unstem_bad_words('../data/bad_words_seed.txt')
df['label'] = df.text.apply(unstem_label)
print("-------------- TESTING SET RESULT 0 -----------")
evaluate(testing_set)
print('------------------------------------------------')
score0 = evaluate(manual)
treshold = 2
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_stemmed_dictionary = stemmed_dictionary
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = unstem_likelihood(True, amount_treshold)
    p_good = unstem_likelihood(False, amount_treshold)
    prob = relative_distance(p_good, p_bad)
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            stemmed_dictionary.append(key)
            new_words.append(key)

    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate(manual)
    if score0 - score1 > eps:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        stemmed_dictionary = old_stemmed_dictionary
        #result.append([score0, treshold, amount_treshold])
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.1
    df.label = df.text.apply(unstem_label)
print("-------------- TESTING SET RESULTS -----------")
evaluate(testing_set)

-------------- TESTING SET RESULT 0 -----------
accuracy:  0.1761006289308176
precision:  0.8571428571428571
recall:  0.15894039735099338
f1:  0.2681564245810056


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


------------------------------------------------
accuracy:  0.16446124763705103
precision:  0.8505747126436781
recall:  0.147117296222664
f1:  0.25084745762711863
----------------------------------
treshold 2
length of dict before  5
length of dict after  5
new words  []
accuracy:  0.16446124763705103
precision:  0.8505747126436781
recall:  0.147117296222664
f1:  0.25084745762711863
----------------------------------
treshold 1.9
length of dict before  5
length of dict after  5
new words  []
accuracy:  0.16446124763705103
precision:  0.8505747126436781
recall:  0.147117296222664
f1:  0.25084745762711863
----------------------------------
treshold 1.7999999999999998
length of dict before  5
length of dict after  5
new words  []
accuracy:  0.16446124763705103
precision:  0.8505747126436781
recall:  0.147117296222664
f1:  0.25084745762711863
----------------------------------
treshold 1.6999999999999997
length of dict before  5
length of dict after  5
new words  []
accuracy:  0.1644612476

accuracy:  0.601870679137861
precision:  0.33918918918918917
recall:  0.9980119284294234
f1:  0.5063035804336864
f1  0.5385878489326765 treshold  0.6999999999999993 amount treshold  30
-------------- TESTING SET RESULTS -----------
accuracy:  0.5893491124260355
precision:  0.3032128514056225
recall:  1.0
f1:  0.46533127889060094


0.46533127889060094

# Results for relative distance

In [89]:
stemmed_dictionary = get_bad_words('../data/bad_words.txt')
df['label'] = df.text.apply(label)
print("-------------- TESTING SET RESULT 0 -----------")
evaluate(testing_set)
print('------------------------------------------------')
score0 = evaluate(manual)
treshold = 0.83
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_stemmed_dictionary = stemmed_dictionary
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = likelihood(True, amount_treshold)
    p_good = likelihood(False, amount_treshold)
    prob = relative_distance(p_good, p_bad)
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            if update_dictionary(key):
                new_words.append(key)

    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate(manual)
    if score0 - score1 > eps:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        stemmed_dictionary = old_stemmed_dictionary
        #result.append([score0, treshold, amount_treshold])
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.01
    df.label = df.text.apply(label)
print("-------------- TESTING SET RESULTS -----------")
evaluate(testing_set)

-------------- TESTING SET RESULT 0 -----------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


accuracy:  0.5898617511520737
precision:  0.7421875
recall:  0.6291390728476821
f1:  0.6810035842293908
------------------------------------------------
accuracy:  0.5881561238223418
precision:  0.7254004576659039
recall:  0.6302186878727635
f1:  0.6744680851063831
----------------------------------
treshold 0.83
length of dict before  476
length of dict after  482
new words  ['западенск', 'треб', 'кацапск', 'вонюч', 'бендеровск', 'тип']
accuracy:  0.5939553219448095
precision:  0.7146017699115044
recall:  0.6421471172962226
f1:  0.6764397905759162
----------------------------------
treshold 0.83
length of dict before  482
length of dict after  483
new words  ['пошел']
accuracy:  0.5937090432503277
precision:  0.7130242825607064
recall:  0.6421471172962226
f1:  0.6757322175732218
----------------------------------
treshold 0.83
length of dict before  483
length of dict after  483
new words  []
accuracy:  0.5937090432503277
precision:  0.7130242825607064
recall:  0.6421471172962226
f1: 

0.6710526315789473

# Results for relative distance with small seed

In [90]:
stemmed_dictionary = get_bad_words('../data/bad_words_seed.txt')
df['label'] = df.text.apply(label)
print("-------------- TESTING SET RESULT 0 -----------")
evaluate(testing_set)
print('------------------------------------------------')
score0 = evaluate(manual)
treshold = 0.90
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_stemmed_dictionary = stemmed_dictionary
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = likelihood(True, amount_treshold)
    p_good = likelihood(False, amount_treshold)
    prob = relative_distance(p_good, p_bad)
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            if update_dictionary(key):
                new_words.append(key)

    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate(manual)
    if score0 - score1 > eps:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        stemmed_dictionary = old_stemmed_dictionary
        #result.append([score0, treshold, amount_treshold])
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.01
    df.label = df.text.apply(label)
print("-------------- TESTING SET RESULTS -----------")
evaluate(testing_set)

-------------- TESTING SET RESULT 0 -----------
accuracy:  0.24528301886792453
precision:  0.8974358974358975
recall:  0.23178807947019867
f1:  0.368421052631579
------------------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


accuracy:  0.28256880733944956
precision:  0.8636363636363636
recall:  0.2644135188866799
f1:  0.4048706240487062
----------------------------------
treshold 0.9
length of dict before  6
length of dict after  12
new words  ['нах', 'пох', 'ебуч', 'треб', 'сос', 'ниху']
accuracy:  0.37676609105180536
precision:  0.7208333333333333
recall:  0.34393638170974156
f1:  0.4656796769851952
----------------------------------
treshold 0.9
length of dict before  12
length of dict after  12
new words  []
accuracy:  0.37676609105180536
precision:  0.7208333333333333
recall:  0.34393638170974156
f1:  0.4656796769851952
----------------------------------
treshold 0.89
length of dict before  12
length of dict after  13
new words  ['хуя']
accuracy:  0.37990580847723704
precision:  0.7231404958677686
recall:  0.34791252485089463
f1:  0.46979865771812085
----------------------------------
treshold 0.89
length of dict before  13
length of dict after  13
new words  []
accuracy:  0.37990580847723704
precisio

0.4581497797356829

# Results for log odds

In [91]:
stemmed_dictionary = get_bad_words('../data/bad_words.txt')
df['label'] = df.text.apply(label)
print("-------------- TESTING SET RESULT 0 -----------")
evaluate(testing_set)
print('------------------------------------------------')
score0 = evaluate(manual)
treshold = 2
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_stemmed_dictionary = stemmed_dictionary
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = likelihood(True, amount_treshold)
    p_good = likelihood(False, amount_treshold)
    prob = log_odds(p_good, p_bad)
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            if update_dictionary(key):
                new_words.append(key)

    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate(manual)
    if score0 - score1 > eps:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        stemmed_dictionary = old_stemmed_dictionary
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.1
    df.label = df.text.apply(label)
print("-------------- TESTING SET RESULTS -----------")
evaluate(testing_set)

-------------- TESTING SET RESULT 0 -----------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


accuracy:  0.5898617511520737
precision:  0.7421875
recall:  0.6291390728476821
f1:  0.6810035842293908
------------------------------------------------
accuracy:  0.5881561238223418
precision:  0.7254004576659039
recall:  0.6302186878727635
f1:  0.6744680851063831
----------------------------------
treshold 2
length of dict before  476


  import sys


length of dict after  479
new words  ['западенск', 'треб', 'тип']
accuracy:  0.5913272010512484
precision:  0.7133333333333334
recall:  0.6381709741550696
f1:  0.6736621196222456
----------------------------------
treshold 2
length of dict before  479
length of dict after  479
new words  []
accuracy:  0.5913272010512484
precision:  0.7133333333333334
recall:  0.6381709741550696
f1:  0.6736621196222456
----------------------------------
treshold 1.9
length of dict before  479
length of dict after  479
new words  []
accuracy:  0.5913272010512484
precision:  0.7133333333333334
recall:  0.6381709741550696
f1:  0.6736621196222456
----------------------------------
treshold 1.7999999999999998
length of dict before  479
length of dict after  481
new words  ['пошел', 'вонюч']
accuracy:  0.5923984272608126
precision:  0.7123893805309734
recall:  0.6401590457256461
f1:  0.6743455497382199
----------------------------------
treshold 1.7999999999999998
length of dict before  481
length of dict aft

0.5788113695090439

# Resulst for log odds small dicitonary

In [92]:
stemmed_dictionary = get_bad_words('../data/bad_words_seed.txt')
df['label'] = df.text.apply(label)
print("-------------- TESTING SET RESULT 0 -----------")
evaluate(testing_set)
print('------------------------------------------------')
score0 = evaluate(manual)
treshold = 2.3
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_stemmed_dictionary = stemmed_dictionary
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = likelihood(True, amount_treshold)
    p_good = likelihood(False, amount_treshold)
    prob = log_odds(p_good, p_bad)
    print(sort(prob)[:15])
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            if update_dictionary(key):
                new_words.append(key)

    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate(manual)
    if score0 - score1 > eps:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        stemmed_dictionary = old_stemmed_dictionary
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.1
    df.label = df.text.apply(label)
print("-------------- TESTING SET RESULTS -----------")
evaluate(testing_set)

-------------- TESTING SET RESULT 0 -----------
accuracy:  0.24528301886792453
precision:  0.8974358974358975
recall:  0.23178807947019867
f1:  0.368421052631579
------------------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


accuracy:  0.28256880733944956
precision:  0.8636363636363636
recall:  0.2644135188866799
f1:  0.4048706240487062
----------------------------------
treshold 2.3
length of dict before  6


  import sys


[('не', 4.106855420729722), ('нах', 3.70784457636469), ('треб', 3.130972245837058), ('пох', 2.9925091867538334), ('ебуч', 2.5867022325109823), ('сос', 2.4087789763755096), ('ниху', 2.391941342456374), ('хуя', 2.2910793289705316), ('уебк', 2.204398317969881), ('хул', 2.159901165863682), ('пидорас', 2.153054683788713), ('тим', 2.0576438167942306), ('в', 1.9640348340170493), ('пидарас', 1.9164883759960185), ('хлеб', 1.8709229875901296)]
length of dict after  12
new words  ['нах', 'пох', 'ебуч', 'треб', 'сос', 'ниху']
accuracy:  0.37676609105180536
precision:  0.7208333333333333
recall:  0.34393638170974156
f1:  0.4656796769851952
----------------------------------
treshold 2.3
length of dict before  12
[('хуя', 2.2191863212399485), ('на', 2.054430564474757), ('хлеб', 1.9964683011553512), ('промышлен', 1.9390555920404462), ('уебк', 1.8737686232393402), ('что', 1.8618225508866049), ('хул', 1.8528522566965775), ('пидорас', 1.8181452789851265), ('срок', 1.7144092241872495), ('млн', 1.69813016

----------------------------------
treshold 1.7999999999999994
length of dict before  151
[('не', 5.51299824669465), ('поэт', 4.257229802911231), ('активн', 4.0229942915180334), ('минимум', 4.0229942915180334), ('будто', 4.003121454294141), ('хаос', 3.87486692670461), ('в', 3.859921281232788), ('результат', 3.782022251679524), ('относ', 3.7757174614607534), ('попытк', 3.7538881109584255), ('итог', 3.7036386653964564), ('беспорядк', 3.6453939525429853), ('пострада', 3.6163362845244627), ('отсутств', 3.5711106617471833), ('причин', 3.5493991919631727)]
length of dict after  742
new words  ['счет', 'видет', 'пришел', 'сми', 'план', 'братск', 'получ', 'момент', 'сегодн', 'событ', 'держа', 'след', 'зна', 'якоб', 'денег', 'почти', 'хер', 'бабушк', 'налог', 'быть', 'пол', 'пойм', 'остальн', 'долг', 'газ', 'куда', 'виж', 'улиц', 'абсолютн', 'поступа', 'групп', 'жизн', 'цел', 'главн', 'его', 'услыша', 'борьб', 'ихн', 'она', 'постро', 'появ', 'дадут', 'почита', 'остав', 'маск', 'особен', 'меша',

0.46256239600665555

# Result for mix of relative distance and log odds

In [93]:
stemmed_dictionary = get_bad_words('../data/bad_words.txt')
df['label'] = df.text.apply(label)
print("-------------- TESTING SET RESULT 0 -----------")
evaluate(testing_set)
print('------------------------------------------------')
score0 = evaluate(manual)
treshold = 0.9
log_treshold = 2
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_stemmed_dictionary = stemmed_dictionary 
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = likelihood(True, amount_treshold)
    p_good = likelihood(False, amount_treshold)
    prob = relative_distance(p_good, p_bad)
    log_prob =log_odds(p_good, p_bad)
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            if update_dictionary(key):
                new_words.append(key)
    for key in log_prob:
        if log_prob[key] > log_treshold:
            if update_dictionary(key):
                new_words.append(key)


    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate(manual)
    if score0 - score1 > eps:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        stemmed_dictionary = old_stemmed_dictionary
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.01
        log_treshold -= 0.1
    df.label = df.text.apply(label)
print("-------------- TESTING SET RESULTS -----------")
evaluate(testing_set)

-------------- TESTING SET RESULT 0 -----------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


accuracy:  0.5898617511520737
precision:  0.7421875
recall:  0.6291390728476821
f1:  0.6810035842293908
------------------------------------------------
accuracy:  0.5881561238223418
precision:  0.7254004576659039
recall:  0.6302186878727635
f1:  0.6744680851063831
----------------------------------
treshold 0.9
length of dict before  476


  import sys


length of dict after  480
new words  ['тип', 'западенск', 'треб', 'тип']
accuracy:  0.5913272010512484
precision:  0.7133333333333334
recall:  0.6381709741550696
f1:  0.6736621196222456
----------------------------------
treshold 0.9
length of dict before  480
length of dict after  480
new words  []
accuracy:  0.5913272010512484
precision:  0.7133333333333334
recall:  0.6381709741550696
f1:  0.6736621196222456
----------------------------------
treshold 0.89
length of dict before  480
length of dict after  480
new words  []
accuracy:  0.5913272010512484
precision:  0.7133333333333334
recall:  0.6381709741550696
f1:  0.6736621196222456
----------------------------------
treshold 0.88
length of dict before  480
length of dict after  482
new words  ['пошел', 'вонюч']
accuracy:  0.5923984272608126
precision:  0.7123893805309734
recall:  0.6401590457256461
f1:  0.6743455497382199
----------------------------------
treshold 0.88
length of dict before  482
length of dict after  482
new words 

0.5788113695090439