In [17]:
import pandas as pd
from alphabet_detector import AlphabetDetector
import string
import re
import nltk
import numpy as np
import operator
from nltk.stem.snowball import SnowballStemmer
import xml.etree.ElementTree

In [18]:
path_to_data = '../data/comments1.csv'

In [19]:
df = pd.read_csv(path_to_data)
df = df.dropna()

In [20]:
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)

In [21]:
def clean(x):
    ad = AlphabetDetector()
    res = x
    for ch in string.punctuation:                                                                                                     
        res = res.replace(ch, ' ')
    res = ''.join([i for i in res if not i.isdigit()])
    res = res.lower()
    res = emoji_pattern.sub(r' ', res)
    res = res.replace('\n', ' ')
    res = res.replace('\t', ' ')
    res = res.replace('\ufeff', ' ')
    res = res.replace('\r\n', '  ')
    res = res.replace('\xa0', ' ')
    res = res.replace('«', ' ')
    res = res.replace('»', ' ')
    res = res.replace('—', ' ')
    res = res.replace('ё', 'е')
    res = re.sub(' +',' ', res)
    if  not ad.only_alphabet_chars(res, "CYRILLIC"): 
        res = ''
    return res

In [22]:
df.text = df.text.apply(clean)
df = df[df.text != '']

In [23]:
manual_total = pd.read_csv('../data/manual.csv')

# Validation set
manual = manual_total.sample(1500)
testing_set = manual_total[~manual_total.id.isin(manual.id)]

df = df[~df.id.isin(manual_total.id)]

In [24]:
def stem(word):
    stemmer = SnowballStemmer("russian", ignore_stopwords=True) 
    stemmed_word = stemmer.stem(word)
    if len(stemmed_word) <= 2:
        return word
    return stemmed_word

In [25]:
def get_bad_words(path):
    path_to_bad_words = path 
    bad_words = open(path_to_bad_words).read().split('\n')[:-1]
    stemmed_dictionary = []
    for word in bad_words:
        stemmed_dictionary.append(stem(word))
    stemmed_dictionary = [i for i in stemmed_dictionary if len(i) > 2] 
    stemmed_dictionary.append('хуй') # nltk stemmer can not corectly stem word хуй
    stemmed_dictionary.append('хуе')
    stemmed_dictionary = list(set(stemmed_dictionary))
    return stemmed_dictionary

In [26]:
def get_unstem_bad_words(path):
    path_to_bad_words = path 
    bad_words = open(path_to_bad_words).read().split('\n')[:-1]
    stemmed_dictionary = []
    for word in bad_words:
        stemmed_dictionary.append(word)
    stemmed_dictionary.append('хуй')
    stemmed_dictionary = list(set(stemmed_dictionary))
    return stemmed_dictionary

In [27]:
def label(x):
    global stemmed_dictionary
    tokens = nltk.word_tokenize(x)
    for bad_word in stemmed_dictionary:
        for token in tokens:
            if bad_word in token:
                return True
    return False

In [30]:
def unstem_label(x):
    global stemmed_dictionary
    tokens = nltk.word_tokenize(x)
    for bad_word in stemmed_dictionary:
        for token in tokens:
            if bad_word == token:
                return True
    return False

In [33]:
all_words = dict()
for sentence in df.text:
    for token in nltk.word_tokenize(sentence):
        if token not in all_words:
            all_words[token] = token

In [34]:
all_stemmed = dict()
for sentence in df.text:
    for token in nltk.word_tokenize(sentence):
        if token not in all_stemmed:
            all_stemmed[token] = stem(token)

In [35]:
word_to_id = dict()
all_counts = dict()
for index, row in df.iterrows():
    tokens = nltk.word_tokenize(row.text)
    for token in tokens:
        word = stem(token)
        if word in word_to_id:
            word_to_id[word].append(row.video_id)
        else:
            word_to_id[word] = [row.video_id]
        if word in all_counts:
            all_counts[word] += 1
        else:
            all_counts[word] = 1
for key in word_to_id:
    word_to_id[key] = len(list(set(word_to_id[key])))

In [36]:
unstem_word_to_id = dict()
unstem_all_counts = dict()
for index, row in df.iterrows():
    tokens = nltk.word_tokenize(row.text)
    for token in tokens:
        word = token
        if word in unstem_word_to_id:
            unstem_word_to_id[word].append(row.video_id)
        else:
            unstem_word_to_id[word] = [row.video_id]
        if word in unstem_all_counts:
            unstem_all_counts[word] += 1
        else:
            unstem_all_counts[word] = 1
for key in unstem_word_to_id:
    unstem_word_to_id[key] = len(list(set(unstem_word_to_id[key])))

In [37]:
def likelihood(label, treshold):
    labeld_part = dict()
    part = df[df.label == label]
    for index, row in part.iterrows():
        sentence = row.text
        tokens = nltk.word_tokenize(sentence)
        for token in tokens:
            stemed = stem(token)
            if stemed != '':
                if word_to_id[stemed] > treshold:
                    if stemed in labeld_part:
                        labeld_part[stemed] += 1.0/len(part)
                    else:
                        labeld_part[stemed] = 1.0/len(part)
    return labeld_part

In [38]:
def unstem_likelihood(label, treshold):
    labeld_part = dict()
    part = df[df.label == label]
    for index, row in part.iterrows():
        sentence = row.text
        tokens = nltk.word_tokenize(sentence)
        for token in tokens:
            stemed = token
            if stemed != '':
                if unstem_word_to_id[stemed] > treshold:
                    if stemed  in labeld_part:
                        labeld_part[stemed] += 1.0/len(part)
                    else:
                        labeld_part[stemed] = 1.0/len(part)
    return labeld_part

In [39]:
def relative_distance(p_good, p_bad):
    difference = dict()
    for key in p_good:
        if key in p_bad:
            difference[key] =(p_bad[key] - p_good[key])/ np.maximum( p_bad[key], p_good[key]) 
    return difference

In [40]:
def log_odds(p_good, p_bad):
    ratio = dict()
    for key in p_good:
        if key in p_bad:
            odds_good =  p_good[key]/(1 -  p_good[key])
            odds_bad = p_bad[key]/(1- p_bad[key])
            ratio[key] = np.log(odds_bad/odds_good)
    return ratio

In [41]:
def sort(x, rev = True):
    return sorted(x.items(), key=operator.itemgetter(1), reverse=rev)

In [42]:
def update_dictionary(word):
    global stemmed_dictionary
    if len(word) <= 2:
        return False
    #for key in stemmed_dictionary:
        #if word in key:
            #return False
    stemmed_dictionary.append(word)
    return True

In [43]:
def evaluate(manual):
    manual['evaluation'] = manual.text.apply(label)
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    for index, row in manual.iterrows():
        if row.label == True and row.evaluation == True:
            tp += 1
        if row.label == False and row.evaluation == False:
            tn += 1
        if row.label == False and row.evaluation == True:
            fp += 1
        if row.label == True and row.evaluation == False:
            #print(row.text)
            fn += 1
    accuracy = (tp + fp)/(tp + fp + fn + fp)
    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    f1  = 2 * (precision * recall)/(precision + recall)
    print('accuracy: ', accuracy)
    print('precision: ', precision)
    print('recall: ', recall)
    print('f1: ', f1)
    return f1

# Resultd for relative distance with unstemed words

In [46]:
stemmed_dictionary = get_unstem_bad_words('../data/bad_words.txt')
df['label'] = df.text.apply(unstem_label)
score0 = evaluate(manual)
treshold = 0.83
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = unstem_likelihood(True, amount_treshold)
    p_good = unstem_likelihood(False, amount_treshold)
    prob = relative_distance(p_good, p_bad)
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            stemmed_dictionary.append(key)
            new_words.append(key)

    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate(manual)
    if score1 < score0:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        #result.append([score0, treshold, amount_treshold])
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.01
    df.label = df.text.apply(unstem_label)
print("-------------- TESTING SET RESULTS -----------")
evaluate(testing_set)

accuracy:  0.5121527777777778
precision:  0.8508474576271187
recall:  0.514344262295082
f1:  0.6411238825031927
----------------------------------
treshold 0.83
length of dict before  622
length of dict after  630
new words  ['сосать', 'очко', 'рот', 'ебаные', 'пошел', 'тупое', 'хули', 'иди']
accuracy:  0.555858310626703
precision:  0.6985294117647058
recall:  0.5840163934426229
f1:  0.6361607142857143
f1  0.6411238825031927 treshold  0.83 amount treshold  30
-------------- TESTING SET RESULTS -----------
accuracy:  0.5714285714285714
precision:  0.7734375
recall:  0.5963855421686747
f1:  0.6734693877551021


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0.6734693877551021

# Results for relative distance unstem small dictionary

In [47]:
stemmed_dictionary = get_unstem_bad_words('../data/bad_words_seed.txt')
df['label'] = df.text.apply(unstem_label)
score0 = evaluate(manual)
treshold = 0.83
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = unstem_likelihood(True, amount_treshold)
    p_good = unstem_likelihood(False, amount_treshold)
    prob = relative_distance(p_good, p_bad)
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            stemmed_dictionary.append(key)
            new_words.append(key)

    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate(manual)
    if score1 < score0:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        #result.append([score0, treshold, amount_treshold])
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.01
    df.label = df.text.apply(unstem_label)
print("-------------- TESTING SET RESULTS -----------")
evaluate(testing_set)

accuracy:  0.16085271317829458
precision:  0.8313253012048193
recall:  0.1413934426229508
f1:  0.24168126094570927
----------------------------------
treshold 0.83
length of dict before  5
length of dict after  29
new words  ['гореть', 'сосать', 'рот', 'тупая', 'бараны', 'похуй', 'нихуя', 'блядь', 'долбоеб', 'иди', 'пиздец', 'пидарасы', 'ебаные', 'блять', 'долбоебы', 'хуйня', 'бандеры', 'тварь', 'хули', 'соси', 'пизды', 'пидор', 'хуя', 'нахуй']
accuracy:  0.3815789473684211
precision:  0.6245210727969349
recall:  0.33401639344262296
f1:  0.4352469959946596
----------------------------------
treshold 0.83
length of dict before  29
length of dict after  31
new words  ['аду', 'пошел']
accuracy:  0.4005602240896359
precision:  0.6048951048951049
recall:  0.35450819672131145
f1:  0.4470284237726098
----------------------------------
treshold 0.83
length of dict before  31
length of dict after  31
new words  []
accuracy:  0.4005602240896359
precision:  0.6048951048951049
recall:  0.354508196

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0.45238095238095244

# Results for log odds unstem 

In [48]:
stemmed_dictionary = get_unstem_bad_words('../data/bad_words.txt')
df['label'] = df.text.apply(unstem_label)
score0 = evaluate(manual)
treshold = 2
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = unstem_likelihood(True, amount_treshold)
    p_good = unstem_likelihood(False, amount_treshold)
    prob = relative_distance(p_good, p_bad)
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            stemmed_dictionary.append(key)
            new_words.append(key)

    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate(manual)
    if score1 < score0:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        #result.append([score0, treshold, amount_treshold])
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.1
    df.label = df.text.apply(unstem_label)
print("-------------- TESTING SET RESULTS -----------")
evaluate(testing_set)

accuracy:  0.5121527777777778
precision:  0.8508474576271187
recall:  0.514344262295082
f1:  0.6411238825031927
----------------------------------
treshold 2
length of dict before  622
length of dict after  622
new words  []
accuracy:  0.5121527777777778
precision:  0.8508474576271187
recall:  0.514344262295082
f1:  0.6411238825031927
----------------------------------
treshold 1.9
length of dict before  622
length of dict after  622
new words  []
accuracy:  0.5121527777777778
precision:  0.8508474576271187
recall:  0.514344262295082
f1:  0.6411238825031927
----------------------------------
treshold 1.7999999999999998
length of dict before  622
length of dict after  622
new words  []
accuracy:  0.5121527777777778
precision:  0.8508474576271187
recall:  0.514344262295082
f1:  0.6411238825031927
----------------------------------
treshold 1.6999999999999997
length of dict before  622
length of dict after  622
new words  []
accuracy:  0.5121527777777778
precision:  0.8508474576271187
rec

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0.6463878326996197

# Results for log odds unstem small dictionary

In [49]:
stemmed_dictionary = get_unstem_bad_words('../data/bad_words_seed.txt')
df['label'] = df.text.apply(unstem_label)
score0 = evaluate(manual)
treshold = 2
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = unstem_likelihood(True, amount_treshold)
    p_good = unstem_likelihood(False, amount_treshold)
    prob = relative_distance(p_good, p_bad)
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            stemmed_dictionary.append(key)
            new_words.append(key)

    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate(manual)
    if score1 < score0:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        #result.append([score0, treshold, amount_treshold])
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.1
    df.label = df.text.apply(unstem_label)
print("-------------- TESTING SET RESULTS -----------")
evaluate(testing_set)

accuracy:  0.16085271317829458
precision:  0.8313253012048193
recall:  0.1413934426229508
f1:  0.24168126094570927
----------------------------------
treshold 2
length of dict before  5
length of dict after  5
new words  []
accuracy:  0.16085271317829458
precision:  0.8313253012048193
recall:  0.1413934426229508
f1:  0.24168126094570927
----------------------------------
treshold 1.9
length of dict before  5
length of dict after  5
new words  []
accuracy:  0.16085271317829458
precision:  0.8313253012048193
recall:  0.1413934426229508
f1:  0.24168126094570927
----------------------------------
treshold 1.7999999999999998
length of dict before  5
length of dict after  5
new words  []
accuracy:  0.16085271317829458
precision:  0.8313253012048193
recall:  0.1413934426229508
f1:  0.24168126094570927
----------------------------------
treshold 1.6999999999999997
length of dict before  5
length of dict after  5
new words  []
accuracy:  0.16085271317829458
precision:  0.8313253012048193
recall

accuracy:  0.597983870967742
precision:  0.32838840188806473
recall:  0.9979508196721312
f1:  0.49416539827498734
f1  0.5322175732217573 treshold  0.6999999999999993 amount treshold  30
-------------- TESTING SET RESULTS -----------
accuracy:  0.6007281553398058
precision:  0.33535353535353535
recall:  1.0
f1:  0.5022692889561271


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0.5022692889561271

# Results for relative distance

In [50]:
stemmed_dictionary = get_bad_words('../data/bad_words.txt')
df['label'] = df.text.apply(label)
score0 = evaluate(manual)
treshold = 0.83
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = likelihood(True, amount_treshold)
    p_good = likelihood(False, amount_treshold)
    prob = relative_distance(p_good, p_bad)
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            if update_dictionary(key):
                new_words.append(key)

    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate(manual)
    if score1 < score0:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        #result.append([score0, treshold, amount_treshold])
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.01
    df.label = df.text.apply(label)
print("-------------- TESTING SET RESULTS -----------")
evaluate(testing_set)

accuracy:  0.5877437325905293
precision:  0.7274881516587678
recall:  0.6290983606557377
f1:  0.6747252747252747
----------------------------------
treshold 0.83
length of dict before  476
length of dict after  482
new words  ['треб', 'бендеровск', 'вонюч', 'кацапск', 'тип', 'западенск']
accuracy:  0.5967302452316077
precision:  0.7191780821917808
recall:  0.6454918032786885
f1:  0.6803455723542117
----------------------------------
treshold 0.83
length of dict before  482
length of dict after  483
new words  ['пошел']
accuracy:  0.5967302452316077
precision:  0.7191780821917808
recall:  0.6454918032786885
f1:  0.6803455723542117
----------------------------------
treshold 0.83
length of dict before  483
length of dict after  483
new words  []
accuracy:  0.5967302452316077
precision:  0.7191780821917808
recall:  0.6454918032786885
f1:  0.6803455723542117
----------------------------------
treshold 0.82
length of dict before  483
length of dict after  483
new words  []
accuracy:  0.5967

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0.6809815950920245

# Results for relative distance with small seed

In [51]:
stemmed_dictionary = get_bad_words('../data/bad_words_seed.txt')
df['label'] = df.text.apply(label)
score0 = evaluate(manual)
treshold = 0.90
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = likelihood(True, amount_treshold)
    p_good = likelihood(False, amount_treshold)
    prob = relative_distance(p_good, p_bad)
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            if update_dictionary(key):
                new_words.append(key)

    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate(manual)
    if score1 < score0:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        #result.append([score0, treshold, amount_treshold])
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.01
    df.label = df.text.apply(label)
print("-------------- TESTING SET RESULTS -----------")
evaluate(testing_set)

accuracy:  0.2737642585551331
precision:  0.8680555555555556
recall:  0.25614754098360654
f1:  0.39556962025316456
----------------------------------
treshold 0.9
length of dict before  6
length of dict after  12
new words  ['треб', 'пох', 'сос', 'ебуч', 'нах', 'ниху']
accuracy:  0.3741935483870968
precision:  0.7155172413793104
recall:  0.3401639344262295
f1:  0.46111111111111114
----------------------------------
treshold 0.9
length of dict before  12
length of dict after  12
new words  []
accuracy:  0.3741935483870968
precision:  0.7155172413793104
recall:  0.3401639344262295
f1:  0.46111111111111114
----------------------------------
treshold 0.89
length of dict before  12
length of dict after  13
new words  ['хуя']
accuracy:  0.3790322580645161
precision:  0.7191489361702128
recall:  0.3463114754098361
f1:  0.46749654218533887
----------------------------------
treshold 0.89
length of dict before  13
length of dict after  13
new words  []
accuracy:  0.3790322580645161
precision:  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0.4803149606299213

# Results for log odds

In [52]:
stemmed_dictionary = get_bad_words('../data/bad_words.txt')
df['label'] = df.text.apply(label)
score0 = evaluate(manual)
treshold = 2
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = likelihood(True, amount_treshold)
    p_good = likelihood(False, amount_treshold)
    prob = log_odds(p_good, p_bad)
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            if update_dictionary(key):
                new_words.append(key)

    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate(manual)
    if score1 < score0:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.1
    df.label = df.text.apply(label)
print("-------------- TESTING SET RESULTS -----------")
evaluate(testing_set)

accuracy:  0.5877437325905293
precision:  0.7274881516587678
recall:  0.6290983606557377
f1:  0.6747252747252747
----------------------------------
treshold 2
length of dict before  476


  import sys


length of dict after  479
new words  ['треб', 'тип', 'западенск']
accuracy:  0.5940054495912807
precision:  0.7178899082568807
recall:  0.6413934426229508
f1:  0.6774891774891775
----------------------------------
treshold 2
length of dict before  479
length of dict after  479
new words  []
accuracy:  0.5940054495912807
precision:  0.7178899082568807
recall:  0.6413934426229508
f1:  0.6774891774891775
----------------------------------
treshold 1.9
length of dict before  479
length of dict after  479
new words  []
accuracy:  0.5940054495912807
precision:  0.7178899082568807
recall:  0.6413934426229508
f1:  0.6774891774891775
----------------------------------
treshold 1.7999999999999998
length of dict before  479
length of dict after  481
new words  ['пошел', 'вонюч']
accuracy:  0.5953678474114441
precision:  0.7185354691075515
recall:  0.6434426229508197
f1:  0.6789189189189189
----------------------------------
treshold 1.7999999999999998
length of dict before  481
length of dict aft

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0.6112469437652812

# Resulst for log odds small dicitonary

In [53]:
stemmed_dictionary = get_bad_words('../data/bad_words_seed.txt')
df['label'] = df.text.apply(label)
score0 = evaluate(manual)
treshold = 2.3
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = likelihood(True, amount_treshold)
    p_good = likelihood(False, amount_treshold)
    prob = log_odds(p_good, p_bad)
    print(sort(prob)[:15])
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            if update_dictionary(key):
                new_words.append(key)

    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate(manual)
    if score1 < score0:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.1
    df.label = df.text.apply(label)
print("-------------- TESTING SET RESULTS -----------")
evaluate(testing_set)

accuracy:  0.2737642585551331
precision:  0.8680555555555556
recall:  0.25614754098360654
f1:  0.39556962025316456
----------------------------------
treshold 2.3
length of dict before  6


  import sys


[('не', 4.106855420729722), ('нах', 3.70784457636469), ('треб', 3.130972245837058), ('пох', 2.9925091867538334), ('сос', 2.4087789763755096), ('хул', 2.159901165863682), ('в', 1.9640348340170493), ('пидарас', 1.9164883759960185), ('хлеб', 1.8709229875901296), ('долбоеб', 1.802569045002336), ('тупорыл', 1.7142458478129305), ('соса', 1.6861210937834334), ('рот', 1.6814384484165137), ('гроші', 1.6407596685814936), ('кацапск', 1.6182518828381507)]
length of dict after  12
new words  ['треб', 'пох', 'сос', 'ебуч', 'нах', 'ниху']
accuracy:  0.3741935483870968
precision:  0.7155172413793104
recall:  0.3401639344262295
f1:  0.46111111111111114
----------------------------------
treshold 2.3
length of dict before  12
[('на', 2.054430564474757), ('хул', 1.8528522566965775), ('срок', 1.7144092241872495), ('пидарас', 1.671217573299505), ('квартир', 1.55185159957179), ('бизнес', 1.5194129050159764), ('самі', 1.4971307408278682), ('здан', 1.47539930871282), ('долбоеб', 1.452820729001537), ('связ', 1

----------------------------------
treshold 1.7999999999999994
length of dict before  151
[('будто', 4.003121454294141), ('покупа', 3.8974099629202312), ('хаос', 3.87486692670461), ('в', 3.859921281232788), ('результат', 3.782022251679524), ('попытк', 3.7538881109584255), ('беспорядк', 3.6453939525429853), ('вест', 3.5095336165353794), ('поступа', 3.507457023238202), ('привел', 3.3964038570755077), ('систем', 3.3182825143350727), ('вывод', 3.1959900024359653), ('отношен', 3.1658476042156067), ('обрат', 3.112202850036321), ('таможен', 3.110827570985875)]
length of dict after  742
new words  ['миллион', 'сказк', 'другой', 'погибл', 'привел', 'известн', 'регион', 'видет', 'очевидн', 'поня', 'сценар', 'реш', 'срок', 'сем', 'приня', 'быстр', 'продолжа', 'живут', 'мол', 'национальн', 'тяжел', 'перв', 'довольн', 'при', 'если', 'российск', 'дел', 'собствен', 'долг', 'моей', 'насра', 'слыша', 'кстат', 'мне', 'мечта', 'ора', 'правительств', 'событ', 'город', 'предприят', 'демократическ', 'виж', 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0.49423393739703453

# Result for mix of relative distance and log odds

In [54]:
stemmed_dictionary = get_bad_words('../data/bad_words.txt')
df['label'] = df.text.apply(label)
score0 = evaluate(manual)
treshold = 0.9
log_treshold = 2
amount_treshold = 30
while(True):
    print('----------------------------------')
    old_len = len(stemmed_dictionary)
    print('treshold', treshold)
    print('length of dict before ', old_len)
    p_bad = likelihood(True, amount_treshold)
    p_good = likelihood(False, amount_treshold)
    prob = relative_distance(p_good, p_bad)
    log_prob =log_odds(p_good, p_bad)
    new_words = []
    for key in prob:
        if prob[key] > treshold:
            if update_dictionary(key):
                new_words.append(key)
    for key in log_prob:
        if log_prob[key] > log_treshold:
            if update_dictionary(key):
                new_words.append(key)


    print('length of dict after ',len(stemmed_dictionary))
    print('new words ', new_words)
    score1 = evaluate(manual)
    if score1 < score0:
        print('f1 ', score0, 'treshold ', treshold, 'amount treshold ', amount_treshold)
        break
    else:
        score0 = score1

    if len(stemmed_dictionary) == old_len:
        treshold -= 0.01
        log_treshold -= 0.1
    df.label = df.text.apply(label)
print("-------------- TESTING SET RESULTS -----------")
evaluate(testing_set)

accuracy:  0.5877437325905293
precision:  0.7274881516587678
recall:  0.6290983606557377
f1:  0.6747252747252747
----------------------------------
treshold 0.9
length of dict before  476


  import sys


length of dict after  480
new words  ['тип', 'треб', 'тип', 'западенск']
accuracy:  0.5940054495912807
precision:  0.7178899082568807
recall:  0.6413934426229508
f1:  0.6774891774891775
----------------------------------
treshold 0.9
length of dict before  480
length of dict after  480
new words  []
accuracy:  0.5940054495912807
precision:  0.7178899082568807
recall:  0.6413934426229508
f1:  0.6774891774891775
----------------------------------
treshold 0.89
length of dict before  480
length of dict after  480
new words  []
accuracy:  0.5940054495912807
precision:  0.7178899082568807
recall:  0.6413934426229508
f1:  0.6774891774891775
----------------------------------
treshold 0.88
length of dict before  480
length of dict after  482
new words  ['пошел', 'вонюч']
accuracy:  0.5953678474114441
precision:  0.7185354691075515
recall:  0.6434426229508197
f1:  0.6789189189189189
----------------------------------
treshold 0.88
length of dict before  482
length of dict after  482
new words 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0.6112469437652812