In [None]:
import re
import string
import pymorphy2
import pandas as pd

In [None]:
# Load the aggregated marks table. Later cells read two of its columns:
# `input` (raw text, tokenised below) and `output` (class label used for grouping).
aggregated = pd.read_csv('../data/marks_csv/aggregated.csv', index_col=None)

In [None]:
%%time
# Matches every character that is NOT a Russian letter, so it can be replaced by a separator.
# "ё"/"Ё" are listed explicitly: their code points (U+0451 / U+0401) lie outside the
# contiguous "а-я" / "А-Я" ranges, so without them words such as "ещё" would be cut in two.
regex_ru = re.compile('[^а-яА-ЯёЁ]')
# Single shared analyzer instance; get_ru_words_normal_form() reuses it for every word.
morph = pymorphy2.MorphAnalyzer()


def get_ru_words_normal_form(line: str) -> list:
    """Return the normal forms (lemmas) of the Russian words found in ``line``.

    Every character rejected by the module-level ``regex_ru`` is replaced by a
    space, the text is lower-cased and split on whitespace, and each resulting
    word is replaced by the ``normal_form`` of its first pymorphy2 analysis.

    Args:
        line: Raw text. Any non-string value is treated as "no text".

    Returns:
        List of lemmas in their order of appearance; empty when ``line`` has no
        Russian words or is not a string.
    """
    if not isinstance(line, str):
        # read_csv represents an empty cell as NaN (a float); regex_ru.sub() would raise on it.
        return []
    tokens = regex_ru.sub(' ', line).lower().split()
    # str.split() without arguments never yields empty strings, so no extra filtering is needed.
    return [morph.parse(token)[0].normal_form for token in tokens]
# Lemmatise every text once; the resulting token lists are the sentences used for phrase training.
aggregated['ru_words_normal_form'] = aggregated['input'].apply(get_ru_words_normal_form)
# aggregated.head(2)

In [None]:
%%time
from gensim.models.phrases import Phrases

# Separator used between the words of a detected phrase; word_counts() splits
# phrase tokens on it to count their words.
DELIMITER = '_'
# Encoded (bytes) form of the separator, passed to Phrases(delimiter=...).
# NOTE(review): gensim < 4.0 expects a bytes delimiter while gensim >= 4.0 expects str —
# confirm against the installed gensim version.
B_DELIMITER = DELIMITER.encode()

def train_phrases(sentences, n, min_count, threshold):
    """Train a stack of ``n - 1`` Phrases models, each on the output of the previous one.

    The first model is trained on ``sentences``; every following model is
    trained on the corpus produced by applying the previous model.

    NOTE(review): each pass can join tokens that are already phrases, so the
    final tokens may span more than ``n`` original words — confirm whether
    ``n`` is meant as a strict n-gram limit.

    Args:
        sentences: Iterable of token lists.
        n: Number of levels; ``n - 1`` models are trained. Must be at least 2.
        min_count: Forwarded to ``Phrases(min_count=...)``.
        threshold: Forwarded to ``Phrases(threshold=...)``.

    Returns:
        ``(grams, transformed_corpus)``: the trained models and, at the same
        positions, the corpus obtained after applying each of them.

    Raises:
        ValueError: If ``n`` is smaller than 2.
    """
    if n < 2:
        raise ValueError("n < 2")

    models, corpora = [], []
    current = sentences
    for _ in range(n - 1):
        model = Phrases(current, min_count=min_count, delimiter=B_DELIMITER, threshold=threshold)
        # Indexing a model with a corpus applies it; the result feeds the next level.
        current = model[current]
        models.append(model)
        corpora.append(current)

    return models, corpora

# n=5 gives four stacked Phrases passes; only the last corpus (transformed_corpus[-1])
# is consumed by the following cells.
grams, transformed_corpus = train_phrases(aggregated.ru_words_normal_form, 5, min_count=10, threshold=40)

In [None]:
%%time
# Collect the phrase tokens of the final corpus: one flat list per class label
# (values of `output`) and one flat list over all rows.
phrases_by_groups = {}
for name in aggregated.output.unique():
    phrases_by_groups[name] = []
all_words = []
for tokens, label in zip(transformed_corpus[-1], aggregated.output):
    phrases_by_groups[label] += tokens
    all_words += tokens

In [None]:
from collections import Counter

# Token frequencies per class label, and over the whole corpus.
counts = {}
for name, values in phrases_by_groups.items():
    counts[name] = Counter(values)
counter_sum = Counter(all_words)

In [None]:
# Number of rows per class label, and each label's share of all rows.
output_counts = aggregated.output.value_counts().to_dict()
_sum = sum(output_counts.values())
output_coefficients = {}
for label, size in output_counts.items():
    output_coefficients[label] = size / _sum
output_coefficients

normalized_counts:

(Количество данного слова в классе / Количество данного слова во всех классах) / (размер класса / сумма размеров всех классов)


In [None]:
# For every class: (count of the token in the class / count of the token overall),
# divided by the share of rows that belong to the class.
normalized_counts = {}
for out_name, count in counts.items():
    class_share = output_coefficients[out_name]
    normalized_counts[out_name] = {
        word: number / counter_sum[word] / class_share
        for word, number in count.most_common()
    }
# Extra column holding the raw overall frequency of each token.
normalized_counts['all_count'] = dict(counter_sum)
normalized_counts.keys()

In [None]:
import math 

# One row per token, one column per class plus `all_count`; most frequent tokens first.
# The token itself is moved from the index into a regular `word` column.
df_res = pd.DataFrame(normalized_counts).sort_values('all_count', ascending=False)
df_res.index.name = 'word'
df_res = df_res.reset_index()
def get_sum_distance(row):
    """Return the sum of pairwise absolute differences between a row's class scores.

    Reads the ``NEUTRAL``, ``GOOD`` and ``BAD`` attributes of ``row``. A missing
    score (NaN/None) means the token never occurred in that class, so it is
    counted as 0. Otherwise the result would be NaN and tokens exclusive to one
    class would be sorted to the very end instead of the top.

    Args:
        row: Object exposing ``NEUTRAL``, ``GOOD`` and ``BAD`` (e.g. a DataFrame row).

    Returns:
        The distance rounded to 2 decimal places.
    """
    neutral, good, bad = (
        0.0 if pd.isnull(score) else score
        for score in (row.NEUTRAL, row.GOOD, row.BAD)
    )
    total = math.fabs(neutral - good) + math.fabs(neutral - bad) + math.fabs(good - bad)
    return round(total, 2)
# Row-wise: how far apart the three class scores of each token are.
df_res['sum_distance'] = df_res.apply(get_sum_distance, axis=1)
def word_counts(line: str) -> int:
    """Return the number of words in a phrase token.

    The phrase separator ``DELIMITER`` is replaced by a space and the result is
    split on whitespace, so ``"a" + DELIMITER + "b"`` counts as 2 words.
    """
    return len(line.replace(DELIMITER, ' ').split())
# Number of words in each (possibly multi-word) token.
df_res['word_counts'] = df_res['word'].map(word_counts)
def round_not_none(value):
    """Round ``value`` to 2 decimal places; null values (None/NaN) are returned unchanged."""
    return value if pd.isnull(value) else round(value, 2)
# Round the class scores for display; sum_distance above was computed from the unrounded values.
for label in ('NEUTRAL', 'GOOD', 'BAD'):
    df_res[label] = df_res[label].apply(round_not_none)
df_res.head()

In [None]:
def get_result(df, min_all_count, word_counts):
    """Select tokens of a given length that are frequent enough, most class-specific first.

    Args:
        df: Frame with ``all_count``, ``word_counts`` and ``sum_distance`` columns.
        min_all_count: Minimum value of ``all_count`` (inclusive).
        word_counts: Required value of the ``word_counts`` column.

    Returns:
        The matching rows sorted by ``sum_distance`` in descending order.
    """
    frequent_enough = df['all_count'] >= min_all_count
    right_length = df['word_counts'] == word_counts
    selected = df[frequent_enough & right_length]
    return selected.sort_values('sum_distance', ascending=False)

In [None]:
# Top 20 single words that occur at least 500 times.
get_result(df_res, min_all_count=500, word_counts=1).head(20)

In [None]:
# Top 10 two-word phrases that occur at least 20 times.
get_result(df_res, min_all_count=20, word_counts=2).head(10)

In [None]:
# Top 10 three-word phrases that occur at least 40 times.
get_result(df_res, min_all_count=40, word_counts=3).head(10)

In [None]:
# Persist the full token table (all tokens, not only the filtered views above);
# the positional index is not written.
df_res.to_csv('../data/phrases/aggregated_word_table.csv', index=False)