In [1]:
import re
import string
import pymorphy2
import pandas as pd

from typing import List, Tuple
from collections import Counter

In [2]:
# Load the aggregated labelling results; index_col=None keeps every CSV column as data.
aggregated = pd.read_csv('../data/marks_csv/aggregated.csv', index_col=None)
# NOTE(review): de-duplication is currently disabled — confirm duplicates should be kept.
# aggregated.drop_duplicates(inplace=True)
# Display (rows, columns) as a quick sanity check of the load.
aggregated.shape

(7760, 3)

In [3]:
# Number of rows per value of the `output` column.
output_counts = aggregated['output'].value_counts().to_dict()
_sum = sum(output_counts.values())
# Share of rows carrying each label (the values sum to 1).
output_coefficients = {
    label: rows / _sum
    for label, rows in output_counts.items()
}
output_coefficients

{'NEUTRAL': 0.5019329896907216, 'GOOD': 0.2875, 'BAD': 0.21056701030927835}

In [4]:
# Shared pymorphy2 analyzer; created once here and used by normalize() below.
morph = pymorphy2.MorphAnalyzer()


def normalize(line: str) -> List[str]:
    """Extract the Cyrillic words of ``line`` and return their normal forms.

    A word is a run of Cyrillic letters that may contain hyphen-joined parts
    and may be preceded and/or followed by an apostrophe (the apostrophes are
    part of the match). Each match is lower-cased and replaced by the
    ``normal_form`` of the first result returned by ``morph.parse``.

    Args:
        line: Raw text to process.

    Returns:
        Normal forms in the order the words occur in ``line``; an empty list
        when ``line`` contains no matching word.
    """
    word_pattern = r"('?[а-яА-ЯёЁ][а-яА-ЯёЁ]*(?:-[а-яА-ЯёЁ]+)*'?)"
    normal_forms = []
    for raw_word in re.findall(word_pattern, line):
        # Only the first parse variant is used.
        first_parse = morph.parse(raw_word.lower())[0]
        normal_forms.append(first_parse.normal_form)
    return normal_forms


def get_n_gramm(words: list, n: int = 2) -> List[str]:
    """Build all contiguous n-grams of ``words``, joined with underscores.

    Args:
        words: Sequence of string tokens.
        n: Size of each n-gram; must be at least 1.

    Returns:
        A list of ``'_'.join``-ed windows of ``n`` consecutive tokens, in
        order of occurrence. Empty when ``words`` has fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1. Such values previously
            produced meaningless output (e.g. a list of empty strings for
            ``n == 0``) instead of failing.
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")
    # range() is empty when len(words) < n, so short inputs yield [].
    return ['_'.join(words[start:start + n]) for start in range(len(words) - n + 1)]


# Token column: normal forms of each input, followed by their 2-grams and 3-grams.
aggregated['ru_words_normal_form'] = (
    aggregated['input']
    .apply(normalize)
    .apply(lambda tokens: tokens + get_n_gramm(tokens, 2) + get_n_gramm(tokens, 3))
)

In [5]:
# Preview the first rows, including the newly added token column.
aggregated.head()

Unnamed: 0,input,output,confidence,ru_words_normal_form
0,Условия термодинамической согласованности имею...,NEUTRAL,0.92,"[условие, термодинамический, согласованность, ..."
1,"мишени, подтвержденная там для 2D  модели, до...",NEUTRAL,0.9,"[мишень, подтвердить, там, для, модель, дополн..."
2,При разработке алгоритмов решения задач ГД и М...,NEUTRAL,0.97,"[при, разработка, алгоритм, решение, задача, г..."
3,На основе теории стохастических динамических п...,GOOD,1.0,"[на, основа, теория, стохастический, динамичес..."
4,Основным достоинством лазерного поджига являет...,NEUTRAL,0.61,"[основный, достоинство, лазерный, поджиг, явля..."


In [6]:
# Gather every token (normal forms plus n-grams) per `output` label and overall.
words_by_groups = {name: [] for name in aggregated.output.unique()}
all_words = []

# Walk the two needed columns directly: iterrows() builds a full Series for
# every row only to read two fields from it, and its index was never used.
for label, tokens in zip(aggregated['output'], aggregated['ru_words_normal_form']):
    words_by_groups[label].extend(tokens)
    all_words.extend(tokens)

counts = {name: Counter(values) for name, values in words_by_groups.items()}
counter_sum = Counter(all_words)

# Per label and token:
#   (occurrences of the token within the label / occurrences overall)
#   divided by the label's share of all rows (output_coefficients).
# A value above 1 means the token occurs in that label more often than the
# label's overall frequency alone would predict.
normalized_counts = {
    out_name: {
        word: number / counter_sum[word] / output_coefficients[out_name]
        for word, number in count.most_common()
    }
    for out_name, count in counts.items()
}
# Raw total occurrences of each token, kept next to the per-label ratios.
normalized_counts['all_count'] = dict(counter_sum)
normalized_counts.keys()

dict_keys(['NEUTRAL', 'GOOD', 'BAD', 'all_count'])

In [7]:
# One row per token, most frequent first; the token itself becomes the `word` column.
df_res = (
    pd.DataFrame(normalized_counts)
    .sort_values('all_count', ascending=False)
    .rename_axis('word')
    .reset_index()
)

def round_not_none(value):
    """Round ``value`` to two decimal places, passing missing values through.

    Args:
        value: A number, or a missing value as recognised by ``pd.isnull``.

    Returns:
        ``value`` unchanged when it is missing, otherwise ``round(value, 2)``.
    """
    return value if pd.isnull(value) else round(value, 2)
# Round the per-label ratios to two decimals; missing ratios stay missing.
for label_column in ('NEUTRAL', 'GOOD', 'BAD'):
    df_res[label_column] = df_res[label_column].apply(round_not_none)

In [10]:
# Save the per-token table (per-label ratios and total counts) without the index column.
df_res.to_csv('../data/frequencies/bag_of_words_3_gramm.csv', index=False)

In [9]:
# Flatten each token list into one space-separated string
# (n-grams keep their internal underscores).
aggregated['input_normal_form'] = aggregated['ru_words_normal_form'].apply(' '.join)
# Export the normalized text with its label and confidence; no index column is written.
aggregated[['input_normal_form', 'output', 'confidence']].to_csv(
    '../data/marks_csv/aggregated_normalize_3_gramm.csv',
    index=None,
)