In [1]:
import re
import math
import string
import pymorphy2
import pandas as pd

from typing import List, Tuple
from collections import Counter

In [2]:
aggregated = pd.read_csv('../data/marks_csv/aggregated.csv', index_col=None)

In [3]:
output_counts = aggregated.output.value_counts().to_dict()
_sum = sum(output_counts.values())
output_coefficients = {n: v/_sum for n, v in output_counts.items()}
output_coefficients

{'NEUTRAL': 0.5019329896907216, 'GOOD': 0.2875, 'BAD': 0.21056701030927835}

In [4]:
%%time
morph = pymorphy2.MorphAnalyzer()


def normalize(line: str) -> List[str]:
    return [morph.parse(word)[0].normal_form for word in (v.lower() for v in re.findall(r"('?[а-яА-ЯёЁ][а-яА-ЯёЁ]*(?:-[а-яА-ЯёЁ]+)*'?)", line))]


aggregated['ru_words_normal_form'] = aggregated.input.apply(normalize)

Wall time: 1min 58s


In [11]:
# aggregated

In [5]:
%%time
from gensim.models.phrases import Phrases

DELIMITER = '_'
B_DELIMITER = DELIMITER.encode()

def train_phrases(sentences, n, min_count, threshold):
    if n < 2:
        raise ValueError("n < 2")
    
    grams = []
    transformed_corpus = []
    
    _words = sentences
    for ind in range(n - 1):
        gram = Phrases(_words, min_count=min_count, delimiter=B_DELIMITER, threshold=threshold)
        grams.append(gram)
        _words = gram[_words]
        transformed_corpus.append(_words)
            
    return grams, transformed_corpus

grams, transformed_corpus = train_phrases(aggregated.ru_words_normal_form, 5, min_count=10, threshold=40)

Wall time: 16.2 s


normalized_counts:

(Количество данного слова в классе / Количество данного слова во всех классах) * (размер класса / сумма размеров все классов)

In [6]:
%%time
phrases_by_groups = {name: list() for name in aggregated.output.unique()}
all_words = []
for index, (words, output) in enumerate(zip(transformed_corpus[-1], aggregated.output)):
    phrases_by_groups[output].extend(words)
    all_words.extend(words)

counts = {name: Counter(values) for name, values in phrases_by_groups.items()}
counter_sum = Counter(all_words)

normalized_counts = {out_name: {word: number/counter_sum[word]/output_coefficients[out_name] for (word, number) in count.most_common()} for out_name, count in counts.items()}
normalized_counts['all_count'] = dict(counter_sum)
normalized_counts.keys()

Wall time: 7.91 s


dict_keys(['NEUTRAL', 'GOOD', 'BAD', 'all_count'])

In [7]:
df_res = pd.DataFrame(normalized_counts)
df_res.sort_values('all_count', ascending=False, inplace=True)
df_res.index.set_names(['word'], inplace=True)
df_res.reset_index(inplace=True)

def word_counts(line: str) -> bool:
    return len(line.replace(DELIMITER, ' ').split())
df_res['word_counts'] = df_res.word.map(word_counts)

def round_not_none(value):
    if pd.isnull(value):
        return value
    return round(value, 2)
df_res.NEUTRAL = df_res.NEUTRAL.apply(round_not_none)
df_res.GOOD = df_res.GOOD.apply(round_not_none)
df_res.BAD = df_res.BAD.apply(round_not_none)

df_res.head()

Unnamed: 0,word,NEUTRAL,GOOD,BAD,all_count,word_counts
0,в,0.99,1.03,0.99,23633,1
1,и,1.0,1.05,0.92,16346,1
2,на,1.01,0.94,1.06,8172,1
3,с,1.0,1.01,0.98,8170,1
4,для,1.01,1.07,0.89,5237,1


In [8]:
df_res.to_csv('../data/frequencies/statistical_phrases.csv', index=False)

In [None]:
# df_res = pd.read_csv('../data/frequencies/statistical_phrases.csv', index_col=None)

In [14]:
# input	output	confidence
result = []
for words, output, confidence in zip(transformed_corpus[-1], aggregated.output, aggregated.confidence):
    result.append(dict(
        input_normal_form_phrases=' '.join(words),
        output=output,
        confidence=confidence,
    ))
result = pd.DataFrame(result)[['input_normal_form_phrases', 'output', 'confidence']]

In [15]:
result.to_csv('../data/marks_csv/aggregated_normalize_phrases.csv', index=False)