In [1]:
import re
import math
import string
import pymorphy2
import pandas as pd

from typing import List, Tuple
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score

In [2]:
# Shared pymorphy2 analyzer; normalize() calls morph.parse() on every word
# to obtain its normal form.
morph = pymorphy2.MorphAnalyzer()


def normalize(line: str) -> List[str]:
    """Extract the Cyrillic words from ``line`` and return their normal forms.

    A word is a run of Cyrillic letters, optionally continued by hyphen-joined
    runs and optionally wrapped in apostrophes. Every match is lower-cased and
    replaced by the ``normal_form`` of the first parse that ``morph`` returns.
    """
    word_pattern = r"('?[а-яА-ЯёЁ][а-яА-ЯёЁ]*(?:-[а-яА-ЯёЁ]+)*'?)"
    normal_forms = []
    for match in re.findall(word_pattern, line):
        first_parse = morph.parse(match.lower())[0]
        normal_forms.append(first_parse.normal_form)
    return normal_forms

def get_n_gramm(words: list, n: int = 2):
    """Return the contiguous n-grams of ``words`` joined with underscores.

    Args:
        words: Ordered list of string tokens.
        n: Number of tokens per n-gram; must be at least 1.

    Returns:
        A list of ``len(words) - n + 1`` strings, each made of ``n``
        consecutive tokens joined by ``'_'``. The list is empty when
        ``words`` holds fewer than ``n`` tokens.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # A non-positive size would otherwise slip through the slicing below
        # and yield empty-string or oddly sliced pseudo n-grams.
        raise ValueError("n must be >= 1, got {}".format(n))
    return ['_'.join(words[start:start + n]) for start in range(len(words) - n + 1)]


In [3]:
# Load the labelled texts, drop exact duplicate rows and add a token column
# holding the normalized words followed by their bigrams.
aggregated = (
    pd.read_csv('../data/marks_csv/aggregated_clear.csv', index_col=None)
    .drop_duplicates()
    .assign(
        input_normal_form=lambda frame: frame['input']
        .apply(normalize)
        .apply(lambda tokens: tokens + get_n_gramm(tokens, 2))
    )
)
print(aggregated.shape)

# Fixed seed keeps the 80/20 split reproducible.
aggregated_train, aggregated_test = train_test_split(
    aggregated,
    test_size=0.2,
    random_state=42,
)
del aggregated
print(aggregated_train.shape, aggregated_test.shape)

(7150, 4)
(5720, 4) (1430, 4)


In [4]:
# Share of each label among the training rows (label -> fraction).
output_counts = aggregated_train['output'].value_counts().to_dict()
_sum = sum(output_counts.values())
output_coefficients = dict(
    (label, count / _sum) for label, count in output_counts.items()
)
output_coefficients

{'NEUTRAL': 0.5047202797202798,
 'GOOD': 0.28251748251748254,
 'BAD': 0.21276223776223777}

In [5]:
# Gather the training tokens per label and across the whole training split.
words_by_groups = {label: list() for label in aggregated_train['output'].unique()}
all_words = []

for label, tokens in zip(aggregated_train['output'], aggregated_train['input_normal_form']):
    words_by_groups[label] += tokens
    all_words += tokens

counts = {label: Counter(tokens) for label, tokens in words_by_groups.items()}
counter_sum = Counter(all_words)

# Per label, score each token as
#   (occurrences under the label / occurrences overall) / share of the label.
normalized_counts = {
    out_name: {
        word: number / counter_sum[word] / output_coefficients[out_name]
        for word, number in count.most_common()
    }
    for out_name, count in counts.items()
}
# Keep the overall token frequency next to the per-label scores.
normalized_counts['all_count'] = dict(counter_sum)
normalized_counts.keys()

dict_keys(['NEUTRAL', 'BAD', 'GOOD', 'all_count'])

In [6]:
# One row per token, one column per entry of normalized_counts,
# ordered by overall frequency (most frequent first).
words = pd.DataFrame(normalized_counts).sort_values('all_count', ascending=False)
words.index.name = 'word'

def round_not_none(value):
    """Round ``value`` to two decimals; missing values are returned unchanged."""
    return value if pd.isnull(value) else round(value, 2)
# Round the three per-label scores to two decimals; missing scores stay missing.
words = words.assign(**{
    label: words[label].apply(round_not_none)
    for label in ('NEUTRAL', 'GOOD', 'BAD')
})

# Plain-dict lookup table: token -> {column name: value}.
words_dict = {token: scores.to_dict() for token, scores in words.iterrows()}

In [7]:
def get_correct_value(values: dict, name: str) -> float:
    """Look up ``name`` in ``values``, returning 0 when it is absent or missing (None/NaN)."""
    found = values.get(name)
    if pd.isna(found):
        return 0
    return found

# Score every test row: per label, add up the scores of its tokens
# (unknown tokens and missing scores contribute 0), then predict the top label.
results = []
for tokens, actual in zip(aggregated_test['input_normal_form'], aggregated_test['output']):
    weights = {'NEUTRAL': 0., 'GOOD': 0., 'BAD': 0.}
    for token in tokens:
        token_scores = words_dict.get(token, {})
        for label in weights:
            weights[label] += get_correct_value(token_scores, label)
    # On equal weights the label listed last in `weights` wins, so a row
    # without any known token is predicted as 'BAD'.
    best_label = max(reversed(list(weights)), key=weights.get)
    results.append(dict(output=actual, predict=best_label, **weights))

results = pd.DataFrame(results)
results.head(2)

Unnamed: 0,BAD,GOOD,NEUTRAL,output,predict
0,66.73,155.07,122.81,GOOD,GOOD
1,176.48,102.8,147.4,BAD,BAD


In [8]:
# Confusion matrix on the test split, with classes ordered GOOD, NEUTRAL, BAD.
confusion_matrix(
    y_true=results['output'].tolist(),
    y_pred=results['predict'].tolist(),
    labels=["GOOD", "NEUTRAL", "BAD"],
)

array([[210, 102, 112],
       [196, 293, 199],
       [ 52,  55, 211]], dtype=int64)

In [9]:
# F1 on the test split under weighted, micro and macro averaging, in that order.
tuple(
    f1_score(
        results['output'].tolist(),
        results['predict'].tolist(),
        labels=["GOOD", "NEUTRAL", "BAD"],
        average=averaging,
    )
    for averaging in ('weighted', 'micro', 'macro')
)

(0.5006570523793723, 0.4993006993006993, 0.4978366390492928)