In [None]:
import re
import math
import string
import pymorphy2
import pandas as pd

from typing import List, Tuple
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score

In [3]:
# One analyzer instance, created once and reused by every `normalize` call.
morph = pymorphy2.MorphAnalyzer()


def normalize(line: str) -> List[str]:
    """Return the normal forms of the Cyrillic words found in ``line``.

    A token is a run of Russian letters, optionally hyphenated and optionally
    wrapped in apostrophes (the apostrophes remain part of the token). Each
    token is lower-cased and replaced by the ``normal_form`` of the first
    parse returned by ``morph.parse``. Text that does not match the pattern
    is ignored.
    """
    tokens = re.findall(r"('?[а-яА-ЯёЁ][а-яА-ЯёЁ]*(?:-[а-яА-ЯёЁ]+)*'?)", line)
    lemmas = []
    for token in tokens:
        first_parse = morph.parse(token.lower())[0]
        lemmas.append(first_parse.normal_form)
    return lemmas

In [4]:
# Load the labelled texts, drop exact duplicate rows, then lemmatize every input.
# Duplicates are removed before the list-valued column is added.
aggregated = (
    pd.read_csv('../data/marks_csv/aggregated_clear.csv', index_col=None)
    .drop_duplicates()
    .assign(input_normal_form=lambda frame: frame['input'].apply(normalize))
)
print(aggregated.shape)

# 80/20 split with a fixed seed so the split is reproducible. The labels are a
# column of the frame, so splitting the frame alone is enough; the row selection
# depends only on the number of rows and the seed.
aggregated_train, aggregated_test = train_test_split(aggregated, test_size=0.2, random_state=42)
del aggregated  # release the unsplit frame
print(aggregated_train.shape, aggregated_test.shape)

(7150, 4)
(5720, 4) (1430, 4)


In [5]:
# Show one training row to check the columns and the lemmatized input.
aggregated_train.head(1)

Unnamed: 0,input,output,confidence,input_normal_form
5178,Многорезультатная суперкомпиляция – это метод ...,NEUTRAL,0.51,"[многорезультатный, суперкомпиляция, это, мето..."


In [6]:
# Number of training rows per label, and each label's share of all training rows.
output_counts = aggregated_train['output'].value_counts().to_dict()
_sum = sum(output_counts[label] for label in output_counts)
output_coefficients = {label: output_counts[label] / _sum for label in output_counts}
output_coefficients

{'NEUTRAL': 0.5047202797202798,
 'GOOD': 0.28251748251748254,
 'BAD': 0.21276223776223777}

In [7]:
def _relative_frequencies(class_counter, total_counter, class_share):
    """Divide each word's count in one class by its total count and by ``class_share``."""
    return {
        word: number / total_counter[word] / class_share
        for word, number in class_counter.most_common()
    }


# Lemma pools: one list per label, plus one list for the whole training split.
words_by_groups = {name: [] for name in aggregated_train.output.unique()}
all_words = []
for label, lemmas in zip(aggregated_train['output'], aggregated_train['input_normal_form']):
    words_by_groups[label].extend(lemmas)
    all_words.extend(lemmas)

# Occurrence counts per label and over all training texts.
counts = {name: Counter(values) for name, values in words_by_groups.items()}
counter_sum = Counter(all_words)

# Per-label ratios, plus the raw total count of every word under 'all_count'.
normalized_counts = {
    out_name: _relative_frequencies(count, counter_sum, output_coefficients[out_name])
    for out_name, count in counts.items()
}
normalized_counts['all_count'] = dict(counter_sum)
normalized_counts.keys()

dict_keys(['NEUTRAL', 'BAD', 'GOOD', 'all_count'])

In [8]:
# One row per word, one column per key of `normalized_counts`,
# ordered by total occurrence count (most frequent first).
words = pd.DataFrame(normalized_counts).sort_values('all_count', ascending=False)
words.index.name = 'word'
# words.reset_index(inplace=True)  # left disabled: the word stays in the index

def round_not_none(value):
    """Round ``value`` to two decimals; missing values (None/NaN) are returned unchanged."""
    if not pd.isnull(value):
        return round(value, 2)
    return value
# Round the three per-label ratio columns to two decimals; missing values stay missing.
for column in ('NEUTRAL', 'GOOD', 'BAD'):
    words[column] = words[column].apply(round_not_none)

In [9]:
# Two alternatives, both left disabled:
# 1) read the table from a CSV file instead (presumably a saved copy of this table; verify):
#words = pd.read_csv('../data/frequencies/bag_of_words.csv', index_col=0)
# 2) keep only the words that occur more than twice:
# words = words[words.all_count > 2]
# Show the two most frequent words.
words.head(2)

Unnamed: 0_level_0,NEUTRAL,BAD,GOOD,all_count
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
в,0.99,0.98,1.03,17450
и,1.0,0.92,1.06,12032


In [10]:
# Lookup table {word: {column: value}} for scoring.
# `to_dict(orient='index')` builds the nested mapping directly instead of
# creating one Series per row with `iterrows()`.
words_dict = words.to_dict(orient='index')

In [11]:
def get_correct_value(values: dict, name: str) -> float:
    """Return ``values[name]``, or 0 when the key is absent or its value is None/NaN."""
    value = values.get(name)
    if pd.isna(value):
        return 0
    return value

# Score every test text: each lemma adds its per-label value from `words_dict`
# to that label's running total; unknown lemmas and missing values add 0.
results = []
for _, row in aggregated_test.iterrows():
    weights = {'NEUTRAL': 0., 'GOOD': 0., 'BAD': 0.}
    for word in row['input_normal_form']:
        word_info = words_dict.get(word, {})
        for label in weights:
            weights[label] += get_correct_value(word_info, label)
    # Ascending stable sort: the last entry holds the highest total, and on a
    # tie the label that appears later in `weights` is the one picked.
    ranked = sorted(weights.items(), key=lambda item: item[1])
    record = {'output': row['output'], 'predict': ranked[-1][0]}
    record.update(weights)
    results.append(record)

results = pd.DataFrame(results)
results.head(2)

Unnamed: 0,BAD,GOOD,NEUTRAL,output,predict
0,49.03,74.19,68.57,GOOD,GOOD
1,77.26,67.46,70.32,BAD,BAD


In [12]:
# Confusion matrix on the test split. The first argument is the true label and
# the second the prediction, so (per scikit-learn's convention) rows are true
# labels and columns are predicted labels, both in the order given by `labels`.
confusion_matrix(results['output'].tolist(), results['predict'].tolist(), labels=["GOOD", "NEUTRAL", "BAD"])

array([[222,  87, 115],
       [206, 235, 247],
       [ 55,  30, 233]], dtype=int64)

In [13]:
# F1 on the test split for three averaging modes, in the order: weighted, micro, macro.
tuple(
    f1_score(
        results['output'].tolist(),
        results['predict'].tolist(),
        labels=["GOOD", "NEUTRAL", "BAD"],
        average=averaging,
    )
    for averaging in ('weighted', 'micro', 'macro')
)

(0.4760775765322739, 0.4825174825174825, 0.48395141463611596)

In [14]:
# First five test texts labelled GOOD that were predicted as BAD, with their per-label totals.
results[(results['output'] == 'GOOD') & (results['predict'] == 'BAD')].head(5)

Unnamed: 0,BAD,GOOD,NEUTRAL,output,predict
14,65.96,63.85,61.28,GOOD,BAD
161,71.99,68.29,62.21,GOOD,BAD
186,54.71,53.44,51.98,GOOD,BAD
306,72.71,69.09,51.46,GOOD,BAD
327,74.88,71.49,57.09,GOOD,BAD
