In [1]:
import re
import math
import string
import pymorphy2
import pandas as pd

from typing import List, Tuple
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
# Module-level analyzer instance that normalize() uses for every word.
morph = pymorphy2.MorphAnalyzer()


def normalize(line: str) -> str:
    """Return the space-joined lemmas of the Cyrillic words found in ``line``.

    A word is a run of Cyrillic letters, optionally hyphenated, with an
    optional leading and/or trailing apostrophe; anything the pattern does
    not match is dropped. Each word is lower-cased and replaced by the
    ``normal_form`` of its first ``morph.parse`` result.
    """
    words = re.findall(r"('?[а-яА-ЯёЁ][а-яА-ЯёЁ]*(?:-[а-яА-ЯёЁ]+)*'?)", line)
    lemmas = []
    for word in words:
        # Only the first parse returned by the analyzer is used.
        lemmas.append(morph.parse(word.lower())[0].normal_form)
    return ' '.join(lemmas)


In [3]:
# Load the labelled data, drop exact duplicate rows and add the lemmatised text column.
aggregated = (
    pd.read_csv('../data/marks_csv/aggregated_clear.csv', index_col=None)
    .drop_duplicates()
    .assign(input_normal_form=lambda frame: frame['input'].apply(normalize))
)
print(aggregated.shape)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
aggregated_train, aggregated_test = train_test_split(aggregated, test_size=0.2, random_state=42)
del aggregated  # release the full frame once it has been split
print(aggregated_train.shape, aggregated_test.shape)

(7150, 4)
(5720, 4) (1430, 4)


In [4]:
# Peek at a single training row to check the columns after preprocessing.
aggregated_train.head(n=1)

Unnamed: 0,input,output,confidence,input_normal_form
5178,Многорезультатная суперкомпиляция – это метод ...,NEUTRAL,0.51,многорезультатный суперкомпиляция это метод ко...


In [5]:
# Targets for the classifier: the label column of the training split.
y = aggregated_train['output']

# TF-IDF features built from 2- and 3-grams of the lemmatised text (no 1-grams).
vectorizer = TfidfVectorizer(ngram_range=(2, 3))
X = vectorizer.fit_transform(aggregated_train['input_normal_form'])

In [6]:
# Multinomial logistic regression fitted on the TF-IDF features with balanced class weights.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class` argument —
# confirm against the installed version before upgrading.
clf = LogisticRegression(
    random_state=0,
    solver='newton-cg',
    multi_class='multinomial',
    class_weight='balanced',
).fit(X, y)

In [7]:
# Put the predictions for the held-out split next to the true labels.
results = pd.DataFrame({
    'predict': clf.predict(vectorizer.transform(aggregated_test['input_normal_form'])),
    'output': aggregated_test['output'].tolist(),
})

In [8]:
# Rows are true classes, columns are predicted classes, both in the order given by `labels`.
confusion_matrix(
    y_true=results['output'].tolist(),
    y_pred=results['predict'].tolist(),
    labels=["GOOD", "NEUTRAL", "BAD"],
)

array([[164, 196,  64],
       [137, 468,  83],
       [ 38, 139, 141]], dtype=int64)

In [9]:
# Weighted, micro and macro F1 over the three classes, in that order.
tuple(
    f1_score(
        results['output'].tolist(),
        results['predict'].tolist(),
        labels=["GOOD", "NEUTRAL", "BAD"],
        average=averaging,
    )
    for averaging in ('weighted', 'micro', 'macro')
)

(0.5329745492564901, 0.5405594405594406, 0.5076650596039981)