In [7]:
import pandas as pd
from sklearn.metrics import classification_report, f1_score, confusion_matrix


class BasePredictor:

    def __init__(self, train_path: str, test_path: str):
        self.train_dataset = pd.read_csv(train_path)
        self.test_dataset = pd.read_csv(test_path)

    def train(self) -> None:
        raise NotImplementedError

    def predict(self, text: str) -> int:
        raise NotImplementedError

    def test(self):
        correct_predictions = self.test_dataset['Author']
        predictions = [
            self.predict(row['Content'])
            for _, row in self.test_dataset.iterrows()
        ]
        print(classification_report(correct_predictions, predictions))
        print('F1-score:', f1_score(correct_predictions, predictions, average='macro'))
        print(confusion_matrix(correct_predictions, predictions))



In [8]:
from collections import defaultdict
import zlib


class ZipPredictor(BasePredictor):

    def __init__(self, train_path: str, test_path: str):
        super().__init__(train_path, test_path)
        self.concatinated: dict[int, str] = defaultdict(str)

    def _build_concatinated(self):
        for _, row in self.train_dataset.iterrows():
            self.concatinated[int(row['Author'])] += row['Content']

    def train(self) -> int:
        self._build_concatinated()

    def predict(self, text: str) -> int:
        compression = {}
        for author_id, big_text in self.concatinated.items():
            big_text_compression = len(zlib.compress((big_text).encode('utf-8'))) / len(big_text)
            concat_text_compression = len(zlib.compress((big_text + text).encode('utf-8'))) / len(big_text + text)
            compression_delta =  concat_text_compression - big_text_compression  # ищем, где качество сжатия ухудшилось меньше всего
            compression[author_id] = compression_delta

        return min(compression, key=lambda x: compression[x])



In [9]:
z = ZipPredictor('datasets/russian_classics/train_4_10_1000.csv', 'datasets/russian_classics/test_4_40_600.csv')
z.train()
z.test()

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        40
           1       1.00      0.07      0.14        40
           2       0.45      0.82      0.58        40
           3       0.43      0.90      0.58        40

    accuracy                           0.45       160
   macro avg       0.47      0.45      0.33       160
weighted avg       0.47      0.45      0.33       160

F1-score: 0.3260627103678575
[[ 0  0 23 17]
 [ 0  3 13 24]
 [ 0  0 33  7]
 [ 0  0  4 36]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
from collections import defaultdict, Counter
import re

class ThreeGramPredictor(BasePredictor):

    def __init__(self, train_path: str, test_path: str):
        super().__init__(train_path, test_path)
        self.n_grams_idx = []
        self.n_gram_vectors: dict[int, list[float]] = {}

    @staticmethod
    def clean_text(text: str):
        return ' '.join(re.findall(r'[a-zа-яё]+', text.lower()))

    @staticmethod
    def get_n_grams(text: str):
        text = ThreeGramPredictor.clean_text(text)
        return [text[i:i + 3] for i in range(0, len(text) - 2)]

    def vectorize(self, text) -> list[float]:
        assert self.n_grams_idx

        t_grams = self.get_n_grams(text)
        counter = Counter(t_grams)
        vector = [0] * len(self.n_grams_idx)
        for t_gram, idx in self.n_grams_idx.items():
            vector[idx] = counter.get(t_gram, 0)
        total = sum(vector)
        return [x / total for x in vector]

    def train(self):
        all_tgrams = self.get_n_grams(' '.join(row['Content'] for _, row in self.train_dataset.iterrows()))
        self.n_grams_idx = {t_gram: idx for idx, t_gram in enumerate(set(all_tgrams))}

        concatinated = defaultdict(str)
        for _, row in self.train_dataset.iterrows():
            concatinated[int(row['Author'])] += row['Content']

        for author_id, text in concatinated.items():
            self.n_gram_vectors[author_id] = self.vectorize(text)

    @staticmethod
    def cosine_distance(v1: list[float], v2: list[float]) -> float:
        return sum((a - b) ** 2 for a, b in zip(v1, v2)) ** 0.5

    def predict(self, text: str) -> int:
        n_grams = self.get_n_grams(text)
        counter = Counter(n_grams)
        vector = [0] * len(self.n_grams_idx)
        for tgram, idx in counter.items():
            vector[idx] = counter[tgram]
        total = sum(vector)
        vector = [x / total for x in vector]

        distances = {
            author_id: self.cosine_distance(vector, author_vector)
            for author_id, author_vector in self.n_gram_vectors.items()
        }

        return min(distances, key=lambda x: distances[x])




In [11]:
t = ThreeGramPredictor('datasets/russian_classics/train_4_10_1000.csv', 'datasets/russian_classics/test_4_40_600.csv')
t.train()
t.test()

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        40
           1       0.33      0.12      0.18        40
           2       0.24      0.88      0.38        40
           3       0.00      0.00      0.00        40

    accuracy                           0.25       160
   macro avg       0.14      0.25      0.14       160
weighted avg       0.14      0.25      0.14       160

F1-score: 0.14056324110671936
[[ 0  3 37  0]
 [ 1  5 34  0]
 [ 0  5 35  0]
 [ 0  2 38  0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
from collections import defaultdict, Counter
import re
from sklearn.svm import SVC


class ThreeGramSVMPredictor(BasePredictor):

    def __init__(self, train_path: str, test_path: str):
        super().__init__(train_path, test_path)
        self.n_grams_idx: dict[str, int] = {}
        self.vectors: list[tuple[int, list[float]]] = []
        self.model = SVC()

    @staticmethod
    def clean_text(text: str):
        return ' '.join(re.findall(r'[a-zа-яё]+', text.lower()))

    @staticmethod
    def get_n_grams(text: str):
        text = ThreeGramPredictor.clean_text(text)
        return [text[i:i + 3] for i in range(0, len(text) - 2)]

    def vectorize(self, text) -> list[float]:
        assert self.n_grams_idx

        t_grams = self.get_n_grams(text)
        counter = Counter(t_grams)
        vector = [0] * len(self.n_grams_idx)
        for t_gram, idx in self.n_grams_idx.items():
            vector[idx] = counter.get(t_gram, 0)
        total = sum(vector)
        return [x / total for x in vector]

    def train(self):
        all_tgrams = self.get_n_grams(' '.join(row['Content'] for _, row in self.train_dataset.iterrows()))
        self.n_grams_idx = {t_gram: idx for idx, t_gram in enumerate(set(all_tgrams))}

        self.vectors = [
            (row['Author'], self.vectorize(row['Content']))
            for _, row in self.train_dataset.iterrows()
        ]
        self.model.fit([v for _, v in self.vectors], [a for a, _ in self.vectors])

    def predict(self, text: str) -> int:
        vector = self.vectorize(text)
        return self.model.predict([vector])[0]




In [13]:
s = ThreeGramSVMPredictor('datasets/russian_classics/train_4_10_1000.csv', 'datasets/russian_classics/test_4_40_600.csv')
s.train()
s.test()

              precision    recall  f1-score   support

           0       0.95      0.88      0.91        40
           1       0.75      0.38      0.50        40
           2       0.97      0.70      0.81        40
           3       0.50      0.93      0.65        40

    accuracy                           0.72       160
   macro avg       0.79      0.72      0.72       160
weighted avg       0.79      0.72      0.72       160

F1-score: 0.7174519797517509
[[35  2  0  3]
 [ 0 15  0 25]
 [ 2  1 28  9]
 [ 0  2  1 37]]


In [14]:
s = ThreeGramSVMPredictor('datasets/russian_classics/train_23_10_1000.csv', 'datasets/russian_classics/test_23_40_1000.csv')
s.train()
s.test()

              precision    recall  f1-score   support

           0       0.42      0.88      0.57        26
           1       0.33      0.25      0.29        40
           2       0.40      0.53      0.46        40
           3       0.21      0.89      0.34        35
           4       1.00      0.20      0.33        40
           5       0.71      0.25      0.37        40
           6       0.47      0.50      0.48        40
           7       0.28      0.20      0.23        40
           8       0.78      0.53      0.63        40
           9       1.00      0.10      0.18        40
          10       0.73      0.88      0.80        40
          11       0.39      0.61      0.48        36
          12       0.91      0.72      0.81        40
          13       1.00      0.07      0.14        40
          14       0.96      0.55      0.70        40
          15       0.41      0.93      0.57        40
          16       0.64      0.40      0.49        40
          17       1.00    

In [78]:
from sklearn.ensemble import RandomForestClassifier

class ThreeGramRandomForestPredictor(ThreeGramSVMPredictor):

    def __init__(self, train_path: str, test_path: str):
        super().__init__(train_path, test_path)
        self.n_grams_idx: dict[str, int] = {}
        self.vectors: list[tuple[int, list[float]]] = []
        self.model = RandomForestClassifier(n_estimators=1000)

    @staticmethod
    def get_n_grams(text: str):
        text = ThreeGramPredictor.clean_text(text)
        return [text[i:i + 3] for i in range(0, len(text) - 2) if ' ' not in text[i:i + 3]]

In [79]:
r = ThreeGramRandomForestPredictor('datasets/russian_classics/train_4_10_1000.csv', 'datasets/russian_classics/test_4_40_600.csv')
r.train()
r.test()

              precision    recall  f1-score   support

           0       0.81      0.85      0.83        40
           1       0.78      1.00      0.88        40
           2       0.85      0.72      0.78        40
           3       0.82      0.68      0.74        40

    accuracy                           0.81       160
   macro avg       0.82      0.81      0.81       160
weighted avg       0.82      0.81      0.81       160

F1-score: 0.8079747457462124
[[34  5  0  1]
 [ 0 40  0  0]
 [ 4  2 29  5]
 [ 4  4  5 27]]


In [81]:
list(sorted(zip(r.model.feature_importances_, list(r.n_grams_idx)), key=lambda x: -x[0]))[:30]

[(0.009309169086252834, 'еще'),
 (0.006896891781806129, 'тве'),
 (0.006330846492777131, 'душ'),
 (0.006167807824271111, 'енн'),
 (0.0059950175791715285, 'кот'),
 (0.005068994911029305, 'амо'),
 (0.0049584071307570135, 'льн'),
 (0.004804350383855186, 'год'),
 (0.0044800909024806, 'ств'),
 (0.004455937218906102, 'ьно'),
 (0.0044182431055065415, 'нно'),
 (0.004378168700792956, 'тво'),
 (0.00411100722433722, 'иди'),
 (0.003929786480019471, 'зак'),
 (0.0038859618964381913, 'вом'),
 (0.0038022256764205843, 'нте'),
 (0.0036937662830906727, 'ада'),
 (0.003678940799384179, 'всё'),
 (0.003651570600772305, 'лет'),
 (0.003638395937582947, 'тны'),
 (0.003573474930343697, 'ите'),
 (0.003496545783416789, 'одн'),
 (0.003476148470961268, 'ающ'),
 (0.003400099706462994, 'ени'),
 (0.003360963701971449, 'ани'),
 (0.003251519100712927, 'лек'),
 (0.0032242053359772256, 'дви'),
 (0.0031939444804709484, 'ещё'),
 (0.003129088042530563, 'вид'),
 (0.0031102803239466143, 'ата')]

In [21]:
r = ThreeGramRandomForestPredictor('datasets/russian_classics/train_23_10_1000.csv', 'datasets/russian_classics/test_23_40_1000.csv')
r.train()
r.test()

              precision    recall  f1-score   support

           0       0.37      0.85      0.52        26
           1       0.53      0.88      0.66        40
           2       0.92      0.60      0.73        40
           3       0.43      0.66      0.52        35
           4       0.76      0.72      0.74        40
           5       0.56      0.12      0.20        40
           6       0.67      0.45      0.54        40
           7       0.43      0.38      0.40        40
           8       0.45      0.68      0.54        40
           9       0.51      0.45      0.48        40
          10       0.67      0.95      0.78        40
          11       0.51      0.58      0.55        36
          12       0.63      0.85      0.72        40
          13       0.71      0.50      0.59        40
          14       0.84      0.65      0.73        40
          15       0.71      0.72      0.72        40
          16       0.75      0.60      0.67        40
          17       0.91    

In [61]:
import pymorphy3
morph = pymorphy3.MorphAnalyzer()

class WordsThreeGramsPredictor(BasePredictor):

    def __init__(self, train_path: str, test_path: str):
        super().__init__(train_path, test_path)
        self.n_grams_idx: dict[str, int] = {}
        self.vectors: list[tuple[int, list[float]]] = []
        self.model = RandomForestClassifier(n_estimators=1000)

    @staticmethod
    def clean_text(text: str) -> list[str]:
        return [x for x in re.findall(r'[a-zа-яё]+|\.|,|\-|;|–|:', text.lower()) if x != ' ']

    @staticmethod
    def get_n_grams(text: str):
        text = WordsThreeGramsPredictor.clean_text(text)

        def preprocess(word):
            if word in '+.,-;–:':
                return word
            return morph.parse(word)[0].tag.POS

        grams = []
        for i in range(0, len(text) - 2):
            a, b, c = text[i:i + 3]
            grams.append((preprocess(a), preprocess(b), preprocess(c)))

        return grams

    def vectorize(self, text) -> list[float]:
        assert self.n_grams_idx

        t_grams = self.get_n_grams(text)
        counter = Counter(t_grams)
        vector = [0] * len(self.n_grams_idx)
        for t_gram, idx in self.n_grams_idx.items():
            vector[idx] = counter.get(t_gram, 0)
        total = sum(vector)
        return [x / total for x in vector]

    def train(self):
        all_tgrams = self.get_n_grams(' '.join(row['Content'] for _, row in self.train_dataset.iterrows()))
        self.n_grams_idx = {t_gram: idx for idx, t_gram in enumerate(set(all_tgrams))}

        self.vectors = [
            (row['Author'], self.vectorize(row['Content']))
            for _, row in self.train_dataset.iterrows()
        ]
        self.model.fit([v for _, v in self.vectors], [a for a, _ in self.vectors])

    def predict(self, text: str) -> int:
        vector = self.vectorize(text)
        return self.model.predict([vector])[0]

In [62]:
w = WordsThreeGramsPredictor('datasets/russian_classics/train_4_10_1000.csv', 'datasets/russian_classics/test_4_40_600.csv')
w.train()
w.test()

              precision    recall  f1-score   support

           0       0.79      0.82      0.80        40
           1       0.80      0.90      0.85        40
           2       0.85      0.70      0.77        40
           3       0.78      0.78      0.78        40

    accuracy                           0.80       160
   macro avg       0.80      0.80      0.80       160
weighted avg       0.80      0.80      0.80       160

F1-score: 0.7985150399952831
[[33  4  0  3]
 [ 0 36  3  1]
 [ 5  2 28  5]
 [ 4  3  2 31]]


In [64]:
w = WordsThreeGramsPredictor('datasets/russian_classics/train_23_10_1000.csv', 'datasets/russian_classics/test_23_40_1000.csv')
w.train()
w.test()

              precision    recall  f1-score   support

           0       0.52      0.88      0.66        26
           1       0.38      0.47      0.42        40
           2       0.67      0.72      0.70        40
           3       0.69      0.31      0.43        35
           4       0.91      0.72      0.81        40
           5       0.35      0.88      0.50        40
           6       0.76      0.47      0.58        40
           7       0.65      0.42      0.52        40
           8       0.73      0.80      0.76        40
           9       0.68      0.57      0.62        40
          10       0.71      0.85      0.77        40
          11       0.52      0.39      0.44        36
          12       0.34      0.28      0.31        40
          13       0.71      0.30      0.42        40
          14       0.48      0.50      0.49        40
          15       0.93      0.33      0.48        40
          16       0.86      0.47      0.61        40
          17       0.43    

In [74]:
list(sorted(zip(w.model.feature_importances_, list(w.n_grams_idx)), key=lambda x: -x[0]))[:10]

[(0.0033284187961860343, ('PREP', 'NOUN', '.')),
 (0.0031838514065482717, ('ADJF', 'NOUN', ';')),
 (0.0031719062214566516, ('VERB', 'NOUN', '.')),
 (0.0029930231966530108, (',', 'VERB', 'NPRO')),
 (0.002893573865672614, ('NOUN', ';', 'CONJ')),
 (0.002778986350330607, ('.', 'NOUN', 'VERB')),
 (0.0026163458471558185, ('ADJF', 'NOUN', ',')),
 (0.002571363046133408, ('ADJF', 'NOUN', '.')),
 (0.002554732247852976, ('VERB', '.', 'NOUN')),
 (0.002552396346602851, ('NOUN', '.', 'CONJ'))]