In [40]:
import pandas as pd
from sklearn.metrics import classification_report, f1_score, confusion_matrix


class BasePredictor:

    def __init__(self, train_path: str, test_path: str):
        self.train_dataset = pd.read_csv(train_path)
        self.test_dataset = pd.read_csv(test_path)

    def train(self) -> None:
        raise NotImplementedError

    def predict(self, text: str) -> int:
        raise NotImplementedError

    def test(self):
        correct_predictions = self.test_dataset['Author']
        predictions = [
            self.predict(row['Content'])
            for _, row in self.test_dataset.iterrows()
        ]
        print(classification_report(correct_predictions, predictions))
        print('F1-score:', f1_score(correct_predictions, predictions, average='macro'))
        print(confusion_matrix(correct_predictions, predictions))



In [55]:
from collections import defaultdict
import zlib


class ZipPredictor(BasePredictor):

    def __init__(self, train_path: str, test_path: str):
        super().__init__(train_path, test_path)
        self.concatinated: dict[int, str] = defaultdict(str)

    def _build_concatinated(self):
        for _, row in self.train_dataset.iterrows():
            self.concatinated[int(row['Author'])] += row['Content']

    def train(self) -> int:
        self._build_concatinated()

    def predict(self, text: str) -> int:
        compression = {}
        for author_id, big_text in self.concatinated.items():
            big_text_compression = len(zlib.compress((big_text).encode('utf-8'))) / len(big_text)
            concat_text_compression = len(zlib.compress((big_text + text).encode('utf-8'))) / len(big_text + text)
            compression_delta =  concat_text_compression - big_text_compression  # ищем, где качество сжатия ухудшилось меньше всего
            compression[author_id] = compression_delta

        return min(compression, key=lambda x: compression[x])



In [56]:
z = ZipPredictor('datasets/russian_classics/train_4_10_1000.csv', 'datasets/russian_classics/test_4_40_600.csv')
z.train()
z.test()

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        40
           1       1.00      0.07      0.14        40
           2       0.45      0.82      0.58        40
           3       0.43      0.90      0.58        40

    accuracy                           0.45       160
   macro avg       0.47      0.45      0.33       160
weighted avg       0.47      0.45      0.33       160

F1-score: 0.3260627103678575
[[ 0  0 23 17]
 [ 0  3 13 24]
 [ 0  0 33  7]
 [ 0  0  4 36]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [136]:
from collections import defaultdict, Counter
import re

class ThreeGramPredictor(BasePredictor):

    def __init__(self, train_path: str, test_path: str):
        super().__init__(train_path, test_path)
        self.n_grams_idx = []
        self.n_gram_vectors: dict[int, list[float]] = {}

    @staticmethod
    def clean_text(text: str):
        return ' '.join(re.findall(r'[a-zа-яё]+', text.lower()))

    @staticmethod
    def get_n_grams(text: str):
        text = ThreeGramPredictor.clean_text(text)
        return [text[i:i + 3] for i in range(0, len(text) - 2)]

    def vectorize(self, text) -> list[float]:
        assert self.n_grams_idx

        t_grams = self.get_n_grams(text)
        counter = Counter(t_grams)
        vector = [0] * len(self.n_grams_idx)
        for t_gram, idx in self.n_grams_idx.items():
            vector[idx] = counter.get(t_gram, 0)
        total = sum(vector)
        return [x / total for x in vector]

    def train(self):
        all_tgrams = self.get_n_grams(' '.join(row['Content'] for _, row in self.train_dataset.iterrows()))
        self.n_grams_idx = {t_gram: idx for idx, t_gram in enumerate(set(all_tgrams))}

        concatinated = defaultdict(str)
        for _, row in self.train_dataset.iterrows():
            concatinated[int(row['Author'])] += row['Content']

        for author_id, text in concatinated.items():
            self.n_gram_vectors[author_id] = self.vectorize(text)

    @staticmethod
    def cosine_distance(v1: list[float], v2: list[float]) -> float:
        return sum((a - b) ** 2 for a, b in zip(v1, v2)) ** 0.5

    def predict(self, text: str) -> int:
        n_grams = self.get_n_grams(text)
        counter = Counter(n_grams)
        vector = [0] * len(self.n_grams_idx)
        for tgram, idx in counter.items():
            vector[idx] = counter[tgram]
        total = sum(vector)
        vector = [x / total for x in vector]

        distances = {
            author_id: self.cosine_distance(vector, author_vector)
            for author_id, author_vector in self.n_gram_vectors.items()
        }

        return min(distances, key=lambda x: distances[x])




In [137]:
t = ThreeGramPredictor('datasets/russian_classics/train_4_10_1000.csv', 'datasets/russian_classics/test_4_40_600.csv')
t.train()
t.test()

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        40
           1       0.00      0.00      0.00        40
           2       0.25      1.00      0.40        40
           3       0.00      0.00      0.00        40

    accuracy                           0.25       160
   macro avg       0.06      0.25      0.10       160
weighted avg       0.06      0.25      0.10       160

F1-score: 0.1
[[ 0  0 40  0]
 [ 0  0 40  0]
 [ 0  0 40  0]
 [ 0  0 40  0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [132]:
from collections import defaultdict, Counter
import re
from sklearn.svm import SVC


class ThreeGramSVMPredictor(BasePredictor):

    def __init__(self, train_path: str, test_path: str):
        super().__init__(train_path, test_path)
        self.n_grams_idx: dict[str, int] = {}
        self.vectors: list[tuple[int, list[float]]] = []
        self.model = SVC()

    @staticmethod
    def clean_text(text: str):
        return ' '.join(re.findall(r'[a-zа-яё]+', text.lower()))

    @staticmethod
    def get_n_grams(text: str):
        text = ThreeGramPredictor.clean_text(text)
        return [text[i:i + 3] for i in range(0, len(text) - 2)]

    def vectorize(self, text) -> list[float]:
        assert self.n_grams_idx

        t_grams = self.get_n_grams(text)
        counter = Counter(t_grams)
        vector = [0] * len(self.n_grams_idx)
        for t_gram, idx in self.n_grams_idx.items():
            vector[idx] = counter.get(t_gram, 0)
        total = sum(vector)
        return [x / total for x in vector]

    def train(self):
        all_tgrams = self.get_n_grams(' '.join(row['Content'] for _, row in self.train_dataset.iterrows()))
        self.n_grams_idx = {t_gram: idx for idx, t_gram in enumerate(set(all_tgrams))}

        self.vectors = [
            (row['Author'], self.vectorize(row['Content']))
            for _, row in self.train_dataset.iterrows()
        ]
        self.model.fit([v for _, v in self.vectors], [a for a, _ in self.vectors])

    def predict(self, text: str) -> int:
        vector = self.vectorize(text)
        return self.model.predict([vector])[0]




In [133]:
s = ThreeGramSVMPredictor('datasets/russian_classics/train_4_10_1000.csv', 'datasets/russian_classics/test_4_40_600.csv')
s.train()
s.test()

              precision    recall  f1-score   support

           0       0.95      0.88      0.91        40
           1       0.75      0.38      0.50        40
           2       0.97      0.70      0.81        40
           3       0.50      0.93      0.65        40

    accuracy                           0.72       160
   macro avg       0.79      0.72      0.72       160
weighted avg       0.79      0.72      0.72       160

F1-score: 0.7174519797517509
[[35  2  0  3]
 [ 0 15  0 25]
 [ 2  1 28  9]
 [ 0  2  1 37]]


In [139]:
s = ThreeGramSVMPredictor('datasets/russian_classics/train_23_10_1000.csv', 'datasets/russian_classics/test_23_40_1000.csv')
s.train()
s.test()

              precision    recall  f1-score   support

           0       0.42      0.88      0.57        26
           1       0.33      0.25      0.29        40
           2       0.40      0.53      0.46        40
           3       0.21      0.89      0.34        35
           4       1.00      0.20      0.33        40
           5       0.71      0.25      0.37        40
           6       0.47      0.50      0.48        40
           7       0.28      0.20      0.23        40
           8       0.78      0.53      0.63        40
           9       1.00      0.10      0.18        40
          10       0.73      0.88      0.80        40
          11       0.39      0.61      0.48        36
          12       0.91      0.72      0.81        40
          13       1.00      0.07      0.14        40
          14       0.96      0.55      0.70        40
          15       0.41      0.93      0.57        40
          16       0.64      0.40      0.49        40
          17       1.00    