# Метод, основанный на использовании слов закрытых грамматичесикх классов

## Загрузка необходимых модулей

In [2]:
import re
import pandas as pd
import pymorphy2

morph = pymorphy2.MorphAnalyzer()

In [3]:
with open('../data/functional_words/vvodn') as f:
    VVODN_LIST = f.read().split()  # список вводных слов русского языка

with open('../data/functional_words/souz') as f:
    SOUZ_LIST = f.read().split() # список союзов русского языка

with open('../data/functional_words/chast') as f:
    CHAST_LIST = f.read().split()  #  список частиц русского языка

In [4]:
def vectorize(text):
    text = re.findall('[а-яё]+', text.lower())
    vvodn = dict.fromkeys(VVODN_LIST, 0)
    souz = dict.fromkeys(SOUZ_LIST, 0)
    chast = dict.fromkeys(CHAST_LIST, 0)
    for word in text:
        if word in vvodn:
            vvodn[word] += 1
        if word in souz:
            souz[word] += 1
        if word in chast:
            chast[word] += 1

    res = []
    for key in sorted(vvodn):
        res.append(vvodn[key] / max(1, sum(vvodn.values())))
    for key in sorted(souz):
        res.append(souz[key] / max(1, sum(vvodn.values())))
    for key in sorted(chast):
        res.append(chast[key] / max(1, sum(vvodn.values())))

    return res

## Загрузка тренировочного и тестового датасета

In [49]:
train_df = pd.read_csv('../datasets/russian_classics/train_4_10_1000.csv')
print(train_df.shape)

test_df = pd.read_csv('../datasets/russian_classics/test_4_40_600.csv')
print(test_df.shape)

(40, 4)
(160, 4)


## Предобработка текстов

In [50]:
from tqdm import tqdm

train_texts = train_df['Content']
test_texts = test_df['Content']

In [51]:
train_X, train_y = [vectorize(text) for text in train_texts], [int(a) for a in train_df['Author']]
test_X, test_y = [vectorize(text) for text in test_texts], [int(a) for a in test_df['Author']]

In [75]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

all_texts = list(train_texts) + list(test_texts)
y = train_y + test_y

X = [vectorize(text) for text in all_texts]

X_train, X_test, y_train, y_test = X[:len(train_y)], X[len(train_y):], train_y, test_y

## Построение деревьев решений

In [None]:
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train) 

## Результаты

In [53]:
y_pred = classifier.predict(X_test)

In [54]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[24  3  4  9]
 [ 4 31  0  5]
 [ 2  3 26  9]
 [13  3  9 15]]
              precision    recall  f1-score   support

           0       0.56      0.60      0.58        40
           1       0.78      0.78      0.78        40
           2       0.67      0.65      0.66        40
           3       0.39      0.38      0.38        40

    accuracy                           0.60       160
   macro avg       0.60      0.60      0.60       160
weighted avg       0.60      0.60      0.60       160

0.6


## Проверка метода на других датасетах

In [63]:
def test_method(train_datasets: list, test_datasets: list, predict_func):
    for d_test in test_datasets:
        for d_train in train_datasets:
            print("Test: {}, train: {}: {}".format(d_test, d_train, predict_func(d_test, d_train)))
    

In [67]:
def predict(test_dataset, train_dataset):
    train_df = pd.read_csv(train_dataset)
    test_df = pd.read_csv(test_dataset)
    train_texts = train_df['Content']
    test_texts = test_df['Content']
    train_X, train_y = [vectorize(text) for text in train_texts], [int(a) for a in train_df['Author']]
    test_X, test_y = [vectorize(text) for text in test_texts], [int(a) for a in test_df['Author']]
    all_texts = list(train_texts) + list(test_texts)
    y = train_y + test_y

    X = [vectorize(text) for text in all_texts]
    X_train, X_test, y_train, y_test = X[:len(train_y)], X[len(train_y):], train_y, test_y

    classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
    classifier.fit(X_train, y_train) 

    y_pred = classifier.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [68]:
train_datasets = [
    '../datasets/russian_classics/train_4_5_600.csv',
    '../datasets/russian_classics/train_4_10_1000.csv'
]

test_datasets = [
    '../datasets/russian_classics/test_4_40_600.csv'
]

In [69]:
test_method(train_datasets, test_datasets, predict)

Test: ../datasets/russian_classics/test_4_40_600.csv, train: ../datasets/russian_classics/train_4_5_600.csv: 0.49375
Test: ../datasets/russian_classics/test_4_40_600.csv, train: ../datasets/russian_classics/train_4_10_1000.csv: 0.6


In [70]:
train_datasets = [
    '../datasets/russian_classics/train_23_5_600.csv',
    '../datasets/russian_classics/train_23_10_1000.csv'
]

test_datasets = [
    '../datasets/russian_classics/test_23_40_600.csv'
]

In [71]:
test_method(train_datasets, test_datasets, predict)

Test: ../datasets/russian_classics/test_23_40_600.csv, train: ../datasets/russian_classics/train_23_5_600.csv: 0.18538713195201745
Test: ../datasets/russian_classics/test_23_40_600.csv, train: ../datasets/russian_classics/train_23_10_1000.csv: 0.29443838604143946


In [72]:
train_datasets = [
    '../datasets/russian_classics/train_4_5_600.csv',
    '../datasets/russian_classics/train_4_10_1000.csv'
]

test_datasets = [
    '../datasets/russian_classics/test_4_40_1000.csv'
]

In [73]:
test_method(train_datasets, test_datasets, predict)

Test: ../datasets/russian_classics/test_4_40_1000.csv, train: ../datasets/russian_classics/train_4_5_600.csv: 0.49375
Test: ../datasets/russian_classics/test_4_40_1000.csv, train: ../datasets/russian_classics/train_4_10_1000.csv: 0.6


In [74]:
train_datasets = [
    '../datasets/proza_ru/proza_train.csv',
]

test_datasets = [
    '../datasets/proza_ru/proza_test.csv'
]

test_method(train_datasets, test_datasets, predict)

Test: ../datasets/proza_ru/proza_test.csv, train: ../datasets/proza_ru/proza_train.csv: 0.8181818181818182
