# Метод, основанный на структурных триграммах

## Загрузка необходимых модулей

In [None]:
import re
import pandas as pd
import pymorphy2
import nltk

# Single shared pymorphy2 analyzer instance; syntax_encode() uses it to tag every token.
morph = pymorphy2.MorphAnalyzer()

In [2]:
# A token is a run of word characters, "...", "!?", or one punctuation mark
# from the set : - , . ; ( ) ! ?  Alternatives are tried in this order, so
# "..." and "!?" are matched as a whole before their single characters.
_TOKEN_RE = re.compile(r'\w+|\.\.\.|\!\?|[:\-,\.;\(\)\!\?]')


def syntax_encode(text):
    """Encode a text as a list of part-of-speech tags and punctuation marks.

    The text is lower-cased and tokenized with ``_TOKEN_RE``; characters that
    match none of its alternatives are dropped. Each token is then replaced
    by a label taken from the first parse returned by the module-level
    ``morph`` analyzer.

    Args:
        text: The string to encode.

    Returns:
        A list of strings, one per token, in text order. A token whose label
        is ``'PNCT'`` is kept verbatim (e.g. ``','`` or ``'...'``); any other
        token is replaced by its POS, or by the full tag string when the
        parse has no POS.
    """
    result = []
    for token in _TOKEN_RE.findall(text.lower()):
        # Parse once per token; only the first (top-ranked) parse is used.
        tag = morph.parse(token)[0].tag
        pos = str(tag.POS or tag)
        # Punctuation is kept as the literal token, not collapsed into 'PNCT'.
        result.append(token if pos == 'PNCT' else pos)
    return result

# Sample sentence for a quick visual check of the encoder; the notebook
# displays the list returned by the call below.
test_text = "Один курьер огромное солнце по городу нес; одежда горела на нём, и ночь стала днём рядом с ним..."
syntax_encode(test_text)

['ADJF',
 'NOUN',
 'ADJF',
 'NOUN',
 'PREP',
 'NOUN',
 'VERB',
 ';',
 'NOUN',
 'ADJS',
 'PREP',
 'NPRO',
 ',',
 'CONJ',
 'NOUN',
 'VERB',
 'ADVB',
 'NOUN',
 'PREP',
 'NPRO',
 '...']

In [3]:
def tokenize(text):
    """Return the structural trigrams of a text as a list of 3-tuples.

    The text is split into sentences with ``nltk.sent_tokenize``; every
    sentence is encoded with ``syntax_encode`` and cut into consecutive
    trigrams. Trigrams are built per sentence, so none of them spans a
    sentence boundary, and a sentence with fewer than three labels
    contributes nothing.
    """
    trigrams = []
    for sentence in nltk.sent_tokenize(text):
        labels = syntax_encode(sentence)
        trigrams.extend(nltk.ngrams(labels, 3))
    return trigrams

# Show the trigrams produced for the sample sentence.
tokenize(test_text)

[('ADJF', 'NOUN', 'ADJF'),
 ('NOUN', 'ADJF', 'NOUN'),
 ('ADJF', 'NOUN', 'PREP'),
 ('NOUN', 'PREP', 'NOUN'),
 ('PREP', 'NOUN', 'VERB'),
 ('NOUN', 'VERB', ';'),
 ('VERB', ';', 'NOUN'),
 (';', 'NOUN', 'ADJS'),
 ('NOUN', 'ADJS', 'PREP'),
 ('ADJS', 'PREP', 'NPRO'),
 ('PREP', 'NPRO', ','),
 ('NPRO', ',', 'CONJ'),
 (',', 'CONJ', 'NOUN'),
 ('CONJ', 'NOUN', 'VERB'),
 ('NOUN', 'VERB', 'ADVB'),
 ('VERB', 'ADVB', 'NOUN'),
 ('ADVB', 'NOUN', 'PREP'),
 ('NOUN', 'PREP', 'NPRO'),
 ('PREP', 'NPRO', '...')]

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the structural trigrams returned by tokenize(): each trigram
# tuple is one vocabulary term. At most 1500 terms are kept; a term is dropped
# when it occurs in fewer than 5 documents or in more than 70% of them.
# NOTE(review): lowercase is left at its default, so the text appears to be
# lower-cased before tokenize() runs nltk.sent_tokenize on it — confirm this
# does not hurt sentence splitting.
vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    max_features=1500,
    min_df=5,
    max_df=0.7,
)

## Загрузка тренировочного и тестового датасета

In [6]:
# Load the training split and print its (rows, columns) shape.
train_df = pd.read_csv('../datasets/russian_classics/train_4_10_1000.csv')
print(train_df.shape)

# Load the test split and print its (rows, columns) shape.
test_df = pd.read_csv('../datasets/russian_classics/test_4_40_600.csv')
print(test_df.shape)

(40, 4)
(160, 4)


## Предобработка текстов

In [7]:
from tqdm import tqdm

# Raw text column of each split; these are the documents fed to the vectorizer.
train_texts = train_df['Content']
test_texts = test_df['Content']

In [8]:
# Plain-list copies of the texts plus the author labels cast to int.
# train_X / test_X are not referenced again in the cells shown here; the
# later cells use train_texts / test_texts together with train_y / test_y.
train_X, train_y = list(train_texts), [int(a) for a in train_df['Author']]
test_X, test_y = list(test_texts), [int(a) for a in test_df['Author']]

## Построение случайного леса (ансамбля деревьев решений)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# Training documents first, test documents after them, so the feature matrix
# can be split back by position.
all_texts = list(train_texts) + list(test_texts)
y = train_y + test_y

# NOTE(review): the TF-IDF vocabulary and IDF weights are fitted on train and
# test texts together, so test documents influence the features — confirm
# this is intended.
tfidfconverter = vectorizer
X = tfidfconverter.fit_transform(all_texts).toarray()

# The first len(train_y) rows belong to the training split, the rest to test.
X_train = X[:len(train_y)]
X_test = X[len(train_y):]
y_train = train_y
y_test = test_y

# Fixed random_state keeps the forest reproducible between runs.
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

## Результат

In [11]:
# Predicted author label for every test document.
y_pred = classifier.predict(X_test)

In [12]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Confusion matrix: rows are true authors, columns are predicted authors.
print(confusion_matrix(y_test,y_pred))
# Per-author precision / recall / F1 and support.
print(classification_report(y_test,y_pred))
# Overall fraction of correctly attributed test documents.
print(accuracy_score(y_test, y_pred))

[[23  5  4  8]
 [ 0 40  0  0]
 [ 4 11 25  0]
 [ 2  7  3 28]]
              precision    recall  f1-score   support

           0       0.79      0.57      0.67        40
           1       0.63      1.00      0.78        40
           2       0.78      0.62      0.69        40
           3       0.78      0.70      0.74        40

    accuracy                           0.73       160
   macro avg       0.75      0.73      0.72       160
weighted avg       0.75      0.72      0.72       160

0.725


## Проверка метода на других датасетах

In [13]:
def test_method(train_datasets: list, test_datasets: list, predict_func):
    for d_test in test_datasets:
        for d_train in train_datasets:
            print("Test: {}, train: {}: {}".format(d_test, d_train, predict_func(d_test, d_train)))

In [14]:
def predict(test_dataset, train_dataset):
    """Train a random forest on one CSV and return its accuracy on another.

    Both CSV files must contain a ``Content`` column with the texts and an
    ``Author`` column whose values can be converted to ``int``.

    Args:
        test_dataset: Path of the CSV with the evaluation texts (anything
            ``pd.read_csv`` accepts).
        train_dataset: Path of the CSV with the training texts.

    Returns:
        The value of ``accuracy_score`` for the predictions on the test texts.
    """
    train_df = pd.read_csv(train_dataset)
    test_df = pd.read_csv(test_dataset)

    train_texts = list(train_df['Content'])
    test_texts = list(test_df['Content'])
    y_train = [int(a) for a in train_df['Author']]
    y_test = [int(a) for a in test_df['Author']]

    # The module-level vectorizer is re-fitted on every call.
    # NOTE(review): it is fitted on train and test texts together, so test
    # documents influence the vocabulary and IDF weights. Kept unchanged so
    # the accuracies already reported stay reproducible — consider fitting
    # on the training texts only.
    X = vectorizer.fit_transform(train_texts + test_texts).toarray()

    # Training rows come first in X, test rows follow.
    n_train = len(y_train)
    X_train, X_test = X[:n_train], X[n_train:]

    # Fixed random_state keeps the forest reproducible between runs.
    classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
    classifier.fit(X_train, y_train)

    return accuracy_score(y_test, classifier.predict(X_test))

In [15]:
# Training CSVs to compare; each one is paired with every test CSV below.
train_datasets = [
    '../datasets/russian_classics/train_4_5_600.csv',
    '../datasets/russian_classics/train_4_10_1000.csv'
]

# Test CSV(s) the trained models are scored on.
test_datasets = [
    '../datasets/russian_classics/test_4_40_600.csv'
]

In [16]:
# Print the accuracy of predict() for every (test, train) pair defined above.
test_method(train_datasets, test_datasets, predict)

Test: ../datasets/russian_classics/test_4_40_600.csv, train: ../datasets/russian_classics/train_4_5_600.csv: 0.60625
Test: ../datasets/russian_classics/test_4_40_600.csv, train: ../datasets/russian_classics/train_4_10_1000.csv: 0.725


In [17]:
# Same experiment on the "23" files.
# NOTE(review): presumably the leading number in the file name is the number
# of authors — confirm against the dataset description.
train_datasets = [
    '../datasets/russian_classics/train_23_5_600.csv',
    '../datasets/russian_classics/train_23_10_1000.csv'
]

test_datasets = [
    '../datasets/russian_classics/test_23_40_600.csv'
]

In [18]:
# Score both "23" training sets against the "23" test set.
test_method(train_datasets, test_datasets, predict)

Test: ../datasets/russian_classics/test_23_40_600.csv, train: ../datasets/russian_classics/train_23_5_600.csv: 0.31624863685932386
Test: ../datasets/russian_classics/test_23_40_600.csv, train: ../datasets/russian_classics/train_23_10_1000.csv: 0.3914940021810251


In [19]:
# Same two training sets as in the first experiment, scored against the
# test_4_40_1000.csv file instead of test_4_40_600.csv.
train_datasets = [
    '../datasets/russian_classics/train_4_5_600.csv',
    '../datasets/russian_classics/train_4_10_1000.csv'
]

test_datasets = [
    '../datasets/russian_classics/test_4_40_1000.csv'
]

In [20]:
# NOTE(review): the accuracies printed for this run (0.60625 and 0.725) are
# identical to those of the test_4_40_600.csv run — verify that the two test
# files really differ.
test_method(train_datasets, test_datasets, predict)

Test: ../datasets/russian_classics/test_4_40_1000.csv, train: ../datasets/russian_classics/train_4_5_600.csv: 0.60625
Test: ../datasets/russian_classics/test_4_40_1000.csv, train: ../datasets/russian_classics/train_4_10_1000.csv: 0.725


In [21]:
# Repeat the evaluation on the proza_ru corpus: one training CSV, one test CSV.
train_datasets = [
    '../datasets/proza_ru/proza_train.csv',
]

test_datasets = [
    '../datasets/proza_ru/proza_test.csv'
]

# Print the accuracy of predict() for the single (test, train) pair.
test_method(train_datasets, test_datasets, predict)

Test: ../datasets/proza_ru/proza_test.csv, train: ../datasets/proza_ru/proza_train.csv: 0.8695652173913043
