# Метод, основанный на TF-IDF по словам

## Загрузка необходимых модулей

In [None]:
import re
import pandas as pd
import pymorphy2

morph = pymorphy2.MorphAnalyzer()

## Загрузка тренировочного и тестового датасета

In [95]:
train_df = pd.read_csv('../datasets/russian_classics/train_4_10_1000.csv')
print(train_df.shape)

test_df = pd.read_csv('../datasets/russian_classics/test_4_40_600.csv')
print(test_df.shape)

(40, 4)
(160, 4)


In [96]:
def preprocess(text):
    """Возвращает строку из начальных форм слов"""
    text = text.lower()
    words = re.findall('[а-яё]+', text)
    return " ".join([morph.parse(word)[0].normal_form for word in words])

print(preprocess("Привет, как дела, что случилось? Хочешь быть веселей - не грусти!"))

привет как дело что случиться хотеть быть веселеть не грусть


## Предобработка текстов

In [97]:
from tqdm import tqdm

train_texts = []
for text in tqdm(train_df['Content']):
    train_texts.append(preprocess(text))

test_texts = []
for text in tqdm(test_df['Content']):
    test_texts.append(preprocess(text))

100%|██████████| 40/40 [00:10<00:00,  3.65it/s]
100%|██████████| 160/160 [00:24<00:00,  6.48it/s]


In [98]:
train_X, train_y = train_texts, [int(a) for a in train_df['Author']]
test_X, test_y = test_texts, [int(a) for a in test_df['Author']]

## Векторизация с применением TF-IDF и построение деревьев решений

In [106]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

all_texts = train_texts + test_texts
y = train_y + test_y

tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7)
X = tfidfconverter.fit_transform(all_texts).toarray()

X_train, X_test, y_train, y_test = X[:len(train_y)], X[len(train_y):], train_y, test_y

classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train) 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

## Результат

In [108]:
y_pred = classifier.predict(X_test)

In [109]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[26  9  4  1]
 [ 0 40  0  0]
 [ 0 17 23  0]
 [ 2 22  2 14]]
              precision    recall  f1-score   support

           0       0.93      0.65      0.76        40
           1       0.45      1.00      0.62        40
           2       0.79      0.57      0.67        40
           3       0.93      0.35      0.51        40

    accuracy                           0.64       160
   macro avg       0.78      0.64      0.64       160
weighted avg       0.78      0.64      0.64       160

0.64375


## Проверка метода на других датасетах

In [111]:
def test_method(train_datasets: list, test_datasets: list, predict_func):
    for d_test in test_datasets:
        for d_train in train_datasets:
            print("Test: {}, train: {}: {}".format(d_test, d_train, predict_func(d_test, d_train)))

In [115]:
def predict(test_dataset, train_dataset):
    train_df = pd.read_csv(train_dataset)
    test_df = pd.read_csv(test_dataset)
    train_texts = []
    for text in tqdm(train_df['Content']):
        train_texts.append(preprocess(text))
    test_texts = []
    for text in tqdm(test_df['Content']):
        test_texts.append(preprocess(text))
    
    train_X, train_y = train_texts, [int(a) for a in train_df['Author']]
    test_X, test_y = test_texts, [int(a) for a in test_df['Author']]
    all_texts = train_texts + test_texts
    y = train_y + test_y

    tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7)
    X = tfidfconverter.fit_transform(all_texts).toarray()
    X_train, X_test, y_train, y_test = X[:len(train_y)], X[len(train_y):], train_y, test_y

    classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
    classifier.fit(X_train, y_train) 

    y_pred = classifier.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [116]:
train_datasets = [
    '../datasets/russian_classics/train_4_5_600.csv',
    '../datasets/russian_classics/train_4_10_1000.csv'
]

test_datasets = [
    '../datasets/russian_classics/test_4_40_600.csv'
]

In [117]:
test_method(train_datasets, test_datasets, predict)

100%|██████████| 20/20 [00:07<00:00,  2.52it/s]
100%|██████████| 160/160 [00:45<00:00,  3.49it/s]
  0%|          | 0/40 [00:00<?, ?it/s]Test: ../datasets/russian_classics/test_4_40_600.csv, train: ../datasets/russian_classics/train_4_5_600.csv: 0.5625
100%|██████████| 40/40 [00:15<00:00,  2.64it/s]
100%|██████████| 160/160 [00:46<00:00,  3.41it/s]
Test: ../datasets/russian_classics/test_4_40_600.csv, train: ../datasets/russian_classics/train_4_10_1000.csv: 0.64375


In [118]:
train_datasets = [
    '../datasets/russian_classics/train_23_5_600.csv',
    '../datasets/russian_classics/train_23_10_1000.csv'
]

test_datasets = [
    '../datasets/russian_classics/test_23_40_600.csv'
]

In [119]:
test_method(train_datasets, test_datasets, predict)

100%|██████████| 115/115 [00:50<00:00,  2.28it/s]
100%|██████████| 917/917 [05:52<00:00,  2.60it/s]
Test: ../datasets/russian_classics/test_23_40_600.csv, train: ../datasets/russian_classics/train_23_5_600.csv: 0.3249727371864776
100%|██████████| 230/230 [01:26<00:00,  2.65it/s]
100%|██████████| 917/917 [03:38<00:00,  4.21it/s]
Test: ../datasets/russian_classics/test_23_40_600.csv, train: ../datasets/russian_classics/train_23_10_1000.csv: 0.46892039258451473


In [124]:
train_datasets = [
    '../datasets/russian_classics/train_4_5_600.csv',
    '../datasets/russian_classics/train_4_10_1000.csv'
]

test_datasets = [
    '../datasets/russian_classics/test_4_40_1000.csv'
]

In [125]:
test_method(train_datasets, test_datasets, predict)

100%|██████████| 20/20 [00:05<00:00,  3.60it/s]
100%|██████████| 160/160 [00:40<00:00,  3.93it/s]
  0%|          | 0/40 [00:00<?, ?it/s]Test: ../datasets/russian_classics/test_4_40_1000.csv, train: ../datasets/russian_classics/train_4_5_600.csv: 0.5625
100%|██████████| 40/40 [00:17<00:00,  2.25it/s]
100%|██████████| 160/160 [00:45<00:00,  3.50it/s]
Test: ../datasets/russian_classics/test_4_40_1000.csv, train: ../datasets/russian_classics/train_4_10_1000.csv: 0.64375


In [126]:
train_datasets = [
    '../datasets/proza_ru/proza_train.csv',
]

test_datasets = [
    '../datasets/proza_ru/proza_test.csv'
]

test_method(train_datasets, test_datasets, predict)

100%|██████████| 933/933 [10:08<00:00,  1.53it/s]
100%|██████████| 253/253 [02:42<00:00,  1.56it/s]
Test: ../datasets/proza_ru/proza_test.csv, train: ../datasets/proza_ru/proza_train.csv: 0.9011857707509882
