In [1]:
import pandas as pd
import pickle
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from statistics import mean
from feature_functions import *
from natasha_func import get_doc
from natasha_func import get_words
from natasha_func import get_lemma_words
from natasha_func import get_sents
from collections import defaultdict
from natasha_func import get_lemma_part_speech

In [2]:
dataset = pickle.load(open("dataset.pkl", "rb"))

In [3]:
def dataset_target(dataset):
    authors = list(set(dataset['authors']))
    dict_authors = dict()
    reverse_dict_authors = dict()
    for index in range(len(authors)):
        dict_authors[authors[index]] = index
        reverse_dict_authors[index] = authors[index]
    target = []
    for author in dataset["authors"]:
        target += [dict_authors[author]]
    return target

In [4]:
from features.features_lemma_part_speech import get_feature_lemma_part_speech
from features.features_with_word_count import get_feature_symbol
from features.features_with_sentence_list import get_feature_sents
from features.features_words import get_feature_word

def get_list_of_features(doc):
    features_symbol = get_feature_symbol(doc)
    features_words = get_feature_word(doc)
    features_sents = get_feature_sents(doc)
    features_lemma_part_speech = get_feature_lemma_part_speech(doc)
    list_of_features = features_sents + features_words + features_symbol + features_lemma_part_speech
    return list_of_features

In [5]:
from features.features_words import func_words
from features.features_with_sentence_list import FUNC_LIST
from features.features_with_word_count import func_chars
from features.features_lemma_part_speech import func_lemma_part_speech
from tqdm import tqdm
all_func = FUNC_LIST + func_words + func_chars + func_lemma_part_speech

In [6]:
import json
import compress_pickle
import os


observations = []
DATA_PATH = "C:\\Users\\mashk\\PycharmProjects\\texts_attribution\\Данные"
with open(os.path.join(DATA_PATH, "docs_info.json"), "r") as f_docs_info:
    docs_info = json.load(f_docs_info)
    authors = docs_info["authors"]
    file_names = docs_info["file_name"]
    for i, file_name in enumerate(tqdm(file_names)):
        author = authors[i]
        doc = compress_pickle.load(os.path.join(DATA_PATH, "Обработанные", file_name))

        features = get_list_of_features(doc)
        observation = features + [author]
        observations.append(observation)

df_features = pd.DataFrame(observations, columns = all_func + ["target"])

  3%|▎         | 17/516 [00:09<04:29,  1.85it/s]


KeyboardInterrupt: 

In [None]:
df_features.to_csv('out.csv', index=False)

In [7]:
def validation(df, model):
    accuracy = []
    for shift in range(0, 10, 2):
        authors_seen = []
        test_ixs = []
        for ixs in range(len(df)):
            aut = df['targets'][ixs]
            if authors_seen.count(aut) < 2:
                authors_seen += [aut]
                test_ixs += [ixs + shift]
        train_ixs = list(set(range(len(df))) - set(test_ixs))
        y_test = df.iloc[test_ixs]['target']
        x_test = df.iloc[test_ixs].drop(columns=["target"])
        y_train = df.iloc[train_ixs]['target']
        x_train = df.iloc[train_ixs].drop(columns=["target"])

        rfc = model
        rfc.fit(x_train, y_train)
        accuracy.append(accuracy_score(y_test, rfc.predict(x_test)))
    return accuracy

print( mean(validation(df, ExtraTreesClassifier())))

NameError: name 'df' is not defined

In [None]:
from catboost import CatBoostClassifier

validation(df, CatBoostClassifier())

In [15]:
def cross_val(df, model):
    accuracy = []
    train_last, test_last = 0, 0
    for shift in range(0, 10, 2):
            authors_seen = []
            test_ixs = []
            for ixs in range(len(df)):
                aut = df['targets'][ixs]
                if authors_seen.count(aut) < 2:
                    authors_seen += [aut]
                    test_ixs += [ixs + shift]
            train_ixs = list(set(range(len(df))) - set(test_ixs))
            if shift != 8:
                y_test = df.iloc[test_ixs]['targets']
                x_test = df.iloc[test_ixs].drop(columns=["targets"])
                y_train = df.iloc[train_ixs]['targets']
                x_train = df.iloc[train_ixs].drop(columns=["targets"])

                rfc = model
                rfc.fit(x_train, y_train)
                accuracy.append(accuracy_score(y_test, rfc.predict(x_test)))
            else:
                train_last = train_ixs
                test_last = test_ixs

    return accuracy, train_last, test_last

df = pd.read_csv('first_dataset.csv')
vals, train_ixs, test_ixs = cross_val(df, ExtraTreesClassifier())

In [19]:
df.iloc[test_ixs]

Unnamed: 0,capitalized_words_count_without_start_of_sentences,most_common_first_letter_in_sentences,avg_length_of_sentence_by_letters,count_upper_words,frequency_of_longest_word,frequency_of_a,freq_of_t,freq_of_n,freq_of_e,freq_of_o,...,avg_syllable_per_adjective,avg_syllable_per_verb,avg_syllable_per_noun,freq_word_from_verbs,freq_word_from_noun,average_lenght_of_verbs,average_lenght_of_adverbs,average_lenght_of_adjectives,average_lenght_of_nouns,targets
8,316,9,81.558891,32,74.045455,2365,0.042074,0.050465,0.067414,0.086395,...,2.812971,2.784822,2.471574,0.018252,0.021137,8.045149,5.765748,7.684766,6.262391,Л. Н. Андреев
9,540,15,76.046729,135,226.670103,8064,0.046091,0.050557,0.062249,0.085457,...,2.905146,2.642650,2.433882,0.021378,0.014527,7.714700,5.614793,7.797679,6.117069,Л. Н. Андреев
18,1405,14,65.733168,3,219.059322,9708,0.049846,0.045338,0.067352,0.085255,...,2.816133,2.541258,2.544352,0.046593,0.027441,7.339212,5.404243,7.452550,6.318230,Н. Г. Чернышевский
19,5437,14,113.186867,3,599.416107,32876,0.052479,0.052078,0.067669,0.092631,...,2.963515,2.668595,2.749985,0.037105,0.027977,7.697164,5.753523,7.835741,6.791797,Н. Г. Чернышевский
28,714,2,102.996337,139,85.862385,3774,0.048906,0.049257,0.066606,0.085189,...,2.970925,2.919721,2.820275,0.018325,0.020795,8.287086,5.950935,8.043172,6.888600,В. Ф. Ходасевич
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,320,20,109.870879,2,76.774194,3080,0.039823,0.048140,0.072455,0.080941,...,2.976398,2.867863,2.461623,0.018760,0.020285,8.221044,5.504373,7.818634,6.058662,В. А. Жуковский
504,221,14,121.763441,10,63.978723,2554,0.043815,0.048942,0.069124,0.087206,...,3.046990,2.839250,2.487450,0.015779,0.021797,8.121302,5.602305,8.049927,6.284676,Д. С. Мережковский
505,22970,2,82.520919,919,1202.259067,71162,0.045295,0.044892,0.066273,0.075470,...,2.556667,2.520815,2.506780,0.056565,0.019865,7.342177,5.331973,7.155396,6.329431,Д. С. Мережковский
514,339,16,112.157447,68,56.845238,1941,0.045323,0.049884,0.067166,0.085121,...,3.143541,2.687578,2.554808,0.025094,0.021154,7.764115,5.919255,8.126794,6.431731,М. А. Кузмин
