In [1]:
import numpy as np
import pandas as pd
from itertools import product
#from nltk import ngrams
#import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Calculate n grams

In [2]:
alphabet = 'абвгдеёжзийклмнопрстуфхцчьыъшщэюя'
n = 3

def build_n_grams(n):
    combinations = dict()
    for i in range(1,n+1):
        combinations[i] = [''.join(val) for val in [p for p in product(list(alphabet),repeat = i)]]
    return combinations

n_grams = build_n_grams(3)

# Read textes and build dataframe

In [3]:
def build_dataframe(text: str, gramms):
    text_data = {}
    for j in gramms:
        for i in gramms[j]:
            text_data[i] = text.count(i)
    return text_data

In [4]:
def read_authors(dirname, n_grams):
    data = []
    for _dir in os.listdir(dirname):
        _current_dir = os.path.join(dirname,_dir)
        if(os.path.isdir(_current_dir)):
            for _file in os.listdir(_current_dir):
                filepath = os.path.join(_current_dir, _file)
                text = None
                string_data = {'author': _dir, 'name': _file}
                with open(filepath, 'r') as f:
                    try:
                        text = ''.join(filter(str.isalpha, f.read()))
                        string_data['lenght'] = len(text)
                        n_parts = build_dataframe(text, n_grams)
                        string_data.update(n_parts)
                    except UnicodeDecodeError as e:
                        print('Unable to read the text (some encoding errors)')
                        print('\033[91m {} \033[0m'.format(str(e)))
                data.append(string_data)
    return data

In [35]:
def read_text(filepath):
    data = {}
    with open(filepath, 'r') as f:
        text = ''.join(filter(str.isalpha, f.read()))
        data['lenght'] = len(text)
        data.update(build_dataframe(text, n_grams))
    return data

In [116]:
def read_unknown(path):
    data = {}
    for _file in os.listdir(path):
        data[_file] = [read_text(os.path.join(path, _file))]
    return data

In [117]:
predict = read_unknown('./predict')

# Modify data and calculate PDF

In [192]:
data = read_authors('./data', n_grams)
df = pd.DataFrame(data)
unique_authors = df.author.unique()

In [194]:
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,author,lenght,name,а,аа,ааа,ааб,аав,ааг,аад,...,ёёч,ёёш,ёёщ,ёёъ,ёёы,ёёь,ёёэ,ёёю,ёёя,ёёё
0,Толстой,5669,Война и мир,436,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Толстой,14702,После бала,1253,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Толстой,8152,Анна Каренина,699,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Гоголь,16794,Тарас Бульба,1418,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Гоголь,10915,Записки сумасшедшего,957,5,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0


In [71]:
#Unique authors list
unique_authors

array(['Гоголь', 'Толстой', 'Достоевский'], dtype=object)

In [87]:
#Dense probabulity for all combinations
def dense_probability(text_frame):
    return text_frame.div(text_frame.lenght, axis=0)

def calc_ethalon(author, data_frame):
    _df = data_frame.loc[data_frame.author == author].drop(['author', 'name'], axis=1).sum()
    return dense_probability(_df)

In [195]:
#Calculate author ethalon
authors_e = {}
authors_deviation = {}
for author in unique_authors:
    authors_e[author] = calc_ethalon(author, df)
    authors_deviation[author] = {'own' : deviation_own(author, df), 'foreign' : deviation_foreign(author, authors_e[author], df)}

In [178]:
#Calculate distance between text and the ethalon. We need only letters combinations to pass the function arguments
def dist_between_text_ethalon(text, ethalon):
    dist = np.sum(np.absolute(np.array(text - ethalon)))
    return dist
#Calculate the deviation of the own textes from pseudo ethalon. Own text is excluded form the ethalon
def deviation_own(author, data_frame):
    distances = []
    current_text = None
    ethalon_without_current = None
    data_frame = data_frame.loc[data_frame.author == author].drop(['author', 'name'], axis=1)
    for i in data_frame.index:
        current_text = dense_probability(data_frame.loc[[i]])
        ethalon_without_current = dense_probability(data_frame.drop([i], axis=0).sum())
        distances.append(dist_between_text_ethalon(current_text, ethalon_without_current))
    return min(distances)
#Calculate the deviation of the foreign textes from ethalon
def deviation_foreign(author, ethalon, data_frame):
    distances = []
    current_text = None
    data_frame = data_frame.loc[data_frame.author != author].drop(['author', 'name'], axis=1)
    for i in data_frame.index:
        current_text = dense_probability(data_frame.loc[[i]])
        distances.append(dist_between_text_ethalon(current_text, ethalon))
    return max(distances)

In [196]:
authors_deviation

{'Гоголь': {'own': 0.8430963968401642, 'foreign': 1.320334040493846},
 'Толстой': {'own': 0.9851571918315121, 'foreign': 1.3277515494513266},
 'Достоевский': {'own': 0.844524805169581, 'foreign': 1.2458406824108037}}

# Authors prediction

In [197]:
def text_distances(text, ethalons):
    dist = 1
    data_dist = {}
    for author in ethalons:
        dist = dist_between_text_ethalon(authors_e[author],dense_probability(text))
        data_dist[author] = dist
    return data_dist
def predict_text(text, ethalons, max_dist_own = 1, min_dist_foreign = 1):
    distances = text_distances(text, ethalons)
    return min(distances, key = lambda x: distances.get(x) )
        

In [200]:
predictions = {}
for i in predict:
    predictions[i] = predict_text(pd.DataFrame(predict[i]),authors_e)

In [201]:
predictions

{'Советские помещики - Гоголь': 'Гоголь',
 'Слабое сердце - Достоевский': 'Достоевский',
 'Петербургские сновидения - Достоевский': 'Достоевский',
 'Ползунков - Достоевский': 'Достоевский',
 'Повесть о том - Гоголь': 'Гоголь'}