# Стастистические методы

## Загрузка необходимых модулей

In [2]:
import re
import pandas as pd
import pymorphy2
import nltk
import tqdm 
from sklearn.feature_extraction.text import TfidfVectorizer

morph = pymorphy2.MorphAnalyzer()

  return f(*args, **kwds)


In [3]:
def syntax_encode(text):
    text = text.lower()
    text = re.findall('\w+|\.\.\.|\!\?|[:\-,\.;\(\)\!\?]', text)
    result = []
    for token in text:
        pos = str(morph.parse(token)[0].tag.POS or morph.parse(token)[0].tag)
        if pos == 'PNCT':
            result.append(token)
            continue 
        result.append(pos)
    return result

def tokenize(text):
    res = []
    for sent in nltk.sent_tokenize(text):
        t = syntax_encode(sent)
        s = nltk.ngrams(t, 3)
        res += s
    return res

vectorizer = TfidfVectorizer(analyzer='word', tokenizer=tokenize, max_features=1000, min_df=5, max_df=0.7)

print(syntax_encode("Маама мыла раму. Папа ел пельмени!"))
print(tokenize("Маама мыла раму. Папа ел пельмени!"))

['NOUN', 'NOUN', 'NOUN', '.', 'NOUN', 'VERB', 'NOUN', '!']
[('NOUN', 'NOUN', 'NOUN'), ('NOUN', 'NOUN', '.'), ('NOUN', 'VERB', 'NOUN'), ('VERB', 'NOUN', '!')]


In [18]:
with open('../data/functional_words/vvodn') as f:
    VVODN_LIST = f.read().split()  # список вводных слов русского языка

with open('../data/functional_words/souz') as f:
    SOUZ_LIST = f.read().split() # список союзов русского языка

with open('../data/functional_words/chast') as f:
    CHAST_LIST = f.read().split()  #  список частиц русского языка

with open('../data/functional_words/judge') as f:
    JUDGE_LIST = f.read().split()  # список оценочных слов русского языка

def vectorize(text):
    text = re.findall('[а-яё]+', text.lower())
    vvodn = dict.fromkeys(VVODN_LIST, 0)
    souz = dict.fromkeys(SOUZ_LIST, 0)
    chast = dict.fromkeys(CHAST_LIST, 0)
    judge = dict.fromkeys(JUDGE_LIST, 0)
    for word in text:
        if word in vvodn:
            vvodn[word] += 1
        if word in souz:
            souz[word] += 1
        if word in chast:
            chast[word] += 1
        if morph.parse(word)[0].normal_form in judge:
            judge[morph.parse(word)[0].normal_form] += 1

    res_v = []
    res_s = []
    res_c = []
    res_j = []
    for key in sorted(vvodn):
        res_v.append(vvodn[key] / max(1, sum(vvodn.values())))
    for key in sorted(souz):
        res_s.append(souz[key] / max(1, sum(souz.values())))
    for key in sorted(chast):
        res_c.append(chast[key] / max(1, sum(chast.values())))
    for key in sorted(judge):
        res_j.append(judge[key] / max(1, sum(judge.values())))

    return [res_v, res_s, res_c, res_j]

## Загрузка тренировочного и тестового датасета

In [229]:
train_df = pd.read_csv('../datasets/russian_classics/train_4_5_600.csv') 
print(train_df.shape)

test_df = pd.read_csv('../datasets/russian_classics/test_4_40_600.csv') 
print(test_df.shape)
test_df.head()

(20, 4)
(160, 4)


Unnamed: 0.1,Unnamed: 0,Author Name,Author,Content
0,0,Андреев,0,"﻿I\n\nЯ и другой прокаженный, мы осторожно по..."
1,1,Андреев,0,﻿I\n\nВ учении Ницше Сергея Петровича больше ...
2,2,Андреев,0,"﻿Над бесконечной снежною равниною, тяжело взм..."
3,3,Андреев,0,﻿Андрей Николаевич снял с подоконника горшок ...
4,4,Андреев,0,﻿I\n\nПомощник присяжного поверенного Толпенн...


## Векторизация текстов

In [230]:
from tqdm import tqdm

train_texts = train_df['Content']
test_texts = test_df['Content']

In [231]:
train_X, train_y = list(train_texts), [int(a) for a in train_df['Author']]
test_X, test_y = list(test_texts), [int(a) for a in test_df['Author']]

In [232]:
from tqdm import trange

all_texts = list(train_texts) + list(test_texts)
y = train_y + test_y

tfidfconverter = vectorizer
X = tfidfconverter.fit_transform(all_texts).toarray()
X1 = []
for i in trange(len(y)):
    X1.append([list(X[i])] + vectorize(all_texts[i]))

X = X1
X_train, X_test, y_train, y_test = X[:len(train_y)], X[len(train_y):], train_y, test_y

100%|██████████| 180/180 [00:25<00:00,  7.09it/s]


In [233]:
from scipy.spatial.distance import cosine, euclidean
from collections import Counter, defaultdict

## Проверка работы различных методов

In [234]:
def evaluate(X_train, y_train, X_test, y_test, method):
    labels = [int(i) for i in sorted(dict(Counter(y_test)).keys())]
    results = [[0 for _ in range(len(labels))] for _ in range(len(labels))]
    for i in trange(len(X_test)):
        results[int(y_test[i])][method(X_test[i], X_train, y_train)] += 1
    print('Accuracy: ', sum([results[i][i] for i in range(len(results))]) / sum([sum(results[i]) for i in range(len(results))]))
    return results 

In [235]:
def min_key(d: dict):
    mk = list(d.keys())[0]
    min_val = d[mk] 
    for key in d:
        if d[key] < min_val:
            min_val = d[key]
            mk = key 
    return mk

def predict_sum(v_test, X_train, y_train):
    labels = sorted(dict(Counter(y_test)).keys())
    res = defaultdict(list)
    for i in range(len(X_train)):
        v = list(X_train[i])
        d = sum([euclidean(v[j], v_test[j]) for j in range(len(v_test))])
        res[y_train[i]].append(d)
    for key in res:
        res[key] = sum(res[key]) / len(res[key])
    
    return min_key(res)

In [236]:
evaluate(X_train, y_train, X_test, y_test, predict_sum)

100%|██████████| 160/160 [00:01<00:00, 146.99it/s]Accuracy:  0.46875



[[20, 0, 19, 1], [4, 13, 20, 3], [0, 0, 39, 1], [11, 0, 26, 3]]

In [237]:
def predict_prod(v_test, X_train, y_train):
    labels = sorted(dict(Counter(y_test)).keys())
    res = defaultdict(list)
    for i in range(len(X_train)):
        v = list(X_train[i])
        d = euclidean(v[0], v_test[0]) * euclidean(v[1], v_test[1]) * euclidean(v[2], v_test[2]) * euclidean(v[3], v_test[3]) * euclidean(v[4], v_test[4])
        res[y_train[i]].append(d)
    for key in res:
        res[key] = sum(res[key]) / len(res[key])
    
    return min_key(res)

evaluate(X_train, y_train, X_test, y_test, predict_prod)

100%|██████████| 160/160 [00:01<00:00, 135.84it/s]Accuracy:  0.45625



[[18, 0, 18, 4], [4, 11, 9, 16], [2, 1, 35, 2], [10, 1, 20, 9]]

In [238]:
def predict_min(v_test, X_train, y_train):
    labels = sorted(dict(Counter(y_test)).keys())
    res = defaultdict(list)
    for i in range(len(X_train)):
        v = list(X_train[i])
        d1 = euclidean(v[0], v_test[0]) 
        d2 = euclidean(v[1], v_test[1]) 
        d3 = euclidean(v[2], v_test[2]) 
        d4 = euclidean(v[3], v_test[3])
        d5 = euclidean(v[4], v_test[4])
        res[y_train[i]].append(min(d1, d2, d3, d4, d5))
    for key in res:
        res[key] = sum(res[key]) / len(res[key])
    
    return min_key(res)

evaluate(X_train, y_train, X_test, y_test, predict_min)

100%|██████████| 160/160 [00:01<00:00, 108.63it/s]Accuracy:  0.3875



[[20, 3, 6, 11], [5, 9, 6, 20], [3, 10, 13, 14], [12, 4, 4, 20]]

In [239]:
def predict_max(v_test, X_train, y_train):
    labels = sorted(dict(Counter(y_test)).keys())
    res = defaultdict(list)
    for i in range(len(X_train)):
        v = list(X_train[i])
        d1 = euclidean(v[0], v_test[0]) 
        d2 = euclidean(v[1], v_test[1]) 
        d3 = euclidean(v[2], v_test[2]) 
        d4 = euclidean(v[3], v_test[3])
        d5 = euclidean(v[4], v_test[4])
        res[y_train[i]].append(max(d1, d2, d3, d4, d5))
    for key in res:
        res[key] = sum(res[key]) / len(res[key])
    
    return min_key(res)

evaluate(X_train, y_train, X_test, y_test, predict_max)

100%|██████████| 160/160 [00:01<00:00, 157.11it/s]Accuracy:  0.6375



[[37, 0, 2, 1], [7, 29, 2, 2], [16, 1, 23, 0], [16, 0, 11, 13]]

In [240]:
def predict_closed(v_test, X_train, y_train):
    labels = sorted(dict(Counter(y_test)).keys())
    res = defaultdict(list)
    for i in range(len(X_train)):
        v = list(X_train[i])
        d1 = euclidean(v[0], v_test[0]) 
        d2 = euclidean(v[1], v_test[1]) 
        d3 = euclidean(v[2], v_test[2]) 
        d4 = euclidean(v[3], v_test[3])
        d5 = euclidean(v[4], v_test[4])
        res[y_train[i]].append(d2 * d3 * d4 * d5)
    for key in res:
        res[key] = sum(res[key]) / len(res[key])
    
    return min_key(res)

evaluate(X_train, y_train, X_test, y_test, predict_closed)

100%|██████████| 160/160 [00:01<00:00, 108.12it/s]Accuracy:  0.43125



[[16, 0, 19, 5], [4, 11, 9, 16], [2, 2, 33, 3], [10, 1, 20, 9]]

In [241]:
def predict_syntax(v_test, X_train, y_train):
    labels = sorted(dict(Counter(y_test)).keys())
    res = defaultdict(list)
    for i in range(len(X_train)):
        v = list(X_train[i])
        d1 = euclidean(v[0], v_test[0]) 
        d2 = euclidean(v[1], v_test[1]) 
        d3 = euclidean(v[2], v_test[2]) 
        d4 = euclidean(v[3], v_test[3])
        d5 = euclidean(v[4], v_test[4])
        res[y_train[i]].append(d1)
    for key in res:
        res[key] = sum(res[key]) / len(res[key])
    
    return min_key(res)

evaluate(X_train, y_train, X_test, y_test, predict_max_closed)

100%|██████████| 160/160 [00:00<00:00, 163.55it/s]Accuracy:  0.65625



[[40, 0, 0, 0], [7, 27, 2, 4], [18, 1, 21, 0], [18, 0, 5, 17]]