In [1]:
import pandas as pd
import numpy as np
from progressbar import progressbar as pb

# Load the labelled question/answer dataset; index_col=False keeps the first
# CSV column as ordinary data instead of using it as the row index.
data = pd.read_csv('big_data_labels.csv', index_col=False)
# Number of questions in the dataset.
print(data.shape[0])
# Preview of the first rows (rendered as the cell output).
data.head(5)

13073


Unnamed: 0,0,1,2,3,4,5,label
0,Совещательный орган при императоре в начале XX в,Сенат,Государственный совет,Консилия министров,Верховный тайный совет,2,История
1,Министр внутренних дел с 1904 г либерал автор ...,Лорис Меликов,Святополк Мирский,Милюков,Витте,2,История
2,Какое из названных событий произошло 27 января...,подрыв флагманского корабля Петропавловск,высадка японских войск на Ляодунском полуострове,героический бой Варяга и Корейца,заключение Портсмутского мирного договора,3,История
3,Что из названного относится к результатам перв...,появление многопартийности,формирование конституционной монархии,ликвидация сословного строя,введение рабочего контроля за производством,1,История
4,Творчество поэтов Гумилева Ахматовой Мандельшт...,акмеизму,футуризму,импрессионизму,реализму,1,История


Логрегрессию обучаем следующим образом. Из вектора вопроса (q_vec) и векторов ответов (a1_vec, ..., a4_vec) создаем четыре вектора (q_vec | a1_vec), ..., (q_vec | a4_vec) с соответствующей бинарной разметкой, где | — конкатенация. Затем каждый из таких сэмплов с соответствующей разметкой подаем на вход логрегрессии.

In [2]:
import pickle

# NOTE(review): absolute, machine-specific location; the notebook only runs on a
# machine that has this directory. Consider making it configurable.
EMBEDDINGS_DIR = '/Users/user/Python/Diploma/Embeddings'


def load_embeddings(name, embeddings_dir=EMBEDDINGS_DIR):
    """Load one pickled embedding file and return it as a NumPy array.

    name - prefix of the file, e.g. 'elmo' for 'elmo_embed_big.pickle'
    embeddings_dir - directory that contains the pickle files

    return:

    numpy array built from the unpickled object

    SECURITY: pickle.load can execute arbitrary code while unpickling, so only
    files produced by a trusted source must be loaded here.
    """
    path = '{}/{}_embed_big.pickle'.format(embeddings_dir, name)
    with open(path, 'rb') as f:
        return np.array(pickle.load(f))


# One array per embedding model, loaded in the same order as before.
elmo_embs = load_embeddings('elmo')
fasttext_embs = load_embeddings('fasttext')
rusvec_embs = load_embeddings('rusvec')
bert_embs = load_embeddings('bert')

In [3]:
# Full-corpus embeddings keyed by model name; the key order defines the column
# order of the results tables built from this dict.
embeddings = {
    'RusVectores': rusvec_embs,
    'FastText': fasttext_embs,
    'ELMO': elmo_embs,
    'BERT': bert_embs,
}

In [4]:
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import accuracy_score, precision_score, f1_score

def get_samples(data, embeddings, train_part, random_seed):
    '''
    Build (question | answer) samples and split them into train and test parts.

    For every question the question vector (q_vec) is concatenated with each of
    its four answer vectors (a1_vec, ..., a4_vec), which gives four samples
    (q_vec | a1_vec), ..., (q_vec | a4_vec), where | is concatenation.
    A sample is labelled 1 when the number of its answer (1-4) equals the value
    stored in column 5 (by position) of `data`, and 0 otherwise.

    The split is made per question, so the four samples of one question always
    stay together, in answer order, inside the same part.

    data - source dataset; column 5 (by position) holds the number of the
           correct answer
    embeddings - array indexed as embeddings[i, 0] (question vector) and
                 embeddings[i, 1..4] (answer vectors) for question i
    train_part - fraction of questions that goes to the training part
    random_seed - seed that makes the split reproducible

    return:

    X_train, X_test - lists of 1-D float arrays (question | answer)
    y_train, y_test - lists of 0/1 labels aligned with X_train / X_test

    Train questions come in the randomly drawn order, test questions in
    ascending order of their row number.

    raises:

    ValueError - if `embeddings` is not 3-D with at least five vectors per
                 question, or if `data` and `embeddings` have a different
                 number of rows
    '''

    n_answers = 4

    if embeddings.ndim != 3 or embeddings.shape[1] < n_answers + 1:
        raise ValueError(
            'embeddings must have shape (n_questions, >=5, dim), got {}'.format(embeddings.shape)
        )

    n_questions, dim = embeddings.shape[0], embeddings.shape[2]

    if len(data) != n_questions:
        raise ValueError(
            'data has {} rows but embeddings has {}'.format(len(data), n_questions)
        )

    # Number of the correct answer for every question (same int() conversion as
    # a per-row lookup, but read from the column in one pass).
    correct = np.array([int(value) for value in data.iloc[:, 5]], dtype=int)

    # Row i*4 + (j-1) holds (q_vec_i | a_j_vec_i) for j = 1..4.
    questions = np.repeat(embeddings[:, 0, :], n_answers, axis=0)
    answers = embeddings[:, 1:n_answers + 1, :].reshape(n_questions * n_answers, dim)
    predata = np.hstack((questions, answers)).astype(np.float64)
    target = (np.arange(1, n_answers + 1) == correct[:, None]).astype(int).ravel()

    # A local RandomState seeded with `random_seed` draws the same permutation
    # as np.random.seed(random_seed) + np.random.choice, without reseeding the
    # global NumPy generator as a side effect.
    rng = np.random.RandomState(random_seed)
    all_inds = np.arange(n_questions)
    train_size = int(len(all_inds) * train_part)
    train_inds = rng.choice(all_inds, size=train_size, replace=False)
    # Sorted complement: deterministic regardless of set iteration order.
    test_inds = np.setdiff1d(all_inds, train_inds)

    offsets = np.arange(n_answers)
    train_rows = (train_inds[:, None] * n_answers + offsets).ravel()
    test_rows = (test_inds[:, None] * n_answers + offsets).ravel()

    X_train = list(predata[train_rows])
    X_test = list(predata[test_rows])
    y_train = target[train_rows].tolist()
    y_test = target[test_rows].tolist()

    return X_train, X_test, y_train, y_test


def approach_logreg(data, embeddings, train_part=0.7, random_seed=4):
    '''
    Train a logistic regression on (question | answer) samples and score it
    as a four-way answer-selection task.

    For every test question the answer whose sample receives the highest
    probability of class 1 is taken as the prediction and compared with the
    answer labelled 1.

    data - source dataset with the labelling
    embeddings - vector representation of the source dataset
    train_part - fraction of the training part
    random_seed - seed for reproducibility of the experiment

    return:

    accuracy, precision_macro, precision_micro, f1_macro, f1_micro -
    metrics computed over the predicted answer position (0-3) per test question
    '''

    X_train, X_test, y_train, y_test = get_samples(data, embeddings, train_part, random_seed)

    model = LR()
    model.fit(X_train, y_train)

    # Second column of predict_proba: probability that the sample is labelled 1.
    positive_proba = model.predict_proba(X_test)[:, 1]

    # Samples come in consecutive groups of four (one group per question);
    # any incomplete trailing group is ignored.
    n_questions = len(positive_proba) // 4
    used = n_questions * 4

    # Position of the most probable answer / of the labelled answer per question.
    predicted_answer = positive_proba[:used].reshape(n_questions, 4).argmax(axis=1)
    true_answer = np.asarray(y_test[:used]).reshape(n_questions, 4).argmax(axis=1)

    accuracy = accuracy_score(true_answer, predicted_answer)
    precision_macro = precision_score(true_answer, predicted_answer, average='macro')
    precision_micro = precision_score(true_answer, predicted_answer, average='micro')
    f1_macro = f1_score(true_answer, predicted_answer, average='macro')
    f1_micro = f1_score(true_answer, predicted_answer, average='micro')

    return accuracy, precision_macro, precision_micro, f1_macro, f1_micro

In [26]:
# Evaluate every embedding model on the whole dataset.
# NOTE(review): this relies on `embeddings` being the full-corpus dict; the same
# name is rebound to per-subject subsets in later cells, so running this cell
# after them would pair `data` with embeddings of a different length.
results_dict = {
    key: approach_logreg(data, embed)
    for key, embed in embeddings.items()
}

In [27]:
# One column per embedding model, one row per metric, in the order in which
# approach_logreg returns the metrics.
results = pd.DataFrame(
    data=results_dict,
    index=['Accuracy', 'Precision Macro', 'Precision Micro', 'F1 Macro', 'F1 Micro'],
    columns=list(embeddings),
)
results

Unnamed: 0,RusVectores,FastText,ELMO,BERT
Accuracy,0.287608,0.303672,0.308771,0.318205
Precision Macro,0.288244,0.301829,0.307374,0.317983
Precision Micro,0.287608,0.303672,0.308771,0.318205
F1 Macro,0.287345,0.30164,0.307148,0.317563
F1 Micro,0.287608,0.303672,0.308771,0.318205


## История

In [5]:
# Positional row numbers of the questions whose value in column 6 (by position)
# is 'История'. The comparison is done on the whole column at once instead of
# one scalar .iloc lookup per row; the result is still a list of Python ints.
hist_inds = np.flatnonzero((data.iloc[:, 6] == 'История').values).tolist()
len(hist_inds)

2498

In [6]:
# Restrict every embedding array to the rows listed in hist_inds.
elmo_embs_hist, bert_embs_hist, fasttext_embs_hist, rusvec_embs_hist = (
    full_embs[hist_inds]
    for full_embs in (elmo_embs, bert_embs, fasttext_embs, rusvec_embs)
)

In [7]:
# Per-model embeddings restricted to the 'История' subset.
# Note: this rebinds the notebook-wide name `embeddings`.
embeddings = {
    'RusVectores': rusvec_embs_hist,
    'FastText': fasttext_embs_hist,
    'ELMO': elmo_embs_hist,
    'BERT': bert_embs_hist,
}
# Rows of the dataset with the same label, in their original order.
data2 = data.loc[data['label'] == 'История']

In [8]:
# Evaluate every embedding model on the current subject subset (data2),
# with a progress bar over the models.
results_dict = {
    key: approach_logreg(data2, embed)
    for key, embed in pb(embeddings.items())
}

100% (4 of 4) |##########################| Elapsed Time: 0:00:34 Time:  0:00:34


In [9]:
# One column per embedding model, one row per metric, in the order in which
# approach_logreg returns the metrics.
results = pd.DataFrame(
    data=results_dict,
    index=['Accuracy', 'Precision Macro', 'Precision Micro', 'F1 Macro', 'F1 Micro'],
    columns=list(embeddings),
)
results

Unnamed: 0,RusVectores,FastText,ELMO,BERT
Accuracy,0.28,0.296,0.318667,0.285333
Precision Macro,0.283367,0.294129,0.318767,0.284942
Precision Micro,0.28,0.296,0.318667,0.285333
F1 Macro,0.28017,0.293863,0.318622,0.28448
F1 Micro,0.28,0.296,0.318667,0.285333


## Медицина

In [10]:
# Positional row numbers of the questions whose value in column 6 (by position)
# is 'Медицина'. The comparison is done on the whole column at once instead of
# one scalar .iloc lookup per row; the result is still a list of Python ints.
med_inds = np.flatnonzero((data.iloc[:, 6] == 'Медицина').values).tolist()
len(med_inds)

4013

In [11]:
# Restrict every embedding array to the rows listed in med_inds.
elmo_embs_med, bert_embs_med, fasttext_embs_med, rusvec_embs_med = (
    full_embs[med_inds]
    for full_embs in (elmo_embs, bert_embs, fasttext_embs, rusvec_embs)
)

In [12]:
# Per-model embeddings restricted to the 'Медицина' subset.
# Note: this rebinds the notebook-wide name `embeddings`.
embeddings = {
    'RusVectores': rusvec_embs_med,
    'FastText': fasttext_embs_med,
    'ELMO': elmo_embs_med,
    'BERT': bert_embs_med,
}
# Rows of the dataset with the same label, in their original order.
data2 = data.loc[data['label'] == 'Медицина']

In [14]:
# Evaluate every embedding model on the current subject subset (data2),
# with a progress bar over the models.
results_dict = {
    key: approach_logreg(data2, embed)
    for key, embed in pb(embeddings.items())
}

100% (4 of 4) |##########################| Elapsed Time: 0:01:00 Time:  0:01:00


In [15]:
# One column per embedding model, one row per metric, in the order in which
# approach_logreg returns the metrics.
results = pd.DataFrame(
    data=results_dict,
    index=['Accuracy', 'Precision Macro', 'Precision Micro', 'F1 Macro', 'F1 Micro'],
    columns=list(embeddings),
)
results

Unnamed: 0,RusVectores,FastText,ELMO,BERT
Accuracy,0.299834,0.343023,0.344684,0.335548
Precision Macro,0.300021,0.333593,0.338423,0.323557
Precision Micro,0.299834,0.343023,0.344684,0.335548
F1 Macro,0.29452,0.333505,0.33583,0.321993
F1 Micro,0.299834,0.343023,0.344684,0.335548


## Биология

In [16]:
# Positional row numbers of the questions whose value in column 6 (by position)
# is 'Биология'. The comparison is done on the whole column at once instead of
# one scalar .iloc lookup per row; the result is still a list of Python ints.
bio_inds = np.flatnonzero((data.iloc[:, 6] == 'Биология').values).tolist()
len(bio_inds)

2184

In [17]:
# Restrict every embedding array to the rows listed in bio_inds.
elmo_embs_bio, bert_embs_bio, fasttext_embs_bio, rusvec_embs_bio = (
    full_embs[bio_inds]
    for full_embs in (elmo_embs, bert_embs, fasttext_embs, rusvec_embs)
)

In [18]:
# Per-model embeddings restricted to the 'Биология' subset.
# Note: this rebinds the notebook-wide name `embeddings`.
embeddings = {
    'RusVectores': rusvec_embs_bio,
    'FastText': fasttext_embs_bio,
    'ELMO': elmo_embs_bio,
    'BERT': bert_embs_bio,
}
# Rows of the dataset with the same label, in their original order.
data2 = data.loc[data['label'] == 'Биология']

In [19]:
# Evaluate every embedding model on the current subject subset (data2),
# with a progress bar over the models.
results_dict = {
    key: approach_logreg(data2, embed)
    for key, embed in pb(embeddings.items())
}

100% (4 of 4) |##########################| Elapsed Time: 0:00:26 Time:  0:00:26


In [20]:
# One column per embedding model, one row per metric, in the order in which
# approach_logreg returns the metrics.
results = pd.DataFrame(
    data=results_dict,
    index=['Accuracy', 'Precision Macro', 'Precision Micro', 'F1 Macro', 'F1 Micro'],
    columns=list(embeddings),
)
results

Unnamed: 0,RusVectores,FastText,ELMO,BERT
Accuracy,0.309451,0.338415,0.280488,0.272866
Precision Macro,0.310827,0.337511,0.280388,0.272495
Precision Micro,0.309451,0.338415,0.280488,0.272866
F1 Macro,0.308527,0.337219,0.280434,0.273114
F1 Micro,0.309451,0.338415,0.280488,0.272866
