# Citation-based summarization

The `scisumm-corpus` directory contains the corpus of papers published in the context of the [CL-SciSumm 2017](http://wing.comp.nus.edu.sg/~cl-scisumm2017/) challenge.

## Терминология

Статьи, которые необходимо обобщить, называются **reference papers**. Каждая reference paper связана с набором **citing papers**. Предложения citing paper, в которых цитируется соответствующая reference paper, называются **citances**. Соответствующие им предложения reference paper называются **provenances**. **Аннотация** — это отношение между набором citances одной citing paper и набором provenances соответствующей reference paper.


In [110]:
import re
from os import listdir
from os.path import join

import numpy as np
from sklearn import neural_network
from sklearn.metrics import confusion_matrix

import scisummgen

import time
from sklearn import ensemble
from sklearn import svm
import pandas as pd
import xgboost as xgb

from gensim.models.word2vec import Word2Vec
from multiprocessing import cpu_count
import gensim.downloader as api
import re
import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.corpus import wordnet
stopWords = set(stopwords.words('english'))
from rouge import Rouge 
rouge = Rouge()

from difflib import SequenceMatcher
from itertools import product 

from catboost import CatBoostClassifier

[nltk_data] Downloading package stopwords to /home/andrey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [111]:
# Corpus locations. create_features() and the summary writers list each of
# these directories and load every entry as one scisummgen.Paper.
training_path = 'scisumm-corpus/Training-Set-2017'
test_path = 'scisumm-corpus/Test-Set-2017'

In [3]:
%time
fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')


CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.25 µs


In [4]:
# Vocabulary of the loaded model as a set, built once so that the per-token
# membership tests in avg_sentence_vector() are hash lookups.
index2word_fasttext = set(fasttext_model300.index2word)

In [5]:
def tokenize(string):
    """Split *string* into lower-cased word tokens, dropping stop words.

    Args:
        string: Text to tokenize.

    Returns:
        list[str]: The word tokens in their original order, without any
        token that is a member of the module-level ``stopWords`` set.
    """
    # TODO: it may be worth trying lemmatization of the tokens as well.
    # Raw string literal: a plain '\w' is an invalid escape sequence.
    return [token for token in re.findall(r'\w+', string.lower())
            if token not in stopWords]

In [6]:
def avg_sentence_vector(words, model, num_features, index2word_set):
    """Average the vectors of the tokens in *words* that are in the vocabulary.

    Args:
        words: Iterable of tokens.
        model: Object indexed as ``model[word]`` to obtain a word's vector.
        num_features: Length of the vectors; sizes the accumulator.
        index2word_set: Container of known words; other tokens are skipped.

    Returns:
        The element-wise mean of the matched vectors, or a float32 zero
        vector of length *num_features* when no token is known.
    """
    total = np.zeros((num_features,), dtype="float32")
    known_words = [word for word in words if word in index2word_set]

    # Accumulate one vector at a time, in input order.
    for word in known_words:
        total = np.add(total, model[word])

    if known_words:
        total = np.divide(total, len(known_words))
    return total

In [7]:
def change_1_most_similar(words, model, index2word_set=None):
    """Replace every known word by its top ``model.most_similar`` neighbour.

    Args:
        words: Iterable of tokens.
        model: Object exposing ``most_similar(word)``; the first element of
            the first returned item is taken as the replacement word.
        index2word_set: Container of words that may be replaced; tokens not
            in it are dropped. When omitted, the module-level
            ``index2word_fasttext`` set is used. It is resolved at call time,
            so no second copy of the vocabulary is built when the function
            is defined.

    Returns:
        str: The replacement words, each followed by one space (so the
        result ends with a space), or ``""`` if no token is known.
    """
    if index2word_set is None:
        index2word_set = index2word_fasttext
    return "".join(
        model.most_similar(word)[0][0] + " "
        for word in words
        if word in index2word_set
    )

In [14]:
hypothesis = "In such cases, neither global features (Chieu and Ng, 2002) nor aggregated contexts (Chieu and Ng, 2003) can help."

reference = "Global features are extracted from other occurrences of the same token in the whole document."

def calc_Rouge(hypothesis,reference):
    """Score *hypothesis* against *reference* with the ``rouge`` package.

    Returns:
        tuple: The ``'f'`` values reported under ``'rouge-1'``, ``'rouge-2'``
        and ``'rouge-l'`` (in that order) for the first scored pair.
    """
    first_pair = Rouge().get_scores(hypothesis,reference)[0]
    return tuple(
        first_pair[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l')
    )
calc_Rouge(hypothesis,reference)

(0.06666666168888927, 0.0, 0.06608187134441518)

In [112]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0 if that raises ZeroDivisionError."""
    try:
        return numerator / denominator
    except ZeroDivisionError:
        return 0


def create_features(path,with_1_most=False):
    """Build one feature row and one label per (citance, reference sentence) pair.

    Every entry of ``listdir(path)`` is loaded as a ``scisummgen.Paper``. For
    each citance of the paper's annotation and each sentence of its reference
    document, a row is appended to ``X`` and a label to ``y``. The iteration
    order (directory, citance, sentence) is the same one the summary writers
    use to consume the predicted probabilities.

    Row layout, in order:
        tfidf, lsi, lda, hdp, bigrams  -- values returned by the matching
            ``scisummgen.Similarity`` methods for the sentence/citance texts;
        sid_pos, ssid_pos, section_pos -- ``sid/sid_max``, ``ssid/ssid_max``
            and ``section/section_max`` of the sentence (0 on division by zero);
        seq_match    -- ``difflib.SequenceMatcher`` ratio of the two texts;
        wmdistance   -- ``fasttext_model300.wmdistance`` of the token lists
            (0 when infinite or when the sentence has at most 2 tokens);
        w2v_cos_sim  -- cosine similarity of the averaged token vectors
            (0 when the sentence has at most 2 tokens);
        r1_f, r2_f, rl_f -- the values returned by ``calc_Rouge``.

    Args:
        path: Directory holding one sub-directory per reference paper.
        with_1_most: When True, a 15th feature is appended to every row: the
            cosine similarity of the averaged vectors after each token has
            been replaced by ``change_1_most_similar``. Earlier this flag
            appended a second, shorter row per pair, which left ``X`` twice
            as long as ``y``; rows and labels now always stay aligned.

    Returns:
        tuple[list, list]: ``(X, y)`` where ``y[i]`` is 1 if the sentence id
        is listed in ``citance['RO']`` and -1 otherwise.
    """
    X = []
    y = []

    # For all the papers
    for num,directory in enumerate(listdir(path)):
        print('num = ',num,' directory',directory)
        paper = scisummgen.Paper(join(path, directory))
        similarity = scisummgen.Similarity(paper)

        # For all the citances
        for citance in paper.annotation.citances:
            # For all the sentences of the reference paper
            for sentence_sid, sentence in paper.reference.sentences.items():
                citance_text = paper.get_citance_text(citance)
                sentence_text = sentence['text']

                tfidf = similarity.tfidf_similarity(sentence_text, citance_text)
                lsi = similarity.lsi_similarity(sentence_text, citance_text)
                lda = similarity.lda_similarity(sentence_text, citance_text)
                hdp = similarity.hdp_similarity(sentence_text, citance_text)
                bigrams = similarity.count_bigrams(sentence_text, citance_text)

                # Relative positions of the sentence, its in-section index
                # and its section.
                sid_pos = _safe_ratio(sentence['sid'], sentence['sid_max'])
                ssid_pos = _safe_ratio(sentence['ssid'], sentence['ssid_max'])
                section_pos = _safe_ratio(sentence['section'], sentence['section_max'])

                # ROUGE values and character-level sequence similarity
                r1_f, r2_f, rl_f = calc_Rouge(sentence_text,citance_text)
                seq_match = SequenceMatcher(None, sentence_text, citance_text).ratio()

                # Word-embedding features
                sentence_tok = tokenize(sentence_text)
                citance_tok = tokenize(citance_text)
                # Very short sentences get neutral embedding features.
                too_short = len(sentence_tok) <= 2

                if too_short:
                    wmdistance = 0
                    w2v_cos_sim = 0
                else:
                    wmdistance = fasttext_model300.wmdistance(sentence_tok,citance_tok)
                    if np.isinf(wmdistance):
                        # An infinite distance is replaced by 0 and the
                        # offending pair is printed for inspection.
                        wmdistance = 0
                        print(type(sentence_tok))
                        print(citance_text)
                        print('_'*50)
                    sentence_tok_avg = avg_sentence_vector(
                        sentence_tok, model=fasttext_model300,
                        num_features=300, index2word_set=index2word_fasttext)
                    citance_tok_avg = avg_sentence_vector(
                        citance_tok, model=fasttext_model300,
                        num_features=300, index2word_set=index2word_fasttext)

                    w2v_cos_sim = fasttext_model300.cosine_similarities(
                        sentence_tok_avg, [citance_tok_avg])[0]

                features = [tfidf, lsi, lda, hdp, bigrams, sid_pos, ssid_pos,
                            section_pos, seq_match, wmdistance, w2v_cos_sim,
                            r1_f, r2_f, rl_f]

                if with_1_most:
                    if too_short:
                        w2v_1_most_cos_sim = 0
                    else:
                        # change_1_most_similar() returns one space-separated
                        # string; split it back into words so that words, not
                        # single characters, are averaged.
                        sentence_tok_sim = change_1_most_similar(
                            sentence_tok, model=fasttext_model300,
                            index2word_set=index2word_fasttext).split()
                        citance_tok_sim = change_1_most_similar(
                            citance_tok, model=fasttext_model300,
                            index2word_set=index2word_fasttext).split()

                        # Reuse the prebuilt vocabulary set instead of
                        # rebuilding it from the model for every pair.
                        sentence_avg_sim = avg_sentence_vector(
                            sentence_tok_sim, model=fasttext_model300,
                            num_features=300, index2word_set=index2word_fasttext)
                        citance_avg_sim = avg_sentence_vector(
                            citance_tok_sim, model=fasttext_model300,
                            num_features=300, index2word_set=index2word_fasttext)
                        w2v_1_most_cos_sim = fasttext_model300.cosine_similarities(
                            sentence_avg_sim, [citance_avg_sim])[0]
                    features.append(w2v_1_most_cos_sim)

                X.append(features)

                # Check if this sentence is also a provenance
                y.append(1 if sentence_sid in citance['RO'] else -1)

    return X, y

In [4]:
len([1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1,1, 1, 1]),len([0.09661177, 0.10026853, 0.07010722, 0.01624012, 0.01915178,
0.09136614, 0.08223378, 0.03783819, 0.09495141, 0.09158937,
       0.09194491, 0.07627548, 0.04469802, 0.08672328])

(14, 14)

In [None]:
[tfidf, lsi, lda, hdp, bigrams, 
 sid_pos, ssid_pos, section_pos, seq_match, wmdistance,
 w2v_cos_sim,r1_f, r2_f, rl_f]

In [1]:
[0.09661177, 0.10026853, 0.07010722, 0.01624012, 0.01915178,
0.09136614, 0.08223378, 0.03783819, 0.09495141, 0.09158937,
       0.09194491, 0.07627548, 0.04469802, 0.08672328]

[0.09661177,
 0.10026853,
 0.07010722,
 0.01624012,
 0.01915178,
 0.09136614,
 0.08223378,
 0.03783819,
 0.09495141,
 0.09158937,
 0.09194491,
 0.07627548,
 0.04469802,
 0.08672328]

In [113]:
def create_summaries(path, y_probability):
    """Write one summary of at most 250 words per paper found in *path*.

    For every paper, each reference sentence receives the sum, over all
    citances, of ``y_probability[row][1]`` (the probability of being a
    provenance). Rows are consumed in the order directory, citance, sentence,
    so *y_probability* must have been predicted for features generated in
    that same order. Sentences are then taken by decreasing score until the
    250-word budget would be exceeded, re-ordered by ``sid`` and written to
    ``summary/<directory>.system.txt``.

    Args:
        path: Directory holding one sub-directory per reference paper.
        y_probability: Indexable sequence of rows; element ``[1]`` of each
            row is used as the score contribution.
    """
    from os import makedirs

    # Make sure the output directory exists instead of failing on open().
    makedirs('summary', exist_ok=True)
    y_index = 0

    # For all the test papers
    for directory in listdir(path):
        print('Creating summary for paper', directory)
        paper = scisummgen.Paper(join(path, directory))
        # One score slot per sentence id, 0..sid_max.
        sentence_scores = np.zeros(next(iter(paper.reference.sentences.values()))['sid_max'] + 1)

        # For all the citances
        for citance in paper.annotation.citances:
            # For all the sentences of the reference paper
            for sentence_sid, sentence in paper.reference.sentences.items():
                # The probability of being a provenance
                sentence_scores[sentence_sid] += y_probability[y_index][1]
                y_index += 1

        # Find the sentences with the highest scores
        sentence_sid_sorted = sentence_scores.argsort()[::-1]
        summary_sentences = []
        tot_words = 0

        for sid in sentence_sid_sorted:
            sentence = paper.reference.sentences[sid]
            # Avoid considering the title
            if sentence['sid'] == 0:
                continue
            # Count the number of words
            tot_words += len(re.findall(r'\w+', sentence['text'].lower()))
            if tot_words > 250:
                # The first sentence that overflows the budget ends selection.
                break
            summary_sentences.append(sentence)

        # Sort sentences by sid
        summary_sentences.sort(key=lambda x: x['sid'])

        # Create the summary
        summary = ' '.join(sentence['text'] for sentence in summary_sentences).strip(' ')

        with open(join('summary', directory + '.system.txt'), 'w', encoding='utf-8') as file:
            file.write(summary)

In [186]:
def create_summaries_top_k(path, y_probability,k=3):
    """Write *k* successive summaries of at most 250 words per paper in *path*.

    Sentence scores are accumulated as the sum, over all citances, of
    ``y_probability[row][1]`` (the probability of being a provenance); rows
    are consumed in the order directory, citance, sentence. Summary 1 takes
    the best-scored sentences until the 250-word budget would be exceeded;
    each later summary repeats the selection on the sentence ids not yet
    visited. Summary ``num`` is written to
    ``summary/<directory>-<num>.system.txt``.

    Args:
        path: Directory holding one sub-directory per reference paper.
        y_probability: Indexable sequence of rows; element ``[1]`` of each
            row is used as the score contribution.
        k: Number of summaries to produce per paper.
    """
    from os import makedirs

    # Make sure the output directory exists instead of failing on open().
    makedirs('summary', exist_ok=True)
    y_index = 0

    # For all the test papers
    for directory in listdir(path):
        print('Creating summary for paper', directory)
        paper = scisummgen.Paper(join(path, directory))
        # One score slot per sentence id, 0..sid_max.
        sentence_scores = np.zeros(next(iter(paper.reference.sentences.values()))['sid_max'] + 1)

        # For all the citances
        for citance in paper.annotation.citances:
            # For all the sentences of the reference paper
            for sentence_sid, sentence in paper.reference.sentences.items():
                # The probability of being a provenance
                sentence_scores[sentence_sid] += y_probability[y_index][1]
                y_index += 1

        # Find the sentences with the highest scores
        sentence_sid_sorted = sentence_scores.argsort()[::-1]

        for num in range(1,k+1):
            summary_sentences = []
            tot_words = 0
            # Every id visited in this round, including the skipped title and
            # the sentence that overflowed the budget, is excluded from the
            # following rounds.
            del_sids = []
            for sid in sentence_sid_sorted:
                del_sids.append(sid)
                sentence = paper.reference.sentences[sid]
                # Avoid considering the title
                if sentence['sid'] == 0:
                    continue
                # Count the number of words
                tot_words += len(re.findall(r'\w+', sentence['text'].lower()))
                if tot_words > 250:
                    break
                summary_sentences.append(sentence)

            sentence_sid_sorted = sentence_sid_sorted[~np.isin(sentence_sid_sorted,del_sids)]
            # Sort sentences by sid
            summary_sentences.sort(key=lambda x: x['sid'])

            # Create the summary
            summary = ' '.join(sentence['text'] for sentence in summary_sentences).strip(' ')

            with open(join('summary', f'{directory}-{num}.system.txt'), 'w', encoding='utf-8') as file:
                file.write(summary)


### Подготовка текста

Каждая reference paper представлена классом ***Paper***. Этот класс *содержит reference paper, соответствующие citing papers и соответствующие аннотации*. Документы представлены классом ***Document***, который *содержит фактические предложения*. *Список аннотаций* представлен классом ***Annotation***.

Предложения:
- переводятся в нижний регистр
- и затем токенизируются с помощью регулярного выражения `\w+`. Токены, перечисленные в файле `stopwords.txt`, игнорируются.

### Предсказание provenance 

For each reference paper, for each citance, and for each sentence of the reference paper, it is possible to compute a set of features. These features are used to train an MLP classifier with the goal of predicting if a sentence of the reference paper is a provenance or not. The features considered are the following:

* **tfidf**: the TF-IDF similarity between the two sentences as computed by `gensim`;
* **lsi**: the LSI similarity between the two sentences, as computed by `gensim` considering 50 topics;
* **bigrams**: the number of common bigrams between the two sentences;
* **sid_pos**: the position of the sentence in the reference paper;
* **ssid_pos**: the position of the sentence in the local section of the reference paper;
* **section_pos**: the position of the local section in the reference paper.

The corpus of documents used for computing the TF-IDF similarity and the LSI similarity includes all the sentences of the reference paper and all the sentences of all its citing papers.

The classifier is trained to predict the probability for a sentence of being a provenance given a particular citance. Please note that a citance, in practice, may include several sentences of the citing paper. These probabilities are predicted for each pair composed of a sentence of the reference paper and a citance of all its citing papers.

### Прогноз provenance

Для каждой reference paper, для каждого citance и для каждого предложения reference paper можно вычислить набор признаков. Эти признаки используются для обучения классификаторов с целью прогнозирования, является ли предложение reference paper provenance или нет. Рассматриваются следующие признаки:

* **tfidf**: сходство TF-IDF между двумя предложениями, вычисленное с помощью `gensim`;
* **lsi**: сходство LSI между двумя предложениями, вычисленное `gensim` с учетом 50 тем;
* **биграммы**: количество общих биграмм между двумя предложениями;
* **sid_pos**: позиция предложения в справочном документе;
* **ssid_pos**: позиция предложения в локальном разделе справочного документа;
* **section_pos**: положение локального раздела в справочном документе.

Корпус документов, используемых для вычисления подобия TF-IDF и подобия LSI, включает все предложения справочного документа и все предложения всех цитирующих документов.

Классификатор обучен предсказывать вероятность того, что предложение является provenance для конкретного citance. Обратите внимание, что на практике citance может включать несколько предложений citing paper. Эти вероятности предсказываются для каждой пары, состоящей из предложения reference paper и citance одной из её citing papers.

In [114]:
# Build the training feature rows and labels, and report the wall-clock time
# (in seconds) that the feature extraction took.
start_time = time.time()
print('Creating features for the training set')
X_training, y_training = create_features(training_path)
print(f"Time of creating: {time.time()-start_time}")

Creating features for the training set
num =  0  directory W08-2222
num =  1  directory W03-0410
num =  2  directory W04-0213
num =  3  directory E03-1020
num =  4  directory N01-1011
num =  5  directory P98-1046
num =  6  directory I05-5011
num =  7  directory P05-1004
num =  8  directory P06-2124
num =  9  directory N04-1038
num =  10  directory D10-1083
num =  11  directory J98-2005
num =  12  directory J00-3003
num =  13  directory X96-1048
num =  14  directory J96-3004
num =  15  directory C02-1025
num =  16  directory P98-1081
num =  17  directory C00-2123
num =  18  directory C04-1089
num =  19  directory H89-2014
num =  20  directory C08-1098
num =  21  directory H05-1115
num =  22  directory C94-2154
num =  23  directory E09-2008
num =  24  directory P98-2143
num =  25  directory N06-2049
num =  26  directory C90-2039
num =  27  directory C10-1045
num =  28  directory P05-1053
num =  29  directory W95-0104
Time of creating: 1784.7404243946075


In [115]:
def lists2txt(dataset,name='file'):
    """Write *dataset* to ``<name>.txt`` (UTF-8), one record per line.

    An iterable record is written as its items converted with ``str`` and
    joined by ``|``. A record that cannot be iterated (for example a bare
    numeric label) is written as ``str(record)``.

    Args:
        dataset: Iterable of records.
        name: Output path without the ``.txt`` extension.
    """
    with open(f'{name}.txt', 'w', encoding='utf-8') as file:
        for res in dataset:
            try:
                line = "|".join(map(str, res))
            except TypeError:
                # Not iterable: fall back to the value's own string form.
                line = str(res)
            file.write(line + '\n')

In [116]:
lists2txt(X_training,name='X_training_ALL')
# lists2txt(y_training,name='y_training_new')

In [117]:
len(X_training),len(y_training)

(147885, 147885)

In [118]:
X_training[0]

[0.09107501196704412,
 0.4870091174665887,
 0.4517682313836073,
 0.0,
 0,
 0.006097560975609756,
 0.08333333333333333,
 0.2,
 0.3178294573643411,
 0.8542434272016185,
 0.8698474,
 0.10810810328707107,
 0.0,
 0.10111958924554615]

In [38]:
X_training[0]

[0.09107501196704412,
 0.4565450292731724,
 0,
 0.006097560975609756,
 0.08333333333333333,
 0.2,
 0.8542434272016185,
 0.8698474]

In [39]:
np.isnan(X_training[22428][-1])

False

In [40]:
np.isinf(X_training[22428][-2])

False

In [41]:
X_training[22674]

[0.0,
 0.0,
 0,
 0.7439024390243902,
 0.48695652173913045,
 0.8333333333333334,
 0,
 0]

In [42]:
X_training[22674]

[0.0,
 0.0,
 0,
 0.7439024390243902,
 0.48695652173913045,
 0.8333333333333334,
 0,
 0]

In [43]:
np.where(np.isnan(np.array(X_training)[:,-1]))

(array([], dtype=int64),)

In [6]:
len(X_training),len(y_training)

(147885, 147885)

In [7]:
X_training[0]

[0.09107501196704412,
 0.4670766223405552,
 0,
 0.006097560975609756,
 0.08333333333333333,
 0.2]

In [8]:
set(y_training)

{-1, 1}

In [119]:
# Build the test feature rows and labels, and report the wall-clock time
# (in seconds) that the feature extraction took.
start_time = time.time()
print('Creating features for the test set')
X_test, y_test = create_features(test_path)
print(f"Time of creating: {time.time()-start_time}")

Creating features for the test set
num =  0  directory W09-0621
num =  1  directory D10-1058
num =  2  directory C98-1097
num =  3  directory W11-0815
num =  4  directory D09-1023
num =  5  directory P00-1025
num =  6  directory N09-1001
num =  7  directory W06-3909
num =  8  directory P07-1040
num =  9  directory N09-1025
Time of creating: 314.6532452106476


In [120]:
lists2txt(X_test,name='X_test_ALL')
# lists2txt(y_training,name='y_training_new')

In [121]:
len(X_test),len(y_test)

(32410, 32410)

In [122]:
X_test[0]

[0.0,
 0.18963097657912117,
 0.0,
 0.0,
 0,
 0.010101010101010102,
 0.0625,
 0.25,
 0.291970802919708,
 1.117975920350416,
 0.65321326,
 0.0,
 0.0,
 0.0]

In [103]:
X_test[0]

[0.0,
 0.16843295939527933,
 0,
 0.010101010101010102,
 0.0625,
 0.25,
 1.117975920350416,
 0.65321326]

In [104]:
set(y_test)

{-1, 1}

#### Look at the class balance

In [13]:
y_tr = pd.Series(y_training)

In [14]:
y_tr.value_counts()

-1    146962
 1       923
dtype: int64

In [15]:
923/len(y_training)

0.006241336173377962

In [16]:
y_ts = pd.Series(y_test)
y_ts.value_counts()

-1    32179
 1      231
dtype: int64

In [17]:
231/len(y_test)

0.007127429805615551

In [18]:
# Number of positive (+1) and negative (-1) labels in the training set.
sum_wpos = sum(1 for label in y_training if label == 1.0)
sum_wneg = sum(1 for label in y_training if label == -1.0)

In [19]:
sum_wneg

146962

## Train

### RF

In [123]:
# Random forest baseline: fit on the training features, then predict both the
# test set and the training set (the latter feeds the train confusion matrix).
print('Training the classifier')
start_time = time.time()
# 1000 trees, built in parallel (n_jobs=-1). No random_state is set, so the
# fitted forest may differ between runs.
clf = ensemble.RandomForestClassifier(n_estimators=1000,verbose=True,n_jobs=-1)
clf = clf.fit(X_training, y_training)
print(f"Time of fit: {time.time()-start_time}")
start_time = time.time()
y_prediction = clf.predict(X_test)
y_pred_train = clf.predict(X_training)
# The reported prediction time covers both predict() calls above.
print(f"Time of prediction: {time.time()-start_time}")

Training the classifier


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   25.2s
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:   45.7s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   58.5s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s


Time of fit: 58.94253444671631


[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.3s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    0.5s
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:    0.6s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.4s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.9s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    1.7s


Time of prediction: 3.2429749965667725


[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:    2.2s finished


In [125]:
clf.feature_importances_

array([0.09661177, 0.10026853, 0.07010722, 0.01624012, 0.01915178,
       0.09136614, 0.08223378, 0.03783819, 0.09495141, 0.09158937,
       0.09194491, 0.07627548, 0.04469802, 0.08672328])

In [24]:
clf.feature_importances_

array([0.10567189, 0.11066879, 0.02007297, 0.09956031, 0.09023367,
       0.03927619, 0.10588974, 0.10133567, 0.10174596, 0.08206379,
       0.047983  , 0.09549802])

In [46]:
clf.feature_importances_

array([0.1595748 , 0.17167978, 0.02948181, 0.13758435, 0.12518599,
       0.04893513, 0.16448418, 0.16307396])

In [126]:
conf_mat = confusion_matrix(y_training, y_pred_train)
print('Confusion matrix on train set')
print(conf_mat)

Confusion matrix on train set
[[146962      0]
 [     3    920]]


In [127]:
#all
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set')
print(conf_mat)

Confusion matrix on test set
[[32169    10]
 [  227     4]]


In [26]:
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set')
print(conf_mat)

Confusion matrix on test set
[[32167    12]
 [  227     4]]


In [106]:
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set')
print(conf_mat)

Confusion matrix on test set
[[32171     8]
 [  229     2]]


In [19]:
clf.feature_importances_

array([0.24051745, 0.32530681, 0.03390778, 0.18367122, 0.16302121,
       0.05357554])

### SVM

In [63]:
# RBF-kernel SVM with the positive class weighted 10x to counter the heavy
# class imbalance; probability=True enables probability estimates.
print('Training the classifier')
start_time = time.time()
# Other settings tried: no class_weight, and a polynomial kernel
# (NOTE(review): scikit-learn spells that kernel 'poly' -- confirm).
clf = svm.SVC(kernel='rbf',probability=True,class_weight={1: 10},tol=1e-3)
clf = clf.fit(np.array(X_training), np.array(y_training))
print(f"Time of fit: {time.time()-start_time}")
start_time = time.time()
y_prediction = clf.predict(np.array(X_test))
print(f"Time of prediction: {time.time()-start_time}")

Training the classifier




Time of fit: 1451.1102449893951
Time of prediction: 6.75842547416687


In [48]:
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set')
print(conf_mat)

Confusion matrix on test set
[[32173     6]
 [  231     0]]


In [64]:
# kernel='rbf',probability=True,class_weight={1: 10}
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set')
print(conf_mat)

Confusion matrix on test set
[[32048   131]
 [  211    20]]


### XGBoost

In [217]:
# Gradient-boosted trees (XGBoost, scikit-learn style wrapper).
print('Training the classifier')
start_time = time.time()
# NOTE(review): eval_set is an argument of fit() in xgboost's scikit-learn
# API; passed to the constructor it is presumably not used, so early stopping
# likely never triggers here -- confirm against the installed xgboost version.
# NOTE(review): using (X_test, y_test) as the early-stopping set would tune
# the model on the data it is evaluated on; prefer a held-out validation split.
clf = xgb.XGBClassifier(n_estimators=100,early_stopping_rounds=10,
                        eval_set=[(X_test, y_test)])
# Alternative objective tried: objective='binary:hinge'
clf = clf.fit(np.array(X_training), np.array(y_training) )
print(f"Time of fit: {time.time()-start_time}")
start_time = time.time()
y_prediction = clf.predict(X_test)
print(f"Time of prediction: {time.time()-start_time}")

Training the classifier
Time of fit: 7.5019166469573975
Time of prediction: 0.1635732650756836


In [218]:
#all
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set')
print(conf_mat)

Confusion matrix on test set
[[32173     6]
 [  228     3]]


In [28]:
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set')
print(conf_mat)

Confusion matrix on test set
[[32170     9]
 [  228     3]]


In [59]:
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set')
print(conf_mat)

Confusion matrix on test set
[[32175     4]
 [  230     1]]


### CatBoost

In [200]:
# Gradient-boosted trees (CatBoost).
print('Training the classifier')
start_time = time.time()
# NOTE(review): no eval_set is given to fit(), so early_stopping_rounds has
# no validation metric to monitor; the recorded log runs to ~1000 iterations,
# which suggests early stopping never fired -- confirm.
clf = CatBoostClassifier(iterations=1000,
                           early_stopping_rounds=10,
                           learning_rate=0.1)
clf = clf.fit(np.array(X_training), np.array(y_training) )
print(f"Time of fit: {time.time()-start_time}")
start_time = time.time()
y_prediction = clf.predict(X_test)
print(f"Time of prediction: {time.time()-start_time}")

Training the classifier
0:	learn: 0.4471809	total: 15.1ms	remaining: 15.1s
1:	learn: 0.2930305	total: 31.2ms	remaining: 15.5s
2:	learn: 0.2012458	total: 47.4ms	remaining: 15.8s
3:	learn: 0.1440891	total: 65.1ms	remaining: 16.2s
4:	learn: 0.1079130	total: 82.1ms	remaining: 16.3s
5:	learn: 0.0848382	total: 103ms	remaining: 17.1s
6:	learn: 0.0693590	total: 120ms	remaining: 17s
7:	learn: 0.0582994	total: 150ms	remaining: 18.6s
8:	learn: 0.0511471	total: 182ms	remaining: 20.1s
9:	learn: 0.0460086	total: 199ms	remaining: 19.7s
10:	learn: 0.0421718	total: 225ms	remaining: 20.3s
11:	learn: 0.0395404	total: 247ms	remaining: 20.4s
12:	learn: 0.0374653	total: 264ms	remaining: 20s
13:	learn: 0.0359406	total: 281ms	remaining: 19.8s
14:	learn: 0.0347300	total: 297ms	remaining: 19.5s
15:	learn: 0.0337293	total: 321ms	remaining: 19.7s
16:	learn: 0.0330433	total: 344ms	remaining: 19.9s
17:	learn: 0.0324910	total: 376ms	remaining: 20.5s
18:	learn: 0.0319936	total: 396ms	remaining: 20.4s
19:	learn: 0.031

163:	learn: 0.0252564	total: 3.17s	remaining: 16.2s
164:	learn: 0.0252246	total: 3.19s	remaining: 16.1s
165:	learn: 0.0252153	total: 3.2s	remaining: 16.1s
166:	learn: 0.0251782	total: 3.22s	remaining: 16.1s
167:	learn: 0.0251577	total: 3.24s	remaining: 16s
168:	learn: 0.0251294	total: 3.27s	remaining: 16.1s
169:	learn: 0.0251128	total: 3.3s	remaining: 16.1s
170:	learn: 0.0250771	total: 3.33s	remaining: 16.1s
171:	learn: 0.0250624	total: 3.35s	remaining: 16.1s
172:	learn: 0.0250540	total: 3.38s	remaining: 16.2s
173:	learn: 0.0250439	total: 3.41s	remaining: 16.2s
174:	learn: 0.0250286	total: 3.43s	remaining: 16.2s
175:	learn: 0.0250202	total: 3.46s	remaining: 16.2s
176:	learn: 0.0249718	total: 3.49s	remaining: 16.2s
177:	learn: 0.0249689	total: 3.52s	remaining: 16.3s
178:	learn: 0.0249524	total: 3.55s	remaining: 16.3s
179:	learn: 0.0249437	total: 3.58s	remaining: 16.3s
180:	learn: 0.0249167	total: 3.61s	remaining: 16.3s
181:	learn: 0.0248876	total: 3.64s	remaining: 16.4s
182:	learn: 0.02

327:	learn: 0.0219964	total: 6.6s	remaining: 13.5s
328:	learn: 0.0219427	total: 6.63s	remaining: 13.5s
329:	learn: 0.0219302	total: 6.65s	remaining: 13.5s
330:	learn: 0.0219130	total: 6.67s	remaining: 13.5s
331:	learn: 0.0219038	total: 6.69s	remaining: 13.5s
332:	learn: 0.0218993	total: 6.7s	remaining: 13.4s
333:	learn: 0.0218950	total: 6.72s	remaining: 13.4s
334:	learn: 0.0218652	total: 6.74s	remaining: 13.4s
335:	learn: 0.0218564	total: 6.76s	remaining: 13.4s
336:	learn: 0.0218359	total: 6.78s	remaining: 13.3s
337:	learn: 0.0218209	total: 6.79s	remaining: 13.3s
338:	learn: 0.0218081	total: 6.82s	remaining: 13.3s
339:	learn: 0.0217940	total: 6.84s	remaining: 13.3s
340:	learn: 0.0217705	total: 6.86s	remaining: 13.3s
341:	learn: 0.0217664	total: 6.88s	remaining: 13.2s
342:	learn: 0.0217410	total: 6.9s	remaining: 13.2s
343:	learn: 0.0217283	total: 6.92s	remaining: 13.2s
344:	learn: 0.0217180	total: 6.94s	remaining: 13.2s
345:	learn: 0.0217084	total: 6.96s	remaining: 13.2s
346:	learn: 0.0

488:	learn: 0.0195932	total: 9.9s	remaining: 10.3s
489:	learn: 0.0195904	total: 9.91s	remaining: 10.3s
490:	learn: 0.0195652	total: 9.93s	remaining: 10.3s
491:	learn: 0.0195549	total: 9.95s	remaining: 10.3s
492:	learn: 0.0195394	total: 9.96s	remaining: 10.2s
493:	learn: 0.0195162	total: 9.99s	remaining: 10.2s
494:	learn: 0.0194996	total: 10s	remaining: 10.2s
495:	learn: 0.0194921	total: 10s	remaining: 10.2s
496:	learn: 0.0194806	total: 10.1s	remaining: 10.2s
497:	learn: 0.0194719	total: 10.1s	remaining: 10.1s
498:	learn: 0.0194616	total: 10.1s	remaining: 10.1s
499:	learn: 0.0194559	total: 10.1s	remaining: 10.1s
500:	learn: 0.0194460	total: 10.1s	remaining: 10.1s
501:	learn: 0.0194348	total: 10.1s	remaining: 10.1s
502:	learn: 0.0194303	total: 10.2s	remaining: 10s
503:	learn: 0.0194267	total: 10.2s	remaining: 10s
504:	learn: 0.0194076	total: 10.2s	remaining: 9.99s
505:	learn: 0.0194045	total: 10.2s	remaining: 9.96s
506:	learn: 0.0193751	total: 10.2s	remaining: 9.94s
507:	learn: 0.0193437

648:	learn: 0.0174919	total: 12.9s	remaining: 6.99s
649:	learn: 0.0174871	total: 12.9s	remaining: 6.97s
650:	learn: 0.0174783	total: 13s	remaining: 6.95s
651:	learn: 0.0174745	total: 13s	remaining: 6.92s
652:	learn: 0.0174727	total: 13s	remaining: 6.9s
653:	learn: 0.0174583	total: 13s	remaining: 6.88s
654:	learn: 0.0174528	total: 13s	remaining: 6.86s
655:	learn: 0.0174492	total: 13s	remaining: 6.84s
656:	learn: 0.0174339	total: 13.1s	remaining: 6.82s
657:	learn: 0.0174229	total: 13.1s	remaining: 6.79s
658:	learn: 0.0174168	total: 13.1s	remaining: 6.77s
659:	learn: 0.0173979	total: 13.1s	remaining: 6.75s
660:	learn: 0.0173740	total: 13.1s	remaining: 6.74s
661:	learn: 0.0173544	total: 13.2s	remaining: 6.71s
662:	learn: 0.0173399	total: 13.2s	remaining: 6.7s
663:	learn: 0.0173203	total: 13.2s	remaining: 6.67s
664:	learn: 0.0173140	total: 13.2s	remaining: 6.65s
665:	learn: 0.0173024	total: 13.2s	remaining: 6.63s
666:	learn: 0.0172951	total: 13.2s	remaining: 6.61s
667:	learn: 0.0172864	tota

815:	learn: 0.0157862	total: 16.1s	remaining: 3.64s
816:	learn: 0.0157662	total: 16.2s	remaining: 3.62s
817:	learn: 0.0157562	total: 16.2s	remaining: 3.6s
818:	learn: 0.0157533	total: 16.2s	remaining: 3.58s
819:	learn: 0.0157456	total: 16.2s	remaining: 3.56s
820:	learn: 0.0157414	total: 16.2s	remaining: 3.54s
821:	learn: 0.0157363	total: 16.3s	remaining: 3.52s
822:	learn: 0.0157305	total: 16.3s	remaining: 3.51s
823:	learn: 0.0157048	total: 16.3s	remaining: 3.49s
824:	learn: 0.0156911	total: 16.4s	remaining: 3.48s
825:	learn: 0.0156751	total: 16.4s	remaining: 3.46s
826:	learn: 0.0156521	total: 16.4s	remaining: 3.44s
827:	learn: 0.0156459	total: 16.4s	remaining: 3.41s
828:	learn: 0.0156301	total: 16.5s	remaining: 3.39s
829:	learn: 0.0156029	total: 16.5s	remaining: 3.37s
830:	learn: 0.0155859	total: 16.5s	remaining: 3.35s
831:	learn: 0.0155811	total: 16.5s	remaining: 3.33s
832:	learn: 0.0155678	total: 16.5s	remaining: 3.31s
833:	learn: 0.0155571	total: 16.5s	remaining: 3.29s
834:	learn: 0

976:	learn: 0.0142353	total: 19.2s	remaining: 452ms
977:	learn: 0.0142291	total: 19.2s	remaining: 433ms
978:	learn: 0.0142216	total: 19.3s	remaining: 413ms
979:	learn: 0.0142163	total: 19.3s	remaining: 393ms
980:	learn: 0.0142103	total: 19.3s	remaining: 374ms
981:	learn: 0.0141958	total: 19.3s	remaining: 354ms
982:	learn: 0.0141917	total: 19.3s	remaining: 334ms
983:	learn: 0.0141847	total: 19.3s	remaining: 315ms
984:	learn: 0.0141614	total: 19.4s	remaining: 295ms
985:	learn: 0.0141549	total: 19.4s	remaining: 275ms
986:	learn: 0.0141504	total: 19.4s	remaining: 255ms
987:	learn: 0.0141452	total: 19.4s	remaining: 236ms
988:	learn: 0.0141331	total: 19.4s	remaining: 216ms
989:	learn: 0.0141205	total: 19.5s	remaining: 197ms
990:	learn: 0.0141083	total: 19.5s	remaining: 177ms
991:	learn: 0.0141052	total: 19.5s	remaining: 157ms
992:	learn: 0.0140945	total: 19.5s	remaining: 138ms
993:	learn: 0.0140854	total: 19.5s	remaining: 118ms
994:	learn: 0.0140717	total: 19.5s	remaining: 98.2ms
995:	learn:

In [201]:
# Run configuration: "all" (presumably the full feature set — confirm).
# Rows of the matrix are true labels, columns are predicted labels.
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set', conf_mat, sep='\n')

Confusion matrix on test set
[[32155    24]
 [  228     3]]


In [45]:
# Rows of the matrix are true labels, columns are predicted labels.
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set', conf_mat, sep='\n')

Confusion matrix on test set
[[32154    25]
 [  225     6]]


In [203]:
# Rows of the matrix are true labels, columns are predicted labels.
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set', conf_mat, sep='\n')

Confusion matrix on test set
[[32162    17]
 [  226     5]]


### MLPClassifier

In [172]:
# Fit an MLP with five hidden layers of 100 units on the training features,
# then label the test set. Wall-clock time is reported for each step.
print('Training the classifier')
start_time = time.time()
clf = neural_network.MLPClassifier(
    hidden_layer_sizes=(100,) * 5,
    learning_rate_init=0.0001,
    verbose=True,
).fit(X_training, y_training)  # fit() returns the fitted estimator itself
fit_elapsed = time.time() - start_time
print(f"Time of fit: {fit_elapsed}")

start_time = time.time()
y_prediction = clf.predict(X_test)
predict_elapsed = time.time() - start_time
print(f"Time of prediction: {predict_elapsed}")

Training the classifier
Iteration 1, loss = 0.07086089
Iteration 2, loss = 0.03217128
Iteration 3, loss = 0.03165446
Iteration 4, loss = 0.03139923
Iteration 5, loss = 0.03123551
Iteration 6, loss = 0.03112792
Iteration 7, loss = 0.03094385
Iteration 8, loss = 0.03087728
Iteration 9, loss = 0.03077148
Iteration 10, loss = 0.03073340
Iteration 11, loss = 0.03056610
Iteration 12, loss = 0.03053833
Iteration 13, loss = 0.03051315
Iteration 14, loss = 0.03044407
Iteration 15, loss = 0.03033452
Iteration 16, loss = 0.03028445
Iteration 17, loss = 0.03020966
Iteration 18, loss = 0.03029790
Iteration 19, loss = 0.03011276
Iteration 20, loss = 0.03014852
Iteration 21, loss = 0.03016961
Iteration 22, loss = 0.03007019
Iteration 23, loss = 0.02998510
Iteration 24, loss = 0.02999223
Iteration 25, loss = 0.02997056
Iteration 26, loss = 0.02995444
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Time of fit: 128.52027201652527
Time of prediction: 0.349485397

In [173]:
# Rows of the matrix are true labels, columns are predicted labels.
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set', conf_mat, sep='\n')

Confusion matrix on test set
[[32165    14]
 [  228     3]]


In [113]:
# Run configuration: only the w2v features.
# Rows of the matrix are true labels, columns are predicted labels.
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set', conf_mat, sep='\n')

Confusion matrix on test set
[[32177     2]
 [  230     1]]


In [68]:
# Rows of the matrix are true labels, columns are predicted labels.
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set', conf_mat, sep='\n')

Confusion matrix on test set
[[32175     4]
 [  230     1]]


In [58]:
np.array(y_test)

array([-1, -1,  1, ..., -1, -1, -1])

In [59]:
y_prediction

array([-1, -1, -1, ..., -1, -1, -1])

In [60]:
# (number of test samples whose predicted label equals the true label, test-set size)
int(np.count_nonzero(np.array(y_test) == y_prediction)), len(y_test)

(32172, 32410)

In [114]:
# Number of test samples whose predicted label equals the true label.
int(np.count_nonzero(np.array(y_test) == y_prediction))

32178

In [64]:
len(y_test)

32410

### Ранжирование

Для каждого предложения reference paper и каждого доступного citance классификатор оценивает вероятность того, что это предложение является provenance. Глобальный балл каждого кандидата в provenance определяется как сумма всех его вероятностей. Предложения с наивысшим баллом отбираются в summary reference paper до тех пор, пока длина summary не превысит 250 слов. Отобранные предложения упорядочиваются в соответствии с их исходным положением в reference paper.

In [219]:
clf.predict_proba(X_test)

array([[0.99656284, 0.00343714],
       [0.9666594 , 0.0333406 ],
       [0.8919206 , 0.10807935],
       ...,
       [0.9980294 , 0.00197056],
       [0.998102  , 0.00189799],
       [0.99869853, 0.00130149]], dtype=float32)

In [220]:
# create_summaries(test_path, clf.predict_proba(X_test))

In [221]:
create_summaries_top_k(test_path, clf.predict_proba(X_test))

Creating summary for paper W09-0621
Creating summary for paper D10-1058
Creating summary for paper C98-1097
Creating summary for paper W11-0815
Creating summary for paper D09-1023
Creating summary for paper P00-1025
Creating summary for paper N09-1001
Creating summary for paper W06-3909
Creating summary for paper P07-1040
Creating summary for paper N09-1025


### Evaluation

https://github.com/pltrdy/rouge

In [222]:
from collections import defaultdict

In [223]:
# Group the generated summaries by paper id.
# Every entry of `summary/` is a file named "<paper-id>-<k>.system.txt".
my_summary = defaultdict(list)
summary_dir_path = 'summary/'
for directory in sorted(listdir(summary_dir_path)):
    # Drop the extensions, then the two-character "-<k>" suffix.
    key = directory.split('.')[0][:-2]
    with open(summary_dir_path + directory) as f:
        content = f.read()
    print(directory, key)
    my_summary[key].append(content)

C98-1097-1.system.txt C98-1097
C98-1097-2.system.txt C98-1097
C98-1097-3.system.txt C98-1097
D09-1023-1.system.txt D09-1023
D09-1023-2.system.txt D09-1023
D09-1023-3.system.txt D09-1023
D10-1058-1.system.txt D10-1058
D10-1058-2.system.txt D10-1058
D10-1058-3.system.txt D10-1058
N09-1001-1.system.txt N09-1001
N09-1001-2.system.txt N09-1001
N09-1001-3.system.txt N09-1001
N09-1025-1.system.txt N09-1025
N09-1025-2.system.txt N09-1025
N09-1025-3.system.txt N09-1025
P00-1025-1.system.txt P00-1025
P00-1025-2.system.txt P00-1025
P00-1025-3.system.txt P00-1025
P07-1040-1.system.txt P07-1040
P07-1040-2.system.txt P07-1040
P07-1040-3.system.txt P07-1040
W06-3909-1.system.txt W06-3909
W06-3909-2.system.txt W06-3909
W06-3909-3.system.txt W06-3909
W09-0621-1.system.txt W09-0621
W09-0621-2.system.txt W09-0621
W09-0621-3.system.txt W09-0621
W11-0815-1.system.txt W11-0815
W11-0815-2.system.txt W11-0815
W11-0815-3.system.txt W11-0815


In [224]:
my_summary['P07-1040']

['In speech recognition, confusion network decoding (Mangu et al., 2000) has become widely used in system combination. Recently, confusion network decoding for MT system combination has been proposed (Bangalore et al., 2001). In (Bangalore et al., 2001), Levenshtein alignment was used to generate the network. A modified Levenshtein alignment allowing shifts as in computation of the translation edit rate (TER) (Snover et al., 2006) was used to align hy Proceedings of the 45th Annual Meeting of the Association of Computational Linguistics, pages 312–319, Prague, Czech Republic, June 2007. Qc 2007 Association for Computational Linguistics potheses in (Sim et al., 2007). Minimum Bayes risk (MBR) was used to choose the skeleton in (Sim et al., 2007). This work was extended in (Rosti et al., 2007) by introducing system weights for word confidences. In (Rosti et al., 2007), simple score was assigned to the word coming from the th- best hypothesis. In (Rosti et al., 2007), the total confidence

In [225]:
# Load the gold summaries from `Gold/` into one dictionary per summary kind,
# keyed by the part of the file name before the first dot.
gold_dir_path = 'Gold/'
summary_abstract = {}
summary_combined = {}
summary_community = {}
summary_human = {}

# File-name suffix -> dictionary that collects that kind of gold summary.
gold_targets = {
    "abstract.summary.txt": summary_abstract,
    "community.summary.txt": summary_community,
    "combined.summary.txt": summary_combined,
    "human.summary.txt": summary_human,
}

for directory in listdir(gold_dir_path):
    with open(gold_dir_path + directory) as f:
        content = f.read()
    key = directory.split('.')[0]
    target = next(
        (store for suffix, store in gold_targets.items() if directory.endswith(suffix)),
        None,
    )
    if target is None:
        # Unknown file kind: warn and stop; the remaining files are not loaded.
        print('WARNING!')
        break
    target[key] = content

In [226]:
summary_abstract['C98-1097']

'Text Segmentation Using Reiteration and Collocation A method is presented for segmenting text into subtopic areas. The proportion of related pairwise words is calculated between adjacent windows of text to determine their lexical similarity. The lexical cohesion relations of reiteration and collocation are used to identify related words. These relations are automatically located using a combination of three linguistic features: word repetition, collocation and relation weights. This method is shown to successfully detect known subject changes in text and corresponds well to the segmentations placed by test subjects. '

In [227]:
summary_human['C98-1097']

"The paper 'Text Segmentation Using Reiteration and Collocation'  by Amanda C. Jobbins and Lindsay J. Evett presentes a method for segmenting a text into subtopics using Lexical cohesion.Lexical cohesion is expressed through the vocabulary used in text and the semantic relations between those words.For automatic detection of lexical cohesion  ties between pairwise words, three linguistic features were considered: word repetition, collocation and relation weights.The proposed segmentation algorithm compares adjacent windows of sentences and determines their lexical similarity. In first investigation word repetition alone achieved better results than using either collocation or relation weights individually. The combination of word repetition with another linguistic feature improved on its individual result, where less troughs were placed per text.In second investigation, recall rates tended to be lower than precision rates because the algorithm identified fewer segments than the test su

In [228]:
# Score the first three generated summaries of every paper against each kind of
# gold summary. Each dictionary maps a paper id to the list of its ROUGE results.
scores_abstract = defaultdict(list)
scores_community = defaultdict(list)
scores_human = defaultdict(list)
for key in summary_human:
    for i in range(3):
        for scores, gold in (
            (scores_abstract, summary_abstract),
            (scores_community, summary_community),
            (scores_human, summary_human),
        ):
            scores[key].extend(rouge.get_scores(my_summary[key][i], gold[key]))

In [229]:
def average_rouge(scores,rouge='rouge-2',metric='f'):
    """Collect one ROUGE value per entry of *scores*.

    Despite the name, no averaging happens here: the values are returned as a
    list, in the iteration order of ``scores``.

    Args:
        scores: mapping whose values are indexable as ``value[rouge][metric]``,
            i.e. one score dictionary per key (not a list of them).
        rouge: name of the ROUGE variant to read, e.g. ``'rouge-2'``.
        metric: name of the metric inside that variant, e.g. ``'f'``.

    Returns:
        A list with ``value[rouge][metric]`` for every value in ``scores``.
    """
    return [per_paper[rouge][metric] for per_paper in scores.values()]

In [230]:
def average_rouge_top(scores,rouge='rouge-2',metric='f'):
    """Return, for every entry of *scores*, its best value of one ROUGE metric.

    Despite the name, no averaging happens here: one maximum per key is
    returned, in the iteration order of ``scores``.

    Args:
        scores: mapping from a key to a non-empty sequence of score
            dictionaries, each indexable as ``score[rouge][metric]``.
        rouge: name of the ROUGE variant to read, e.g. ``'rouge-2'``.
        metric: name of the metric inside that variant, e.g. ``'f'``.

    Returns:
        A list holding the maximum of ``score[rouge][metric]`` over the
        candidates of each key.

    Raises:
        ValueError: if a key maps to an empty sequence (raised by ``np.max``).
    """
    # Read the requested variant and metric; they used to be ignored in
    # favour of a hard-coded 'rouge-2' / 'f'.
    return [
        np.max([candidate[rouge][metric] for candidate in candidates])
        for candidates in scores.values()
    ]

In [231]:
# Best ROUGE-2 F-score per paper against the abstract, community and human
# gold summaries.
abstract_r2_top, community_r2_top, human_r2_top = (
    average_rouge_top(scores, rouge='rouge-2', metric='f')
    for scores in (scores_abstract, scores_community, scores_human)
)

SVM with rbf

In [62]:
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.07787471817756389, 0.14078694078267812, 0.10140411466064743)

XGBOOST 100 estimators

In [233]:
# only w2v features +seq_+ f1_feat+hdp+lda+ top3
np.mean(abstract_r2_top),np.mean(community_r2_top),np.mean(human_r2_top)

(0.29078129374670153, 0.3646400300537678, 0.20672613209210627)

In [141]:
# only w2v features +seq_+ f1_feat+hdp+lda
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.19324836251909452, 0.31241529455647604, 0.1579678258162968)

In [42]:
# only w2v features +seq_+ f1_feat
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.1982530602605909, 0.3372880651527327, 0.17542565169001512)

In [153]:
# only w2v features +seq_
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.21157892678256546, 0.31760102707938975, 0.18469409975970102)

In [66]:
# only w2v features 
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.19954038701545704, 0.3233288424347874, 0.149570976438548)

In [75]:
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.22951831953767093, 0.3110868759978951, 0.17520002466856807)

CatBoost

In [216]:
# all+top3
np.mean(abstract_r2_top),np.mean(community_r2_top),np.mean(human_r2_top)

(0.25130789793014474, 0.33627280522719694, 0.21403042532207833)

In [156]:
# all
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.2511979310967296, 0.28882339888771347, 0.18636222070100222)

In [60]:
# only w2v features +seq_+f1_feat
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.2340680811961347, 0.3393195285789483, 0.20316824858832608)

In [173]:
# only w2v features +seq_
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.2335245698609633, 0.3589430951751397, 0.16606803459535394)

MLPClassifier

In [199]:
# all-top3
np.mean(abstract_r2_top),np.mean(community_r2_top),np.mean(human_r2_top)

(0.27953123279875486, 0.3733387880160248, 0.2198220265613308)

In [185]:
# all
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.22749446736618434, 0.3390272849676415, 0.14407388766975493)

In [171]:
# all
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.2088605019269992, 0.304498999475492, 0.18219976440499414)

In [109]:
# only w2v features +seq_+f1_feat
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.26047637230255, 0.3060272572233433, 0.15140396026885963)

In [77]:
# only w2v features +seq_+f1_feat
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.2878034209847514, 0.2962740992909815, 0.1815719998672054)

In [202]:
# only w2v features +seq_
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.2330997495789881, 0.3185983338645845, 0.15646041677958894)

In [126]:
# only w2v features 
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.28513537303066505, 0.32439938426629594, 0.19037373170853195)

In [81]:
# only w2v features 
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.2906227673138955, 0.27673534723315296, 0.18172820836125608)

In [67]:
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.26870249097141874, 0.3203938411164929, 0.17075949648590674)

Все норм