# Citation based summarization

The `scisumm-corpus` directory contains the corpus of papers published in the context of the [CL-SciSumm 2017](http://wing.comp.nus.edu.sg/~cl-scisumm2017/) challenge.

## Терминология

Статьи, которые необходимо обобщить, называются **reference papers**. Каждая reference paper связана с набором **citing papers**. Предложения citing paper, в которых цитируется соответствующая reference paper, называются **citances**. Соответствующие им предложения в reference paper называются **provenances**. **Аннотация** — это соответствие между набором citances цитирующей статьи и набором provenances соответствующей reference paper.


In [67]:
import re
from os import listdir
from os.path import join

import numpy as np
from sklearn import neural_network
from sklearn.metrics import confusion_matrix

import scisummgen

import time
from sklearn import ensemble
from sklearn import svm
import pandas as pd
import xgboost as xgb

from gensim.models.word2vec import Word2Vec
from multiprocessing import cpu_count
import gensim.downloader as api
import re
import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))
from rouge import Rouge 
rouge = Rouge()
from catboost import CatBoostClassifier

from difflib import SequenceMatcher

[nltk_data] Downloading package stopwords to /home/andrey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [68]:
training_path = 'scisumm-corpus/Training-Set-2017'
test_path = 'scisumm-corpus/Test-Set-2017'

можно добавить jaccard similarity, idf_sim ,w2v_sim

In [None]:
def calc_Rouge(hypothesis,reference):
    """Score ``hypothesis`` against ``reference`` and flatten the result.

    A fresh ``Rouge`` scorer is created on every call.  Only the first entry
    of the scorer's output is used.

    Returns:
        A tuple holding every value of the ``'rouge-1'`` entry, followed by
        every value of ``'rouge-2'`` and then ``'rouge-l'``.  Within each
        metric the values keep the order in which the scorer's dict yields
        them.
    """
    scorer = Rouge()
    first_result = scorer.get_scores(hypothesis, reference)[0]

    return tuple(
        value
        for metric in ('rouge-1', 'rouge-2', 'rouge-l')
        for value in first_result[metric].values()
    )
calc_Rouge(hypothesis,reference)

In [71]:
def _relative_position(index, index_max):
    """Return ``index / index_max``, or 0 when ``index_max`` is zero."""
    try:
        return index / index_max
    except ZeroDivisionError:
        return 0


def create_features(path):
    """Build the feature matrix and labels for every paper directory under ``path``.

    One row is produced for every (citance, reference sentence) pair of every
    paper, in the order: directories as returned by ``listdir(path)``, then
    the paper's citances, then the reference paper's sentences.  Code that
    later maps predictions back to sentences has to walk the data in this
    same order.

    Args:
        path: Directory containing one sub-directory per paper.

    Returns:
        ``(X, y)`` where ``X`` is a list of 8-element rows
        ``[tfidf, lsi, lda, hdp, bigrams, sid_pos, ssid_pos, section_pos]``
        and ``y`` is a list with ``1`` when the sentence id is listed in the
        citance's ``'RO'`` entry and ``-1`` otherwise.
    """
    X = []
    y = []

    # For all the papers
    for num, directory in enumerate(listdir(path)):
        print('num = ',num,' directory',directory)
        paper = scisummgen.Paper(join(path, directory))
        similarity = scisummgen.Similarity(paper)

        # For all the citances
        for citance in paper.annotation.citances:
            # The citance text depends only on the citance, so it is fetched
            # once here instead of once per reference sentence.
            citance_text = paper.get_citance_text(citance)

            # For all the sentences of the reference paper
            for sentence_sid, sentence in paper.reference.sentences.items():
                sentence_text = sentence['text']

                X.append([
                    similarity.tfidf_similarity(sentence_text, citance_text),
                    similarity.lsi_similarity(sentence_text, citance_text),
                    similarity.lda_similarity(sentence_text, citance_text),
                    similarity.hdp_similarity(sentence_text, citance_text),
                    similarity.count_bigrams(sentence_text, citance_text),
                    _relative_position(sentence['sid'], sentence['sid_max']),
                    _relative_position(sentence['ssid'], sentence['ssid_max']),
                    _relative_position(sentence['section'], sentence['section_max']),
                ])

                # Check if this sentence is also a provenance
                y.append(1 if sentence_sid in citance['RO'] else -1)

    return X, y

In [72]:
def create_summaries(path, y_probability, max_words=250, output_dir='summary'):
    """Write one extractive summary file per paper directory under ``path``.

    For every paper, the score of a reference sentence is the sum, over all
    citances, of ``y_probability[row][1]``.  Rows are consumed in the order
    papers (``listdir(path)``), citances, reference sentences, so
    ``y_probability`` must have been computed on features generated in that
    same order.  Sentences are taken by decreasing score, skipping the one
    whose ``'sid'`` is 0 (the title), until adding the next sentence would
    push the word count above ``max_words``.  The selected sentences are then
    written in ``'sid'`` order, separated by single spaces.

    Args:
        path: Directory containing one sub-directory per paper.
        y_probability: Indexable of rows; column 1 of each row is used as the
            probability of being a provenance.
        max_words: Word budget of a summary (``\\w+`` tokens).
        output_dir: Directory receiving ``<paper>.system.txt``; it is created
            if it does not exist.
    """
    # Local import: the surrounding file only imports ``listdir`` from ``os``.
    from os import makedirs

    y_index = 0

    # For all the test papers
    for directory in listdir(path):
        print('Creating summary for paper', directory)
        paper = scisummgen.Paper(join(path, directory))
        sentences = paper.reference.sentences
        sentence_scores = np.zeros(next(iter(sentences.values()))['sid_max'] + 1)

        # Accumulate, for every citance, the provenance probability of each
        # sentence of the reference paper.
        for citance in paper.annotation.citances:
            for sentence_sid, _ in sentences.items():
                sentence_scores[sentence_sid] += y_probability[y_index][1]
                y_index += 1

        # Walk the sentences from the highest score down.
        summary_sentences = []
        tot_words = 0
        for sid in sentence_scores.argsort()[::-1]:
            sentence = sentences[sid]
            # Avoid considering the title
            if sentence['sid'] == 0:
                continue
            # Raw string: '\w' in a plain literal is an invalid escape sequence.
            tot_words += len(re.findall(r'\w+', sentence['text'].lower()))
            if tot_words > max_words:
                break
            summary_sentences.append(sentence)

        # Restore the original reading order.
        summary_sentences.sort(key=lambda x: x['sid'])
        summary = ' '.join(s['text'] for s in summary_sentences).strip(' ')

        makedirs(output_dir, exist_ok=True)
        with open(join(output_dir, directory + '.system.txt'), 'w', encoding='utf-8') as file:
            file.write(summary)

In [199]:
def create_summaries_top_k(path, y_probability,k=3, max_words=250, output_dir='summary'):
    """Write ``k`` successive, non-overlapping summaries per paper directory.

    Sentence scores are computed as the sum, over all citances, of
    ``y_probability[row][1]``.  Rows are consumed in the order papers
    (``listdir(path)``), citances, reference sentences.  Summary 1 takes the
    best-scored sentences that fit in ``max_words``; summary 2 is built the
    same way from the sentences not used so far, and so on up to ``k``.
    The sentence whose ``'sid'`` is 0 (the title) is never used.

    A sentence that does not fit in the current summary stays available for
    the next one.  Previously it was dropped from the candidate pool without
    appearing in any summary.  A sentence that is longer than ``max_words``
    on its own can never fit and is discarded.

    Args:
        path: Directory containing one sub-directory per paper.
        y_probability: Indexable of rows; column 1 of each row is used as the
            probability of being a provenance.
        k: Number of summaries to write per paper.
        max_words: Word budget of each summary (``\\w+`` tokens).
        output_dir: Directory receiving ``<paper>-<n>.system.txt``; it is
            created if it does not exist.
    """
    # Local import: the surrounding file only imports ``listdir`` from ``os``.
    from os import makedirs

    y_index = 0

    # For all the test papers
    for directory in listdir(path):
        print('Creating summary for paper', directory)
        paper = scisummgen.Paper(join(path, directory))
        sentences = paper.reference.sentences
        sentence_scores = np.zeros(next(iter(sentences.values()))['sid_max'] + 1)

        # Accumulate, for every citance, the provenance probability of each
        # sentence of the reference paper.
        for citance in paper.annotation.citances:
            for sentence_sid, _ in sentences.items():
                sentence_scores[sentence_sid] += y_probability[y_index][1]
                y_index += 1

        # Candidate sentence ids, best score first.
        remaining_sids = sentence_scores.argsort()[::-1]
        makedirs(output_dir, exist_ok=True)

        for num in range(1, k + 1):
            summary_sentences = []
            tot_words = 0
            consumed_sids = []

            for sid in remaining_sids:
                sentence = sentences[sid]
                # Avoid considering the title (and never look at it again).
                if sentence['sid'] == 0:
                    consumed_sids.append(sid)
                    continue
                # Raw string: '\w' in a plain literal is an invalid escape sequence.
                words = len(re.findall(r'\w+', sentence['text'].lower()))
                if tot_words + words > max_words:
                    if words > max_words:
                        # Can never fit in any summary: drop it so it does
                        # not block the following summaries.
                        consumed_sids.append(sid)
                    break
                tot_words += words
                consumed_sids.append(sid)
                summary_sentences.append(sentence)

            remaining_sids = remaining_sids[~np.isin(remaining_sids, consumed_sids)]

            # Restore the original reading order.
            summary_sentences.sort(key=lambda x: x['sid'])
            summary = ' '.join(s['text'] for s in summary_sentences).strip(' ')

            with open(join(output_dir, f'{directory}-{num}.system.txt'), 'w', encoding='utf-8') as file:
                file.write(summary)


### Подготовка текста

Каждый reference paper представлен классом ***Paper class***. Этот класс *содержит reference paper, соответствующие цитирующие документы и соответствующие аннотации*. Документы представлены классом ***Document***, который *содержит фактические предложения*. *Список аннотаций* представлен классом  ***Annotation***.

Предложения:
- переводятся в нижний регистр
- и затем токенизируется с помощью регулярного выражения `\w+`. Токены, перечисленные в файле `stopwords.txt`, игнорируются.

### Предсказание provenance 

For each reference paper, for each citance, and for each sentence of the reference paper, it is possible to compute a set of features. These features are used to train a classifier (Random Forest, SVM, XGBoost, CatBoost and an MLP are tried below) with the goal of predicting whether a sentence of the reference paper is a provenance or not. The features considered include the following (the code additionally computes `lda` and `hdp` similarities):

* **tfidf**: the TF-IDF similarity between the two sentences as computed by `gensim`;
* **lsi**: the LSI similarity between the two sentences, as computed by `gensim` considering 50 topics;
* **bigrams**: the number of common bigrams between the two sentences;
* **sid_pos**: the position of the sentence in the reference paper;
* **ssid_pos**: the position of the sentence in the local section of the reference paper;
* **section_pos**: the position of the local section in the reference paper.

The corpus of documents used for computing the TF-IDF similarity and the LSI similarity includes all the sentences of the reference paper and all the sentences of all its citing papers.

The classifier is trained to predict the probability for a sentence of being a provenance given a particular citance. Please note that a citance, in practice, may include several sentences of the citing paper. These probabilities are predicted for each pair composed of a sentence of the reference paper and a citance of all its citing papers.

### Прогноз provenance

Для каждой reference paper, для каждого citance и для каждого предложения reference paper можно вычислить набор признаков. Эти признаки используются для обучения классификаторов, которые предсказывают, является ли предложение reference paper provenance или нет. Рассматриваются следующие признаки:

* **tfidf**: сходство TF-IDF между двумя предложениями, вычисленное с помощью `gensim`;
* **lsi**: сходство LSI между двумя предложениями, вычисленное `gensim` с учетом 50 тем;
* **bigrams**: количество общих биграмм между двумя предложениями;
* **sid_pos**: позиция предложения в справочном документе;
* **ssid_pos**: позиция предложения в локальном разделе справочного документа;
* **section_pos**: положение локального раздела в справочном документе.

Корпус документов, используемых для вычисления подобия TF-IDF и подобия LSI, включает все предложения справочного документа и все предложения всех цитирующих документов.

Классификатор обучен предсказывать вероятность того, что предложение является provenance для конкретного citance. Обратите внимание, что на практике citance может включать несколько предложений цитирующей статьи. Эти вероятности предсказываются для каждой пары, состоящей из предложения reference paper и citance из любой цитирующей её статьи.

In [73]:
start_time = time.time()
print('Creating features for the training set')
X_training, y_training = create_features(training_path)
print(f"Time of creating: {time.time()-start_time}")

Creating features for the training set
num =  0  directory W08-2222
num =  1  directory W03-0410
num =  2  directory W04-0213
num =  3  directory E03-1020
num =  4  directory N01-1011
num =  5  directory P98-1046
num =  6  directory I05-5011
num =  7  directory P05-1004
num =  8  directory P06-2124
num =  9  directory N04-1038
num =  10  directory D10-1083
num =  11  directory J98-2005
num =  12  directory J00-3003
num =  13  directory X96-1048
num =  14  directory J96-3004
num =  15  directory C02-1025
num =  16  directory P98-1081
num =  17  directory C00-2123
num =  18  directory C04-1089
num =  19  directory H89-2014
num =  20  directory C08-1098
num =  21  directory H05-1115
num =  22  directory C94-2154
num =  23  directory E09-2008
num =  24  directory P98-2143
num =  25  directory N06-2049
num =  26  directory C90-2039
num =  27  directory C10-1045
num =  28  directory P05-1053
num =  29  directory W95-0104
Time of creating: 939.109461069107


In [74]:
len(X_training),len(y_training)

(147885, 147885)

In [117]:
def lists2txt(dataset,name='file'):
    """Dump ``dataset`` to ``<name>.txt``, one item per line.

    An iterable item is written as its elements converted with ``str`` and
    joined by ``|``.  An item that cannot be iterated (e.g. a plain number)
    is written as ``str(item)``.  Any other error, such as a failing write,
    propagates to the caller instead of being silently swallowed.

    Args:
        dataset: Iterable of rows (iterables) and/or scalar values.
        name: Output path without the ``.txt`` extension.
    """
    with open(f'{name}.txt', 'w', encoding='utf-8') as file:
        for res in dataset:
            try:
                line = "|".join(map(str, res))
            except TypeError:
                # ``res`` is not iterable: fall back to its plain string form.
                line = str(res)
            file.write(line + '\n')

In [118]:
lists2txt(X_training,name='X_training_hdp_lda')
# lists2txt(y_training,name='y_training_new')

In [75]:
pd.Series(np.array(X_training)[:,2]).value_counts()

0.000000    67089
0.141421        3
0.141421        3
0.033386        2
0.268680        2
0.036906        2
0.019426        2
0.268680        2
0.141421        2
0.141421        2
0.319950        1
0.017041        1
0.091533        1
0.322496        1
0.050828        1
0.588447        1
0.025433        1
0.332834        1
0.918227        1
0.045712        1
0.314978        1
0.450350        1
0.321749        1
0.345879        1
0.343828        1
0.032961        1
0.392122        1
0.107873        1
0.049597        1
0.090775        1
            ...  
0.460305        1
0.009996        1
0.317577        1
0.128942        1
0.662239        1
0.132698        1
0.057939        1
0.040164        1
0.030673        1
0.024966        1
0.096629        1
0.221584        1
0.014543        1
0.388358        1
0.179298        1
0.038476        1
0.941813        1
0.556659        1
0.050986        1
0.023662        1
0.409310        1
0.171864        1
0.043476        1
0.629552        1
0.092126  

In [8]:
X_training[1]

[0.04203652932414605,
 0.22509949339999136,
 0.0,
 0,
 0.012195121951219513,
 0.16666666666666666,
 0.2]

In [9]:
X_training[2]

[0.07526056544129943,
 0.6316113439990435,
 0.0,
 0,
 0.018292682926829267,
 0.25,
 0.2]

In [8]:
set(y_training)

{-1, 1}

In [77]:
start_time = time.time()
print('Creating features for the test set')
X_test, y_test = create_features(test_path)
print(f"Time of creating: {time.time()-start_time}")

Creating features for the test set
num =  0  directory W09-0621
num =  1  directory D10-1058
num =  2  directory C98-1097
num =  3  directory W11-0815
num =  4  directory D09-1023
num =  5  directory P00-1025
num =  6  directory N09-1001
num =  7  directory W06-3909
num =  8  directory P07-1040
num =  9  directory N09-1025
Time of creating: 197.0065267086029


In [119]:
lists2txt(X_test,name='X_test_hdp_lda')
# lists2txt(y_training,name='y_training_new')

In [78]:
len(X_test),len(y_test)

(32410, 32410)

In [79]:
X_test[0]

[0.0, 0.20123946474666796, 0.0, 0.0, 0, 0.010101010101010102, 0.0625, 0.25]

In [11]:
X_test[0]

[0.0, 0.25557976787590375, 0, 0.010101010101010102, 0.0625, 0.25]

In [12]:
set(y_test)

{-1, 1}

#### Look at the class balance

In [13]:
y_tr = pd.Series(y_training)

In [14]:
y_tr.value_counts()

-1    146962
 1       923
dtype: int64

In [15]:
923/len(y_training)

0.006241336173377962

In [16]:
y_ts = pd.Series(y_test)
y_ts.value_counts()

-1    32179
 1      231
dtype: int64

In [17]:
231/len(y_test)

0.007127429805615551

In [18]:
# Count the training labels of each class: 1 and -1.
sum_wpos = sum(1 for label in y_training if label == 1.0)
sum_wneg = sum(1 for label in y_training if label == -1.0)

In [19]:
sum_wneg

146962

## Train

In [80]:
print('Training the classifier')
start_time = time.time()
clf = ensemble.RandomForestClassifier(n_estimators=1000,verbose=True,n_jobs=-1)
clf = clf.fit(X_training, y_training)
print(f"Time of fit: {time.time()-start_time}")
start_time = time.time()
y_prediction = clf.predict(X_test)
print(f"Time of prediction: {time.time()-start_time}")

Training the classifier


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:   27.2s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   35.3s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s


Time of fit: 35.729307889938354


[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.3s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    0.6s
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:    0.7s finished


Time of prediction: 0.8384726047515869


In [81]:
clf.feature_importances_

array([0.1919095 , 0.24338118, 0.15519214, 0.02126485, 0.03236696,
       0.15922571, 0.14495554, 0.05170413])

In [83]:
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set')
print(conf_mat)

Confusion matrix on test set
[[32174     5]
 [  230     1]]


### SVM

In [None]:
print('Training the classifier')
start_time = time.time()
clf = svm.SVC(kernel='rbf',probability=True,class_weight={1: 10},tol=1e-3)# class_weight={1: 10}, kernel='polynomial'
clf = clf.fit(np.array(X_training), np.array(y_training))
print(f"Time of fit: {time.time()-start_time}")
start_time = time.time()
y_prediction = clf.predict(np.array(X_test))
print(f"Time of prediction: {time.time()-start_time}")

In [48]:
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set')
print(conf_mat)

Confusion matrix on test set
[[32173     6]
 [  231     0]]


In [64]:
# kernel='rbf',probability=True,class_weight={1: 10}
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set')
print(conf_mat)

Confusion matrix on test set
[[32048   131]
 [  211    20]]


### XGBoost

In [176]:
print('Training the classifier')
start_time = time.time()
# Build an XGBoost classifier, fit it on the training features and time the
# prediction on the test features.
# NOTE(review): `early_stopping_rounds` and `eval_set` are passed to the
# constructor, while fit() below receives no evaluation data; the recorded
# output shows no early-stopping messages. Confirm against the installed
# xgboost version whether these arguments have any effect here.
# NOTE(review): the evaluation set is the test split itself; if early stopping
# does trigger, model selection would use the data the confusion matrix is
# reported on.
# NOTE(review): labels are -1/1 (see create_features); verify that the
# installed xgboost accepts them.
clf = xgb.XGBClassifier(n_estimators=100,early_stopping_rounds=10,
                        eval_set=[(X_test, y_test)])
# Alternative objective left disabled by the author:
#                         objective='binary:hinge'
clf = clf.fit(np.array(X_training), np.array(y_training) )
print(f"Time of fit: {time.time()-start_time}")
start_time = time.time()
y_prediction = clf.predict(X_test)
print(f"Time of prediction: {time.time()-start_time}")

Training the classifier
Time of fit: 4.980705261230469
Time of prediction: 0.11186528205871582


In [177]:
#with_hdp_lda
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set')
print(conf_mat)

Confusion matrix on test set
[[32177     2]
 [  230     1]]


In [22]:
#with_hdp
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set')
print(conf_mat)

Confusion matrix on test set
[[32176     3]
 [  229     2]]


In [59]:
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set')
print(conf_mat)

Confusion matrix on test set
[[32175     4]
 [  230     1]]


In [115]:
# Fit a CatBoost classifier on the training features and time both the fit
# and the prediction on the test features.
print('Training the classifier')
start_time = time.time()
# NOTE(review): early_stopping_rounds is set but fit() below is given no
# evaluation set; the recorded log runs to about iteration 989 of 1000, so
# early stopping presumably never triggers. Confirm against the CatBoost docs.
clf = CatBoostClassifier(iterations=1000,
                           early_stopping_rounds=10,
                           learning_rate=0.1)
clf = clf.fit(np.array(X_training), np.array(y_training) )
print(f"Time of fit: {time.time()-start_time}")
start_time = time.time()
y_prediction = clf.predict(X_test)
print(f"Time of prediction: {time.time()-start_time}")

Training the classifier
0:	learn: 0.4447000	total: 34.3ms	remaining: 34.3s
1:	learn: 0.3054386	total: 66.4ms	remaining: 33.2s
2:	learn: 0.2090293	total: 98.9ms	remaining: 32.9s
3:	learn: 0.1474756	total: 131ms	remaining: 32.6s
4:	learn: 0.1091172	total: 163ms	remaining: 32.4s
5:	learn: 0.0851463	total: 193ms	remaining: 32s
6:	learn: 0.0687809	total: 222ms	remaining: 31.4s
7:	learn: 0.0580740	total: 255ms	remaining: 31.6s
8:	learn: 0.0510507	total: 288ms	remaining: 31.7s
9:	learn: 0.0460135	total: 324ms	remaining: 32.1s
10:	learn: 0.0422194	total: 362ms	remaining: 32.5s
11:	learn: 0.0396025	total: 403ms	remaining: 33.2s
12:	learn: 0.0376839	total: 441ms	remaining: 33.5s
13:	learn: 0.0362216	total: 482ms	remaining: 34s
14:	learn: 0.0350444	total: 532ms	remaining: 35s
15:	learn: 0.0339971	total: 580ms	remaining: 35.7s
16:	learn: 0.0332392	total: 620ms	remaining: 35.8s
17:	learn: 0.0326727	total: 659ms	remaining: 35.9s
18:	learn: 0.0322825	total: 700ms	remaining: 36.1s
19:	learn: 0.0319338

163:	learn: 0.0260503	total: 5.78s	remaining: 29.5s
164:	learn: 0.0260241	total: 5.82s	remaining: 29.4s
165:	learn: 0.0259972	total: 5.85s	remaining: 29.4s
166:	learn: 0.0259801	total: 5.88s	remaining: 29.4s
167:	learn: 0.0259665	total: 5.91s	remaining: 29.3s
168:	learn: 0.0259538	total: 5.95s	remaining: 29.2s
169:	learn: 0.0259420	total: 5.98s	remaining: 29.2s
170:	learn: 0.0259232	total: 6.01s	remaining: 29.1s
171:	learn: 0.0259001	total: 6.05s	remaining: 29.1s
172:	learn: 0.0258943	total: 6.09s	remaining: 29.1s
173:	learn: 0.0258877	total: 6.12s	remaining: 29.1s
174:	learn: 0.0258661	total: 6.15s	remaining: 29s
175:	learn: 0.0258460	total: 6.18s	remaining: 29s
176:	learn: 0.0258261	total: 6.21s	remaining: 28.9s
177:	learn: 0.0258133	total: 6.25s	remaining: 28.8s
178:	learn: 0.0257983	total: 6.28s	remaining: 28.8s
179:	learn: 0.0257933	total: 6.31s	remaining: 28.8s
180:	learn: 0.0257785	total: 6.35s	remaining: 28.7s
181:	learn: 0.0257390	total: 6.38s	remaining: 28.7s
182:	learn: 0.02

324:	learn: 0.0233121	total: 11.3s	remaining: 23.5s
325:	learn: 0.0232980	total: 11.3s	remaining: 23.5s
326:	learn: 0.0232896	total: 11.4s	remaining: 23.5s
327:	learn: 0.0232722	total: 11.4s	remaining: 23.4s
328:	learn: 0.0232652	total: 11.5s	remaining: 23.4s
329:	learn: 0.0232459	total: 11.5s	remaining: 23.4s
330:	learn: 0.0232380	total: 11.6s	remaining: 23.4s
331:	learn: 0.0232280	total: 11.6s	remaining: 23.3s
332:	learn: 0.0232107	total: 11.6s	remaining: 23.3s
333:	learn: 0.0231929	total: 11.7s	remaining: 23.2s
334:	learn: 0.0231756	total: 11.7s	remaining: 23.2s
335:	learn: 0.0231662	total: 11.7s	remaining: 23.2s
336:	learn: 0.0231514	total: 11.8s	remaining: 23.2s
337:	learn: 0.0231375	total: 11.8s	remaining: 23.1s
338:	learn: 0.0231202	total: 11.8s	remaining: 23.1s
339:	learn: 0.0231154	total: 11.9s	remaining: 23s
340:	learn: 0.0231000	total: 11.9s	remaining: 23s
341:	learn: 0.0230876	total: 11.9s	remaining: 23s
342:	learn: 0.0230740	total: 12s	remaining: 22.9s
343:	learn: 0.023065

483:	learn: 0.0212458	total: 16.6s	remaining: 17.7s
484:	learn: 0.0212218	total: 16.6s	remaining: 17.6s
485:	learn: 0.0212077	total: 16.6s	remaining: 17.6s
486:	learn: 0.0211936	total: 16.7s	remaining: 17.6s
487:	learn: 0.0211881	total: 16.7s	remaining: 17.5s
488:	learn: 0.0211745	total: 16.7s	remaining: 17.5s
489:	learn: 0.0211619	total: 16.8s	remaining: 17.5s
490:	learn: 0.0211386	total: 16.8s	remaining: 17.4s
491:	learn: 0.0211159	total: 16.8s	remaining: 17.4s
492:	learn: 0.0210984	total: 16.9s	remaining: 17.4s
493:	learn: 0.0210880	total: 16.9s	remaining: 17.3s
494:	learn: 0.0210782	total: 16.9s	remaining: 17.3s
495:	learn: 0.0210662	total: 17s	remaining: 17.3s
496:	learn: 0.0210528	total: 17s	remaining: 17.2s
497:	learn: 0.0210384	total: 17.1s	remaining: 17.2s
498:	learn: 0.0210297	total: 17.1s	remaining: 17.2s
499:	learn: 0.0210264	total: 17.2s	remaining: 17.2s
500:	learn: 0.0210184	total: 17.2s	remaining: 17.1s
501:	learn: 0.0210111	total: 17.2s	remaining: 17.1s
502:	learn: 0.02

648:	learn: 0.0195935	total: 22.3s	remaining: 12.1s
649:	learn: 0.0195858	total: 22.4s	remaining: 12s
650:	learn: 0.0195804	total: 22.4s	remaining: 12s
651:	learn: 0.0195639	total: 22.4s	remaining: 12s
652:	learn: 0.0195444	total: 22.5s	remaining: 11.9s
653:	learn: 0.0195340	total: 22.5s	remaining: 11.9s
654:	learn: 0.0195240	total: 22.5s	remaining: 11.9s
655:	learn: 0.0195027	total: 22.6s	remaining: 11.8s
656:	learn: 0.0194918	total: 22.6s	remaining: 11.8s
657:	learn: 0.0194742	total: 22.6s	remaining: 11.8s
658:	learn: 0.0194711	total: 22.7s	remaining: 11.7s
659:	learn: 0.0194634	total: 22.7s	remaining: 11.7s
660:	learn: 0.0194511	total: 22.7s	remaining: 11.7s
661:	learn: 0.0194443	total: 22.8s	remaining: 11.6s
662:	learn: 0.0194396	total: 22.8s	remaining: 11.6s
663:	learn: 0.0194327	total: 22.8s	remaining: 11.6s
664:	learn: 0.0194179	total: 22.9s	remaining: 11.5s
665:	learn: 0.0194143	total: 22.9s	remaining: 11.5s
666:	learn: 0.0194075	total: 22.9s	remaining: 11.5s
667:	learn: 0.0193

809:	learn: 0.0180921	total: 27.9s	remaining: 6.55s
810:	learn: 0.0180756	total: 27.9s	remaining: 6.51s
811:	learn: 0.0180685	total: 28s	remaining: 6.48s
812:	learn: 0.0180565	total: 28s	remaining: 6.44s
813:	learn: 0.0180501	total: 28s	remaining: 6.41s
814:	learn: 0.0180413	total: 28.1s	remaining: 6.38s
815:	learn: 0.0180265	total: 28.1s	remaining: 6.34s
816:	learn: 0.0180244	total: 28.2s	remaining: 6.31s
817:	learn: 0.0180192	total: 28.2s	remaining: 6.27s
818:	learn: 0.0180103	total: 28.2s	remaining: 6.24s
819:	learn: 0.0180093	total: 28.3s	remaining: 6.2s
820:	learn: 0.0179912	total: 28.3s	remaining: 6.17s
821:	learn: 0.0179722	total: 28.3s	remaining: 6.13s
822:	learn: 0.0179674	total: 28.4s	remaining: 6.1s
823:	learn: 0.0179547	total: 28.4s	remaining: 6.07s
824:	learn: 0.0179449	total: 28.4s	remaining: 6.03s
825:	learn: 0.0179386	total: 28.5s	remaining: 6s
826:	learn: 0.0179323	total: 28.5s	remaining: 5.97s
827:	learn: 0.0179229	total: 28.6s	remaining: 5.93s
828:	learn: 0.0179175	t

970:	learn: 0.0167420	total: 33.6s	remaining: 1s
971:	learn: 0.0167282	total: 33.6s	remaining: 968ms
972:	learn: 0.0167204	total: 33.6s	remaining: 933ms
973:	learn: 0.0167084	total: 33.7s	remaining: 899ms
974:	learn: 0.0167000	total: 33.7s	remaining: 864ms
975:	learn: 0.0166891	total: 33.7s	remaining: 830ms
976:	learn: 0.0166862	total: 33.8s	remaining: 795ms
977:	learn: 0.0166821	total: 33.8s	remaining: 760ms
978:	learn: 0.0166738	total: 33.8s	remaining: 726ms
979:	learn: 0.0166698	total: 33.9s	remaining: 691ms
980:	learn: 0.0166675	total: 33.9s	remaining: 657ms
981:	learn: 0.0166642	total: 33.9s	remaining: 622ms
982:	learn: 0.0166608	total: 34s	remaining: 588ms
983:	learn: 0.0166519	total: 34s	remaining: 553ms
984:	learn: 0.0166317	total: 34s	remaining: 518ms
985:	learn: 0.0166179	total: 34.1s	remaining: 484ms
986:	learn: 0.0166126	total: 34.1s	remaining: 449ms
987:	learn: 0.0166072	total: 34.1s	remaining: 415ms
988:	learn: 0.0165856	total: 34.2s	remaining: 380ms
989:	learn: 0.0165670

In [116]:
#hdp_lda
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set')
print(conf_mat)

Confusion matrix on test set
[[32162    17]
 [  227     4]]


In [37]:
#hdp
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set')
print(conf_mat)

Confusion matrix on test set
[[32164    15]
 [  227     4]]


In [197]:
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set')
print(conf_mat)

Confusion matrix on test set
[[32157    22]
 [  230     1]]


In [134]:
print('Training the classifier')
start_time = time.time()
clf = neural_network.MLPClassifier(hidden_layer_sizes=(100, 100, 100, 100, 100),
                                   learning_rate_init=0.0001, verbose=True)
clf = clf.fit(X_training, y_training)
print(f"Time of fit: {time.time()-start_time}")
start_time = time.time()
y_prediction = clf.predict(X_test)
print(f"Time of prediction: {time.time()-start_time}")

Training the classifier
Iteration 1, loss = 0.09023999
Iteration 2, loss = 0.03258741
Iteration 3, loss = 0.03223750
Iteration 4, loss = 0.03206124
Iteration 5, loss = 0.03189568
Iteration 6, loss = 0.03176886
Iteration 7, loss = 0.03161101
Iteration 8, loss = 0.03146896
Iteration 9, loss = 0.03132973
Iteration 10, loss = 0.03123303
Iteration 11, loss = 0.03111883
Iteration 12, loss = 0.03097161
Iteration 13, loss = 0.03094137
Iteration 14, loss = 0.03085088
Iteration 15, loss = 0.03083888
Iteration 16, loss = 0.03080004
Iteration 17, loss = 0.03072775
Iteration 18, loss = 0.03072529
Iteration 19, loss = 0.03069744
Iteration 20, loss = 0.03058285
Iteration 21, loss = 0.03065555
Iteration 22, loss = 0.03058982
Iteration 23, loss = 0.03050968
Iteration 24, loss = 0.03044753
Iteration 25, loss = 0.03054196
Iteration 26, loss = 0.03042961
Iteration 27, loss = 0.03042776
Iteration 28, loss = 0.03041229
Iteration 29, loss = 0.03028999
Iteration 30, loss = 0.03033849
Iteration 31, loss = 0.03

In [135]:
#hdp_lda
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set')
print(conf_mat)

Confusion matrix on test set
[[32178     1]
 [  230     1]]


In [52]:
#hdp
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set')
print(conf_mat)

Confusion matrix on test set
[[32171     8]
 [  229     2]]


In [15]:
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set')
print(conf_mat)

Confusion matrix on test set
[[32177     2]
 [  231     0]]


In [60]:
np.array(y_test)

array([-1, -1,  1, ..., -1, -1, -1])

In [61]:
y_prediction

array([-1, -1, -1, ..., -1, -1, -1])

In [62]:
len(np.where(np.array(y_test) == y_prediction)[0]),len(y_test)

(32176, 32410)

In [63]:
len(np.where(np.array(y_test) == y_prediction)[0])

32176

In [64]:
len(y_test)

32410

### Ранжирование

Для каждого предложения reference paper имеются вероятности того, что оно является provenance, вычисленные для всех доступных citances; глобальная оценка каждого кандидата в provenance определяется суммированием всех его вероятностей. Предложения с наивысшей оценкой отбираются в summary reference paper до тех пор, пока длина summary не превысит 250 слов. Отобранные предложения упорядочиваются в соответствии с их исходным положением в reference paper.

In [178]:
clf.predict_proba(X_test)

array([[0.996574  , 0.00342602],
       [0.9715833 , 0.02841669],
       [0.88471794, 0.11528206],
       ...,
       [0.99846184, 0.00153816],
       [0.9985876 , 0.00141242],
       [0.99846184, 0.00153816]], dtype=float32)

In [179]:
create_summaries(test_path, clf.predict_proba(X_test))

Creating summary for paper W09-0621
Creating summary for paper D10-1058
Creating summary for paper C98-1097
Creating summary for paper W11-0815
Creating summary for paper D09-1023
Creating summary for paper P00-1025
Creating summary for paper N09-1001
Creating summary for paper W06-3909
Creating summary for paper P07-1040
Creating summary for paper N09-1025


In [198]:
create_summaries_top_k(test_path, clf.predict_proba(X_test))

Creating summary for paper W09-0621
Creating summary for paper D10-1058
Creating summary for paper C98-1097
Creating summary for paper W11-0815
Creating summary for paper D09-1023
Creating summary for paper P00-1025
Creating summary for paper N09-1001
Creating summary for paper W06-3909
Creating summary for paper P07-1040
Creating summary for paper N09-1025


### Test-Set 2018 annotation to Standard annotation

In [242]:
import pandas as pd

In [192]:
csv_anot = pd.read_csv('scisumm-corpus/Test-Set-2018/A00-2018/annotation/A00-2018.csv')

In [222]:
csv_anot['Reference Text'].notna()

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
Name: Reference Text, dtype: bool

In [193]:
csv_anot.head()

Unnamed: 0,Citance Number,Reference Article,Citing Article,Citation Marker Offset,Citation Marker,Citation Offset,Citation Text,Citation Text Clean,Reference Offset,Reference Text,Discourse Facet
0,2,A00-2018,N10-1002,0,"Charniak, 2000",0,"As a benchmark VPC extraction system, we use t...","As a benchmark VPC extraction system, we use t...",,,
1,3,A00-2018,W11-0610,0,"Charniak, 2000",0,Each of these scores can be calculated from a ...,Each of these scores can be calculated from a ...,,,
2,4,A00-2018,W06-3119,0,"Charniak, 2000",0,"We then use Charniak? s parser (Charniak, 2000...","We then use Charniak's parser (Charniak, 2000)...",,,
3,5,A00-2018,N03-2024,0,"Charniak, 2000",0,We were interested in the occurrence of featur...,We were interested in the occurrence of featur...,,,
4,6,A00-2018,N06-1039,0,"Charniak, 2000",0,"After getting a set of basic clusters, we pass...","After getting a set of basic clusters, we pass...",,,


In [211]:
# Print every column of the annotation table next to its positional index
# (the position accepted by csv_anot.iloc[:, position]).
# enumerate() covers all columns; the previous zip(..., range(12)) silently
# stopped after the 12th column.
for position, column_name in enumerate(csv_anot.columns):
    print(position, column_name)

0 Citance Number
1 Reference Article
2 Citing Article
3 Citation Marker Offset
4 Citation Marker
5 Citation Offset
6 Citation Text
7 Citation Text Clean
8 Reference Offset
9 Reference Text
10 Discourse Facet
11 string


In [206]:
csv_anot.iloc[:,1]

0     A00-2018
1     A00-2018
2     A00-2018
3     A00-2018
4     A00-2018
5     A00-2018
6     A00-2018
7     A00-2018
8     A00-2018
9     A00-2018
10    A00-2018
11    A00-2018
12    A00-2018
13    A00-2018
14    A00-2018
15    A00-2018
Name: Reference Article, dtype: object

In [216]:
# Serialise each annotation row into a single pipe-delimited line made of
# "Field: value | " segments and store it in a new 'string' column.
annotation_fields = (
    'Citance Number',
    'Reference Article',
    'Citing Article',
    'Citation Marker Offset',
    'Citation Marker',
    'Citation Offset',
    'Citation Text',
    'Citation Text Clean',
    'Reference Offset',
    'Reference Text',
    'Discourse Facet',
)
annotation_line = ''
for field in annotation_fields:
    # astype(str) turns missing values into the literal text 'nan'.
    annotation_line = annotation_line + field + ': ' + csv_anot[field].astype(str) + ' | '
csv_anot['string'] = annotation_line

In [220]:
csv_anot.string.values[0]

'Citance Number: 2 | Reference Article: A00-2018 | Citing Article: N10-1002 | Citation Marker Offset: 0 | Citation Marker: Charniak, 2000 | Citation Offset: 0 | Citation Text: As a benchmark VPC extraction system, we use the Charniak parser (Charniak, 2000) | Citation Text Clean: As a benchmark VPC extraction system, we use the Charniak parser (Charniak, 2000) | Reference Offset: nan | Reference Text: nan | Discourse Facet: nan | '

### Evaluation

https://github.com/pltrdy/rouge

In [180]:
from rouge import Rouge 

In [181]:
rouge = Rouge()

In [182]:
# Load every generated system summary into my_summary, keyed by the part of
# the file name that precedes its first '.'.
my_summary = {}
summary_dir_path = 'summary/'
for file_name in listdir(summary_dir_path):
    with open(join(summary_dir_path, file_name)) as summary_file:
        summary_text = summary_file.read()
    print(file_name)
    my_summary[file_name.partition('.')[0]] = summary_text

W11-0815.system.txt
D09-1023.system.txt
P00-1025.system.txt
N09-1025.system.txt
W06-3909.system.txt
C98-1097.system.txt
N09-1001.system.txt
D10-1058.system.txt
W09-0621.system.txt
P07-1040.system.txt


In [183]:
my_summary['P07-1040']

'Recently, confusion network decoding has been applied in machine translation system combination. Recently, confusion network decoding for MT system combination has been proposed (Bangalore et al., 2001). In (Bangalore et al., 2001), Levenshtein alignment was used to generate the network. A modified Levenshtein alignment allowing shifts as in computation of the translation edit rate (TER) (Snover et al., 2006) was used to align hy Proceedings of the 45th Annual Meeting of the Association of Computational Linguistics, pages 312–319, Prague, Czech Republic, June 2007. Qc 2007 Association for Computational Linguistics potheses in (Sim et al., 2007). This work was extended in (Rosti et al., 2007) by introducing system weights for word confidences. In this work, confusion networks are generated by using the -best output from each system as the skeleton, and prior probabilities for each network are estimated from the average TER scores between the skeleton and other hypotheses. In (Rosti et 

In [184]:
# Load the gold summaries into one dict per summary type, each keyed by the
# part of the file name that precedes its first '.'.
gold_dir_path = 'Gold/'
summary_abstract = dict()
summary_combined = dict()
summary_community = dict()
summary_human = dict()

# File-name suffix -> dict collecting that type of gold summary.
gold_summaries_by_suffix = {
    "abstract.summary.txt": summary_abstract,
    "community.summary.txt": summary_community,
    "combined.summary.txt": summary_combined,
    "human.summary.txt": summary_human,
}

for directory in listdir(gold_dir_path):
    target = next(
        (summaries for suffix, summaries in gold_summaries_by_suffix.items()
         if directory.endswith(suffix)),
        None,
    )
    if target is None:
        # listdir() returns entries in arbitrary order, so stopping at the
        # first unrecognised file would drop an unpredictable subset of the
        # gold summaries. Skip just this file and say which one it was.
        print('WARNING! Skipping unrecognised file: ' + directory)
        continue
    # Only recognised summary files are opened.
    with open(join(gold_dir_path, directory)) as f:
        content = f.read()
    key = directory.split('.')[0]
    target[key] = content

In [185]:
summary_abstract['C98-1097']

'Text Segmentation Using Reiteration and Collocation A method is presented for segmenting text into subtopic areas. The proportion of related pairwise words is calculated between adjacent windows of text to determine their lexical similarity. The lexical cohesion relations of reiteration and collocation are used to identify related words. These relations are automatically located using a combination of three linguistic features: word repetition, collocation and relation weights. This method is shown to successfully detect known subject changes in text and corresponds well to the segmentations placed by test subjects. '

In [186]:
summary_human['C98-1097']

"The paper 'Text Segmentation Using Reiteration and Collocation'  by Amanda C. Jobbins and Lindsay J. Evett presentes a method for segmenting a text into subtopics using Lexical cohesion.Lexical cohesion is expressed through the vocabulary used in text and the semantic relations between those words.For automatic detection of lexical cohesion  ties between pairwise words, three linguistic features were considered: word repetition, collocation and relation weights.The proposed segmentation algorithm compares adjacent windows of sentences and determines their lexical similarity. In first investigation word repetition alone achieved better results than using either collocation or relation weights individually. The combination of word repetition with another linguistic feature improved on its individual result, where less troughs were placed per text.In second investigation, recall rates tended to be lower than precision rates because the algorithm identified fewer segments than the test su

In [187]:
# Score each system summary against the abstract, community and human gold
# summaries; results are stored per key of summary_human.
scores_abstract, scores_community, scores_human = {}, {}, {}
for key in summary_human:
    hypothesis = my_summary[key]
    # get_scores returns a list; its first element is kept as the score dict.
    scores_abstract[key] = rouge.get_scores(hypothesis, summary_abstract[key])[0]
    scores_community[key] = rouge.get_scores(hypothesis, summary_community[key])[0]
    scores_human[key] = rouge.get_scores(hypothesis, summary_human[key])[0]

In [188]:
scores_community['W11-0815']

{'rouge-1': {'f': 0.6488294264990325,
  'p': 0.7293233082706767,
  'r': 0.5843373493975904},
 'rouge-2': {'f': 0.4957264907882242,
  'p': 0.5576923076923077,
  'r': 0.4461538461538462},
 'rouge-l': {'f': 0.6205163237464864,
  'p': 0.7142857142857143,
  'r': 0.572289156626506}}

In [189]:
def average_rouge(scores, rouge='rouge-2', metric='f'):
    """Collect one ROUGE value from every entry of ``scores``.

    Despite its name this function does not average anything: it returns the
    individual values, leaving aggregation to the caller.

    Args:
        scores: mapping whose values are nested dicts indexed first by ROUGE
            variant and then by metric.
        rouge: ROUGE variant to select from each entry, e.g. ``'rouge-2'``.
        metric: metric to select within that variant, e.g. ``'f'``.

    Returns:
        List of the selected values, in the iteration order of ``scores``.
    """
    return [entry[rouge][metric] for entry in scores.values()]

In [190]:
# ROUGE-2 F-score lists for each gold-summary type
# (abstract, community, human).
abstract_r2, community_r2, human_r2 = (
    average_rouge(score_table, rouge='rouge-2', metric='f')
    for score_table in (scores_abstract, scores_community, scores_human)
)

SVM with rbf

In [62]:
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.07787471817756389, 0.14078694078267812, 0.10140411466064743)

CatBoost

In [133]:
#with_hdp_lda
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.24975778386411546, 0.32898824520173875, 0.20797752277229575)

In [50]:
#with_hdp
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.24286631832422448, 0.32015983370924594, 0.19824910801274004)

In [270]:
#with_lda
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.24591310678310588, 0.30130490782111224, 0.18286560271292993)

In [209]:
# jaccard
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.19420697343739707, 0.27979162434216764, 0.1526580795590468)

XGBOOST 100 estimators

In [191]:
#with_hdp_lda
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.19662945624424172, 0.32426136179171317, 0.15410472639000478)

In [35]:
#with_hdp
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.20291673197138813, 0.3060764146373976, 0.17352097059644137)

In [287]:
#with_lda
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.1947858153472447, 0.3096493178154836, 0.16517953946560632)

In [192]:
# jaccard
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.2179759324593234, 0.30681668646050453, 0.17877269627840794)

In [75]:
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.22951831953767093, 0.3110868759978951, 0.17520002466856807)

MLPClassifier

In [149]:
#with_hdp_lda
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.30600373616405746, 0.31834343132677434, 0.22430687418319506)

In [66]:
#with_hdp
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.29837009283880433, 0.33578335345737853, 0.18129473949889643)

In [303]:
#with_lda
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.27280657818100984, 0.31921975917920653, 0.17620671453840597)

In [223]:
# jaccard
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.21971719889722582, 0.2822074493460799, 0.13555569664438108)

In [67]:
np.mean(abstract_r2),np.mean(community_r2),np.mean(human_r2)

(0.26870249097141874, 0.3203938411164929, 0.17075949648590674)

Все норм