In [70]:
import json
from tqdm import tqdm_notebook
import gzip
import pandas as pd
import sys
import json
from typing import Dict, List
import os
import numpy as np
import nltk

In [2]:
import json
from typing import Dict, List


def get_citation_contexts(paper: Dict, toks_in_context=10) -> List[Dict]:
    """
    Retrieve citation contexts from GORC paper
    :param paper:
    :param toks_in_context:
    :return:
    """
    if not paper:
        return []

    if not paper['grobid_parse']:
        return []

    if not paper['grobid_parse']['body_text']:
        return []

    contexts = []

    for paragraph in paper['grobid_parse']['body_text']:
        for cite_span in paragraph['cite_spans']:
            # get cited paper id, skip if none
            cite_ref = cite_span['ref_id']
            cited_paper_id = None
            if cite_ref in paper['grobid_parse']['bib_entries']:
                cited_paper_id = paper['grobid_parse']['bib_entries'][cite_ref]['links']
            if not cited_paper_id:
                continue

            # get pre and post tokens
            pre_span_tokens = paragraph['text'][:cite_span['start']].split(' ')[-toks_in_context:]
            post_span_tokens = paragraph['text'][cite_span['end']:].split(' ')[:toks_in_context]
            pre_string = ' '.join(pre_span_tokens)
            post_string = ' '.join(post_span_tokens)
            full_context = pre_string + cite_span['text'] + post_string

            contexts.append({
                "paper_id": paper['paper_id'],
                "context_string": full_context,
                "cite_start": len(pre_string),
                "cite_end": len(pre_string) + len(cite_span['text']),
                "cite_str": cite_span['text'],
                "cited_paper_id": cited_paper_id
            })

    return contexts


In [3]:
EXAMPLE_DATA_FILE = '../data/example_papers.jsonl'

In [4]:
all_contexts = []
all_papers = []
context_dict = dict()
with open(EXAMPLE_DATA_FILE, 'r') as f:
    for line in f:
        gorc_obj = json.loads(line)
        all_papers.append(gorc_obj)
        all_contexts += get_citation_contexts(gorc_obj)

In [5]:
from os import walk

f_zips = []
for (dirpath, dirnames, filenames) in walk('../../../gorc/'):
    f_zips.extend(filenames)
    break

In [6]:
len(f_zips),f_zips

(10002,
 ['0.jsonl.gz',
  '1.jsonl.gz',
  '10.jsonl.gz',
  '100.jsonl.gz',
  '1000.jsonl.gz',
  '1001.jsonl.gz',
  '1002.jsonl.gz',
  '1003.jsonl.gz',
  '1004.jsonl.gz',
  '1005.jsonl.gz',
  '1006.jsonl.gz',
  '1007.jsonl.gz',
  '1008.jsonl.gz',
  '1009.jsonl.gz',
  '101.jsonl.gz',
  '1010.jsonl.gz',
  '1011.jsonl.gz',
  '1012.jsonl.gz',
  '1013.jsonl.gz',
  '1014.jsonl.gz',
  '1015.jsonl.gz',
  '1016.jsonl.gz',
  '1017.jsonl.gz',
  '1018.jsonl.gz',
  '1019.jsonl.gz',
  '102.jsonl.gz',
  '1020.jsonl.gz',
  '1021.jsonl.gz',
  '1022.jsonl.gz',
  '1023.jsonl.gz',
  '1024.jsonl.gz',
  '1025.jsonl.gz',
  '1026.jsonl.gz',
  '1027.jsonl.gz',
  '1028.jsonl.gz',
  '1029.jsonl.gz',
  '103.jsonl.gz',
  '1030.jsonl.gz',
  '1031.jsonl.gz',
  '1032.jsonl.gz',
  '1033.jsonl.gz',
  '1034.jsonl.gz',
  '1035.jsonl.gz',
  '1036.jsonl.gz',
  '1037.jsonl.gz',
  '1038.jsonl.gz',
  '1039.jsonl.gz',
  '104.jsonl.gz',
  '1040.jsonl.gz',
  '1041.jsonl.gz',
  '1042.jsonl.gz',
  '1043.jsonl.gz',
  '1044.jsonl.gz'

In [7]:
[file for file in f_zips if '.gz' not in file]

['s2orc-master.zip']

In [8]:
all_papers[0].keys()

dict_keys(['paper_id', 'metadata', 's2_pdf_hash', 'grobid_parse', 'latex_parse'])

In [9]:
all_papers[0]

{'paper_id': '104172',
 'metadata': {'title': 'Nonlinear inversion of tilt-affected very long period records of explosive eruptions at Fuego volcano: INVERSION OF TILT-AFFECTED VLP EVENTS',
  'authors': [{'first': 'Gregory',
    'middle': ['P.'],
    'last': 'Waite',
    'suffix': ''},
   {'first': 'Federica', 'middle': [], 'last': 'Lanza', 'suffix': ''}],
  'abstract': None,
  'year': '2016',
  'arxiv_id': None,
  'acl_id': None,
  'pmc_id': None,
  'pubmed_id': None,
  'doi': '10.1002/2016jb013287',
  'venue': 'Journal of Geophysical Research: Solid Earth',
  'journal': 'Journal of Geophysical Research'},
 's2_pdf_hash': '73ed8076fc747e77c41845cb5f18b40ece350865',
 'grobid_parse': {'abstract': [],
  'body_text': [{'text': 'solution to this is to evaluate long wavelength, very-long-period (VLP) data that are relativelyFuego is a 3800 m stratovolcano that regularly produces Strombolian and weak 76Vulcanian explosions. The dynamics of these explosive events have been examined in the VLP

In [10]:
with open("../acl_only_json/acl_only_json_list_10000.json", "r") as read_file:
    all_articles = json.load(read_file)
print(len(all_articles))
read_file.close()

41660


## Анализ подборки

### проверка наличия названия секции

In [11]:
acl_paper_ids = [article['paper_id'] for article in all_articles]

##### Удалим статьи,у которых нет body_text

In [12]:
acl_ids_not_body_text = [article['paper_id'] for article in all_articles if not article['grobid_parse']['body_text'] or not article['grobid_parse']['bib_entries']]
len(acl_ids_not_body_text)

1897

In [13]:
def delete_items_from_papers(acl_ids_not_body_text,acl_only_articles):
    del_items = []
    del_num_items = []
    for num_artic, article in enumerate(acl_only_articles):
        if article['paper_id'] in acl_ids_not_body_text:
            del_items.append(article['paper_id'])
            del_num_items.append(num_artic)
            
    del_num_items = np.array(del_num_items)
    acl_only_articles = np.array(acl_only_articles)
    acl_only_articles = np.delete(acl_only_articles,del_num_items)
    return acl_only_articles

In [14]:
all_articles = delete_items_from_papers(acl_ids_not_body_text,all_articles)
len(all_articles)

39763

##### удалим дублированные статьи по acl_id

In [15]:
acl_paper_ids = [article['metadata']['acl_id'] for article in all_articles if article['metadata']['acl_id']]
len(acl_paper_ids)

39763

In [16]:
doubled_acl_id_papers = []
for ind,cnt_of_acl_id in zip(pd.Series(acl_paper_ids).value_counts().index,pd.Series(acl_paper_ids).value_counts()):
    if cnt_of_acl_id >=2:
        doubled_acl_id_papers.append(ind)
len(doubled_acl_id_papers)

284

In [17]:
del_items = []
del_num_items = []
for num_artic, article in enumerate(all_articles):
    if article['metadata']['acl_id'] in doubled_acl_id_papers:
        del_items.append(article['paper_id'])
        del_num_items.append(num_artic)

In [18]:
len(del_items)

572

In [19]:
all_articles = np.delete(all_articles,del_num_items)

In [20]:
del_items = []
del_num_items = []
for num_artic, article in enumerate(all_articles):
    if article['metadata']['acl_id'] in doubled_acl_id_papers:
        del_items.append(article['paper_id'])
        del_num_items.append(num_artic)
len(del_items)

0

In [21]:
acl_paper_ids = [article['paper_id'] for article in all_articles if article['metadata']['acl_id']]
len(acl_paper_ids)

39191

#### проверка наличия текста и названия секций во всех статьях в grobid части

In [22]:
len([article['paper_id'] for article in all_articles if article['grobid_parse']['body_text']])

39191

In [23]:
acl_ids_not_bofy_text = [article['paper_id'] for article in all_articles if not article['grobid_parse']['body_text']]

In [24]:
article_with_sect = dict()
for article in all_articles:
    for sections in article['grobid_parse']['body_text']:
        if sections['section']:
            if article['paper_id'] in article_with_sect:
                article_with_sect[article['paper_id']] +=1
            else:
                article_with_sect[article['paper_id']] = 1
article_with_sect

{}

###### как видим нет названия секций у grobid_parse

In [25]:
# article_with_sect = dict()
for num,article in enumerate(all_articles):
    if num == 2:
        break
    for cnt_sect,sections in enumerate(article['grobid_parse']['body_text']):
        if sections['cite_spans']:
            print(cnt_sect,len(sections['cite_spans']))
            for cnt_cite,cite in enumerate(sections['cite_spans']):
                print(cnt_cite,cite)
            print(sections['text'])
            print('----')
    print(10*'==')
#             if article['paper_id'] in article_with_sect:
#                 article_with_sect[article['paper_id']] +=1
#             else:
#                 article_with_sect[article['paper_id']] = 1

0 13
0 {'start': 192, 'end': 216, 'text': '(Goldstein et al., 2000;', 'latex': None, 'ref_id': 'BIBREF6'}
1 {'start': 217, 'end': 239, 'text': 'Erkan and Radev, 2004;', 'latex': None, 'ref_id': 'BIBREF4'}
2 {'start': 240, 'end': 257, 'text': 'Wan et al., 2007;', 'latex': None, 'ref_id': 'BIBREF19'}
3 {'start': 258, 'end': 284, 'text': 'Nenkova and McKeown, 2012;', 'latex': None, 'ref_id': 'BIBREF16'}
4 {'start': 285, 'end': 302, 'text': 'Min et al., 2012;', 'latex': None, 'ref_id': 'BIBREF15'}
5 {'start': 303, 'end': 319, 'text': 'Li et al., 2017)', 'latex': None, 'ref_id': 'BIBREF11'}
6 {'start': 773, 'end': 797, 'text': '(Project Code: 14203414)', 'latex': None, 'ref_id': None}
7 {'start': 2288, 'end': 2305, 'text': '(Hu et al., 2008;', 'latex': None, 'ref_id': 'BIBREF7'}
8 {'start': 2306, 'end': 2324, 'text': 'Yang et al., 2011)', 'latex': None, 'ref_id': 'BIBREF22'}
9 {'start': 2582, 'end': 2598, 'text': 'Li et al. (2015)', 'latex': None, 'ref_id': 'BIBREF2'}
10 {'start': 2911, 'en

Variational Autoencoders (VAEs) (Kingma and Welling, 2014; Rezende et al., 2014 ) is a generative model based on neural networks which can be used to conduct latent semantic modeling. Li et al. (2017) employ VAEs to map the news sentences into a latent semantic space, which is helpful in improving the MDS performance. Similarly, we also employ VAEs to conduct the semantic modeling for news sentences and comment sentences. Assume that both the prior and posterior of the latent variables are Gaussian, i.e., p θ (z) = N (0, I) and q φ (z|x) = N (z; µ, σ 2 I), where µ and σ denote the variational mean and standard deviation respectively, which can be calculated with a multilayer perceptron (MLP). VAEs can be divided into two phases, namely, encoding (inference), and decoding (generation). All the operations are de- picted as follows:h enc = relu(W xh x + b xh ) µ = W hµ h enc + b hµ log(σ 2 ) = W hσ h enc + b hσ ε ∼ N (0, I), z = µ + σ ⊗ ε h dec = relu(W zh z + b zh ) x = sigmoid(W hx h de

To evaluate the performance of our dataset and the proposed framework RAVAESum for RA-MDS, we compare our model with the following methods:• RA-Sparse : It is a framework to tackle the RA-MDS problem. A sparse-coding-based method is used to calculate the salience of the news sentences by jointly considering news documents and reader comments.• Lead (Wasson, 1998) : It ranks the news sentences chronologically and extracts the leading sentences one by one until the length limit.• Centroid (Radev et al., 2000) : It summarizes clusters of news articles automatically grouped by a topic detection system, and then it uses information from the centroids of the clusters to select sentences.• LexRank (Erkan and Radev, 2004) and TextRank (Mihalcea and Tarau, 2004) : Both methods are graph-based unsupervised framework for sentence salience estimation based on PageRank algorithm.• Concept : It generates abstractive summaries using phrase-based optimization framework with concept weight as salience 

(1) by builder (∼30 minutes)(2) via domain-general grammar (3) via crowdsourcing (∼5 hours) (4) by training a paraphrasing model Figure 1 : Functionality-driven process for building semantic parsers. The two red boxes are the domain-specific parts provided by the builder of the semantic parser, and the other two are generated by the framework.parser in a new domain. At a high-level, we seek to minimize the amount of work needed for a new domain by factoring out the domaingeneral aspects (done by our framework) from the domain-specific ones (done by the builder of the semantic parser). We assume that the builder already has the desired functionality of the semantic parser in mind-e.g., the publications database is set up and the schema is fixed. Figure 1 depicts the functionality-driven process: First, the builder writes a seed lexicon specifying a canonical phrase ("publication date") for each predicate (publicationDate).Second, our framework uses a domain-general grammar, along with t

Our logical forms are represented in lambda DCS, a logical language where composition operates on sets rather than truth values. Here we give a brief description; see Liang (2013) for details.Every logical form z in this paper is either a unary (denoting a set of entities) or a binary (denoting a set of entity-pairs). In the base case, each entity e (e.g., 2015) is a unary denoting the singleton set: e w = {e}; and each property p (e.g., publicationDate) is a binary denoting all entitypairs (e 1 , e 2 ) that satisfy the property p. Unaries and binaries can be composed: Given a binary b and unary u, the join b.u denotes all entities e 1 for which there exists an e 2 ∈ u w with (e 1 , e 2 ) ∈ b w . For example, publicationDate.2015 denote entities published in 2015.The intersection u 1 u 2 , union u 1 u 2 , complement ¬u denote the corresponding set operations on the denotations. We let R(b) denote the reversal of b: (e 1 , e 2 ) ∈ b w iff (e 2 , e 1 ) ∈ R(b) w . This allows us to define

Geo880. To test how our parser generalizes to utterances independent of our framework, we created a semantic parser for the domain of US geography, and tested on the standard 280 test examples from GEO880 (Zelle and Mooney, 1996) . We did not use the standard 600 training examples. Our parser obtained 56.4% accuracy, which is substantially lower than state-of-the-art (∼ 90%).We performed error analysis on 100 random sentences from the development set where accuracy was 60%. We found that the parser learns from the training data to prefer shorter paraphrases, which accounts for 30% of the errors. In most of these cases, the correct logical form is ranked at the top-3 results (accuracy for the top-3 derivations is 73%). GEO880 contains highly compositional utterances, and in 25% of the errors the correct derivation tree exceeds the maximum depth used for our parser. Another 17.5% of the errors are caused by problems in the paraphrasing model. For example, in the utterance "what is the si

all_results[0] - Reader-Aware Multi-Document Summarization: An Enhanced Model and The First Dataset∗



Result 

- 0 {'start': 192, 'end': 216, 'text': '(Goldstein et al., 2000;', 'latex': None, 'ref_id': 'BIBREF6'}
- 1 {'start': 217, 'end': 239, 'text': 'Erkan and Radev, 2004;', 'latex': None, 'ref_id': 'BIBREF4'}
- 2 {'start': 240, 'end': 257, 'text': 'Wan et al., 2007;', 'latex': None, 'ref_id': 'BIBREF19'}
- 3 {'start': 258, 'end': 284, 'text': 'Nenkova and McKeown, 2012;', 'latex': None, 'ref_id': 'BIBREF16'}
- 4 {'start': 285, 'end': 302, 'text': 'Min et al., 2012;', 'latex': None, 'ref_id': 'BIBREF15'}
- 5 {'start': 303, 'end': 319, 'text': 'Li et al., 2017)', 'latex': None, 'ref_id': 'BIBREF11'}

True 

- 0 (Goldstein et al., 2000; 
- 1 Erkan and Radev,2004; 
- 2 Wan et al., 2007; 
- 3 Nenkova and McKeown, 2012; 
- 4 Min et al., 2012; 
- 5 Bing et al., 2015; 
- 6 Li et al.,2017)


Result

- {'start': 971, 'end': 987, 'text': 'Li et al. (2015)', 'latex': None, 'ref_id': 'BIBREF2'}

True

- Woodsend and Lapata (2012), Bing et al. (2015), and Li et al. (2015).

**При большом перечислении подряд ссылок, GROBID не выделяет предпоследнюю ссылку**

**Также он не срабатывает на части ссылок**

#### проверка наличия текста и названия секций во всех статьях в latex части

In [26]:
len([article['paper_id'] for article in all_articles if article['latex_parse'] and article['latex_parse']['body_text']])

3868

In [27]:
acl_ids_not_body_text_tex = [article['paper_id'] for article in all_articles if not ( article['latex_parse'] and article['latex_parse']['body_text'])]

In [28]:
len(acl_ids_not_body_text_tex)

35323

In [29]:
for num,paper_id in enumerate(acl_ids_not_body_text_tex):
    if num == 2:
        break
    id_lst = acl_paper_ids.index(paper_id)
    print(id_lst,all_articles[id_lst]['latex_parse'])
    print(10*'==')

1 None
2 None


In [30]:
{k: all_articles[0]['latex_parse']['bib_entries'][k] for k in sorted(all_articles[0]['latex_parse']['bib_entries'])}

{'BIBREF0': {'ref_id': 'BIBREF0',
  'title': 'Multi-document summarization by sentence extraction',
  'authors': [{'first': 'Jade',
    'middle': [],
    'last': 'Goldstein',
    'suffix': ''},
   {'first': 'Vibhu', 'middle': [], 'last': 'Mittal', 'suffix': ''},
   {'first': 'Jaime', 'middle': [], 'last': 'Carbonell', 'suffix': ''},
   {'first': 'Mark', 'middle': [], 'last': 'Kantrowitz', 'suffix': ''}],
  'year': 2000,
  'venue': 'NAACL-ANLPWorkshop',
  'volume': '',
  'issn': '',
  'pages': '40--48',
  'other_ids': {},
  'links': '8294822'},
 'BIBREF1': {'ref_id': 'BIBREF1',
  'title': 'Lexpagerank: Prestige in multi-document text summarization',
  'authors': [{'first': 'Günes', 'middle': [], 'last': 'Erkan', 'suffix': ''},
   {'first': '', 'middle': [], 'last': 'Dragomir R Radev', 'suffix': ''}],
  'year': 2004,
  'venue': 'EMNLP',
  'volume': '4',
  'issn': '',
  'pages': '365--371',
  'other_ids': {},
  'links': '10418456'},
 'BIBREF10': {'ref_id': 'BIBREF10',
  'title': 'Auto-enc

In [31]:
all_articles[0]['latex_parse']

{'abstract': [],
 'body_text': [{'text': 'The goal of multi-document summarization (MDS) is to automatically generate a brief, well-organized summary for a topic which describes an event with a set of documents from different sources. BIBREF0 , BIBREF1 , BIBREF2 , BIBREF3 , BIBREF4 , BIBREF5 , BIBREF6 . In the typical setting of MDS, the input is a set of news documents about the same topic. The output summary is a piece of short text document containing several sentences, generated only based on the input original documents.',
   'cite_spans': [{'start': 193,
     'end': 200,
     'text': None,
     'latex': None,
     'ref_id': 'BIBREF0'},
    {'start': 203,
     'end': 210,
     'text': None,
     'latex': None,
     'ref_id': 'BIBREF1'},
    {'start': 213,
     'end': 220,
     'text': None,
     'latex': None,
     'ref_id': 'BIBREF2'},
    {'start': 223,
     'end': 230,
     'text': None,
     'latex': None,
     'ref_id': 'BIBREF3'},
    {'start': 233,
     'end': 240,
     'te

In [32]:
all_articles[0]['latex_parse']['body_text'][0].keys()

dict_keys(['text', 'cite_spans', 'ref_spans', 'eq_spans', 'section'])

In [33]:
article_with_sect_latex = dict()
for article in all_articles:
    if article['latex_parse'] and article['latex_parse']['body_text']: 
        for sections in article['latex_parse']['body_text']:
            if sections['section']:
                if article['paper_id'] in article_with_sect_latex:
                    article_with_sect_latex[article['paper_id']] +=1
                else:
                    article_with_sect_latex[article['paper_id']] = 1

##### Количество статей, в которых есть названия секций

In [34]:
len(article_with_sect_latex)

3502

#### Сравнение количества выделенных ссылок в grobid_parse & latex_parse

In [35]:
latex_more_grobid_bib_entr = []
grobid_more_latex_bib_entr = []
equal = []
for num_art ,article in enumerate(all_articles):
    if article['latex_parse'] and article['latex_parse']['body_text']:
        if len(article['latex_parse']['bib_entries']) > len(article['grobid_parse']['bib_entries']):
            print(num_art,article['paper_id'])
            latex_more_grobid_bib_entr.append(article['paper_id'])
        elif len(article['latex_parse']['bib_entries']) < len(article['grobid_parse']['bib_entries']):
#             print(num_art,article['paper_id'])
            grobid_more_latex_bib_entr.append(article['paper_id'])
        else:
            equal.append(article['paper_id'])

76 16050464
96 173990592
273 29245285
304 100300
346 2558
504 86813509
552 1703535
601 14170854
651 49358911
682 371926
696 682772
700 2840197
735 2411
773 16273304
827 17511008
886 3101294
892 5740960
920 2145766
1062 5079594
1135 298504
1141 870921
1183 53217693
1302 1438450
1323 309476
1324 44278
1335 7669927
1481 711424
1513 1423962
1701 370914
1743 15600925
1758 184488087
1811 10086161
1866 11492268
2028 534431
2062 15986631
2109 189998202
2125 21665312
2166 9371149
2168 311594
2170 6256345
2271 14974
2287 3933075
2339 1571038
2345 2862211
2351 6210126
2553 15881253
2630 5201435
2666 5054582
2754 10324034
2826 7021843
3006 3152424
3077 139106285
3094 14922772
3095 2652169
3120 3204831
3180 12245103
3218 85543217
3230 17297069
3279 621025
3326 85556928
3355 13661068
3356 1765384
3371 3025759
3543 1871596
3588 27914547
3611 53082704
3622 118680003
3627 11451871
3772 3132651
3797 20995314
3800 85529973
3801 14401063
3808 2213896
3834 3204935
3861 537
3971 1031444
3989 5112203
3996 46

30643 1746246
30673 3191956
30681 21015570
30780 9174081
30807 173990523
30851 52155263
30875 10250472
30900 84842989
30904 3265541
30961 195750811
31029 29151507
31155 3265262
31296 6008960
31300 174802477
31401 13888952
31406 3380653
31431 85518027
31488 195776133
31494 14980132
31503 59986
31640 15620570
31748 12087925
31757 395839
31776 7116029
31866 718342
31872 52169534
31943 49881509
31945 932197
31975 10009142
32015 47019063
32156 3205220
32219 3266611
32251 102350939
32277 174798275
32316 1119356
32320 14519034
32400 53082542
32462 53217060
32550 155093144
32741 48354032
32783 195345047
32815 16960682
32873 744471
32920 1900253
33043 102352298
33094 52111211
33255 9573708
33291 198985976
33338 1373479
33388 14586568
33445 84841767
33464 167217880
33493 189762439
33602 60368
33707 24609417
33718 216107
33753 5151070
33811 53590103
33850 14425690
33877 51877560
33896 10619801
33917 8233374
34056 608
34139 2522459
34185 53092624
34200 3426453
34250 131773668
34331 14841563
34340 

In [36]:
len(latex_more_grobid_bib_entr),len(grobid_more_latex_bib_entr),len(equal)

(739, 2480, 649)

In [37]:
739+2480+649

3868

In [38]:
for num_art ,article in enumerate(all_articles):
    if num_art > 1000:
        break
    if article['latex_parse'] and article['latex_parse']['body_text']:
        latex_links = [v['links'] for k,v in article['latex_parse']['bib_entries'].items() if v['links']]
        if article['grobid_parse']:
            grobid_links =  [v['links'] for k,v in article['grobid_parse']['bib_entries'].items() if v['links']]
            if len(latex_links)> 0 and len(grobid_links) == 0:
                print('WOW!')
            print(f'latex_links = {len(latex_links)}| grobid_links = {len(grobid_links)}| together = {len(set(grobid_links+latex_links))}')
            print(10*'==')

        else:
            print(f'latex_links = {len(latex_links)}| grobid_links = 0| together = {len(set(latex_links))}')
            print(10*'==')
    

latex_links = 18| grobid_links = 21| together = 22
latex_links = 7| grobid_links = 7| together = 9
latex_links = 0| grobid_links = 12| together = 12
latex_links = 23| grobid_links = 32| together = 33
latex_links = 21| grobid_links = 21| together = 21
latex_links = 0| grobid_links = 38| together = 38
latex_links = 50| grobid_links = 46| together = 51
latex_links = 9| grobid_links = 38| together = 37
latex_links = 27| grobid_links = 13| together = 30
latex_links = 31| grobid_links = 27| together = 32
latex_links = 25| grobid_links = 28| together = 31
latex_links = 12| grobid_links = 40| together = 41
latex_links = 24| grobid_links = 30| together = 32
latex_links = 4| grobid_links = 16| together = 16
latex_links = 10| grobid_links = 13| together = 13
latex_links = 31| grobid_links = 31| together = 32
latex_links = 0| grobid_links = 22| together = 21
latex_links = 43| grobid_links = 41| together = 45
latex_links = 28| grobid_links = 26| together = 28
latex_links = 0| grobid_links = 8| toge

latex_links = 32| grobid_links = 40| together = 42


Проверка ситуации, когда есть **latex_parse**, но  нет **grobid_parse**

In [39]:
for num_art ,article in enumerate(all_articles):
    if article['latex_parse'] and article['latex_parse']['body_text']:
        if len(article['grobid_parse']['body_text'])==0:
            print(num_art,articcle['paper_id'])

Не бывает

## Выделение обзорной части статьи

1. Самый простой принцип построения - ***по максимальному количеству ссылок в абзаце***:
<br> **(СМОТРИ В collect ACL papers-evaluate covering overview titiles_16_05 & collect ACL papers-evaluate covering overview titiles_8_05)**
 - **Решение**:
    - подсчитать количество ссылок в каждой секции
    - выбрать секцию с максимальным количеством ссылок (возможно ещё оставить ещё 1 секцию, в которой количество ссылок было больше половины чем в максимальной)
    - для latex статей надо объединить текст одинаковых секций в 1 абзац
 - **Критерий**:
    -  в части latex публикаций есть названия секций => после выделения обзорных часте можно посмотреть какие секции выделились: какие топ-3, сделать просмотр глазами и после этого решать что делать дальше.
    - возможно логично сохранять для 2 максимального текста название статей

2. Сделать ***ML - модель на признаках***:
 - **Сбор выборки**: (считаем признаки для каждой секции статьи - наша выборка)
     - подсчитать $\text{густота ссылок} =  \frac{\text{кол-во ссылок}}{\text{длина абзаца}}$
     - подсчитать кол-во непрерывных предложений, в которых есть хотя бы 1 ссылка
     - расположение секции в документе (мб нормализовать)
     - усреднение позиции in-line ссылок в каждой секции
 - **Сбор таргета**: (для статей с latex)
     - вытаскиваем название секции как колонку 
 - **Формирование финальной таблицы**:
     - объединяем таблицы
     - в зависимости от наличия в секции: RW, background, overview - назначаем 1 или 0
     - разделяем выборку на train & test
 - **Обучаем различные классификаторы и смотрим качество**:
     - LogReg
     - Xgboost
     - DT
     - RF
     - SVM
 - Также мб посмотреть если делаем учет по топ 2 секциям

### Сбор выборки

In [40]:
overview_papers = dict()

In [41]:
all_articles[0].keys()

dict_keys(['paper_id', 'metadata', 's2_pdf_hash', 'grobid_parse', 'latex_parse'])

In [42]:
all_articles[0]['grobid_parse'].keys()

dict_keys(['abstract', 'body_text', 'ref_entries', 'bib_entries'])

In [43]:
all_articles[0]['latex_parse'].keys()

dict_keys(['abstract', 'body_text', 'ref_entries', 'bib_entries'])

In [44]:
overview_papers[all_articles[0]['paper_id']] = {
    'paper_id':all_articles[0]['paper_id'],   'metadata':all_articles[0]['metadata'],
    's2_pdf_hash':all_articles[0]['s2_pdf_hash'], 'grobid_parse':None,'latex_parse':None}

In [45]:
overview_papers

{'10164018': {'paper_id': '10164018',
  'metadata': {'title': 'Reader-Aware Multi-Document Summarization: An Enhanced Model and The First Dataset',
   'authors': [{'first': 'Piji', 'middle': [], 'last': 'Li', 'suffix': ''},
    {'first': 'Lidong', 'middle': [], 'last': 'Bing', 'suffix': ''},
    {'first': 'Wai', 'middle': [], 'last': 'Lam', 'suffix': ''}],
   'abstract': 'We investigate the problem of reader-aware multi-document summarization (RA-MDS) and introduce a new dataset for this problem. To tackle RA-MDS, we extend a variational auto-encodes (VAEs) based MDS framework by jointly considering news documents and reader comments. To conduct evaluation for summarization performance, we prepare a new dataset. We describe the methods for data collection, aspect annotation, and summary writing as well as scrutinizing by experts. Experimental results show that reader comments can improve the summarization performance, which also demonstrates the usefulness of the proposed dataset. The 

In [46]:
all_articles[0]['grobid_parse']['body_text'][0].keys()

dict_keys(['text', 'cite_spans', 'ref_spans', 'eq_spans', 'section'])

In [47]:
def text2sentences(text):
    sentences = []
    text = text.replace('?','. ').replace('!','. ').replace('Mt.','Mt').replace('.)','. ').replace('•','').split('.')
    #.replace('al.','al ').replace('i.e.','ie').replace('e.g.','eg').replace('...','. ')
    for line in text:
#         print(line,len(sentences))
        if len(sentences) == 0:
            if len(line)>25:
                sentences.append(line)
            elif len(text)>1:
                text[1] = text[0]+'.'+ text[1]
            else:
                continue
        elif len(line.split())>2:
            if len(line)>2 and (line[0].isupper() or line[1].isupper()):
                sentences.append(line)
            elif len(line)>4 and ( line[1].isdigit() or line[2].isdigit()) and not (line[0].isdigit() and line[1].isdigit()): #line[0].isdigit() or
                new_line = line.replace('(','').replace(')','')
                if (new_line[2].isupper() or new_line[3].isupper()):
                    sentences.append(line)
                else:
                    line = '.'+line
                    sentences[-1] += line
            else:
                line = '.'+line
                sentences[-1] += line 
        else:
            sentences[-1] += '.'+line
    return sentences

#### Тестирование разбиения на предложения
- **1 Тест**
    - посмотреть на предложения, у которых длина маленькая и проверить все ли верно распарсили
- **2 Тест**
    - посмотреть длинные предложения, в которых много заглавных букв

**1 Тест**

In [48]:
for num_art, article in enumerate(all_articles):
    sentences_article = []
    if num_art >20:
        break
    for num_sec, section in enumerate(article['grobid_parse']['body_text']):
        sentences_sec = text2sentences(section['text'])
#         sentences_sec_len = [len(sentence) for sentence in sentences_sec]
        sentences_sec_len = []
        for sentence in sentences_sec:
            if len(sentence.split()) < 3:
                print(10*'==')
                print('paper_num = {0} paper_sec = {1} '.format(num_art,num_sec))
                print(len(sentence.split()),sentence)
                print(10*'==')
            sentences_sec_len.append(len(sentence.split()))
        sentences_article.append(sentences_sec_len)
    print(50*'-')
    print('paper_id = {0} | paper_num = {1} '.format(article['paper_id'],num_art))
    print(sentences_article)

--------------------------------------------------
paper_id = 10164018 | paper_num = 0 
[[53, 18, 21, 57, 22, 16, 8, 36, 8, 10, 17, 27, 15, 18, 27, 27, 11, 26, 20, 32, 25, 30, 45, 19, 10, 11, 8, 20, 11, 21, 11, 19, 20], [27, 32, 25, 39, 22], [29, 24, 16, 53, 13, 152, 25, 13, 15, 45, 13, 49, 76, 20, 30, 22, 55, 49, 10, 21, 15, 30, 112, 26, 19, 27, 25, 95, 18, 45, 20], [33, 18, 84, 10, 17, 15, 7, 18, 12], [11, 7], [13, 21, 10, 16, 7, 29, 9, 9, 21, 18, 34, 43, 23], [7, 11, 9, 12, 11, 22, 20, 20, 21, 19, 17, 17, 20, 14, 10, 21, 13, 24, 14, 16, 10], [10, 14, 10, 10, 13, 13, 41, 7], [11, 14, 8], [33, 21, 22, 33, 26, 16, 8, 9, 11, 11], [16, 27, 15, 21, 15, 17, 19], [16, 15, 29, 13, 20, 14, 25, 19, 33, 13, 21, 29], [41, 10, 18, 11, 22, 13, 34], [14, 21, 16, 19, 47]]
--------------------------------------------------
paper_id = 14472576 | paper_num = 1 
[[47, 36, 21, 5, 22], [28, 29, 38, 27, 23, 40, 18, 23, 12, 13, 18, 27, 25, 10, 12, 20, 40, 15, 10, 16, 34, 6, 45, 11, 51, 18, 36, 26, 11, 22, 1

[[19, 24, 13, 49, 29, 20, 24, 20], [20, 10, 23, 18, 11, 16, 51, 33, 7, 16, 20, 40, 26, 11, 26, 20, 17, 30, 20, 14, 12, 18, 17, 32, 24], [12, 13, 24, 21, 21, 26, 14, 24, 49, 35, 11, 11, 11, 14], [23, 31], [34, 19, 4, 29, 27, 12, 23, 16, 9, 13, 25, 19, 9, 21, 45, 36, 16, 22, 20, 28, 22, 18, 25, 24, 8, 16, 26, 11, 34, 21, 19, 38, 25, 16, 22, 19, 31, 46, 16, 15, 71, 23], [26, 13, 12, 16, 15, 12, 15, 20, 14, 15]]
--------------------------------------------------
paper_id = 823637 | paper_num = 13 
[[24, 23, 13, 10, 30, 10, 29, 23, 23, 43, 21, 16, 17, 12, 17, 27, 39, 28, 16, 14, 22, 15, 10, 17, 12, 19, 31, 25, 16, 26], [46, 14, 24, 30, 9, 39, 39, 12, 26, 11, 8], [15, 12, 42, 7, 15, 24, 29, 23, 42, 51, 55, 17, 7, 9, 26, 13, 34], [6, 17, 32, 29, 30, 22, 23, 24, 35, 57, 24, 19, 23, 27, 17, 20, 55, 50, 21, 10, 20, 21, 11, 22, 20, 13, 10, 18, 12, 17, 46, 13, 12, 27, 13, 27, 10, 19, 25, 13], [19, 4, 21, 16, 18, 16, 10, 22, 11, 6, 8, 28, 11, 14, 5, 8, 17, 16, 7, 21, 16, 15, 12, 23, 12, 30, 15, 12,

In [49]:
num_art = 1
num_sect = 8

In [50]:
all_articles[num_art]['grobid_parse']['body_text'][num_sect]['text']

'(x,c,z)∈D log p θ (z, c | x, w) − λ θ 1 . To optimize, we use AdaGrad (Duchi et al., 2010) .Features Table 4 describes the features. Our basic features mainly match words and bigrams in x and c, if they share a lemma or are aligned in the PPDB resource (Ganitkevitch et al., 2013) . We count the number of exact matches, PPDB matches, and unmatched words.To obtain lexical features, we run the Berkeley Aligner (Liang et al., 2006) on the training set and compute conditional probabilities of aligning one word type to another. Based on these probabilities we compute a maximum weight alignment A between words in x and c. We define features over A (see Table 4 ). We also use the word alignments to construct a phrase table by applying the consistent phrase pair heuristic (Och and Ney, 2004) . We define an indicator feature for every phrase pair of x and c that appear in the phrase table. Examples from the PUBLICATIONS domain include fewestleast number and by-whose author is. Note that we do n

In [51]:
text2sentences(all_articles[num_art]['grobid_parse']['body_text'][num_sect]['text'])

['(x,c,z)∈D log p θ (z, c | x, w) − λ θ 1 ',
 ' To optimize, we use AdaGrad (Duchi et al., 2010) ',
 'Features Table 4 describes the features',
 ' Our basic features mainly match words and bigrams in x and c, if they share a lemma or are aligned in the PPDB resource (Ganitkevitch et al., 2013) ',
 ' We count the number of exact matches, PPDB matches, and unmatched words',
 'To obtain lexical features, we run the Berkeley Aligner (Liang et al., 2006) on the training set and compute conditional probabilities of aligning one word type to another',
 ' Based on these probabilities we compute a maximum weight alignment A between words in x and c',
 ' We define features over A (see Table 4 )',
 ' We also use the word alignments to construct a phrase table by applying the consistent phrase pair heuristic (Och and Ney, 2004) ',
 ' We define an indicator feature for every phrase pair of x and c that appear in the phrase table',
 ' Examples from the PUBLICATIONS domain include fewestleast number 

In [52]:
len('This year comes the third') # меньше символов в секции уирать

25

**2 Тест**

In [53]:
import re
len(re.findall(r'[A-Z]',all_articles[num_art]['grobid_parse']['body_text'][num_sect]['text']))

48

In [54]:
for num_art, article in enumerate(all_articles):
    sentences_len_up_article = []
    if num_art >20:
        break
    for num_sec, section in enumerate(article['grobid_parse']['body_text']):
        sentences_sec = text2sentences(section['text'])
#         sentences_sec_len = [len(sentence) for sentence in sentences_sec]
        
        sentences_sec_len_up = []
        for sentence in sentences_sec:
            
            if len(re.findall(r'[A-Z]',sentence)) > 10:
                print(10*'==')
                print('paper_num = {0} paper_sec = {1} '.format(num_art,num_sec))
                print(len(re.findall(r'[A-Z]',sentence)),sentence)
                print(10*'==')
            sentences_sec_len_up.append(len(re.findall(r'[A-Z]',sentence)))
        sentences_len_up_article.append(sentences_sec_len)
    print(50*'-')
    print('paper_id = {0} | paper_num = {1} '.format(article['paper_id'],num_art))
    print(sentences_len_up_article)

paper_num = 0 paper_sec = 0 
13 The goal of multi-document summarization (MDS) is to automatically generate a brief, wellorganized summary for a topic which describes an event with a set of documents from different sources. (Goldstein et al., 2000; Erkan and Radev, 2004; Wan et al., 2007; Nenkova and McKeown, 2012; Min et al., 2012; Li et al., 2017) 
paper_num = 0 paper_sec = 0 
18 With the development of social media and mobile equipments, more and more user generated * The work described in this paper is supported by a grant from the Grant Council of the Hong Kong Special Administrative Region, China (Project Code: 14203414) .1 http://www.se.cuhk.edu.hk/˜textmine/ dataset/ra-mds/ NEWS: The most important announcements from Google's big developers' conference content is available
paper_num = 0 paper_sec = 0 
15 Recently, Li et al. (2017) proposed a sentence salience estimation framework known as VAESum based on a neural generative model called Variational Auto-Encoders (VAEs) (Kingma 

[[14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32]]
paper_num = 1 paper_sec = 2 
13 G is RELNP[r] of NP[x] → NP[RNext, the builder (backed by crowdsourcing) paraphrases each canonical utterance c output above into a set of natural utterances P(c) (e.g., "when was article 1 published. ")
paper_num = 1 paper_sec = 4 
12  Broadly speaking, the rules (R1)-(R4), (C1)-(C4) take a binary and a noun phrase, and compose them (optionally via comparatives, counting, and negation) to produce a complementizer phrase CP representing a unary (e.g., "that cites article 1" or "that cites more than three article"). (G3) combines these CP's with an NP (e.g., "article")
paper_num = 1 paper_sec = 4 
19  For example, in the SOCIAL domain, Alice's education can be repre

11  We would also like to find out whether a heuristic treatment of intensifiers and detensifiers, the normalization of character repetitions, or the inclusion of some punctuationbased features could further improve classifier performance. 13 For task B, even the extended unigram bag-of-words model by itself, without any additional resources, would have performed quite well as the 9th best constrained system on the Twitter test set (13th best system overall) and the 5th best system on the SMS test set.14 http://www.ark.cs.cmu.edu/TweetNLP/
--------------------------------------------------
paper_id = 17302615 | paper_num = 2 
[[14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32]]
paper_num = 3 paper_sec = 1 
14 Given a NI P2 N8 structure, these transt'ornmtions are obtained through corpus-based tuning 1The following symbols are used

20 Besides the general idea to allow for nonexperts to assess information encoded in RDF, we envision application of these verbalization templates in three scenarios: (1) In query interfaces to semantic databases, casual users -usually not capable of writing formal queries -specify their information needs using keywords (Lei et al., 2006; Thomas et al., 2007; Wang et al., 2008) , questions in free-text or using a controlled language (Kaufmann et al., 2006; Cimiano et al., 2008; Wendt et al., 2012; Damljanovic et al., 2012) , or forms (Hunter and Odat, 2011 Figure 1 : A template consists of a graph pattern GP and a sentence pattern SP 
paper_num = 5 paper_sec = 0 
12  The graph pattern GP can be transformed into a SPARQL query Q GP 
paper_num = 5 paper_sec = 0 
11  This graph can be verbalized as an English sentence S en using the English sentence pattern SP en or as a German sentence S de using the German sentence pattern SP de 
paper_num = 5 paper_sec = 2 
14  We denote the set of var

16  It includes three types of texts from the biomedical domain -namely, radiological reports, biological full papers and abstracts from the GENIA corpus. (15 new full biomedical papers were annotated for hedge cues and their scopes, which served as the evaluation database of the CoNLL-2010 Shared Task (Farkas et al., 2010) , and this dataset will be added to BioScope in the near future.  The annotation was carried out by two students of linguistics supervised by a linguist
paper_num = 7 paper_sec = 4 
12  Nested scopes One scope includes another one:These observations (suggest that TNF and PMA do (not lead to NFkappa B activation through induction of changes in the cell redox status))
paper_num = 7 paper_sec = 5 
25  In order to see what the main differences are between the corpora, the annotation principles were contrasted: in GENIA Event, no modifier keywords are marked, however, in BioScope, they are; the scope of speculation and negation is explicitly marked in BioScope and it can

paper_num = 11 paper_sec = 0 
50 The following NP level constructions are considered here (cf. the classifications provided by (Quirk et al.1985) and (Semmelmeyer and Bolander 1992) ):(1) Compound Nominals consisting of two consecutive nouns (eg night club -a TEMPORAL relation -indicating that club functions at night), (2) Adjective Noun constructions where the adjectival modifier is derived from a noun (eg musical clock -a MAKE/PRODUCE relation), (3) Genitives (eg the door of the car -a PART-WHOLE relation), and (4) Adjective phrases (cf. (Semmelmeyer and Bolander 1992) ) in which the modifier noun is expressed by a prepositional phrase which functions as an adjective (eg toy in the box -a LOCATION relation)
paper_num = 11 paper_sec = 0 
109 There are several semantic relations at the noun phrase level: (1) Saturday's snowfall is a genitive encoding a TEMPORAL relation, (2) one-day record is a TOPIC noun compound indicating that record is about one-day snowing -an ellipsis here, (3) r

[[14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32], [14, 20, 23, 9, 32]]
paper_num = 12 paper_sec = 1 
15  For any instances i p and i n in RP and RN, they can be in the same output set I of an IR system as:∀i p ∈ RP , ∀i n ∈ RN , ∃I, {i p , i n } ⊂ IPrecision and recall are thus not directly correlated
paper_num = 12 paper_sec = 1 
11 In Table 1 , the source Chinese sentence and its English translation in the form of character strings Table 1 : Sample sentences along with the output of two trivial segmenters (T1, T2) and three other segmenters (S1, S2, S3)
paper_num = 12 paper_sec = 1 
12  True Positives (TP), Precision (P), recall (R), F1-score (F) and true negative rate (TNR) are calculated respectively

44  The accuracy acc(E k ), macro-average avg M , and micro-average avg µ are defined as follows:acc(E k ) = T P (E k ) + T N (E k ) T P (E k ) + F P (E k ) + T N (E k ) + F N (E k ) , (3) avg M = 1 m m k=1 acc(E k ),(4)avg µ = acc(E k ) × N (E k ) m k=1 N (E k ) ,(5)where T P (E k ) is the set of test documents correctly classified to the emotion E k , F P (E k ) is the set of test documents incorrectly classified to the emotion, F N (E k ) is the set of test documents wrongly rejected, T N (E k ) is the set of test documents correctly rejected, and N (E k ) is the total number of documents in this emotion category
paper_num = 15 paper_sec = 7 
12  For example, as stated in the previous section, we observed that keywords related to "Happy" (in green) are mostly about sports, including terms such as team names (e.g., "熱火 (Miami Heat)" and "紅襪 (Boston Red Sox)") and player names (e.g., "陳偉殷 (Wei-Yin Chen)", a pitcher for the baseball team Baltimore Orioles)
paper_num = 15 paper_sec = 7 

18  There is an obvious connection between causal relation and temporal relation: by definition, the CAUSE event starts 'BEFORE' the EFFECT event
paper_num = 19 paper_sec = 8 
23  This definition is in line with the definition of CAUSE and PRECONDITION presented in the RED annotation guidelines (Ikuta et al., 2014) (to be discussed in Section 6)
paper_num = 19 paper_sec = 16 
11  Following the other discourse structure annotation tasks such as Rhetorical Structure Theory (RST), we aggregate all the relations captured by all annotators as the annotation object, then labeling 'NONE' as the category for coders who have not captured this relation
paper_num = 19 paper_sec = 17 
11 One of the most recent temporal annotation schemas is Temporal Histories of Your Medical Event (THYME) 
paper_num = 19 paper_sec = 17 
42  In their work, they combine the TimeML annotation schema with Allen Interval Algebra, identifying the five temporal relations BEFORE, OVERLAP, BEGINS-ON, ENDS-ON, and CONTAINS


Я считаю, что у нас приемлимое качество

Далее можно улучшать

##### Создание признаков
 - подсчитать $\text{густота ссылок} =  \frac{\text{кол-во ссылок}}{\text{длина абзаца}}$
 - подсчитать кол-во непрерывных предложений, в которых есть хотя бы 1 ссылка
 - расположение секции в документе (мб нормализовать)
 - усреднение позиции in-line ссылок в каждой секции

**подсчитаем кол-во ссылок в предложениях секции**

In [250]:
num_art = 0
num_sect = 0

In [251]:
for cite_span in all_articles[num_art]['grobid_parse']['body_text'][num_sect]['cite_spans']:
    print(cite_span)

{'start': 192, 'end': 216, 'text': '(Goldstein et al., 2000;', 'latex': None, 'ref_id': 'BIBREF6'}
{'start': 217, 'end': 239, 'text': 'Erkan and Radev, 2004;', 'latex': None, 'ref_id': 'BIBREF4'}
{'start': 240, 'end': 257, 'text': 'Wan et al., 2007;', 'latex': None, 'ref_id': 'BIBREF19'}
{'start': 258, 'end': 284, 'text': 'Nenkova and McKeown, 2012;', 'latex': None, 'ref_id': 'BIBREF16'}
{'start': 285, 'end': 302, 'text': 'Min et al., 2012;', 'latex': None, 'ref_id': 'BIBREF15'}
{'start': 303, 'end': 319, 'text': 'Li et al., 2017)', 'latex': None, 'ref_id': 'BIBREF11'}
{'start': 773, 'end': 797, 'text': '(Project Code: 14203414)', 'latex': None, 'ref_id': None}
{'start': 2288, 'end': 2305, 'text': '(Hu et al., 2008;', 'latex': None, 'ref_id': 'BIBREF7'}
{'start': 2306, 'end': 2324, 'text': 'Yang et al., 2011)', 'latex': None, 'ref_id': 'BIBREF22'}
{'start': 2582, 'end': 2598, 'text': 'Li et al. (2015)', 'latex': None, 'ref_id': 'BIBREF2'}
{'start': 2911, 'end': 2927, 'text': 'Li et al.

In [252]:
all_articles[num_art]['grobid_parse']['body_text'][num_sect]['text']

'The goal of multi-document summarization (MDS) is to automatically generate a brief, wellorganized summary for a topic which describes an event with a set of documents from different sources. (Goldstein et al., 2000; Erkan and Radev, 2004; Wan et al., 2007; Nenkova and McKeown, 2012; Min et al., 2012; Li et al., 2017) . In the typical setting of MDS, the input is a set of news documents about the same topic. The output summary is a piece of short text document containing several sentences, generated only based on the input original documents.With the development of social media and mobile equipments, more and more user generated * The work described in this paper is supported by a grant from the Grant Council of the Hong Kong Special Administrative Region, China (Project Code: 14203414) .1 http://www.se.cuhk.edu.hk/˜textmine/ dataset/ra-mds/ NEWS: The most important announcements from Google\'s big developers\' conference content is available. Figure 1 is a snapshot of reader comments

In [253]:
sent_text = nltk.sent_tokenize(all_articles[num_art]['grobid_parse']['body_text'][num_sect]['text'])
sent_text

['The goal of multi-document summarization (MDS) is to automatically generate a brief, wellorganized summary for a topic which describes an event with a set of documents from different sources.',
 '(Goldstein et al., 2000; Erkan and Radev, 2004; Wan et al., 2007; Nenkova and McKeown, 2012; Min et al., 2012; Li et al., 2017) .',
 'In the typical setting of MDS, the input is a set of news documents about the same topic.',
 "The output summary is a piece of short text document containing several sentences, generated only based on the input original documents.With the development of social media and mobile equipments, more and more user generated * The work described in this paper is supported by a grant from the Grant Council of the Hong Kong Special Administrative Region, China (Project Code: 14203414) .1 http://www.se.cuhk.edu.hk/˜textmine/ dataset/ra-mds/ NEWS: The most important announcements from Google's big developers' conference content is available.",
 'Figure 1 is a snapshot of 

In [254]:
sentences_sec = text2sentences(all_articles[num_art]['grobid_parse']['body_text'][num_sect]['text'])
sentences_sec

['The goal of multi-document summarization (MDS) is to automatically generate a brief, wellorganized summary for a topic which describes an event with a set of documents from different sources. (Goldstein et al., 2000; Erkan and Radev, 2004; Wan et al., 2007; Nenkova and McKeown, 2012; Min et al., 2012; Li et al., 2017) ',
 ' In the typical setting of MDS, the input is a set of news documents about the same topic',
 ' The output summary is a piece of short text document containing several sentences, generated only based on the input original documents',
 "With the development of social media and mobile equipments, more and more user generated * The work described in this paper is supported by a grant from the Grant Council of the Hong Kong Special Administrative Region, China (Project Code: 14203414) .1 http://www.se.cuhk.edu.hk/˜textmine/ dataset/ra-mds/ NEWS: The most important announcements from Google's big developers' conference content is available",
 ' Figure 1 is a snapshot of 

In [255]:
if len(sentences_sec) != len(sent_text):
    print(len(sentences_sec),len(sent_text))

33 31


**как видим наше разбиение на предложения лучше**

In [256]:
cite_spans_sec = all_articles[num_art]['grobid_parse']['body_text'][num_sect]['cite_spans']
cite_spans_sec

[{'start': 192,
  'end': 216,
  'text': '(Goldstein et al., 2000;',
  'latex': None,
  'ref_id': 'BIBREF6'},
 {'start': 217,
  'end': 239,
  'text': 'Erkan and Radev, 2004;',
  'latex': None,
  'ref_id': 'BIBREF4'},
 {'start': 240,
  'end': 257,
  'text': 'Wan et al., 2007;',
  'latex': None,
  'ref_id': 'BIBREF19'},
 {'start': 258,
  'end': 284,
  'text': 'Nenkova and McKeown, 2012;',
  'latex': None,
  'ref_id': 'BIBREF16'},
 {'start': 285,
  'end': 302,
  'text': 'Min et al., 2012;',
  'latex': None,
  'ref_id': 'BIBREF15'},
 {'start': 303,
  'end': 319,
  'text': 'Li et al., 2017)',
  'latex': None,
  'ref_id': 'BIBREF11'},
 {'start': 773,
  'end': 797,
  'text': '(Project Code: 14203414)',
  'latex': None,
  'ref_id': None},
 {'start': 2288,
  'end': 2305,
  'text': '(Hu et al., 2008;',
  'latex': None,
  'ref_id': 'BIBREF7'},
 {'start': 2306,
  'end': 2324,
  'text': 'Yang et al., 2011)',
  'latex': None,
  'ref_id': 'BIBREF22'},
 {'start': 2582,
  'end': 2598,
  'text': 'Li et a

In [257]:
cite_spans_sec_time = np.array(cite_spans_sec.copy())
cite_spans_sec_time

array([{'start': 192, 'end': 216, 'text': '(Goldstein et al., 2000;', 'latex': None, 'ref_id': 'BIBREF6'},
       {'start': 217, 'end': 239, 'text': 'Erkan and Radev, 2004;', 'latex': None, 'ref_id': 'BIBREF4'},
       {'start': 240, 'end': 257, 'text': 'Wan et al., 2007;', 'latex': None, 'ref_id': 'BIBREF19'},
       {'start': 258, 'end': 284, 'text': 'Nenkova and McKeown, 2012;', 'latex': None, 'ref_id': 'BIBREF16'},
       {'start': 285, 'end': 302, 'text': 'Min et al., 2012;', 'latex': None, 'ref_id': 'BIBREF15'},
       {'start': 303, 'end': 319, 'text': 'Li et al., 2017)', 'latex': None, 'ref_id': 'BIBREF11'},
       {'start': 773, 'end': 797, 'text': '(Project Code: 14203414)', 'latex': None, 'ref_id': None},
       {'start': 2288, 'end': 2305, 'text': '(Hu et al., 2008;', 'latex': None, 'ref_id': 'BIBREF7'},
       {'start': 2306, 'end': 2324, 'text': 'Yang et al., 2011)', 'latex': None, 'ref_id': 'BIBREF22'},
       {'start': 2582, 'end': 2598, 'text': 'Li et al. (2015)', 'lat

Old version

In [258]:
cite_spans_sec_time = np.array(cite_spans_sec.copy())
del_bibref = []
for num_sent,sentence in enumerate(sentences_sec):
    cite_spans_sec_time = [cite_span  for cite_span in  cite_spans_sec_time if not cite_span['ref_id'] in del_bibref] 
#     del_bibref = []
    for num_cite_span,cite_span in enumerate(cite_spans_sec_time):
        if len(cite_span['text'])<4:
            temp = ' '+cite_span['text']+' '
            if temp in sentence:
                print(num_sent,sentence)
                del_bibref.append(cite_span['ref_id'])
                print(cite_span)
                print(cite_spans_sec_time)
                print('---')
#                 del cite_spans_sec_time[num_cite_span]
        elif cite_span['text'] in sentence:
            print(num_sent,sentence)
            del_bibref.append(cite_span['ref_id'])
            print(cite_span)
            print(cite_spans_sec_time)
            print('---')
#             del cite_spans_sec_time[num_cite_span]

0 The goal of multi-document summarization (MDS) is to automatically generate a brief, wellorganized summary for a topic which describes an event with a set of documents from different sources. (Goldstein et al., 2000; Erkan and Radev, 2004; Wan et al., 2007; Nenkova and McKeown, 2012; Min et al., 2012; Li et al., 2017) 
{'start': 192, 'end': 216, 'text': '(Goldstein et al., 2000;', 'latex': None, 'ref_id': 'BIBREF6'}
[{'start': 192, 'end': 216, 'text': '(Goldstein et al., 2000;', 'latex': None, 'ref_id': 'BIBREF6'}, {'start': 217, 'end': 239, 'text': 'Erkan and Radev, 2004;', 'latex': None, 'ref_id': 'BIBREF4'}, {'start': 240, 'end': 257, 'text': 'Wan et al., 2007;', 'latex': None, 'ref_id': 'BIBREF19'}, {'start': 258, 'end': 284, 'text': 'Nenkova and McKeown, 2012;', 'latex': None, 'ref_id': 'BIBREF16'}, {'start': 285, 'end': 302, 'text': 'Min et al., 2012;', 'latex': None, 'ref_id': 'BIBREF15'}, {'start': 303, 'end': 319, 'text': 'Li et al., 2017)', 'latex': None, 'ref_id': 'BIBREF1

[{'start': 192, 'end': 216, 'text': '(Goldstein et al., 2000;', 'latex': None, 'ref_id': 'BIBREF6'}, {'start': 217, 'end': 239, 'text': 'Erkan and Radev, 2004;', 'latex': None, 'ref_id': 'BIBREF4'}, {'start': 240, 'end': 257, 'text': 'Wan et al., 2007;', 'latex': None, 'ref_id': 'BIBREF19'}, {'start': 258, 'end': 284, 'text': 'Nenkova and McKeown, 2012;', 'latex': None, 'ref_id': 'BIBREF16'}, {'start': 285, 'end': 302, 'text': 'Min et al., 2012;', 'latex': None, 'ref_id': 'BIBREF15'}, {'start': 303, 'end': 319, 'text': 'Li et al., 2017)', 'latex': None, 'ref_id': 'BIBREF11'}, {'start': 773, 'end': 797, 'text': '(Project Code: 14203414)', 'latex': None, 'ref_id': None}, {'start': 2288, 'end': 2305, 'text': '(Hu et al., 2008;', 'latex': None, 'ref_id': 'BIBREF7'}, {'start': 2306, 'end': 2324, 'text': 'Yang et al., 2011)', 'latex': None, 'ref_id': 'BIBREF22'}, {'start': 2582, 'end': 2598, 'text': 'Li et al. (2015)', 'latex': None, 'ref_id': 'BIBREF2'}, {'start': 2911, 'end': 2927, 'text':

In [259]:
del_bibref

['BIBREF6',
 'BIBREF4',
 'BIBREF19',
 'BIBREF16',
 'BIBREF15',
 'BIBREF11',
 None,
 'BIBREF7',
 'BIBREF22',
 'BIBREF2',
 'BIBREF9',
 'BIBREF18']

old version END

In [260]:
num_art = 0
num_sect = 0

sentences_sec = text2sentences(all_articles[num_art]['grobid_parse']['body_text'][num_sect]['text'])
cite_spans_sec = all_articles[num_art]['grobid_parse']['body_text'][num_sect]['cite_spans']

print('Расспечатаем из 500 статей: \nСтатьи в которых кол-во найденных ссылок в предложениях секции != кол-ву найденных inline ссылок')

for num_art,article in enumerate(all_articles):
    if num_art > 500:
        break 
    for num_sec,section in enumerate(article['grobid_parse']['body_text']):
        
        sentences_sec = text2sentences(section['text'])
        cite_spans_sec = section['cite_spans']
        cite_spans_sec_time = np.array(cite_spans_sec.copy())
        sum_prev_sects = 0
        sents_num_citations = []
        del_bib_start = []
        for num_sent,sentence in enumerate(sentences_sec):
            cite_spans_sec_time = [cite_span  for cite_span in  cite_spans_sec_time if not cite_span['start'] in del_bib_start]
            len_sent = len(sentence)+1
            sent_num_cits = 0
            for num_cite_span,cite_span in enumerate(cite_spans_sec_time):
                if len(cite_span['text'])<4:
                    temp = ' '+cite_span['text']+' '
                    if (cite_span['start'] >= (sum_prev_sects-2) and (cite_span['end']-3) <=(sum_prev_sects+len_sent)) or (temp in sentence) or (temp.replace('.','') in sentence):
                        sent_num_cits+=1
                        del_bib_start.append(cite_span['start'])
                elif (cite_span['start'] >= (sum_prev_sects-2) and (cite_span['end']-3) <=(sum_prev_sects+len_sent)) or (cite_span['text'] in sentence) or (cite_span['text'].replace('.','') in sentence):
                    sent_num_cits+=1
                    del_bib_start.append(cite_span['start'])
#                     print(cite_span)
#                     print(num_sent,sentence)
#                     print('---')
            sents_num_citations.append(sent_num_cits)
            sum_prev_sects += len_sent
#         print(num_art,num_sec)
        try:
            assert sum(sents_num_citations) == len(cite_spans_sec)
        except:
            print(num_art,num_sec)

Расспечатаем из 500 статей: 
Статьи в которых кол-во найденных ссылок в предложениях секции != кол-ву найденных inline ссылок
87 10
171 10
236 10
334 8
353 7
353 19
411 6
449 0


In [261]:
num_art = 0
num_sect = 0
article = all_articles[num_art]

In [262]:
for cite_span in article['grobid_parse']['body_text'][num_sect]['cite_spans']:
    print(cite_span)

{'start': 192, 'end': 216, 'text': '(Goldstein et al., 2000;', 'latex': None, 'ref_id': 'BIBREF6'}
{'start': 217, 'end': 239, 'text': 'Erkan and Radev, 2004;', 'latex': None, 'ref_id': 'BIBREF4'}
{'start': 240, 'end': 257, 'text': 'Wan et al., 2007;', 'latex': None, 'ref_id': 'BIBREF19'}
{'start': 258, 'end': 284, 'text': 'Nenkova and McKeown, 2012;', 'latex': None, 'ref_id': 'BIBREF16'}
{'start': 285, 'end': 302, 'text': 'Min et al., 2012;', 'latex': None, 'ref_id': 'BIBREF15'}
{'start': 303, 'end': 319, 'text': 'Li et al., 2017)', 'latex': None, 'ref_id': 'BIBREF11'}
{'start': 773, 'end': 797, 'text': '(Project Code: 14203414)', 'latex': None, 'ref_id': None}
{'start': 2288, 'end': 2305, 'text': '(Hu et al., 2008;', 'latex': None, 'ref_id': 'BIBREF7'}
{'start': 2306, 'end': 2324, 'text': 'Yang et al., 2011)', 'latex': None, 'ref_id': 'BIBREF22'}
{'start': 2582, 'end': 2598, 'text': 'Li et al. (2015)', 'latex': None, 'ref_id': 'BIBREF2'}
{'start': 2911, 'end': 2927, 'text': 'Li et al.

In [263]:
for num_sec,section in enumerate(article['grobid_parse']['body_text']):
    if num_sec > 0:
        break
    sentences_sec = text2sentences(section['text'])
    cite_spans_sec = section['cite_spans']
    cite_spans_sec_time = np.array(cite_spans_sec.copy())
    sum_prev_sects = 0
    sents_num_citations = []
    del_bib_start = []
    for num_sent,sentence in enumerate(sentences_sec):
        
        cite_spans_sec_time = [cite_span  for cite_span in  cite_spans_sec_time if not cite_span['start'] in del_bib_start]
        len_sent = len(sentence)+1
        sent_num_cits = 0
        for num_cite_span,cite_span in enumerate(cite_spans_sec_time):
            temp = cite_span['text']
            if len(cite_span['text'])<4:
                temp = ' '+cite_span['text']+' '
                if (cite_span['start'] >= sum_prev_sects and cite_span['end'] <=(sum_prev_sects+len_sent)) or (temp in sentence):
                    sent_num_cits+=1
                    del_bib_start.append(cite_span['start'])
                    print('---')
                    print(cite_span)
                    print(num_sent,sentence)
                    print('---')
            elif (cite_span['start'] >= (sum_prev_sects-2) and (cite_span['end']-3) <=(sum_prev_sects+len_sent)) or (temp in sentence) or (temp.replace('.','') in sentence):
                sent_num_cits+=1
                del_bib_start.append(cite_span['start'])
                print(cite_span)
                print(num_sent,sentence)
                print('---')
        sents_num_citations.append(sent_num_cits)
        sum_prev_sects += len_sent
    try:
        assert sum(sents_num_citations) == len(cite_spans_sec)
        print(20*'==')
    except:
        print('!!!!',num_sec)

{'start': 192, 'end': 216, 'text': '(Goldstein et al., 2000;', 'latex': None, 'ref_id': 'BIBREF6'}
0 The goal of multi-document summarization (MDS) is to automatically generate a brief, wellorganized summary for a topic which describes an event with a set of documents from different sources. (Goldstein et al., 2000; Erkan and Radev, 2004; Wan et al., 2007; Nenkova and McKeown, 2012; Min et al., 2012; Li et al., 2017) 
---
{'start': 217, 'end': 239, 'text': 'Erkan and Radev, 2004;', 'latex': None, 'ref_id': 'BIBREF4'}
0 The goal of multi-document summarization (MDS) is to automatically generate a brief, wellorganized summary for a topic which describes an event with a set of documents from different sources. (Goldstein et al., 2000; Erkan and Radev, 2004; Wan et al., 2007; Nenkova and McKeown, 2012; Min et al., 2012; Li et al., 2017) 
---
{'start': 240, 'end': 257, 'text': 'Wan et al., 2007;', 'latex': None, 'ref_id': 'BIBREF19'}
0 The goal of multi-document summarization (MDS) is to au

In [264]:
sentences_sec = text2sentences(article['grobid_parse']['body_text'][0]['text'])
cite_spans_sec = article['grobid_parse']['body_text'][0]['cite_spans']

cite_spans_sec_time = np.array(cite_spans_sec.copy())
sum_prev_sects = 0
sents_num_citations = []
del_bib_start = []
for num_sent,sentence in enumerate(sentences_sec):
    cite_spans_sec_time = [cite_span  for cite_span in  cite_spans_sec_time if not cite_span['start'] in del_bib_start]
    len_sent = len(sentence)+1
    sent_num_cits = 0
    for num_cite_span,cite_span in enumerate(cite_spans_sec_time):
        temp = cite_span['text']
        if len(cite_span['text'])<4:
            temp = ' '+cite_span['text']+' '
            if (cite_span['start'] >= sum_prev_sects and cite_span['end'] <=(sum_prev_sects+len_sent)) or (temp in sentence):
                sent_num_cits+=1
                del_bib_start.append(cite_span['start'])
                print('---')
                print(cite_span)
                print(num_sent,sentence)
                print('---')
        elif (cite_span['start'] >= (sum_prev_sects-2) and (cite_span['end']-3) <=(sum_prev_sects+len_sent)) or (temp in sentence) or (temp.replace('.','') in sentence):
            sent_num_cits+=1
            del_bib_start.append(cite_span['start'])
            print(cite_span)
            print(num_sent,sentence)
            print('---')
    sents_num_citations.append(sent_num_cits)
    print(sentence)
    print(num_sent,sum_prev_sects,sum_prev_sects+len_sent)
    sum_prev_sects += len_sent
    
print(num_art,num_sec)
assert sum(sents_num_citations) == len(cite_spans_sec)

{'start': 192, 'end': 216, 'text': '(Goldstein et al., 2000;', 'latex': None, 'ref_id': 'BIBREF6'}
0 The goal of multi-document summarization (MDS) is to automatically generate a brief, wellorganized summary for a topic which describes an event with a set of documents from different sources. (Goldstein et al., 2000; Erkan and Radev, 2004; Wan et al., 2007; Nenkova and McKeown, 2012; Min et al., 2012; Li et al., 2017) 
---
{'start': 217, 'end': 239, 'text': 'Erkan and Radev, 2004;', 'latex': None, 'ref_id': 'BIBREF4'}
0 The goal of multi-document summarization (MDS) is to automatically generate a brief, wellorganized summary for a topic which describes an event with a set of documents from different sources. (Goldstein et al., 2000; Erkan and Radev, 2004; Wan et al., 2007; Nenkova and McKeown, 2012; Min et al., 2012; Li et al., 2017) 
---
{'start': 240, 'end': 257, 'text': 'Wan et al., 2007;', 'latex': None, 'ref_id': 'BIBREF19'}
0 The goal of multi-document summarization (MDS) is to au

19 2901 3119
 During our investigation, we find that the Gaussian based VAEs have a strong ability to capture the salience information and filter the noise from texts
20 3119 3273
 Intuitively, if we feed both the news sentences and the comment sentences into the VAEs, commonly existed latent aspect information from both of them will be enhanced and become salient
21 3273 3460
 Inspired by this consideration, to address the sentence salience estimation problem for RA-MDS by jointly considering news documents and reader comments, we extend the VAESum framework by training the news sentence latent model and the comment sentence latent model simultaneously by sharing the neural parameters
22 3460 3775
 After estimating the sentence salience, we employ a phrase based compressive unified optimization framework to generate a final summary
23 3775 3912
There is a lack of high-quality dataset suitable for RA-MDS
24 3912 3972
 Existing datasets from DUC 3 and TAC 4 are not appropriate
25 3972 4

In [265]:
for cite_span in article['grobid_parse']['body_text'][num_sect]['cite_spans']:
    print(cite_span)

{'start': 192, 'end': 216, 'text': '(Goldstein et al., 2000;', 'latex': None, 'ref_id': 'BIBREF6'}
{'start': 217, 'end': 239, 'text': 'Erkan and Radev, 2004;', 'latex': None, 'ref_id': 'BIBREF4'}
{'start': 240, 'end': 257, 'text': 'Wan et al., 2007;', 'latex': None, 'ref_id': 'BIBREF19'}
{'start': 258, 'end': 284, 'text': 'Nenkova and McKeown, 2012;', 'latex': None, 'ref_id': 'BIBREF16'}
{'start': 285, 'end': 302, 'text': 'Min et al., 2012;', 'latex': None, 'ref_id': 'BIBREF15'}
{'start': 303, 'end': 319, 'text': 'Li et al., 2017)', 'latex': None, 'ref_id': 'BIBREF11'}
{'start': 773, 'end': 797, 'text': '(Project Code: 14203414)', 'latex': None, 'ref_id': None}
{'start': 2288, 'end': 2305, 'text': '(Hu et al., 2008;', 'latex': None, 'ref_id': 'BIBREF7'}
{'start': 2306, 'end': 2324, 'text': 'Yang et al., 2011)', 'latex': None, 'ref_id': 'BIBREF22'}
{'start': 2582, 'end': 2598, 'text': 'Li et al. (2015)', 'latex': None, 'ref_id': 'BIBREF2'}
{'start': 2911, 'end': 2927, 'text': 'Li et al.

Если мы  посмотрим почему есть несоотвествие, то заметим что ошибка в распознавании идентификатора in-line ссылки

Функции для рассчета признака

In [266]:
def sect_num_citations(section):
    sentences_sec = text2sentences(section['text'])
    cite_spans_sec = section['cite_spans']
    cite_spans_sec_time = np.array(cite_spans_sec.copy())
    sum_prev_sects = 0
    sents_num_citations = []
    del_bib_start = []
    for num_sent,sentence in enumerate(sentences_sec):
        cite_spans_sec_time = [cite_span  for cite_span in  cite_spans_sec_time if not cite_span['start'] in del_bib_start]
        len_sent = len(sentence)+1
        sent_num_cits = 0
        for num_cite_span,cite_span in enumerate(cite_spans_sec_time):
            if len(cite_span['text'])<4:
                temp = ' '+cite_span['text']+' '
                if (cite_span['start'] >= sum_prev_sects and cite_span['end'] <=(sum_prev_sects+len_sent)) or (temp in sentence) or (temp.replace('.','') in sentence):
                    sent_num_cits+=1
                    del_bib_start.append(cite_span['start'])
            elif (cite_span['start'] >= (sum_prev_sects-2) and (cite_span['end']-3) <=(sum_prev_sects+len_sent)) or (cite_span['text'] in sentence) or (cite_span['text'].replace('.','') in sentence):
                sent_num_cits+=1
                del_bib_start.append(cite_span['start'])
#                 print(cite_span)
#                 print(num_sent,sentence)
#                 print('---')
        sents_num_citations.append(sent_num_cits)
        sum_prev_sects += len_sent
    # Checking соотвествия кол-ва найденных ссылок в предложениях и кол-ва всех ссылок
#     try:
#         assert sum(sents_num_citations) == len(cite_spans_sec)
#     except:
#         print(num_art,num_sec)
    return sents_num_citations

In [267]:
def paper_num_citations(article):
    if article['grobid_parse'] and article['grobid_parse']['body_text']:
        papers_num_citations = []
        for num_sec,section in enumerate(article['grobid_parse']['body_text']):
            sents_num_citations = sect_num_citations(section)
            papers_num_citations.append(sents_num_citations)
        return papers_num_citations
    else:
        return [-1]

In [268]:
paper_num_citations(all_articles[0])

[[6,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  0,
  0,
  1,
  0,
  3,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0, 0, 1, 0, 0],
 [2,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  3,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [2, 0, 0, 0, 0, 0, 0, 1, 0],
 [0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [0, 1, 0],
 [0, 0, 1, 1, 2, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 1],
 [0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 1]]

In [273]:
num_art = 0
num_sect = 0
all_articles[num_art]['grobid_parse']['body_text'][num_sect]['text']

'The goal of multi-document summarization (MDS) is to automatically generate a brief, wellorganized summary for a topic which describes an event with a set of documents from different sources. (Goldstein et al., 2000; Erkan and Radev, 2004; Wan et al., 2007; Nenkova and McKeown, 2012; Min et al., 2012; Li et al., 2017) . In the typical setting of MDS, the input is a set of news documents about the same topic. The output summary is a piece of short text document containing several sentences, generated only based on the input original documents.With the development of social media and mobile equipments, more and more user generated * The work described in this paper is supported by a grant from the Grant Council of the Hong Kong Special Administrative Region, China (Project Code: 14203414) .1 http://www.se.cuhk.edu.hk/˜textmine/ dataset/ra-mds/ NEWS: The most important announcements from Google\'s big developers\' conference content is available. Figure 1 is a snapshot of reader comments

In [274]:
article['grobid_parse']['body_text'][0]['cite_spans']

[{'start': 57,
  'end': 78,
  'text': '(Wilson et al., 2013)',
  'latex': None,
  'ref_id': 'BIBREF7'},
 {'start': 369,
  'end': 379,
  'text': 'Liu (2012)',
  'latex': None,
  'ref_id': 'BIBREF3'},
 {'start': 893, 'end': 894, 'text': '1', 'latex': None, 'ref_id': None}]

In [275]:
print(list(map(lambda x: int(x['start']),article['grobid_parse']['body_text'][0]['cite_spans'])))
np.mean(list(map(lambda x: int(x['start']),article['grobid_parse']['body_text'][0]['cite_spans'])))

[57, 369, 893]


439.6666666666667

In [276]:
grobid_parse_overview = dict()
article = all_articles[num_art]
len_sects = len(article['grobid_parse']['body_text'])
for num_sec,sections in enumerate(article['grobid_parse']['body_text']):
    grobid_parse_overview[num_sec] = sections
    print(sections)
    cnt_citations = len(grobid_parse_overview[num_sec]['cite_spans'])
    sect_len = len(sections['text'])
    sect_len_toks = len(sections['text'].split())
    cite_spans_sec_start = list(map(lambda x: int(x['start']),grobid_parse_overview[num_sec]['cite_spans']))
    cite_spans_sec_start_mean = 0
    sents_num_citations = sect_num_citations(sections)
    if len(cite_spans_sec_start) >=1:
        cite_spans_sec_start_mean = np.mean(cite_spans_sec_start)/sect_len
    print('Features: \nciting dencity {0}: #citations={1} & sect_len_toks={2}|  '.format(cnt_citations/sect_len_toks, cnt_citations, sect_len_toks))
    print('Number of citations in sentences: {0}'.format(sents_num_citations))
    print('Positon sect: {0} : num_sec = {1} len_sects={2}'.format((num_sec+1)/len_sects,num_sec+1,len_sects))
    print('Positon aver. citation: {0} {1} len= {2}'.format(cite_spans_sec_start_mean,cite_spans_sec_start,sect_len))
    print(10*'==')

{'text': 'The goal of multi-document summarization (MDS) is to automatically generate a brief, wellorganized summary for a topic which describes an event with a set of documents from different sources. (Goldstein et al., 2000; Erkan and Radev, 2004; Wan et al., 2007; Nenkova and McKeown, 2012; Min et al., 2012; Li et al., 2017) . In the typical setting of MDS, the input is a set of news documents about the same topic. The output summary is a piece of short text document containing several sentences, generated only based on the input original documents.With the development of social media and mobile equipments, more and more user generated * The work described in this paper is supported by a grant from the Grant Council of the Hong Kong Special Administrative Region, China (Project Code: 14203414) .1 http://www.se.cuhk.edu.hk/˜textmine/ dataset/ra-mds/ NEWS: The most important announcements from Google\'s big developers\' conference content is available. Figure 1 is a snapshot of reader

{'text': 'Variational Autoencoders (VAEs) (Kingma and Welling, 2014; Rezende et al., 2014 ) is a generative model based on neural networks which can be used to conduct latent semantic modeling. Li et al. (2017) employ VAEs to map the news sentences into a latent semantic space, which is helpful in improving the MDS performance. Similarly, we also employ VAEs to conduct the semantic modeling for news sentences and comment sentences. Assume that both the prior and posterior of the latent variables are Gaussian, i.e., p θ (z) = N (0, I) and q φ (z|x) = N (z; µ, σ 2 I), where µ and σ denote the variational mean and standard deviation respectively, which can be calculated with a multilayer perceptron (MLP). VAEs can be divided into two phases, namely, encoding (inference), and decoding (generation). All the operations are de- picted as follows:h enc = relu(W xh x + b xh ) µ = W hµ h enc + b hµ log(σ 2 ) = W hσ h enc + b hσ ε ∼ N (0, I), z = µ + σ ⊗ ε h dec = relu(W zh z + b zh ) x = sigmoid

{'text': 'In order to produce reader-aware summaries, inspired by the phrase-based model in and Li et al. (2015) , we refine this model to consider the news sentences salience information obtained by our framework. Based on the parsed constituency tree for each input sentence, we extract the noun-phrases (NPs) and verb-phrases (VPs). The overall objective function of this optimization formulation for selecting salient NPs and VPs is formulated as an integer linear programming (ILP) problem:max{ i α i S i − i<j α ij (S i + S j )R ij },(12)where α i is the selection indicator for the phrase P i , S i is the salience scores of P i , α ij and R ij is co-occurrence indicator and the similarity a pair of phrases (P i , P j ) respectively. The similarity is calculated with the Jaccard Index based method. In order to obtain coherent summaries with good readability, we add some constraints into the ILP framework. For details, please refer to Woodsend and Lapata (2012), , and Li et al. (2015) . 

citing dencity 0.029411764705882353: #citations=1 & sect_len_toks=34|  
Number of citations in sentences: [0, 1, 0]
Positon sect: 0.6428571428571429 : num_sec = 9 len_sects=14
Positon aver. citation: 0.5297029702970297 [107] len= 202
{'text': 'To evaluate the performance of our dataset and the proposed framework RAVAESum for RA-MDS, we compare our model with the following methods:• RA-Sparse : It is a framework to tackle the RA-MDS problem. A sparse-coding-based method is used to calculate the salience of the news sentences by jointly considering news documents and reader comments.• Lead (Wasson, 1998) : It ranks the news sentences chronologically and extracts the leading sentences one by one until the length limit.• Centroid (Radev et al., 2000) : It summarizes clusters of news articles automatically grouped by a topic detection system, and then it uses information from the centroids of the clusters to select sentences.• LexRank (Erkan and Radev, 2004) and TextRank (Mihalcea and Tarau

{'text': 'We investigate the problem of reader-aware multidocument summarization (RA-MDS) and introduce a new dataset. To tackle the RA-MDS, we extend a variational auto-encodes (VAEs) based MDS framework by jointly considering news documents and reader comments. The methods for data collection, aspect annotation, and summary writing and scrutinizing by experts are described. Experimental results show that reader comments can improve the summarization performance, which demonstrate the usefulness of the proposed dataset. Sony, headset, game, virtual, morpheus, reality, vr, project, playstation, Yoshida +C Sony, game, vr, virtual, headset, reality, morpheus, oculus, project, playstation "Bitcoin Mt. Gox Offlile" −C bitcoin, gox, exchange, mt., currency, Gox, virtual, company, money, price +C bitcoin, currency, money, exchange, gox, mt., virtual, company, price, world ', 'cite_spans': [{'start': 517, 'end': 868, 'text': 'Sony, headset, game, virtual, morpheus, reality, vr, project, plays

In [180]:
grobid_parse_overview

{0: {'text': 'The SemEval-2013 task on "Sentiment Analysis in Twitter" (Wilson et al., 2013) focuses on polarity classification, i. e. the problem of determining whether a textual unit, e. g. a document, paragraph, sentence or phrase, expresses a positive, negative or neutral sentiment (for a review of research topics and recent developments in the field of sentiment analysis see Liu (2012) ). There are two subtasks: in task B, "Message Polarity Classification", whole messages have to be classified as being of positive, negative or neutral sentiment; in task A, "Contextual Polarity Disambiguation", a marked instance of a word or phrase has to be classified in the context of a whole message.The training data for task B consist of approximately 10 200 manually annotated Twitter messages, the training data for task A of approximately 9 500 marked instances in approximately 6 300 Twitter messages. 1 The test data consist of in-domain Twitter messages (3 813 messages for task B and 4 435 ma

для latex_parse:
 1. Собираем все абзацы по секциям
 2. Рассчёт признаков

In [374]:
article = all_articles[0]
latex_parse_overview = dict()
for sections in article['latex_parse']['body_text']:
    latex_parse_overview[sections['section']] = sections
    print(sections['section'],len(sections['cite_spans']))
    print(sections)
    print(10*'==')

Introduction 7
{'text': 'The goal of multi-document summarization (MDS) is to automatically generate a brief, well-organized summary for a topic which describes an event with a set of documents from different sources. BIBREF0 , BIBREF1 , BIBREF2 , BIBREF3 , BIBREF4 , BIBREF5 , BIBREF6 . In the typical setting of MDS, the input is a set of news documents about the same topic. The output summary is a piece of short text document containing several sentences, generated only based on the input original documents.', 'cite_spans': [{'start': 193, 'end': 200, 'text': None, 'latex': None, 'ref_id': 'BIBREF0'}, {'start': 203, 'end': 210, 'text': None, 'latex': None, 'ref_id': 'BIBREF1'}, {'start': 213, 'end': 220, 'text': None, 'latex': None, 'ref_id': 'BIBREF2'}, {'start': 223, 'end': 230, 'text': None, 'latex': None, 'ref_id': 'BIBREF3'}, {'start': 233, 'end': 240, 'text': None, 'latex': None, 'ref_id': 'BIBREF4'}, {'start': 243, 'end': 250, 'text': None, 'latex': None, 'ref_id': 'BIBREF5'}, 

{'text': 'Variational Autoencoders (VAEs) BIBREF10 , BIBREF11 is a generative model based on neural networks which can be used to conduct latent semantic modeling. BIBREF6 employ VAEs to map the news sentences into a latent semantic space, which is helpful in improving the MDS performance. Similarly, we also employ VAEs to conduct the semantic modeling for news sentences and comment sentences. Assume that both the prior and posterior of the latent variables are Gaussian, i.e., INLINEFORM0 and INLINEFORM1 , where INLINEFORM2 and INLINEFORM3 denote the variational mean and standard deviation respectively, which can be calculated with a multilayer perceptron (MLP). VAEs can be divided into two phases, namely, encoding (inference), and decoding (generation). All the operations are depicted as follows: DISPLAYFORM0 ', 'cite_spans': [{'start': 32, 'end': 40, 'text': None, 'latex': None, 'ref_id': 'BIBREF10'}, {'start': 43, 'end': 51, 'text': None, 'latex': None, 'ref_id': 'BIBREF11'}, {'star

{'text': 'VAESum BIBREF6 employs an alignment mechanism BIBREF12 , BIBREF13 to recall the lost detailed information from the input sentence. Inspired this idea, we design a jointly weighted alignment mechanism by considering the news sentence and the comment sentence simultaneously. For each decoder hidden state INLINEFORM0 , we align it with each news encoder hidden state INLINEFORM1 by an alignment vector INLINEFORM2 . We also align it with each comments encoder hidden state INLINEFORM3 by an alignment vector INLINEFORM4 . In order to filter the noisy information from the comments, we again employ the comment weight INLINEFORM5 to adjust the alignment vector of comments: DISPLAYFORM0 ', 'cite_spans': [{'start': 7, 'end': 14, 'text': None, 'latex': None, 'ref_id': 'BIBREF6'}, {'start': 46, 'end': 54, 'text': None, 'latex': None, 'ref_id': 'BIBREF12'}, {'start': 57, 'end': 65, 'text': None, 'latex': None, 'ref_id': 'BIBREF13'}], 'ref_spans': [], 'eq_spans': [{'start': 304, 'end': 315, 

{'text': 'Because we have different representations from different vector space for the sentences, therefore we can calculate the comment weight in different semantic vector space. Here we use two spaces, namely, latent semantic space obtained by VAEs, and the original bag-of-words vector space. Then we can merge the weights by a parameter INLINEFORM0 : DISPLAYFORM0 ', 'cite_spans': [], 'ref_spans': [], 'eq_spans': [{'start': 332, 'end': 343, 'text': 'λ p ', 'latex': '\\lambda _p', 'ref_id': None}, {'start': 346, 'end': 358, 'text': 'ρ=λ p ×ρ z +(1-λ p )×ρ x ', 'latex': '\n\\rho  = \\lambda _p \\times \\rho _z + (1-\\lambda _p) \\times \\rho _x\n', 'ref_id': 'EQREF20'}], 'section': 'Reader-Aware Salience Estimation'}
Reader-Aware Salience Estimation 0
{'text': 'where INLINEFORM0 and INLINEFORM1 are the comment weight calculated from latent semantic space and term vector space. Actually, we can regard INLINEFORM2 as some gates to control the proportion of each comment sentence absorbed 

{'text': 'Each topic is assigned to 4 experts, who are major in journalism, to conduct the summary writing. The task of summary writing is divided into two phases, namely, aspect facet identification, and summary generation. For the aspect facet identification, the experts read and digested all the news documents and reader comments under the topic. Then for each aspect, the experts extracted the related facets from the news document. The summaries were generated based on the annotated aspect facets. When selecting facets, one consideration is those facets that are popular in both news documents and reader comments have higher priority. Next, the facets that are popular in news documents have the next priority. The generated summary should cover as many aspects as possible, and should be well-organized using complete sentences with a length restriction of 100 words.', 'cite_spans': [], 'ref_spans': [], 'eq_spans': [], 'section': 'Data Collection'}
Data Collection 0
{'text': 'After fini

{'text': 'To further investigate the effectiveness of our proposed RAVAESum framework, we adjust our framework by removing the comments related components. Then the model settings of RAVAESum-noC are similar to VAESum BIBREF6 . The evaluation results are shown in Table TABREF42 , which illustrate that our framework with reader comments RAVAESum is better than RAVAESum-noC significantly( INLINEFORM0 ).', 'cite_spans': [{'start': 208, 'end': 215, 'text': None, 'latex': None, 'ref_id': 'BIBREF6'}], 'ref_spans': [{'start': 260, 'end': 268, 'text': None, 'latex': None, 'ref_id': 'TABREF42'}], 'eq_spans': [{'start': 380, 'end': 391, 'text': 'p<0.05', 'latex': 'p<0.05', 'ref_id': None}], 'section': 'Further Investigation of Our Framework '}
Further Investigation of Our Framework  1
{'text': "Moreover, as mentioned in VAESum BIBREF6 , the output aspect vectors contain the word salience information. Then we select the top-10 terms for event “Sony Virtual Reality PS4”, and “`Bitcoin Mt. Gox Offl

In [375]:
article['latex_parse']

{'abstract': [],
 'body_text': [{'text': 'The goal of multi-document summarization (MDS) is to automatically generate a brief, well-organized summary for a topic which describes an event with a set of documents from different sources. BIBREF0 , BIBREF1 , BIBREF2 , BIBREF3 , BIBREF4 , BIBREF5 , BIBREF6 . In the typical setting of MDS, the input is a set of news documents about the same topic. The output summary is a piece of short text document containing several sentences, generated only based on the input original documents.',
   'cite_spans': [{'start': 193,
     'end': 200,
     'text': None,
     'latex': None,
     'ref_id': 'BIBREF0'},
    {'start': 203,
     'end': 210,
     'text': None,
     'latex': None,
     'ref_id': 'BIBREF1'},
    {'start': 213,
     'end': 220,
     'text': None,
     'latex': None,
     'ref_id': 'BIBREF2'},
    {'start': 223,
     'end': 230,
     'text': None,
     'latex': None,
     'ref_id': 'BIBREF3'},
    {'start': 233,
     'end': 240,
     'te

In [376]:
latex_parse_overview = dict()
for sections in article['latex_parse']['body_text']:
    if sections['section'] in latex_parse_overview:
        # если есть дублирование, такое бывает у первых часте
        if latex_parse_overview[sections['section']] == sections:
            continue
        else:
            latex_parse_overview[sections['section']]['text'].append(sections['text'])
            latex_parse_overview[sections['section']]['cite_spans'].append(sections['cite_spans'])
            latex_parse_overview[sections['section']]['cite_span_lens'].append(len(sections['cite_spans']))
            latex_parse_overview[sections['section']]['cite_spans_start'].append(list(map(lambda x: int(x['start']),sections['cite_spans'])))
            latex_parse_overview[sections['section']]['section'].append(sections['section'])
#             latex_parse_overview[sections['section']]['section'].append(sections['section'])
    else:
        latex_parse_overview[sections['section']] = {'text':[sections['text']],   'cite_spans':[sections['cite_spans']],
                                                      'cite_span_lens':[len(sections['cite_spans'])],
                                                      'cite_spans_start':[list(map(lambda x: int(x['start']),sections['cite_spans']))],
                                                      'section':[sections['section']]}#, 
                                                      #'bib_entries':article['latex_parse']}
       
    
for sections in latex_parse_overview:
    sections_dict = latex_parse_overview[sections]
    print(sections,len(sections_dict['cite_spans']),sections_dict['cite_span_lens'])
    print(sections_dict)
    print(10*'==')

Introduction 6 [7, 0, 3, 3, 0, 0]
{'text': ['The goal of multi-document summarization (MDS) is to automatically generate a brief, well-organized summary for a topic which describes an event with a set of documents from different sources. BIBREF0 , BIBREF1 , BIBREF2 , BIBREF3 , BIBREF4 , BIBREF5 , BIBREF6 . In the typical setting of MDS, the input is a set of news documents about the same topic. The output summary is a piece of short text document containing several sentences, generated only based on the input original documents.', "With the development of social media and mobile equipments, more and more user generated content is available. Figure FIGREF2 is a snapshot of reader comments under the news report “The most important announcements from Google's big developers' conference”. The content of the original news report talks about some new products based on AI techniques. The news report generally conveys an enthusiastic tone. However, while some readers share similar enthusiasms,

{'text': ['Variational Autoencoders (VAEs) BIBREF10 , BIBREF11 is a generative model based on neural networks which can be used to conduct latent semantic modeling. BIBREF6 employ VAEs to map the news sentences into a latent semantic space, which is helpful in improving the MDS performance. Similarly, we also employ VAEs to conduct the semantic modeling for news sentences and comment sentences. Assume that both the prior and posterior of the latent variables are Gaussian, i.e., INLINEFORM0 and INLINEFORM1 , where INLINEFORM2 and INLINEFORM3 denote the variational mean and standard deviation respectively, which can be calculated with a multilayer perceptron (MLP). VAEs can be divided into two phases, namely, encoding (inference), and decoding (generation). All the operations are depicted as follows: DISPLAYFORM0 ', 'Based on the reparameterization trick in Equation EQREF9 , we can get the analytical representation of the variational lower bound INLINEFORM0 : DISPLAYFORM0 ', 'where INLIN

{'text': ['In order to produce reader-aware summaries, inspired by the phrase-based model in BIBREF5 and BIBREF9 , we refine this model to consider the news sentences salience information obtained by our framework. Based on the parsed constituency tree for each input sentence, we extract the noun-phrases (NPs) and verb-phrases (VPs). The overall objective function of this optimization formulation for selecting salient NPs and VPs is formulated as an integer linear programming (ILP) problem: DISPLAYFORM0 ', 'where INLINEFORM0 is the selection indicator for the phrase INLINEFORM1 , INLINEFORM2 is the salience scores of INLINEFORM3 , INLINEFORM4 and INLINEFORM5 is co-occurrence indicator and the similarity a pair of phrases ( INLINEFORM6 , INLINEFORM7 ) respectively. The similarity is calculated with the Jaccard Index based method. In order to obtain coherent summaries with good readability, we add some constraints into the ILP framework. For details, please refer to BIBREF14 , BIBREF5 , 

{'text': ['To evaluate the performance of our dataset and the proposed framework RAVAESum for RA-MDS, we compare our model with the following methods:', 'RA-Sparse BIBREF9 : It is a framework to tackle the RA-MDS problem. A sparse-coding-based method is used to calculate the salience of the news sentences by jointly considering news documents and reader comments.', 'Lead BIBREF17 : It ranks the news sentences chronologically and extracts the leading sentences one by one until the length limit.', 'Centroid BIBREF18 : It summarizes clusters of news articles automatically grouped by a topic detection system, and then it uses information from the centroids of the clusters to select sentences.', 'LexRank BIBREF1 and TextRank BIBREF19 : Both methods are graph-based unsupervised framework for sentence salience estimation based on PageRank algorithm.', 'Concept BIBREF5 : It generates abstractive summaries using phrase-based optimization framework with concept weight as salience estimation. The

In [377]:
sents_num_citations_sections = []
for num_key,keys in enumerate(latex_parse_overview.keys()):
    print(num_key,keys)
    if num_key>=2:
        break
    sents_num_citations_section = []
    for num_text, text in enumerate(latex_parse_overview[keys]['text']):
        sentences_sec = text2sentences(text)
        cite_spans_sec = latex_parse_overview[keys]['cite_spans'][num_text]
        cite_spans_sec_time = np.array(cite_spans_sec.copy())
        sum_prev_sects = 0
        sents_num_citations = []
        del_bib_start = []
        for num_sent,sentence in enumerate(sentences_sec):
            cite_spans_sec_time = [cite_span  for cite_span in  cite_spans_sec_time if not cite_span['start'] in del_bib_start]
            print('sentence',sentence)
#             print(cite_spans_sec_time)
            len_sent = len(sentence)+1
            sent_num_cits = 0
            for num_cite_span,cite_span in enumerate(cite_spans_sec_time):
#                 print('cite_span',cite_span,)
                if  cite_span['text']!=None:
                    temp = cite_span['text']
                elif cite_span['ref_id']:
                    temp = cite_span['ref_id']
                else:
                    print('ERROR')
                    break
                if len(temp)<4:
                    temp = ' '+temp+' '
                    if (cite_span['start'] >= sum_prev_sects and cite_span['end'] <=(sum_prev_sects+len_sent)) or (temp in sentence):
                        sent_num_cits+=1
                        del_bib_start.append(cite_span['start'])
                        print('---')
                        print(cite_span)
                        print(num_sent,sentence)
                        print('---')
                elif (cite_span['start'] >= (sum_prev_sects-2) and (cite_span['end']-3) <=(sum_prev_sects+len_sent)) or (temp in sentence) or (temp.replace('.','') in sentence):
                    sent_num_cits+=1
                    del_bib_start.append(cite_span['start'])
                    print(cite_span)
                    print(num_sent,sentence)
                    print('---')
            sents_num_citations.append(sent_num_cits)
            sum_prev_sects += len_sent
            print('final len=',sum(sents_num_citations), len(cite_spans_sec))
        sents_num_citations_section += sents_num_citations
        
        try:
            assert sum(sents_num_citations) == len(cite_spans_sec)
            print(20*'==')
        except:
            print('!!!!',num_sec)
    sents_num_citations_sections.append(sents_num_citations_section)
    print(sents_num_citations_sections)
    print(20*'==')

0 Introduction
sentence The goal of multi-document summarization (MDS) is to automatically generate a brief, well-organized summary for a topic which describes an event with a set of documents from different sources
final len= 0 7
sentence  BIBREF0 , BIBREF1 , BIBREF2 , BIBREF3 , BIBREF4 , BIBREF5 , BIBREF6 
{'start': 193, 'end': 200, 'text': None, 'latex': None, 'ref_id': 'BIBREF0'}
1  BIBREF0 , BIBREF1 , BIBREF2 , BIBREF3 , BIBREF4 , BIBREF5 , BIBREF6 
---
{'start': 203, 'end': 210, 'text': None, 'latex': None, 'ref_id': 'BIBREF1'}
1  BIBREF0 , BIBREF1 , BIBREF2 , BIBREF3 , BIBREF4 , BIBREF5 , BIBREF6 
---
{'start': 213, 'end': 220, 'text': None, 'latex': None, 'ref_id': 'BIBREF2'}
1  BIBREF0 , BIBREF1 , BIBREF2 , BIBREF3 , BIBREF4 , BIBREF5 , BIBREF6 
---
{'start': 223, 'end': 230, 'text': None, 'latex': None, 'ref_id': 'BIBREF3'}
1  BIBREF0 , BIBREF1 , BIBREF2 , BIBREF3 , BIBREF4 , BIBREF5 , BIBREF6 
---
{'start': 233, 'end': 240, 'text': None, 'latex': None, 'ref_id': 'BIBREF4'}
1

sentence As shown in Figure FIGREF7 , our reader-aware news sentence salience framework has three main components: (1) latent semantic modeling; (2) comment weight estimation; (3) joint reconstruction
final len= 0 1
sentence  Consider a dataset INLINEFORM0 and INLINEFORM1 consisting of INLINEFORM2 news sentences and INLINEFORM3 comment sentences respectively from all the documents in a topic (event), represented by bag-of-words vectors
final len= 0 1
sentence  Our proposed news sentence salience estimation framework is extended from VAESum BIBREF6 , which can jointly consider news documents and reader comments
{'start': 489, 'end': 496, 'text': None, 'latex': None, 'ref_id': 'BIBREF6'}
2  Our proposed news sentence salience estimation framework is extended from VAESum BIBREF6 , which can jointly consider news documents and reader comments
---
final len= 1 1
sentence  One extension is that, in order to absorb more useful information and filter the noisy data from comments, we design a w

In [378]:
def sect_num_citations_latex(latex_parse_overview_key):
#     for num_key,keys in enumerate(latex_parse_overview.keys()):
#     print(num_key,keys)
    sents_num_citations_section = []
    for num_text, text in enumerate(latex_parse_overview_key['text']):
        sentences_sec = text2sentences(text)
        cite_spans_sec = latex_parse_overview_key['cite_spans'][num_text]
        cite_spans_sec_time = np.array(cite_spans_sec.copy())
        sum_prev_sects = 0
        sents_num_citations = []
        del_bib_start = []
        for num_sent,sentence in enumerate(sentences_sec):
            cite_spans_sec_time = [cite_span  for cite_span in  cite_spans_sec_time if not cite_span['start'] in del_bib_start]
            len_sent = len(sentence)+1
            sent_num_cits = 0
            for num_cite_span,cite_span in enumerate(cite_spans_sec_time):
                if  cite_span['text']!=None:
                    temp = cite_span['text']
                elif cite_span['ref_id']:
                    temp = cite_span['ref_id']
                else:
                    print('ERROR')
                    break
                if len(temp)<4:
                    temp = ' '+temp+' '
                    if (cite_span['start'] >= sum_prev_sects and cite_span['end'] <=(sum_prev_sects+len_sent)) or (temp in sentence) or (temp.replace('.','') in sentence):
                        sent_num_cits+=1
                        del_bib_start.append(cite_span['start'])
                elif (cite_span['start'] >= (sum_prev_sects-2) and (cite_span['end']-3) <=(sum_prev_sects+len_sent)) or (temp in sentence) or (temp.replace('.','') in sentence):
                    sent_num_cits+=1
                    del_bib_start.append(cite_span['start'])
    #                 print(cite_span)
    #                 print(num_sent,sentence)
    #                 print('---')
            sents_num_citations.append(sent_num_cits)
            sum_prev_sects += len_sent
        sents_num_citations_section += sents_num_citations
        # Checking соотвествия кол-ва найденных ссылок в предложениях и кол-ва всех ссылок
        try:
            assert sum(sents_num_citations) == len(cite_spans_sec)
        except:
            print(num_art,num_sec)
    return sents_num_citations_section

In [379]:
sect_num_citations_latex(latex_parse_overview['Introduction'])

[0,
 7,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 1,
 0,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [380]:
def make_latex_overview(article):
    latex_parse_overview = dict()
    for sections in article['latex_parse']['body_text']:
        if sections['section'] == None:
            sections['section'] = 'other'
        if sections['section'] in latex_parse_overview:
            # если есть дублирование, такое бывает у первых часте
            if latex_parse_overview[sections['section']] == sections:
                continue
            else:
                latex_parse_overview[sections['section']]['text'].append(sections['text'])
                latex_parse_overview[sections['section']]['cite_spans'].append(sections['cite_spans'])
                latex_parse_overview[sections['section']]['cite_span_lens'].append(len(sections['cite_spans']))
                latex_parse_overview[sections['section']]['cite_spans_start'].append(list(map(lambda x: int(x['start']),sections['cite_spans'])))
                latex_parse_overview[sections['section']]['section'].append(sections['section'])
    #             latex_parse_overview[sections['section']]['section'].append(sections['section'])
        else:
            latex_parse_overview[sections['section']] = {'text':[sections['text']],   'cite_spans':[sections['cite_spans']],
                                                          'cite_span_lens':[len(sections['cite_spans'])],
                                                          'cite_spans_start':[list(map(lambda x: int(x['start']),sections['cite_spans']))],
                                                          'section':[sections['section']]}#, 
                                                          #'latex_parse':article['latex_parse']}

    return latex_parse_overview

In [381]:
make_latex_overview(all_articles[70])

{'other': {'text': [' '],
  'cite_spans': [[]],
  'cite_span_lens': [0],
  'cite_spans_start': [[]],
  'section': ['other']}}

In [382]:
def paper_num_citations_latex(article):
    if article['latex_parse'] and article['latex_parse']['body_text']:
        latex_parse_overview = make_latex_overview(article)
        papers_num_citations = []
#         for num_sec,section in enumerate(article['latex_parse']['body_text']):
        for num_key,keys in enumerate(latex_parse_overview.keys()):
            sents_num_citations = sect_num_citations_latex(latex_parse_overview[keys])
            papers_num_citations.append(sents_num_citations)
        return papers_num_citations
    else:
        return [-1]

In [383]:
paper_num_citations_latex(all_articles[70])

[[]]

In [384]:
[len(text) for text in latex_parse_overview['Introduction']['text']]

[489, 918, 1112, 973, 357, 456]

In [385]:
latex_parse_overview['Introduction']['cite_spans_start']

[[193, 203, 213, 223, 233, 243, 253],
 [],
 [527, 537, 802],
 [10, 159, 170],
 [],
 []]

In [386]:
len_blocks = [len(text) for text in latex_parse_overview['Introduction']['text']]
start_by_all = []
for num_block,start_block in enumerate(latex_parse_overview['Introduction']['cite_spans_start']):
    for start in start_block:
        new_start = start
        new_start += sum(len_blocks[:num_block])
        start_by_all.append(new_start)
        print(start,new_start)
    print(10*'===')

193 193
203 203
213 213
223 223
233 233
243 243
253 253
527 1934
537 1944
802 2209
10 2529
159 2678
170 2689


In [387]:
def upgrade_start_span_latex(latex_parse_overview_key):
    len_blocks = [len(text) for text in latex_parse_overview_key['text']]
    start_by_all = []
    for num_block,start_block in enumerate(latex_parse_overview_key['cite_spans_start']):
        for start in start_block:
            new_start = start
            new_start += sum(len_blocks[:num_block])
            start_by_all.append(new_start)

    return start_by_all

In [388]:
upgrade_start_span_latex(latex_parse_overview['Introduction'])

[193, 203, 213, 223, 233, 243, 253, 1934, 1944, 2209, 2529, 2678, 2689]

In [389]:
''.join(latex_parse_overview['Introduction']['text'])[2529:]

'BIBREF6 proposed a sentence salience estimation framework known as VAESum based on a neural generative model called Variational Auto-Encoders (VAEs) BIBREF10 , BIBREF11 . During our investigation, we find that the Gaussian based VAEs have a strong ability to capture the salience information and filter the noise from texts. Intuitively, if we feed both the news sentences and the comment sentences into the VAEs, commonly existed latent aspect information from both of them will be enhanced and become salient. Inspired by this consideration, to address the sentence salience estimation problem for RA-MDS by jointly considering news documents and reader comments, we extend the VAESum framework by training the news sentence latent model and the comment sentence latent model simultaneously by sharing the neural parameters. After estimating the sentence salience, we employ a phrase based compressive unified optimization framework to generate a final summary.There is a lack of high-quality data

In [390]:
latex_parse_overview['Introduction']['cite_span_lens']

[7, 0, 3, 3, 0, 0]

In [391]:
def make_features_paper_latex(article):

    if article['latex_parse'] and article['latex_parse']['body_text']:
        if len(article['latex_parse']['body_text']) <= 2:
            if len(article['latex_parse']['body_text'][0]['text'].split()) < 1 :
                #or not article['latex_parse']['body_text'][0]['section']
                # не использую условие выше тк иначе уменьшится выборка (навряд ли в ней есть RW)
                return [-1]
        latex_parse_overview = make_latex_overview(article)
        section_num_citations = []
        section_names = []
        section_cite_dencite = []
        section_position = []
        section_av_cit_pos = []
        
#         for num_sec,section in enumerate(article['latex_parse']['body_text']):

        len_sects = len(latex_parse_overview)
        for num_key,keys in enumerate(latex_parse_overview.keys()):
            
            section_names.append(keys)
            all_text = ''.join(latex_parse_overview[keys]['text'])
            
            cnt_citations = sum(latex_parse_overview[keys]['cite_span_lens'])
            sect_len = len(all_text)
            sect_len_toks = len(all_text.split())
            
            cite_spans_sec_start = upgrade_start_span_latex(latex_parse_overview[keys])
            cite_spans_sec_start_mean = 0
            if len(cite_spans_sec_start) >=1:
                cite_spans_sec_start_mean = np.mean(cite_spans_sec_start)/sect_len
                
            sents_num_citations = sect_num_citations_latex(latex_parse_overview[keys])
            
            section_num_citations.append(sents_num_citations)
            section_cite_dencite.append(cnt_citations/sect_len_toks)
            section_position.append((num_key+1)/len_sects)
            section_av_cit_pos.append(cite_spans_sec_start_mean)
            print(20*'==')
            print('Features: \nciting dencity {0}: #citations={1} & sect_len_toks={2}|  '.format(cnt_citations/sect_len_toks, cnt_citations, sect_len_toks))
            print('Number of citations in sentences: {0}'.format(sents_num_citations))
            print('Positon sect: {0} : num_sec = {1} len_sects={2}'.format((num_key+1)/len_sects,num_key+1,len_sects))
            print('Positon aver. citation: {0} {1} len= {2}'.format(cite_spans_sec_start_mean,cite_spans_sec_start,sect_len))
            print('Section name:',keys)
        papers_features = {'cite_dencity':section_cite_dencite,'num_cits':section_num_citations,
                           'sec_pos':section_position,'sec_av_cit_pos':section_av_cit_pos,'sec_name':section_names}
        return papers_features
    else:
        return [-1]

In [403]:
article = all_articles[1007]
latex_parse_overview = dict()
for sections in article['latex_parse']['body_text']:
    latex_parse_overview[sections['section']] = sections
    print(sections['section'],len(sections['cite_spans']))
    print(sections)
    print(10*'==')

Introduction 5
{'text': 'Text simplification (TS) addresses the translation of an input sentence into one or more simpler sentences. It is a useful preprocessing step for several NLP tasks, such as machine translation BIBREF0 , BIBREF1 and relation extraction BIBREF2 , and has also been shown useful in the development of reading aids, e.g., for people with dyslexia BIBREF3 or non-native speakers BIBREF4 .', 'cite_spans': [{'start': 193, 'end': 200, 'text': None, 'latex': None, 'ref_id': 'BIBREF0'}, {'start': 203, 'end': 210, 'text': None, 'latex': None, 'ref_id': 'BIBREF1'}, {'start': 235, 'end': 242, 'text': None, 'latex': None, 'ref_id': 'BIBREF2'}, {'start': 343, 'end': 350, 'text': None, 'latex': None, 'ref_id': 'BIBREF3'}, {'start': 374, 'end': 381, 'text': None, 'latex': None, 'ref_id': 'BIBREF4'}], 'ref_spans': [], 'eq_spans': [], 'section': 'Introduction'}
Introduction 6
{'text': 'The task has attracted much attention in the past decade BIBREF5 , BIBREF6 , BIBREF7 , BIBREF8 , B

Introduction 0
{'text': '', 'cite_spans': [], 'ref_spans': [], 'eq_spans': [], 'section': 'Introduction'}
Related Work 0
{'text': '', 'cite_spans': [], 'ref_spans': [], 'eq_spans': [], 'section': 'Related Work'}
UCCA's Semantic Structures 7
{'text': ' In this section we will briefly describe the UCCA scheme, focusing on the concepts of Scenes and Centers which are key in the definition of SAMSA. UCCA BIBREF13 is a semantic annotation scheme based on typological BIBREF35 , BIBREF36 , BIBREF37 and cognitive BIBREF38 theories which aims to represent the main semantic phenomena in the text, abstracting away from syntactic detail. UCCA structures are directed acyclic graphs whose nodes (or units) correspond either to the leaves of the graph (including the words of the text) or to several elements jointly viewed as a single entity according to some semantic or cognitive consideration. Unlike AMR, UCCA semantic units are directly anchored in the text BIBREF39 , BIBREF15 , which allows easy in

{'text': ' INLINEFORM0 : in this case, we compute the maximal Many-to-1 correspondence between Scenes and sentences. A Scene is matched to a sentence in the following way. We say that a leaf INLINEFORM1 in a Scene INLINEFORM2 is consistent in a Scene-sentence mapping INLINEFORM3 which maps INLINEFORM4 to a sentence INLINEFORM5 , if there is a word INLINEFORM6 which INLINEFORM7 aligns to (according to the word alignment INLINEFORM8 ). The score of matching a Scene INLINEFORM9 to a sentence INLINEFORM10 is then defined to be the total number of consistent leaves in INLINEFORM11 . We traverse the Scenes in their order of occurrence in the text, selecting for each the sentence that maximizes the score. If INLINEFORM12 , once a sentence is matched to a Scene, it cannot be matched to another one. Ties between sentences are broken towards the sentence that appeared first in the output.', 'cite_spans': [], 'ref_spans': [], 'eq_spans': [{'start': 1, 'end': 12, 'text': 'n inp ≥n out ', 'latex': 

{'text': 'The role of the non-splitting penalty term INLINEFORM0 in the SAMSA formula is to penalize cases where the number of sentences in the output is smaller than the number of Scenes. In order to isolate the effect of the non-splitting penalty, we experiment with an additional metric SAMSA INLINEFORM1 (reads “SAMSA ablated”), which is identical to SAMSA but does not take this term into account. Corpus-level SAMSA and SAMSA INLINEFORM2 scores are obtained by averaging their sentence scores.', 'cite_spans': [], 'ref_spans': [], 'eq_spans': [{'start': 43, 'end': 54, 'text': 'n out /n inp ', 'latex': 'n_{out}/n_{inp}', 'ref_id': None}, {'start': 286, 'end': 297, 'text': ' abl ', 'latex': '_{abl}', 'ref_id': None}, {'start': 422, 'end': 433, 'text': ' abl ', 'latex': '_{abl}', 'ref_id': None}], 'section': 'Score Computation'}
Score Computation 1
{'text': 'In the case of implicit units i.e. omitted units that do not appear explicitly in the text BIBREF13 , since the unit preservation ca

{'text': "Inter-annotator agreement rates are computed in two ways. Table TABREF23 presents the absolute agreement and Cohen's quadratic weighted INLINEFORM0 BIBREF41 . Table TABREF24 presents Spearman's correlation ( INLINEFORM1 ) between the human ratings of the input-output pairs (top row), and between the resulting system scores (bottom row). In both cases, the agreement between the five annotators is computed as the average agreement over the 10 annotator pairs.", 'cite_spans': [{'start': 148, 'end': 156, 'text': None, 'latex': None, 'ref_id': 'BIBREF41'}], 'ref_spans': [{'start': 64, 'end': 72, 'text': None, 'latex': None, 'ref_id': 'TABREF23'}, {'start': 165, 'end': 173, 'text': None, 'latex': None, 'ref_id': 'TABREF24'}], 'eq_spans': [{'start': 136, 'end': 147, 'text': 'κ', 'latex': '\\kappa ', 'ref_id': None}, {'start': 208, 'end': 219, 'text': 'ρ', 'latex': '\\rho ', 'ref_id': None}], 'section': 'Inter-annotator Agreement'}
Inter-annotator Agreement 0
{'text': '', 'cite_spans

Evaluation on the QATS Benchmark 0
{'text': '', 'cite_spans': [], 'ref_spans': [], 'eq_spans': [], 'section': 'Evaluation on the QATS Benchmark'}
Conclusion 0
{'text': ' We presented the first structure-aware metric for text simplification, SAMSA, and the first evaluation experiments that directly target the structural simplification component, separately from the lexical component. We argue that the structural and lexical dimensions of simplification are loosely related, and that TS evaluation protocols should assess both. We empirically demonstrate that strong measures that assess lexical simplification quality (notably SARI), fail to correlate with human judgments when structural simplification is performed by the evaluated systems. Our experiments show that SAMSA correlates well with human judgments in such settings, which demonstrates its usefulness for evaluating and tuning statistical simplification systems, and shows that structural evaluation provides a complementary perspecti

In [404]:
article['latex_parse']

{'abstract': [],
 'body_text': [{'text': 'Text simplification (TS) addresses the translation of an input sentence into one or more simpler sentences. It is a useful preprocessing step for several NLP tasks, such as machine translation BIBREF0 , BIBREF1 and relation extraction BIBREF2 , and has also been shown useful in the development of reading aids, e.g., for people with dyslexia BIBREF3 or non-native speakers BIBREF4 .',
   'cite_spans': [{'start': 193,
     'end': 200,
     'text': None,
     'latex': None,
     'ref_id': 'BIBREF0'},
    {'start': 203,
     'end': 210,
     'text': None,
     'latex': None,
     'ref_id': 'BIBREF1'},
    {'start': 235,
     'end': 242,
     'text': None,
     'latex': None,
     'ref_id': 'BIBREF2'},
    {'start': 343,
     'end': 350,
     'text': None,
     'latex': None,
     'ref_id': 'BIBREF3'},
    {'start': 374,
     'end': 381,
     'text': None,
     'latex': None,
     'ref_id': 'BIBREF4'}],
   'ref_spans': [],
   'eq_spans': [],
   'sect

In [396]:
latex_nums  = [num_art  for num_art,article in enumerate(all_articles) if article['latex_parse'] and article['latex_parse']['body_text']]
len(latex_nums)

3868

In [397]:
for num_l in latex_nums:
    if num_l > 1000:
        break
    if len(all_articles[num_l]['latex_parse']['body_text']) <= 2:
        if len(all_articles[num_l]['latex_parse']['body_text'][0]['text'].split()) < 1 or not all_articles[num_l]['latex_parse']['body_text'][0]['section']:
            print('Delete them')
            print(num_l)
            print(all_articles[num_l]['latex_parse']['body_text'])
            print(20*'==')
        else:
            print(num_l)
            print(all_articles[num_l]['latex_parse']['body_text'])
            print(20*'==')

38
[{'text': 'We thank Kyle Richardson, Vivek Srikumar and the anonymous reviewers for their constructive feedback. This work was completed in partial fulfillment for the PhD degree of the first author. Herzig was supported by a Google PhD fellowship. This research was partially supported by The Israel Science Foundation grant 942/16 and The Blavatnik Computer Science Research Fund.', 'cite_spans': [], 'ref_spans': [], 'eq_spans': [], 'section': 'Acknowledgments'}]
Delete them
70
[{'text': ' ', 'cite_spans': [], 'ref_spans': [], 'eq_spans': [], 'section': 'other'}]
Delete them
96
[{'text': 'This work was partially supported by Berkeley AI Research, the NSF and DARPA XAI. DF is supported by a Tencent AI Lab Fellowship.', 'cite_spans': [], 'ref_spans': [], 'eq_spans': [], 'section': None}]
160
[{'text': 'We thank all the anonymous reviewers for their insightful comments on this paper. This work was partially supported by National Natural Science Foundation of China (61572049 and 61333018

[{'text': 'Seen in Table TABREF2 is a list of hyperparameters for our deep entity resolution models. We use the same hyperparameters regardless of scenario and dataset. We initialize the 300 dimensional word embeddings by the character-based pretrained fastText vectors publicly available.', 'cite_spans': [], 'ref_spans': [{'start': 14, 'end': 21, 'text': None, 'latex': None, 'ref_id': 'TABREF2'}], 'eq_spans': [], 'section': 'Deep ER Hyperparameters'}, {'text': 'Magellan BIBREF1 is an open-source package that provides state-of-the-art learning-based algorithms for ER. We use the package to run the following 6 learning algorithms for baselines: Decision Tree, SVM, Random Forest, Naive Bayes, Logistic Regression, and Linear Regression. For each attribute in the schema, we apply the following similarity functions: q-gram jaccard, cosine distance, Levenshtein disntance, Levenshtein similairty, Monge-Elkan measure, and exact matching.', 'cite_spans': [{'start': 9, 'end': 16, 'text': None, 'l

[{'text': ' In this paper, we explored the task of describing the visual relationship between two images. We collected the Image Editing Request dataset, which contains image pairs and human annotated editing instructions. We designed novel relational speaker models and evaluate them on our collected and other public existing dataset. Based on automatic and human evaluations, our relational speaker model improves the ability to capture visual relationships. For future work, we are going to further explore the possibility to merge the three datasets by either learning a joint image representation or by transferring domain-specific knowledge. We are also aiming to enlarge our Image Editing Request dataset with newly-released posts on Reddit and Zhopped.', 'cite_spans': [], 'ref_spans': [], 'eq_spans': [], 'section': 'Conclusion'}, {'text': 'We thank the reviewers for their helpful comments and Nham Le for helping with the initial data collection. This work was supported by Adobe, ARO-YIP

In [398]:
make_features_paper_latex(all_articles[70])

[-1]

In [408]:
def make_features_all_papers_latex(all_articles,verbose=False,with_empty=False):
    section_num_citations = []
    section_names = []
    section_cite_dencite = []
    section_position = []
    section_av_cit_pos = []
    article_paper_ids = []
    for num_artic,article in enumerate(all_articles):
        if article['latex_parse'] and article['latex_parse']['body_text']:
            if len(article['latex_parse']['body_text']) <= 2:
                if len(article['latex_parse']['body_text'][0]['text'].split()) < 1 :
                    #or not article['latex_parse']['body_text'][0]['section']
                    # не использую условие выше тк иначе уменьшится выборка (навряд ли в ней есть RW)
                    if with_empty:
                        article_paper_ids.append(article['paper_id'])
                        section_names.append(keys)
                        section_num_citations.append(-1)
                        section_cite_dencite.append(-1)
                        section_position.append(-1)
                        section_av_cit_pos.append(-1)
                    else:
                        continue
            print('---\n{}'.format(num_artic))
            
            latex_parse_overview = make_latex_overview(article)
            
            len_sects = len(latex_parse_overview)
            for num_key,keys in enumerate(latex_parse_overview.keys()):
                article_paper_ids.append(article['paper_id'])
                section_names.append(keys)
                all_text = ''.join(latex_parse_overview[keys]['text'])

                cnt_citations = sum(latex_parse_overview[keys]['cite_span_lens'])
                sect_len = len(all_text)
                sect_len_toks = len(all_text.split())
                # если у нас мусор в тексте
                if sect_len_toks == 0:
                    cnt_citations = 0
                    sect_len_toks = 1 
                cite_spans_sec_start = upgrade_start_span_latex(latex_parse_overview[keys])
                cite_spans_sec_start_mean = 0
                if len(cite_spans_sec_start) >=1:
                    cite_spans_sec_start_mean = np.mean(cite_spans_sec_start)/sect_len

                sents_num_citations = sect_num_citations_latex(latex_parse_overview[keys])

                section_num_citations.append(sents_num_citations)
                section_cite_dencite.append(cnt_citations/sect_len_toks)
                section_position.append((num_key+1)/len_sects)
                section_av_cit_pos.append(cite_spans_sec_start_mean)
                if verbose:
                    print(20*'==')
                    print('Features: \nciting dencity {0}: #citations={1} & sect_len_toks={2}|  '.format(cnt_citations/sect_len_toks, cnt_citations, sect_len_toks))
                    print('Number of citations in sentences: {0}'.format(sents_num_citations))
                    print('Positon sect: {0} : num_sec = {1} len_sects={2}'.format((num_key+1)/len_sects,num_key+1,len_sects))
                    print('Positon aver. citation: {0} {1} len= {2}'.format(cite_spans_sec_start_mean,cite_spans_sec_start,sect_len))
                    print('Section name:',keys)
        else:
            if with_empty:
                article_paper_ids.append(article['paper_id'])
                section_names.append(keys)
                section_num_citations.append(-1)
                section_cite_dencite.append(-1)
                section_position.append(-1)
                section_av_cit_pos.append(-1)
                
            else:
                continue
    papers_features = {'paper_id':article_paper_ids,'sec_name':section_names ,'cite_dencity':section_cite_dencite,
                       'num_cits':section_num_citations,'sec_pos':section_position,'sec_av_cit_pos':section_av_cit_pos}
    return papers_features
        

In [409]:
latex_features = make_features_all_papers_latex(all_articles)

---
0
---
6
---
37
---
38
---
53
---
71
---
72
---
76
---
96
---
99
---
100
---
104
---
109
---
123
---
134
---
135
---
155
---
160
---
163
---
179
---
201
---
218
---
223
---
225
---
233
---
239
---
249
---
253
---
257
---
267
---
273
---
291
---
302
---
304
---
308
---
309
---
317
---
321
---
329
---
331
---
335
---
337
---
339
---
346
---
347
---
348
---
349
---
355
---
373
---
375
---
399
---
404
---
406
---
409
---
454
---
458
---
461
---
463
---
475
---
480
---
485
---
496
---
504
---
520
---
528
---
529
0 13
---
532
---
533
---
537
---
545
---
549
---
552
---
564
---
566
---
592
---
598
---
601
---
624
---
627
---
644
---
651
---
652
---
653
---
662
---
669
---
682
---
685
---
690
---
693
---
696
---
700
---
713
---
725
---
735
---
752
---
757
---
759
---
773
---
797
---
811
---
823
---
827
---
828
---
834
---
846
---
869
---
886
---
892
---
911
---
920
---
945
---
983
---
989
---
996
---
1007
---
1012
---
1031
---
1035
---
1045
---
1047
---
1051
---
1062
---
1063
---
1070
---
1

8751
---
8752
---
8755
---
8757
---
8758
---
8766
---
8777
---
8792
---
8794
---
8802
---
8814
---
8815
---
8836
---
8837
---
8841
---
8842
---
8856
---
8871
---
8874
---
8878
---
8879
---
8880
---
8885
---
8895
---
8896
---
8898
---
8913
---
8925
---
8944
---
8955
---
8957
---
8987
---
9003
---
9004
---
9014
---
9025
---
9041
---
9049
---
9059
---
9067
---
9070
---
9079
---
9110
---
9150
---
9157
---
9180
---
9183
---
9212
---
9217
---
9234
---
9238
---
9251
---
9299
---
9306
---
9313
---
9321
---
9326
---
9329
---
9331
---
9338
---
9357
---
9435
---
9442
---
9460
---
9478
---
9489
---
9501
---
9504
---
9506
---
9511
---
9518
---
9536
---
9543
---
9562
---
9563
---
9571
---
9572
---
9578
---
9609
---
9649
---
9657
---
9658
---
9683
---
9690
---
9706
---
9707
---
9714
---
9722
---
9727
---
9739
---
9741
---
9742
---
9770
---
9782
---
9839
---
9845
---
9847
---
9851
---
9855
---
9887
---
9897
---
9903
---
9915
---
9943
---
9966
---
9969
---
9988
---
10006
---
10024
---
10025
---
10030
-

17298
---
17305
---
17316
---
17337
---
17338
---
17344
---
17346


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)




---
21431
---
21435
---
21436
---
21441
---
21447
---
21458
---
21464
---
21468
---
21480
---
21481
---
21483
---
21500
---
21525
---
21534
---
21540
---
21555
---
21564
---
21566
---
21573
---
21592
---
21593
---
21623
---
21627
---
21629
---
21651
---
21663
---
21669
---
21677
---
21680
---
21682
---
21686
---
21690
---
21712
---
21720
---
21738
---
21739
---
21744
---
21746
---
21755
---
21758
---
21774
---
21788
---
21797
---
21823
---
21824
---
21828
---
21830
---
21832
---
21836
---
21861
---
21864
---
21872
---
21900
---
21920
---
21943
---
21964
---
21974
---
21978
---
21994
---
22024
---
22030
---
22039
---
22045
---
22046
---
22052
---
22053
---
22055
---
22082
---
22145
---
22151
---
22160
---
22163
---
22175
---
22177
---
22178
---
22184
---
22189
---
22201
---
22215
---
22218
---
22221
---
22232
---
22236
---
22245
---
22283
---
22312
---
22323
---
22328
---
22343
---
22355
---
22358
---
22376
---
22403
---
22404
---
22408
---
22411
---
22412
---
22415
---
22418
---
22420

29537
---
29555
---
29560
---
29565
---
29572
---
29574
---
29585
---
29586
---
29604
---
29609
---
29619
---
29631
---
29642
---
29648
---
29660
---
29673
---
29676
---
29690
---
29699
---
29712
---
29715
---
29735
---
29747
---
29757
---
29773
---
29776
---
29777
---
29788
---
29790
---
29797
---
29809
---
29832
---
29882
---
29892
---
29920
---
29946
---
29985
---
29995
---
30002
---
30007
---
30013
---
30043
---
30076
---
30089
---
30091
---
30092
---
30125
---
30141
---
30144
---
30167
---
30187
---
30188
---
30190
---
30194
---
30206
---
30207
---
30213
---
30219
---
30239
---
30253
---
30280
---
30281
---
30287
---
30306
---
30311
---
30312
---
30314
---
30325
---
30338
---
30351
---
30356
---
30374
---
30390
---
30415
---
30423
---
30446
---
30512
---
30523
---
30536
---
30537
---
30557
---
30565
---
30580
---
30584
---
30600
---
30601
---
30611
---
30617
---
30642
---
30643
---
30644
---
30654
---
30671
---
30673
---
30681
---
30729
---
30733
---
30738
---
30747
---
30750
---


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [412]:
df_latex_features = pd.DataFrame(latex_features)
df_latex_features.head(20)

Unnamed: 0,paper_id,sec_name,cite_dencity,num_cits,sec_pos,sec_av_cit_pos
0,10164018,Introduction,0.020093,"[0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.066667,0.277745
1,10164018,Overview,0.007519,"[0, 0, 1, 0, 0]",0.133333,0.529221
2,10164018,Reader-Aware Salience Estimation,0.007557,"[2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.2,0.201558
3,10164018,Summary Construction,0.032967,"[2, 0, 0, 0, 0, 0, 3, 0, 1, 0]",0.266667,0.585332
4,10164018,Data Description,0.0,"[0, 0]",0.333333,0.0
5,10164018,Background,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.4,0.0
6,10164018,Data Collection,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.466667,0.0
7,10164018,Data Properties,0.0,"[0, 0, 0, 0, 0, 0, 0, 0]",0.533333,0.0
8,10164018,Dataset and Metrics,0.03125,"[0, 1, 0]",0.6,0.55665
9,10164018,Comparative Methods,0.034091,"[0, 1, 0, 1, 1, 2, 1, 0, 0, 0, 0]",0.666667,0.442059


##### мы должны выделить только обзорную часть из текста, а все остальноё сохранить

In [93]:
overview_papers[all_articles[0]['paper_id']]['grobid_parse'] = {'abstract':all_articles[0]['grobid_parse']['abstract'],
                                                                'overview_text':grobid_parse_overview,  
                                                                'bib_entries':all_articles[0]['grobid_parse']['bib_entries']}
overview_papers[all_articles[0]['paper_id']]['latex_parse'] =  {'abstract':all_articles[0]['latex_parse']['abstract'],
                                                                'overview_text':latex_parse_overview,  
                                                                'bib_entries':all_articles[0]['latex_parse']['bib_entries']}

In [94]:
overview_papers

{'10164018': {'paper_id': '10164018',
  'metadata': {'title': 'Reader-Aware Multi-Document Summarization: An Enhanced Model and The First Dataset',
   'authors': [{'first': 'Piji', 'middle': [], 'last': 'Li', 'suffix': ''},
    {'first': 'Lidong', 'middle': [], 'last': 'Bing', 'suffix': ''},
    {'first': 'Wai', 'middle': [], 'last': 'Lam', 'suffix': ''}],
   'abstract': 'We investigate the problem of reader-aware multi-document summarization (RA-MDS) and introduce a new dataset for this problem. To tackle RA-MDS, we extend a variational auto-encodes (VAEs) based MDS framework by jointly considering news documents and reader comments. To conduct evaluation for summarization performance, we prepare a new dataset. We describe the methods for data collection, aspect annotation, and summary writing as well as scrutinizing by experts. Experimental results show that reader comments can improve the summarization performance, which also demonstrates the usefulness of the proposed dataset. The 

In [97]:
for num_tex,(k,v) in enumerate(latex_parse_overview.items()):
    print(num_tex,k,sum(v['cite_span_lens']))

0 Introduction 13
1 Overview 1
2 Reader-Aware Salience Estimation 6
3 Summary Construction 6
4 Data Description 0
5 Background 0
6 Data Collection 0
7 Data Properties 0
8 Dataset and Metrics 1
9 Comparative Methods 6
10 Experimental Settings 2
11 Results on Our Dataset 0
12 Further Investigation of Our Framework  2
13 Case Study 0
14 Conclusions 0


In [98]:
for k,v in grobid_parse_overview.items():
    print(k,len(v['cite_spans']))

0 13
1 1
2 6
3 3
4 0
5 0
6 0
7 0
8 1
9 4
10 2
11 2
12 0
13 1


Упорядочим по количеству ссылок и возьмём абзац с максимальным значением, а также со 2 максимальным значением если количество ссылок в нём больше половины от максимального кол-ва 

In [99]:
grobid_parse_overview = {k: v for k, v in sorted(grobid_parse_overview.items(), 
                                                 key=lambda item: len(item[1]['cite_spans']), reverse=True)}

In [100]:
max_cite_span_sum = 0
max_grobid_parse_overview = dict()
for k,v in grobid_parse_overview.items():
    if max_cite_span_sum < len(v['cite_spans']):
        max_cite_span_sum = len(v['cite_spans'])
        max_grobid_parse_overview[k] = v
    elif (max_cite_span_sum>7) and len(v['cite_spans'])>max_cite_span_sum//2:
        max_grobid_parse_overview[k] = v

In [101]:
max_grobid_parse_overview

{0: {'text': 'The goal of multi-document summarization (MDS) is to automatically generate a brief, wellorganized summary for a topic which describes an event with a set of documents from different sources. (Goldstein et al., 2000; Erkan and Radev, 2004; Wan et al., 2007; Nenkova and McKeown, 2012; Min et al., 2012; Li et al., 2017) . In the typical setting of MDS, the input is a set of news documents about the same topic. The output summary is a piece of short text document containing several sentences, generated only based on the input original documents.With the development of social media and mobile equipments, more and more user generated * The work described in this paper is supported by a grant from the Grant Council of the Hong Kong Special Administrative Region, China (Project Code: 14203414) .1 http://www.se.cuhk.edu.hk/˜textmine/ dataset/ra-mds/ NEWS: The most important announcements from Google\'s big developers\' conference content is available. Figure 1 is a snapshot of re

In [102]:
latex_parse_overview = {k: v for k, v in sorted(latex_parse_overview.items(), 
                                                 key=lambda item: sum(item[1]['cite_span_lens']), reverse=True)}

In [103]:
max_cite_span_sum = 0
max_latex_parse_overview = dict()
for k,v in latex_parse_overview.items():
    if max_cite_span_sum < sum(v['cite_span_lens']):
        max_cite_span_sum = sum(v['cite_span_lens'])
        max_latex_parse_overview[k] = v
    elif (max_cite_span_sum>7) and sum(v['cite_span_lens'])>max_cite_span_sum//2:
        max_latex_parse_overview[k] = v

In [104]:
max_latex_parse_overview

{'Introduction': {'text': ['The goal of multi-document summarization (MDS) is to automatically generate a brief, well-organized summary for a topic which describes an event with a set of documents from different sources. BIBREF0 , BIBREF1 , BIBREF2 , BIBREF3 , BIBREF4 , BIBREF5 , BIBREF6 . In the typical setting of MDS, the input is a set of news documents about the same topic. The output summary is a piece of short text document containing several sentences, generated only based on the input original documents.',
   "With the development of social media and mobile equipments, more and more user generated content is available. Figure FIGREF2 is a snapshot of reader comments under the news report “The most important announcements from Google's big developers' conference”. The content of the original news report talks about some new products based on AI techniques. The news report generally conveys an enthusiastic tone. However, while some readers share similar enthusiasms, some others e

In [105]:
for k,v in latex_parse_overview.items():
    print(k,sum(v['cite_span_lens']))

Introduction 13
Reader-Aware Salience Estimation 6
Summary Construction 6
Comparative Methods 6
Experimental Settings 2
Further Investigation of Our Framework  2
Overview 1
Dataset and Metrics 1
Data Description 0
Background 0
Data Collection 0
Data Properties 0
Results on Our Dataset 0
Case Study 0
Conclusions 0


### Применим для всех 


In [223]:
overview_papers = dict()
for num_artic,article in enumerate(all_articles):
    # проверяем что у статьи есть grobid_parse и latex_parse и естб текст
    if (article['grobid_parse'] and article['grobid_parse']['body_text']) or (article['latex_parse'] and article['latex_parse']['body_text']):
        # задаем шаблон отражения статьи в укороченном формате (чтобы занимать меньше памяти)
        overview_papers[article['paper_id']] = { 'paper_id':article['paper_id'],   'metadata':article['metadata'],
                                                 's2_pdf_hash':article['s2_pdf_hash'], 'grobid_parse':None,'latex_parse':None}
        
        grobid_parse_overview = None
        # если у статьи есть article['grobid_parse']['body_text']
        if article['grobid_parse'] and article['grobid_parse']['body_text']:
            grobid_parse_overview = dict()
            # проходим по каждому абзацу текста
            for num_sec,sections in enumerate(article['grobid_parse']['body_text']):
                grobid_parse_overview[num_sec] = sections
            
            # отсортируем по количеству цитат абзацы
            grobid_parse_overview = {k: v for k, v in sorted(grobid_parse_overview.items(), key=lambda item: len(item[1]['cite_spans']), reverse=True)}
            
#             # найдем 1 и 2 максимум по обзорной части
#             max_cite_span_sum = 0
#             max_grobid_parse_overview = dict()
#             for k,v in grobid_parse_overview.items():
#                 if max_cite_span_sum < len(v['cite_spans']):
#                     max_cite_span_sum = len(v['cite_spans'])
#                     max_grobid_parse_overview[k] = v
#                 # записываем 2 максимум, если количество ссылок в егочасти больше половины от максимального 
#                 elif (max_cite_span_sum>7) and len(v['cite_spans'])>max_cite_span_sum//2:
#                     max_grobid_parse_overview[k] = v
            
#             grobid_parse_overview = max_grobid_parse_overview
            
        latex_parse_overview = None
        # если у статьи есть article['latex_parse']['body_text']
        if article['latex_parse'] and article['latex_parse']['body_text']:
            latex_parse_overview = dict()
            # проходим по каждому абзацу текста
            # в latex_parse 
            for sections in article['latex_parse']['body_text']:
                if sections['section'] in latex_parse_overview:
                    if latex_parse_overview[sections['section']] == sections:
                        continue
                    else:
                        latex_parse_overview[sections['section']]['text'].append(sections['text'])
                        latex_parse_overview[sections['section']]['cite_spans'].append(sections['cite_spans'])
                        latex_parse_overview[sections['section']]['cite_span_lens'].append(len(sections['cite_spans']))
                        latex_parse_overview[sections['section']]['section'].append(sections['section'])
                else:
                    latex_parse_overview[sections['section']] = {'text':[sections['text']],   'cite_spans':[sections['cite_spans']],
                                                                  'cite_span_lens':[len(sections['cite_spans'])],
                                                                  'section':[sections['section']]}
            latex_parse_overview = {k: v for k, v in sorted(latex_parse_overview.items(), key=lambda item: item[1]['cite_span_lens'], reverse=True)}
        
        
#             max_cite_span_sum = 0
#             max_latex_parse_overview = dict()
#             for k,v in latex_parse_overview.items():
#                 if max_cite_span_sum < sum(v['cite_span_lens']):
#                     max_cite_span_sum = sum(v['cite_span_lens'])
#                     max_latex_parse_overview[k] = v
#                 elif (max_cite_span_sum>0) and sum(v['cite_span_lens'])>max_cite_span_sum//2:
#                     max_latex_parse_overview[k] = v
            
#             latex_parse_overview = max_latex_parse_overview
        

        if grobid_parse_overview:
            overview_papers[article['paper_id']]['grobid_parse'] = {'abstract':None,
                                                        'overview_text':grobid_parse_overview,  
                                                        'bib_entries':None}
            if article['grobid_parse']['abstract']:
                overview_papers[article['paper_id']]['grobid_parse']['abstract'] = article['grobid_parse']['abstract']
            if article['grobid_parse']['bib_entries']:
                overview_papers[article['paper_id']]['grobid_parse']['bib_entries'] = article['grobid_parse']['bib_entries']
            
        if latex_parse_overview:            
            overview_papers[article['paper_id']]['latex_parse'] = {'abstract':None,
                                                                    'overview_text':latex_parse_overview,  
                                                                    'bib_entries':None}
            if article['latex_parse']['abstract']:
                overview_papers[article['paper_id']]['latex_parse']['abstract'] = article['latex_parse']['abstract']
            if article['latex_parse']['bib_entries']:
                overview_papers[article['paper_id']]['latex_parse']['bib_entries'] = article['latex_parse']['bib_entries']

In [224]:
list(overview_papers.keys())[:10]

['10164018',
 '14472576',
 '17302615',
 '3243536',
 '3248240',
 '2223737',
 '488',
 '14323173',
 '15251605',
 '8260435']

### Применим критерий и посмотрим какие секции выделились

In [306]:
overview_papers_w_latex = {k:v for k,v in overview_papers.items() if v['latex_parse']}

In [308]:
len(overview_papers_w_latex)

4039

In [216]:
list(overview_papers_w_latex.keys())[:10]

['10164018',
 '488',
 '189927790',
 '126168169',
 '184488238',
 '85517799',
 '16050464',
 '52155342',
 '52247458',
 '5267356']

In [312]:
for k,v in overview_papers_w_latex.items():
    if len(list(overview_papers_w_latex[k]['latex_parse']['overview_text']))<=1:
        print(k)

5084110
198922003
1238927
173990592
52097879
3698344
6385589
52118895
53981714
51862727
868799
102353905
2753602
131774178
7193975
165163629
174798321
5294994
44140055
52896891
29151018
174797858
46938018
67855637
2478928
29165442
49358911
371926
102351546
15213991
53082498
52114454
3161327
5740960
52073201
1762731
2821908
15359942
170079259
3502581
102350959
174798375
53064621
10643243
67856324
174801285
29164993
119105191
102486945
44170166
52205000
17953812
56895551
311594
52290656
14179380
51878898
52178091
174797955
8622019
15295411
247735
9387600
2955580
2728774
7887385
52113877
14922772
196177814
3204831
52117484
1452429
12245103
80628357
4875809
26397607
131773929
52143204
47018994
53082704
118680003
13747354
85529973
90262493
14748840
198229567
5037669
92994351
18449288
195345032
5112203
119184444
4943905
52074264
52176506
14206024
2404341
13052370
102350797
6376814
49907944
4889691
1707814
24129906
739426
4311819
4812047
52112703
52099643
23144639
119186140
19751968
11839232


In [256]:
print(overview_papers_w_latex['5084110'].keys())
print(overview_papers_w_latex['5084110']['latex_parse'].keys())


dict_keys(['paper_id', 'metadata', 's2_pdf_hash', 'grobid_parse', 'latex_parse'])
dict_keys(['abstract', 'overview_text', 'bib_entries'])


In [410]:
def mult_in(list_toks,line):
    flag = False
    line = line.lower()
    for tok in list_toks:
        if tok.lower() in line:
            flag = tok
            break
    return flag
RW_names = [
    'related wor','background','previous w'
]

In [411]:
tex_covering_max = list(map(lambda x: list(overview_papers_w_latex[x]['latex_parse']['overview_text'])[0].lower() if list(overview_papers_w_latex[x]['latex_parse']['overview_text'])[0] else 'None',overview_papers_w_latex))

In [412]:
for ind,x,y in zip(pd.Series(tex_covering_max).value_counts().index,pd.Series(tex_covering_max).value_counts()/len(tex_covering_max)*100,pd.Series(tex_covering_max).value_counts()):
    print("%25s|%.2f %% |%7d" % (ind,round(x,1),y))

             introduction|30.30 % |   1224
             related work|17.20 % |    693
                     None|9.50 % |    382
          acknowledgments|4.60 % |    185
         acknowledgements|2.10 % |     84
               conclusion|1.70 % |     70
               background|0.70 % |     29
              conclusions|0.60 % |     23
            related works|0.50 % |     20
              experiments|0.50 % |     19
                 sections|0.40 % |     18
                  results|0.40 % |     16
               discussion|0.40 % |     16
       experimental setup|0.30 % |     13
                 datasets|0.30 % |     12
                     data|0.30 % |     12
                  dataset|0.30 % |     11
    experimental settings|0.20 % |     10
                baselines|0.20 % |     10
conclusion and future work|0.20 % |     10
               motivation|0.20 % |      9
            previous work|0.20 % |      9
               references|0.20 % |      9
          acknowledgement|0.20 

    non-task-oriented sds|0.00 % |      1
keyphrase extraction and generation|0.00 % |      1
referential property constraint|0.00 % |      1
nlp event representations|0.00 % |      1
     adversarial heatmaps|0.00 % |      1
   cross-lingual analysis|0.00 % |      1
spanish, dutch and german|0.00 % |      1
           word embedding|0.00 % |      1
regular tree grammars of derivations|0.00 % |      1
     top-down propagation|0.00 % |      1
                 examples|0.00 % |      1
          unbounded model|0.00 % |      1
arabic dialect identification|0.00 % |      1
    datasets and settings|0.00 % |      1
             online abuse|0.00 % |      1
                  summary|0.00 % |      1
       evaluation details|0.00 % |      1
               base model|0.00 % |      1
       dual-coding theory|0.00 % |      1
step 3: measure collocational distributions|0.00 % |      1
subset approximation by transforming the grammar|0.00 % |      1
phrase-based machine translation|0.00 % |     

           nmt background|0.00 % |      1
            penn treebank|0.00 % |      1
               procedure:|0.00 % |      1
      the pointer softmax|0.00 % |      1
      classifier settings|0.00 % |      1
construction of an s-type transducer |0.00 % |      1
 predicting dialogue acts|0.00 % |      1
 augmented word embedding|0.00 % |      1
   typed feature grammars|0.00 % |      1
hedging as a sign of scientific discourse|0.00 % |      1
            the formalism|0.00 % |      1
further details on evaluation dataset|0.00 % |      1
                algorithm|0.00 % |      1
          text classifier|0.00 % |      1
lstm neural reordering model|0.00 % |      1
   training and test data|0.00 % |      1
existing models of attachment|0.00 % |      1
      extended levi graph|0.00 % |      1
    conversational models|0.00 % |      1
the task: base np chunking|0.00 % |      1
       maximizing metrics|0.00 % |      1
deriving chunks from treebank parses|0.00 % |      1
                 

         extracting edits|0.00 % |      1
   affective text dataset|0.00 % |      1
syntactic features in text classification|0.00 % |      1
assigning thesaurus categories|0.00 % |      1
     empirical evaluation|0.00 % |      1
 a specification language|0.00 % |      1
    conclusion and future|0.00 % |      1
  confidence and salience|0.00 % |      1
     readability measures|0.00 % |      1
                 ace 2005|0.00 % |      1
analysis of translation errors|0.00 % |      1
      result and thoughts|0.00 % |      1
            existing work|0.00 % |      1
        nmt configuration|0.00 % |      1
rules of sampling sql queries|0.00 % |      1
                     arae|0.00 % |      1
tree structure enhanced neural machine translation|0.00 % |      1
implementing hpsg in a clp framework|0.00 % |      1
   end-to-end nlg systems|0.00 % |      1
operationalizing irregularity|0.00 % |      1
              equivalence|0.00 % |      1
gender stereotypes in text|0.00 % |      1
searc

In [413]:
covering_by_tex = list(map(lambda x: 'related work' if ('related work' in x) or ('background' in x) or ('previous work' in x) or ('overview' in x) else x, covering_by_tex))

In [414]:
for ind,x,y in zip(pd.Series(covering_by_tex).value_counts().index,pd.Series(covering_by_tex).value_counts()/len(covering_by_tex)*100,pd.Series(covering_by_tex).value_counts()):
    print("%25s|%.2f %% |%7d" % (ind,round(x,1),y))

             introduction|37.70 % |   1208
             related work|25.00 % |    802
               conclusion|0.60 % |     19
                     None|0.60 % |     19
              experiments|0.60 % |     19
                 sections|0.60 % |     18
                  results|0.50 % |     16
               discussion|0.50 % |     15
       experimental setup|0.40 % |     13
                 datasets|0.40 % |     12
                     data|0.40 % |     12
                  dataset|0.30 % |     11
    experimental settings|0.30 % |     10
                baselines|0.30 % |     10
          acknowledgments|0.30 % |      9
               motivation|0.30 % |      9
              methodology|0.20 % |      8
   implementation details|0.20 % |      8
               evaluation|0.20 % |      8
                    model|0.20 % |      7
                   models|0.20 % |      5
                    setup|0.20 % |      5
              conclusions|0.20 % |      5
     experimental setting|0.20 %

word vector-based tree edit distance|0.00 % |      1
        the grammar model|0.00 % |      1
word embedding feature models|0.00 % |      1
cross-lingual word representations|0.00 % |      1
 maximum mean discrepancy|0.00 % |      1
                 modeling|0.00 % |      1
sparse representations for expansion|0.00 % |      1
interaction with the dialogue manager|0.00 % |      1
 datasets & architectures|0.00 % |      1
     learning experiments|0.00 % |      1
            video channel|0.00 % |      1
       sentence selection|0.00 % |      1
knowledge graph embeddings|0.00 % |      1
towards a classification scheme: linguistic theories of definite descriptions|0.00 % |      1
            skip-thoughts|0.00 % |      1
model selection criterion|0.00 % |      1
multilingual nmt versus bilingual nmt|0.00 % |      1
the task of grading lexical entailment|0.00 % |      1
          overall results|0.00 % |      1
   additional experiments|0.00 % |      1
         multi-task model|0.00 % | 

    finite-state calculus|0.00 % |      1
      system descriptions|0.00 % |      1
multilingual attentional nmt|0.00 % |      1
 technical implementation|0.00 % |      1
recurrent models for text classification|0.00 % |      1
conditional variational autoencoder |0.00 % |      1
a nonstationary language model|0.00 % |      1
deep learning for sentiment analysis|0.00 % |      1
      sentence similarity|0.00 % |      1
extending annotation coverage|0.00 % |      1
        neural benchmarks|0.00 % |      1
dependency-based algorithm (dba)|0.00 % |      1
unicon: an implementation|0.00 % |      1
sequence labeling neural models|0.00 % |      1
                  setting|0.00 % |      1
        annotation graphs|0.00 % |      1
                 decoding|0.00 % |      1
     baseline system (bl)|0.00 % |      1
learning base noun phrases by machine|0.00 % |      1
near threshold structure in j/ψ→γpp ¯j/\psi \rightarrow \gamma p \bar{p}|0.00 % |      1
       semantic functions|0.00 % |     

       evaluation dataset|0.00 % |      1
 negation scope detection|0.00 % |      1
parallelism and inference|0.00 % |      1
general neural model for chinese word segmentation|0.00 % |      1
     model initialization|0.00 % |      1
   maximum-entropy method|0.00 % |      1
fake news detection using textual information|0.00 % |      1
    alignment-based model|0.00 % |      1
flexible sense distinctions|0.00 % |      1
      semi-supervised nmt|0.00 % |      1
         paragraph reader|0.00 % |      1
  alignment visualization|0.00 % |      1
              pos-tagging|0.00 % |      1
      evaluation settings|0.00 % |      1
                 treebank|0.00 % |      1
shortcomings of current summarization models|0.00 % |      1
sexual predator detection|0.00 % |      1
dataset and experimental settings|0.00 % |      1
            the treebanks|0.00 % |      1
         the second model|0.00 % |      1
lexical replacement in phylogenetics|0.00 % |      1
bootstrapping a first-person sent

In [415]:
tex_covering_max_2 = list(map(lambda x: list(overview_papers_w_latex[x]['latex_parse']['overview_text'])[1].lower() if len(list(overview_papers_w_latex[x]['latex_parse']['overview_text']))>=2 and list(overview_papers_w_latex[x]['latex_parse']['overview_text'])[1]!=None else 'None',overview_papers_w_latex))

In [416]:
def rw_detecting(x):
    if list(x['latex_parse']['overview_text'])[0]!=None:
        line1 = list(x['latex_parse']['overview_text'])
#         print(line1)
        line1 = [x for x in line1 if x is not None]
        line1 = ' '.join(line1).lower()
        
        if mult_in(RW_names,line1):
            return 1
        else:
            return 0
    else:
        return 0
    

In [417]:
rw_detecting(overview_papers_w_latex['5084110'])

0

In [418]:
tex_covering_rw = list(map(lambda x: rw_detecting(overview_papers_w_latex[x]),overview_papers_w_latex))

In [419]:
tex_ids = list(map(lambda x: overview_papers_w_latex[x]['paper_id'] ,overview_papers_w_latex))

In [420]:
d = {'id':tex_ids,'max_title':tex_covering_max,'max_title_2':tex_covering_max_2,'rw_exact':tex_covering_rw}
df = pd.DataFrame(d)

In [421]:
df.head()

Unnamed: 0,id,max_title,max_title_2,rw_exact
0,10164018,introduction,reader-aware salience estimation,1
1,488,introduction,the grammar,0
2,189927790,introduction,relevance-based auxiliary task (rat),0
3,5084110,acknowledgments,,0
4,198922003,,,0


In [422]:
df.shape,df[(df.max_title == 'None') & (df.max_title_2 == 'None')].shape

((4039, 4), (382, 4))

In [423]:
df = df[~((df.max_title == 'None') & (df.max_title_2 == 'None'))]
df.shape

(3657, 4)

In [432]:
df[df.max_title == 'background'].head()

Unnamed: 0,id,max_title,max_title_2,rw_exact


In [430]:
df.replace(['related works','background','related works','related work and discussion','comparison with previous work',
            'previous work','further related work','introduction and background'], 'related work',inplace=True)

In [431]:
df[df.rw_exact == 1].shape

(1784, 4)

In [440]:
print("%25s|%10s |%7s"%('название секции','% от всех','кол-во '))
print(50*'-')
for ind,x,y in zip(df[df.rw_exact == 1].max_title.value_counts().index,df[df.rw_exact == 1].max_title.value_counts()/len(df[df.rw_exact == 1])*100,df[df.rw_exact == 1].max_title.value_counts()):
    print("%25s|%2.2f %% |%7d" % (ind,x,y))

          название секции| % от всех |кол-во 
--------------------------------------------------
             related work|42.94 % |    766
             introduction|30.72 % |    548
              experiments|0.56 % |     10
                  results|0.45 % |      8
                 datasets|0.45 % |      8
                  dataset|0.39 % |      7
                    model|0.34 % |      6
       experimental setup|0.28 % |      5
               evaluation|0.28 % |      5
                baselines|0.28 % |      5
                     data|0.22 % |      4
    experimental settings|0.22 % |      4
                    setup|0.22 % |      4
     experimental setting|0.22 % |      4
              methodology|0.22 % |      4
               discussion|0.22 % |      4
               conclusion|0.22 % |      4
                   models|0.22 % |      4
   implementation details|0.17 % |      3
               references|0.17 % |      3
                  methods|0.11 % |      2
       evaluation m

   geocoding reddit users|0.06 % |      1
      data representation|0.06 % |      1
   ed baselines & results|0.06 % |      1
span-attribute tagging (sa-t) model|0.06 % |      1
residual vq-vae for unsupervised monolingual paraphrasing|0.06 % |      1
asymptotic normality and statistical efficiency|0.06 % |      1
experimental data, setup and results|0.06 % |      1
revisiting the feature augmentation method|0.06 % |      1
       sentiment baseline|0.06 % |      1
shallow parsers with hand-written rules|0.06 % |      1
      extended levi graph|0.06 % |      1
        analysis of depnn|0.06 % |      1
large dataset for training and validation|0.06 % |      1
           classification|0.06 % |      1
training text classifiers |0.06 % |      1
deep convolutional networks|0.06 % |      1
word embedding based models|0.06 % |      1
distributed word representations|0.06 % |      1
   methods not considered|0.06 % |      1
                  systems|0.06 % |      1
approaches using prosodic 

In [441]:
print("%25s|%10s |%7s"%('название секции','% от всех','кол-во '))
print(50*'-')
for ind,x,y in zip(df.max_title.value_counts().index,df.max_title.value_counts()/len(df)*100,df.max_title.value_counts()):
    print("%25s|%2.2f %% |%7d" % (ind,x,y))

          название секции| % от всех |кол-во 
--------------------------------------------------
             introduction|33.47 % |   1224
             related work|20.95 % |    766
          acknowledgments|5.06 % |    185
         acknowledgements|2.30 % |     84
               conclusion|1.91 % |     70
              conclusions|0.63 % |     23
              experiments|0.52 % |     19
                 sections|0.49 % |     18
                  results|0.44 % |     16
               discussion|0.44 % |     16
       experimental setup|0.36 % |     13
                     data|0.33 % |     12
                 datasets|0.33 % |     12
                  dataset|0.30 % |     11
    experimental settings|0.27 % |     10
conclusion and future work|0.27 % |     10
                baselines|0.27 % |     10
          acknowledgement|0.25 % |      9
               references|0.25 % |      9
               motivation|0.25 % |      9
               evaluation|0.22 % |      8
              meth

application to other formalisms|0.03 % |      1
          structural cues|0.03 % |      1
spanish, dutch and german|0.03 % |      1
related work & background|0.03 % |      1
       on genre detection|0.03 % |      1
     top-down propagation|0.03 % |      1
                 examples|0.03 % |      1
          unbounded model|0.03 % |      1
arabic dialect identification|0.03 % |      1
    datasets and settings|0.03 % |      1
             online abuse|0.03 % |      1
                  summary|0.03 % |      1
       evaluation details|0.03 % |      1
               base model|0.03 % |      1
       dual-coding theory|0.03 % |      1
step 3: measure collocational distributions|0.03 % |      1
subset approximation by transforming the grammar|0.03 % |      1
phrase-based machine translation|0.03 % |      1
            fact-checking|0.03 % |      1
          morphophonology|0.03 % |      1
 fixed confidence by ttts|0.03 % |      1
    left-to-right parsing|0.03 % |      1
   traditional app

construction of an s-type transducer |0.03 % |      1
 predicting dialogue acts|0.03 % |      1
 augmented word embedding|0.03 % |      1
   typed feature grammars|0.03 % |      1
            the formalism|0.03 % |      1
  generation architecture|0.03 % |      1
further details on evaluation dataset|0.03 % |      1
                algorithm|0.03 % |      1
          text classifier|0.03 % |      1
lstm neural reordering model|0.03 % |      1
   training and test data|0.03 % |      1
existing models of attachment|0.03 % |      1
      extended levi graph|0.03 % |      1
    conversational models|0.03 % |      1
the task: base np chunking|0.03 % |      1
       maximizing metrics|0.03 % |      1
deriving chunks from treebank parses|0.03 % |      1
                     glue|0.03 % |      1
   gated-attention reader|0.03 % |      1
asymptotic normality and statistical efficiency|0.03 % |      1
similar, associated, and both|0.03 % |      1
experimental evaluation: bucc shared task on mini

##### Смотрим из 2 максимумов

In [444]:
def top_2(x):
    if x['max_title'] == 'related work' or x['max_title_2'] == 'related work':
        return 'related work'
    else:
        return x['max_title']+'|'+x['max_title_2']

In [445]:
df['top_2'] = df.apply(top_2,axis=1)

In [446]:
df.head(20)

Unnamed: 0,id,max_title,max_title_2,rw_exact,top_2
0,10164018,introduction,reader-aware salience estimation,1,introduction|reader-aware salience estimation
1,488,introduction,the grammar,0,introduction|the grammar
2,189927790,introduction,relevance-based auxiliary task (rat),0,introduction|relevance-based auxiliary task (rat)
3,5084110,acknowledgments,,0,acknowledgments|None
5,126168169,experiment,introduction,0,experiment|introduction
7,184488238,introduction,related work,1,related work
8,85517799,introduction,related work,1,related work
9,16050464,related work,introduction,1,related work
11,52155342,exposure bias and error propagation,introduction,0,exposure bias and error propagation|introduction
12,52247458,introduction,related work,1,related work


In [447]:
print("%25s|%10s |%7s"%('название секции','% от всех','кол-во '))
print(50*'-')
for ind,x,y in zip(df[df.rw_exact == 1].top_2.value_counts().index,df[df.rw_exact == 1].top_2.value_counts()/len(df[df.rw_exact == 1])*100,df[df.rw_exact == 1].top_2.value_counts()):
    print("%25s|%2.2f %% |%7d" % (ind,x,y))

          название секции| % от всех |кол-во 
--------------------------------------------------
             related work|59.42 % |   1060
       introduction|setup|0.50 % |      9
    introduction|datasets|0.39 % |      7
 introduction|experiments|0.39 % |      7
     introduction|results|0.34 % |      6
introduction|experimental setup|0.28 % |      5
  introduction|evaluation|0.22 % |      4
     introduction|dataset|0.22 % |      4
       introduction|model|0.22 % |      4
introduction|evaluation metrics|0.17 % |      3
       setup|introduction|0.17 % |      3
introduction|background and related work|0.17 % |      3
    datasets|introduction|0.17 % |      3
     dataset|introduction|0.17 % |      3
  introduction|discussion|0.17 % |      3
introduction|motivation and related work|0.11 % |      2
       model|introduction|0.11 % |      2
introduction|implementation details|0.11 % |      2
introduction|task definition|0.11 % |      2
  evaluation|introduction|0.11 % |      2
      r

         setting|datasets|0.06 % |      1
introduction|the webquestionssp dataset|0.06 % |      1
introduction|experimental framework|0.06 % |      1
      datasets|conclusion|0.06 % |      1
introduction|dip-dqn baseline|0.06 % |      1
conclusions and future work|recurrent networks and lstms|0.06 % |      1
introduction|results of triad model|0.06 % |      1
introduction|reverse dictionary|0.06 % |      1
word vector-based tree edit distance|word vector-based dynamic time warping|0.06 % |      1
phrase-based machine translation|evaluation|0.06 % |      1
relevant previous work|datasets and relevant knowledge graphs|0.06 % |      1
gated-attention reader|introduction|0.06 % |      1
introduction|approximately boolean entity tuples|0.06 % |      1
introduction|a model for editor identification|0.06 % |      1
transfer learning for nmt|introduction|0.06 % |      1
    training|introduction|0.06 % |      1
introduction|compositional similarity|0.06 % |      1
similar, associated, and bot

a lemmatizer: p(ℓ i ∣m i ,w i )p(\ell _i \mid m_i, w_i)|morphological tagger: p(𝐦∣𝐰)p(\mathbf {m}\mid \mathbf {w})|0.06 % |      1
   introduction|inference|0.06 % |      1
eye-tracking database for sarcasm analysis|the sarcasm classifier|0.06 % |      1
captioning model|introduction|0.06 % |      1
      hocus pocus|dataset|0.06 % |      1
ir engines|relevance feedback performance|0.06 % |      1
evaluation settings|introduction|0.06 % |      1
introduction|extensibility|0.06 % |      1
    baselines|experiments|0.06 % |      1
network training and hyper-parameters|regularization|0.06 % |      1
implementation details|graph lstms|0.06 % |      1
related work and datasets|introduction|0.06 % |      1
model debiasing|introduction|0.06 % |      1
introduction|the summarization approach|0.06 % |      1
data statistics|baseline model|0.06 % |      1
cross-lingual mapping|introduction|0.06 % |      1
            data|features|0.06 % |      1
introduction|linguistic constraints via regulariz

### Other

In [179]:
article_with_sect_latex = dict()

for article in all_articles:
    
    if article['latex_parse'] and article['latex_parse']['body_text']: 
        article_dict= dict()
        for sections in article['latex_parse']['body_text']:
            if sections['section'] or sections['section']!=None:
                if sections['section'] in article_dict:
                    article_dict[sections['section']] += len(sections['cite_spans'])
                else:
                    article_dict[sections['section']] = len(sections['cite_spans'])
                        
        article_with_sect_latex[article['paper_id']] = article_dict

In [180]:
article_with_sect_latex

{'10164018': {'Introduction': 13,
  'Overview': 1,
  'Reader-Aware Salience Estimation': 6,
  'Summary Construction': 6,
  'Data Description': 0,
  'Background': 0,
  'Data Collection': 0,
  'Data Properties': 0,
  'Dataset and Metrics': 1,
  'Comparative Methods': 6,
  'Experimental Settings': 2,
  'Results on Our Dataset': 0,
  'Further Investigation of Our Framework ': 2,
  'Case Study': 0,
  'Conclusions': 0},
 '488': {'Introduction': 6,
  'Random Field Models': 1,
  'RFM Estimation and Selection of the Informative Sample ': 2,
  'The Grammar': 1,
  'Modelling the Grammar ': 0,
  'Experiments': 0,
  'Testing the Various Sampling Strategies ': 0,
  'Larger Scale Evaluation': 1,
  'Comments': 0,
  'Acknowledgments': 0},
 '189927790': {'Introduction': 9,
  'Balanced Translation Approach': 1,
  'NMT and Transformer': 2,
  'Relevance-based Auxiliary Task (RAT)': 6,
  'Multi-task NMT Architecture': 3,
  'Results and Analysis': 2,
  'Conclusion': 0,
  'Acknowledgments': 0,
  'Loss Functio

##### Пример без названий секции

In [181]:
[article['latex_parse'] for article in all_articles if article['paper_id'] == '198922003']

[{'abstract': [],
  'body_text': [{'text': 'Bang Liu INLINEFORM0 , Ting Zhang INLINEFORM1 , Di Niu INLINEFORM2 , Jinghong Lin INLINEFORM3 , Kunfeng Lai INLINEFORM4 , Yu Xu INLINEFORM5 INLINEFORM6 University of Alberta, Edmonton, AB, Canada INLINEFORM7 Mobile Internet Group, Tencent, Shenzhen, China',
    'cite_spans': [],
    'ref_spans': [],
    'eq_spans': [{'start': 9,
      'end': 20,
      'text': ' 1 ',
      'latex': '^1',
      'ref_id': None},
     {'start': 34, 'end': 45, 'text': ' 1 ', 'latex': '^1', 'ref_id': None},
     {'start': 55, 'end': 66, 'text': ' 1 ', 'latex': '^1', 'ref_id': None},
     {'start': 82, 'end': 93, 'text': ' 2 ', 'latex': '^2', 'ref_id': None},
     {'start': 108, 'end': 119, 'text': ' 2 ', 'latex': '^2', 'ref_id': None},
     {'start': 128, 'end': 139, 'text': ' 2 ', 'latex': '^2', 'ref_id': None},
     {'start': 140, 'end': 151, 'text': ' 1 ', 'latex': '^1', 'ref_id': None},
     {'start': 196, 'end': 207, 'text': ' 2 ', 'latex': '^2', 'ref_id': Non

In [204]:
title = []
RW_names = [
    'related work','background','previous w'
]
rw_list = []
for k,v in article_with_sect_latex.items():
    if len(v)>0:
        title.append(max(article_with_sect_latex[k],key=article_with_sect_latex[k].get).lower())
    else
    line = ' '.join(article_with_sect_latex[k].keys()).lower()
    if mult_in(RW_names,line):
        rw_list.append(mult_in(RW_names,line))
    else:
        rw_list.append(0)

In [208]:
pd.Series(title).value_counts()

introduction                                       1503
related work                                        803
acknowledgments                                     181
acknowledgements                                     83
conclusion                                           64
background                                           35
related works                                        23
conclusions                                          23
sections                                             18
previous work                                        18
motivation                                           13
datasets                                             13
experiments                                          11
experimental setup                                   11
conclusion and future work                            9
discussion                                            9
acknowledgement                                       9
methods                                         

In [212]:
sum([1 for rw in rw_list if rw!=0] ),len(rw_list)

(1784, 4039)

In [207]:
max(article_with_sect_latex['10164018'],key=article_with_sect_latex['10164018'].get)

'Introduction'

In [205]:
line = ' '.join(article_with_sect_latex['10164018'].keys()).lower()
print(line)

introduction overview reader-aware salience estimation summary construction data description background data collection data properties dataset and metrics comparative methods experimental settings results on our dataset further investigation of our framework  case study conclusions


In [206]:
mult_in(RW_names,line)

'background'