In [1]:
import json
from tqdm import tqdm_notebook
import gzip
import pandas as pd
import sys

In [3]:
import json
from typing import Dict, List


def get_citation_contexts(paper: Dict, toks_in_context=10) -> List[Dict]:
    """
    Retrieve citation contexts from GORC paper
    :param paper:
    :param toks_in_context:
    :return:
    """
    if not paper:
        return []

    if not paper['grobid_parse']:
        return []

    if not paper['grobid_parse']['body_text']:
        return []

    contexts = []

    for paragraph in paper['grobid_parse']['body_text']:
        for cite_span in paragraph['cite_spans']:
            # get cited paper id, skip if none
            cite_ref = cite_span['ref_id']
            cited_paper_id = None
            if cite_ref in paper['grobid_parse']['bib_entries']:
                cited_paper_id = paper['grobid_parse']['bib_entries'][cite_ref]['links']
            if not cited_paper_id:
                continue

            # get pre and post tokens
            pre_span_tokens = paragraph['text'][:cite_span['start']].split(' ')[-toks_in_context:]
            post_span_tokens = paragraph['text'][cite_span['end']:].split(' ')[:toks_in_context]
            pre_string = ' '.join(pre_span_tokens)
            post_string = ' '.join(post_span_tokens)
            full_context = pre_string + cite_span['text'] + post_string

            contexts.append({
                "paper_id": paper['paper_id'],
                "context_string": full_context,
                "cite_start": len(pre_string),
                "cite_end": len(pre_string) + len(cite_span['text']),
                "cite_str": cite_span['text'],
                "cited_paper_id": cited_paper_id
            })

    return contexts

In [2]:
EXAMPLE_DATA_FILE = 'data/example_papers.jsonl'

In [6]:
all_contexts = []
all_papers = []
context_dict = dict()
with open(EXAMPLE_DATA_FILE, 'r') as f:
    for line in f:
        gorc_obj = json.loads(line)
        all_papers.append(gorc_obj)
        all_contexts += get_citation_contexts(gorc_obj)

In [7]:
from os import walk

f_zips = []
for (dirpath, dirnames, filenames) in walk('../../gorc/'):
    f_zips.extend(filenames)
    break

In [8]:
len(f_zips),f_zips

(10002,
 ['0.jsonl.gz',
  '1.jsonl.gz',
  '10.jsonl.gz',
  '100.jsonl.gz',
  '1000.jsonl.gz',
  '1001.jsonl.gz',
  '1002.jsonl.gz',
  '1003.jsonl.gz',
  '1004.jsonl.gz',
  '1005.jsonl.gz',
  '1006.jsonl.gz',
  '1007.jsonl.gz',
  '1008.jsonl.gz',
  '1009.jsonl.gz',
  '101.jsonl.gz',
  '1010.jsonl.gz',
  '1011.jsonl.gz',
  '1012.jsonl.gz',
  '1013.jsonl.gz',
  '1014.jsonl.gz',
  '1015.jsonl.gz',
  '1016.jsonl.gz',
  '1017.jsonl.gz',
  '1018.jsonl.gz',
  '1019.jsonl.gz',
  '102.jsonl.gz',
  '1020.jsonl.gz',
  '1021.jsonl.gz',
  '1022.jsonl.gz',
  '1023.jsonl.gz',
  '1024.jsonl.gz',
  '1025.jsonl.gz',
  '1026.jsonl.gz',
  '1027.jsonl.gz',
  '1028.jsonl.gz',
  '1029.jsonl.gz',
  '103.jsonl.gz',
  '1030.jsonl.gz',
  '1031.jsonl.gz',
  '1032.jsonl.gz',
  '1033.jsonl.gz',
  '1034.jsonl.gz',
  '1035.jsonl.gz',
  '1036.jsonl.gz',
  '1037.jsonl.gz',
  '1038.jsonl.gz',
  '1039.jsonl.gz',
  '104.jsonl.gz',
  '1040.jsonl.gz',
  '1041.jsonl.gz',
  '1042.jsonl.gz',
  '1043.jsonl.gz',
  '1044.jsonl.gz'

In [9]:
[file for file in f_zips if '.gz' not in file]

['s2orc-master.zip']

In [10]:
all_papers[0].keys()

dict_keys(['paper_id', 'metadata', 's2_pdf_hash', 'grobid_parse', 'latex_parse'])

In [11]:
all_papers[0]

{'paper_id': '104172',
 'metadata': {'title': 'Nonlinear inversion of tilt-affected very long period records of explosive eruptions at Fuego volcano: INVERSION OF TILT-AFFECTED VLP EVENTS',
  'authors': [{'first': 'Gregory',
    'middle': ['P.'],
    'last': 'Waite',
    'suffix': ''},
   {'first': 'Federica', 'middle': [], 'last': 'Lanza', 'suffix': ''}],
  'abstract': None,
  'year': '2016',
  'arxiv_id': None,
  'acl_id': None,
  'pmc_id': None,
  'pubmed_id': None,
  'doi': '10.1002/2016jb013287',
  'venue': 'Journal of Geophysical Research: Solid Earth',
  'journal': 'Journal of Geophysical Research'},
 's2_pdf_hash': '73ed8076fc747e77c41845cb5f18b40ece350865',
 'grobid_parse': {'abstract': [],
  'body_text': [{'text': 'solution to this is to evaluate long wavelength, very-long-period (VLP) data that are relativelyFuego is a 3800 m stratovolcano that regularly produces Strombolian and weak 76Vulcanian explosions. The dynamics of these explosive events have been examined in the VLP

In [12]:
all_papers[0]['metadata']

{'title': 'Nonlinear inversion of tilt-affected very long period records of explosive eruptions at Fuego volcano: INVERSION OF TILT-AFFECTED VLP EVENTS',
 'authors': [{'first': 'Gregory',
   'middle': ['P.'],
   'last': 'Waite',
   'suffix': ''},
  {'first': 'Federica', 'middle': [], 'last': 'Lanza', 'suffix': ''}],
 'abstract': None,
 'year': '2016',
 'arxiv_id': None,
 'acl_id': None,
 'pmc_id': None,
 'pubmed_id': None,
 'doi': '10.1002/2016jb013287',
 'venue': 'Journal of Geophysical Research: Solid Earth',
 'journal': 'Journal of Geophysical Research'}

In [13]:
all_papers[0]['grobid_parse'].keys()

dict_keys(['abstract', 'body_text', 'ref_entries', 'bib_entries'])

In [14]:
all_papers[0]['grobid_parse']['bib_entries']

{'BIBREF0': {'ref_id': 'b0',
  'title': 'Tilt change recorded by broadband seismometer prior to 476 small phreatic explosion of Meakan-dake volcano',
  'authors': [{'first': 'H', 'middle': [], 'last': 'Aoyama', 'suffix': ''},
   {'first': 'H', 'middle': [], 'last': 'Oshima', 'suffix': ''}],
  'year': 2008,
  'venue': 'Geophys. Res. Lett',
  'volume': '477',
  'issn': '6',
  'pages': '',
  'other_ids': {},
  'links': None},
 'BIBREF1': {'ref_id': 'b1',
  'title': 'Very long period conduit oscillations induced by rockfalls at 479',
  'authors': [{'first': 'B', 'middle': [], 'last': 'Chouet', 'suffix': ''},
   {'first': 'P', 'middle': [], 'last': 'Dawson', 'suffix': ''}],
  'year': 2013,
  'venue': '',
  'volume': '',
  'issn': '',
  'pages': '',
  'other_ids': {},
  'links': None},
 'BIBREF4': {'ref_id': 'b4',
  'title': 'Source mechanisms of explosions at Stromboli Volcano',
  'authors': [{'first': 'G', 'middle': [], 'last': 'Milana', 'suffix': ''},
   {'first': 'R', 'middle': [], 'last

In [15]:
all_papers[0]['grobid_parse']['body_text'][0].keys()

dict_keys(['text', 'cite_spans', 'ref_spans', 'eq_spans', 'section'])

In [16]:
all_articles = all_papers

In [17]:
sum([1 for paper in all_papers if paper['metadata']['acl_id']!=None])


1

In [18]:
with open("acl_only_json_list_10000.json", "r") as read_file:
    all_articles = json.load(read_file)
print(len(all_articles))
read_file.close()

41660


## Анализ подборки

### проверка наличия названия секции

In [19]:
all_articles[0].keys()

dict_keys(['paper_id', 'metadata', 's2_pdf_hash', 'grobid_parse', 'latex_parse'])

In [20]:
acl_paper_ids = [article['paper_id'] for article in all_articles]

In [21]:
all_articles[0]['metadata'].keys()

dict_keys(['title', 'authors', 'abstract', 'year', 'arxiv_id', 'acl_id', 'pmc_id', 'pubmed_id', 'doi', 'venue', 'journal'])

In [22]:
all_articles[0]['grobid_parse'].keys()

dict_keys(['abstract', 'body_text', 'ref_entries', 'bib_entries'])

#### проверка наличия текста и названия секций во всех статьях в grobid части

In [23]:
len([article['paper_id'] for article in all_articles if article['grobid_parse']['body_text']])

41439

In [24]:
acl_ids_not_bofy_text = [article['paper_id'] for article in all_articles if not article['grobid_parse']['body_text']]

In [25]:
for num,paper_id in enumerate(acl_ids_not_bofy_text):
    if num == 2:
        break
    id_lst = acl_paper_ids.index(paper_id)
    print(id_lst,all_articles[id_lst])
    print(10*'==')

156 {'paper_id': '60131735', 'metadata': {'title': 'Breadth and D e p t h of Semant ic Lexicons Proceedings of a Workshop Sponsored by the Special Interest Group on the Lexicon of the Associat ion for Computat ional Linguistics', 'authors': [{'first': 'Evelyne', 'middle': [], 'last': 'Viegas', 'suffix': ''}], 'abstract': 'Preface. Contributors. Introduction: E. Viegas. I. Lexical Rules and Underspecification. II. Breadth of Semantic Lexicons. III. Depth of Semantic Lexicons. IV. Lexical Semantics and Pragmatics. Subject Index. Author Index.', 'year': '1999', 'arxiv_id': None, 'acl_id': 'W96-0300', 'pmc_id': None, 'pubmed_id': None, 'doi': '10.1007/978-94-017-0952-1', 'venue': 'Text, Speech and Language Technology', 'journal': 'Text, Speech and Language Technology'}, 's2_pdf_hash': 'fef5115fca1124e5f994cd49414c743ca005b853', 'grobid_parse': {'abstract': [], 'body_text': [], 'ref_entries': {'TABREF0': {'text': 'Introduction', 'latex': None, 'type': 'table'}}, 'bib_entries': {}}, 'latex_p

In [26]:
all_articles[186]

{'paper_id': '7473534',
 'metadata': {'title': 'Interfacing Ontologies and Lexical Resources',
  'authors': [{'first': 'Laurent',
    'middle': [],
    'last': 'Prévot',
    'suffix': ''},
   {'first': 'Stefano', 'middle': [], 'last': 'Borgo', 'suffix': ''},
   {'first': 'Alessandro', 'middle': [], 'last': 'Oltramari', 'suffix': ''}],
  'abstract': 'During the last few years, a number of works aiming at interfacing ontologies and lexical resources have been initiated. This paper aims at clarifying the current picture of this domain. It compares ontologies built following different methodologies and analyses their combination with lexical resources. A point defended in the paper is that different methodologies lead to very different characteristics for the resulting resources. We classify these methodologies show how actual projects fit into this classification.',
  'year': '2005',
  'arxiv_id': None,
  'acl_id': 'I05-7013',
  'pmc_id': None,
  'pubmed_id': None,
  'doi': '10.1017/CBO97

In [27]:
all_articles[0]['grobid_parse']['body_text'][0].keys()

dict_keys(['text', 'cite_spans', 'ref_spans', 'eq_spans', 'section'])

In [28]:
all_articles[0]['grobid_parse']['body_text'][0]['section']

In [29]:
article_with_sect = dict()
for article in all_articles:
    for sections in article['grobid_parse']['body_text']:
        if sections['section']:
            if article['paper_id'] in article_with_sect:
                article_with_sect[article['paper_id']] +=1
            else:
                article_with_sect[article['paper_id']] = 1

In [30]:
article_with_sect

{}

In [31]:
all_articles[1]

{'paper_id': '14472576',
 'metadata': {'title': 'Building a Semantic Parser Overnight',
  'authors': [{'first': 'Yushi', 'middle': [], 'last': 'Wang', 'suffix': ''},
   {'first': 'Jonathan', 'middle': [], 'last': 'Berant', 'suffix': ''},
   {'first': 'Percy', 'middle': [], 'last': 'Liang', 'suffix': ''}],
  'abstract': 'How do we build a semantic parser in a new domain starting with zero training examples? We introduce a new methodology for this setting: First, we use a simple grammar to generate logical forms paired with canonical utterances. The logical forms are meant to cover the desired set of compositional operators, and the canonical utterances are meant to capture the meaning of the logical forms (although clumsily). We then use crowdsourcing to paraphrase these canonical utterances into natural utterances. The resulting data is used to train the semantic parser. We further study the role of compositionality in the resulting paraphrases. Finally, we test our methodology on seve

In [32]:
article_with_sect = dict()
for num,article in enumerate(all_articles):
    if num == 2:
        break
    for cnt_sect,sections in enumerate(article['grobid_parse']['body_text']):
        if sections['cite_spans']:
            print(cnt_sect,len(sections['cite_spans']))
            for cnt_cite,cite in enumerate(sections['cite_spans']):
                print(cnt_cite,cite)
            print(sections['text'])
            print('----')
    print(10*'==')
#             if article['paper_id'] in article_with_sect:
#                 article_with_sect[article['paper_id']] +=1
#             else:
#                 article_with_sect[article['paper_id']] = 1

0 13
0 {'start': 192, 'end': 216, 'text': '(Goldstein et al., 2000;', 'latex': None, 'ref_id': 'BIBREF6'}
1 {'start': 217, 'end': 239, 'text': 'Erkan and Radev, 2004;', 'latex': None, 'ref_id': 'BIBREF4'}
2 {'start': 240, 'end': 257, 'text': 'Wan et al., 2007;', 'latex': None, 'ref_id': 'BIBREF19'}
3 {'start': 258, 'end': 284, 'text': 'Nenkova and McKeown, 2012;', 'latex': None, 'ref_id': 'BIBREF16'}
4 {'start': 285, 'end': 302, 'text': 'Min et al., 2012;', 'latex': None, 'ref_id': 'BIBREF15'}
5 {'start': 303, 'end': 319, 'text': 'Li et al., 2017)', 'latex': None, 'ref_id': 'BIBREF11'}
6 {'start': 773, 'end': 797, 'text': '(Project Code: 14203414)', 'latex': None, 'ref_id': None}
7 {'start': 2288, 'end': 2305, 'text': '(Hu et al., 2008;', 'latex': None, 'ref_id': 'BIBREF7'}
8 {'start': 2306, 'end': 2324, 'text': 'Yang et al., 2011)', 'latex': None, 'ref_id': 'BIBREF22'}
9 {'start': 2582, 'end': 2598, 'text': 'Li et al. (2015)', 'latex': None, 'ref_id': 'BIBREF2'}
10 {'start': 2911, 'en

To evaluate the performance of our dataset and the proposed framework RAVAESum for RA-MDS, we compare our model with the following methods:• RA-Sparse : It is a framework to tackle the RA-MDS problem. A sparse-coding-based method is used to calculate the salience of the news sentences by jointly considering news documents and reader comments.• Lead (Wasson, 1998) : It ranks the news sentences chronologically and extracts the leading sentences one by one until the length limit.• Centroid (Radev et al., 2000) : It summarizes clusters of news articles automatically grouped by a topic detection system, and then it uses information from the centroids of the clusters to select sentences.• LexRank (Erkan and Radev, 2004) and TextRank (Mihalcea and Tarau, 2004) : Both methods are graph-based unsupervised framework for sentence salience estimation based on PageRank algorithm.• Concept : It generates abstractive summaries using phrase-based optimization framework with concept weight as salience 

Our logical forms are represented in lambda DCS, a logical language where composition operates on sets rather than truth values. Here we give a brief description; see Liang (2013) for details.Every logical form z in this paper is either a unary (denoting a set of entities) or a binary (denoting a set of entity-pairs). In the base case, each entity e (e.g., 2015) is a unary denoting the singleton set: e w = {e}; and each property p (e.g., publicationDate) is a binary denoting all entitypairs (e 1 , e 2 ) that satisfy the property p. Unaries and binaries can be composed: Given a binary b and unary u, the join b.u denotes all entities e 1 for which there exists an e 2 ∈ u w with (e 1 , e 2 ) ∈ b w . For example, publicationDate.2015 denote entities published in 2015.The intersection u 1 u 2 , union u 1 u 2 , complement ¬u denote the corresponding set operations on the denotations. We let R(b) denote the reversal of b: (e 1 , e 2 ) ∈ b w iff (e 2 , e 1 ) ∈ R(b) w . This allows us to define

all_results[0] - Reader-Aware Multi-Document Summarization: An Enhanced Model and The First Dataset∗



Result 

0 {'start': 192, 'end': 216, 'text': '(Goldstein et al., 2000;', 'latex': None, 'ref_id': 'BIBREF6'}
1 {'start': 217, 'end': 239, 'text': 'Erkan and Radev, 2004;', 'latex': None, 'ref_id': 'BIBREF4'}
2 {'start': 240, 'end': 257, 'text': 'Wan et al., 2007;', 'latex': None, 'ref_id': 'BIBREF19'}
3 {'start': 258, 'end': 284, 'text': 'Nenkova and McKeown, 2012;', 'latex': None, 'ref_id': 'BIBREF16'}
4 {'start': 285, 'end': 302, 'text': 'Min et al., 2012;', 'latex': None, 'ref_id': 'BIBREF15'}
5 {'start': 303, 'end': 319, 'text': 'Li et al., 2017)', 'latex': None, 'ref_id': 'BIBREF11'}

True 

0 (Goldstein et al., 2000; 
1 Erkan and Radev,2004; 
2 Wan et al., 2007; 
3 Nenkova and McKeown, 2012; 
4 Min et al., 2012; 
5 Bing et al., 2015; 
6 Li et al.,2017)


Result

{'start': 971, 'end': 987, 'text': 'Li et al. (2015)', 'latex': None, 'ref_id': 'BIBREF2'}

True

Woodsend and Lapata (2012), Bing et al. (2015), and Li et al. (2015).

**При большом перечислении подряд ссылок, GROBID не выделяет предпоследнюю ссылку**

**Также он не срабатывает на части ссылок**

#### проверка наличия текста и названия секций во всех статьях в latex части

In [33]:
len([article['paper_id'] for article in all_articles if article['latex_parse'] and article['latex_parse']['body_text']])

4039

In [34]:
acl_ids_not_body_text_tex = [article['paper_id'] for article in all_articles if not ( article['latex_parse'] and article['latex_parse']['body_text'])]

In [35]:
len(acl_ids_not_body_text_tex)

37621

In [36]:
for num,paper_id in enumerate(acl_ids_not_body_text_tex):
    if num == 2:
        break
    id_lst = acl_paper_ids.index(paper_id)
    print(id_lst,all_articles[id_lst]['latex_parse'])
    print(10*'==')

1 None
2 None


In [37]:
all_articles[0]['latex_parse'].keys()

dict_keys(['abstract', 'body_text', 'ref_entries', 'bib_entries'])

In [38]:
all_articles[0]

{'paper_id': '10164018',
 'metadata': {'title': 'Reader-Aware Multi-Document Summarization: An Enhanced Model and The First Dataset',
  'authors': [{'first': 'Piji', 'middle': [], 'last': 'Li', 'suffix': ''},
   {'first': 'Lidong', 'middle': [], 'last': 'Bing', 'suffix': ''},
   {'first': 'Wai', 'middle': [], 'last': 'Lam', 'suffix': ''}],
  'abstract': 'We investigate the problem of reader-aware multi-document summarization (RA-MDS) and introduce a new dataset for this problem. To tackle RA-MDS, we extend a variational auto-encodes (VAEs) based MDS framework by jointly considering news documents and reader comments. To conduct evaluation for summarization performance, we prepare a new dataset. We describe the methods for data collection, aspect annotation, and summary writing as well as scrutinizing by experts. Experimental results show that reader comments can improve the summarization performance, which also demonstrates the usefulness of the proposed dataset. The annotated dataset 

In [39]:
{k: all_articles[0]['latex_parse']['bib_entries'][k] for k in sorted(all_articles[0]['latex_parse']['bib_entries'])}

{'BIBREF0': {'ref_id': 'BIBREF0',
  'title': 'Multi-document summarization by sentence extraction',
  'authors': [{'first': 'Jade',
    'middle': [],
    'last': 'Goldstein',
    'suffix': ''},
   {'first': 'Vibhu', 'middle': [], 'last': 'Mittal', 'suffix': ''},
   {'first': 'Jaime', 'middle': [], 'last': 'Carbonell', 'suffix': ''},
   {'first': 'Mark', 'middle': [], 'last': 'Kantrowitz', 'suffix': ''}],
  'year': 2000,
  'venue': 'NAACL-ANLPWorkshop',
  'volume': '',
  'issn': '',
  'pages': '40--48',
  'other_ids': {},
  'links': '8294822'},
 'BIBREF1': {'ref_id': 'BIBREF1',
  'title': 'Lexpagerank: Prestige in multi-document text summarization',
  'authors': [{'first': 'Günes', 'middle': [], 'last': 'Erkan', 'suffix': ''},
   {'first': '', 'middle': [], 'last': 'Dragomir R Radev', 'suffix': ''}],
  'year': 2004,
  'venue': 'EMNLP',
  'volume': '4',
  'issn': '',
  'pages': '365--371',
  'other_ids': {},
  'links': '10418456'},
 'BIBREF10': {'ref_id': 'BIBREF10',
  'title': 'Auto-enc

In [40]:
all_articles[0]['latex_parse']

{'abstract': [],
 'body_text': [{'text': 'The goal of multi-document summarization (MDS) is to automatically generate a brief, well-organized summary for a topic which describes an event with a set of documents from different sources. BIBREF0 , BIBREF1 , BIBREF2 , BIBREF3 , BIBREF4 , BIBREF5 , BIBREF6 . In the typical setting of MDS, the input is a set of news documents about the same topic. The output summary is a piece of short text document containing several sentences, generated only based on the input original documents.',
   'cite_spans': [{'start': 193,
     'end': 200,
     'text': None,
     'latex': None,
     'ref_id': 'BIBREF0'},
    {'start': 203,
     'end': 210,
     'text': None,
     'latex': None,
     'ref_id': 'BIBREF1'},
    {'start': 213,
     'end': 220,
     'text': None,
     'latex': None,
     'ref_id': 'BIBREF2'},
    {'start': 223,
     'end': 230,
     'text': None,
     'latex': None,
     'ref_id': 'BIBREF3'},
    {'start': 233,
     'end': 240,
     'te

In [41]:
all_articles[0]['latex_parse']['body_text'][0].keys()

dict_keys(['text', 'cite_spans', 'ref_spans', 'eq_spans', 'section'])

In [42]:
article_with_sect_latex = dict()
for article in all_articles:
    if article['latex_parse'] and article['latex_parse']['body_text']: 
        for sections in article['latex_parse']['body_text']:
            if sections['section']:
                if article['paper_id'] in article_with_sect_latex:
                    article_with_sect_latex[article['paper_id']] +=1
                else:
                    article_with_sect_latex[article['paper_id']] = 1

##### Количество статей, в которых есть названия секций

In [43]:
len(article_with_sect_latex)

3657

#### Сравнение количества выделенных ссылок в grobid_parse & latex_parse

In [44]:
latex_more_grobid_bib_entr = []
grobid_more_latex_bib_entr = []
equal = []
for num_art ,article in enumerate(all_articles):
    if article['latex_parse'] and article['latex_parse']['body_text']:
        if len(article['latex_parse']['bib_entries']) > len(article['grobid_parse']['bib_entries']):
            print(num_art,article['paper_id'])
            latex_more_grobid_bib_entr.append(article['paper_id'])
        elif len(article['latex_parse']['bib_entries']) < len(article['grobid_parse']['bib_entries']):
#             print(num_art,article['paper_id'])
            grobid_more_latex_bib_entr.append(article['paper_id'])
        else:
            equal.append(article['paper_id'])

78 16050464
101 173990592
286 29245285
320 100300
366 2558
530 86813509
579 1703535
634 14170854
687 49358911
721 371926
736 682772
740 2840197
779 2411
817 16273304
874 17511008
935 3101294
936 3161327
942 5740960
972 2145766
1023 52073201
1127 5079594
1206 298504
1212 870921
1257 53217693
1380 1438450
1401 309476
1402 44278
1414 7669927
1571 711424
1604 1423962
1801 370914
1833 27246259
1846 15600925
1864 184488087
1919 10086161
1976 11492268
2153 534431
2190 15986631
2238 189998202
2255 21665312
2299 9371149
2302 311594
2305 6256345
2412 14974
2428 3933075
2483 1571038
2489 2862211
2495 6210126
2629 44123113
2705 15881253
2784 5201435
2823 5054582
2919 10324034
2998 7021843
3198 3152424
3274 139106285
3291 14922772
3292 2652169
3318 3204831
3383 12245103
3422 85543217
3434 17297069
3485 621025
3538 85556928
3571 13661068
3572 1765384
3589 3025759
3768 1871596
3815 27914547
3841 53082704
3852 118680003
3857 11451871
4013 3132651
4039 20995314
4042 85529973
4043 14401063
4050 2213896


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [45]:
len(latex_more_grobid_bib_entr),len(grobid_more_latex_bib_entr),len(equal)

(794, 2554, 691)

In [46]:
794+2554+691

4039

In [47]:
for num_art ,article in enumerate(all_articles):
    if article['latex_parse'] and article['latex_parse']['body_text']:
        latex_links = [v['links'] for k,v in article['latex_parse']['bib_entries'].items() if v['links']]
        if article['grobid_parse']:
            grobid_links =  [v['links'] for k,v in article['grobid_parse']['bib_entries'].items() if v['links']]
            if len(latex_links)> 0 and len(grobid_links) == 0:
                print('WOW!')
            print(f'latex_links = {len(latex_links)}| grobid_links = {len(grobid_links)}| together = {len(set(grobid_links+latex_links))}')
            print(10*'==')

        else:
            print(f'latex_links = {len(latex_links)}| grobid_links = 0| together = {len(set(latex_links))}')
            print(10*'==')
    

latex_links = 18| grobid_links = 21| together = 22
latex_links = 7| grobid_links = 7| together = 9
latex_links = 0| grobid_links = 12| together = 12
latex_links = 23| grobid_links = 32| together = 33
latex_links = 12| grobid_links = 42| together = 46
latex_links = 21| grobid_links = 21| together = 21
latex_links = 0| grobid_links = 38| together = 38
latex_links = 50| grobid_links = 46| together = 51
latex_links = 9| grobid_links = 38| together = 37
latex_links = 27| grobid_links = 13| together = 30
latex_links = 31| grobid_links = 27| together = 32
latex_links = 25| grobid_links = 28| together = 31
latex_links = 12| grobid_links = 40| together = 41
latex_links = 24| grobid_links = 30| together = 32
latex_links = 4| grobid_links = 16| together = 16
latex_links = 10| grobid_links = 13| together = 13
latex_links = 31| grobid_links = 31| together = 32
latex_links = 0| grobid_links = 22| together = 21
latex_links = 43| grobid_links = 41| together = 45
latex_links = 28| grobid_links = 26| to

latex_links = 6| grobid_links = 6| together = 6
latex_links = 23| grobid_links = 26| together = 27
latex_links = 0| grobid_links = 27| together = 27
latex_links = 22| grobid_links = 38| together = 43
latex_links = 7| grobid_links = 7| together = 8
latex_links = 19| grobid_links = 20| together = 21
latex_links = 20| grobid_links = 19| together = 20
latex_links = 15| grobid_links = 11| together = 15
latex_links = 15| grobid_links = 7| together = 15
latex_links = 0| grobid_links = 0| together = 0
latex_links = 29| grobid_links = 27| together = 31
latex_links = 31| grobid_links = 31| together = 34
latex_links = 20| grobid_links = 27| together = 28
latex_links = 21| grobid_links = 20| together = 22
latex_links = 14| grobid_links = 16| together = 16
latex_links = 18| grobid_links = 20| together = 21
latex_links = 29| grobid_links = 33| together = 35
latex_links = 26| grobid_links = 32| together = 34
latex_links = 11| grobid_links = 11| together = 11
latex_links = 16| grobid_links = 9| togeth

latex_links = 11| grobid_links = 34| together = 35
latex_links = 21| grobid_links = 29| together = 30
latex_links = 0| grobid_links = 18| together = 18
latex_links = 24| grobid_links = 29| together = 34
latex_links = 17| grobid_links = 16| together = 17
latex_links = 15| grobid_links = 14| together = 15
latex_links = 7| grobid_links = 6| together = 8
latex_links = 36| grobid_links = 40| together = 43
latex_links = 15| grobid_links = 23| together = 24
latex_links = 21| grobid_links = 27| together = 29
latex_links = 32| grobid_links = 53| together = 55
latex_links = 9| grobid_links = 14| together = 17
latex_links = 3| grobid_links = 43| together = 43
latex_links = 11| grobid_links = 8| together = 11
latex_links = 6| grobid_links = 6| together = 7
latex_links = 2| grobid_links = 38| together = 38
latex_links = 17| grobid_links = 26| together = 27
latex_links = 26| grobid_links = 33| together = 36
latex_links = 25| grobid_links = 25| together = 25
latex_links = 18| grobid_links = 19| toget

latex_links = 22| grobid_links = 20| together = 22
latex_links = 29| grobid_links = 28| together = 31
latex_links = 6| grobid_links = 2| together = 6
latex_links = 35| grobid_links = 33| together = 36
latex_links = 15| grobid_links = 17| together = 20
latex_links = 31| grobid_links = 29| together = 32
latex_links = 8| grobid_links = 11| together = 12
latex_links = 24| grobid_links = 21| together = 27
latex_links = 38| grobid_links = 38| together = 46
latex_links = 26| grobid_links = 13| together = 25
latex_links = 45| grobid_links = 23| together = 46
WOW!
latex_links = 34| grobid_links = 0| together = 34
latex_links = 33| grobid_links = 30| together = 33
latex_links = 16| grobid_links = 19| together = 21
latex_links = 0| grobid_links = 18| together = 18
latex_links = 11| grobid_links = 26| together = 28
latex_links = 0| grobid_links = 31| together = 31
latex_links = 22| grobid_links = 17| together = 22
latex_links = 15| grobid_links = 12| together = 15
latex_links = 14| grobid_links = 

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



latex_links = 17| grobid_links = 17| together = 17
latex_links = 13| grobid_links = 28| together = 28
latex_links = 0| grobid_links = 31| together = 31
WOW!
latex_links = 46| grobid_links = 0| together = 46
latex_links = 5| grobid_links = 4| together = 5
latex_links = 112| grobid_links = 82| together = 153
latex_links = 12| grobid_links = 12| together = 12
latex_links = 36| grobid_links = 27| together = 48
latex_links = 12| grobid_links = 10| together = 13
latex_links = 22| grobid_links = 20| together = 22
latex_links = 48| grobid_links = 47| together = 50
latex_links = 27| grobid_links = 31| together = 34
latex_links = 7| grobid_links = 5| together = 8
latex_links = 14| grobid_links = 18| together = 21
latex_links = 12| grobid_links = 12| together = 16
latex_links = 21| grobid_links = 20| together = 22
latex_links = 15| grobid_links = 15| together = 15
latex_links = 21| grobid_links = 47| together = 50
latex_links = 13| grobid_links = 12| together = 13
latex_links = 0| grobid_links = 

latex_links = 12| grobid_links = 18| together = 19
latex_links = 3| grobid_links = 3| together = 3
latex_links = 8| grobid_links = 9| together = 9
latex_links = 27| grobid_links = 25| together = 29
latex_links = 34| grobid_links = 34| together = 35
latex_links = 33| grobid_links = 32| together = 36
latex_links = 23| grobid_links = 38| together = 42
latex_links = 18| grobid_links = 8| together = 18
latex_links = 53| grobid_links = 54| together = 55
latex_links = 22| grobid_links = 25| together = 25
latex_links = 11| grobid_links = 25| together = 26
latex_links = 0| grobid_links = 2| together = 2
latex_links = 0| grobid_links = 24| together = 23
latex_links = 23| grobid_links = 24| together = 28
latex_links = 23| grobid_links = 23| together = 23
latex_links = 16| grobid_links = 15| together = 16
latex_links = 23| grobid_links = 23| together = 23
latex_links = 0| grobid_links = 34| together = 33
WOW!
latex_links = 2| grobid_links = 0| together = 2
latex_links = 51| grobid_links = 68| toge

latex_links = 5| grobid_links = 5| together = 5
latex_links = 30| grobid_links = 22| together = 33
latex_links = 17| grobid_links = 43| together = 44
latex_links = 9| grobid_links = 16| together = 18
latex_links = 26| grobid_links = 24| together = 26
latex_links = 9| grobid_links = 7| together = 10
latex_links = 0| grobid_links = 30| together = 30
latex_links = 20| grobid_links = 35| together = 35
latex_links = 24| grobid_links = 24| together = 25
latex_links = 19| grobid_links = 18| together = 19
latex_links = 52| grobid_links = 51| together = 54
latex_links = 9| grobid_links = 5| together = 11
latex_links = 4| grobid_links = 28| together = 28
WOW!
latex_links = 3| grobid_links = 0| together = 3
latex_links = 0| grobid_links = 12| together = 12
latex_links = 7| grobid_links = 17| together = 18
latex_links = 18| grobid_links = 14| together = 18
latex_links = 38| grobid_links = 2| together = 38
latex_links = 20| grobid_links = 19| together = 21
latex_links = 28| grobid_links = 26| toget

latex_links = 20| grobid_links = 19| together = 21
latex_links = 7| grobid_links = 18| together = 18
latex_links = 15| grobid_links = 15| together = 17
latex_links = 21| grobid_links = 31| together = 35
latex_links = 7| grobid_links = 10| together = 11
latex_links = 8| grobid_links = 15| together = 16
latex_links = 26| grobid_links = 26| together = 26
latex_links = 0| grobid_links = 41| together = 40
latex_links = 1| grobid_links = 16| together = 15
latex_links = 22| grobid_links = 3| together = 22
latex_links = 20| grobid_links = 53| together = 54
latex_links = 38| grobid_links = 35| together = 40
latex_links = 12| grobid_links = 11| together = 15
latex_links = 23| grobid_links = 27| together = 27
latex_links = 0| grobid_links = 12| together = 12
latex_links = 31| grobid_links = 42| together = 43
latex_links = 3| grobid_links = 3| together = 3
latex_links = 8| grobid_links = 9| together = 14
latex_links = 22| grobid_links = 22| together = 24
latex_links = 16| grobid_links = 16| togeth

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



latex_links = 36| grobid_links = 27| together = 36
latex_links = 33| grobid_links = 35| together = 35
latex_links = 20| grobid_links = 13| together = 21
latex_links = 15| grobid_links = 22| together = 24
latex_links = 28| grobid_links = 25| together = 28
latex_links = 41| grobid_links = 52| together = 53
latex_links = 24| grobid_links = 24| together = 26
latex_links = 29| grobid_links = 26| together = 30
latex_links = 44| grobid_links = 35| together = 47
latex_links = 26| grobid_links = 33| together = 34
latex_links = 20| grobid_links = 21| together = 23
latex_links = 15| grobid_links = 17| together = 18
latex_links = 28| grobid_links = 54| together = 56
latex_links = 0| grobid_links = 25| together = 25
latex_links = 18| grobid_links = 11| together = 21
latex_links = 40| grobid_links = 43| together = 45
latex_links = 10| grobid_links = 14| together = 15
latex_links = 39| grobid_links = 49| together = 51
latex_links = 13| grobid_links = 12| together = 13
latex_links = 43| grobid_links =

latex_links = 28| grobid_links = 31| together = 33
latex_links = 8| grobid_links = 7| together = 8
latex_links = 12| grobid_links = 29| together = 31
latex_links = 5| grobid_links = 19| together = 19
latex_links = 27| grobid_links = 34| together = 36
latex_links = 26| grobid_links = 26| together = 29
latex_links = 6| grobid_links = 6| together = 6
latex_links = 38| grobid_links = 42| together = 48
latex_links = 36| grobid_links = 29| together = 39
latex_links = 17| grobid_links = 7| together = 17
latex_links = 19| grobid_links = 27| together = 27
latex_links = 27| grobid_links = 31| together = 37
latex_links = 16| grobid_links = 22| together = 22
latex_links = 13| grobid_links = 16| together = 18
latex_links = 24| grobid_links = 22| together = 25
latex_links = 29| grobid_links = 17| together = 40
latex_links = 9| grobid_links = 10| together = 11
latex_links = 18| grobid_links = 18| together = 18
latex_links = 20| grobid_links = 17| together = 23
latex_links = 27| grobid_links = 24| tog

latex_links = 35| grobid_links = 48| together = 51
latex_links = 12| grobid_links = 39| together = 39
latex_links = 22| grobid_links = 27| together = 29
latex_links = 21| grobid_links = 19| together = 21
latex_links = 1| grobid_links = 12| together = 12
latex_links = 9| grobid_links = 22| together = 23
WOW!
latex_links = 7| grobid_links = 0| together = 7
latex_links = 29| grobid_links = 30| together = 31
latex_links = 21| grobid_links = 25| together = 25
latex_links = 25| grobid_links = 21| together = 29
latex_links = 0| grobid_links = 34| together = 33
latex_links = 20| grobid_links = 20| together = 28
latex_links = 4| grobid_links = 14| together = 15
latex_links = 14| grobid_links = 17| together = 21
latex_links = 25| grobid_links = 23| together = 25
latex_links = 20| grobid_links = 19| together = 20
latex_links = 14| grobid_links = 14| together = 15
latex_links = 22| grobid_links = 22| together = 22
latex_links = 0| grobid_links = 4| together = 4
latex_links = 35| grobid_links = 33|

latex_links = 26| grobid_links = 27| together = 29
latex_links = 4| grobid_links = 20| together = 19
latex_links = 29| grobid_links = 28| together = 29
latex_links = 20| grobid_links = 19| together = 20
latex_links = 9| grobid_links = 10| together = 11
latex_links = 17| grobid_links = 21| together = 21
latex_links = 19| grobid_links = 20| together = 21
latex_links = 15| grobid_links = 17| together = 21
latex_links = 28| grobid_links = 28| together = 29
latex_links = 53| grobid_links = 38| together = 54
latex_links = 16| grobid_links = 14| together = 19
latex_links = 8| grobid_links = 22| together = 23
latex_links = 16| grobid_links = 24| together = 27
latex_links = 4| grobid_links = 14| together = 14
latex_links = 21| grobid_links = 18| together = 24
latex_links = 30| grobid_links = 29| together = 31
latex_links = 29| grobid_links = 24| together = 31
latex_links = 17| grobid_links = 15| together = 18
latex_links = 22| grobid_links = 22| together = 22
latex_links = 26| grobid_links = 32

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



latex_links = 7| grobid_links = 8| together = 8
latex_links = 13| grobid_links = 10| together = 18
latex_links = 10| grobid_links = 28| together = 27
latex_links = 0| grobid_links = 11| together = 11
latex_links = 13| grobid_links = 42| together = 44
latex_links = 45| grobid_links = 51| together = 56
latex_links = 31| grobid_links = 30| together = 32
latex_links = 11| grobid_links = 14| together = 18
latex_links = 6| grobid_links = 17| together = 17
latex_links = 29| grobid_links = 33| together = 34
latex_links = 18| grobid_links = 34| together = 33
latex_links = 31| grobid_links = 36| together = 39
latex_links = 14| grobid_links = 14| together = 14
latex_links = 17| grobid_links = 28| together = 28
latex_links = 43| grobid_links = 58| together = 62
latex_links = 39| grobid_links = 41| together = 43
latex_links = 15| grobid_links = 16| together = 16
latex_links = 26| grobid_links = 25| together = 26
latex_links = 40| grobid_links = 37| together = 40
latex_links = 1| grobid_links = 45| 

latex_links = 19| grobid_links = 1| together = 19
latex_links = 9| grobid_links = 27| together = 27
latex_links = 0| grobid_links = 15| together = 15
latex_links = 33| grobid_links = 4| together = 33
latex_links = 44| grobid_links = 42| together = 45
latex_links = 28| grobid_links = 14| together = 28
latex_links = 14| grobid_links = 21| together = 22
latex_links = 26| grobid_links = 40| together = 43
latex_links = 16| grobid_links = 26| together = 28
latex_links = 29| grobid_links = 34| together = 34
latex_links = 3| grobid_links = 3| together = 3
latex_links = 32| grobid_links = 60| together = 59
latex_links = 25| grobid_links = 25| together = 25
WOW!
latex_links = 15| grobid_links = 0| together = 15
latex_links = 12| grobid_links = 25| together = 26
latex_links = 12| grobid_links = 37| together = 45
latex_links = 19| grobid_links = 16| together = 23
latex_links = 11| grobid_links = 6| together = 11
latex_links = 14| grobid_links = 17| together = 19
latex_links = 49| grobid_links = 47

latex_links = 13| grobid_links = 9| together = 13
latex_links = 6| grobid_links = 5| together = 6
latex_links = 15| grobid_links = 15| together = 15
latex_links = 9| grobid_links = 13| together = 14
latex_links = 13| grobid_links = 41| together = 47
latex_links = 23| grobid_links = 21| together = 23
latex_links = 0| grobid_links = 27| together = 27
latex_links = 14| grobid_links = 14| together = 14
latex_links = 7| grobid_links = 4| together = 7
latex_links = 13| grobid_links = 20| together = 22
latex_links = 19| grobid_links = 27| together = 28
latex_links = 9| grobid_links = 17| together = 21
latex_links = 30| grobid_links = 33| together = 34
latex_links = 31| grobid_links = 31| together = 33
latex_links = 9| grobid_links = 30| together = 30
latex_links = 27| grobid_links = 26| together = 27
latex_links = 23| grobid_links = 22| together = 24
latex_links = 13| grobid_links = 11| together = 14
latex_links = 20| grobid_links = 17| together = 22
latex_links = 21| grobid_links = 26| toget

Проверка ситуации, когда есть **latex_parse**, но  нет **grobid_parse**

In [48]:
for num_art ,article in enumerate(all_articles):
    if article['latex_parse'] and article['latex_parse']['body_text']:
        if len(article['grobid_parse']['body_text'])==0:
            print(num_art,articcle['paper_id'])

### Выделение обзорной части статьи

1. Самый простой принцип построения - по максимальному количеству ссылок в абзаце:
 - **Решение**:
    - подсчитать количество ссылок в каждой секции
    - выбрать секцию с максимальным количеством ссылок (возможно ещё оставить ещё 1 секцию, в которой количество ссылок было больше половины чем в максимальной)
    - для latex статей надо объединить текст одинаковых секций в 1 абзац
 - **Критерий**:
    -  в части latex публикаций есть названия секций => после выделения обзорных часте можно посмотреть какие секции выделились: какие топ-3, сделать просмотр глазами и после этого решать что делать дальше.
    - возможно логично сохранять для 2 максимального текста название статей
    

In [80]:
overview_papers = dict()

In [81]:
all_articles[0].keys()

dict_keys(['paper_id', 'metadata', 's2_pdf_hash', 'grobid_parse', 'latex_parse'])

In [82]:
all_articles[0]['grobid_parse'].keys()

dict_keys(['abstract', 'body_text', 'ref_entries', 'bib_entries'])

In [83]:
all_articles[0]['latex_parse'].keys()

dict_keys(['abstract', 'body_text', 'ref_entries', 'bib_entries'])

In [84]:
overview_papers[all_articles[0]['paper_id']] = {
    'paper_id':all_articles[0]['paper_id'],   'metadata':all_articles[0]['metadata'],
    's2_pdf_hash':all_articles[0]['s2_pdf_hash'], 'grobid_parse':None,'latex_parse':None}

In [85]:
overview_papers

{'10164018': {'paper_id': '10164018',
  'metadata': {'title': 'Reader-Aware Multi-Document Summarization: An Enhanced Model and The First Dataset',
   'authors': [{'first': 'Piji', 'middle': [], 'last': 'Li', 'suffix': ''},
    {'first': 'Lidong', 'middle': [], 'last': 'Bing', 'suffix': ''},
    {'first': 'Wai', 'middle': [], 'last': 'Lam', 'suffix': ''}],
   'abstract': 'We investigate the problem of reader-aware multi-document summarization (RA-MDS) and introduce a new dataset for this problem. To tackle RA-MDS, we extend a variational auto-encodes (VAEs) based MDS framework by jointly considering news documents and reader comments. To conduct evaluation for summarization performance, we prepare a new dataset. We describe the methods for data collection, aspect annotation, and summary writing as well as scrutinizing by experts. Experimental results show that reader comments can improve the summarization performance, which also demonstrates the usefulness of the proposed dataset. The 

In [86]:
all_articles[0]['grobid_parse']['body_text'][0].keys()

dict_keys(['text', 'cite_spans', 'ref_spans', 'eq_spans', 'section'])

In [87]:
all_articles[0]['grobid_parse']['bib_entries'].keys()

dict_keys(['BIBREF0', 'BIBREF1', 'BIBREF2', 'BIBREF3', 'BIBREF4', 'BIBREF5', 'BIBREF6', 'BIBREF7', 'BIBREF8', 'BIBREF9', 'BIBREF10', 'BIBREF11', 'BIBREF12', 'BIBREF13', 'BIBREF14', 'BIBREF15', 'BIBREF16', 'BIBREF17', 'BIBREF18', 'BIBREF19', 'BIBREF20', 'BIBREF21', 'BIBREF22'])

In [88]:
grobid_parse_overview = dict()
for num_sec,sections in enumerate(article['grobid_parse']['body_text']):
    grobid_parse_overview[num_sec] = sections
    print(len(grobid_parse_overview[num_sec]['cite_spans']),sections)
    print(10*'==')

13 {'text': 'The goal of multi-document summarization (MDS) is to automatically generate a brief, wellorganized summary for a topic which describes an event with a set of documents from different sources. (Goldstein et al., 2000; Erkan and Radev, 2004; Wan et al., 2007; Nenkova and McKeown, 2012; Min et al., 2012; Li et al., 2017) . In the typical setting of MDS, the input is a set of news documents about the same topic. The output summary is a piece of short text document containing several sentences, generated only based on the input original documents.With the development of social media and mobile equipments, more and more user generated * The work described in this paper is supported by a grant from the Grant Council of the Hong Kong Special Administrative Region, China (Project Code: 14203414) .1 http://www.se.cuhk.edu.hk/˜textmine/ dataset/ra-mds/ NEWS: The most important announcements from Google\'s big developers\' conference content is available. Figure 1 is a snapshot of rea

0 {'text': 'In this section, we describe the preparation process of the dataset. Then we provide some properties and statistics.', 'cite_spans': [], 'ref_spans': [], 'eq_spans': [], 'section': None}
0 {'text': 'The definition of the terminology related to the dataset is given as follows. 6 Topic: A topic refers to an event and it is composed of a set of news documents from different sources. Document: A news article describing some aspects of the topic. The set of documents in the same topic typically span a period, say a few days. Category: Each topic belongs to a category. There are 6 predefined categories: (1) Accidents and Natural Disasters, (2) Attacks (Criminal/Terrorist), (3) New Technology, (4) Health and Safety, (5) Endangered Resources, and (6) Investigations and Trials (Criminal/Legal/Other). Aspect: Each category has a set of predefined aspects. Each aspect describes one important element of an event. For example, for the category "Accidents and Natural Disasters", the aspe

In [90]:
grobid_latex_overview = dict()
for sections in all_articles[0]['latex_parse']['body_text']:
    grobid_latex_overview[sections['section']] = sections
    print(sections['section'],len(sections['cite_spans']))
    print(sections)
    print(10*'==')

Introduction 7
{'text': 'The goal of multi-document summarization (MDS) is to automatically generate a brief, well-organized summary for a topic which describes an event with a set of documents from different sources. BIBREF0 , BIBREF1 , BIBREF2 , BIBREF3 , BIBREF4 , BIBREF5 , BIBREF6 . In the typical setting of MDS, the input is a set of news documents about the same topic. The output summary is a piece of short text document containing several sentences, generated only based on the input original documents.', 'cite_spans': [{'start': 193, 'end': 200, 'text': None, 'latex': None, 'ref_id': 'BIBREF0'}, {'start': 203, 'end': 210, 'text': None, 'latex': None, 'ref_id': 'BIBREF1'}, {'start': 213, 'end': 220, 'text': None, 'latex': None, 'ref_id': 'BIBREF2'}, {'start': 223, 'end': 230, 'text': None, 'latex': None, 'ref_id': 'BIBREF3'}, {'start': 233, 'end': 240, 'text': None, 'latex': None, 'ref_id': 'BIBREF4'}, {'start': 243, 'end': 250, 'text': None, 'latex': None, 'ref_id': 'BIBREF5'}, 

{'text': 'VAESum BIBREF6 employs an alignment mechanism BIBREF12 , BIBREF13 to recall the lost detailed information from the input sentence. Inspired this idea, we design a jointly weighted alignment mechanism by considering the news sentence and the comment sentence simultaneously. For each decoder hidden state INLINEFORM0 , we align it with each news encoder hidden state INLINEFORM1 by an alignment vector INLINEFORM2 . We also align it with each comments encoder hidden state INLINEFORM3 by an alignment vector INLINEFORM4 . In order to filter the noisy information from the comments, we again employ the comment weight INLINEFORM5 to adjust the alignment vector of comments: DISPLAYFORM0 ', 'cite_spans': [{'start': 7, 'end': 14, 'text': None, 'latex': None, 'ref_id': 'BIBREF6'}, {'start': 46, 'end': 54, 'text': None, 'latex': None, 'ref_id': 'BIBREF12'}, {'start': 57, 'end': 65, 'text': None, 'latex': None, 'ref_id': 'BIBREF13'}], 'ref_spans': [], 'eq_spans': [{'start': 304, 'end': 315, 

{'text': 'Each topic is assigned to 4 experts, who are major in journalism, to conduct the summary writing. The task of summary writing is divided into two phases, namely, aspect facet identification, and summary generation. For the aspect facet identification, the experts read and digested all the news documents and reader comments under the topic. Then for each aspect, the experts extracted the related facets from the news document. The summaries were generated based on the annotated aspect facets. When selecting facets, one consideration is those facets that are popular in both news documents and reader comments have higher priority. Next, the facets that are popular in news documents have the next priority. The generated summary should cover as many aspects as possible, and should be well-organized using complete sentences with a length restriction of 100 words.', 'cite_spans': [], 'ref_spans': [], 'eq_spans': [], 'section': 'Data Collection'}
Data Collection 0
{'text': 'After fini

In [91]:
for article in all_articles[0]['latex_parse']['body_text']:
    print(article['section'],len(article['cite_spans']))
    print(article['text'])
    print(10*'==')

Introduction 7
The goal of multi-document summarization (MDS) is to automatically generate a brief, well-organized summary for a topic which describes an event with a set of documents from different sources. BIBREF0 , BIBREF1 , BIBREF2 , BIBREF3 , BIBREF4 , BIBREF5 , BIBREF6 . In the typical setting of MDS, the input is a set of news documents about the same topic. The output summary is a piece of short text document containing several sentences, generated only based on the input original documents.
Introduction 0
With the development of social media and mobile equipments, more and more user generated content is available. Figure FIGREF2 is a snapshot of reader comments under the news report “The most important announcements from Google's big developers' conference”. The content of the original news report talks about some new products based on AI techniques. The news report generally conveys an enthusiastic tone. However, while some readers share similar enthusiasms, some others expre

Each topic is assigned to 4 experts, who are major in journalism, to conduct the summary writing. The task of summary writing is divided into two phases, namely, aspect facet identification, and summary generation. For the aspect facet identification, the experts read and digested all the news documents and reader comments under the topic. Then for each aspect, the experts extracted the related facets from the news document. The summaries were generated based on the annotated aspect facets. When selecting facets, one consideration is those facets that are popular in both news documents and reader comments have higher priority. Next, the facets that are popular in news documents have the next priority. The generated summary should cover as many aspects as possible, and should be well-organized using complete sentences with a length restriction of 100 words.
Data Collection 0
After finishing the summary writing procedure, we employed another expert for scrutinizing the summaries. Each su

In [92]:
grobid_latex_overview = dict()
for sections in all_articles[0]['latex_parse']['body_text']:
    if sections['section'] in grobid_latex_overview:
        if grobid_latex_overview[sections['section']] == sections:
            continue
        else:
            grobid_latex_overview[sections['section']]['text'].append(sections['text'])
            grobid_latex_overview[sections['section']]['cite_spans'].append(sections['cite_spans'])
            grobid_latex_overview[sections['section']]['cite_span_lens'].append(len(sections['cite_spans']))
            grobid_latex_overview[sections['section']]['section'].append(sections['section'])
    else:
        grobid_latex_overview[sections['section']] = {'text':[sections['text']],   'cite_spans':[sections['cite_spans']],
                                                      'cite_span_lens':[len(sections['cite_spans'])],
                                                      'section':[sections['section']]}
        
    print(sections['section'],len(sections['cite_spans']))
    print(sections)
    print(10*'==')

Introduction 7
{'text': 'The goal of multi-document summarization (MDS) is to automatically generate a brief, well-organized summary for a topic which describes an event with a set of documents from different sources. BIBREF0 , BIBREF1 , BIBREF2 , BIBREF3 , BIBREF4 , BIBREF5 , BIBREF6 . In the typical setting of MDS, the input is a set of news documents about the same topic. The output summary is a piece of short text document containing several sentences, generated only based on the input original documents.', 'cite_spans': [{'start': 193, 'end': 200, 'text': None, 'latex': None, 'ref_id': 'BIBREF0'}, {'start': 203, 'end': 210, 'text': None, 'latex': None, 'ref_id': 'BIBREF1'}, {'start': 213, 'end': 220, 'text': None, 'latex': None, 'ref_id': 'BIBREF2'}, {'start': 223, 'end': 230, 'text': None, 'latex': None, 'ref_id': 'BIBREF3'}, {'start': 233, 'end': 240, 'text': None, 'latex': None, 'ref_id': 'BIBREF4'}, {'start': 243, 'end': 250, 'text': None, 'latex': None, 'ref_id': 'BIBREF5'}, 

{'text': 'VAESum BIBREF6 employs an alignment mechanism BIBREF12 , BIBREF13 to recall the lost detailed information from the input sentence. Inspired this idea, we design a jointly weighted alignment mechanism by considering the news sentence and the comment sentence simultaneously. For each decoder hidden state INLINEFORM0 , we align it with each news encoder hidden state INLINEFORM1 by an alignment vector INLINEFORM2 . We also align it with each comments encoder hidden state INLINEFORM3 by an alignment vector INLINEFORM4 . In order to filter the noisy information from the comments, we again employ the comment weight INLINEFORM5 to adjust the alignment vector of comments: DISPLAYFORM0 ', 'cite_spans': [{'start': 7, 'end': 14, 'text': None, 'latex': None, 'ref_id': 'BIBREF6'}, {'start': 46, 'end': 54, 'text': None, 'latex': None, 'ref_id': 'BIBREF12'}, {'start': 57, 'end': 65, 'text': None, 'latex': None, 'ref_id': 'BIBREF13'}], 'ref_spans': [], 'eq_spans': [{'start': 304, 'end': 315, 

{'text': 'Each topic is assigned to 4 experts, who are major in journalism, to conduct the summary writing. The task of summary writing is divided into two phases, namely, aspect facet identification, and summary generation. For the aspect facet identification, the experts read and digested all the news documents and reader comments under the topic. Then for each aspect, the experts extracted the related facets from the news document. The summaries were generated based on the annotated aspect facets. When selecting facets, one consideration is those facets that are popular in both news documents and reader comments have higher priority. Next, the facets that are popular in news documents have the next priority. The generated summary should cover as many aspects as possible, and should be well-organized using complete sentences with a length restriction of 100 words.', 'cite_spans': [], 'ref_spans': [], 'eq_spans': [], 'section': 'Data Collection'}
Data Collection 0
{'text': 'After fini

In [93]:
grobid_latex_overview['Introduction']

{'text': ['The goal of multi-document summarization (MDS) is to automatically generate a brief, well-organized summary for a topic which describes an event with a set of documents from different sources. BIBREF0 , BIBREF1 , BIBREF2 , BIBREF3 , BIBREF4 , BIBREF5 , BIBREF6 . In the typical setting of MDS, the input is a set of news documents about the same topic. The output summary is a piece of short text document containing several sentences, generated only based on the input original documents.',
  "With the development of social media and mobile equipments, more and more user generated content is available. Figure FIGREF2 is a snapshot of reader comments under the news report “The most important announcements from Google's big developers' conference”. The content of the original news report talks about some new products based on AI techniques. The news report generally conveys an enthusiastic tone. However, while some readers share similar enthusiasms, some others express their worri

In [94]:
overview_papers[all_articles[0]['paper_id']]['grobid_parse'] = grobid_parse_overview
overview_papers[all_articles[0]['paper_id']]['latex_parse'] = grobid_latex_overview

In [95]:
overview_papers

{'10164018': {'paper_id': '10164018',
  'metadata': {'title': 'Reader-Aware Multi-Document Summarization: An Enhanced Model and The First Dataset',
   'authors': [{'first': 'Piji', 'middle': [], 'last': 'Li', 'suffix': ''},
    {'first': 'Lidong', 'middle': [], 'last': 'Bing', 'suffix': ''},
    {'first': 'Wai', 'middle': [], 'last': 'Lam', 'suffix': ''}],
   'abstract': 'We investigate the problem of reader-aware multi-document summarization (RA-MDS) and introduce a new dataset for this problem. To tackle RA-MDS, we extend a variational auto-encodes (VAEs) based MDS framework by jointly considering news documents and reader comments. To conduct evaluation for summarization performance, we prepare a new dataset. We describe the methods for data collection, aspect annotation, and summary writing as well as scrutinizing by experts. Experimental results show that reader comments can improve the summarization performance, which also demonstrates the usefulness of the proposed dataset. The 

Для сравнения глазами ссылка на [статью](https://arxiv.org/pdf/1708.01065.pdf)


In [98]:
for num_tex,(k,v) in enumerate(grobid_latex_overview.items()):
    print(num_tex,k,sum(v['cite_span_lens']))

0 Introduction 13
1 Overview 1
2 Reader-Aware Salience Estimation 6
3 Summary Construction 6
4 Data Description 0
5 Background 0
6 Data Collection 0
7 Data Properties 0
8 Dataset and Metrics 1
9 Comparative Methods 6
10 Experimental Settings 2
11 Results on Our Dataset 0
12 Further Investigation of Our Framework  2
13 Case Study 0
14 Conclusions 0


In [97]:
for k,v in grobid_parse_overview.items():
    print(k,len(v['cite_spans']))

0 13
1 1
2 6
3 3
4 0
5 0
6 0
7 0
8 1
9 4
10 2
11 2
12 0
13 1


In [99]:
grobid_parse_overview = {k: v for k, v in sorted(grobid_parse_overview.items(), key=lambda item: len(item[1]['cite_spans']), reverse=True)}

In [103]:
grobid_latex_overview = {k: v for k, v in sorted(grobid_latex_overview.items(), key=lambda item: sum(item[1]['cite_span_lens']), reverse=True)}

In [104]:
grobid_latex_overview

{'Introduction': {'text': ['The goal of multi-document summarization (MDS) is to automatically generate a brief, well-organized summary for a topic which describes an event with a set of documents from different sources. BIBREF0 , BIBREF1 , BIBREF2 , BIBREF3 , BIBREF4 , BIBREF5 , BIBREF6 . In the typical setting of MDS, the input is a set of news documents about the same topic. The output summary is a piece of short text document containing several sentences, generated only based on the input original documents.',
   "With the development of social media and mobile equipments, more and more user generated content is available. Figure FIGREF2 is a snapshot of reader comments under the news report “The most important announcements from Google's big developers' conference”. The content of the original news report talks about some new products based on AI techniques. The news report generally conveys an enthusiastic tone. However, while some readers share similar enthusiasms, some others e

In [107]:
for k,v in grobid_latex_overview.items():
    print(k,sum(v['cite_span_lens']))

Introduction 13
Reader-Aware Salience Estimation 6
Summary Construction 6
Comparative Methods 6
Experimental Settings 2
Further Investigation of Our Framework  2
Overview 1
Dataset and Metrics 1
Background 0
Data Collection 0
Data Description 0
Data Properties 0
Results on Our Dataset 0
Case Study 0
Conclusions 0


### Применим для части и посмотрим глазами как выполняется сбор

In [144]:
overview_papers = dict()
for num_artic,article in enumerate(all_articles):
    if num_artic ==5:
        break
        
    overview_papers[article['paper_id']] = { 'paper_id':article['paper_id'],   'metadata':article['metadata'],
                                             's2_pdf_hash':article['s2_pdf_hash'], 'grobid_parse':None,'latex_parse':None}
    grobid_parse_overview = None
    if article['grobid_parse'] and article['grobid_parse']['body_text']:
        grobid_parse_overview = dict()
        for num_sec,sections in enumerate(article['grobid_parse']['body_text']):
            grobid_parse_overview[num_sec] = sections
        
        grobid_parse_overview = {k: v for k, v in sorted(grobid_parse_overview.items(), key=lambda item: len(item[1]['cite_spans']), reverse=True)}

    grobid_latex_overview = None
    if article['latex_parse'] and article['latex_parse']['body_text']:
        grobid_latex_overview = dict()
        for sections in article['latex_parse']['body_text']:
            if sections['section'] in grobid_latex_overview:
                if grobid_latex_overview[sections['section']] == sections:
                    continue
                else:
                    grobid_latex_overview[sections['section']]['text'].append(sections['text'])
                    grobid_latex_overview[sections['section']]['cite_spans'].append(sections['cite_spans'])
                    grobid_latex_overview[sections['section']]['cite_span_lens'].append(len(sections['cite_spans']))
                    grobid_latex_overview[sections['section']]['section'].append(sections['section'])
            else:
                grobid_latex_overview[sections['section']] = {'text':[sections['text']],   'cite_spans':[sections['cite_spans']],
                                                              'cite_span_lens':[len(sections['cite_spans'])],
                                                              'section':[sections['section']]}
        grobid_latex_overview = {k: v for k, v in sorted(grobid_latex_overview.items(), key=lambda item: item[1]['cite_span_lens'], reverse=True)}
    overview_papers[article['paper_id']]['grobid_parse'] = grobid_parse_overview
    overview_papers[article['paper_id']]['latex_parse'] = grobid_latex_overview

In [145]:
overview_papers.keys()

dict_keys(['10164018', '14472576', '17302615', '3243536', '3248240'])

In [138]:
key_id = '14472576'
print(overview_papers[key_id]['metadata']['title'],'\n----')
if overview_papers[key_id]['grobid_parse']:
    print('grobid')
    for k,v in overview_papers[key_id]['grobid_parse'].items():
        print(k,len(v['cite_spans']))

if overview_papers[key_id]['latex_parse']:
    print('---\nlatex')
    for k,v in overview_papers[key_id]['latex_parse'].items():
        print(k,sum(v['cite_span_lens']))

Building a Semantic Parser Overnight 
----
grobid
14 18
1 7
0 6
8 5
2 3
7 3
3 1
13 1
4 0
5 0
6 0
9 0
10 0
11 0
12 0


[Building a Semantic Parser Overnight](https://www.aclweb.org/anthology/P15-1129.pdf)

Как видим Related Work является последнмм перед References абзацем => работает верно

##### ====

In [139]:
key_id = '17302615'
print(overview_papers[key_id]['metadata']['title'],'\n----')
if overview_papers[key_id]['grobid_parse']:
    print('grobid')
    for k,v in overview_papers[key_id]['grobid_parse'].items():
        print(k,len(v['cite_spans']))

if overview_papers[key_id]['latex_parse']:
    print('---\nlatex')
    for k,v in overview_papers[key_id]['latex_parse'].items():
        print(k,sum(v['cite_span_lens']))

KLUE: Simple and robust methods for polarity classification 
----
grobid
3 6
9 4
0 3
1 1
2 1
6 1
7 1
4 0
5 0
8 0


In [142]:
overview_papers[key_id]['grobid_parse'][3]['text']

'Widely-used algorithms such as SentiStrength (Thelwall et al., 2010) rely heavily on dictionaries containing sentiment ratings of words and/or phrases. We use features based on an extended version of AFINN-111 (Nielsen, 2011) . 4 The AFINN sentiment dictionary contains sentiment ratings ranging from −5 (very negative) to 5 (very positive) for 2 476 word forms. In order to obtain a better coverage, we extended the dictionary with distributionally similar words. For this purpose, large-vocabulary distributional semantic models (DSM) were constructed from a version of the English Wikipedia 5 and the Google Web 1T 5-Grams database (Brants and Franz, 2006) . The Wikipedia DSM consists of 122 281 case-folded word forms as target terms and 30 484 mid-frequency content words (lemmatised) as feature terms; the Web1T5 DSM of 241 583 case-folded word forms as target terms and 100 063 case-folded word forms as feature terms. Both DSMs use a context window of two words to the left and right, and w

In [143]:
overview_papers[key_id]['grobid_parse'][9]['text']

'We use a resource-lean approach, relying only on three external resources: a stemmer, a relatively small sentiment dictionary and an even smaller list of emotion markers. Stemmers are already available for many languages and both kinds of lexical resources can be gathered relatively easily for other languages. The list of emotion markers should apply to most languages. This makes our whole system relatively language-independent, provided that a similar amount of manually labelled training data is available. 13 In fact, the learning curve for our system ( Fig. 1) suggests that even as few as 3 000-3 500 labelled messages might be sufficient. The similar Figure 1: Learning curve of our system for the "Message Polarity Classification" task, evaluated on the Twitter data evaluation results for the Twitter and the SMS data show that not relying on Twitter-specific features like hashtags pays off: by making our system as generic as possible, it is robust, not overfitted to the training data

[KLUE: Simple and robust methods for polarity classification ](https://www.aclweb.org/anthology/S13-2065.pdf)

Здесь нет ярко выраженной главы Related Work,на мой взгляд, всё выделилось верно

##### ====

In [150]:
key_id = '3243536'
print(overview_papers[key_id]['metadata']['title'],'\n----')
if overview_papers[key_id]['grobid_parse']:
    print('grobid')
    for k,v in overview_papers[key_id]['grobid_parse'].items():
        print(k,len(v['cite_spans']))

if overview_papers[key_id]['latex_parse']:
    print('---\nlatex')
    for k,v in overview_papers[key_id]['latex_parse'].items():
        print(k,sum(v['cite_span_lens']))

Boosting Variant Recognition With Light Semantics 
----
grobid
0 5
4 3
9 2
1 1
5 1
10 1
11 1
2 0
3 0
6 0
7 0
8 0
12 0


In [151]:
overview_papers[key_id]['grobid_parse'][0]['text']

'The recognition ,of paraphrases and variants is an important issue in several areas of infornmtion retrieval and text mlderstanding. Merging paraphrastic sentences ilnproves summarization by avoiding redundancy (Barzilay et al., 1999) . Term variant conilation enhances recall in intbrmation retrieval by pointing at documents that contain linguistic variants of (tuery terms (Arampatzis et al., 1998) .In (Jacquemin and Tzoukermann, 1999 ), a technique is proposed for the conflation of morpho-syntactic variants that relies solely on morphological and low-level syntactic features (part-of-speech category, munber agreement, morphological relationships, and phrase structure). An analysis of these results shows the limitation of this approach: correct and incorrect variants cannot be separated satisfactorily on a purely morpho-syntactic basis. Sonic additional lexical semantics must be taken into consideration.In this study we propose a reasonably simple, domain-independent, large-scale appr

[Boosting Variant Recognition With Light Semantics](https://www.aclweb.org/anthology/C00-1039.pdf)

Все верно

##### =====

In [153]:
key_id = '3248240'
print(overview_papers[key_id]['metadata']['title'],'\n----')
if overview_papers[key_id]['grobid_parse']:
    print('grobid')
    for k,v in overview_papers[key_id]['grobid_parse'].items():
        print(k,len(v['cite_spans']))

if overview_papers[key_id]['latex_parse']:
    print('---\nlatex')
    for k,v in overview_papers[key_id]['latex_parse'].items():
        print(k,sum(v['cite_span_lens']))

Multilingual, Efficient and Easy NLP Processing with IXA Pipeline 
----
grobid
5 6
1 5
4 5
6 3
7 3
8 2
3 1
0 0
2 0
9 0


In [154]:
overview_papers[key_id]['grobid_parse'][5]['text']

'Most of the NER systems nowdays consist of language independent systems (sometimes enriched with gazeteers) based on automatic learning of statistical models. ixa-pipe-nerc provides Named Entity Recognition (NER) for English and Spanish. The named entity types are based on the CONLL 2002 13 and 2003 14 tasks which were focused on language-independent supervised named entity recognition (NER) for four types of named entities: persons, locations, organizations and names of miscellaneous entities that do not belong to the previous three groups. We currently provide two very fast language independent models using a rather simple baseline featureset (e.g., similar to that of Curran and Clark (2003), except POS tag features). For English, perceptron models have been trained using CoNLL 2003 dataset. We currenly obtain 84.80 F1 which is coherent with other results reported with these features (Clark and Curran, 2003; Ratinov and Roth, 2009 ). The best Stanford NER model reported on this data

In [155]:
overview_papers[key_id]['grobid_parse'][1]['text']

'1 http://www.apache.org/licenses/LICENSE-2.0.html 2 Architecture IXA pipeline is primarily conceived as a set of ready to use tools that can provide efficient and accurate linguistic annotation without any installation/configuration/compilation effort. As in Unix-like operative systems, IXA pipeline consists of a set of processes chained by their standard streams, in a way that the output of each process feeds directly as input to the next one. The Unix pipeline metaphor has been applied for NLP tools by adopting a very simple and well known data centric architecture, in which every module/pipe is interchangeable for another one as long as it takes and produces the required data format.The data format in which both the input and output of the modules needs to be formatted to represent and filter linguistic annotations is KAF (Bosma et al., 2009) . KAF is a language neutral annotation format representing both morpho-syntactic and semantic annotation in a structured format. KAF was orig

In [156]:
overview_papers[key_id]['grobid_parse'][4]['text']

"ixa-pipe-pos provides POS tagging and lemmatization for English and Spanish. We have obtained the best results so far with the same featureset as in Collins's (2002) paper. Perceptron models for English have been trained and evaluated on the WSJ treebank using the usual partitions (e.g., as explained in Toutanova et al. (2003) . We currently obtain a performance of 97.07% vs 97.24% obtained by Toutanova et al., (2003) ). For Spanish, Maximum Entropy models have been trained and evaluated using the Ancora corpus; it was randomly divided in 90% for training and 10% for testing. This corresponds to 440K words used for training and 70K words for testing. We obtain a performance of 98.88% (the corpus partitions are available for reproducibility). Giménez and Marquez (2004) report 98.86%, although they train and test on a different subset of the Ancora corpus.Lemmatization is currently performed via 3 different dictionary lookup methods: (i) Simple Lemmatizer: It is based on HashMap lookups

[Multilingual, Efficient and Easy NLP Processing with IXA Pipeline](https://www.aclweb.org/anthology/E14-2002.pdf)

In [159]:
overview_papers[key_id]['grobid_parse'][8]['text'] # RW section

'Other NLP toolkits exist providing similar or more extensive functionalities than the IXA pipeline tools, although not many of them provide multilingual support. GATE (Cunningham, 2002) is an extensive framework supporting annotation of text. GATE has some capacity for wrapping Apache UIMA components 16 , so should be able to manage distributed NLP components. However, GATE is a very large and complex system, with a corresponding steep learning curve.Freeling (Padró and Stanilovsky, 2012) provides multilingual processing for a number of languages, incluing Spanish and English. As opposed to IXA pipeline, Freeling is a monolithic toolkit written in C++ which needs to be compiled natively. The Stanford CoreNLP 17 is a monolithic suite, which makes it difficult to integrate other tools in its chain.IXA pipeline tools can easily be used piping the input with the output of another too, and it is also possible to easily replace or extend the toolchain with a third-party tool. IXA pipeline i

**контрпример:** Как видим не всегда большее количество ссылок в RW части.