### This notebook provides some exploration of the dataset itself.
We will use these files in the dataset directory (you need to first download and unzip it):

data_set_citations.json: each entry corresponds to a mention of a dataset in a paper; the mention phrases are also given

files/text/*.txt: all the text files of the papers


In [3]:
import json, codecs, re
import os
import warnings
from collections import defaultdict

import numpy as np
import pandas as pd

In [4]:
## dataset directory
# Root of the downloaded and unzipped dataset. The trailing slash matters:
# paths below are built by plain string concatenation (DIR + name).
DIR = "../train_test/"

In [5]:
## function to extract all papers
def _extract(dir_name='files/text/', extension='.txt', encoding='utf-8'):
    """Load and normalize the text of every paper under ``DIR + dir_name``.

    Each file whose name ends with ``extension`` is read, its lines are
    stripped and joined with single spaces, every digit is replaced by
    ``0``, and a word broken by a hyphen followed by a space
    ("data- set") is re-joined ("dataset").

    Parameters
    ----------
    dir_name : str
        Directory holding the paper files, relative to the module-level
        ``DIR`` (joined by string concatenation).
    extension : str
        Only file names ending with this suffix are loaded.
    encoding : str
        Encoding used to decode the files. Explicit so the result does not
        depend on the platform's locale default.

    Returns
    -------
    dict
        Maps each file name (not the full path) to its normalized text.
        Files that cannot be read or decoded are skipped with a warning.
    """
    dir_name = DIR + dir_name
    full_text = {}
    for item in os.listdir(dir_name):
        if not item.endswith(extension):
            continue
        file_name = os.path.abspath(os.path.join(dir_name, item))
        with open(file_name, 'r', encoding=encoding) as f:
            try:
                lines = f.readlines()
            except (OSError, UnicodeError) as err:
                # Stay best-effort (one bad file must not abort the load),
                # but say so instead of silently dropping the paper.
                warnings.warn(
                    'skipping {!r}: could not read as {}: {}'.format(
                        file_name, encoding, err))
                continue
        #TODO document structure
        text = ' '.join(s.strip() for s in lines)
        # Collapse every digit to 0 so numbers only keep their shape.
        text = re.sub(r'\d', '0', text)
        # Re-join hyphen-split words. The character before the hyphen is
        # captured and put back; matching it without a group deleted it.
        text = re.sub(r'([^ ])- ', r'\1', text)
        full_text[item] = text
    return full_text

In [6]:
# Citation annotations (one row per entry) and the text of every paper file.
data_set_citations = pd.read_json(
    DIR + 'data_set_citations.json',
    encoding='utf-8',
)
full_text = _extract()
print('total number of publications given:', len(full_text))

total number of publications given: 5000


### Although there are 5K papers given, not all of them are annotated. We will only use annotated data for training and evaluation. The annotations can be retrieved from data_set_citations. 

In [7]:
# The data_set_id column as a raw array: one id per citation entry.
data_set_ids = data_set_citations.loc[:, 'data_set_id'].values

In [8]:
# Number of citation entries (dataset ids counted with repeats).
print('Total entries: ', data_set_ids.size)

Total entries:  5499


In [9]:
# Number of distinct dataset ids among all entries.
print('Number of datasets: ', len(frozenset(data_set_ids)))

Number of datasets:  1028


In [10]:
# Number of distinct publication ids that appear in the citation entries.
print('Number of annotated papers: ', len(frozenset(data_set_citations['publication_id'])))

Number of annotated papers:  2500


In [11]:
## retrieve full text of annotated papers
# Paper files are keyed as "<publication_id>.txt"; the set drops the repeats
# that come from a paper appearing in several citation entries.
publication_with_mentions_ids = {
    str(pub_id) + '.txt'
    for pub_id in data_set_citations['publication_id'].values
}
# Keep only the texts of annotated papers (KeyError if a file is missing).
publication_with_mentions = {
    pub: full_text[pub] for pub in publication_with_mentions_ids
}

In [12]:
# Length is measured in whitespace-separated tokens of the normalized text.
print('average length of annotated papers: ',
      np.mean([len(text.split()) for text in publication_with_mentions.values()]))


average length of annotated papers:  6909.4828


In [13]:
# Group the citation entries by paper:
#   publication_dataset: publication_id -> data_set_id of each of its entries
#   publication_mention: publication_id -> all mention phrases of its entries
publication_dataset = defaultdict(list)
publication_mention = defaultdict(list)
# Walk the three columns in parallel rather than looking up `.loc[i]` per
# row: the label lookup only works when the index is exactly 0..n-1 and
# builds a throwaway Series for every row.
for pub_id, ds_id, mentions in zip(data_set_citations['publication_id'],
                                   data_set_citations['data_set_id'],
                                   data_set_citations['mention_list']):
    publication_dataset[pub_id].append(ds_id)
    publication_mention[pub_id].extend(mentions)

In [14]:
# One list of dataset ids per paper; average the list lengths.
print('average number of datasets used in each paper: ',
      np.mean([len(ds_ids) for ds_ids in publication_dataset.values()]))

average number of datasets used in each paper:  2.1996


In [15]:
# One list of mention phrases per paper; average the list lengths.
print('average number of mentions per paper: ',
      np.mean([len(mentions) for mentions in publication_mention.values()]))

average number of mentions per paper:  7.4952


In [20]:
# Mention length in whitespace-separated tokens, averaged over every mention
# of every paper.
print('mean mention length: ',
      np.mean([len(mention.split())
               for mentions in publication_mention.values()
               for mention in mentions]))

mean mention length:  4.663678087309211


### In summary, we are using an annotated dataset of 2.5K papers. In this dataset, there are 2.2 datasets/paper and 7.5 mentions/paper on average.