# Imports

In [1]:
import gzip
import pickle
import numpy as np
import os
from tqdm.notebook import tqdm
from datetime import datetime
import jsonlines

# Collecting data from Corpus

Download corpus from
https://api.semanticscholar.org/corpus/download/

Release downloaded: 2021-09-01

From the root folder, run the following commands (the files must end up in `corpus/2021-09-01`, which is where the notebook looks for them):

```
mkdir -p corpus/2021-09-01
cd corpus/2021-09-01
wget https://s3-us-west-2.amazonaws.com/ai2-s2-research-public/open-corpus/2021-09-01/manifest.txt
wget -B https://s3-us-west-2.amazonaws.com/ai2-s2-research-public/open-corpus/2021-09-01/ -i manifest.txt
```

In [2]:
# Move to the repo root folder (assumed to be the parent of the folder the kernel starts in).
# WARNING: a plain os.chdir('../') climbs one more level every time the cell is re-run.
# The root is therefore remembered in REPO_ROOT the first time the cell runs, and any
# further run in the same kernel simply goes back to that folder.
try:
    os.chdir(REPO_ROOT)
except NameError:
    # first run in this kernel: REPO_ROOT does not exist yet
    os.chdir('../')
    REPO_ROOT = os.getcwd()

In [3]:
# Corpus release to process; it names the sub-folder on both the input and the output side.
corpus_version = '2021-09-01'
# corpus_folder: where you have the corpus
# data_folder:   where you save the data
corpus_folder, data_folder = (
    os.path.join(base, corpus_version) for base in ('corpus', 'data')
)

In [4]:
# Keywords to study, each mapped to the words searched in the papers.
# A paper is selected for a keyword when at least one of its words occurs in the title or
# in the abstract. The match is a plain substring test on lowercased text, so:
#   - write the words in lowercase;
#   - a stem also matches longer words (e.g. "centraliz" matches "decentralization").
words_to_check_dict = {
    "decentralization": ["centraliz", "centralis"],
    "internet": ["internet"],
    "wireless": ["wireless"],
    "social media": ["social media", "social network"],
    "hiv": ["hiv"],
    "covid": ["covid", "coronavirus"],
    "dark web": ["deep web", "dark web"],
}

In [5]:
# Scan the whole corpus once, doing two things for every paper:
#   1. select the papers matching each keyword (-> all_docs_dict);
#   2. accumulate statistics on the whole dataset (papers and authors per year and fields).
# The selected papers are pickled at the end of the cell, one gzip file per keyword.

# select the papers with the chosen keywords: {keyword: {paper id: paper record}}
all_docs_dict = {key:{} for key in words_to_check_dict}
# also save some statistics on the whole dataset
no_papers_in_fields_by_year = {}     # {year: {fields tuple: number of papers}}
sets_authors_in_fields_by_year = {}  # {year: {fields tuple: set of author ids}}
sets_authors_by_year = {}            # {year: set of author ids}
no_authors_in_fields_by_year = {}    # left empty here; filled when the statistics are saved
no_authors_by_year = {}              # left empty here; filled when the statistics are saved

count = 0  # total number of papers read
start = datetime.now()

for ID in tqdm(range(6000)):
    # '%.3d' pads the file number to at least three digits (000, 001, ..., 5999)
    filename = os.path.join(corpus_folder,'s2-corpus-%.3d.gz'%ID)
    with gzip.open(filename, 'rb') as f:
        # each line of the file is a dictionary with the paper's info
        for x in jsonlines.Reader(f):
            count += 1
            # a null title/abstract is treated as empty text instead of crashing on .lower()
            title = (x['title'] or '').lower()
            abstract = (x['paperAbstract'] or '').lower()

            for keyword,words_to_check in words_to_check_dict.items():
                # keep the paper if it has at least one of the words to check
                # (if you want to check only in title/abstract, change it here)
                if any(word in title or word in abstract for word in words_to_check):
                    all_docs_dict[keyword][x['id']] = x.copy()

            # get statistics of whole dataset
            year = x['year']
            # a null list of fields is counted as "no fields" (empty tuple)
            fields = tuple(x['fieldsOfStudy'] or ())
            papers_in_year = no_papers_in_fields_by_year.setdefault(year, {})
            authors_in_fields = sets_authors_in_fields_by_year.setdefault(year, {}).setdefault(fields, set())
            authors_in_year = sets_authors_by_year.setdefault(year, set())
            papers_in_year[fields] = papers_in_year.get(fields, 0) + 1
            for author in x['authors']:
                authors_in_fields.update(author['ids'])
                authors_in_year.update(author['ids'])

    end = datetime.now()

    # progress report every 1000 files
    if (ID+1) % 1000 == 0:
        print(f'Read {ID+1} files after {end-start}.',flush=True)
        for keyword,papers in all_docs_dict.items():
            print(f'\tkeyword: {keyword} - no.papers: {len(papers)}')

# save the selected papers, one folder per keyword
for keyword in words_to_check_dict:
    keyword_data_folder = os.path.join(data_folder, keyword)
    os.makedirs(keyword_data_folder,exist_ok=True)
    with gzip.open(os.path.join(keyword_data_folder, 'papers_dict.pkl.gz'), 'wb') as fp:
        pickle.dump(all_docs_dict[keyword],fp)

  0%|          | 0/6000 [00:00<?, ?it/s]

Read 1000 files after 0:42:41.777740.
	keyword: decentralization - no.papers: 68095
	keyword: internet - no.papers: 184577
	keyword: wireless - no.papers: 120361
	keyword: social media - no.papers: 65607
	keyword: hiv - no.papers: 206670
	keyword: covid - no.papers: 69903
	keyword: dark web - no.papers: 420
Read 2000 files after 1:26:39.296197.
	keyword: decentralization - no.papers: 136081
	keyword: internet - no.papers: 369254
	keyword: wireless - no.papers: 240487
	keyword: social media - no.papers: 131313
	keyword: hiv - no.papers: 412043
	keyword: covid - no.papers: 140162
	keyword: dark web - no.papers: 872
Read 3000 files after 2:11:41.723527.
	keyword: decentralization - no.papers: 204467
	keyword: internet - no.papers: 553871
	keyword: wireless - no.papers: 360596
	keyword: social media - no.papers: 197391
	keyword: hiv - no.papers: 616948
	keyword: covid - no.papers: 209546
	keyword: dark web - no.papers: 1356
Read 4000 files after 2:57:37.590076.
	keyword: decentralization -

Counting how many papers among the selected ones have each (important) attribute

In [11]:
def _is_empty(value):
    """Return True when `value` is None or has length zero."""
    return value is None or len(value) == 0


def _has_no_citations(paper_dict):
    """Return True when the paper has neither incoming nor outgoing citations."""
    return len(paper_dict['inCitations']) == 0 and len(paper_dict['outCitations']) == 0


def _split_by_missing(papers, is_missing):
    """Split `papers` according to the predicate `is_missing`.

    papers: dict mapping each paper id to its record (a dict).
    is_missing: function taking a record and returning True when the attribute is absent.
    Returns a pair: the list of ids of the papers lacking the attribute, and the number
    of papers having it.
    """
    missing = [paper for paper, paper_dict in papers.items() if is_missing(paper_dict)]
    return missing, len(papers) - len(missing)


# For every keyword, report how many of the selected papers have each attribute.
# Each no_* list holds the ids of the papers lacking the attribute; once the loop is
# over, the lists refer to the last keyword processed.
for keyword,papers in all_docs_dict.items():
    print(f'keyword: {keyword}\n\nno.papers: {len(papers)}')

    no_doi, count_doi = _split_by_missing(papers, lambda p: _is_empty(p['doi']))
    print(f'{count_doi} have doi')

    no_year, count_year = _split_by_missing(papers, lambda p: p['year'] is None)
    print(f'{count_year} have year')

    # papers without fields are collected in no_fields
    # (they used to be appended to no_year by mistake, leaving no_fields always empty)
    no_fields, count_fields = _split_by_missing(papers, lambda p: _is_empty(p['fieldsOfStudy']))
    print(f'{count_fields} have fields')

    no_cit, count_cit = _split_by_missing(papers, _has_no_citations)
    print(f'{count_cit} have at least one reference or citation')

    # a paper is "good" only if none of the attributes above, nor abstract and title, is missing
    no_good, count_good = _split_by_missing(
        papers,
        lambda p: (
            _is_empty(p['doi'])
            or p['year'] is None
            or _is_empty(p['fieldsOfStudy'])
            or _is_empty(p['paperAbstract'])
            or _is_empty(p['title'])
            or _has_no_citations(p)
        ),
    )
    print(f'{count_good} have everything (have doi, year, fields, abstract, title, and at least one reference or citation)')

    print("\n\n")

keyword: decentralization

no.papers: 408231
236927 have doi
406816 have year
369744 have fields
294720 have at least one reference or citation
194851 have everything (have doi, year, fields, abstract, title, and at least one reference or citation)



keyword: internet

no.papers: 1106677
563575 have doi
1102442 have year
1006002 have fields
709246 have at least one reference or citation
435890 have everything (have doi, year, fields, abstract, title, and at least one reference or citation)



keyword: wireless

no.papers: 721127
433089 have doi
718007 have year
659158 have fields
545205 have at least one reference or citation
386614 have everything (have doi, year, fields, abstract, title, and at least one reference or citation)



keyword: social media

no.papers: 393772
251130 have doi
392958 have year
363665 have fields
300067 have at least one reference or citation
200122 have everything (have doi, year, fields, abstract, title, and at least one reference or citation)



keyword: 

Now saving the dataset statistics

In [7]:
def _dump_gz(obj, filename):
    """Pickle `obj` into the gzip-compressed file `filename` inside data_folder."""
    with gzip.open(os.path.join(data_folder, filename), 'wb') as out:
        pickle.dump(obj, out)


# statistics exactly as they were collected
_dump_gz(no_papers_in_fields_by_year, 'no_papers_in_fields_by_year.pkl.gz')
_dump_gz(sets_authors_in_fields_by_year, 'sets_authors_in_fields_by_year.pkl.gz')

# number of authors per year and fields: the size of each collection of author ids
for yr, authors_per_fields in sets_authors_in_fields_by_year.items():
    no_authors_in_fields_by_year[yr] = {
        flds: len(ids) for flds, ids in authors_per_fields.items()
    }
_dump_gz(no_authors_in_fields_by_year, 'no_authors_in_fields_by_year.pkl.gz')

# number of authors per year
for yr, ids in sets_authors_by_year.items():
    no_authors_by_year[yr] = len(ids)
_dump_gz(no_authors_by_year, 'no_authors_by_year.pkl.gz')

print('Everything dumped successfully!')

Everything dumped successfully!
