# Imports

In [1]:
import gzip
import pickle
import numpy as np
# move to repo root directory
# ACHTUNG: do only once!! 
import os
os.chdir('../')
from tqdm.notebook import tqdm
from datetime import datetime
import jsonlines

import sys
sys.path.insert(0, os.path.join(os.getcwd(),"utils"))
from hsbm_creation import *

[nltk_data] Downloading package wordnet to
[nltk_data]     /data/home/ahw701/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /data/home/ahw701/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Collecting data from Corpus

Download corpus from
https://api.semanticscholar.org/corpus/download/

Release downloaded: 2021-09-01 release

From the root folder, run the following commands:

```
mkdir corpus
cd corpus
mkdir 2022-01-01
cd 2022-01-01
wget https://s3-us-west-2.amazonaws.com/ai2-s2-research-public/open-corpus/2022-01-01/manifest.txt
wget -B https://s3-us-west-2.amazonaws.com/ai2-s2-research-public/open-corpus/2022-01-01/ -i manifest.txt
```

In [2]:
corpus_version = '2022-01-01'
corpus_folder = os.path.join('corpus', corpus_version) # where you have the corpus
data_folder = os.path.join('data', corpus_version) # where you save the data

In [7]:
# choose keyword and the words you want to check to be inside the title or abstract
words_to_check_dict = {
    'decentralization':['centraliz','centralis'],
    'internet': ['internet'],
    'wireless': ['wireless'],
    'social_media': ['social media', 'social network'],
    'covid': ['covid', 'coronavirus'],
}

In [8]:
# select the papers with the chosen keywords
all_docs_dict = {key:{} for key in words_to_check_dict}
# also save some statistics on the whole dataset
no_papers_in_fields_by_year = {}
sets_authors_in_fields_by_year = {}
sets_authors_by_year = {}
no_authors_in_fields_by_year = {}
no_authors_by_year = {}
no_citations_by_year = {}

count = 0
start = datetime.now()

for ID in tqdm(range(6000)):
    filename = os.path.join(corpus_folder,'s2-corpus-%.3d.gz'%ID)
    with gzip.open(filename, 'rb') as f:
        for item in jsonlines.Reader(f):
            # each line of the file is a dictionary with the paper's info
            x=item
            count += 1
            title = x['title'].lower()
            abstract = x['paperAbstract'].lower()
            
            for keyword,words_to_check in words_to_check_dict.items():
                # check if this paper has the words to check
                add_it = False
                for word in words_to_check:
                    if word in title or word in abstract: # if you want to check only in title/abstract, change it here
                        add_it = True
                        break
                if add_it:
                    all_docs_dict[keyword][x['id']] = x.copy()
                
            # get statistics of whole dataset
            year = x['year']
            if year not in no_papers_in_fields_by_year:
                no_papers_in_fields_by_year[year] = {}
                sets_authors_in_fields_by_year[year] = {}
                sets_authors_by_year[year] = set()
                no_citations_by_year[year] = 0
            year_dict = no_papers_in_fields_by_year[year]
            fields = tuple(x['fieldsOfStudy'])
            if fields not in year_dict:
                year_dict[fields] = 1
                sets_authors_in_fields_by_year[year][fields] = set()
            else:
                year_dict[fields] += 1
            for authors in x['authors']:
                sets_authors_in_fields_by_year[year][fields].update(authors['ids'])
                sets_authors_by_year[year].update(authors['ids'])
            no_citations_by_year[year] += len(x['outCitations'])
                
    end = datetime.now()

    if (ID+1) % 1000 == 0:
        print(f'Read {ID+1} files after {end-start}.',flush=True)
        for keyword,papers in all_docs_dict.items():
              print(f'\tkeyword: {keyword} - no.papers: {len(papers)}')

for keyword in words_to_check_dict:
    keyword_data_folder = os.path.join(data_folder, keyword)
    os.makedirs(keyword_data_folder,exist_ok=True)
    with gzip.open(os.path.join(keyword_data_folder, 'papers_dict.pkl.gz'), 'wb') as fp:
        pickle.dump(all_docs_dict[keyword],fp)

  0%|          | 0/6000 [00:00<?, ?it/s]

Read 1000 files after 0:31:31.020598.
	keyword: decentralization - no.papers: 70951
	keyword: internet - no.papers: 193869
	keyword: wireless - no.papers: 123579
	keyword: social_media - no.papers: 71005
	keyword: covid - no.papers: 100304
Read 2000 files after 1:03:39.280659.
	keyword: decentralization - no.papers: 141787
	keyword: internet - no.papers: 387643
	keyword: wireless - no.papers: 246934
	keyword: social_media - no.papers: 142190
	keyword: covid - no.papers: 201525
Read 3000 files after 1:36:25.819944.
	keyword: decentralization - no.papers: 213002
	keyword: internet - no.papers: 581522
	keyword: wireless - no.papers: 370181
	keyword: social_media - no.papers: 214049
	keyword: covid - no.papers: 302259
Read 4000 files after 2:10:30.162460.
	keyword: decentralization - no.papers: 283616
	keyword: internet - no.papers: 775007
	keyword: wireless - no.papers: 493468
	keyword: social_media - no.papers: 285748
	keyword: covid - no.papers: 404596
Read 5000 files after 2:44:09.2288

Saving now dataset statistics

In [9]:
total_no_papers = sum([sum([year_dict[field] for field in year_dict.keys()]) for year_dict in no_papers_in_fields_by_year.values()])
print(f"Total number of papers in dataset (version {corpus_version}): {total_no_papers}")
with gzip.open(os.path.join(data_folder, 'no_papers_in_fields_by_year.pkl.gz'), 'wb') as fp:
    pickle.dump(no_papers_in_fields_by_year,fp)

    
total_no_citations = sum([no_citations_by_year[year] for year in no_citations_by_year.keys()])
print(f"Total number of citations in dataset (version {corpus_version}): {total_no_citations}")
with gzip.open(os.path.join(data_folder, 'no_citations_by_year.pkl.gz'), 'wb') as fp:
    pickle.dump(no_citations_by_year,fp)

    
sets_authors = set()
for year, authors in sets_authors_by_year.items():
    sets_authors.update(authors)
total_no_authors = len(sets_authors)
print(f"Total number of authors in dataset (version {corpus_version}): {total_no_authors}")

with gzip.open(os.path.join(data_folder, 'sets_authors_in_fields_by_year.pkl.gz'), 'wb') as fp:
    pickle.dump(sets_authors_in_fields_by_year,fp)

for year, year_dict in sets_authors_in_fields_by_year.items():
    no_authors_in_fields_by_year[year] = {}
    for fields, set_authors in year_dict.items():
        no_authors_in_fields_by_year[year][fields] = len(set_authors)
with gzip.open(os.path.join(data_folder, 'no_authors_in_fields_by_year.pkl.gz'), 'wb') as fp:
    pickle.dump(no_authors_in_fields_by_year,fp)

for year, year_dict in sets_authors_by_year.items():
    no_authors_by_year[year] = len(year_dict)
with gzip.open(os.path.join(data_folder, 'no_authors_by_year.pkl.gz'), 'wb') as fp:
    pickle.dump(no_authors_by_year,fp)
print('\nEverything dumped successfully!')

Total number of papers in dataset (version 2022-01-01): 203627381
Total number of citations in dataset (version 2022-01-01): 1993294456
Total number of authors in dataset (version 2022-01-01): 76438972

Everything dumped successfully!


# Checks

Counting how many papers among the selected ones have each (important) attribute

In [None]:
# load files
all_docs_dict = {}
for keyword in words_to_check_dict:
    keyword_data_folder = os.path.join(data_folder, keyword)
    os.makedirs(keyword_data_folder,exist_ok=True)
    with gzip.open(os.path.join(keyword_data_folder, 'papers_dict.pkl.gz'), 'rb') as fp:
        all_docs_dict[keyword] = pickle.load(fp)

In [11]:
for keyword,papers in all_docs_dict.items():
    print(f'keyword: {keyword}\n\nno.papers: {len(papers)}')
    
    no_title = []
    count_title = 0
    for paper, paper_dict in papers.items():
        if paper_dict['title'] is None or len(paper_dict['title']) == 0:
            no_title.append(paper)
        else:
            count_title += 1
    print(f'{count_title} have title')

    no_paperAbstract = []
    count_paperAbstract = 0
    for paper, paper_dict in papers.items():
        if paper_dict['paperAbstract'] is None or len(paper_dict['paperAbstract']) == 0:
            no_paperAbstract.append(paper)
        else:
            count_paperAbstract += 1
    print(f'{count_paperAbstract} have paperAbstract')

    no_authors = []
    count_authors = 0
    for paper, paper_dict in papers.items():
        if paper_dict['authors'] is None or len(paper_dict['authors']) == 0:
            no_authors.append(paper)
        else:
            count_authors += 1
    print(f'{count_authors} have authors')

    no_doi = []
    count_doi = 0
    for paper, paper_dict in papers.items():
        if paper_dict['doi'] is None or len(paper_dict['doi']) == 0:
            no_doi.append(paper)
        else:
            count_doi += 1
    print(f'{count_doi} have doi')

    no_year = []
    count_year = 0
    for paper, paper_dict in papers.items():
        if paper_dict['year'] is None:
            no_year.append(paper)
        else:
            count_year += 1
    print(f'{count_year} have year')

    no_cit = []
    count_cit = 0
    for paper, paper_dict in papers.items():
        if ( len(paper_dict['inCitations']) == 0 and len(paper_dict['outCitations']) == 0 ):
            no_cit.append(paper)
        else:
            count_cit += 1
    print(f'{count_cit} have at least one reference or citation')
    
    no_fields = []
    count_fields = 0
    for paper, paper_dict in papers.items():
        if paper_dict['fieldsOfStudy'] is None or len(paper_dict['fieldsOfStudy']) == 0:
            no_fields.append(paper)
        else:
            count_fields += 1
    print(f'{count_fields} have fields')
    
    no_venue = []
    count_venue = 0
    for paper, paper_dict in papers.items():
        if paper_dict['venue'] is None or len(paper_dict['venue']) == 0:
            no_venue.append(paper)
        else:
            count_venue += 1
    print(f'{count_venue} have venue')
    
    no_journalName = []
    count_journalName = 0
    for paper, paper_dict in papers.items():
        if paper_dict['journalName'] is None or len(paper_dict['journalName']) == 0:
            no_journalName.append(paper)
        else:
            count_journalName += 1
    print(f'{count_journalName} have journalName')
    
    no_good = []
    count_good = 0
    for paper, paper_dict in papers.items():
        if paper_dict['doi'] is None or len(paper_dict['doi']) == 0 \
        or paper_dict['year'] is None \
        or paper_dict['fieldsOfStudy'] is None or len(paper_dict['fieldsOfStudy']) == 0 \
        or paper_dict['paperAbstract'] is None or len(paper_dict['paperAbstract']) == 0 \
        or paper_dict['title'] is None or len(paper_dict['title']) == 0 \
        or ( len(paper_dict['inCitations']) == 0 and len(paper_dict['outCitations']) == 0 ):
            no_good.append(paper)
        else:
            count_good += 1
    print(f'{count_good} have everything we need (have doi, year, fields, abstract, title, and at least one reference or citation)')
    
    print("\n\n")

keyword: decentralization

no.papers: 425144
425144 have title
396201 have paperAbstract
421611 have authors
253464 have doi
423431 have year
305639 have at least one reference or citation
377720 have fields
143802 have venue
234888 have journalName
201713 have everything we need (have doi, year, fields, abstract, title, and at least one reference or citation)



keyword: internet

no.papers: 1161738
1161738 have title
1049640 have paperAbstract
1149999 have authors
616609 have doi
1156545 have year
743543 have at least one reference or citation
1026234 have fields
373741 have venue
544689 have journalName
455236 have everything we need (have doi, year, fields, abstract, title, and at least one reference or citation)



keyword: wireless

no.papers: 740282
740282 have title
684581 have paperAbstract
733367 have authors
451359 have doi
736701 have year
556684 have at least one reference or citation
667883 have fields
361225 have venue
432886 have journalName
393490 have everything we ne

Now we check how many papers have a valid title and have at least one inCitation or outCitation to another paper in the same dataset

In [12]:
citations_df_dict = {}
for keyword,papers in all_docs_dict.items():
    papers_with_texts = set([paper_id for paper_id in papers if papers[paper_id]["title"] is not None and len(papers[paper_id]["title"]) > 0])
    citations_df_dict[keyword] =  create_citations_edgelist(papers, papers_with_texts)
    papers_with_cits = set(list(citations_df_dict[keyword]['from'].values) + list(citations_df_dict[keyword]['to'].values))
    print(f'keyword: {keyword}\narticles in the citation layer (with degree > 0): {len(papers_with_cits)}\n\n')

keyword: decentralization
articles in the citation layer (with degree > 0): 181605


keyword: internet
articles in the citation layer (with degree > 0): 517171


keyword: wireless
articles in the citation layer (with degree > 0): 491981


keyword: social_media
articles in the citation layer (with degree > 0): 253555


keyword: covid
articles in the citation layer (with degree > 0): 353563




# Count fields of study

## Whole corpus

In [4]:
with gzip.open(os.path.join(data_folder, 'no_papers_in_fields_by_year.pkl.gz'), 'rb') as fp:
    no_papers_in_fields_by_year = pickle.load(fp)

Count all pairs differently

In [17]:
no_papers_in_fields = {}
for year, year_dict in no_papers_in_fields_by_year.items():
    for fields, no_papers in year_dict.items():
        if fields not in no_papers_in_fields:
            no_papers_in_fields[fields] = 0
        no_papers_in_fields[fields] += no_papers
no_papers_in_fields_df = pd.DataFrame(data={"fieldsOfStudy":list(no_papers_in_fields.keys()), "no_papers":list(no_papers_in_fields.values())})
no_papers_in_fields_df.sort_values(by="no_papers", ascending=False).head(50)

Unnamed: 0,fieldsOfStudy,no_papers
9,(),38661280
2,"(Medicine,)",34492181
0,"(Computer Science,)",11981398
3,"(Chemistry,)",11303138
7,"(Materials Science,)",10166490
6,"(Engineering,)",8894928
5,"(Biology,)",8274902
20,"(Physics,)",7127389
4,"(Political Science,)",6762544
16,"(Psychology,)",6723527


Count only 1st field

In [18]:
no_papers_in_fields = {}
for year, year_dict in no_papers_in_fields_by_year.items():
    for fields, no_papers in year_dict.items():
        if fields is not None and len(fields)>0:
            field = fields[0]
            if field not in no_papers_in_fields:
                no_papers_in_fields[field] = 0
            no_papers_in_fields[field] += no_papers
no_papers_in_fields_df = pd.DataFrame(data={"1st_field":list(no_papers_in_fields.keys()), "no_papers":list(no_papers_in_fields.values())})
no_papers_in_fields_df.sort_values(by="no_papers", ascending=False).head(50)

Unnamed: 0,1st_field,no_papers
1,Medicine,36703459
2,Chemistry,14673850
4,Biology,13629011
0,Computer Science,13209583
6,Materials Science,11147427
5,Engineering,9442314
11,Psychology,8083227
15,Physics,7789393
3,Political Science,6981298
9,Art,6525083


Count fractionary fields

In [31]:
no_papers_in_fields = {}
for year, year_dict in no_papers_in_fields_by_year.items():
    for fields, no_papers in year_dict.items():
        if fields is not None and len(fields)>0:
            for field in fields:
                if field not in no_papers_in_fields:
                    no_papers_in_fields[field] = 0
                no_papers_in_fields[field] += float(no_papers)/len(fields)
no_papers_in_fields_df = pd.DataFrame(data={"fractionary_field":list(no_papers_in_fields.keys()), "no_papers":list(no_papers_in_fields.values())})
no_papers_in_fields_df.sort_values(by="no_papers", ascending=False).head(50)

Unnamed: 0,fractionary_field,no_papers
1,Medicine,41691770.0
0,Computer Science,13333090.0
2,Chemistry,13305840.0
4,Biology,11417040.0
6,Materials Science,10784380.0
5,Engineering,9468510.0
15,Physics,7645256.0
10,Psychology,7601126.0
3,Political Science,6928388.0
9,Art,6499334.0


## Decentralization

In [20]:
words_to_check = ['decentralization']

In [21]:
all_docs_dict = {}
for keyword in words_to_check:
    keyword_data_folder = os.path.join(data_folder, keyword)
    os.makedirs(keyword_data_folder,exist_ok=True)
    with gzip.open(os.path.join(keyword_data_folder, 'papers_dict.pkl.gz'), 'rb') as fp:
        all_docs_dict = pickle.load(fp)

Count all pairs differently

In [41]:
no_papers_in_fields = {}
for paper_id, paper in all_docs_dict.items():
    if 'fieldsOfStudy' in paper and paper['fieldsOfStudy'] is not None and len(paper['fieldsOfStudy']) > 0:
        fields = tuple(paper['fieldsOfStudy'])
        if fields not in no_papers_in_fields:
            no_papers_in_fields[fields] = 1
        else:
            no_papers_in_fields[fields] += 1
no_papers_in_fields_df = pd.DataFrame(data={"fieldsOfStudy":list(no_papers_in_fields.keys()), "no_papers":list(no_papers_in_fields.values())})
no_papers_in_fields_df.sort_values(by="no_papers", ascending=False).head(50)

Unnamed: 0,fieldsOfStudy,no_papers
5,"(Computer Science,)",99230
2,"(Political Science,)",49987
6,"(Engineering,)",37469
8,"(Medicine,)",29223
3,"(Business,)",29134
0,"(Economics,)",26716
1,"(Sociology,)",15869
13,"(Geography,)",13211
11,"(Mathematics,)",8192
22,"(Art,)",6474


In [42]:
df = no_papers_in_fields_df.sort_values(by="no_papers", ascending=False)
df = df.loc[df.no_papers > 1000]
print(df.to_latex(index=False))

\begin{tabular}{lr}
\toprule
                  fieldsOfStudy &  no\_papers \\
\midrule
            (Computer Science,) &      99230 \\
           (Political Science,) &      49987 \\
                 (Engineering,) &      37469 \\
                    (Medicine,) &      29223 \\
                    (Business,) &      29134 \\
                   (Economics,) &      26716 \\
                   (Sociology,) &      15869 \\
                   (Geography,) &      13211 \\
                 (Mathematics,) &       8192 \\
                         (Art,) &       6474 \\
                  (Philosophy,) &       5767 \\
       (Environmental Science,) &       5708 \\
(Engineering, Computer Science) &       4482 \\
                     (History,) &       3941 \\
                  (Psychology,) &       3702 \\
                     (Physics,) &       3406 \\
(Mathematics, Computer Science) &       2633 \\
(Computer Science, Mathematics) &       2599 \\
(Computer Science, Engineering) &       2362 \\
 

Count only 1st field

In [34]:
no_papers_in_fields = {}
for paper_id, paper in all_docs_dict.items():
    if 'fieldsOfStudy' in paper:
        fields = tuple(paper['fieldsOfStudy'])
        if fields is not None and len(fields) > 0:
            field = fields[0]
        if field not in no_papers_in_fields:
            no_papers_in_fields[field] = 1
        else:
            no_papers_in_fields[field] += 1
no_papers_in_fields_df = pd.DataFrame(data={"1st_field":list(no_papers_in_fields.keys()), "no_papers":list(no_papers_in_fields.values())})
df = no_papers_in_fields_df.sort_values(by="no_papers", ascending=False).head(50)
df

In [35]:
print(df.to_latex(index=False))

\begin{tabular}{lr}
\toprule
            1st\_field &  no\_papers \\
\midrule
     Computer Science &     121600 \\
    Political Science &      58140 \\
          Engineering &      48458 \\
             Business &      35955 \\
             Medicine &      35639 \\
            Economics &      31842 \\
            Sociology &      18713 \\
            Geography &      15746 \\
          Mathematics &      12563 \\
                  Art &       7521 \\
Environmental Science &       6974 \\
           Philosophy &       6638 \\
           Psychology &       5115 \\
              Biology &       4944 \\
              History &       4587 \\
              Physics &       4355 \\
            Chemistry &       2484 \\
    Materials Science &       2477 \\
              Geology &       1393 \\
\bottomrule
\end{tabular}



Count fractionary fields

In [32]:
no_papers_in_fields = {}
for paper_id, paper in all_docs_dict.items():
    if 'fieldsOfStudy' in paper:
        fields = tuple(paper['fieldsOfStudy'])
        if fields is not None and len(fields)>0:
            for field in fields:
                if field not in no_papers_in_fields:
                    no_papers_in_fields[field] = 0
                no_papers_in_fields[field] += float(no_papers)/len(fields)
no_papers_in_fields_df = pd.DataFrame(data={"fractionary_field":list(no_papers_in_fields.keys()), "no_papers":list(no_papers_in_fields.values())})
no_papers_in_fields_df.sort_values(by="no_papers", ascending=False).head(50)

Unnamed: 0,fractionary_field,no_papers
5,Computer Science,108946.183333
2,Political Science,51268.366667
6,Engineering,41699.533333
8,Medicine,35181.933333
3,Business,31288.7
0,Economics,27959.683333
1,Sociology,16559.316667
11,Geography,13822.15
10,Mathematics,11196.083333
13,Art,6632.25
