# Imports

In [4]:
import gzip
import pickle
import joblib

from random import choice
import scipy.stats
import sys
import gi
from gi.repository import Gtk, Gdk
import graph_tool.all as gt
import pandas as pd
import numpy as np
import os
import time
from tqdm.notebook import tqdm

from sklearn.feature_extraction import text
from nltk.stem import  WordNetLemmatizer
import re

from datetime import datetime

import random
import seaborn as sns

from tqdm.notebook import tqdm

import jsonlines
import matplotlib.pyplot as plt

import platform

  from gi.repository import Gtk, Gdk


In [10]:
# Notebook-wide matplotlib defaults for every figure.
# NOTE: these values are provisional and still have to be tuned.

plt.style.use("default")

# Default figure size, passed to 'figure.figsize' below as (width, height).
width_fig = 10
height_fig = 5

# Build the rcParams overrides group by group.
params_default = {}

# Axes: drop the right and top spines.
params_default.update({
    'axes.spines.right': False,
    'axes.spines.top': False,
})

# Legend: no surrounding frame.
params_default.update({"legend.frameon": False})

# Figure dimensions; to be revisited once the figures are rendered in the LaTeX document.
params_default.update({'figure.figsize': (width_fig, height_fig)})

# Font sizes for axis labels, titles, tick labels and legend entries.
params_default.update({
    'axes.labelsize': 22,
    'axes.titlesize': 25,
    'xtick.labelsize': 18,
    'ytick.labelsize': 18,
    'legend.fontsize': 16,
})

# No grid (still an open choice).
params_default.update({'axes.grid': False})

# A custom default color cycle for plot lines is currently disabled:
#     'axes.prop_cycle' : mpl.cycler(color=["#00008B", "#BF0000", "#006400"]),

# Saved figures: moderate resolution, tight bounding box.
params_default.update({
    "savefig.dpi": 300,
    "savefig.bbox": 'tight',
})

plt.rcParams.update(params_default)

# Collecting data from Corpus

Download the corpus from:
https://api.semanticscholar.org/corpus/download/

Corpus release used: 2021-09-01

Commands used:

wget https://s3-us-west-2.amazonaws.com/ai2-s2-research-public/open-corpus/2021-09-01/manifest.txt

wget -B https://s3-us-west-2.amazonaws.com/ai2-s2-research-public/open-corpus/2021-09-01/ -i manifest.txt

In [5]:
# Move to the repo root folder (the parent of the current working directory).
# NOTE(review): the path is relative, so re-running this cell moves one more
# level up each time; run it exactly once per kernel session.
os.chdir("../")

Working on pfsnsq860259


In [22]:
# Keyword -> substrings that are searched for in each paper's lowercased
# title and abstract when the corpus is scanned.
words_to_check_dict = dict(
    decentralization=["centraliz", "centralis"],
    internet=[
        "world-wide-web",
        "world wide web",
        "worldwideweb",
        "worldwide web",
        "worldwide-web",
        "internet",
    ],
    virus=["virus"],
)

In [7]:
# Corpus release to read; used to build the ./corpus/<corpus_version>/ path
# of the corpus files when they are scanned.
corpus_version = "2021-09-01"

[0m[01;34m2021-09-01[0m/  [01;34mdecentralization[0m/  [01;34minternet[0m/  [01;34mvirus[0m/


In [None]:
# Scan all 6000 corpus files once and, for each keyword in words_to_check_dict:
#   * all_docs_dict[keyword]: papers (by id) whose lowercased title or abstract
#     contains at least one of the keyword's substrings;
#   * no_papers_in_fields_by_year[keyword][year][fields]: number of papers;
#   * sets_authors_in_fields_by_year[keyword][year][fields]: set of author ids;
#   * sets_authors_by_year[keyword][year]: set of author ids.
# The author sets are then reduced to counts and everything except
# sets_authors_by_year is pickled under data/<keyword>/.
all_docs_dict = {key:{} for key in words_to_check_dict}
no_papers_in_fields_by_year = {key:{} for key in words_to_check_dict}
sets_authors_in_fields_by_year = {key:{} for key in words_to_check_dict}
sets_authors_by_year = {key:{} for key in words_to_check_dict}
no_authors_in_fields_by_year = {key:{} for key in words_to_check_dict}
no_authors_by_year = {key:{} for key in words_to_check_dict}
count = 0  # total number of corpus records read
start = datetime.now()

for ID in tqdm(range(6000)):
    filename = f"./corpus/{corpus_version}/s2-corpus-{ID:03d}.gz"
    with gzip.open(filename, "rb") as f:
        for x in jsonlines.Reader(f):
            count += 1
            # Treat a missing (None) title/abstract/fields value as empty
            # instead of crashing in the middle of the scan.
            title = (x["title"] or "").lower()
            abstract = (x["paperAbstract"] or "").lower()
            # These do not depend on the keyword, so compute them once per paper.
            year = x["year"]
            fields = tuple(x["fieldsOfStudy"] or ())

            for keyword, words_to_check in words_to_check_dict.items():
                if any(word in title or word in abstract for word in words_to_check):
                    all_docs_dict[keyword][x["id"]] = x.copy()

                # The statistics below are updated for EVERY paper, not only
                # for the ones matching the keyword, so they end up identical
                # for all keywords.
                # NOTE(review): presumably meant as a corpus-wide baseline;
                # confirm this should not be restricted to matching papers.
                papers_per_fields = no_papers_in_fields_by_year[keyword].setdefault(year, {})
                authors_per_fields = sets_authors_in_fields_by_year[keyword].setdefault(year, {})
                authors_of_year = sets_authors_by_year[keyword].setdefault(year, set())

                papers_per_fields[fields] = papers_per_fields.get(fields, 0) + 1
                authors_of_fields = authors_per_fields.setdefault(fields, set())
                for authors in x["authors"]:
                    authors_of_fields.update(authors["ids"])
                    authors_of_year.update(authors["ids"])
    end = datetime.now()

    if ID%100 == 0:
        # len(all_docs_dict) is just the (constant) number of keywords;
        # report the number of matched papers collected so far instead.
        no_matched = sum(len(docs) for docs in all_docs_dict.values())
        print(ID, no_matched, end-start, flush=True)

for keyword in words_to_check_dict:
    # Write with explicit paths rather than os.chdir, so that a failure while
    # dumping cannot leave the notebook in the wrong working directory.
    out_dir = os.path.join("data", keyword)
    os.makedirs(out_dir, exist_ok=True)

    # Reduce the author-id sets to their sizes.
    for year, year_dict in sets_authors_in_fields_by_year[keyword].items():
        no_authors_in_fields_by_year[keyword][year] = {
            fields: len(set_authors) for fields, set_authors in year_dict.items()
        }
    for year, set_authors in sets_authors_by_year[keyword].items():
        no_authors_by_year[keyword][year] = len(set_authors)

    to_dump = {
        "papers_dict.pkl.gz": all_docs_dict[keyword],
        "no_papers_in_fields_by_year.pkl.gz": no_papers_in_fields_by_year[keyword],
        "sets_authors_in_fields_by_year.pkl.gz": sets_authors_in_fields_by_year[keyword],
        "no_authors_in_fields_by_year.pkl.gz": no_authors_in_fields_by_year[keyword],
        "no_authors_by_year.pkl.gz": no_authors_by_year[keyword],
    }
    for out_name, payload in to_dump.items():
        with gzip.open(os.path.join(out_dir, out_name), "wb") as fp:
            pickle.dump(payload, fp)

In [None]:
# Report how many papers were collected for each keyword.
for keyword in all_docs_dict:
    papers = all_docs_dict[keyword]
    print(f"keyword: {keyword} - no.papers: {len(papers)}")

# Checks

In [None]:
# Select the keyword whose collected papers are inspected by the checks below.
chosen_keyword = 'decentralization'
# Papers collected for that keyword, keyed by paper id.
chosen_dict = all_docs_dict[chosen_keyword]

In [32]:
# Sanity check: count the collected papers that contain none of the full
# words below in their title or abstract, and store their ids in `bad`.
full_words = ("centralized", "centralised", "centralization", "centralisation")

bad = []
count = 0      # papers without any of the full words
count_all = 0  # all papers inspected
for key, paper in chosen_dict.items():
    # The stored records keep their original casing, whereas the papers were
    # collected by matching on lowercased text; compare on lowercased text
    # here too, otherwise e.g. "Centralized" would be wrongly reported as bad.
    # A missing (None) title/abstract is treated as empty text.
    title = (paper["title"] or "").lower()
    abstract = (paper["paperAbstract"] or "").lower()
    if not any(word in title or word in abstract for word in full_words):
        count += 1
        bad.append(paper["id"])
    count_all += 1
print(count, count_all)

100 715


In [74]:
# Display the full record of the first paper listed in `bad`, for manual inspection.
chosen_dict[bad[0]]

{'id': 'f842293e6e6ce4f328e7b89b169a0027a2dd1f52',
 'title': 'Modèles de développement en Guadeloupe et intégration européenne',
 'paperAbstract': 'Le but de cette these est d\'etudier la question du developpement et du phenomene de la dependance d\'une societe peripherique particuliere, la guadeloupe, dans le contexte de l\'integration europeenne. La premiere partie presente cette methodologie de la recherche: c\'est en fait la preparation qualitative pour un passage du "general" au "particulier" dans un souci de depassament des prenotions. La deuxieme partie traite de la mise en place du systeme colonial et de la legitimation des rapports esclavagistes. La troisieme partie presente la transformation du statut de "colonie" au "departement" de la guadeloupe. Dans la quatrieme partie, ilest question du modele decentralisateur avec la loi de 1982. Et enfin, la cinquieme partie fait reference aux propositions explicatives a dimension theorique.',
 'authors': [{'name': 'Rosan  Rauzduel', '

## Counting how many papers have each attribute

In [None]:
# Ids of the papers whose "doi" is None or empty; every other paper counts as having a doi.
no_doi = [
    paper_id
    for paper_id, record in chosen_dict.items()
    if record["doi"] is None or len(record["doi"]) == 0
]
count_doi = len(chosen_dict) - len(no_doi)
print(f"{count_doi} have doi of total {len(chosen_dict)}")

In [81]:
# Ids of the papers whose "year" is None; every other paper counts as having a year.
no_year = [
    paper_id
    for paper_id, record in chosen_dict.items()
    if record["year"] is None
]
count_year = len(chosen_dict) - len(no_year)
print(f"{count_year} have year of total {len(chosen_dict)}")

392786 have year of total 394126


In [79]:
# Split the papers by whether "fieldsOfStudy" is present and non-empty:
# ids of papers without fields go to `no_fields`, the others are counted.
no_fields = []
count_fields = 0
for paper, paper_dict in chosen_dict.items():
    if paper_dict["fieldsOfStudy"] is None or len(paper_dict["fieldsOfStudy"]) == 0:
        # Collect into no_fields (not no_year, which belongs to the year check).
        no_fields.append(paper)
    else:
        count_fields += 1
print(f"{count_fields} have fields of total {len(chosen_dict)}")

357081 have fields of total 394126


In [84]:
# A paper is "good" when it has a doi, a year, fields, an abstract, a title
# and at least one incoming or outgoing citation; ids of all other papers
# are collected in `no_good`.
no_good = []
count_good = 0
# Attributes that must be neither None nor empty (checked after doi and year).
sized_attributes = ("fieldsOfStudy", "paperAbstract", "title")
for paper, paper_dict in chosen_dict.items():
    doi = paper_dict["doi"]
    # Checks run lazily and in this order: doi, year, sized attributes, citations.
    is_incomplete = (
        doi is None
        or len(doi) == 0
        or paper_dict["year"] is None
        or any(
            paper_dict[name] is None or len(paper_dict[name]) == 0
            for name in sized_attributes
        )
        or not (len(paper_dict["inCitations"]) > 0 or len(paper_dict["outCitations"]) > 0)
    )
    if is_incomplete:
        no_good.append(paper)
    else:
        count_good += 1
print(f"{count_good} are good of total {len(chosen_dict)} (have doi, year, fields, abstract, title, and at least one of reference or citation)")

191065 are good of total 394126 (have doi, year, fields, abstract, title, and at least one of reference or citation)
