# Data extraction, transformation, and loading into a JSON file
This is part of the project described in <https://github.com/amchagas/OSH_papers_DB>; check the project README for more details.

This notebook loads the data sources and merges them into a single compressed JSON file.

In [195]:
import os
import re
import numpy as np
import pandas as pd
import rispy
import matplotlib.pyplot as plt
from pathlib import Path
from project_definitions import baseDir, dataSourceDir, dataOutDir, figDir, articleDataFile
from project_definitions import store_data, load_data
from pprint import pprint
import html
from urllib.parse import unquote
from jellyfish import damerau_levenshtein_distance as edit_distance

## Sources

In [52]:
# Source descriptions consumed by `load_source`. Each one has the keys:
#   paths       files to read, located under dataSourceDir
#   rispy_args  keyword arguments forwarded to rispy.load
#   col_rename  column renames applied after the files are concatenated
#   transforms  callables applied in order with DataFrame.transform
scieloSource = dict(
    paths=[dataSourceDir / "scielo.ris"],
    rispy_args={},
    col_rename={},
    transforms=[],
)
scopusSource = dict(
    paths=[dataSourceDir / "scopus.ris"],
    rispy_args={},
    col_rename={},
    transforms=[],
)
# The WoS export is split over two files and read with rispy's 'wok' implementation;
# its year/title columns are renamed, presumably to line up with the other sources.
wosSource = dict(
    paths=[dataSourceDir / name for name in ("wos1-500.ciw", "wos501-973.ciw")],
    rispy_args={'implementation': 'wok'},
    col_rename={'publication_year': 'year', 'document_title': 'title'},
    transforms=[],
)

In [53]:
def load_source(dataSource):
    """
    Load every file of a source description into a single DataFrame.

    `dataSource` is a dict with the keys 'paths', 'rispy_args', 'col_rename' and
    'transforms'. Each path is parsed with `rispy.load` (forwarding 'rispy_args')
    and gets a `__source` column whose cells are one-element lists holding the
    file name. The per-file frames are concatenated (outer join on columns, new
    integer index), columns are renamed per 'col_rename', every entry of
    'transforms' is applied with `DataFrame.transform`, and the result is
    returned with its columns sorted by name.
    """
    def read_one(path):
        # Parse a single file and tag each of its records with the file name
        with path.open() as handle:
            frame = pd.DataFrame(rispy.load(handle, **dataSource['rispy_args']))
        frame['__source'] = [[path.name] for _ in range(len(frame))]
        return frame

    merged = pd.concat(
        [read_one(path) for path in dataSource['paths']],
        join='outer',
        ignore_index=True,
    )
    merged = merged.rename(columns=dataSource['col_rename'])
    for transform in dataSource['transforms']:
        merged = merged.transform(transform)
    return merged.sort_index(axis=1)

In [54]:
# Load the records described by scieloSource
scieloData = load_source(scieloSource)

In [55]:
# Load the records described by scopusSource
scopusData = load_source(scopusSource)

In [56]:
# Load the records described by wosSource (two files, concatenated by load_source)
wosData = load_source(wosSource)

In [None]:
# All loaded sources, in the order they will be stacked
allDataList = [scieloData, scopusData, wosData]

In [None]:
# Stack the sources, keeping the union of their columns and renumbering the rows
allData = pd.concat(allDataList, join='outer', ignore_index=True)

In [None]:
# Summary of the combined records
allData.describe()

In [None]:
def merge_series_keep_longest(sx):
    """
    Reduce a Series to a single value.

    Returns NaN when every value is missing. For the series named `__source`
    the values are summed (for lists this concatenates them). Otherwise returns
    the value with the greatest `len`, ignoring missing values.
    """
    if sx.isna().all():
        return np.nan
    if sx.name == '__source':
        return sx.sum()
    lengths = sx.map(len, na_action='ignore')
    longest_label = lengths.idxmax()
    return sx[longest_label]

def merge_records_keep_longest(dfx):
    """Aggregate `dfx` with `merge_series_keep_longest` and return the result."""
    merged = dfx.agg(merge_series_keep_longest)
    return merged

In [None]:
# Keep only article data
# A record is kept when `type_of_reference` equals 'JOUR' or `publication_type` equals 'J'.
article_data = allData.loc[allData["type_of_reference"].eq('JOUR') | allData["publication_type"].eq('J')]
# Merge data with same DOI
# Grouping by the raw DOI values (not by column name) leaves `doi` among the aggregated columns.
# NOTE(review): records without a DOI are presumably dropped by groupby's default NaN handling
# and are added back below — confirm.
article_doi = article_data.groupby(article_data['doi'].values).agg(merge_records_keep_longest)
# Reassemble data with and without DOI
# Rows whose DOI is not among the merged group keys (expected to be the rows lacking a DOI)
article_nodoi = article_data[~article_data.doi.isin(article_doi.index)]
article_data = pd.concat([article_doi, article_nodoi], ignore_index=True)

In [179]:
def clean_titles(sx):
    """
    Normalize a Series of titles for comparison.

    Lower-cases the text, replaces every character that is neither whitespace
    nor a word character with a space, collapses whitespace runs into a single
    space, and strips leading/trailing whitespace.
    """
    lowered = sx.str.lower()
    without_punctuation = lowered.str.replace(r'[^\s\w]', ' ', regex=True)
    single_spaced = without_punctuation.str.replace(r'\s+', ' ', regex=True)
    return single_spaced.str.strip()

In [17]:
class Match:
    """
    Assign the same group index to similar titles, for use as a `groupby` key function.

    Titles are normalized with `clean_titles` first. `match` then returns, for a given
    row label, the index of the first previously seen title whose edit_distance to it
    is <= threshold, or a new index when there is none.
    """

    def __init__(self, df, threshold=0):
        """Keep `df`, its cleaned titles and the threshold; `df['title']` must have no NaNs."""
        self.df = df
        assert not df['title'].hasnans
        self.threshold = threshold
        self.titles = clean_titles(self.df['title'])
        # Maps every cleaned title seen so far to its group index
        self.match_index = {}

    def match(self, x):
        """Return the group index for the row labelled `x`, registering its title if new."""
        title = self.titles.loc[x]
        seen = self.match_index
        if title not in seen:
            group = None
            if self.threshold > 0:
                # First already-registered title that is close enough, if any
                group = next(
                    (idx for known, idx in seen.items()
                     if edit_distance(title, known) <= self.threshold),
                    None,
                )
            # A new group takes the current number of registered titles as its index
            seen[title] = len(seen) if group is None else group
        return seen[title]

In [None]:
# Group records whose cleaned titles are within an edit distance of 5 of an earlier title
articles_g = article_data.groupby(Match(article_data, 5).match)

In [None]:
# Groups with two or more records, each column holding the list of the group's values
aa = articles_g.agg(list)[articles_g.size()>=2]

In [None]:
# Test alternative matchers
# articles_gx = article_data.groupby(Match(article_data, 15).match)
# bb = articles_gx.agg(list)[articles_gx.size()>=2]
# set(clean_titles(aa.explode('title')['title'])).difference(clean_titles(bb.explode('title')['title']))
# set(clean_titles(bb.explode('title')['title'])).difference(clean_titles(aa.explode('title')['title']))

In [None]:
# Check that matching titles also have matching year and author (impl: first author last name)
assert aa['year'].map(lambda x: len(set(x)) < 2).all()
# For each group: collect the distinct tuples of author last names, then sum the edit
# distances between every pair of distinct first-author last names. The cell displays
# whether the largest such sum is below 2.
aa['authors'].map(
    lambda records: set(
        tuple(z.split(',')[0].split(' ')[-1] for z in y) # last name of each author
        for y in records
        # skip NANs; builtin `float` replaces the `np.float` alias removed in NumPy 1.24
        if not ( isinstance(y, float) and pd.isna(y) )
    )
).map(
    lambda names: sum(
        edit_distance(y, z) # sum the edit distances
        for firsts in list(zip(*names))[:1] # first authors
        for i, y in enumerate(firsts) for z in firsts[i+1:] # distinct pairs
    )
).max() < 2

In [90]:
# Summary of the identifying columns before title-matched records are merged
article_data[['doi', 'title', 'authors']].describe()

Unnamed: 0,doi,title,authors
count,623,706,702
unique,623,706,680
top,10.1016/j.ohx.2020.e00127,Research on Monitoring Platform of Agricultura...,"[Pearce, J.M.]"
freq,1,1,10


In [None]:
# Merge every group of title-matched records into one record and show the result
article_data = articles_g.agg(merge_records_keep_longest)
article_data

In [None]:
# Store deduplicated data and check the stored version reproduces the data
# (store_data / load_data are imported from project_definitions)
store_data(article_data, articleDataFile)
assert article_data.equals(load_data(articleDataFile))

# Load article data (instead of running the code above)

In [99]:
# Regex corrections in the {column: {pattern: replacement}} form accepted by
# DataFrame.replace(..., regex=True)
data_corrections = {
    'doi': {
        # Drop a trailing "/pdf" from the DOI
        r'^(.*)/pdf$': r'\1',
        # Disabled pattern, kept for reference:
#        r'^(.*)/\w+/$': r'\1',
    }
}

In [100]:
# Read back the deduplicated article data written by store_data
article_data = load_data(articleDataFile)

In [101]:
# Apply the corrections to a new frame and display the cells that changed
rep_article_data = article_data.replace(data_corrections, regex=True)
article_data.compare(rep_article_data)

Unnamed: 0_level_0,doi,doi
Unnamed: 0_level_1,self,other
245,10.1088/2058-7058/31/8/34/pdf,10.1088/2058-7058/31/8/34


In [102]:
# Continue with the corrected data
article_data = rep_article_data

## PLOS Collection sources

In [169]:
# Collection items, read from the `main` branch of the open-source-toolkit repository
# (not pinned to a commit, so the content may change between runs)
plosData = pd.read_csv('https://raw.githubusercontent.com/amchagas/open-source-toolkit/main/plos-items.csv')

In [170]:
# Keep only the collection items that are research articles about hardware
sel_article = plosData[
    "Content Type (URL items only - Research Article, Web Article, Commentary, Video, Poster)"
].eq("Research Article")
sel_hardware = plosData["Hardware or software"].eq("hardware")
# Take an explicit copy: later cells assign columns on `plosData`, which on a
# filtered selection can raise pandas' SettingWithCopyWarning.
plosData = plosData.loc[sel_article & sel_hardware].copy()

### DOIs

In [None]:
# Every selected item must have a URI to extract a DOI from
assert plosData["URI (DOI or URL)"].notna().all()

In [171]:
# Get the doi and doi-like, fixing doi-like containing extra stuff
# re_doi captures "10." followed by at least four digits (the first one non-zero),
# optional ".digits" parts, a slash, and everything after it.
re_doi = r"(10\.[1-9]\d{3,}(?:\.\d+)*/.+)"
# re_http_doi_fix matches an http(s) URL that embeds such a DOI and ends with "/",
# "/full", "/abstract" or "#fragment"; the captured group excludes that ending.
re_http_doi_fix = r"https?://.*/" + re_doi + r"(?:/|/full|/abstract|#\w+)$"

In [172]:
# DOI-like part of each URI, taken as-is (may still carry URL leftovers such as "/full")
plosData_doi = plosData['URI (DOI or URL)'].str.extract(re_doi)[0]

In [173]:
# DOIs taken from http(s) URLs with the trailing "/", "/full", "/abstract" or "#fragment"
# removed, then URL-decoded. Uses `re_http_doi_fix`, the pattern defined in this notebook
# (`re_httpdoi` is not defined anywhere and raised NameError on a fresh kernel).
plosData_doi_http_doi_fixed = (
    plosData['URI (DOI or URL)']
    .str.extract(re_http_doi_fix)[0]
    .map(unquote, na_action='ignore')
)

In [174]:
# Show, for the rows where the URL fix applied, how the fixed DOI differs from the plain extraction
plosData_doi.loc[plosData_doi_http_doi_fixed.notna()].compare(plosData_doi_http_doi_fixed.dropna())

Unnamed: 0,self,other
35,10.5334/joh.7/,10.5334/joh.7
36,10.5334/joh.4/,10.5334/joh.4
96,10.3389/fnbeh.2019.00140/full,10.3389/fnbeh.2019.00140
98,10.3389/fncir.2012.00098/full,10.3389/fncir.2012.00098
99,10.3389/fneng.2014.00043/full,10.3389/fneng.2014.00043
103,10.3389/fnins.2019.00784/full,10.3389/fnins.2019.00784
126,10.1088/1741-2552/aa6806#jneaa6806f01,10.1088/1741-2552/aa6806
128,10.5334/joh.14/,10.5334/joh.14
134,10.3389/fphys.2019.00099/abstract,10.3389/fphys.2019.00099


In [175]:
# Guard against overwriting an existing column (this also makes the cell fail if re-run)
assert 'doi' not in plosData
# Prefer the URL-fixed DOI; fall back to the plain extraction where no fix applied
plosData['doi'] = plosData_doi_http_doi_fixed.where(plosData_doi_http_doi_fixed.notna(), plosData_doi)

In [176]:
# Collection items for which a DOI was found
plosData['doi'].dropna()

4              10.1002/elps.201800304
35                      10.5334/joh.7
36                      10.5334/joh.4
65       10.1021/acs.analchem.9b02628
66                  10.1063/1.4941068
                    ...              
251      10.1371/journal.pone.0011890
317      10.1371/journal.pone.0214460
319      10.1371/journal.pone.0192752
330    10.1016/j.techfore.2020.119986
331                 10.1111/tra.12728
Name: doi, Length: 126, dtype: object

In [193]:
# Number of DOIs present in both the collection and article_data, and in only one of them
plos_dois = set(plosData['doi'].dropna())
n_shared = len(plos_dois.intersection(article_data['doi']))
n_exclusive = len(plos_dois.symmetric_difference(article_data['doi']))
print(n_shared, n_exclusive)

19 712


### Titles

In [224]:
# Remove leading/trailing whitespace from the collection titles
plosData['Title (URL items only)'] = plosData['Title (URL items only)'].str.strip()

In [197]:
# How many from the collection have their title in article_data
# The article titles are cleaned once here instead of once per collection title.
article_titles_clean = article_data.title.pipe(clean_titles)
plosData['Title (URL items only)'].pipe(clean_titles).map(
    lambda x: article_titles_clean.str.contains(rf'(?i){x}', regex=True).any()
).sum()

36

In [198]:
# How many from the collection have their title in article_data if we require they have DOIs
# The titles of the articles having a DOI are cleaned once, outside the per-title lambda.
article_titles_with_doi_clean = (
    article_data.loc[article_data['doi'].notna()].title.pipe(clean_titles)
)
plosData['Title (URL items only)'].loc[plosData['doi'].notna()].pipe(clean_titles).map(
    lambda x: article_titles_with_doi_clean.str.contains(rf'(?i){x}', regex=True).any()
).sum()

20

In [210]:
# Give me 10 from the collection having DOIs
# Fixed seed so the sample, and the cell below that depends on it, is the same on every run
z = plosData['doi'].dropna().sample(10, random_state=0)
print(z)

219    10.1371/journal.pone.0168207
117       10.1016/j.ohx.2017.07.001
203    10.1371/journal.pone.0181560
231    10.1371/journal.pone.0134989
190    10.1371/journal.pone.0201353
65     10.1021/acs.analchem.9b02628
181    10.1371/journal.pone.0220091
232    10.1371/journal.pone.0124938
182    10.1371/journal.pone.0228140
210    10.1371/journal.pone.0178540
Name: doi, dtype: object


In [212]:
# Get their titles if their titles are not in article_data
# Clean the article titles once, not once per sampled title
article_titles_clean = clean_titles(article_data['title'])
sampled_titles_clean = clean_titles(plosData.loc[z.index, 'Title (URL items only)'])
for i, title in sampled_titles_clean.items():
    if not article_titles_clean.str.contains(rf'(?i){title}', regex=True).any():
        print(i, title)

219 chaos based simultaneous compression and encryption for hadoop
203 feasibility of a 3d printed anthropomorphic patient specific head phantom for patient specific quality assurance of intensity modulated radiotherapy
65 odx a fitness tracker based device for continuous bacterial growth monitoring
181 a low cost fluorescence reader for in vitro transcription and nucleic acid detection with cas13a
232 multi contrast imaging and digital refocusing on a mobile microscope with a domed led array
182 fieldwork based determination of design priorities for point of use drinking water quality sensors for use in resource limited environments
210 from medical imaging data to 3d printed anatomical models


In [246]:
# Selector for DOIs only in the collection
# Because of dropna(), the selector is indexed only by the rows that have a DOI.
sel_new_doi = ~plosData["doi"].dropna().isin(article_data.doi.values)
sel_new_doi.sum()

107

In [263]:
# Selector for Titles only in the collection
# Titles are compared after normalizing both sides with clean_titles (exact match).
sel_new_title = ~clean_titles(plosData["Title (URL items only)"]).isin(clean_titles(article_data['title']))
sel_new_title.sum()

136

In [268]:
# Same title, different DOIs
x = plosData[["doi", "Title (URL items only)"]].loc[sel_new_doi & ~sel_new_title]
# `sel_new_title` compares titles normalized with clean_titles, so the lookup of the
# matching article must use the same normalization; comparing only lower-cased titles
# finds nothing when the titles differ in punctuation or spacing.
article_titles_clean = clean_titles(article_data['title'])
x_titles_clean = clean_titles(x["Title (URL items only)"])
for i, y in x["Title (URL items only)"].str.lower().items():
    print(
        y,
        article_data["doi"].loc[
            article_titles_clean.eq(x_titles_clean.loc[i])
        ].squeeze(),
        plosData.loc[i, 'doi']
    )
    

bottom-illuminated orbital shaker for microalgae cultivation 10.1016/j.ohx.2020.e00143 10.1101/2020.05.01.071878


In [269]:
# Same DOI, different titles
# NOTE(review): `sel_new_doi` is indexed only by rows that have a DOI, so `&` has to align
# it with `sel_new_title`; confirm how the installed pandas treats the unmatched rows.
x = plosData.loc[~sel_new_doi & sel_new_title, 'doi']
for y in x:
    # Print the collection title next to the article_data title for the same DOI
    print(
        plosData.loc[plosData.doi.eq(y), "Title (URL items only)"].squeeze(),
        article_data.loc[article_data.doi.eq(y), 'title'].squeeze(),
    )

# All done, now just mess around

In [None]:
# `nad` is not defined anywhere in this notebook, so this cell raised NameError on a
# fresh kernel. Use the deduplicated, corrected article table instead.
# NOTE(review): confirm `article_data` is the dataset `nad` used to hold.
data = article_data

In [None]:
# Dimensions and column names of `data`
print(data.shape)
print(data.columns)

In [None]:
# Dimensions of the article table
print(article_data.shape)

In [None]:
# keep=False marks every row that shares its title (or DOI) with another row
dup_title = article_data.duplicated('title', keep=False)
dup_doi = article_data.duplicated('doi', keep=False)
nan_doi = article_data['doi'].isna()
# Counts, in order: duplicated titles, duplicated DOIs, missing DOIs,
# rows duplicated on both, rows duplicated on title only
print(
    dup_title.sum(),
    dup_doi.sum(),
    nan_doi.sum(),
    (dup_title & dup_doi).sum(),
    (dup_title & ~dup_doi).sum(),
)

In [None]:
# Records per ISSN, after stripping every non-digit character from the ISSN.
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
article_data.issn.str.replace(r'[^\d]', '', regex=True).value_counts()

In [None]:
# Log-log plot of the per-ISSN record counts, in descending order of count.
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
article_data.issn.str.replace(r'[^\d]', '', regex=True).value_counts().reset_index().plot(loglog=True)

In [None]:
# Bar chart of the number of records per year
article_data.groupby('year').size().plot.bar()