# Data extraction, transformation, and loading into a JSON file
This is part of the project described in <https://github.com/amchagas/OSH_papers_DB>; see the project README for more details.

This notebook loads the data sources and merges them into a single compressed JSON file.

In [51]:
import os
import re
import numpy as np
import pandas as pd
import rispy
import matplotlib.pyplot as plt
from pathlib import Path
from project_definitions import baseDir, dataSourceDir, dataOutDir, figDir, articleDataFile
from project_definitions import store_data, load_data
from pprint import pprint
import html
from jellyfish import damerau_levenshtein_distance as edit_distance

## Sources

In [52]:
# Each source descriptor is consumed by `load_source` and has four keys:
#   paths      - export files to read
#   rispy_args - extra keyword arguments forwarded to `rispy.load`
#   col_rename - column renames applied to the concatenated frame
#   transforms - functions applied, in order, via `DataFrame.transform`
scieloSource = dict(
    paths=[dataSourceDir / "scielo.ris"],
    rispy_args={},
    col_rename={},
    transforms=[],
)
scopusSource = dict(
    paths=[dataSourceDir / "scopus.ris"],
    rispy_args={},
    col_rename={},
    transforms=[],
)
# The two .ciw files are read with rispy's 'wok' implementation, and their
# year/title columns are renamed to the names used by the other sources.
wosSource = dict(
    paths=[dataSourceDir / name for name in ("wos1-500.ciw", "wos501-973.ciw")],
    rispy_args=dict(implementation='wok'),
    col_rename=dict(publication_year='year', document_title='title'),
    transforms=[],
)

In [53]:
def load_source(dataSource):
    """
    Load every file of a source descriptor into one DataFrame.

    `dataSource` is a dict with the keys 'paths', 'rispy_args', 'col_rename' and 'transforms'.
    Each path is parsed with `rispy.load` (forwarding 'rispy_args'), and each resulting record
    gets a `__source` value holding a one-element list with the file name. The per-file frames
    are concatenated (outer join on columns, fresh integer index), renamed with 'col_rename',
    passed through each function in 'transforms', and returned with columns sorted by name.
    """
    def read_one(path):
        # Parse a single export file and tag its records with the originating file name
        with path.open() as handle:
            records = pd.DataFrame(rispy.load(handle, **dataSource['rispy_args']))
        records['__source'] = [[path.name] for _ in records.index]
        return records

    merged = pd.concat(
        [read_one(path) for path in dataSource['paths']],
        join='outer',
        ignore_index=True,
    ).rename(columns=dataSource['col_rename'])
    for transform in dataSource['transforms']:
        merged = merged.transform(transform)
    return merged.sort_index(axis=1)

In [54]:
# Load the SciELO export described by `scieloSource` into a DataFrame
scieloData = load_source(scieloSource)

In [55]:
# Load the Scopus export described by `scopusSource` into a DataFrame
scopusData = load_source(scopusSource)

In [56]:
# Load both Web of Science export files described by `wosSource` into one DataFrame
wosData = load_source(wosSource)

In [None]:
# All loaded sources, in the order they will be concatenated
allDataList = [scieloData, scopusData, wosData]

In [None]:
# Stack all sources into one frame; columns missing from a source are filled with NaN (outer join)
allData = pd.concat(allDataList, join='outer', ignore_index=True)

In [None]:
# Summary statistics of the combined records
allData.describe()

In [None]:
def merge_series_keep_longest(sx):
    """
    Reduce a Series of candidate values for one field to a single value.

    - If every entry is missing, the result is NaN.
    - For the Series named '__source', the result is the sum of the entries
      (list concatenation when the entries are lists).
    - Otherwise the result is the entry with the greatest `len`, ignoring
      missing entries; on ties the first such entry is returned.
    """
    if not sx.notna().any():
        return np.nan
    if sx.name != '__source':
        lengths = sx.map(len, na_action='ignore')
        return sx[lengths.idxmax()]
    return sx.sum()

def merge_records_keep_longest(dfx):
    """Aggregate `dfx` with `merge_series_keep_longest` and return the result."""
    merged = dfx.agg(merge_series_keep_longest)
    return merged

In [None]:
# Keep only article data
# (records tagged 'JOUR' in `type_of_reference` or 'J' in `publication_type`)
article_data = allData.loc[allData["type_of_reference"].eq('JOUR') | allData["publication_type"].eq('J')]
# Merge data with same DOI
# (groups are keyed on the raw `doi` values and collapsed with merge_records_keep_longest)
article_doi = article_data.groupby(article_data['doi'].values).agg(merge_records_keep_longest)
# Reassemble data with and without DOI
# (`article_nodoi` holds the rows whose `doi` is not one of the group keys above)
article_nodoi = article_data[~article_data.doi.isin(article_doi.index)]
article_data = pd.concat([article_doi, article_nodoi], ignore_index=True)

In [15]:
def clean_titles(sx):
    """
    Normalize a Series of titles for comparison.

    Lower-cases each title, replaces every character that is neither whitespace
    nor a word character with a space, collapses whitespace runs into a single
    space, and strips leading/trailing whitespace.
    """
    lowered = sx.str.lower()
    without_punctuation = lowered.str.replace(r'[^\s\w]', ' ', regex=True)
    single_spaced = without_punctuation.str.replace(r'\s+', ' ', regex=True)
    return single_spaced.str.strip()

In [17]:
class Match:
    """
    Assign group indices to rows so that rows with similar titles share an index (for `groupby`).

    Titles are normalized with `clean_titles` once, at construction. `match` then maps a row
    label to the index already assigned to its normalized title or, when `threshold > 0`, to the
    index of the first previously indexed title whose edit_distance is <= threshold. If neither
    exists, a new index equal to the current number of indexed titles is assigned.
    """
    def __init__(self, df, threshold=0):
        self.df = df
        # Titles are required for every row: a missing title cannot be normalized or compared
        assert not df['title'].hasnans
        self.titles = clean_titles(self.df['title'])
        self.threshold = threshold
        # normalized title -> group index
        self.match_index = {}

    def match(self, x):
        """Return the group index for the row labelled `x`, registering its title if new."""
        title = self.titles.loc[x]
        known = self.match_index
        if title not in known:
            similar = None
            if self.threshold > 0:
                # First previously seen title within the allowed edit distance, if any
                similar = next(
                    (
                        idx
                        for seen, idx in known.items()
                        if edit_distance(title, seen) <= self.threshold
                    ),
                    None,
                )
            known[title] = len(known) if similar is None else similar
        return known[title]

In [None]:
# Group records by similar title: a row joins an existing group when its cleaned title is
# within an edit distance of 5 of a previously indexed title (see `Match`)
articles_g = article_data.groupby(Match(article_data, 5).match)

In [None]:
# Groups containing at least two records, with every column collected into a list per group
aa = articles_g.agg(list)[articles_g.size()>=2]

In [None]:
# Test alternative matchers (kept commented out; compares the groups found with threshold 15 against `aa`)
# articles_gx = article_data.groupby(Match(article_data, 15).match)
# bb = articles_gx.agg(list)[articles_gx.size()>=2]
# set(clean_titles(aa.explode('title')['title'])).difference(clean_titles(bb.explode('title')['title']))
# set(clean_titles(bb.explode('title')['title'])).difference(clean_titles(aa.explode('title')['title']))

In [None]:
# Check that matching titles also have matching year and author (impl: first author last name)
# Every group of title-matched records must agree on a single year.
assert aa['year'].map(lambda x: len(set(x)) < 2).all()
# The expression below is displayed (not asserted): it is True when, in every group, the summed
# pairwise edit distance between the first authors' last names is below 2.
aa['authors'].map(
    lambda x: set(
        tuple(z.split(',')[0].split(' ')[-1] for z in y) # last name of each author
        for y in x
        # builtin `float` replaces `np.float`, an alias of it that NumPy has since removed
        if not ( isinstance(y, float) and pd.isna(y) ) # skip NANs
    )
).map(
    lambda x: sum(
        edit_distance(y, z) # sum the edit distances
        for firsts in list(zip(*x))[:1] # first authors
        for i, y in enumerate(firsts) for z in firsts[i+1:] # distinct pairs
    )
).max() < 2

In [90]:
# Summary of the identifying fields (count / unique / most frequent value)
article_data[['doi', 'title', 'authors']].describe()

Unnamed: 0,doi,title,authors
count,623,706,702
unique,623,706,680
top,10.1016/j.ohx.2020.e00127,Research on Monitoring Platform of Agricultura...,"[Pearce, J.M.]"
freq,1,1,10


In [None]:
# Collapse each group of title-matched records into one record using merge_records_keep_longest
article_data = articles_g.agg(merge_records_keep_longest)
article_data

In [None]:
# Store deduplicated data and check the stored version reproduces the data
# (the frame read back with load_data must equal the in-memory frame)
store_data(article_data, articleDataFile)
assert article_data.equals(load_data(articleDataFile))

# Load article data (instead of running the code above)

In [3]:
# Read the previously stored article data back from `articleDataFile`
article_data = load_data(articleDataFile)

## CSV sources

In [98]:
# Read the `plos-items.csv` item list straight from the open-source-toolkit GitHub repository
# (this cell needs network access)
plosData = pd.read_csv('https://raw.githubusercontent.com/amchagas/open-source-toolkit/main/plos-items.csv')

In [99]:
# Keep only the rows that are research articles about hardware
sel_article = plosData["Content Type (URL items only - Research Article, Web Article, Commentary, Video, Poster)"].eq("Research Article")
sel_hardware = plosData["Hardware or software"].eq("hardware")
# .copy() detaches the selection from the original frame, so the column assignment below
# modifies an independent DataFrame instead of triggering SettingWithCopyWarning
plosData = plosData.loc[sel_article & sel_hardware].copy()
# Every remaining item must have a URI
assert not plosData["URI (DOI or URL)"].isna().any()
# Remove leading/trailing whitespace from the titles
plosData['Title (URL items only)'] = plosData['Title (URL items only)'].str.strip()
plosData

Unnamed: 0,URI (DOI or URL),Hardware or software,Title (URL items only),Authors (URL items only),"Content Type (URL items only - Research Article, Web Article, Commentary, Video, Poster)",Date Published (URL items only),Source (URL items only),Summary,"Featured Rank (1 = Editor's Pick, 2-6 = Featured Research, 7-12 = Related Content)","Paywall (x = paywall, otherwise leave blank)","Featured Preprint (x = Featured Preprint, otherwise leave blank)",Remove (x = delete the item)
4,https://pubs.acs.org/doi/pdf/10.1021/acs.analc...,hardware,ODX: A Fitness Tracker-Based Device for Contin...,Venkata V. B. YallapragadaUday GowdaDavid Wong...,Research Article,2019-10-17,ACS Publications,,,,,
6,http://scitation.aip.org/content/aip/journal/r...,hardware,A one-piece 3D printed flexure translation sta...,"James P. Sharkey, Darryl C. W. Foo, Alexandre ...",Research Article,2016-02-08,AIP Scientific Instruments,,,,,
8,http://www.appropedia.org/Free_and_open-source...,hardware,Free and open-source automated 3-D microscope,"Wijnen, B., Petersen, E. E., Hunt, E. J. and P...",Research Article,2017-08-01,Appropedia,"This paper presents a low-cost, open-source mi...",,,,
9,http://www.appropedia.org/Open-source_mobile_w...,hardware,Open-source mobile water quality testing platform,"Bas Wijnen, G. C. Anzalone and Joshua M. Pearce",Research Article,2017-08-01,Appropedia,This project details an open source water test...,,,,
10,http://www.appropedia.org/Open-source_Wax_RepR...,hardware,Open-source Wax RepRap 3-D Printer for Rapid P...,"J. M. Pearce, N. C. Anzalone, and C. L. Heldt.",Research Article,2017-08-01,Appropedia,This study reports on the development of a Rep...,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
434,http://www.mdpi.com/1424-8220/13/4/5338,hardware,Open-Source Colorimeter,"Gerald C. Anzalone, Alexandra G. Glover and Jo...",Research Article,2013-04-19,Sensors,This paper provides a methodology for applying...,,,,
435,https://www.sciencedirect.com/science/article/...,hardware,Glucose biosensor based on open-source wireles...,"Conan Mercer, Richard Bennett, Peter Ó Conghai...",Research Article,2019-03-28,Sensors and Actuators B: Chemical,Wireless potentiostats capable of cyclic volta...,,x,,
436,http://biomedicaloptics.spiedigitallibrary.org...,hardware,Fabricating optical lenses by inkjet printing ...,"Yu-Lung Sung, Jenn Jeang, Chia-Hsiung Lee, Wei...",Research Article,2015-03-30,SPIE | Journal of Biomedical Optics,,,,,
440,http://sro.sussex.ac.uk/66499/,hardware,Microsco-pi: a novel and inexpensive way of me...,"Jonathan P Bacon, Harry R Kent",Research Article,2017-03-06,Sussex Research Online,,,,,


In [103]:
# How many are doi-like
# Raw string: `\.` and `\d` are regex escapes, not Python string escapes
# (in a plain string literal they are invalid escape sequences and emit a warning)
re_doi = r'10\.\d+/.+'
plosData['URI (DOI or URL)'].str.contains(re_doi).value_counts()

True     120
False     46
Name: URI (DOI or URL), dtype: int64

In [102]:
# How many have their title in the corpus
# Clean the corpus titles once, instead of once per PLOS title
corpus_titles = article_data.title.pipe(clean_titles)
# Cleaned titles contain only word characters and spaces, so they are used as regex patterns as-is
plosData['Title (URL items only)'].pipe(clean_titles).map(
    lambda x: corpus_titles.str.contains(rf'(?i){x}', regex=True).any()
).sum()

35

In [107]:
# Give me 10 pone
# Raw string: `\.` is a regex escape (an invalid escape sequence in a plain string literal)
# NOTE(review): the sample is unseeded, so each run draws different rows; pass `random_state`
# if the rows shown below need to be reproducible
z = plosData['URI (DOI or URL)'][plosData['URI (DOI or URL)'].str.contains(r'\.pone\.')].sample(10)
# Display the sampled URIs
z

350    10.1371/journal.pone.0187219
405    10.1371/journal.pone.0059840
407    10.1371/journal.pone.0030837
393    10.1371/journal.pone.0118545
310    10.1371/journal.pone.0206678
388    10.1371/journal.pone.0143547
295    10.1371/journal.pone.0220751
398    10.1371/journal.pone.0107216
281    10.1371/journal.pone.0226761
338    10.1371/journal.pone.0193744
Name: URI (DOI or URL), dtype: object

In [111]:
# Which of those pone are not in the corpus
# Clean the corpus titles once, instead of once per sampled title
corpus_titles = clean_titles(article_data.title)
for i, title in plosData.loc[z.index, 'Title (URL items only)'].pipe(clean_titles).items():
    # Only consider sampled items whose URI looks like a DOI
    if re.search(re_doi, plosData.loc[i, 'URI (DOI or URL)']):
        # Print the items whose cleaned title is not contained in any cleaned corpus title
        if not corpus_titles.str.contains(rf'(?i){title}', regex=True).any():
            print(i, title)

350 democratizing science with the aid of parametric design and additive manufacturing design and fabrication of a versatile and low cost optical instrument for scattering measurement
407 the spikerbox a low cost open source bioamplifier for increasing public participation in neuroscience inquiry
393 the aerodynamic cost of head morphology in bats maybe not as bad as it seems
310 open source sensor for measuring oxygen partial pressures below 100 microbars
388 open led illuminator a simple and inexpensive led illuminator for fast multicolor particle tracking in neurons
295 low cost solution for rodent home cage behaviour monitoring
398 open source syringe pump library
281 an accurate precise and affordable light emitting diode spectrophotometer for drinking water and other testing with limited resources
338 do it yourself reliable ph stat device by using open source software inexpensive hardware and available laboratory equipment


In [None]:
# Mapping from PLOS CSV column names to the field names used for the article data.
# NOTE(review): the `"": ""` entry looks like an unfinished placeholder, and this mapping is not
# used anywhere in the visible notebook. Confirm or complete it before relying on it.
plos2bibtex = {
    "URI (DOI or URL)": "doi",
    "": "",
}

In [None]:
# PLOS items whose URI is not exactly equal to any `doi` value in `allData`, and how many there are.
# NOTE(review): `allData` is built by the source-loading cells; the `load_data` shortcut cell
# only defines `article_data`, so this cell fails if those cells were skipped.
sel_new_doi = ~plosData["URI (DOI or URL)"].isin(allData.doi.values)
sel_new_doi.sum()

In [None]:
# PLOS items whose title is not exactly equal to any `title` value in `allData`, and how many
# there are. The comparison is on the raw strings; `clean_titles` is not applied here.
sel_new_title = ~plosData["Title (URL items only)"].isin(allData.title.values)
sel_new_title.sum()

In [None]:
# Same titles, different DOIs
# Select rows and columns in a single .loc and take a copy, so that adding the `doi` column
# below writes to an independent frame instead of triggering SettingWithCopyWarning
x = plosData.loc[
    sel_new_doi & ~sel_new_title,
    ["URI (DOI or URL)", "Title (URL items only)"],
].copy()
# For each PLOS title, look up the DOI(s) recorded in allData for the identical title;
# squeeze() yields a scalar for a single match and a Series when several records match
x['doi'] = [
    allData["doi"].loc[
        allData['title'].eq(y)
    ].squeeze()
    for y in x["Title (URL items only)"]
]
x

In [None]:
# Same DOI, different Titles
# First the PLOS titles whose URI is already in allData but whose title is not ...
pprint(plosData["Title (URL items only)"].loc[~sel_new_doi & sel_new_title])
print()
# ... then the allData title(s) recorded for one specific, hard-coded DOI, for comparison
pprint(allData["title"].loc[
    allData["doi"].eq('10.1371/journal.pone.0023783')
])

# All done, now just mess around

In [None]:
# FIXME(review): `nad` is not defined anywhere in the visible notebook, so this cell raises
# NameError on a fresh kernel. Replace it with the DataFrame that was intended.
data = nad

In [None]:
# Dimensions and column names of `data`
print(data.shape)
print(data.columns)

In [None]:
# Dimensions of the article data
print(article_data.shape)

In [None]:
# Flag every row whose title / DOI occurs more than once (all occurrences are flagged),
# and every row without a DOI
dup_title = article_data.duplicated(subset='title', keep=False)
dup_doi = article_data.duplicated(subset='doi', keep=False)
nan_doi = article_data['doi'].isna()
# Printed counts, in order: duplicated titles, duplicated DOIs, missing DOIs,
# duplicated title and DOI, duplicated title but not DOI
report_masks = (
    dup_title,
    dup_doi,
    nan_doi,
    dup_title & dup_doi,
    dup_title & ~dup_doi,
)
print(*(mask.sum() for mask in report_masks))

In [None]:
# Number of records per ISSN, after removing every non-digit character from the ISSN
# (raw string: `\d` is a regex escape, and an invalid escape sequence in a plain string literal)
article_data.issn.str.replace(r'[^\d]', '', regex=True).value_counts()

In [None]:
# Log-log plot of the per-ISSN record counts, in descending order of count
# (raw string: `\d` is a regex escape, and an invalid escape sequence in a plain string literal)
article_data.issn.str.replace(r'[^\d]', '', regex=True).value_counts().reset_index().plot(loglog=True)

In [None]:
# Bar chart of the number of records per `year` value
article_data.groupby('year').size().plot.bar()