In [6]:
import os
from base_fns import get_local_folder
# Change the working directory to the parent of the path returned by
# get_local_folder(), so relative paths used later resolve from there
# (presumably the project root — TODO confirm).
os.chdir(os.path.dirname(get_local_folder()))

import pandas as pd
import tabulate as tb
from hydra import compose, initialize
from omegaconf import OmegaConf
import random
from munch import Munch
import numpy as np
from pandarallel import pandarallel
from orjson import loads
from pathlib import Path

# Compose the Hydra configuration named "main" (config_path "../cfg").
with initialize(version_base=None, config_path="../cfg"):
    cfg = compose(config_name="main")

# Seed NumPy's and the stdlib's global generators with the configured seed.
np.random.seed(cfg.random.seed)
random.seed(cfg.random.seed)

## PMIDs

In [7]:
# Read every configured PMID file (JSON lines) and tag its rows with the
# dataset key it was loaded under.
pmids = Munch()
for name, path in cfg.data.processed.pmid.items():
    pmids[name] = pd.read_json(path, lines=True).assign(DATASET=name)

# Stack all datasets into a single frame with a fresh index.
pmids_combined = pd.concat(list(pmids.values()), ignore_index=True)

# Row counts per (DATASET, SOURCE) pair.
pmids_combined[['DATASET', 'SOURCE']].groupby(['DATASET','SOURCE']).value_counts()


DATASET          SOURCE
train_bronze_ab  0         10000
                 1         10000
Name: count, dtype: int64

## Articles

In [8]:
# List the entries of the raw article directory and report how many there are.
repo_fn = os.listdir(cfg.data.raw.article)
print(f"repository contains text for {len(repo_fn)} articles")

repository contains text for 132361 articles


## Abstract and fulltext datasets

In [9]:
# Start pandarallel; data_text below relies on DataFrame.parallel_apply.
pandarallel.initialize()

def get_article(row):
    """Load the stored article JSON that belongs to one PMID row.

    The article is expected at ``<PMID>.json`` inside
    ``cfg.data.raw.article``. When the article has no (or an empty)
    ``FULLTEXT`` field, its ``ABSTRACT`` is used in its place.

    Args:
        row: A row of a PMID frame; only ``row.PMID`` is read.

    Returns:
        pd.Series: The parsed article fields, or an empty Series when no
        file exists for the PMID.
    """
    article_path = os.path.join(cfg.data.raw.article, str(row.PMID) + '.json')

    # Probe the one path we need instead of listing the whole repository
    # directory for every row.
    if not os.path.isfile(article_path):
        return pd.Series({})

    # Read raw bytes: orjson parses UTF-8 bytes directly, so the result no
    # longer depends on the platform's default text encoding.
    with open(article_path, 'rb') as f:
        article = loads(f.read())

    # If fulltext was not returned, fall back to the abstract.
    if not article.get("FULLTEXT"):
        article["FULLTEXT"] = article.get("ABSTRACT")

    return pd.Series(article)

def data_text(df_in):
    """Attach article fields to every row of a PMID frame.

    Applies ``get_article`` to each row through pandarallel's
    ``parallel_apply`` and concatenates the resulting columns to ``df_in``
    column-wise.

    Args:
        df_in: DataFrame whose rows expose a ``PMID`` attribute.

    Returns:
        pd.DataFrame: ``df_in`` followed by the article columns, without the
        article's own ``PMID`` column.
    """
    df_article = df_in.parallel_apply(get_article, axis=1, result_type="expand")
    # The expanded result may have no PMID column (e.g. when no article file
    # was found for any row); ignore that instead of raising KeyError.
    df_article = df_article.drop(columns='PMID', errors='ignore')
    return pd.concat([df_in, df_article], axis=1)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [10]:
# Build the text-enriched frame for every PMID dataset, keyed like `pmids`.
processed = Munch()
for name in pmids:
    processed[name] = data_text(pmids[name])

### Samples without abstract-text.

Samples with no abstract per group, shown as (count, proportion).

In [11]:
# Stack all processed datasets, then report per group how many abstracts are
# missing as (count, share rounded to two decimals).
processed_combined = pd.concat(list(processed.values()), axis=0)
processed_combined.groupby(["DATASET", "SOURCE", "ORIGIN"])['ABSTRACT'].apply(
    lambda abstracts: (abstracts.isna().sum(), abstracts.isna().mean().round(2))
)

DATASET          SOURCE  ORIGIN        
train_bronze_ab  0       googlescholar       (0, 0.0)
                         litsuggest        (61, 0.09)
                         medline             (0, 0.0)
                 1       cellosaurus_ab      (1, 0.0)
Name: ABSTRACT, dtype: object

Drop samples with no abstract and verify.

In [12]:
# Keep only the rows that have an abstract in every dataset.
for name in processed:
    frame = processed[name]
    processed[name] = frame.loc[frame["ABSTRACT"].notna()]

# Re-run the missing-abstract summary; every group should now show (0, 0.0).
processed_combined = pd.concat(list(processed.values()), axis=0)
processed_combined.groupby(["DATASET", "SOURCE", "ORIGIN"])['ABSTRACT'].apply(
    lambda abstracts: (abstracts.isna().sum(), abstracts.isna().mean().round(2))
)

DATASET          SOURCE  ORIGIN        
train_bronze_ab  0       googlescholar     (0, 0.0)
                         litsuggest        (0, 0.0)
                         medline           (0, 0.0)
                 1       cellosaurus_ab    (0, 0.0)
Name: ABSTRACT, dtype: object

### Fulltext

In [13]:

# Per (DATASET, SOURCE, ORIGIN): (missing fulltext count, group size, missing share).
processed_combined.groupby(["DATASET", "SOURCE", 'ORIGIN'])['FULLTEXT'].apply(
    lambda texts: (texts.isna().sum(), len(texts), texts.isna().mean().round(2))
)

DATASET          SOURCE  ORIGIN        
train_bronze_ab  0       googlescholar      (0, 175, 0.0)
                         litsuggest         (0, 584, 0.0)
                         medline           (0, 9180, 0.0)
                 1       cellosaurus_ab    (0, 9999, 0.0)
Name: FULLTEXT, dtype: object

In [14]:

# Per DATASET: (missing fulltext count, group size, missing share).
processed_combined.groupby(["DATASET"])['FULLTEXT'].apply(
    lambda texts: (texts.isna().sum(), len(texts), texts.isna().mean().round(2))
)

DATASET
train_bronze_ab    (0, 19938, 0.0)
Name: FULLTEXT, dtype: object

Write processed data to file.

In [15]:
# Persist every processed dataset as JSON lines at its configured path.
for k, frame in processed.items():
    frame.to_json(cfg.data.processed.text[k], orient='records', lines=True)
    print(f'Write {cfg.data.processed.text[k]}')

Write data/processed/text/train_bronze_ab.ndjson
