In [2]:
import os
from base_fns import get_local_folder
os.chdir(os.path.dirname(get_local_folder()))

import pandas as pd
import tabulate as tb
from hydra import compose, initialize
from omegaconf import OmegaConf
import random
from munch import Munch
import numpy as np
from tab

# Compose the project configuration: Hydra is initialised against the "../cfg"
# config directory and the config named "main" is loaded into `cfg`.
with initialize(version_base=None, config_path="../cfg"):
    cfg = compose(config_name="main")

# Seed the global NumPy and Python RNGs from the configured seed. The pandas
# `.sample()` calls below pass no `random_state`, so they draw from NumPy's
# global state and are repeatable only when the cells run top to bottom.
np.random.seed(cfg.random.seed)
random.seed(cfg.random.seed)

# Raw data

- Cellosaurus; positives, expert curated
- GoogleScholar; negatives, expert curated, gold standard
- LitSuggest; negatives, likely rejects from previous curation cycles, silver standard
- Medline; pseudo negatives, random sample from Medline, bronze standard

In [131]:
# Remove duplicate rows from the MEDLINE sample.
# NOTE: the de-duplicated table is written back to the same path it was read
# from, so the raw file on disk is overwritten.
medline_temp = pd.read_json(cfg.data.raw.pmid.medline, lines=True).drop_duplicates()
medline_temp.to_json(cfg.data.raw.pmid.medline, orient='records', lines=True)

In [132]:
# Load every raw PMID table listed in the config, tagging each row with the
# config key (ORIGIN) of the file it came from.
raw = Munch()
for k, v in cfg.data.raw.pmid.items():
    raw[k] = pd.read_json(v, lines=True).assign(ORIGIN=k)

# Filter out curated PMIDs from the random MEDLINE sample: any PMID that also
# appears in one of the other (curated) tables is dropped from `raw.medline`.
raw_curated = pd.concat(
    [frame for origin, frame in raw.items() if origin != 'medline'],
    axis=0,
)
raw.medline = raw.medline.loc[~raw.medline["PMID"].isin(raw_curated["PMID"])]

# Stack all tables and count rows per (ORIGIN, SOURCE) combination.
raw_all = pd.concat(list(raw.values()), axis=0)
raw_all[['ORIGIN', 'SOURCE']].groupby('ORIGIN').value_counts()

ORIGIN          SOURCE
cellosaurus     1          22719
cellosaurus_ab  1          10000
googlescholar   0            475
litsuggest      0            645
medline         0         558603
Name: count, dtype: int64

Cellosaurus_AB contains a subset of positive PMIDs from Cellosaurus that were previously used to train a LitSuggest model.

Check for duplicates in raw data.

In [133]:
# Number of rows whose PMID already occurred earlier in the combined raw table.
raw_all.duplicated(subset="PMID").sum()

9999

# Processed data

## Test data
- negatives; 300 PMIDs from Google Scholar
- positives; 300 PMIDs from Cellosaurus

In [134]:

# Number of negatives in the test set; the same number of positives is drawn,
# so the test set is balanced.
N_TEST_NEG = 300

# Container for all processed datasets (test + training variants).
processed = Munch()

# Test negatives: random sample of Google Scholar PMIDs.
# The sample size comes from N_TEST_NEG rather than a repeated literal, so the
# constant is the single place to change the test-set size.
test_neg = raw.googlescholar.sample(n=N_TEST_NEG)

# Test positives: sampled only from Cellosaurus PMIDs that are NOT in
# cellosaurus_ab, so no test positive can appear in the cellosaurus_ab table.
test_pos = raw.cellosaurus[
    ~raw.cellosaurus['PMID'].isin(raw.cellosaurus_ab["PMID"])
].sample(n=test_neg.shape[0])

# Combine and shuffle; keep the PMID list so later cells can exclude test items.
processed.test = pd.concat([test_neg, test_pos]).sample(frac=1)
test_pmids = processed.test.PMID.tolist()
processed.test['DATASET'] = "test"

# Composition summary (rows per DATASET / SOURCE / ORIGIN).
processed.test[['DATASET', 'ORIGIN', 'SOURCE']].groupby(['DATASET', 'SOURCE']).value_counts()

DATASET  SOURCE  ORIGIN       
test     0       googlescholar    300
         1       cellosaurus      300
Name: count, dtype: int64

## Train data
We will create two balanced training sets:
- Train_silver; smaller dataset including only a portion of the data from Cellosaurus. Negatives include only high-quality data from Google Scholar (gold) and LitSuggest (silver).
- Train_bronze; larger dataset including all data from Cellosaurus. Negatives are extended with pseudo-negatives from MEDLINE (bronze).

### Train_silver
- negatives 
    - all remaining PMIDs from Google Scholar (gold)
    - all PMIDs from LitSuggest (silver)
- positive
    - matched sample from cellosaurus

In [135]:
# Negatives: Google Scholar PMIDs not used in the test set, plus all LitSuggest PMIDs.
train_silver_neg = pd.concat(
    [raw.googlescholar.loc[~raw.googlescholar["PMID"].isin(test_pmids)], raw.litsuggest]
)

# Positives: random Cellosaurus sample (test PMIDs excluded), one per negative.
train_silver_pos = raw.cellosaurus.loc[
    ~raw.cellosaurus["PMID"].isin(test_pmids)
].sample(n=train_silver_neg.shape[0])

# Combine, shuffle and label the dataset.
processed.train_silver = (
    pd.concat([train_silver_neg, train_silver_pos])
    .sample(frac=1)
    .assign(DATASET="train_silver")
)
train_silver_pmids = processed.train_silver["PMID"].tolist()

# Composition summary (rows per DATASET / SOURCE / ORIGIN).
(
    processed.train_silver[["DATASET", "ORIGIN", "SOURCE"]]
    .groupby(["DATASET", "SOURCE"])
    .value_counts()
)

DATASET       SOURCE  ORIGIN       
train_silver  0       litsuggest       645
                      googlescholar    175
              1       cellosaurus      820
Name: count, dtype: int64

Check for overlap with test data.

In [136]:
# Count PMIDs that occur more than once across train_silver and test combined.
pd.concat([processed.train_silver["PMID"], processed.test["PMID"]]).duplicated().sum()

0

### Train_bronze
- negatives
    - all remaining PMIDs from Google Scholar (gold)
    - all PMIDs from LitSuggest (silver) 
    - matched sample of pseudo-negatives from MEDLINE (bronze)
- positives
    - all remaining PMIDs from Cellosaurus

In [137]:
# Positives: every Cellosaurus PMID that is not part of the test set.
train_bronze_pos = raw.cellosaurus.loc[~raw.cellosaurus["PMID"].isin(test_pmids)]

# Curated negatives: remaining Google Scholar PMIDs plus all LitSuggest PMIDs.
train_bronze_neg = pd.concat(
    [raw.googlescholar.loc[~raw.googlescholar["PMID"].isin(test_pmids)], raw.litsuggest]
)

# Top up the negatives with MEDLINE pseudo-negatives (curated PMIDs excluded)
# so that both classes end up the same size.
medline_n = len(train_bronze_pos) - len(train_bronze_neg)
medline_sample = raw.medline.loc[
    ~raw.medline["PMID"].isin(raw_curated["PMID"])
].sample(medline_n)
train_bronze_neg = pd.concat([train_bronze_neg, medline_sample])

# Combine, shuffle and label the dataset.
processed.train_bronze = (
    pd.concat([train_bronze_neg, train_bronze_pos])
    .sample(frac=1)
    .assign(DATASET="train_bronze")
)

# Composition summary (rows per DATASET / SOURCE / ORIGIN).
(
    processed.train_bronze[["DATASET", "ORIGIN", "SOURCE"]]
    .groupby(["DATASET", "SOURCE"])
    .value_counts()
)

DATASET       SOURCE  ORIGIN       
train_bronze  0       medline          21599
                      litsuggest         645
                      googlescholar      175
              1       cellosaurus      22419
Name: count, dtype: int64

Check for overlap with test set.

In [138]:
# Count PMIDs that occur more than once across train_bronze and test combined.
pd.concat([processed.train_bronze["PMID"], processed.test["PMID"]]).duplicated().sum()

0


### Train_silver_**ab**
- negatives 
    - all remaining PMIDs from Google Scholar (gold)
    - all PMIDs from LitSuggest (silver)
- positive
    - matched sample from **cellosaurus_ab**

In [139]:
# Negatives: Google Scholar PMIDs not used in the test set, plus all LitSuggest PMIDs.
train_silver_ab_neg = pd.concat(
    [raw.googlescholar[~raw.googlescholar.PMID.isin(test_pmids)], raw.litsuggest]
)

# Positives: random cellosaurus_ab sample (test PMIDs excluded), one per negative.
# The sample size is taken from THIS cell's negatives (`train_silver_ab_neg`),
# not from `train_silver_neg` of the Train_silver cell, so the dataset stays
# balanced and the cell does not depend on that other cell having been run.
train_silver_ab_pos = raw.cellosaurus_ab[
    ~raw.cellosaurus_ab.PMID.isin(test_pmids)
].sample(n=len(train_silver_ab_neg))

# Combine, shuffle and label the dataset.
processed.train_silver_ab = pd.concat([train_silver_ab_neg, train_silver_ab_pos]).sample(frac=1)
train_silver_ab_pmids = processed.train_silver_ab.PMID.tolist()
processed.train_silver_ab["DATASET"] = "train_silver_ab"

# Composition summary (rows per DATASET / SOURCE / ORIGIN).
processed.train_silver_ab[["DATASET", "ORIGIN", "SOURCE"]].groupby(
    ["DATASET", "SOURCE"]
).value_counts()

DATASET          SOURCE  ORIGIN        
train_silver_ab  0       litsuggest        645
                         googlescholar     175
                 1       cellosaurus_ab    820
Name: count, dtype: int64

Check for overlap with test data.

In [140]:
# Count PMIDs that occur more than once across train_silver_ab and test combined.
pd.concat([processed.train_silver_ab["PMID"], processed.test["PMID"]]).duplicated().sum()

0

### Train_bronze_**ab**
- negatives
    - all remaining PMIDs from Google Scholar (gold)
    - all PMIDs from LitSuggest (silver) 
    - matched sample of pseudo-negatives from MEDLINE (bronze)
- positives
    - all remaining PMIDs from **Cellosaurus_ab**

In [141]:
# Positives: every cellosaurus_ab PMID that is not part of the test set.
train_bronze_ab_pos = raw.cellosaurus_ab.loc[~raw.cellosaurus_ab["PMID"].isin(test_pmids)]

# Curated negatives: remaining Google Scholar PMIDs plus all LitSuggest PMIDs.
train_bronze_ab_neg = pd.concat(
    [raw.googlescholar.loc[~raw.googlescholar["PMID"].isin(test_pmids)], raw.litsuggest]
)

# Top up the negatives with MEDLINE pseudo-negatives (curated PMIDs excluded)
# so that both classes end up the same size.
medline_n = len(train_bronze_ab_pos) - len(train_bronze_ab_neg)
medline_sample = raw.medline.loc[
    ~raw.medline["PMID"].isin(raw_curated["PMID"])
].sample(medline_n)
train_bronze_ab_neg = pd.concat([train_bronze_ab_neg, medline_sample])

# Combine, shuffle and label the dataset.
processed.train_bronze_ab = (
    pd.concat([train_bronze_ab_neg, train_bronze_ab_pos])
    .sample(frac=1)
    .assign(DATASET="train_bronze_ab")
)

# Composition summary (rows per DATASET / SOURCE / ORIGIN).
(
    processed.train_bronze_ab[["DATASET", "ORIGIN", "SOURCE"]]
    .groupby(["DATASET", "SOURCE"])
    .value_counts()
)

DATASET          SOURCE  ORIGIN        
train_bronze_ab  0       medline            9180
                         litsuggest          645
                         googlescholar       175
                 1       cellosaurus_ab    10000
Name: count, dtype: int64

Check for overlap with test set.

In [142]:
# Count PMIDs that occur more than once across train_bronze_ab and test combined.
pd.concat([processed.train_bronze_ab["PMID"], processed.test["PMID"]]).duplicated().sum()

0

### Train_**ml_ab**
Similar to Train_bronze_ab but without the negatives from the Google Scholar or LitSuggest datasets.
The dataset can be used to compare the performance of the new models to the original LitSuggest model.

- negatives
    - matched sample of pseudo-negatives from MEDLINE (bronze)
- positives
    - all remaining PMIDs from **Cellosaurus_ab**

In [143]:
# Positives: every cellosaurus_ab PMID that is not part of the test set.
train_ml_ab_pos = raw.cellosaurus_ab.loc[~raw.cellosaurus_ab["PMID"].isin(test_pmids)]

# Negatives: MEDLINE pseudo-negatives only (curated PMIDs excluded), one per positive.
medline_n = len(train_ml_ab_pos)
train_ml_ab_neg = raw.medline.loc[
    ~raw.medline["PMID"].isin(raw_curated["PMID"])
].sample(medline_n)

# Combine, shuffle and label the dataset.
processed.train_ml_ab = (
    pd.concat([train_ml_ab_neg, train_ml_ab_pos])
    .sample(frac=1)
    .assign(DATASET="train_ml_ab")
)

# Composition summary (rows per DATASET / SOURCE / ORIGIN).
(
    processed.train_ml_ab[["DATASET", "ORIGIN", "SOURCE"]]
    .groupby(["DATASET", "SOURCE"])
    .value_counts()
)

DATASET      SOURCE  ORIGIN        
train_ml_ab  0       medline           10000
             1       cellosaurus_ab    10000
Name: count, dtype: int64

In [144]:
# Count PMIDs that occur more than once across train_ml_ab and test combined.
pd.concat([processed.train_ml_ab["PMID"], processed.test["PMID"]]).duplicated().sum()

0

Write processed data to file.

In [145]:
# Write each processed dataset as line-delimited JSON records to the path
# configured under cfg.data.processed.pmid for its key, logging each path.
for k in processed.keys():
    out_path = cfg.data.processed.pmid[k]
    processed[k].to_json(out_path, orient='records', lines=True)
    print(f'Write {out_path}')

Write data/processed/pmid/test.ndjson
Write data/processed/pmid/train_silver.ndjson
Write data/processed/pmid/train_bronze.ndjson
Write data/processed/pmid/train_silver_ab.ndjson
Write data/processed/pmid/train_bronze_ab.ndjson
Write data/processed/pmid/train_ml_ab.ndjson
