In [23]:
import os
from base_fns import get_local_folder
# Change the working directory to os.path.dirname() of the path returned by
# get_local_folder() before the remaining imports and config loading run.
# NOTE(review): presumably this lands on the project root so that the local
# `script` package and relative paths resolve — confirm.
os.chdir(os.path.dirname(get_local_folder()))

import pandas as pd
import tabulate as tb
from hydra import compose, initialize
from omegaconf import OmegaConf
import random
from munch import Munch
import numpy as np
from pandarallel import pandarallel
from orjson import loads
from pathlib import Path
from script.ct_model import CtTagger

# Compose the Hydra config named "main" from the ../cfg directory.
with initialize(version_base=None, config_path="../cfg"):
    cfg = compose(config_name="main")

# Seed NumPy's and the stdlib's global generators with the configured seed.
np.random.seed(cfg.random.seed)
random.seed(cfg.random.seed)


## PMIDs

In [4]:


# Location of the PMID table, taken from the evaluation config.
DF_FP = cfg.evaluation.celltriage.oneweek
df = pd.read_csv(filepath_or_buffer=DF_FP)
# NOTE(review): later output shows an "Unnamed: 0" column, which suggests the
# CSV was written with its index — consider index_col=0 if that is unintended.
# Number of rows loaded.
df.shape[0]


28609

## Articles

In [5]:
# Every entry in the raw-article directory is counted as one article.
repo_fn = os.listdir(path=cfg.data.raw.article)
print(f"repository contains text for {len(repo_fn)} articles")

repository contains text for 95239 articles


## Abstract and fulltext datasets

In [10]:
# Initialise pandarallel; data_text() below relies on DataFrame.parallel_apply.
pandarallel.initialize()

def get_article(row):
    """Load the stored article JSON for one row and return its fields as a Series.

    The article is looked up as ``<PMID>.json`` inside the directory
    ``cfg.data.raw.article``.

    Args:
        row: One row of the PMID table; must expose a numeric ``PMID`` attribute.

    Returns:
        pd.Series: The key/value pairs of the article JSON. When ``FULLTEXT``
        is missing or empty it is filled with the value of ``ABSTRACT``
        (``None`` if that is missing too). An empty Series is returned when
        no file exists for the PMID.
    """
    article_fp = os.path.join(cfg.data.raw.article, f"{int(row.PMID)}.json")

    # Probe this one path instead of listing the whole repository directory
    # for every row.
    if not os.path.isfile(article_fp):
        return pd.Series({})

    # Read raw bytes: orjson parses bytes directly, so no locale-dependent
    # text decoding is involved.
    with open(article_fp, "rb") as f:
        article = loads(f.read())

    # If no full text was returned, fall back to the abstract.
    if not article.get("FULLTEXT"):
        article["FULLTEXT"] = article.get("ABSTRACT")

    return pd.Series(article)

def data_text(df_in):
    """Append the article fields returned by ``get_article`` to every row of ``df_in``.

    Args:
        df_in: DataFrame with a ``PMID`` column (read by ``get_article``).

    Returns:
        pd.DataFrame: ``df_in`` with the expanded article columns concatenated
        on the right. Rows for which ``get_article`` returned an empty Series
        hold missing values in those columns.
    """
    # Run get_article row-wise across the pandarallel workers and expand each
    # returned Series into columns.
    df_article = df_in.parallel_apply(get_article, axis=1, result_type="expand")

    # Drop the PMID coming from the article records so it is not duplicated
    # next to df_in's own PMID. errors="ignore" covers the case where no
    # article was found and the expanded frame has no PMID column at all.
    df_article = df_article.drop(columns="PMID", errors="ignore")

    return pd.concat([df_in, df_article], axis=1)


INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Unnamed: 0,PMID,SCORE_0918,PMCID,ABSTRACT,FULLTEXT
0,37682469,97.26,,"Analysis of Chromatin Accessibility, Histone M...","Analysis of Chromatin Accessibility, Histone M..."


In [11]:
# Attach the article text columns to the PMID table and preview the result.
df_out = data_text(df_in=df)
df_out.head(n=5)

Unnamed: 0,PMID,SCORE_0918,PMCID,ABSTRACT,FULLTEXT
0,37682469,97.26,,"Analysis of Chromatin Accessibility, Histone M...","Analysis of Chromatin Accessibility, Histone M..."
1,37663788,97.25,PMC10469926,Improved prediction of MHC-peptide binding usi...,Improved prediction of MHC-peptide binding usi...
2,37662358,97.22,PMC10473580,Active learning of enhancer and silencer regul...,Active learning of enhancer and silencer regul...
3,37669185,97.22,,Experts Collaboration Learning for Continual M...,Experts Collaboration Learning for Continual M...
4,37685874,97.21,PMC10487524,A Leukemic Target with a Thousand Faces: The M...,A Leukemic Target with a Thousand Faces: The M...


### Samples without abstract text

In [18]:
# Number of rows whose ABSTRACT is null.
int(df_out["ABSTRACT"].isnull().sum())

0

In [24]:
# Write the combined table as tab-separated text to the configured path.
# The DataFrame index is written as the first column (pandas default).
df_out.to_csv(path_or_buf=cfg.evaluation.celltriage.oneweek_txt, sep="\t")
