In [44]:
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import numpy as np

tqdm.pandas()
import re


In [45]:
# Load only the id and abstract columns from the raw dump
abstract_columns = ["pmid", "abstract"]
df = pd.read_parquet("data/pubmed_dump_raw.parquet", columns=abstract_columns)
df


Unnamed: 0,pmid,abstract
0,30656465,\n\n \n Koala retrovirus (KoRV) is a...
1,28669101,\n\n \n Koala (Phascolarctos cinereu...
2,29967444,"\n\n \n The koala, the only extant s..."
3,32470998,\n\n \n Habitat destruction and frag...
4,31848216,\n\n \n The morphology and locomotor...
...,...,...
923,33257543,\n\n \n Fingerprints are unique to p...
924,36161902,\n\n \n Lorises are a group of globa...
925,24906475,\n\n \n Structural characterizations...
926,25197935,\n\n \n A specific galactose-binding...


In [46]:
def gen_ent(abstract, ner, min_score):
    """Tag one abstract with ``ner`` and group the retained entity words by label.

    Parameters
    ----------
    abstract : str
        Text to tag. Any non-string value (``None``, a ``NaN`` standing in for
        a missing abstract, ...) yields ``{}`` without calling ``ner``.
    ner : callable
        Called as ``ner(abstract)``. Its result is loaded into a DataFrame, so
        it is expected to be a list of records exposing ``entity_group``,
        ``word`` and ``score``.
    min_score : float
        Entities are kept only when their score is strictly greater than this.

    Returns
    -------
    dict
        ``{entity_group: [word, ...]}`` for the entities that survive the
        filters; ``{}`` when there are none.
    """
    # Guard against missing abstracts so one bad row cannot abort a long apply run.
    if not isinstance(abstract, str):
        return {}

    ner_results = ner(abstract)
    if not ner_results:
        return {}

    entity_df = pd.DataFrame(ner_results)

    # Drop the "MISC" and "0" groups and anything at or below the score threshold.
    # NOTE(review): "0" looks like the outside/non-entity tag of some models — confirm.
    keep = ~entity_df.entity_group.isin(["MISC", "0"]) & (entity_df.score > min_score)
    entity_df = entity_df[keep]

    if entity_df.empty:
        return {}

    return entity_df.groupby("entity_group").word.agg(list).to_dict()


In [47]:
# entities: location (LOC), organizations (ORG), person (PER) and Miscellaneous (MISC)
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# One {entity_group: [words]} dict per abstract (gen_ent filters out MISC).
ent_dict = df.abstract.progress_apply(gen_ent, ner=ner, min_score=0.5)
# Build the entity frame on df's own index so the column-wise concat below
# lines rows up by label even if df does not carry a default RangeIndex.
ent_df = pd.DataFrame(ent_dict.to_list(), index=df.index)
# Drop entity columns left by an earlier run of this cell so that re-running
# replaces them instead of appending duplicate column names.
df = pd.concat([df.drop(columns=ent_df.columns, errors="ignore"), ent_df], axis=1)


100%|██████████| 928/928 [19:53<00:00,  1.29s/it]


In [48]:
# entities: DISEASE
tokenizer = AutoTokenizer.from_pretrained("alvaroalon2/biobert_diseases_ner")
model = AutoModelForTokenClassification.from_pretrained(
    "alvaroalon2/biobert_diseases_ner"
)
ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# One {entity_group: [words]} dict per abstract.
ent_dict = df.abstract.progress_apply(gen_ent, ner=ner, min_score=0.5)
# Build the entity frame on df's own index so the column-wise concat below
# lines rows up by label even if df does not carry a default RangeIndex.
ent_df = pd.DataFrame(ent_dict.to_list(), index=df.index)
# Drop entity columns left by an earlier run of this cell so that re-running
# replaces them instead of appending duplicate column names.
df = pd.concat([df.drop(columns=ent_df.columns, errors="ignore"), ent_df], axis=1)


100%|██████████| 928/928 [18:25<00:00,  1.19s/it]


In [49]:
# entities: GENETIC
tokenizer = AutoTokenizer.from_pretrained("alvaroalon2/biobert_genetic_ner")
model = AutoModelForTokenClassification.from_pretrained(
    "alvaroalon2/biobert_genetic_ner"
)
ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# One {entity_group: [words]} dict per abstract.
ent_dict = df.abstract.progress_apply(gen_ent, ner=ner, min_score=0.5)
# Build the entity frame on df's own index so the column-wise concat below
# lines rows up by label even if df does not carry a default RangeIndex.
ent_df = pd.DataFrame(ent_dict.to_list(), index=df.index)
# Drop entity columns left by an earlier run of this cell so that re-running
# replaces them instead of appending duplicate column names.
df = pd.concat([df.drop(columns=ent_df.columns, errors="ignore"), ent_df], axis=1)


100%|██████████| 928/928 [18:00<00:00,  1.16s/it]


In [50]:
# entities: CHEMICAL
tokenizer = AutoTokenizer.from_pretrained("alvaroalon2/biobert_chemical_ner")
model = AutoModelForTokenClassification.from_pretrained(
    "alvaroalon2/biobert_chemical_ner"
)
ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# One {entity_group: [words]} dict per abstract.
ent_dict = df.abstract.progress_apply(gen_ent, ner=ner, min_score=0.5)
# Build the entity frame on df's own index so the column-wise concat below
# lines rows up by label even if df does not carry a default RangeIndex.
ent_df = pd.DataFrame(ent_dict.to_list(), index=df.index)
# Drop entity columns left by an earlier run of this cell so that re-running
# replaces them instead of appending duplicate column names.
df = pd.concat([df.drop(columns=ent_df.columns, errors="ignore"), ent_df], axis=1)


100%|██████████| 928/928 [17:36<00:00,  1.14s/it]


In [51]:
# Post-process entities
def strip_str(instr):
    """Replace every run of characters other than ASCII letters/digits with one space."""
    return re.sub("[^A-Za-z0-9]+", " ", instr)


def rm_htkn(row, entity_name):
    """Keep the entities of one row that occur as whole words in its abstract.

    Each entity is normalised with ``strip_str`` and trimmed of surrounding
    spaces; it is kept only if the normalised text is non-empty and is found
    in the abstract delimited by word boundaries.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``row[entity_name]`` and ``row["abstract"]``. The abstract
        is searched exactly as given, so it should already be normalised the
        same way as the entities (i.e. passed through ``strip_str``).
    entity_name : str
        Key under which the row stores its list of entity strings.

    Returns
    -------
    list of str
        The normalised, trimmed entities that matched, in their original
        order. ``[]`` when ``row[entity_name]`` is not a list (e.g. ``NaN``).
    """
    entities = row[entity_name]
    if not isinstance(entities, list):
        return []

    abstract = row["abstract"]
    whole_words = []
    for raw_word in entities:
        # strip_str turns surrounding punctuation ("(KoRV)", "##ing") into
        # leading/trailing spaces. Trim them so the stored entity is clean and
        # so the \b anchors sit directly on the word — otherwise an entity at
        # the very start or end of the abstract can never match.
        word = strip_str(raw_word).strip()
        if not word:
            # Nothing alphanumeric left; an empty pattern would match anywhere.
            continue
        # re.escape is a no-op for today's alphanumeric/space words but keeps
        # the pattern literal should the normalisation ever change.
        if re.search(r"\b{}\b".format(re.escape(word)), abstract):
            whole_words.append(word)
    return whole_words


# NER label (column as produced above) -> final column name
entity_dict = {
    "LOC": "location",
    "ORG": "organization",
    "PER": "person",
    "GENETIC": "genetic",
    "DISEASE": "disease",
    "CHEMICAL": "chemical",
}

# Normalise the abstracts with the same function rm_htkn applies to entity words
df["abstract"] = df["abstract"].apply(strip_str)

# Clean each entity column row by row, replacing it in place
for entity in tqdm(entity_dict):
    df[entity] = df.apply(rm_htkn, axis=1, entity_name=entity)

df = df.rename(columns=entity_dict)


100%|██████████| 6/6 [00:00<00:00,  7.70it/s]


In [52]:
# Fill missing cells in every column with an empty list (keyed per row label),
# drop the abstract text, then write the result to disk
filled = df.apply(lambda column: column.fillna({label: [] for label in df.index}))
df = filled.drop(columns="abstract")
df.to_parquet("data/entities_05.parquet")
