In [291]:
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import numpy as np
tqdm.pandas()
import re



In [292]:
df = pd.read_parquet("pubmed_dump.parquet", columns=["pmid", "abstract"])
df


Unnamed: 0,pmid,abstract
0,33546104,Chlamydia is a significant pathogen for many s...
1,32770481,Koala retrovirus (KoRV) is a major threat to k...
2,29382557,Infectious diseases are contributing to the de...
3,24148555,Koala retroviruses (KoRV) have been isolated f...
4,26958909,A retroviral etiology for malignant neoplasias...
...,...,...
922,33257543,Fingerprints are unique to primates and koalas...
923,36161902,Lorises are a group of globally threatened str...
924,24906475,Structural characterizations of marsupial milk...
925,25197935,A specific galactose-binding lectin was shown ...


In [293]:
def gen_ent(abstract, ner, min_score):

    ner_results = ner(abstract)

    if not ner_results:
        return {}

    entity_df = pd.DataFrame(ner_results)
    entity_df = entity_df[entity_df.entity_group != "MISC"]
    entity_df = entity_df[entity_df.entity_group != "0"]
    entity_df = entity_df[entity_df.score > min_score]

    if entity_df.empty:
        return {}
    else:
        entity_dict = entity_df.groupby("entity_group").word.agg(list).to_dict()
        return entity_dict


In [294]:
# entities: location (LOC), organizations (ORG), person (PER) and Miscellaneous (MISC)
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

ent_dict = df.abstract.progress_apply(gen_ent, ner=ner, min_score=0.9)
ent_df = pd.DataFrame.from_dict(ent_dict.to_list())
df = pd.concat([df, ent_df], axis=1)


100%|██████████| 927/927 [17:36<00:00,  1.14s/it]


In [295]:
# entities: DISEASE
tokenizer = AutoTokenizer.from_pretrained("alvaroalon2/biobert_diseases_ner")
model = AutoModelForTokenClassification.from_pretrained(
    "alvaroalon2/biobert_diseases_ner"
)
ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

ent_dict = df.abstract.progress_apply(gen_ent, ner=ner, min_score=0.9)
ent_df = pd.DataFrame.from_dict(ent_dict.to_list())
df = pd.concat([df, ent_df], axis=1)


100%|██████████| 927/927 [16:27<00:00,  1.07s/it]


In [296]:
# entities: GENETIC
tokenizer = AutoTokenizer.from_pretrained("alvaroalon2/biobert_genetic_ner")
model = AutoModelForTokenClassification.from_pretrained(
    "alvaroalon2/biobert_genetic_ner"
)
ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

ent_dict = df.abstract.progress_apply(gen_ent, ner=ner, min_score=0.9)
ent_df = pd.DataFrame.from_dict(ent_dict.to_list())
df = pd.concat([df, ent_df], axis=1)


100%|██████████| 927/927 [16:26<00:00,  1.06s/it]


In [297]:
# entities: CHEMICAL
tokenizer = AutoTokenizer.from_pretrained("alvaroalon2/biobert_chemical_ner")
model = AutoModelForTokenClassification.from_pretrained(
    "alvaroalon2/biobert_chemical_ner"
)
ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

ent_dict = df.abstract.progress_apply(gen_ent, ner=ner, min_score=0.9)
ent_df = pd.DataFrame(ent_dict.to_list())
df = pd.concat([df, ent_df], axis=1)


100%|██████████| 927/927 [16:21<00:00,  1.06s/it]


In [298]:
df


Unnamed: 0,pmid,abstract,LOC,ORG,PER,DISEASE,GENETIC,CHEMICAL
0,33546104,Chlamydia is a significant pathogen for many s...,,,,"[Chlamydia, chlamydial, chlamydial infection, ...","[interferon - gamma, IFN - γ, IgG, mucosal IgA...",
1,32770481,Koala retrovirus (KoRV) is a major threat to k...,,,,"[retroviral diseases, l, ##ymphoma, leukemia, ...",,
2,29382557,Infectious diseases are contributing to the de...,,,,"[In, ##fectious diseases, ocular and urogenita...",,
3,24148555,Koala retroviruses (KoRV) have been isolated f...,[Australia],,,"[leukemia, leukemia, chlamydiosis, l, ##ymphom...",[envelope protein],
4,26958909,A retroviral etiology for malignant neoplasias...,"[Los Angeles Zoo, United States]",,,"[malignant neoplasia, neoplasia, lymphomas, ne...",,
...,...,...,...,...,...,...,...,...
922,33257543,Fingerprints are unique to primates and koalas...,,,,,,
923,36161902,Lorises are a group of globally threatened str...,,,,"[p, p, p, p, ##ygmy, p, p, p, ##ygmy]","[GSTA gene family, PITRM1, PITRM1, MYOF, PER2,...",
924,24906475,Structural characterizations of marsupial milk...,,,,,,"[car, ##bohydrate, car, ##bohydrate, ( 1 ) H, ..."
925,25197935,A specific galactose-binding lectin was shown ...,,[EC],,"[hemolysis, hemolysis, toxicity]","[lectin, streptolysin O, SLO, lectins, T - ant...","[galact, ##ose, N - acetyllactosamine, α - gal..."


In [299]:
def strip_str(instr):
    return re.sub('[^A-Za-z0-9]+', ' ', instr)
    
def rm_htkn(row, entity_name):
    entities = row[entity_name]
    abstract = row["abstract"]

    if isinstance(entities, np.ndarray):
        #return [word for word in entities if word in abstract]
        


        whole_words=[]
        for word in entities:
            word=strip_str(word)
            if word!=' ':
                if re.search(r'\b{}\b'.format(word), abstract):                
                    whole_words.append(word)
        return whole_words

    else:
        return None


entity_dict = {
    "LOC": "location",
    "ORG": "organization",
    "PER": "person",
    "GENETIC": "genetic",
    "DISEASE": "disease",
    "CHEMICAL": "chemical",
}
df.abstract=df.abstract.apply(strip_str)
for entity in tqdm(entity_dict.keys()):
    print(entity)
    # df[entity]=df[entity].apply(lambda x: [strip_str(word) for word in x])
    df[entity] = df.apply(rm_htkn, args=(entity,), axis=1)

df=df.rename(columns=entity_dict)
df

100%|██████████| 6/6 [00:00<00:00, 62.50it/s]

LOC
ORG
PER
GENETIC
DISEASE
CHEMICAL





Unnamed: 0,pmid,abstract,location,organization,person,disease,genetic,chemical
0,33546104,Chlamydia is a significant pathogen for many s...,,,,,,
1,32770481,Koala retrovirus KoRV is a major threat to koa...,,,,,,
2,29382557,Infectious diseases are contributing to the de...,,,,,,
3,24148555,Koala retroviruses KoRV have been isolated fro...,,,,,,
4,26958909,A retroviral etiology for malignant neoplasias...,,,,,,
...,...,...,...,...,...,...,...,...
922,33257543,Fingerprints are unique to primates and koalas...,,,,,,
923,36161902,Lorises are a group of globally threatened str...,,,,,,
924,24906475,Structural characterizations of marsupial milk...,,,,,,
925,25197935,A specific galactose binding lectin was shown ...,,,,,,


In [300]:
df.drop(columns="abstract").to_parquet("entities_09.parquet")