In [21]:
from pymongo import MongoClient

# Open a client for the MongoDB server on localhost and take a handle to the
# "MedGraph" database, which later cells read their collections from.
client = MongoClient(host="mongodb://localhost:27017/")
database = client.get_database("MedGraph")

# A paper's PubMed page is https://pubmed.ncbi.nlm.nih.gov/ followed by its paper_id.

In [None]:
import requests

# Thin HTTP client for the BERN2 online deployment (a JSON POST, not HTML scraping)
def query_plain(text, url="http://bern2.korea.ac.kr/plain", timeout=30):
    """POST ``text`` to a BERN2 ``/plain`` endpoint and return the decoded JSON reply.

    Args:
        text: Plain text to send; it is wrapped as ``{"text": text}`` in the
            JSON request body.
        url: Endpoint to POST to. Defaults to the public BERN2 deployment.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout unless one is passed, so without it a stalled
            server would block the calling cell indefinitely.

    Returns:
        The response body parsed as JSON.

    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx status.
        requests.exceptions.Timeout: No response arrived within ``timeout`` seconds.
    """
    response = requests.post(url, json={'text': text}, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to decode an error page as JSON.
    response.raise_for_status()
    return response.json()

# Manual smoke test: send one hard-coded sample abstract to the default
# endpoint of query_plain and print whatever JSON comes back.
if __name__ == '__main__':
    text = "BACKGROUND\nLactoferricin (LFcin) is a strong cationic peptide released from the N-terminus of lactoferrin by gastric pepsin digestion. LFcin has some important properties, including high antimicrobial activity. To date, lactoferricins have been isolated and characterised from various animal species, but not from camel. The aim of this study was to characterise and express recombinant camel lactoferricin (LFcinC) in Pichia pastoris and investigate its antimicrobial activity.\n\n\nRESULTS\nAfter methanol induction, LFcinC was expressed and secreted into a culture broth medium and the results determined by concentrated supernatant culture medium showed high antimicrobial activity against the following microorganisms: Escherichia coli PTCC 1330 (ATCC 8739), Staphylococcus aureus PTCC 1112 (ATCC 6538), Pseudomonas aeruginosa PTCC 1074 (ATCC 9027), Bacillus subtilis PTCC 1023 (ATCC 6633), and Candida albicans PTCC 5027 (ATCC 10231). Thermal stability was clarified with antibacterial activity against Escherichia coli PTCC 1330 (ATCC 8739).\n\n\nCONCLUSION\nResults confirmed that camel lactoferricin had suitable antimicrobial activity and its production by Pichia pastoris can be used for recombinant production."
    print(query_plain(text))

In [36]:
# GLiNER custom build (https://github.com/urchade/GLiNER)

from gliner import GLiNER
import os, warnings

# Suppress three specific warnings (matched by category/module or by message).
_QUIET_WARNINGS = (
    dict(category=FutureWarning, module='huggingface_hub.file_download'),
    dict(message="Sentence of length"),
    dict(category=UserWarning, module='transformers.convert_slow_tokenizer'),
)
for _quiet in _QUIET_WARNINGS:
    warnings.filterwarnings("ignore", **_quiet)

# MongoDB collection whose documents are annotated below.
collection = database['Dataset2000Entries']


# Pretrained GLiNER checkpoint, loaded with max_len set to 32768.
model = GLiNER.from_pretrained(
    "urchade/gliner_large_bio-v0.1",
    max_len=32768,
)

# Entity types handed to model.predict_entities for every abstract.
# Order is kept exactly as originally listed.
labels = [
    "antibiotic", "clinical_event", "diagnostic_procedure", "drug",
    "disease_disorder", "drug_dosage", "treatment_duration", "drug_strength",
    "drug_frequency", "severity", "lab_value", "sign_symptom",
    "gene", "species", "tissue", "treatment",
    "age", "sex", "height", "weight",
    "date", "protein", "cell", "plant",
    "volume", "subject", "body_part", "anatomical_system",
    "chemical_system", "enzyme", "biological_process", "molecular_function",
    "mutation", "immunologic_marker", "organism", "compound",
    "virus", "bacteria", "parasite", "clinical_trial",
    "surgical_procedure", "molecule", "protein_family", "environmental_factor",
    "diet", "symptom_complex", "study_design", "fungus",
    "cell_type", "cell_line", "device", "side_effect",
    "condition_outcome", "clinical_guideline", "medical_abbreviation", "medical_term",
    "peptide", "activity",
]

def _tally_entities(entities):
    """Group predictions by (text, label) and return {key: (count, max_score)}.

    Each prediction is read through its "text", "label" and "score" keys.
    Every score is converted with float() so the stored maximum has the same
    type no matter which occurrence produced it.
    """
    tally = {}
    for entity in entities:
        key = (entity["text"], entity["label"])
        score = float(entity["score"])
        # For an unseen key start from count 0 with the current score as best.
        count, best = tally.get(key, (0, score))
        tally[key] = (count + 1, max(best, score))
    return tally


# One report file per document is written into ./documents.
os.makedirs("documents", exist_ok=True)
documents = collection.find()

for doc in documents:
    # A document with a missing or empty abstract must not abort the whole
    # (long-running) batch, so report it and move on.
    text = doc.get('abstract')
    if not text:
        print(f"Documento {doc['_id']} saltato: abstract mancante.")
        continue

    entities = model.predict_entities(text, labels)
    entities_dict = _tally_entities(entities)

    # One line per distinct (text, label) pair, in first-seen order.
    lines = [
        f"{ent_text} => {ent_label} | max_score: {max_score} | count: {count}"
        for (ent_text, ent_label), (count, max_score) in entities_dict.items()
    ]
    content = "\n".join(lines)

    # The report file is named after the document's _id.
    file_path = os.path.join("documents", f"{doc['_id']}.txt")
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(content)

    print(f"Documento {doc['_id']} salvato con successo!")

Documento 25655077 salvato con successo!




Documento 25993982 salvato con successo!
Documento 26139113 salvato con successo!




Documento 26228051 salvato con successo!




Documento 26350435 salvato con successo!
Documento 26369812 salvato con successo!
Documento 26404497 salvato con successo!




Documento 26442948 salvato con successo!
Documento 26456194 salvato con successo!
Documento 26484384 salvato con successo!
Documento 26493221 salvato con successo!




Documento 26494147 salvato con successo!
Documento 26499211 salvato con successo!
Documento 26525791 salvato con successo!
Documento 26539799 salvato con successo!




Documento 26547698 salvato con successo!
Documento 26584757 salvato con successo!




Documento 26592804 salvato con successo!
Documento 26608112 salvato con successo!




Documento 26626629 salvato con successo!
Documento 26648572 salvato con successo!
Documento 26656137 salvato con successo!
Documento 26687241 salvato con successo!


KeyboardInterrupt: 

In [30]:
# Tally the predictions currently held in `entities`: for each distinct
# (text, label) pair keep (occurrence count, highest score seen).
entities_dict = {}
for entity in entities:
    pair = (entity["text"], entity["label"])
    if pair in entities_dict:
        seen, best = entities_dict[pair]
        # max() keeps the stored score unless the new one is strictly greater.
        entities_dict[pair] = (seen + 1, max(best, entity["score"]))
    else:
        entities_dict[pair] = (1, float(entity["score"]))

# Print one summary line per distinct pair.
for key, value in entities_dict.items():
    occurrences, top_score = value
    print(key[0], "=>", key[1], " | max_score: ", top_score, " | count: ", occurrences)


intratracheal transplantation => surgical_procedure  | max_score:  0.5052153468132019  | count:  1
MSCs => cell_line  | max_score:  0.787893533706665  | count:  8
Escherichia (E) coli => bacteria  | max_score:  0.7500589489936829  | count:  1
inflammation => biological_process  | max_score:  0.5499964952468872  | count:  1
bacterial clearance => biological_process  | max_score:  0.6395661234855652  | count:  2
mice => species  | max_score:  0.6265113949775696  | count:  2
E. coli => bacteria  | max_score:  0.9057008028030396  | count:  5
fibroblasts => cell_line  | max_score:  0.833831787109375  | count:  3
toll- like receptors (TLR) => protein  | max_score:  0.5873016715049744  | count:  1
TLR- 4 => protein  | max_score:  0.8786500096321106  | count:  5
β- defensin 2 => protein  | max_score:  0.5340133905410767  | count:  1
BD2 => protein  | max_score:  0.7794433832168579  | count:  5
in vitro => environmental_factor  | max_score:  0.710089385509491  | count:  2
TLR- 2 => protein  | m