# NER

In [32]:
import spacy
from spacy.pipeline import EntityRuler
import pandas as pd

nlp = spacy.load("en_core_web_sm")
nlp.disable_pipe("ner")
ruler = nlp.add_pipe("entity_ruler")

extracted_entities = []

patterns = [
    # ChemicalEntity patterns
    {"label": "ChemicalEntity", "pattern": [{"LOWER": {"REGEX": r".*ate$"}}]},
    {"label": "ChemicalEntity", "pattern": [{"LOWER": {"REGEX": r".*ium$"}}]},
    {"label": "ChemicalEntity", "pattern": [{"LOWER": {"REGEX": r".*ide$"}}]},
    {"label": "ChemicalEntity", "pattern": [{"LOWER": {"REGEX": r".*ine$"}}]},
    {"label": "ChemicalEntity", "pattern": [{"LOWER": {"REGEX": r".*ol$"}}]},
    {"label": "ChemicalEntity", "pattern": [{"LOWER": {"REGEX": r".*one$"}}]},
    {"label": "ChemicalEntity", "pattern": [{"LOWER": {"REGEX": r".*oid$"}}]},
    {"label": "ChemicalEntity", "pattern": [{"LOWER": {"REGEX": r".*acid$"}}]},
    {"label": "ChemicalEntity", "pattern": [{"LOWER": {"REGEX": r".*illin$"}}]},
    {"label": "ChemicalEntity", "pattern": [{"LOWER": {"REGEX": r".*tam$"}}]},
    {"label": "ChemicalEntity", "pattern": [{"LOWER": {"REGEX": r".*cin$"}}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "agonists"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "agonists"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "agonists"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "antagonists"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "antagonists"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "antagonists"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "acid"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "acid"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "acid"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "receptor antagonists"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "receptor antagonists"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "receptor antagonists"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "receptors antagonists"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "receptors antagonists"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "receptors antagonists"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "agent"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "agent"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "agent"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "inhibitors"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "inhibitors"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "inhibitors"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "acetate"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "acetate"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "acetate"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "aspartate"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "aspartate"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "aspartate"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "aminonucleoside"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "aminonucleoside"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "aminonucleoside"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "drug"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "drug"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "drug"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "A"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "A"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "A"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "bromide"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "bromide"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "bromide"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "p-coumarate"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "p-coumarate"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "p-coumarate"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "bradykinin"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "bradykinin"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "bradykinin"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "virgaurea extract"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "virgaurea extract"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "virgaurea extract"}]},

    # GeneOrGeneProduct patterns
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^MT-\\w+$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": r".*sin$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"IN": ["PTEN", "EGFR", "TP53", "CDK4", "KRAS", "HER2", "BRCA1", "BRCA2", "NOTCH1", "FGFR4", "VEGFA", "SMAD", "BRAF", "APC", "TNF", "JAK2", "GATA3", "SOX2", "NFKB", "MYC"]}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(BRCA[0-9]*|CD[0-9]+|BCL[-_]?2|BCL[-_]?xL|MMP[0-9]*|FGF[-_]?\\d*|BMP[-_]?\\d*|IL[-_]?\\d*|TNF[-_]?a|VEGF[-_]?R[0-9]*|CYP[0-9A-Z-]*)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(\\w+[-_]?(kinase|synthase|factor|receptor|protein|complex|channel|transporters?|cytokine|chemokine|phosphatase|transaminase|dehydrogenase|reductase|hydroxylase|oxidase|transferase|synthetase)\\w*)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(\\w+[-_]?\\d+)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(caspase[-_]?\\d*|HIF[-_]?1\\w*|PAR[-_]?[0-9]?)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^NR\\d+[A-Z]?$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(RNA[-_]?polymerase|TF[-_]?\\d+)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(Ig[A-Z]?[kml]?$|cytokine|chemokine)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(\\w+[-_]?(dehydrogenase|reductase|hydroxylase|phosphatase|transporter|oxidase|transferase)\\w*)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(mitochondrial|nuclear|cytosolic|membrane[-_]?bound|membrane[-_]?spanning|L[-_]?type|T[-_]?type|Ca(2+)?|g(amma)?|caspase)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(F(oxp|ox)|G(ata|f)|H(nf|ox)|J(nk|ak)|M(apk|mp)|N(fk|fat|f|myc)|P(53|ac|ar|as|c|r|tn)|R(ar|as)|S(ma|ox)|T(g|nf)|V(egf|hl))[-_]?\\d*\\w*$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^\\w{2,4}[a-z]*\\d+\\w*$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(t(umor)?|p53|bcl[-_]?2|ras|myc|brca|mapk|akt|cdk|nfk|vegf|notch|stat|tgf|tcf|fgf|smad|jak|erk|mmp|il|hif|tnf|cxcr|cd|tcr|ifn|bmp|mtor|pax|wnt|lgr|egfr|her)\\d*[-_]?\\w*$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(receptor|ligand|antigen|inhibitor|enzyme|immunoglobulin|globulin|cytokine|chemokine|peptide|hormone|nucleoprotein|glycoprotein|lipoprotein|protein)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(histone|ubiquitin|acetylase|deacetylase|transferase|ligase|synthetase|kinase|phosphatase|phosphorylase|hydrolase)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(nuclear[-_]?receptor|growth[-_]?factor|cell[-_]?adhesion|adhesion[-_]?molecule|signaling[-_]?pathway|transcription[-_]?factor|co[-_]?activator|co[-_]?repressor)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(serine[-_]?threonine|tyrosine|threonine|serine|cysteine|zinc[-_]?finger|toll[-_]?like|lectin|interleukin[-_]?receptor|g[-_]?protein[-_]?coupled)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(beta|alpha|gamma|delta|epsilon|zeta|kappa|lambda|mu|sigma|tau|xi|omega)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(tuberous[-_]?sclerosis|von[-_]?hippel|lynch|li[-_]?fraumeni|marfan|noonan)$"}}]},

    # OrganismTaxon patterns
    {"label": "OrganismTaxon", "pattern": [{"LOWER": {"IN": ["patient", "patients", "woman", "women", "man", "men", "mouse", "mice", "rat", "rats", "human", "humans", "yeast", "virus", "viruses", "bacteria", "bacterial", "outpatient", "inpatient", "hepatitis", "HIV", "cytomegalovirus", "murine", "Plasmodium falciparum", "Mycobacterium tuberculosis", "Staphylococcus aureus", "S. aureus", "E. coli", "Panax ginseng", "mangosteen"]}}]},

    #DiseaseOrPhenotypicFeature patterns
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"REGEX": "^.*ism$"}}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"REGEX": "^.*esia$"}}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"REGEX": "^.*esias$"}}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"IN": ["genetic", "hereditary", "inherited"]}}, {"POS": "NOUN"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": "autoimmune"}, {"POS": "NOUN"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"IN": ["heart", "cardiovascular", "cardiac"]}}, {"POS": "NOUN"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": "neurological"}, {"POS": "NOUN"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": "oncological"}, {"POS": "NOUN"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"IN": ["gastrointestinal", "gi", "stomach"]}}, {"POS": "NOUN"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"IN": ["respiratory", "lung"]}}, {"POS": "NOUN"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": "endocrine"}, {"POS": "NOUN"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"IS_ALPHA": True, "IS_UPPER": True, "LENGTH": 3}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"REGEX": "^.*(ism|opathy|itis|oma)$"}}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"IN": ["mutation", "inheritance", "genetic disorder"]}}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"IN": ["disease", "condition", "syndrome"]}}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": {"IN": ["NOUN", "PROPN"]}}, {"LOWER": {"IN": ["disease", "condition", "syndrome"]}}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"TEXT": {"REGEX": "^.*osis$"}}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"TEXT": {"REGEX": "^.*itis$"}}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"TEXT": {"REGEX": "^.*oma$"}}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "ADJ"}, {"LOWER": "diseases"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "PROPN"}, {"LOWER": "diseases"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "NOUN"}, {"LOWER": "diseases"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "ADJ"}, {"LOWER": "disease"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "PROPN"}, {"LOWER": "disease"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "NOUN"}, {"LOWER": "disease"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "ADJ"}, {"LOWER": "atherosclerosis"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "PROPN"}, {"LOWER": "atherosclerosis"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "NOUN"}, {"LOWER": "atherosclerosis"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "ADJ"}, {"LOWER": "cancers"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "PROPN"}, {"LOWER": "cancers"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "NOUN"}, {"LOWER": "cancers"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "ADJ"}, {"LOWER": "cancer"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "PROPN"}, {"LOWER": "cancer"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "NOUN"}, {"LOWER": "cancer"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "ADJ"}, {"LOWER": "syndrome"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "PROPN"}, {"LOWER": "syndrome"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "NOUN"}, {"LOWER": "syndrome"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "ADJ"}, {"LOWER": "impairment"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "PROPN"}, {"LOWER": "impairment"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "NOUN"}, {"LOWER": "impairment"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "ADJ"}, {"LOWER": "tumours"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "PROPN"}, {"LOWER": "tumours"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "NOUN"}, {"LOWER": "tumours"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "ADJ"}, {"LOWER": "tumour"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "PROPN"}, {"LOWER": "tumour"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "NOUN"}, {"LOWER": "tumour"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "ADJ"}, {"LOWER": "lymphoma"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "PROPN"}, {"LOWER": "lymphoma"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "NOUN"}, {"LOWER": "lymphoma"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"REGEX": "^.*ism$"}}, {"LOWER": {"NOT_IN": ["mechanism", "manner"]}}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": "cardiac"}, {"LOWER": {"IN": ["function", "disease", "dilation"]}}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"IN": ["neurological", "oncological", "autoimmune", "endocrine", "gastrointestinal", "respiratory"]}}, {"POS": "NOUN"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"IN": ["genetic", "hereditary", "inherited"]}}, {"POS": "NOUN"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"REGEX": "^.*(opathy|oma|osis)$"}}]},

    #SequenceVariant patterns
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "rs\\d+"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "[A-Z]\\d+[A-Z]"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "p\\.[A-Z][a-z]{2}\\d+[A-Z]"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "\\d+[A-Z]>[A-Z]"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "c\\.\\d+[A-Z]>[A-Z]"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "\\d+del[A-Z]+"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "\\d+ins[A-Z]+"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "IVS\\d+\\+\\d+[A-Z]>[A-Z]"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "[A-Z][a-z]{2}\\d+[a-z]{2}[A-Z]\\d+"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "\\d+[A-Z][a-z]{2}"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "[A-Z]+ repeat"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "[a-zA-Z]+ at nucleotide \\d+"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "\\w+ by \\w+ at position \\d+"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "deletion of \\w+"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "\\-\\d+ \\w+/\\w+"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "\\w+\\s?\\-\\d+[A-Z]>[A-Z]"}}]},

    #CellLine pattern
    {"label": "CellLine", "pattern": [{"TEXT": {"REGEX": "\\d+"}}, {"LOWER": "cells"}]},
    {"label": "CellLine", "pattern": [{"POS": "ADJ"}, {"LOWER": "B"}]},
    {"label": "CellLine", "pattern": [{"POS": "PROPN"}, {"LOWER": "B"}]},
    {"label": "CellLine", "pattern": [{"POS": "NOUN"}, {"LOWER": "B"}]},
    {"label": "CellLine", "pattern": [{"POS": {"REGEX": "^.+$"}}, {"LOWER": " B"}]},
    {"label": "CellLine", "pattern": [{"LOWER": "traf2dn-tg b"}]},
    {"label": "CellLine", "pattern": [{"LOWER": "u87mg"}]},
    {"label": "CellLine", "pattern": [{"LOWER": "mcardle-rh7777"}]},
    {"label": "CellLine", "pattern": [{"LOWER": "HT22"}]},
    {"label": "CellLine", "pattern": [{"LOWER": "llc-pk1"}]},
    {"label": "CellLine", "pattern": [{"LOWER": "cos-7"}]},
    {"label": "CellLine", "pattern": [{"LOWER": "sw48"}]},
    {"label": "CellLine", "pattern": [{"LOWER": "ht29"}]},
    {"label": "CellLine", "pattern": [{"LOWER": "t98g"}]},
    {"label": "CellLine", "pattern": [{"LOWER": "sh-sy5y"}]},
    {"label": "CellLine", "pattern": [{"LOWER": "sw480"}]}
]

ruler.add_patterns(patterns)

def process_entities(doc):
    entities = [
        (ent.text.split()[0], 'GeneOrGeneProduct') if ent.label_ == 'GeneOrGeneProduct'
        else (ent.text.split()[0], 'CellLine') if ent.label_ == 'CellLine'
        else (ent.text, ent.label_)
        for ent in doc.ents
        if '@' not in ent.text
    ]
    return entities

df = pd.read_csv('/Users/bielcave/Documents/BSC/Language_Models/Rule_Based/dev.csv', header=None)

# Optionally, you can specify column names to better understand the structure
column_names = [
    'ID', 'Type1', 'Type2', 'GeneticChange1', 'GeneticChange2',
    'IsPathogenic', 'Number', 'CaseDescription', 'Extra1', 'Extra2'
]
df.columns = column_names

# Iterate over each row in the DataFrame
extracted_entities = []
for _, row in df.iterrows():
    # Get the text from the CaseDescription column
    text = row['CaseDescription']
    
    # Process the text with your NLP model
    doc = nlp(text)
    
    # Process entities
    modified_entities = process_entities(doc)
    
    # Extract and store the entities
    for entity_text, entity_label in modified_entities:
        extracted_entities.append(entity_text)
        # Uncomment the line below if you want to print each entity and its label
        # print(f"Entity: {entity_text}, Label: {entity_label}")



In [29]:
print(set(extracted_entities))

{'colon cancer', 'nephrotic syndrome', 'selective antagonists', 'cardiovascular diseases', 'Otilonium bromide', 'endogenous inhibitors', 'Cardiovascular diseases', 'hypothyroidism identified', 'novel disease', 'cardiovascular risk', 'genetic variations', 'polymorphism (', 'lung histology', 'cardiac marker', 'respiratory complex', 'EGFR inhibitors', 'receptor antagonists', 'synthase inhibitors', 'dihydroxyphenylacetic acid', 'breakage syndrome', 'cardiac function', 'COX-2 inhibitors', 'rectal cancers', 'Arachidonic acid', 'cardiac procedures', 'lung mechanics', 'brain tumour', 'broad cancer', 'rectal cancer', 'cardiac tissues', 'recessive disease', 'glomerular diseases', 'selective agonists', 'inherited diseases', 'otilonium bromide', 'hypercortisolism ,', 'hearing impairment', 'genetic factors', 'Woude syndrome', 'mechanism through', 'genetic risk', 'Willebrand disease', 'H(3)-receptor antagonists', 'cardiac myocytes', 'genetic counseling', 'retinoic acid', 'chemotherapeutic agent', 'm

## Gold Standard

In [33]:
gold_standard = []

with open('/Users/bielcave/Documents/BSC/Language_Models/Rule_Based/BioRED/Dev.PubTator', 'r', encoding='utf-8') as file:
    for line in file:
        columns = line.strip().split('\t')
        if len(columns) == 6:
            gold_standard.append(columns[3])
        #if len(columns) == 6 and columns[4] == 'ChemicalEntity':
        #    print(columns[3])
            
#print(set(gold_standard))

## NER Evaluation

In [34]:
def evaluate_ner(gold_standard_entities, extracted_entities):
    true_positives = len(set(gold_standard_entities) & set(extracted_entities))
    false_positives = len(extracted_entities) - true_positives
    false_negatives = len(gold_standard_entities) - true_positives
    
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    f1_score = 2 * (precision * recall) / (precision + recall)
    
    return precision, recall, f1_score

gold_standard = set(gold_standard)
extracted_entities = set(extracted_entities)

precision, recall, f1_score = evaluate_ner(gold_standard, extracted_entities)
print(f"Precision: {precision}, Recall: {recall}, F1 Score: {f1_score}")

Precision: 0.41995692749461594, Recall: 0.40540540540540543, F1 Score: 0.4125528913963329


ChemicalEntity --> 0.409

GeneOrGeneProduct --> 0.346

OrganismTaxon --> 0.785

DiseaseOrPhenotypicFeature --> 0.272

SequenceVariant --> 0.398

CellLine --> 0.608

Overall --> 0.412