# Extraction Entities

In [4]:
import spacy
from spacy.pipeline import EntityRuler
import pandas as pd

nlp = spacy.load("en_core_web_sm")
nlp.disable_pipe("ner")
ruler = nlp.add_pipe("entity_ruler")

extracted_entities = []

patterns = [
    # ChemicalEntity patterns
    {"label": "ChemicalEntity", "pattern": [{"LOWER": {"REGEX": r".*ate$"}}]},
    {"label": "ChemicalEntity", "pattern": [{"LOWER": {"REGEX": r".*ium$"}}]},
    {"label": "ChemicalEntity", "pattern": [{"LOWER": {"REGEX": r".*ide$"}}]},
    {"label": "ChemicalEntity", "pattern": [{"LOWER": {"REGEX": r".*ine$"}}]},
    {"label": "ChemicalEntity", "pattern": [{"LOWER": {"REGEX": r".*ol$"}}]},
    {"label": "ChemicalEntity", "pattern": [{"LOWER": {"REGEX": r".*one$"}}]},
    {"label": "ChemicalEntity", "pattern": [{"LOWER": {"REGEX": r".*oid$"}}]},
    {"label": "ChemicalEntity", "pattern": [{"LOWER": {"REGEX": r".*acid$"}}]},
    {"label": "ChemicalEntity", "pattern": [{"LOWER": {"REGEX": r".*illin$"}}]},
    {"label": "ChemicalEntity", "pattern": [{"LOWER": {"REGEX": r".*tam$"}}]},
    {"label": "ChemicalEntity", "pattern": [{"LOWER": {"REGEX": r".*cin$"}}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "agonists"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "agonists"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "agonists"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "antagonists"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "antagonists"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "antagonists"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "acid"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "acid"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "acid"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "receptor antagonists"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "receptor antagonists"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "receptor antagonists"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "receptors antagonists"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "receptors antagonists"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "receptors antagonists"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "agent"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "agent"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "agent"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "inhibitors"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "inhibitors"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "inhibitors"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "acetate"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "acetate"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "acetate"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "aspartate"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "aspartate"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "aspartate"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "aminonucleoside"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "aminonucleoside"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "aminonucleoside"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "drug"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "drug"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "drug"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "A"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "A"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "A"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "bromide"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "bromide"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "bromide"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "p-coumarate"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "p-coumarate"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "p-coumarate"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "bradykinin"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "bradykinin"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "bradykinin"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "ADJ"}, {"LOWER": "virgaurea extract"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "PROPN"}, {"LOWER": "virgaurea extract"}]},
    {"label": "ChemicalEntity", "pattern": [{"POS": "NOUN"}, {"LOWER": "virgaurea extract"}]},

    # GeneOrGeneProduct patterns
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^MT-\\w+$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": r".*sin$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"IN": ["PTEN", "EGFR", "TP53", "CDK4", "KRAS", "HER2", "BRCA1", "BRCA2", "NOTCH1", "FGFR4", "VEGFA", "SMAD", "BRAF", "APC", "TNF", "JAK2", "GATA3", "SOX2", "NFKB", "MYC"]}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(BRCA[0-9]*|CD[0-9]+|BCL[-_]?2|BCL[-_]?xL|MMP[0-9]*|FGF[-_]?\\d*|BMP[-_]?\\d*|IL[-_]?\\d*|TNF[-_]?a|VEGF[-_]?R[0-9]*|CYP[0-9A-Z-]*)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(\\w+[-_]?(kinase|synthase|factor|receptor|protein|complex|channel|transporters?|cytokine|chemokine|phosphatase|transaminase|dehydrogenase|reductase|hydroxylase|oxidase|transferase|synthetase)\\w*)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(\\w+[-_]?\\d+)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(caspase[-_]?\\d*|HIF[-_]?1\\w*|PAR[-_]?[0-9]?)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^NR\\d+[A-Z]?$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(RNA[-_]?polymerase|TF[-_]?\\d+)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(Ig[A-Z]?[kml]?$|cytokine|chemokine)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(\\w+[-_]?(dehydrogenase|reductase|hydroxylase|phosphatase|transporter|oxidase|transferase)\\w*)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(mitochondrial|nuclear|cytosolic|membrane[-_]?bound|membrane[-_]?spanning|L[-_]?type|T[-_]?type|Ca(2+)?|g(amma)?|caspase)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(F(oxp|ox)|G(ata|f)|H(nf|ox)|J(nk|ak)|M(apk|mp)|N(fk|fat|f|myc)|P(53|ac|ar|as|c|r|tn)|R(ar|as)|S(ma|ox)|T(g|nf)|V(egf|hl))[-_]?\\d*\\w*$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^\\w{2,4}[a-z]*\\d+\\w*$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(t(umor)?|p53|bcl[-_]?2|ras|myc|brca|mapk|akt|cdk|nfk|vegf|notch|stat|tgf|tcf|fgf|smad|jak|erk|mmp|il|hif|tnf|cxcr|cd|tcr|ifn|bmp|mtor|pax|wnt|lgr|egfr|her)\\d*[-_]?\\w*$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(receptor|ligand|antigen|inhibitor|enzyme|immunoglobulin|globulin|cytokine|chemokine|peptide|hormone|nucleoprotein|glycoprotein|lipoprotein|protein)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(histone|ubiquitin|acetylase|deacetylase|transferase|ligase|synthetase|kinase|phosphatase|phosphorylase|hydrolase)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(nuclear[-_]?receptor|growth[-_]?factor|cell[-_]?adhesion|adhesion[-_]?molecule|signaling[-_]?pathway|transcription[-_]?factor|co[-_]?activator|co[-_]?repressor)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(serine[-_]?threonine|tyrosine|threonine|serine|cysteine|zinc[-_]?finger|toll[-_]?like|lectin|interleukin[-_]?receptor|g[-_]?protein[-_]?coupled)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(beta|alpha|gamma|delta|epsilon|zeta|kappa|lambda|mu|sigma|tau|xi|omega)$"}}]},
    {"label": "GeneOrGeneProduct", "pattern": [{"LOWER": {"REGEX": "^(tuberous[-_]?sclerosis|von[-_]?hippel|lynch|li[-_]?fraumeni|marfan|noonan)$"}}]},

    # OrganismTaxon patterns
    {"label": "OrganismTaxon", "pattern": [{"LOWER": {"IN": ["patient", "patients", "woman", "women", "man", "men", "mouse", "mice", "rat", "rats", "human", "humans", "yeast", "virus", "viruses", "bacteria", "bacterial", "outpatient", "inpatient", "hepatitis", "HIV", "cytomegalovirus", "murine", "Plasmodium falciparum", "Mycobacterium tuberculosis", "Staphylococcus aureus", "S. aureus", "E. coli", "Panax ginseng", "mangosteen"]}}]},

    #DiseaseOrPhenotypicFeature patterns
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"REGEX": "^.*ism$"}}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"REGEX": "^.*esia$"}}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"REGEX": "^.*esias$"}}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"IN": ["genetic", "hereditary", "inherited"]}}, {"POS": "NOUN"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": "autoimmune"}, {"POS": "NOUN"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"IN": ["heart", "cardiovascular", "cardiac"]}}, {"POS": "NOUN"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": "neurological"}, {"POS": "NOUN"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": "oncological"}, {"POS": "NOUN"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"IN": ["gastrointestinal", "gi", "stomach"]}}, {"POS": "NOUN"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"IN": ["respiratory", "lung"]}}, {"POS": "NOUN"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": "endocrine"}, {"POS": "NOUN"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"IS_ALPHA": True, "IS_UPPER": True, "LENGTH": 3}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"REGEX": "^.*(ism|opathy|itis|oma)$"}}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"IN": ["mutation", "inheritance", "genetic disorder"]}}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"IN": ["disease", "condition", "syndrome"]}}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": {"IN": ["NOUN", "PROPN"]}}, {"LOWER": {"IN": ["disease", "condition", "syndrome"]}}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"TEXT": {"REGEX": "^.*osis$"}}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"TEXT": {"REGEX": "^.*itis$"}}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"TEXT": {"REGEX": "^.*oma$"}}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "ADJ"}, {"LOWER": "diseases"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "PROPN"}, {"LOWER": "diseases"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "NOUN"}, {"LOWER": "diseases"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "ADJ"}, {"LOWER": "disease"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "PROPN"}, {"LOWER": "disease"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "NOUN"}, {"LOWER": "disease"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "ADJ"}, {"LOWER": "atherosclerosis"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "PROPN"}, {"LOWER": "atherosclerosis"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "NOUN"}, {"LOWER": "atherosclerosis"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "ADJ"}, {"LOWER": "cancers"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "PROPN"}, {"LOWER": "cancers"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "NOUN"}, {"LOWER": "cancers"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "ADJ"}, {"LOWER": "cancer"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "PROPN"}, {"LOWER": "cancer"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "NOUN"}, {"LOWER": "cancer"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "ADJ"}, {"LOWER": "syndrome"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "PROPN"}, {"LOWER": "syndrome"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "NOUN"}, {"LOWER": "syndrome"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "ADJ"}, {"LOWER": "impairment"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "PROPN"}, {"LOWER": "impairment"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "NOUN"}, {"LOWER": "impairment"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "ADJ"}, {"LOWER": "tumours"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "PROPN"}, {"LOWER": "tumours"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "NOUN"}, {"LOWER": "tumours"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "ADJ"}, {"LOWER": "tumour"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "PROPN"}, {"LOWER": "tumour"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "NOUN"}, {"LOWER": "tumour"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "ADJ"}, {"LOWER": "lymphoma"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "PROPN"}, {"LOWER": "lymphoma"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"POS": "NOUN"}, {"LOWER": "lymphoma"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"REGEX": "^.*ism$"}}, {"LOWER": {"NOT_IN": ["mechanism", "manner"]}}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": "cardiac"}, {"LOWER": {"IN": ["function", "disease", "dilation"]}}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"IN": ["neurological", "oncological", "autoimmune", "endocrine", "gastrointestinal", "respiratory"]}}, {"POS": "NOUN"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"IN": ["genetic", "hereditary", "inherited"]}}, {"POS": "NOUN"}]},
    {"label": "DiseaseOrPhenotypicFeature", "pattern": [{"LOWER": {"REGEX": "^.*(opathy|oma|osis)$"}}]},

    #SequenceVariant patterns
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "rs\\d+"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "[A-Z]\\d+[A-Z]"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "p\\.[A-Z][a-z]{2}\\d+[A-Z]"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "\\d+[A-Z]>[A-Z]"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "c\\.\\d+[A-Z]>[A-Z]"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "\\d+del[A-Z]+"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "\\d+ins[A-Z]+"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "IVS\\d+\\+\\d+[A-Z]>[A-Z]"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "[A-Z][a-z]{2}\\d+[a-z]{2}[A-Z]\\d+"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "\\d+[A-Z][a-z]{2}"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "[A-Z]+ repeat"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "[a-zA-Z]+ at nucleotide \\d+"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "\\w+ by \\w+ at position \\d+"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "deletion of \\w+"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "\\-\\d+ \\w+/\\w+"}}]},
    {"label": "SequenceVariant", "pattern": [{"TEXT": {"REGEX": "\\w+\\s?\\-\\d+[A-Z]>[A-Z]"}}]},

    #CellLine pattern
    {"label": "CellLine", "pattern": [{"TEXT": {"REGEX": "\\d+"}}, {"LOWER": "cells"}]},
    {"label": "CellLine", "pattern": [{"POS": "ADJ"}, {"LOWER": "B"}]},
    {"label": "CellLine", "pattern": [{"POS": "PROPN"}, {"LOWER": "B"}]},
    {"label": "CellLine", "pattern": [{"POS": "NOUN"}, {"LOWER": "B"}]},
    {"label": "CellLine", "pattern": [{"POS": {"REGEX": "^.+$"}}, {"LOWER": " B"}]},
    {"label": "CellLine", "pattern": [{"LOWER": "traf2dn-tg b"}]},
    {"label": "CellLine", "pattern": [{"LOWER": "u87mg"}]},
    {"label": "CellLine", "pattern": [{"LOWER": "mcardle-rh7777"}]},
    {"label": "CellLine", "pattern": [{"LOWER": "HT22"}]},
    {"label": "CellLine", "pattern": [{"LOWER": "llc-pk1"}]},
    {"label": "CellLine", "pattern": [{"LOWER": "cos-7"}]},
    {"label": "CellLine", "pattern": [{"LOWER": "sw48"}]},
    {"label": "CellLine", "pattern": [{"LOWER": "ht29"}]},
    {"label": "CellLine", "pattern": [{"LOWER": "t98g"}]},
    {"label": "CellLine", "pattern": [{"LOWER": "sh-sy5y"}]},
    {"label": "CellLine", "pattern": [{"LOWER": "sw480"}]}
]

ruler.add_patterns(patterns)

def process_entities(doc):
    entities = [
        (ent.text.split()[0], 'GeneOrGeneProduct') if ent.label_ == 'GeneOrGeneProduct'
        else (ent.text.split()[0], 'CellLine') if ent.label_ == 'CellLine'
        else (ent.text, ent.label_)
        for ent in doc.ents
        if '@' not in ent.text
    ]
    return entities

df = pd.read_csv('/Users/bielcave/Documents/BSC/Language_Models/Rule_Based/train_dev.csv', header=None)

# Optionally, you can specify column names to better understand the structure
column_names = [
    'ID', 'Type1', 'Type2', 'GeneticChange1', 'GeneticChange2',
    'IsPathogenic', 'Number', 'CaseDescription', 'Extra1', 'Extra2'
]
df.columns = column_names

# Iterate over each row in the DataFrame
extracted_entities = []
for _, row in df.iterrows():
    # Get the text from the CaseDescription column
    text = row['CaseDescription']
    
    # Process the text with your NLP model
    doc = nlp(text)
    
    # Process entities
    modified_entities = process_entities(doc)
    
    # Extract and store the entities
    for entity_text, entity_label in modified_entities:
        extracted_entities.append((entity_text, entity_label))

output_file_path = '/Users/bielcave/Documents/BSC/Language_Models/Rule_Based/train_dataset/rb_train_dev.tsv'
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for entity_text, entity_label in extracted_entities:
        output_file.write(f"{entity_text}\t{entity_label}\n")



# Input Dataset + Rule Based

In [13]:
import pandas as pd

def process_tsv_files(file1_path, file2_path, output_file_path):
    # Read the TSV files into dataframes
    df1 = pd.read_csv(file1_path, sep='\t', header=None, names=['Key', 'Value'])
    df2 = pd.read_csv(file2_path, sep='\t', header=None, names=['Key', 'Value'])
    
    # List of entities to exclude
    exclude_entities = {"the", "this", "then", "that", "and", "to", "them", "are"}
    
    # Create a dictionary from the first dataframe, excluding unwanted entities
    df1_dict = pd.Series(df1.Value.values, index=df1.Key).to_dict()
    df1_dict = {
        k: v for k, v in df1_dict.items() 
        if str(k).lower() not in exclude_entities or str(k).strip() == ""
    }
    
    # List to hold unmatched rows
    unmatched_rows = []
    
    # Iterate through the second dataframe and update values based on the first dataframe
    for index, row in df2.iterrows():
        if row['Key'] in df1_dict:
            df2.at[index, 'Value'] = df1_dict[row['Key']]
        else:
            unmatched_rows.append(row)
    
    # Append unmatched rows to the end of the dataframe
    unmatched_df = pd.DataFrame(unmatched_rows)
    df2 = pd.concat([df2, unmatched_df]).reset_index(drop=True)
    
    # Save the updated dataframe to a new TSV file
    df2.to_csv(output_file_path, sep='\t', index=False, header=False)

# Example usage
file1_path = '/Users/bielcave/Documents/BSC/Language_Models/Rule_Based/train_dataset/rb_train_dev.tsv'
file2_path = '/Users/bielcave/Documents/BSC/Language_Models/Rule_Based/train_dataset/train_dev.tsv'
output_file_path = '/Users/bielcave/Documents/BSC/Language_Models/Rule_Based/train_dataset/train_dev_complete.tsv'

process_tsv_files(file1_path, file2_path, output_file_path)


# NER to RE

In [None]:
# Agafar el test_predictions.txt i per cada frase del test standard (potser sha de separar en frases si no esta fet ja) mirar si hi ha dos entities
# Si estan, fer una frase i posarles al test de RE, han d'estar marcades d'alguna manera.
# El final sera executar el RE amb el test les frases, poden haver-hi frases repetides, amb diferents entities, pero no les mateixes frases
#Amb les mateixes entities marcades, en resum no pot haver-hi dues frases identiques (amb tot el que comporta)

### Abstract to sentences for RE

In [1]:
import pandas as pd
import re

# Read the TSV file
input_file = '/Users/bielcave/Documents/BSC/Language_Models/Final_Dataset/test_modified.tsv'
df = pd.read_csv(input_file, sep='\t', header=None)

# Remove the second column
df = df.drop(columns=[1])

# Separate the input into sentences using regex
df[0] = df[0].apply(lambda x: re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', x))

# Explode the list of sentences into separate rows
df = df.explode(0).reset_index(drop=True)

# Save the processed data to a new TSV file
output_file = '/Users/bielcave/Documents/BSC/Language_Models/Rule_Based/benchmark_files/sentences_test_re.tsv'
df.to_csv(output_file, sep='\t', header=False, index=False)

print(f'Processed data has been saved to {output_file}')


Processed data has been saved to /Users/bielcave/Documents/BSC/Language_Models/Rule_Based/benchmark_files/sentences_test_re.tsv


### Prepare input test for RE

In [1]:
import pandas as pd
from itertools import combinations as comb

# Load File 1 (txt file)
file1 = '/Users/bielcave/Documents/BSC/Language_Models/Rule_Based/benchmark_files/test_predictions.txt'
with open(file1, 'r') as f:
    lines = f.readlines()

# Parse File 1 into a DataFrame
data = [line.strip().split() for line in lines]
df1 = pd.DataFrame(data, columns=['word', 'tag'])

# Load File 2 (tsv file)
file2 = '/Users/bielcave/Documents/BSC/Language_Models/Rule_Based/benchmark_files/sentences_test_re.tsv'
df2 = pd.read_csv(file2, sep='\t', header=None, names=['sentence'])

# Function to tag words in a sentence
def tag_words(sentence, df1):
    words = sentence.split()
    tagged_words = []
    index = 0
    for word in words:
        clean_word = word.rstrip('.,')
        while index < len(df1) and df1.iloc[index]['word'] != clean_word:
            index += 1
        if index < len(df1):
            tagged_words.append((word, df1.iloc[index]['tag']))
            index += 1
        else:
            tagged_words.append((word, 'O'))
            print(f"Warning: Word '{word}' not found in prediction file.")
    return tagged_words

# Function to generate combinations and format the output
def generate_combinations(sentence, tagged_words):
    bio_entities = [word for word, tag in tagged_words if tag != 'O']
    output = []
    for combo in comb(bio_entities, 2):
        left_entity, right_entity = combo
        formatted_sentence = sentence.replace(left_entity, f'<< {left_entity} >>', 1).replace(right_entity, f'[[ {right_entity} ]]', 1)
        output.append(formatted_sentence)
    return output

# Process each sentence in File 2
output_data = []
index = 0
for sentence in df2['sentence']:
    tagged_words = tag_words(sentence, df1)
    formatted_combinations = generate_combinations(sentence, tagged_words)
    for formatted_sentence in formatted_combinations:
        output_data.append([index, formatted_sentence, 'O'])
        index += 1

# Create DataFrame for output
output_df = pd.DataFrame(output_data, columns=['index', 'sentence', 'label'])

# Save the output to a new TSV file
output_file = '/Users/bielcave/Documents/BSC/Language_Models/Rule_Based/benchmark_files/test_re.tsv'
output_df.to_csv(output_file, sep='\t', index=False)

print(f'Processed data has been saved to {output_file}')


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/bielcave/opt/anaconda3/envs/LM/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3548, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/7g/_d25g3_n1j14gkxn1yvg7dr00000gn/T/ipykernel_16543/209958648.py", line 48, in <module>
    tagged_words = tag_words(sentence, df1)
                   ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/7g/_d25g3_n1j14gkxn1yvg7dr00000gn/T/ipykernel_16543/209958648.py", line 24, in tag_words
    while index < len(df1) and df1.iloc[index]['word'] != clean_word:
                               ~~~~~~~~^^^^^^^
  File "/Users/bielcave/opt/anaconda3/envs/LM/lib/python3.11/site-packages/pandas/core/indexing.py", line 1153, in __getitem__
    return self._getitem_axis(maybe_callable, axis=axis)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bielcave/opt/anaconda3/envs/LM/lib/python3.11/site-packages/pandas/core/indexing.py", line 1716, i