In [32]:
import spacy
from spacy import displacy
import diseaseprocessor
import chemicalprocessor
import class_entities

In [33]:
colors = {"DISEASE":"linear-gradient(90deg, #aa9cfc, #fc9ce7)",
          "CHEMICAL":"linear-gradient(90deg, #43C6AC, ##F8FFAE)"}

In [34]:
disease_service = diseaseprocessor.DiseaseProcessor('./models/Disease')
chemical_service = chemicalprocessor.ChemicalProcessor('./models/Chemical')

In [35]:
with open('prueba_comb.txt','r') as f:
    sequence = f.read()

In [36]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(sequence)

In [37]:
def paragraphs(document):
    start = 0
    for token in document:
        if token.is_space and token.text.count("\n") > 1:
            yield document[start:token.i]
            start = token.i
    yield document[start:]

In [49]:
def remove_non_entities(entities):
    for ent in entities.ents:
        if ent['entity_group'] == '0':
            entities.ents.remove(ent)

def correct_boundaries(entities):
    last_ent = {'entity_group': '0', 'score': 1, 'word': '', 'start': 0, 'end': 0}
    for ent in entities.ents:
        if ent['entity_group'] == last_ent['entity_group']:
            if ent['start'] == last_ent['end']:
                ent['start'] = last_ent['start']
                if ent['word'].startswith('##'):
                    ent['word'] = ent['word'].replace('##', '')
                ent['word'] = last_ent['word'] + ent['word']
                # print('NEW BOUNDARIES:')
                print(ent)
                entities.ents.remove(last_ent)
                # print('------------------')
        last_ent = ent   

def ents_spans_spacy_doc(entities):
    ent_spans =[]
    for ent in entities.ents:
        proposed_ent = doc.char_span(ent['start'],ent['end'],ent['entity_group'])
        if proposed_ent:
            ent_spans.append(proposed_ent)
    return ent_spans

def solve_split_words(entities,ent_spans):
    proposed_ents = []
    for i,ent in enumerate(entities.ents):
        proposed_ent = doc.char_span(ent['start'],ent['end'],ent['entity_group'])
        if not proposed_ent:
            proposed_ents.append(ent)
            #print(ent)

    #new_ent=None      
    for i,ent in enumerate(proposed_ents):     
        if ent['word'].startswith('##'):
            if proposed_ents[i-1]['entity_group'] == ent['entity_group']:
                if (ent['start'] - proposed_ents[i-1]['end'])<10:
                    new_ent = doc.char_span(proposed_ents[i-1]['start'],ent['end'],ent['entity_group'])
                    print(new_ent)
                    ent_spans.append(new_ent)
    
    return ent_spans

In [52]:
def postprocessing(doc,entities):
    remove_non_entities(entities)
    correct_boundaries(entities)
    ent_spans = ents_spans_spacy_doc(entities)
    ent_spans = solve_split_words(entities, ent_spans)
    ent_spans = list(filter(None, ent_spans))
    ents = doc.set_ents(ent_spans)
    print(doc.ents)
    

In [40]:
print(len(str(doc)))

53778


In [None]:
entities = class_entities.Entities(sequence, [], [], [])
offset = 0

for paragraph in paragraphs(doc):
    #print(len(str(paragraph)))
    disease_service.sentence_to_process(str(paragraph))
    disease_results = disease_service.predict()
    entities.append_new_entities(disease_results)
    chemical_service.sentence_to_process(str(paragraph))
    chemical_results = chemical_service.predict()
    entities.append_new_entities(chemical_results)
    entities.remove_non_entities()
    entities.correct_boundaries()
    
    offset += len(str(paragraph))
    disease_service.set_offset(offset)
    chemical_service.set_offset(offset)


disease_service.set_offset(0,restart = True)
chemical_service.set_offset(0,restart = True)
#print(entities.ents)

In [57]:
postprocessing(doc,entities)

severe acute respiratory syndrome coronavirus
Middle East respiratory syndrome coronavirus
novel coronavirus
AY278488
None
of pneumocytes
[Severe acute respiratory syndrome, acute respiratory disease, coronavirus disease 2019, SARS-CoV-2 infection, acute respiratory syndrome, coronavirus disease, respiratory infections, SARS, CoV, MERS-CoV, respiratory illness, unusual viral pneumonia, coronavirus disease, coronavirus disease 2019, SARS, MERS, pneumonia, SARS, MERS, viral pneumonia, fever, cough, chest discomfort, dyspnea, pneumonia, coronavirus disease, SARS-CoV-2, severe acute respiratory syndrome coronavirus 2, pneumonia, novel, coronavirus pneumonia, COVID-19, COVID-19, pneumonia, SARS, amino acid, amino acid, zinc, Severe acute respiratory syndrome, bat, Middle East respiratory syndrome, amino acids, amino acids, amino acids, amino acid, amino acid, amino acid, amino acid, severe acute respiratory syndrome, amino acids, amino acid, amino acid, amino acid, amino acid, Ser, Lys, cor

In [58]:
entities_html = displacy.render(doc, style="ent", options={"ents": ["DISEASE", "CHEMICAL"], "colors": colors})