In [1]:
import spacy
from spacy import displacy

import bioprocessor
import chemicalprocessor
import diseaseprocessor
import geneprocessor
import class_entities
from utils import paragraphs

import re

In [2]:
# Background gradient per entity label, passed to displaCy through its "colors" option.
colors = dict(
    DISEASE="linear-gradient(90deg, #aa9cfc, #fc9ce7)",
    CHEMICAL="linear-gradient(90deg, #ffa17f, #3575ad)",
    GENETIC="linear-gradient(90deg, #c21500, #ffc500)",
)

In [3]:
def _load_service(factory, model_dir, message):
    """Build one processor from `model_dir` and print `message` once it is ready."""
    service = factory(model_dir)
    print(message)
    return service


# One processor per entity type; each is built from its own model directory.
disease_service = _load_service(
    diseaseprocessor.DiseaseProcessor, './models/Disease', 'Disease Model Loaded')

chemical_service = _load_service(
    chemicalprocessor.ChemicalProcessor, './models/Chemical', 'Chemical Model Loaded')

genetic_service = _load_service(
    geneprocessor.GeneProcessor, './models/Gene', 'Genetic Model Loaded')

Disease Model Loaded
Chemical Model Loaded
Genetic Model Loaded


In [4]:
# Read the whole input document into memory as a single string.
# The encoding is pinned so the text decodes identically on every machine instead of
# depending on the locale default (assumes the file is UTF-8 — TODO confirm).
with open('prueba_comb.txt', 'r', encoding='utf-8') as f:
    sequence = f.read()

In [5]:
def process_by_paragraph(doc, entities):
    """Run the disease, chemical and genetic services over `doc`, one paragraph at a time.

    For every paragraph yielded by ``paragraphs(doc)`` the paragraph text is sent to each
    service (disease, then chemical, then genetic), the predictions are appended to
    `entities`, and ``entities.remove_non_entities()`` is called. After each paragraph
    every service receives the running total of the paragraph text lengths via
    ``set_offset``.

    Args:
        doc: document object accepted by ``paragraphs``.
        entities: collector exposing ``append_new_entities`` and ``remove_non_entities``.

    The services are module-level objects shared between calls, so their offset is reset
    with ``set_offset(0, restart=True)`` in a ``finally`` clause: if a prediction raises,
    the next call must not start from a stale offset.
    """
    services = (disease_service, chemical_service, genetic_service)
    offset = 0
    try:
        for paragraph in paragraphs(doc):
            # Convert once; the same text goes to all three services.
            text = str(paragraph)
            for service in services:
                service.sentence_to_process(text)
                entities.append_new_entities(service.predict())

            entities.remove_non_entities()

            # Offset for the NEXT paragraph: total length of the text seen so far.
            offset += len(text)
            for service in services:
                service.set_offset(offset)
    finally:
        for service in services:
            service.set_offset(0, restart=True)


In [6]:
# Parse the text with spaCy, collect the entities predicted by the three services,
# post-process them and hand the resulting document to displaCy.
nlp = spacy.load("en_core_web_sm")
doc = nlp(sequence)

entities = class_entities.Entities(doc)
process_by_paragraph(doc, entities)
entities.postprocessing()
print(len(entities))

# Only the three custom labels are highlighted, each with its own gradient.
render_options = {"ents": ["DISEASE", "CHEMICAL", "GENETIC"], "colors": colors}
# NOTE(review): inside a notebook displaCy may render inline instead of returning markup;
# confirm that entities_html actually holds the HTML if it is used later.
entities_html = displacy.render(entities.doc, style="ent", options=render_options)


374


In [15]:
def get_sort_key(m):
    """Sort key for a (label, start, end) tuple: span length first, then start position."""
    _, span_start, span_end = m
    return (span_end - span_start, span_start)


# Unique (label, start, end) triples, ordered longest span first.
matches = {(ent['entity_group'], ent['start'], ent['end']) for ent in entities.ents}
matches = sorted(matches, key=get_sort_key, reverse=True)

In [26]:
# Debugging aid: display one slice of the parsed document.
# NOTE(review): `doc` comes from nlp(sequence); slicing it presumably selects by token
# index, whereas the entity 'start'/'end' values checked elsewhere may be character
# offsets — confirm both use the same unit.
doc[10167:10246]



In [35]:
# Debugging aid: is there an entity whose boundaries are exactly 10190-10246?
any(t['start']==10190 and t['end']==10246 for t in entities.ents)

False

In [41]:
# Keep a set of non-overlapping entities, preferring longer spans.
#
# `matches` holds unique (label, start, end) triples sorted longest-first. A span is
# kept only if neither of its end positions is already covered by a previously kept
# span; because longer spans are visited first, checking the two ends is sufficient.
# The previous version appended every entity that did NOT overlap the current match,
# once per match, which produced a quadratic list of duplicates and never selected
# the match itself.

# Look up the full entity dict for each (label, start, end) key; first occurrence wins.
ents_by_key = {}
for e in entities.ents:
    ents_by_key.setdefault((e['entity_group'], e['start'], e['end']), e)

ents_no_overlaped = []
seen_tokens = set()
for match_id, start, end in matches:
    # `end` is exclusive, so the last covered position is end - 1.
    if start not in seen_tokens and end - 1 not in seen_tokens:
        ents_no_overlaped.append(ents_by_key[(match_id, start, end)])
        seen_tokens.update(range(start, end))

# Report the survivors in document order.
ents_no_overlaped.sort(key=lambda e: e['start'])

print(ents_no_overlaped)

KeyboardInterrupt: 

# DEPRECATED

In [13]:
def paragraph_tokenize(sequence):
    """Split `sequence` into paragraphs on '.\\n' or '. \\n'.

    Both separators are tried and the one producing more pieces is used; on a tie the
    '.\\n' split wins. The number of pieces is printed and the list of pieces returned.
    """
    tight_split = sequence.split('.\n')
    spaced_split = sequence.split('. \n')
    pieces = spaced_split if len(spaced_split) > len(tight_split) else tight_split
    print('Text split in:', len(pieces), 'paragraphs')
    return pieces

In [19]:
# Legacy run of the disease model only, one paragraph string at a time.
# NOTE(review): `paragraphs` has to be a list of strings here (presumably the result of
# paragraph_tokenize(sequence)). On a fresh kernel the name is the function imported
# from utils, so enumerate() would raise — confirm and assign it explicitly.
# NOTE(review): Entities(sequence, [], [], []) and append_entities() differ from the
# Entities(doc) / append_new_entities() calls used by the main pipeline; presumably an
# older API — confirm it still exists.
entities = class_entities.Entities(sequence, [], [], [])
offset = 0

for i,paragraph in enumerate(paragraphs):
    disease_service.sentence_to_process(paragraph)
    # offset+i: presumably adds one character per separator removed by the split — TODO confirm
    disease_service.set_offset(offset+i)
    disease_results = disease_service.predict()
    print(disease_service.offset)
    # NOTE(review): this stores only the current paragraph's length, not a running total;
    # with three or more paragraphs that looks wrong unless set_offset accumulates — confirm.
    offset = len(paragraph)
    entities.append_entities(disease_results)

0
3199


In [16]:
# Tokenizer exposed by the disease service; used in this cell to tokenize `sequence`.
tokenizer_disease = disease_service.tokenizer

def chunk_text(offset, tokenized_sequence, max_tokens=510):
    """Find the last sentence boundary inside a window of tokens.

    Scans backwards over the window ``[offset, offset + max_tokens)`` for a ``'.'``
    token that is followed by a token starting with an uppercase character. A ``'.'``
    that is the very last token of the sequence also counts as a boundary, since there
    is no following token to inspect.

    Args:
        offset: index of the first token of the window.
        tokenized_sequence: sequence of token strings.
        max_tokens: window size in tokens; defaults to 510, the value that used to be
            hard-coded.

    Returns:
        Index of the boundary ``'.'`` token, or -1 when the window contains none.
    """
    total = len(tokenized_sequence)
    # Clamp the window so a short tail cannot index past the end of the sequence.
    window_last = min(offset + max_tokens, total) - 1
    for j in range(window_last, offset - 1, -1):
        if tokenized_sequence[j] != '.':
            continue
        if j + 1 >= total:
            # Final token of the whole sequence: nothing follows, so it ends a sentence.
            return j
        # [:1] instead of [0] so an empty token cannot raise IndexError.
        if tokenized_sequence[j + 1][:1].isupper():
            return j
    return -1

# Legacy chunked run of the disease model: split the text at sentence boundaries so
# that each piece stays within a 510-token window, then predict piece by piece.

MAX_TOKENS = 510  # window size in tokens; must match the window chunk_text scans

# Tokenize first: the token list has to exist before its length can be reported.
tokenized_sequence = tokenizer_disease.tokenize(sequence)
print(len(tokenized_sequence))

# 1) Token index of the sentence boundary that closes each full window.
offset = 0
index_list = []
while len(tokenized_sequence) - offset >= MAX_TOKENS:
    split_index = chunk_text(offset, tokenized_sequence)
    print('Split index', split_index)
    print(len(tokenized_sequence) - offset)
    if split_index < offset:
        # chunk_text returns -1 when the window has no boundary; carrying on would
        # reset the offset to 0 and loop forever.
        raise ValueError(
            'No sentence boundary found in the window starting at token %d' % offset)
    offset = split_index + 1
    index_list.append(split_index)
print(len(tokenized_sequence) - offset)

# 2) Translate each token boundary into a character position in `sequence`.
# The two tokens around the boundary are searched for as literal text (re.escape, so
# '.' is not a wildcard), followed by a space, then a newline, then nothing. Each
# search starts at the previous boundary so the positions can only increase.
# NOTE(review): this is still a heuristic — if the same token pair occurs earlier in
# the window, the earlier occurrence is picked. Exact offsets would have to come from
# the tokenizer itself.
chunks = []
search_from = 0
for chunk in index_list:
    previous_token = tokenized_sequence[chunk - 1] if chunk > 0 else ''
    pattern = re.escape(previous_token + tokenized_sequence[chunk])
    result = None
    for suffix in (' ', '\n', ''):
        result = re.compile(pattern + suffix).search(sequence, search_from)
        if result is not None:
            break
    print(result)
    if result is None:
        raise ValueError(
            'Could not locate token boundary %d in the original text' % chunk)
    search_from = result.end()
    chunks.append(search_from)

# 3) Predict on every piece, including the tail after the last boundary, and tell the
# service where each piece starts in the full text.
start = 0
entities = class_entities.Entities(sequence, [], [], [])
# Placeholders kept from the original cell: only the disease model runs here.
chemical_results = []
gene_results = []
for chunk in chunks + [len(sequence)]:
    segment = sequence[start:chunk]
    if segment.strip():
        print(start)
        disease_service.sentence_to_process(segment)
        disease_service.set_offset(start)
        disease_results = disease_service.predict()
        entities.append_entities(disease_results)
    start = chunk

In [None]:
# Minimal one-sentence input for trying a model by hand.
# NOTE(review): this rebinds `sequence`, replacing the text read from prueba_comb.txt.
sequence = "Toluene is a chemical."

In [9]:

# Manual check of the chemical model alone on the current `sequence`.
# NOTE(review): this builds a second ChemicalProcessor from the same './models/Chemical'
# directory that the model-loading cell already uses — presumably redundant; confirm.
chemical_service = chemicalprocessor.ChemicalProcessor('./models/Chemical')

chemical_service.sentence_to_process(sequence)
chemical_results = chemical_service.predict()

# NOTE(review): four-argument Entities(...) and correct_boundaries() differ from the
# Entities(doc) / postprocessing() calls in the main pipeline cell; presumably an older
# API — confirm it still exists.
entities = class_entities.Entities(sequence, [], chemical_results, [])

# Entities as predicted, before any clean-up.
print(entities.ents)

entities.remove_non_entities()

entities.correct_boundaries()

# Entities after clean-up and boundary correction.
# NOTE(review): the saved output lists offsets 475-528, which cannot come from a
# one-sentence sample; it was presumably produced with a longer `sequence` — re-run.
print(entities.ents)

[{'entity_group': 'CHEMICAL', 'score': 0.9999813040097555, 'word': 'squalene', 'start': 475, 'end': 483}, {'entity_group': 'CHEMICAL', 'score': 0.9999890128771464, 'word': 'alpha - tocopherol', 'start': 512, 'end': 528}]
[{'entity_group': 'CHEMICAL', 'score': 0.9999813040097555, 'word': 'squalene', 'start': 475, 'end': 483}, {'entity_group': 'CHEMICAL', 'score': 0.9999890128771464, 'word': 'alpha - tocopherol', 'start': 512, 'end': 528}]


In [None]:
# One line per entity on the document: text, character span and label.
for ent in doc.ents:
    ent_fields = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*ent_fields)

In [37]:
# Debugging aid: show what is stored in doc.spans (empty in the recorded run).
doc.spans

{}

In [47]:
# Debugging aid: list the (name, component) pairs of the loaded spaCy pipeline.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7f9b79b2f180>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7f9b7b2bab30>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7f9bb93bd100>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7f9b7b048ac0>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7f9bbc003280>)]

In [None]:
# One line per token with the attributes computed by the pipeline.
for token in doc:
    token_fields = (
        token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
        token.shape_, token.is_alpha, token.is_stop,
    )
    print(*token_fields)