In [27]:
import os
from nltk.tokenize import sent_tokenize, word_tokenize
from py_stringmatching.similarity_measure.soft_tfidf import SoftTfIdf
from py_stringmatching.similarity_measure.jaro_winkler import JaroWinkler

In [28]:
def instantiate_string_matching(cleaned_text, tuples):
    """Build the SoftTfIdf scorer used to compare tuples against sentences.

    The corpus handed to SoftTfIdf contains one token list per tuple
    (tokens of "<prop> <value>") followed by one token list per sentence
    of `cleaned_text`.

    Args:
        cleaned_text: Text to split into sentences with `sent_tokenize`.
        tuples: Iterable of dicts with the string keys 'prop' and 'value'.

    Returns:
        A SoftTfIdf instance using Jaro-Winkler raw scores as the secondary
        similarity function, constructed with threshold=0.5.
    """
    # One corpus document per tuple ...
    tuple_documents = [
        word_tokenize(entry['prop'] + " " + entry['value']) for entry in tuples
    ]
    # ... and one per sentence of the text.
    sentence_documents = [
        word_tokenize(sent) for sent in sent_tokenize(cleaned_text)
    ]

    return SoftTfIdf(
        tuple_documents + sentence_documents,
        sim_func=JaroWinkler().get_raw_score,
        threshold=0.5,
    )

In [29]:
def token_sliding_window(tokens, size):
    """Yield consecutive windows of `size` tokens, advancing one token at a time.

    A non-empty sequence shorter than `size` is yielded once as a single
    (shorter) window, so that short inputs are still seen by the caller
    instead of being silently skipped. An empty sequence yields nothing.

    Args:
        tokens: Sliceable sequence of tokens (e.g. a list of strings).
        size: Window length; must be a positive integer.

    Yields:
        Slices of `tokens` of length `size` (or the whole sequence when it
        is shorter than `size`).

    Raises:
        ValueError: If `size` is smaller than 1 (raised when iteration starts).
    """
    if size < 1:
        raise ValueError(
            "window size must be a positive integer, got {0!r}".format(size))

    # Short input: a full-width window is impossible, fall back to one
    # window covering everything that is available.
    if 0 < len(tokens) < size:
        yield tokens[:]
        return

    for start in range(len(tokens) - size + 1):
        yield tokens[start:start + size]

In [30]:
def score_sentences(cleaned_text, tupl, soft_tfidf):
    """Score every sentence of the text against one prop/value tuple.

    Each sentence is tokenized and scanned with `token_sliding_window`
    using a window of 5 tokens; the sentence score is the highest
    `soft_tfidf.get_raw_score(window, tuple_tokens)` over its windows,
    and 0.0 when no window scores above zero.

    Args:
        cleaned_text: Text to split into sentences with `sent_tokenize`.
        tupl: Dict with the string keys 'prop' and 'value'.
        soft_tfidf: Scorer exposing `get_raw_score(tokens_a, tokens_b)`.

    Returns:
        A list with one dict per sentence, in text order:
        {'sent': <sentence string>, 'score': <best window score>}.
    """
    window_size = 5
    query_tokens = word_tokenize(tupl['prop'] + " " + tupl['value'])

    results = []
    for sent in sent_tokenize(cleaned_text):
        # Seed with 0.0 so a sentence without any window still gets a score.
        candidates = [0.0]
        candidates.extend(
            soft_tfidf.get_raw_score(window, query_tokens)
            for window in token_sliding_window(word_tokenize(sent), window_size)
        )
        results.append({'sent': sent, 'score': max(candidates)})

    return results
        

In [31]:
def select_bigger_score(scores):
    """Return the entry of `scores` whose 'score' value is the highest.

    Args:
        scores: Non-empty iterable of dicts carrying a 'score' key.

    Returns:
        The first entry holding the maximum 'score'.

    Raises:
        ValueError: If `scores` is empty (propagated from `max`).
    """
    def score_of(entry):
        return entry['score']

    return max(scores, key=score_of)

In [45]:
def _best_match_indices(query_tokens, sentence_tokens, soft_tfidf):
    """For each query token, find the index of its best-scoring sentence token.

    Returns a list parallel to `query_tokens`; an entry is -1 when no
    sentence token scores above 0.0 for that query token. On ties the
    earliest sentence token wins.
    """
    best_indices = []
    for query_token in query_tokens:
        best_index, best_score = -1, 0.0
        for position, token in enumerate(sentence_tokens):
            score = soft_tfidf.get_raw_score([query_token], [token])
            if score > best_score:
                best_index, best_score = position, score
        best_indices.append(best_index)
    return best_indices


def score_sentence_and_set_ner(cleaned_text, tupl, soft_tfidf, threshold=0.6):
    """Select the sentences matching a tuple and tag their tokens.

    Sentences are scored with `score_sentences`; every sentence whose score
    is strictly greater than `threshold` is kept. For each kept sentence,
    the sentence token that best matches each token of the tuple's 'prop'
    is tagged 'PROP', the one that best matches each token of the tuple's
    'value' is tagged 'VALUE' ('PROP' wins when both apply), and all other
    tokens are tagged 'O'.

    Args:
        cleaned_text: Text to split into sentences.
        tupl: Dict with the string keys 'prop' and 'value'.
        soft_tfidf: Scorer exposing `get_raw_score(tokens_a, tokens_b)`.
        threshold: Minimum (exclusive) sentence score for selection.

    Returns:
        A triple of parallel lists:
        - the selected {'sent', 'score'} dicts,
        - the token list of each selected sentence,
        - the tag list ('PROP' / 'VALUE' / 'O') of each selected sentence.
    """
    scores = score_sentences(cleaned_text, tupl, soft_tfidf)

    prop_tokens = word_tokenize(tupl['prop'])
    value_tokens = word_tokenize(tupl['value'])

    selected_sentence = []
    list_sentence_tokens = []
    list_ner = []
    for entry in scores:
        if entry["score"] > threshold:
            selected_sentence.append(entry)
            sentence_tokens = word_tokenize(entry['sent'])
            list_sentence_tokens.append(sentence_tokens)

            # Sentence positions that best match the prop / value tokens
            # according to the soft tf-idf measure (-1 entries never match
            # a real position).
            prop_positions = set(
                _best_match_indices(prop_tokens, sentence_tokens, soft_tfidf))
            value_positions = set(
                _best_match_indices(value_tokens, sentence_tokens, soft_tfidf))

            ner = []
            for position in range(len(sentence_tokens)):
                if position in prop_positions:
                    ner.append('PROP')
                elif position in value_positions:
                    ner.append('VALUE')
                else:
                    ner.append('O')
            list_ner.append(ner)

    return selected_sentence, list_sentence_tokens, list_ner

In [46]:
def read_files(text_dir, structured_dir, filename):
    """Read the free text and the structured tuples that share one filename.

    The text file is read from `text_dir + filename`; its newlines are
    replaced by spaces and surrounding whitespace is removed. The
    structured file is read from `structured_dir + filename`; each
    non-blank line has the form "<prop>\\t:\\t<value>", and underscores in
    the line are replaced by spaces before it is split.

    Args:
        text_dir: Directory prefix of the text file (concatenated as-is,
            so it must end with a path separator).
        structured_dir: Directory prefix of the structured file
            (concatenated as-is).
        filename: Name shared by both files.

    Returns:
        (text, tuples) where `tuples` is a list of
        {'prop': <str>, 'value': <str>} dicts in file order.

    Raises:
        IndexError: If a non-blank structured line lacks the "\\t:\\t"
            separator.
    """
    # Context managers make sure both handles are closed, even on error.
    with open(text_dir + filename, 'r') as text_file:
        text = text_file.read().replace("\n", " ").strip()

    tuples = []
    with open(structured_dir + filename, 'r') as structured_file:
        for line in structured_file:
            line = line.replace("\n", "")
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no tuple.
                continue
            items = line.replace("_", " ").split("\t:\t")
            tuples.append({'prop': items[0], 'value': items[1]})

    return text, tuples

In [58]:
def write_to_file_tuple(filename, big_score, sentence_tokens, ner):
    """Write the tagged sentences to "out/<filename>", one sentence per line.

    Each line is a comma-separated list of "(token, tag)" pairs, e.g.
    "(stem, VALUE),(cell, VALUE),(., O)". An existing file is overwritten.
    The output directory is created when it does not exist yet.

    Args:
        filename: Name of the output file inside "out/".
        big_score: Selected sentence entries; only its length is used, to
            decide how many sentences are written.
        sentence_tokens: Token list of each sentence, parallel to
            `big_score`.
        ner: Tag list of each sentence, parallel to `sentence_tokens`.
    """
    filename_output = "out/" + filename
    # Make sure the target directory exists before opening the file.
    os.makedirs(os.path.dirname(filename_output), exist_ok=True)

    with open(filename_output, "w") as out:
        for i in range(len(big_score)):
            pairs = zip(sentence_tokens[i], ner[i])
            out.write(",".join(
                "(" + ", ".join(str(x) for x in pair) + ")" for pair in pairs
            ))
            out.write("\n")

def write_to_file_xml(filename, big_score, sentence_tokens, ner):
    """Write the tagged sentences to "out/<name>.xml" as a small XML document.

    The output has the form

        <document id="<filename up to the first dot>">
        \t<sentence>plain tokens <cell_type>tagged tokens</cell_type></sentence>
        </document>

    Every maximal run of consecutive tokens whose tag is not "O" is wrapped
    in one <cell_type> element. Tokens and runs are separated by exactly
    one space, and token text is XML-escaped. An existing file is
    overwritten; the output directory is created when missing.

    Args:
        filename: Source file name; ".txt" is replaced by ".xml" for the
            output name.
        big_score: Selected sentence entries; only its length is used.
        sentence_tokens: Token list of each sentence, parallel to
            `big_score`.
        ner: Tag list of each sentence, parallel to `sentence_tokens`.
    """
    from xml.sax.saxutils import escape, quoteattr

    filename_output = "out/" + filename.replace(".txt", ".xml")
    os.makedirs(os.path.dirname(filename_output), exist_ok=True)
    document_id = filename.split(".")[0]

    with open(filename_output, "w") as out:
        out.write('<document id={0}>\n'.format(quoteattr(document_id)))
        for i in range(len(big_score)):
            pieces = []
            # Tokens of the cell_type run currently being collected, i.e.
            # the previous word(s) were tagged as well.
            entity_run = []
            for token, tag in zip(sentence_tokens[i], ner[i]):
                if tag == "O":
                    if entity_run:
                        pieces.append(
                            "<cell_type>" + " ".join(entity_run) + "</cell_type>")
                        entity_run = []
                    pieces.append(escape(str(token)))
                else:
                    entity_run.append(escape(str(token)))
            # Close a run that extends to the end of the sentence.
            if entity_run:
                pieces.append(
                    "<cell_type>" + " ".join(entity_run) + "</cell_type>")
            out.write("\t<sentence>" + " ".join(pieces) + "</sentence>\n")
        out.write("</document>")

In [59]:
# Input locations: free text and the structured "prop : value" tuples that
# share the same file name.
text_dir = 'data/text/'
structured_dir = 'data/structured_data/'
# filenames = ['Abbeville_County,_South_Carolina', 'Acadia_Parish,_Louisiana', 'Accomack_County,_Virginia']
filenames = ['adipose_tissue.txt', 
             'bone_marrow.txt', 
             "umbilical_cord.txt",
             "epithelial.txt",
             "fibroblast.txt",
             "kidney.txt",
             "neural_cell.txt",
             "precursor_cell.txt",
             "stem_cell.txt"]

for filename in filenames:
    text, tuples = read_files(text_dir, structured_dir, filename)

    soft_tfidf = instantiate_string_matching(text, tuples)

    # Results are accumulated over all tuples of the file and written once:
    # the writer truncates the output file on every call, so writing per
    # tuple would keep only the last tuple's sentences.
    file_scores = []
    file_sentence_tokens = []
    file_ner = []

    for tupl in tuples:
        print(tupl)

        # Sentences whose best window score passes the threshold, plus a
        # named-entity tag for each token of those sentences.
        big_score, sentence_tokens, ner = score_sentence_and_set_ner(text, tupl, soft_tfidf)

        file_scores.extend(big_score)
        file_sentence_tokens.extend(sentence_tokens)
        file_ner.extend(ner)

    write_to_file_tuple(filename, file_scores, file_sentence_tokens, file_ner)
    print("Término escrita arquivo {0}".format(filename))

    print('')

{'prop': '', 'value': 'adipose tissue'}
Término escrita arquivo adipose_tissue.txt

{'prop': '', 'value': 'bone marrow'}
Término escrita arquivo bone_marrow.txt

{'prop': '', 'value': 'umbilical cord'}
Término escrita arquivo umbilical_cord.txt

{'prop': '', 'value': 'epithelial'}
Término escrita arquivo epithelial.txt

{'prop': '', 'value': 'fibroblast'}
Término escrita arquivo fibroblast.txt

{'prop': '', 'value': 'kidney'}
Término escrita arquivo kidney.txt

{'prop': '', 'value': 'neural cell'}
Término escrita arquivo neural_cell.txt

{'prop': '', 'value': 'precursor cell'}
Término escrita arquivo precursor_cell.txt

{'prop': '', 'value': 'stem cell'}
Término escrita arquivo stem_cell.txt

