In [113]:
from bs4 import BeautifulSoup 
from bs4.element import Tag
import nltk

import pycrfsuite

import numpy as np
import pandas as pd

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np

import glob

import time
import re
import os
import sklearn_crfsuite
import pandas as pd
from sklearn_crfsuite.metrics import flat_accuracy_score
from collections import Counter
from sklearn_crfsuite import metrics
from random import randrange
from copy import deepcopy
from nltk import word_tokenize
from nltk import pos_tag
import pickle

nltk.download('maxent_ne_chunker')
nltk.download('words')


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/allyson/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/allyson/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

### informações sobre carregar dados

In [201]:
def get_tissue(file_name):
    # Read data file and parse the XML
    with open(file_name, "r") as infile:
        soup = BeautifulSoup(infile, 'html.parser')

    docs = []
    ners = []
    for elem in soup.find_all("sentence"):
        texts = []
        ner = []

        for c in elem:
            if type(c) == Tag:
                # part of a named entity
                for j in c.text.split(" "):
                    if len(j) > 0:
                        texts.append(j) 
                        ner.append("N")
            else:
                # irrelevant word
                for j in c.replace(",", "").replace("\"", "").split(" "):
                    if len(j) > 0:
                        texts.append(j) 
                        ner.append("I")



        docs.append(texts)
        ners.append(ner)

    return docs, ners



def get_tissue_other(file_name):
    # Read data file and parse the XML
    with open(file_name, "r") as infile:
        soup = BeautifulSoup(infile, 'html.parser')

    docs = []
    for elem in soup.find_all("sentence"):
        texts = []

        for c in elem:
            if type(c) == Tag:
                # part of a named entity
                for j in c.text.split(" "):
                    if len(j) > 0:
                        texts.append((j, "N")) 
            else:
                # irrelevant word
                for j in c.replace(",", "").replace("\"", "").split(" "):
                    if len(j) > 0:
                        texts.append((j, "I")) 



        docs.append(texts)

    data = []
    for i, doc in enumerate(docs):

        # Obtain the list of tokens in the document
        tokens = [t for t, label in doc]

        # Perform POS tagging
        tagged = nltk.pos_tag(tokens)

        # Take the word, POS tag, and its label
        data.append([(w, pos, label) for (w, label), (word, pos) in zip(doc, tagged)])

    return data


def get_tissue_from_file(filename):
    tissue_senteces = []
    print("Reading files")
    for file in filename:
        print(file)
        with open(file, "r") as f:
            for i in f:
                tissue_senteces.append(i)
    print("Reading is finished")
    return tissue_senteces

def sentences_train_test_split2(filename):
    qnt_files = len(filename)
    train = get_tissue_from_file(filename[:(qnt_files - 1)])
    test = get_tissue_from_file(filename[(qnt_files - 1):])
    return train, test

def sentences_train_test_split(filename):
    qnt_files = len(filename)
    train_sentence = []
    train_ner = []
    
    for i in filename[:(qnt_files - 1)]:
        sent, ner = get_tissue(i)
        train_sentence.append(sent)
        train_ner.append(ner)
        
    test_sentence, test_ner = get_tissue(filename[-1])
    return np.concatenate(train_sentence), np.concatenate(train_ner), test_sentence, test_ner

### Extraindo informações das sentenças e treinando o modelo

In [156]:
def label2features(label, sent):
    tokens = word_tokenize(sent)
    postags = pos_tag(tokens)
    label = label.encode("utf-8")
    label_tk = word_tokenize(label.lower())
    return ['VALUE' if tags[0].encode("utf-8").lower() in label_tk else 'O' for tags in postags]

def sent2features(tokens):
    tags = pos_tag(tokens)

    return [word2features(tags, i) for i in range(len(tags))]

def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    # features object definition with Token and postag added. Others default.
    features = {
        'token': word,
        'postag': postag,
        'np_chunk': 'none',
        'start_capital': False,
        'single_capital': False,
        'capital_period': False,
        'all_capital_period': False,
        'contains_number': False,
        'two_digits': False,
        'four_digits': False,
        'dollar_sign': False,
        'underline': False,
        'percentage': False,
        'purely_numeric': False,
        'number_type': False,
        'stop_word': False
    }

    # NP Chunk tag
    grammar = r"""NP:
    {<.*>+}          # Chunk everything
    }<VBD|IN>+{      # Chink sequences of VBD and IN
    """
    cp = nltk.RegexpParser(grammar)
    tree = cp.parse(sent)

    features['np_chunk'] = [subtree.label()
                            if sent[i][0] in [token for (token, tag) in subtree.leaves()] else 'none'
                            for subtree in tree.subtrees()][0]

    '''
        print(features['np_chunk'])
        for subtree in tree.subtrees():
            if sent[i][0] in [token for (token, tag) in subtree.leaves()]:
                features['np_chunk'] = subtree.label()
            else:
                features['np_chunk'] = 'none'
    '''

    # First token of sentence
    if i == 0:
        features['first'] = word

    # In first and second half of sentence
    if i < len(sent)/2:
        features['first_half'] = word
    elif i >= len(sent)/2:
        features['second_half'] = word

    # String normalization
    normalization = wordNormalization(word)
    features['normalization'] = normalization

    # Previous Tokens (window size = 5)
    previous_tokens = getTokensInWindow(sent, i, 5, "prev")

    features['previous_tokens'] = previous_tokens

    # Next Tokens (window size = 5)
    next_tokens = getTokensInWindow(sent, i, 5, "next")

    features['next_tokens'] = next_tokens

    # First letter capitalized
    if word[0].isupper():
        features['start_capital'] = True

    # Single capital
    if len(word)==1 and word.isupper():
        features['single_capital'] = True

    # Starts capital end period
    if word[0].isupper() and word[len(word)-1] == '.':
        features['capital_period'] = True

    # All capital end period
    capital_period_pattern = re.compile('^[A-Z]*\.$')
    if capital_period_pattern.match(word) is not None:
        features["all_capital_period"] = True

    # Contains at least one digit
    one_number_pattern = re.compile('[0-9]+')
    if one_number_pattern.match(word) is not None:
        features['contains_number'] = True

    # Two digits
    two_digits_pattern = re.compile('^[0-9]{2}$')
    if two_digits_pattern.match(word) is not None:
        features['two_digits'] = True

    # Four digits
    four_digits_pattern = re.compile('^[0-9]{4}$')
    if four_digits_pattern.match(word) is not None:
        features['four_digits'] = True

    # Contains dollar sign
    dollar_sign_pattern = re.compile('\$')
    if dollar_sign_pattern.match(word) is not None:
        features['dollar_sign'] = True

    # Contains uniderline
    underline_pattern = re.compile('\_')
    if underline_pattern.match(word) is not None:
        features['underline'] = True

    # Contains percentage
    percentage_pattern = re.compile('\%')
    if percentage_pattern.match(word) is not None:
        features['percentage'] = True

    # Purely numeric
    purely_numeric_pattern = re.compile('^\d+$')
    if purely_numeric_pattern.match(word) is not None:
        features['purely_numeric'] = True

    # Number type
    number_type_pattern = re.compile('(\d+((\.|,)*\d+)+((,)*\d+)*)*')
    if number_type_pattern.match(word) is not None:
        features['number_type'] = True

    # Stop word
    stop_words = ['the', 'a', 'of']
    if word in stop_words:
        features['stop_word'] = True

    # print(features)

    return features

# capital to "A"
# lowercase to "a"
# digit to "1"
# others to "0"
def wordNormalization(word):
    normalization = ''
    digit_pattern = re.compile('\d')

    for character in word:
        if character.isupper():
            normalization += "A"
        elif character.islower():
            normalization += "a"
        elif digit_pattern.match(character):
            normalization += "1"
        else:
            normalization += "0"
    return normalization

# return previous tokens in sentence including current index
def getTokensInWindow(sent, current_index, window_size, type):

    returnedTokens = []
    for i in range(1, window_size):
        if len(returnedTokens) < window_size:
            index = 0
            if type == 'prev':
                index = current_index - i
            elif type == 'next':
                index = current_index + i

            if 0 <= index < len(sent):
                returnedTokens.append(sent[index][0])
    return returnedTokens

def splitTrainingData(features, labels, division_ratio):
        trainSize = int(len(features) * division_ratio)
        print("trainSize = " + str(trainSize))
        x_train = []
        y_train = []
        x_test = []
        y_test = []

        while len(x_train) < trainSize and len(y_train) < trainSize:
            index = randrange(len(features))
            x_train.append(features.pop(index))
            y_train.append(labels.pop(index))

        while len(features) > 0:
            x_test.append(features.pop())
            y_test.append(labels.pop())

        return x_train, y_train, x_test, y_test

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

In [310]:
def trainCRFModel(sentences, ner, save=False):
        start_time = time.time()
        CRFs = []
        dump_dir = "crf/"
        dump_file = dump_dir + "sklean_crf.sav"

        if os.path.exists(dump_dir) is False:
            os.makedirs(dump_dir)

        if os.path.isfile(dump_file) and save:
            print("LOADING MODEL ALREADY TRAINED AND SAVED")
            crf = pickle.load(open(dump_file, 'rb'))
            CRFs.append(["sklean_crf", crf])
        else:        
            features = [sent2features(sent) for sent in sentences]
            labels = [n for n in ner]

            print('\nCount features: {}'.format(len(features)))
            print('Count labels: {}'.format(len(labels)))

            X = deepcopy(features)
            y = deepcopy(labels)

            #X_train, y_train, X_test, y_test = splitTrainingData(X, y, 0.75)
            X_train = X
            y_train = y
            
            crf = sklearn_crfsuite.CRF(
                algorithm='l2sgd',
                c2=0.1,
                max_iterations=200,
                all_possible_transitions=True,
            )
            crf.fit(X_train, y_train)
            pickle.dump(crf, open(dump_file, 'wb'))

            labels = list(crf.classes_)
            print("CRF CLASSES: ")
            print(labels)

#             y_pred = crf.predict(X_test)

#             sorted_labels = sorted(
#                 labels,
#                 key=lambda name: (name[1:], name[0])
#             )
#             print(metrics.flat_classification_report(
#                 y_test, y_pred, labels=sorted_labels, digits=3
#             ))

#             print(">>> flat accuracy: %.3f" % flat_accuracy_score(y_test, y_pred))

#             print("Top likely transitions:")
#             print_transitions(Counter(crf.transition_features_).most_common(20))

#             print("\nTop positive:")
#             print_state_features(Counter(crf.state_features_).most_common(30))

#             '''print("\nTop negative:")
#             self.print_state_features(Counter(crf.state_features_).most_common()[-30:])'''
            CRFs.append(["sklean_crf", crf])

        elapsed_time = time.time() - start_time
        print("Returned CRF Extractors: {}".format(len(CRFs)))
        print("CRF extractors: {}".format([name for (name, crf) in CRFs]))
        print("Elapsed time: {}".format(elapsed_time))
        return CRFs

def heuristicTokensSelection(tokens, labels, marginals):
    ret = []
    for i, label in enumerate(labels):
        #print("label: " + label)
        if label == "N" and marginals[i]["N"] > 0.5:
            ret.append(label)
            
    return ret
    
def runCRFExtractor(crf, sentence):
    if crf is not None:
        features_crf = sent2features(sentence)
        predictedLabels = crf.predict_single(features_crf)
        predictedMarginals = crf.predict_marginals_single(features_crf)
        #print(predictedLabels)
        #print(predictedMarginals)
        #return extractFromCRFOutput(sentence, predictedLabels, predictedMarginals)
        return predictedLabels
    else:
        print("CRF IS NONE")
        #return []
        return None

def extractFromCRFOutput(tokens, labels, marginals):
    # composedValue indicates if attribute is composed by two or more tokens
    extractions = heuristicTokensSelection(tokens, labels, marginals)

    if extractions is not None and len(extractions) > 0:
        #print(extractions)
        return extractions

    return []

def test_extractor(crf, sentences, print_results = False):
    pred = []
    for sent in sentences:
        #sent  = test_sentence[4]
        if print_results:
            print(sent)
            
        pred.append(runCRFExtractor(crf[0][1], sent))
    return pred

In [244]:
#filename_list = glob.glob("../../sentence_scoring/out/*.xml")
filename_list = [
 '../../sentence_scoring/out/stem_cell.xml',
 '../../sentence_scoring/out/neural_cell.xml',
 '../../sentence_scoring/out/fibroblast.xml',
  '../../sentence_scoring/out/epithelial.xml',
 '../../sentence_scoring/out/kidney.xml',
 '../../sentence_scoring/out/precursor_cell.xml',
'../../sentence_scoring/out/adipose_tissue.xml',
'../../sentence_scoring/out/umbilical_cord.xml',
'../../sentence_scoring/out/bone_marrow.xml']
#tissue_train_setences, tissue_test_setences = sentences_train_test_split(filename_list)
tissue_list = []
ner_list = []
# for filename in filename_list:
#     tissue_aux, ner_aux = get_tissue(filename)
#     tissue_list.append(tissue_aux)
#     ner_list.append(ner_aux)

train_sentence, train_ner, test_sentence, test_ner = sentences_train_test_split(filename_list)
#tissue_list

In [248]:
len(train_sentence)

468

In [249]:
len(test_sentence)

118

In [250]:
len(test_sentence) + len(train_sentence)

586

In [245]:
crf = trainCRFModel(train_sentence, train_ner


Count features: 468
Count labels: 468
trainSize = 351
CRF CLASSES: 
['I', 'N']
             precision    recall  f1-score   support

          I      0.968     0.994     0.981      2700
          N      0.865     0.516     0.646       186

avg / total      0.961     0.964     0.959      2886

>>> flat accuracy: 0.964
Top likely transitions:
I      -> I       0.427033
N      -> I       -0.055849
I      -> N       -0.199303
N      -> N       -1.415806

Top positive:
3.424234 I        next_tokens::
2.426344 I        normalization:0
2.300739 N        token:adipose
2.066315 I        next_tokens:by
2.060982 I        next_tokens:factor
2.053555 N        next_tokens:expressionlevel
1.915232 N        token:precursors
1.879041 I        next_tokens:adipose
1.874212 N        token:correlate
1.874212 N        second_half:correlate
1.852713 N        token:fibroblast
1.819771 I        next_tokens:fibroblasts
1.794918 I        contains_number
1.748304 I        first_half:mesenchymal
1.712356 I       

In [246]:
y_pred = []
for sent in test_sentence:
    #sent  = test_sentence[4]
    print(sent)
    y_pred.append(runCRFExtractor(crf[0][1], sent))

['About', '85', '%', 'of', 'the', 'cell', 'population', 'in', 'this', 'bone', 'tissueare', 'osteocytes', 'and', 'the', 'remainder', 'are', 'osteoblasts', 'bone', 'marrow', 'etc', '.']
[{'I': 0.9996273154421305, 'N': 0.0003726845578700991}, {'I': 0.9999601365304698, 'N': 3.986346953074631e-05}, {'I': 0.9997896012984254, 'N': 0.0002103987015751682}, {'I': 0.9992823202214376, 'N': 0.0007176797785627101}, {'I': 0.9621372857684143, 'N': 0.037862714231586145}, {'I': 0.6126913353378702, 'N': 0.38730866466213004}, {'I': 0.9877938519555808, 'N': 0.01220614804441975}, {'I': 0.9832728339828717, 'N': 0.016727166017128665}, {'I': 0.996636845650564, 'N': 0.003363154349436147}, {'I': 0.981802889070131, 'N': 0.018197110929869142}, {'I': 0.983392015564622, 'N': 0.01660798443537813}, {'I': 0.9635712129692212, 'N': 0.03642878703077891}, {'I': 0.996449713830963, 'N': 0.003550286169037159}, {'I': 0.9575633753972868, 'N': 0.04243662460271354}, {'I': 0.9884291195163823, 'N': 0.011570880483617688}, {'I': 0.99

[{'I': 0.9982808365203115, 'N': 0.0017191634796881483}, {'I': 0.9902333782358181, 'N': 0.009766621764181787}, {'I': 0.9991891226299829, 'N': 0.0008108773700169876}, {'I': 0.9981823528070227, 'N': 0.0018176471929770707}, {'I': 0.9837918525873244, 'N': 0.01620814741267556}, {'I': 0.9945533815157294, 'N': 0.005446618484270492}, {'I': 0.960379323019872, 'N': 0.039620676980127834}, {'I': 0.9443716900087541, 'N': 0.0556283099912458}, {'I': 0.9930334374046176, 'N': 0.006966562595382437}, {'I': 0.9666070424315056, 'N': 0.03339295756849424}, {'I': 0.9851424404897308, 'N': 0.014857559510268998}, {'I': 0.9909565902576819, 'N': 0.009043409742318143}, {'I': 0.9893073584453745, 'N': 0.010692641554625421}, {'I': 0.9989813217398522, 'N': 0.0010186782601478376}, {'I': 0.993860493579776, 'N': 0.006139506420224101}, {'I': 0.9997744964627262, 'N': 0.000225503537273625}, {'I': 0.9992276114065087, 'N': 0.0007723885934912044}, {'I': 0.9999194013468964, 'N': 8.059865310361175e-05}, {'I': 0.9998197297179746, '

[{'I': 0.9940842421605355, 'N': 0.005915757839463141}, {'I': 0.996956302260803, 'N': 0.0030436977391957586}, {'I': 0.995938428715457, 'N': 0.0040615712845417885}, {'I': 0.981752466485139, 'N': 0.01824753351485994}, {'I': 0.992113939619043, 'N': 0.007886060380955977}, {'I': 0.9957933516836686, 'N': 0.00420664831633045}, {'I': 0.9969071643760229, 'N': 0.0030928356239762695}, {'I': 0.9785397470739923, 'N': 0.021460252926006857}, {'I': 0.9991842676108573, 'N': 0.0008157323891415935}, {'I': 0.9722991847825417, 'N': 0.027700815217457254}, {'I': 0.9937156781268798, 'N': 0.006284321873119189}, {'I': 0.9997207817721375, 'N': 0.00027921822786166917}, {'I': 0.9994375283996465, 'N': 0.0005624716003525083}, {'I': 0.9958060672146792, 'N': 0.004193932785319675}, {'I': 0.9981074176648739, 'N': 0.0018925823351250515}, {'I': 0.9985612248768735, 'N': 0.001438775123125351}, {'I': 0.9998015742691674, 'N': 0.00019842573083143575}, {'I': 0.9994898317868223, 'N': 0.0005101682131763981}, {'I': 0.99168960400576

[{'I': 0.9909729689880179, 'N': 0.009027031011982623}, {'I': 0.999771510672979, 'N': 0.0002284893270214864}, {'I': 0.9471956231800468, 'N': 0.05280437681995377}, {'I': 0.9995690399591279, 'N': 0.0004309600408723666}, {'I': 0.9992797285182151, 'N': 0.0007202714817852954}, {'I': 0.999757678052044, 'N': 0.00024232194795620294}, {'I': 0.9915229051629292, 'N': 0.008477094837070796}, {'I': 0.9965559912510774, 'N': 0.0034440087489228414}, {'I': 0.994503016993961, 'N': 0.005496983006039289}, {'I': 0.8649578855474724, 'N': 0.13504211445252812}, {'I': 0.9951919442127555, 'N': 0.004808055787244842}, {'I': 0.8826848114203704, 'N': 0.11731518857962973}, {'I': 0.9784770681973965, 'N': 0.0215229318026035}, {'I': 0.9945330032539582, 'N': 0.005466996746041851}, {'I': 0.9980084422675278, 'N': 0.0019915577324720003}]
['Human', 'fetal', 'bone', 'marrow', 'wasobtained', 'and', 'using', 'flow', 'cytometry', 'antibodies', 'four', 'populations', 'of', 'B-cell', 'developmental', 'stages', 'including', 'stage',

[{'I': 0.9959559355645959, 'N': 0.004044064435403651}, {'I': 0.9943145682846757, 'N': 0.0056854317153239315}, {'I': 0.9983240493730403, 'N': 0.001675950626959281}, {'I': 0.9992327870108386, 'N': 0.0007672129891611761}, {'I': 0.9990858252145864, 'N': 0.0009141747854133238}, {'I': 0.9999722814356945, 'N': 2.771856430499012e-05}, {'I': 0.9999803677785999, 'N': 1.963222139944496e-05}, {'I': 0.9986153666712713, 'N': 0.0013846333287282054}, {'I': 0.9596195169262166, 'N': 0.040380483073782766}, {'I': 0.9998865696322172, 'N': 0.00011343036778227171}, {'I': 0.9997970803674054, 'N': 0.00020291963259400165}, {'I': 0.9998910289464737, 'N': 0.0001089710535256858}, {'I': 0.930066507332396, 'N': 0.06993349266760333}, {'I': 0.9823968519296814, 'N': 0.017603148070317955}, {'I': 0.7143676097458043, 'N': 0.28563239025419523}, {'I': 0.9497477513767485, 'N': 0.050252248623250804}, {'I': 0.9667793047771753, 'N': 0.033220695222824045}, {'I': 0.990172160980246, 'N': 0.009827839019753351}, {'I': 0.994799412023

[{'I': 0.4296212846594861, 'N': 0.5703787153405142}, {'I': 0.9985680145051614, 'N': 0.0014319854948389114}, {'I': 0.9993773834503396, 'N': 0.0006226165496605929}, {'I': 0.9993417653501506, 'N': 0.0006582346498494046}, {'I': 0.893954902204927, 'N': 0.10604509779507305}, {'I': 0.9416262722582265, 'N': 0.05837372774177375}, {'I': 0.9999721289230747, 'N': 2.7871076925294366e-05}, {'I': 0.9997631424348892, 'N': 0.0002368575651109077}, {'I': 0.9229475718714494, 'N': 0.07705242812855065}, {'I': 0.9995186615326234, 'N': 0.0004813384673764245}, {'I': 0.9999918152254859, 'N': 8.184774513995639e-06}, {'I': 0.9996681176648029, 'N': 0.00033188233519706745}, {'I': 0.9998550875513782, 'N': 0.00014491244862204606}, {'I': 0.9903251580226887, 'N': 0.009674841977311366}, {'I': 0.9999792966435678, 'N': 2.0703356432373e-05}, {'I': 0.9982214732688371, 'N': 0.001778526731163259}, {'I': 0.998006712450707, 'N': 0.001993287549293407}, {'I': 0.9999955588341937, 'N': 4.441165806704244e-06}, {'I': 0.99999747762815

[{'I': 0.9948294548744915, 'N': 0.005170545125508466}, {'I': 0.9968391300478874, 'N': 0.003160869952112537}, {'I': 0.9956699511358705, 'N': 0.0043300488641294}, {'I': 0.9989842162654795, 'N': 0.001015783734520241}, {'I': 0.9943701419111042, 'N': 0.005629858088895664}, {'I': 0.9993171329706959, 'N': 0.0006828670293041469}, {'I': 0.8100091887670535, 'N': 0.18999081123294628}, {'I': 0.9986971486339445, 'N': 0.0013028513660553417}, {'I': 0.9980977890751154, 'N': 0.00190221092488457}, {'I': 0.998809155935029, 'N': 0.0011908440649710224}, {'I': 0.9993339131612472, 'N': 0.0006660868387527927}, {'I': 0.9951634776437142, 'N': 0.00483652235628592}, {'I': 0.9936047896585619, 'N': 0.006395210341438199}, {'I': 0.9954845407207324, 'N': 0.004515459279267959}, {'I': 0.9862062949475673, 'N': 0.013793705052433212}, {'I': 0.9965244510620721, 'N': 0.003475548937928131}, {'I': 0.9882336460286646, 'N': 0.011766353971335524}, {'I': 0.9972199977410299, 'N': 0.002780002258970358}, {'I': 0.9967445387533347, 'N'

In [247]:
print(metrics.flat_classification_report(
                test_ner, y_pred, digits=3
            ))
#umbilical cord tissue as test set

             precision    recall  f1-score   support

          I      0.881     0.994     0.934      1717
          N      0.333     0.021     0.040       235

avg / total      0.815     0.877     0.827      1952



In [235]:
glob.glob("../../sentence_scoring/out/*.xml")

['../../sentence_scoring/out/umbilical_cord.xml',
 '../../sentence_scoring/out/stem_cell.xml',
 '../../sentence_scoring/out/neural_cell.xml',
 '../../sentence_scoring/out/fibroblast.xml',
 '../../sentence_scoring/out/bone_marrow.xml',
 '../../sentence_scoring/out/adipose_tissue.xml',
 '../../sentence_scoring/out/epithelial.xml',
 '../../sentence_scoring/out/kidney.xml',
 '../../sentence_scoring/out/precursor_cell.xml']

### teste de desempenho

In [267]:
def cross_val_train_test():
    values = range(len(filename_list))
    for i in values:
        train_sentence = []
        train_ner = []
        test_sentence = [] 
        test_ner = []
        
        for j in values:
            if i == j:
                test_sentence, test_ner = get_tissue(filename_list[j])
            else:
                sent, ner = get_tissue(filename_list[j])
                train_sentence.append(sent)
                train_ner.append(ner)

        yield np.concatenate(train_sentence), np.concatenate(train_ner), test_sentence, test_ner
        #metrics.flat_accuracy_score(test_ner, y_pred)


In [311]:
accuracy = []
precision_pos = []
precision_neg = []
recall_pos = []
recall_neg = []
for train_sentence, train_ner, test_sentence, test_ner in cross_val_train_test():
    crf = trainCRFModel(train_sentence, train_ner)
    y_pred = test_extractor(crf, test_sentence)
    accuracy.append(metrics.flat_accuracy_score(test_ner, y_pred))
    precision_pos.append(metrics.flat_precision_score(test_ner, y_pred, pos_label="N"))
    precision_neg.append(metrics.flat_precision_score(test_ner, y_pred, pos_label="I"))
    recall_pos.append(metrics.flat_recall_score(test_ner, y_pred, pos_label="N"))
    recall_neg.append(metrics.flat_recall_score(test_ner, y_pred, pos_label="I"))


Count features: 448
Count labels: 448
CRF CLASSES: 
['I', 'N']
Returned CRF Extractors: 1
CRF extractors: ['sklean_crf']
Elapsed time: 11.106700658798218

Count features: 557
Count labels: 557
CRF CLASSES: 
['I', 'N']
Returned CRF Extractors: 1
CRF extractors: ['sklean_crf']
Elapsed time: 13.800935983657837

Count features: 508
Count labels: 508
CRF CLASSES: 
['I', 'N']
Returned CRF Extractors: 1
CRF extractors: ['sklean_crf']
Elapsed time: 12.834608316421509

Count features: 491
Count labels: 491
CRF CLASSES: 
['I', 'N']
Returned CRF Extractors: 1
CRF extractors: ['sklean_crf']
Elapsed time: 12.259775876998901

Count features: 562
Count labels: 562
CRF CLASSES: 
['I', 'N']
Returned CRF Extractors: 1
CRF extractors: ['sklean_crf']
Elapsed time: 13.378231048583984

Count features: 556
Count labels: 556
CRF CLASSES: 
['I', 'N']
Returned CRF Extractors: 1
CRF extractors: ['sklean_crf']
Elapsed time: 14.569947719573975

Count features: 532
Count labels: 532
CRF CLASSES: 
['I', 'N']
Return

In [312]:
print("acurácia = " + str(np.mean(accuracy)))
print("precisão pos = " + str(np.mean(precision_pos)))
print("precisão neg = " + str(np.mean(precision_neg)))
print("cobertura pos = " + str(np.mean(recall_pos)))
print("cobertura neg = " + str(np.mean(recall_neg)))

acurácia = 0.9217973708089963
precisão pos = 0.3008065059726922
precisão neg = 0.930552949886909
cobertura pos = 0.05416678764613683
cobertura neg = 0.9897113441770339


In [309]:
print("acurácia = " + str(np.mean(accuracy)))
print("precisão pos = " + str(np.mean(precision_pos)))
print("precisão neg = " + str(np.mean(precision_neg)))
print("cobertura pos = " + str(np.mean(recall_pos)))
print("cobertura neg = " + str(np.mean(recall_neg)))

acurácia = 0.9204574923195101
precisão pos = 0.28221546971546974
precisão neg = 0.9307225441523381
cobertura pos = 0.05686319358028559
cobertura neg = 0.9879515999600055


In [304]:
# alg = lbfgs
#c1 = 0.01
#c2 = 0.01
print("acurácia = " + str(np.mean(accuracy)))
print("precisão pos = " + str(np.mean(precision_pos)))
print("precisão neg = " + str(np.mean(precision_neg)))
print("cobertura pos = " + str(np.mean(recall_pos)))
print("cobertura neg = " + str(np.mean(recall_neg)))

acurácia = 0.9207489236188842
precisão pos = 0.2811762396128883
precisão neg = 0.9304783027017801
cobertura pos = 0.05428433595027707
cobertura neg = 0.9885659636882702
