In [1]:
from bs4 import BeautifulSoup 
from bs4.element import Tag
import nltk

import pycrfsuite

import numpy as np
import pandas as pd

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np

import glob

In [2]:
def get_tissue(file_name):
    # Read data file and parse the XML
    with open(file_name, "r") as infile:
        soup = BeautifulSoup(infile, 'html.parser')

    docs = []
    for elem in soup.find_all("sentence"):
        texts = []

        for c in elem:
            if type(c) == Tag:
                # part of a named entity
                for j in c.text.split(" "):
                    if len(j) > 0:
                        texts.append((j, "N")) 
            else:
                # irrelevant word
                for j in c.replace(",", "").replace("\"", "").split(" "):
                    if len(j) > 0:
                        texts.append((j, "I")) 



        docs.append(texts)

    data = []
    for i, doc in enumerate(docs):

        # Obtain the list of tokens in the document
        tokens = [t for t, label in doc]

        # Perform POS tagging
        tagged = nltk.pos_tag(tokens)

        # Take the word, POS tag, and its label
        data.append([(w, pos, label) for (w, label), (word, pos) in zip(doc, tagged)])

    return data

In [3]:
def get_tissue_other(file_name):
    # Read data file and parse the XML
    with open(file_name, "r") as infile:
        soup = BeautifulSoup(infile, 'html.parser')

    docs = []
    for elem in soup.find_all("sentence"):
        texts = []

        for c in elem:
            if type(c) == Tag:
                # part of a named entity
                for j in c.text.split(" "):
                    if len(j) > 0:
                        texts.append((j, "N")) 
            else:
                # irrelevant word
                for j in c.replace(",", "").replace("\"", "").split(" "):
                    if len(j) > 0:
                        texts.append((j, "I")) 



        docs.append(texts)

    data = []
    for i, doc in enumerate(docs):

        # Obtain the list of tokens in the document
        tokens = [t for t, label in doc]

        # Perform POS tagging
        tagged = nltk.pos_tag(tokens)

        # Take the word, POS tag, and its label
        data.append([(w, pos, label) for (w, label), (word, pos) in zip(doc, tagged)])

    return data

In [4]:
def get_tissue_from_file(cell_type):
    file_name = "../../sentence_scoring/out/" + cell_type + ".xml"
    data = get_tissue(file_name)
    return data

filenames = ['bone_marrow',  
             "umbilical_cord",
             'adipose_tissue',
             "fibroblast",
             "kidney",
             "neural_cell",
             "precursor_cell",
             "stem_cell",
             "epithelial"
            ]

tissue_list = []
for filename in filenames:
    tissue_list.append(get_tissue_from_file(filename))

In [5]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = [
        'bias=1.0',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag,
        'postag[:2]=' + postag[:2],
    ]
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:postag=' + postag1,
            '-1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('BOS=True')
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:postag=' + postag1,
            '+1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('EOS')
                
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [6]:
# A function for extracting features in documents
def extract_features(doc):
    return [word2features(doc, i) for i in range(len(doc))]

# A function fo generating the list of labels for each document
def get_labels(doc):
    return [label for (token, postag, label) in doc]

tissue_features = []
tissue_label = []

for tissue in tissue_list:
    tissue_features.append([extract_features(doc) for doc in tissue])
    tissue_label.append([get_labels(doc) for doc in tissue])
    
    
# # Umbilical Cord data
# x_umbilical_cord = [extract_features(doc) for doc in umbilical_cord]
# y_umbilical_cord = [get_labels(doc) for doc in umbilical_cord]

# # Bone Marrow data
# x_bone_marrow = [extract_features(doc) for doc in bone_marrow]
# y_bone_marrow = [get_labels(doc) for doc in bone_marrow]

# # Adipose Tissue data
# x_adipose_tissue = [extract_features(doc) for doc in adipose_tissue]
# y_adipose_tissue = [get_labels(doc) for doc in adipose_tissue]

### Umbilical Cord model

In [7]:
len(tissue_list)

9

In [8]:
trainer = pycrfsuite.Trainer(verbose=True)

# x_umbilical_cord_train = x_adipose_tissue + x_bone_marrow
# y_umbilical_cord_train = y_adipose_tissue + y_bone_marrow

# Separando dados de treino e teste
x_train = np.concatenate(tissue_features[:7])
y_train = np.concatenate(tissue_label[:7])

x_test = tissue_features[-1]
y_test = tissue_label[-1]

# Submit training data to the trainer
for xseq, yseq in zip(x_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.1,  

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('crf_umbilical_cord.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 7876
Seconds required: 0.046

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 2798.169872
Feature norm: 1.000000
Error norm: 2342.744713
Active features: 7732
Line search trials: 1
Line search step: 0.000065
Seconds required for this iteration: 0.014

***** Iteration #2 *****
Loss: 2514.194951
Feature norm: 0.875749
Error norm: 2021.313241
Active features: 7782
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.011

***** Iteration #3 *****
Loss: 2150.763510
Feature norm: 0.531380
Error norm: 1384.802051
Active features: 5237
Line search trials: 2
Line search step: 0.500000
Seconds required for this iter

***** Iteration #46 *****
Loss: 571.058096
Feature norm: 37.784058
Error norm: 13.748814
Active features: 2346
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #47 *****
Loss: 570.700106
Feature norm: 37.824178
Error norm: 27.808663
Active features: 2344
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #48 *****
Loss: 570.464817
Feature norm: 37.934646
Error norm: 49.479279
Active features: 2335
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #49 *****
Loss: 570.057395
Feature norm: 37.927804
Error norm: 31.727392
Active features: 2325
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #50 *****
Loss: 569.766623
Feature norm: 37.993992
Error norm: 19.970793
Active features: 2316
Line search trials: 1
Line search step: 1.000000
Seconds required for thi

***** Iteration #100 *****
Loss: 565.211118
Feature norm: 37.662624
Error norm: 3.303605
Active features: 2172
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.015

***** Iteration #101 *****
Loss: 565.200588
Feature norm: 37.663919
Error norm: 8.030101
Active features: 2170
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.015

***** Iteration #102 *****
Loss: 565.194314
Feature norm: 37.660549
Error norm: 13.163149
Active features: 2170
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #103 *****
Loss: 565.178661
Feature norm: 37.667824
Error norm: 16.671369
Active features: 2170
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #104 *****
Loss: 565.146406
Feature norm: 37.660764
Error norm: 9.462289
Active features: 2167
Line search trials: 1
Line search step: 1.000000
Seconds required for t

***** Iteration #143 *****
Loss: 564.800718
Feature norm: 37.669830
Error norm: 6.598307
Active features: 2152
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #144 *****
Loss: 564.794194
Feature norm: 37.668607
Error norm: 5.649796
Active features: 2152
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.018

***** Iteration #145 *****
Loss: 564.789752
Feature norm: 37.672142
Error norm: 5.937062
Active features: 2154
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.017

***** Iteration #146 *****
Loss: 564.784133
Feature norm: 37.670888
Error norm: 5.029988
Active features: 2154
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #147 *****
Loss: 564.782272
Feature norm: 37.673421
Error norm: 6.448892
Active features: 2154
Line search trials: 1
Line search step: 1.000000
Seconds required for thi

***** Iteration #188 *****
Loss: 564.604098
Feature norm: 37.681738
Error norm: 4.625034
Active features: 2147
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.017

***** Iteration #189 *****
Loss: 564.602512
Feature norm: 37.682544
Error norm: 6.241565
Active features: 2145
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.017

***** Iteration #190 *****
Loss: 564.597796
Feature norm: 37.681192
Error norm: 4.678775
Active features: 2145
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.017

***** Iteration #191 *****
Loss: 564.596283
Feature norm: 37.681873
Error norm: 6.516710
Active features: 2145
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.017

***** Iteration #192 *****
Loss: 564.591223
Feature norm: 37.680272
Error norm: 4.662534
Active features: 2143
Line search trials: 1
Line search step: 1.000000
Seconds required for thi

In [9]:
# Result
tagger = pycrfsuite.Tagger()
tagger.open('crf_umbilical_cord.model')
y_pred = [tagger.tag(xseq) for xseq in x_test]

# Let's take a look at a random sample in the testing set
i = 18
for x, y in zip(y_pred[i], [x[1].split("=")[1] for x in x_test[i]]):
    print("%s (%s)" % (y, x))

we (I)
find (I)
that (I)
this (I)
human (I)
sendmp (I)
signature (I)
is (I)
positively (I)
and (I)
significantly (I)
correlated (I)
with (I)
bothcancer (I)
and (I)
ageing-associated (I)
methylomic (I)
dynamics (I)
. (I)


In [10]:
# Metrics
# Create a mapping of labels to indices
labels = {"N": 1, "I": 0}

# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

# Print out the classification report
print(classification_report(
    truths, predictions,
    target_names=["I", "N"]))

             precision    recall  f1-score   support

          I       0.96      0.99      0.97      2075
          N       0.00      0.00      0.00        95

avg / total       0.91      0.95      0.93      2170



### Testando para outras classes

In [11]:
x = x_adipose_tissue + x_bone_marrow + x_umbilical_cord
y = y_adipose_tissue + y_bone_marrow + y_umbilical_cord

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

NameError: name 'x_adipose_tissue' is not defined

In [None]:
trainer = pycrfsuite.Trainer(verbose=True)

# Submit training data to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.1,  

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('crf.model')

In [None]:
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Let's take a look at a random sample in the testing set
i = 12
for x, y in zip(y_pred[i], [x[1].split("=")[1] for x in X_test[i]]):
    print("%s (%s)" % (y, x))

In [None]:
# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

# Print out the classification report
print(classification_report(
    truths, predictions,
    target_names=["I", "N"]))

### CRF do sklearn

In [None]:
import time
import re
import os
import sklearn_crfsuite
import pandas as pd
from sklearn_crfsuite.metrics import flat_accuracy_score
from collections import Counter
from sklearn_crfsuite import metrics
from random import randrange
from copy import deepcopy
from nltk import word_tokenize
from nltk import pos_tag
import pickle
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')


class Extractor(object):

    def runCRFExtractor(self, crf, sentence):

        if crf is not None:
            features_crf = Extractor().sent2features(sentence)
            predictedLabels = crf.predict_single(features_crf)
            predictedMarginals = crf.predict_marginals_single(features_crf)
            print(predictedLabels)
            return self.extractFromCRFOutput(sentence, predictedLabels, predictedMarginals)
        else:
            print("CRF IS NONE")
            return []

    def extractFromCRFOutput(self, sentence, labels, marginals):
        tokens = word_tokenize(sentence)

        # composedValue indicates if attribute is composed by two or more tokens
        extractions = heuristicTokensSelection(labels, marginals, tokens)

        if extractions is not None and len(extractions) > 0:
            print(sentence)
            print(extractions)
            return extractions

        return []


    def trainCRFModel(self, attributes, CLASS):
        start_time = time.time()
        CRFs = []

        for attribute in attributes:
            print("Start training CRF for attribute: %s" % attribute)

            dump_dir = "/" + CLASS + "/crf/"
            dump_file = dump_dir + attribute + ".sav"

            if os.path.exists(dump_dir) is False:
                os.makedirs(dump_dir)

            if os.path.isfile(dump_file):
                print("LOADING MODEL ALREADY TRAINED AND SAVED")
                crf = pickle.load(open(dump_file, 'rb'))
                CRFs.append([attribute, crf])
            else:
                train_file = TRAIN_EXT_DIR + "/" + CLASS + "/" + attribute + ".csv"

                samples = pd.read_csv(train_file, names=['sentence', 'ner', 'entity', 'value', 'label'],
                                      dtype={'sentence': str, 'entity': str, 'value': str, 'label': str},
                                      converters={'ner': eval}, encoding='utf-8')

                data = samples[samples['label'] == 't']
                subset = data[['ner', 'sentence']]

                sentences = subset['sentence'].values
                ner = subset['ner'].values

                sentences_final = []
                ner_final = []
                for i, s in enumerate(sentences):
                    s_tokens = word_tokenize(s)
                    if len(s_tokens) == len(ner[i]):
                        sentences_final.append(s)
                        ner_final.append(ner[i])
                    else:
                        print(">>> DIFFERENT LENGHTS FOR SENTENCE TOKENS AND NER")

                features = [self.sent2features(sent) for sent in sentences_final]
                labels = [n for n in ner_final]

                print('\nCount features: {}'.format(len(features)))
                print('Count labels: {}'.format(len(labels)))

                '''
                print(features[0])
                print([example for index, example in samples.iterrows() if example['label'] == 't'][0])
                print(labels[0])
                print(type(features))
                print(type(labels))
                print("\n")
                
                Removes occurences of ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] from training CRFs
                
                keptFeatures = []
                keptLabels = []
                
                for index, crf_labels in enumerate(labels):
                    keep = False
                    for ner in crf_labels:
                        if ner == 'VALUE':
                            keep = True

                    if keep is True:
                        print(crf_labels)
                        keptFeatures.append(features[index])
                        keptLabels.append(labels[index])

                print("\nAFTER CLEANING NEGATIVE OCCURENCES")
                print('Count kept features: {}'.format(len(keptFeatures)))
                print('Count kept labels: {}'.format(len(keptLabels)))'''

                X = deepcopy(features)
                y = deepcopy(labels)

                X_train, y_train, X_test, y_test = self.splitTrainingData(X, y, 0.75)

                crf = sklearn_crfsuite.CRF(
                    algorithm='lbfgs',
                    c1=0.1,
                    c2=0.1,
                    max_iterations=30,
                    all_possible_transitions=True,
                )
                crf.fit(X_train, y_train)
                pickle.dump(crf, open(dump_file, 'wb'))

                labels = list(crf.classes_)
                print("CRF CLASSES: ")
                print(labels)

                y_pred = crf.predict(X_test)

                sorted_labels = sorted(
                    labels,
                    key=lambda name: (name[1:], name[0])
                )
                print(metrics.flat_classification_report(
                    y_test, y_pred, labels=sorted_labels, digits=3
                ))

                print(">>> flat accuracy: %.3f" % flat_accuracy_score(y_test, y_pred))

                print("Top likely transitions:")
                self.print_transitions(Counter(crf.transition_features_).most_common(20))

                print("\nTop positive:")
                self.print_state_features(Counter(crf.state_features_).most_common(30))

                '''print("\nTop negative:")
                self.print_state_features(Counter(crf.state_features_).most_common()[-30:])'''
                CRFs.append([attribute, crf])

        elapsed_time = time.time() - start_time
        print("Returned CRF Extractors: {}".format(len(CRFs)))
        print("CRF extractors: {}".format([name for (name, crf) in CRFs]))
        print("Elapsed time: {}".format(elapsed_time))
        return CRFs

    def print_state_features(self, state_features):
        for (attr, label), weight in state_features:
            print("%0.6f %-8s %s" % (weight, label, attr))

    def print_transitions(self, trans_features):
        for (label_from, label_to), weight in trans_features:
            print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

    def splitTrainingData(self, features, labels, division_ratio):
        trainSize = int(len(features) * division_ratio)
        x_train = []
        y_train = []
        x_test = []
        y_test = []

        while len(x_train) < trainSize and len(y_train) < trainSize:
            index = randrange(len(features))
            x_train.append(features.pop(index))
            y_train.append(labels.pop(index))

        while len(features) > 0:
            x_test.append(features.pop())
            y_test.append(labels.pop())

        return x_train, y_train, x_test, y_test

    def label2features(self, label, sent):
        tokens = word_tokenize(sent)
        postags = pos_tag(tokens)
        label = label.encode("utf-8")
        label_tk = word_tokenize(label.lower())
        return ['VALUE' if tags[0].encode("utf-8").lower() in label_tk else 'O' for tags in postags]

    def sent2features(self, sent):
        tokens = word_tokenize(sent)
        tags = pos_tag(tokens)

        return [self.word2features(tags, i) for i in range(len(tags))]

    def word2features(self, sent, i):
        word = sent[i][0]
        postag = sent[i][1]

        # features object definition with Token and postag added. Others default.
        features = {
            'token': word,
            'postag': postag,
            'np_chunk': 'none',
            'start_capital': False,
            'single_capital': False,
            'capital_period': False,
            'all_capital_period': False,
            'contains_number': False,
            'two_digits': False,
            'four_digits': False,
            'dollar_sign': False,
            'underline': False,
            'percentage': False,
            'purely_numeric': False,
            'number_type': False,
            'stop_word': False
        }

        # NP Chunk tag
        grammar = r"""NP:
        {<.*>+}          # Chunk everything
        }<VBD|IN>+{      # Chink sequences of VBD and IN
        """
        cp = nltk.RegexpParser(grammar)
        tree = cp.parse(sent)

        features['np_chunk'] = [subtree.label()
                                if sent[i][0] in [token for (token, tag) in subtree.leaves()] else 'none'
                                for subtree in tree.subtrees()][0]

        '''
            print(features['np_chunk'])
            for subtree in tree.subtrees():
                if sent[i][0] in [token for (token, tag) in subtree.leaves()]:
                    features['np_chunk'] = subtree.label()
                else:
                    features['np_chunk'] = 'none'
        '''

        # First token of sentence
        if i == 0:
            features['first'] = word

        # In first and second half of sentence
        if i < len(sent)/2:
            features['first_half'] = word
        elif i >= len(sent)/2:
            features['second_half'] = word

        # String normalization
        normalization = self.wordNormalization(word)
        features['normalization'] = normalization

        # Previous Tokens (window size = 5)
        previous_tokens = self.getTokensInWindow(sent, i, 5, "prev")

        features['previous_tokens'] = previous_tokens

        # Next Tokens (window size = 5)
        next_tokens = self.getTokensInWindow(sent, i, 5, "next")

        features['next_tokens'] = next_tokens

        # First letter capitalized
        if word[0].isupper():
            features['start_capital'] = True

        # Single capital
        if len(word)==1 and word.isupper():
            features['single_capital'] = True

        # Starts capital end period
        if word[0].isupper() and word[len(word)-1] == '.':
            features['capital_period'] = True

        # All capital end period
        capital_period_pattern = re.compile('^[A-Z]*\.$')
        if capital_period_pattern.match(word) is not None:
            features["all_capital_period"] = True

        # Contains at least one digit
        one_number_pattern = re.compile('[0-9]+')
        if one_number_pattern.match(word) is not None:
            features['contains_number'] = True

        # Two digits
        two_digits_pattern = re.compile('^[0-9]{2}$')
        if two_digits_pattern.match(word) is not None:
            features['two_digits'] = True

        # Four digits
        four_digits_pattern = re.compile('^[0-9]{4}$')
        if four_digits_pattern.match(word) is not None:
            features['four_digits'] = True

        # Contains dollar sign
        dollar_sign_pattern = re.compile('\$')
        if dollar_sign_pattern.match(word) is not None:
            features['dollar_sign'] = True

        # Contains uniderline
        underline_pattern = re.compile('\_')
        if underline_pattern.match(word) is not None:
            features['underline'] = True

        # Contains percentage
        percentage_pattern = re.compile('\%')
        if percentage_pattern.match(word) is not None:
            features['percentage'] = True

        # Purely numeric
        purely_numeric_pattern = re.compile('^\d+$')
        if purely_numeric_pattern.match(word) is not None:
            features['purely_numeric'] = True

        # Number type
        number_type_pattern = re.compile('(\d+((\.|,)*\d+)+((,)*\d+)*)*')
        if number_type_pattern.match(word) is not None:
            features['number_type'] = True

        # Stop word
        stop_words = ['the', 'a', 'of']
        if word in stop_words:
            features['stop_word'] = True

        # print(features)

        return features

    # capital to "A"
    # lowercase to "a"
    # digit to "1"
    # others to "0"
    def wordNormalization(self, word):
        normalization = ''
        digit_pattern = re.compile('\d')

        for character in word:
            if character.isupper():
                normalization += "A"
            elif character.islower():
                normalization += "a"
            elif digit_pattern.match(character):
                normalization += "1"
            else:
                normalization += "0"
        return normalization

    # return previous tokens in sentence including current index
    def getTokensInWindow(self, sent, current_index, window_size, type):

        returnedTokens = []
        for i in range(1, window_size):
            if len(returnedTokens) < window_size:
                index = 0
                if type == 'prev':
                    index = current_index - i
                elif type == 'next':
                    index = current_index + i

                if 0 <= index < len(sent):
                    returnedTokens.append(sent[index][0])
        return returnedTokens


In [12]:
files = glob.glob("../../sentence_scoring/out/*.txt")
tissue_txt_list = []
for filename in files:
    tissue_list.append(get_tissue_from_file(filename))
#Extractor().trainCRFModel(x_train, y_train)

FileNotFoundError: [Errno 2] No such file or directory: '../../sentence_scoring/out/../../sentence_scoring/out/bone_marrow.txt.xml'

In [128]:
a = "a" + "b" + "c"
type(a)

str