In [126]:
from bs4 import BeautifulSoup 
from bs4.element import Tag
import nltk

import pycrfsuite

import numpy as np
import pandas as pd

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [93]:
def get_tissue(file_name):
    """Parse an annotated XML file into POS-tagged, labelled sentences.

    Every ``<sentence>`` element found in ``file_name`` becomes one document.
    Text wrapped in a child tag is treated as part of a named entity and is
    labelled ``"N"``; bare text between tags is labelled ``"I"`` after commas
    and double quotes have been removed from it.  Tokens are obtained by
    splitting on single spaces, and empty tokens are dropped.

    Parameters
    ----------
    file_name : str
        Path of the XML file to read.

    Returns
    -------
    list of list of (str, str, str)
        One list per sentence, holding ``(word, pos_tag, label)`` triples.
        The POS tags are produced by ``nltk.pos_tag``.
    """
    # Read data file and parse the XML
    with open(file_name, "r") as infile:
        soup = BeautifulSoup(infile, 'html.parser')

    docs = []
    for elem in soup.find_all("sentence"):
        texts = []
        for child in elem:
            if isinstance(child, Tag):
                # part of a named entity (text is kept verbatim)
                label, raw_text = "N", child.text
            else:
                # irrelevant word: only here are commas and quotes stripped
                label, raw_text = "I", child.replace(",", "").replace("\"", "")
            texts.extend((word, label) for word in raw_text.split(" ") if word)
        docs.append(texts)

    data = []
    for doc in docs:
        # Obtain the list of tokens in the document
        tokens = [word for word, _ in doc]

        # Perform POS tagging
        tagged = nltk.pos_tag(tokens)

        # Take the word, POS tag, and its label
        data.append([(word, pos, label)
                     for (word, label), (_, pos) in zip(doc, tagged)])

    return data

In [96]:
def asd(cell_type):
    """Load the annotated sentences stored for one tissue / cell type.

    ``cell_type`` is the base name (without extension) of an XML file in
    ``../seed/usefull/``.  The file is parsed with ``get_tissue`` and the
    resulting list is returned unchanged.
    """
    return get_tissue("../seed/usefull/" + cell_type + ".xml")

# Load the three tissue corpora, in the same order as before
umbilical_cord, bone_marrow, adipose_tissue = (
    asd(tissue_name)
    for tissue_name in ("umbilical_cord", "bone_marrow", "adipose_tissue")
)

In [97]:
def word2features(sent, i):
    """Build the list of CRF feature strings for the token at position ``i``.

    ``sent`` is a sequence whose items carry the word at index 0 and its POS
    tag at index 1.  The result describes the token itself, then its left
    neighbour (or the marker ``'BOS'`` when there is none), then its right
    neighbour (or the marker ``'EOS'`` when there is none).
    """
    current = sent[i]
    token = current[0]
    tag = current[1]

    # Features of the token itself
    feats = [
        'bias',
        'word.lower=' + token.lower(),
        'word[-3:]=' + token[-3:],
        'word[-2:]=' + token[-2:],
        'word.isupper=%s' % token.isupper(),
        'word.istitle=%s' % token.istitle(),
        'word.isdigit=%s' % token.isdigit(),
        'postag=' + tag,
        'postag[:2]=' + tag[:2],
    ]

    # Left context
    if not i > 0:
        feats.append('BOS')
    else:
        prev_token = sent[i-1][0]
        prev_tag = sent[i-1][1]
        feats += [
            '-1:word.lower=' + prev_token.lower(),
            '-1:word.istitle=%s' % prev_token.istitle(),
            '-1:word.isupper=%s' % prev_token.isupper(),
            '-1:postag=' + prev_tag,
            '-1:postag[:2]=' + prev_tag[:2],
        ]

    # Right context
    if not i < len(sent)-1:
        feats.append('EOS')
    else:
        next_token = sent[i+1][0]
        next_tag = sent[i+1][1]
        feats += [
            '+1:word.lower=' + next_token.lower(),
            '+1:word.istitle=%s' % next_token.istitle(),
            '+1:word.isupper=%s' % next_token.isupper(),
            '+1:postag=' + next_tag,
            '+1:postag[:2]=' + next_tag[:2],
        ]

    return feats


def sent2features(sent):
    """Return one ``word2features`` list per token of ``sent``, in order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label of every ``(token, postag, label)`` triple in ``sent``."""
    collected = []
    for _token, _postag, label in sent:
        collected.append(label)
    return collected

def sent2tokens(sent):
    """Return the token of every ``(token, postag, label)`` triple in ``sent``."""
    collected = []
    for token, _postag, _label in sent:
        collected.append(token)
    return collected

In [102]:
# Build the feature sequence of a whole document
def extract_features(doc):
    """Return the ``word2features`` output for every position of ``doc``."""
    return [word2features(doc, position) for position, _ in enumerate(doc)]

# Build the label sequence of a whole document
def get_labels(doc):
    """Return the label of every ``(token, postag, label)`` triple in ``doc``."""
    doc_labels = []
    for _token, _postag, label in doc:
        doc_labels.append(label)
    return doc_labels

# For each tissue: x_* holds one feature sequence per document,
# y_* holds the matching label sequence.

# Umbilical Cord data
x_umbilical_cord = list(map(extract_features, umbilical_cord))
y_umbilical_cord = list(map(get_labels, umbilical_cord))

# Bone Marrow data
x_bone_marrow = list(map(extract_features, bone_marrow))
y_bone_marrow = list(map(get_labels, bone_marrow))

# Adipose Tissue data
x_adipose_tissue = list(map(extract_features, adipose_tissue))
y_adipose_tissue = list(map(get_labels, adipose_tissue))

### Umbilical Cord model

In [109]:
trainer_umbilical_cord = pycrfsuite.Trainer(verbose=True)

# Train on the two other tissues only; the umbilical cord documents are
# tagged and scored with this model further down.
x_umbilical_cord_train = x_adipose_tissue + x_bone_marrow
y_umbilical_cord_train = y_adipose_tissue + y_bone_marrow

# Submit training data to the trainer.
# NOTE: every call below must go through `trainer_umbilical_cord`. A bare
# `trainer` is not defined at this point of the notebook, and after an
# out-of-order run it would be a stale object holding other training data.
for xseq, yseq in zip(x_umbilical_cord_train, y_umbilical_cord_train):
    trainer_umbilical_cord.append(xseq, yseq)

# Set the parameters of the model
trainer_umbilical_cord.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.01,

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer_umbilical_cord.train('crf_umbilical_cord.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 4509
Seconds required: 0.023

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 1634.956545
Feature norm: 1.000000
Error norm: 1607.903864
Active features: 4491
Line search trials: 1
Line search step: 0.000142
Seconds required for this iteration: 0.006

***** Iteration #2 *****
Loss: 1369.465681
Feature norm: 0.852537
Error norm: 1419.631380
Active features: 4497
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.007

***** Iteration #3 *****
Loss: 1070.676328
Feature norm: 0.642934
Error norm: 1979.673775
Active features: 3353
Line search trials: 2
Line search step: 0.500000
Seconds required for this iter

***** Iteration #85 *****
Loss: 19.657987
Feature norm: 11.147084
Error norm: 2.068008
Active features: 142
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.007

***** Iteration #86 *****
Loss: 19.648864
Feature norm: 11.135485
Error norm: 1.316026
Active features: 140
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.007

***** Iteration #87 *****
Loss: 19.644084
Feature norm: 11.148197
Error norm: 1.571022
Active features: 140
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.007

***** Iteration #88 *****
Loss: 19.637145
Feature norm: 11.128700
Error norm: 1.886139
Active features: 140
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.004

***** Iteration #89 *****
Loss: 19.629485
Feature norm: 11.139378
Error norm: 0.941978
Active features: 140
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.

***** Iteration #141 *****
Loss: 19.420718
Feature norm: 11.123927
Error norm: 0.799462
Active features: 120
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.008

***** Iteration #142 *****
Loss: 19.418093
Feature norm: 11.125633
Error norm: 0.761229
Active features: 120
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.009

***** Iteration #143 *****
Loss: 19.417090
Feature norm: 11.127530
Error norm: 0.766568
Active features: 120
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.007

***** Iteration #144 *****
Loss: 19.415249
Feature norm: 11.128650
Error norm: 0.837921
Active features: 120
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.007

***** Iteration #145 *****
Loss: 19.413805
Feature norm: 11.130270
Error norm: 0.615309
Active features: 120
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteratio

***** Iteration #200 *****
Loss: 19.312251
Feature norm: 11.232543
Error norm: 1.028903
Active features: 110
Line search trials: 3
Line search step: 0.250000
Seconds required for this iteration: 0.013

L-BFGS terminated with the maximum number of iterations
Total seconds required for training: 1.216

Storing the model
Number of active features: 110 (4509)
Number of active attributes: 57 (4387)
Number of active labels: 2 (2)
Writing labels
Writing attributes
Writing feature references for transitions
Writing feature references for attributes
Seconds required: 0.001



In [122]:
# Result: tag the umbilical cord documents with the model trained above
tagger = pycrfsuite.Tagger()
tagger.open('crf_umbilical_cord.model')
y_pred = [tagger.tag(xseq) for xseq in x_umbilical_cord]

# Let's take a look at one sample sentence of the evaluation data
i = 18
# Feature 1 of every token is 'word.lower=<token>'.  Split on the first '='
# only, so that tokens which themselves contain '=' are printed in full.
sample_words = [feats[1].split("=", 1)[1] for feats in x_umbilical_cord[i]]
for predicted_tag, word in zip(y_pred[i], sample_words):
    print("%s (%s)" % (word, predicted_tag))

in (I)
this (I)
study (I)
we (I)
have (I)
analyzed (I)
dna (I)
methylation (I)
characteristics (I)
of (I)
human (I)
mesenchymal (I)
stem (I)
and (I)
progenitor (I)
cells (I)
(mspcs) (I)
form (I)
different (I)
tissue (I)
sources (I)
including (I)
bone (N)
marrow (N)
(bm) (I)
white (I)
adipose (N)
tissue (N)
(wat (I)
) (I)
umbilical (I)
cord (I)
(uc) (I)
as (I)
well (I)
as (I)
dermal (I)
fibroblasts (I)
by (I)
using (I)
the (I)
humanmethylation450k (I)
array. (I)


In [124]:
# Metrics
# Create a mapping of labels to indices
labels = {"N": 1, "I": 0}

# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_umbilical_cord for tag in row])

# Print out the classification report.  The class indices are passed
# explicitly so that they stay aligned with `target_names` even when one of
# the classes is absent from both `truths` and `predictions`.
print(classification_report(
    truths, predictions,
    labels=[labels["I"], labels["N"]],
    target_names=["I", "N"]))

             precision    recall  f1-score   support

          I       0.95      0.99      0.97       697
          N       0.00      0.00      0.00        36

avg / total       0.90      0.94      0.92       733



## All together

In [127]:
# Pool the documents of all three tissues
x = x_adipose_tissue + x_bone_marrow + x_umbilical_cord
y = y_adipose_tissue + y_bone_marrow + y_umbilical_cord

# Hold out 20% of the documents for testing.  The seed is fixed so that the
# split, and therefore the metrics reported below, is reproducible on re-run.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)

In [128]:
trainer = pycrfsuite.Trainer(verbose=True)

# Feed each training document (feature sequence + label sequence) to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Parameters of the model
crf_params = {
    'c1': 0.1,               # coefficient for L1 penalty
    'c2': 0.01,              # coefficient for L2 penalty
    'max_iterations': 200,   # maximum number of iterations
    # whether to include transitions that are possible, but not observed
    'feature.possible_transitions': True,
}
trainer.set_params(crf_params)

# train() receives the file name under which the model is saved once
# training is finished
trainer.train('crf.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 4433
Seconds required: 0.022

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 1401.038479
Feature norm: 1.000000
Error norm: 1343.112246
Active features: 4415
Line search trials: 1
Line search step: 0.000149
Seconds required for this iteration: 0.006

***** Iteration #2 *****
Loss: 1200.619726
Feature norm: 0.866230
Error norm: 1192.584192
Active features: 4403
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.004

***** Iteration #3 *****
Loss: 939.455368
Feature norm: 0.629945
Error norm: 1544.086599
Active features: 2946
Line search trials: 2
Line search step: 0.500000
Seconds required for this itera

***** Iteration #62 *****
Loss: 23.846891
Feature norm: 11.484117
Error norm: 1.428773
Active features: 176
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.006

***** Iteration #63 *****
Loss: 23.835587
Feature norm: 11.489607
Error norm: 1.921484
Active features: 176
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.004

***** Iteration #64 *****
Loss: 23.833602
Feature norm: 11.507430
Error norm: 4.180948
Active features: 172
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.003

***** Iteration #65 *****
Loss: 23.810810
Feature norm: 11.514304
Error norm: 2.252009
Active features: 172
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.003

***** Iteration #66 *****
Loss: 23.809466
Feature norm: 11.525860
Error norm: 3.240862
Active features: 170
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.

***** Iteration #115 *****
Loss: 23.445379
Feature norm: 11.696721
Error norm: 0.704384
Active features: 156
Line search trials: 4
Line search step: 0.125000
Seconds required for this iteration: 0.013

***** Iteration #116 *****
Loss: 23.442817
Feature norm: 11.694997
Error norm: 0.987611
Active features: 154
Line search trials: 4
Line search step: 0.125000
Seconds required for this iteration: 0.012

***** Iteration #117 *****
Loss: 23.440316
Feature norm: 11.693786
Error norm: 0.653893
Active features: 154
Line search trials: 4
Line search step: 0.125000
Seconds required for this iteration: 0.013

***** Iteration #118 *****
Loss: 23.438458
Feature norm: 11.688511
Error norm: 1.928018
Active features: 154
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.007

***** Iteration #119 *****
Loss: 23.433078
Feature norm: 11.687161
Error norm: 0.978204
Active features: 154
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteratio

***** Iteration #193 *****
Loss: 23.254283
Feature norm: 11.599318
Error norm: 1.161507
Active features: 152
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.006

***** Iteration #194 *****
Loss: 23.251765
Feature norm: 11.598832
Error norm: 0.876055
Active features: 152
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.006

***** Iteration #195 *****
Loss: 23.250450
Feature norm: 11.603457
Error norm: 1.113071
Active features: 150
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.007

***** Iteration #196 *****
Loss: 23.248217
Feature norm: 11.604612
Error norm: 0.891765
Active features: 150
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.007

***** Iteration #197 *****
Loss: 23.246401
Feature norm: 11.608207
Error norm: 0.563770
Active features: 150
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteratio

In [129]:
# Tag the held-out documents with the model trained on the pooled data
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Let's take a look at one sample sentence of the testing set
i = 12
# Feature 1 of every token is 'word.lower=<token>'.  Split on the first '='
# only, so that tokens which themselves contain '=' are printed in full.
sample_words = [feats[1].split("=", 1)[1] for feats in X_test[i]]
# Dedicated loop names: looping with `x, y` would overwrite the pooled
# dataset lists `x` and `y` that the train/test split was built from.
for predicted_tag, word in zip(y_pred[i], sample_words):
    print("%s (%s)" % (word, predicted_tag))

genomic (I)
dna (I)
from (I)
l-mpp (I)
of (I)
normal (I)
bone (N)
marrow (N)
id2710 (I)


In [130]:
# Convert the sequences of tags into a 1-dimensional array
# (`labels` is the tag -> index mapping created in the earlier metrics cell)
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

# Print out the classification report.  The class indices are passed
# explicitly so that they stay aligned with `target_names` even when the
# random test split happens to contain no token of one class.
print(classification_report(
    truths, predictions,
    labels=[labels["I"], labels["N"]],
    target_names=["I", "N"]))

             precision    recall  f1-score   support

          I       1.00      1.00      1.00       899
          N       0.96      0.96      0.96        90

avg / total       0.99      0.99      0.99       989

