## Bio_ClinicalBERT_prototype

In [1]:
import sys, time, math
import numpy as np
import statistics
import pandas as pd
import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.autograd import Variable

import sklearn
import sklearn.model_selection
from sklearn.preprocessing import LabelBinarizer



In [2]:
print("sys/python version ->", sys.version)
np.set_printoptions(suppress=True, precision=2)

sys/python version -> 3.7.7 (default, Mar 10 2020, 15:43:33) 
[Clang 11.0.0 (clang-1100.0.33.17)]


In [3]:
svrcm_file = './data/clinical_notes.csv'
bert_path = "./biobert_pretrain_output_all_notes_150000"

SEQUENCE_LEN = 200
TRAIN_BATCH_SIZE = 6
EPOCHS = 1

CATEGORIES = ['allergies',
 'chief_complaint',
 'cpt_code',
 'current_medication',
 'diag_code',
 'examination',
 'fam_hist',
 'hosp_hist',
 'illness_hist',
 'med_hist',
 'modifier',
 'social_hist',
 'surg_hist',
 'uncategorized']

MODEL_NAMES = ['burt_only', 'burt_ltsm']


### Model

In [4]:
def get_model_parameters(model):
    params = list(model.named_parameters())
    trainable_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print('model has {:} different named parameters.'.format(len(params)))
    print("trainable parameters:", trainable_parameters, '\n')

    
    print('==== Embedding Layer ====\n')
    for p in params[0:5]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
    print('\n==== First Transformer (of possible twelve) ====\n')
    for p in params[5:21]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
    print('\n==== Output Layer ====\n')
    for p in params[-4:]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
    return

In [5]:
class BioClinicalBert(nn.Module):
    
    def __init__(self, bert_path):
        super(BioClinicalBert, self).__init__()
        self.bert_path = bert_path
        self.bert_model = transformers.BertModel.from_pretrained(bert_path)
        self.bert_drop = nn.Dropout(0.3)
        self.fc = nn.Linear(768, len(CATEGORIES))
        self.activation = nn.Softmax(dim=1)

    
    def forward(self, ids, masks, token_type_ids):
        _, pooled = self.bert_model(ids, masks, token_type_ids)
        do_output = self.bert_drop(pooled)
        fc_output = self.fc(do_output)
        output = self.activation(fc_output)
        return output


In [6]:
class BioClinicalBertLSTM(nn.Module):
    
    def __init__(self, bert_path):
        super(BioClinicalBertLSTM, self).__init__()
        self.bert_path = bert_path
        
        self.bert_model = transformers.BertModel.from_pretrained(bert_path)
        self.bert_drop = nn.Dropout(0.1)
        self.lstm = nn.LSTM(768, 128, 
                       num_layers=1, 
                       bidirectional=True, 
                       batch_first=True)
        self.fc = nn.Linear(2*128, len(CATEGORIES))
        self.activation = nn.Softmax(dim=1)

    def forward(self, ids, masks, token_type_ids):
        unpooled, pooled = self.bert_model(ids, masks, token_type_ids)
        do_output = self.bert_drop(unpooled)
        lstm_output, (hidden, cell) = self.lstm(do_output)
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        fc_output = self.fc(hidden)
        output = self.activation(fc_output)
        return output


In [7]:
model_burt_only = BioClinicalBert(bert_path)
model_burt_ltsm = BioClinicalBertLSTM(bert_path)

In [8]:
get_model_parameters(model_burt_only)

model has 201 different named parameters.
trainable parameters: 108321038 

==== Embedding Layer ====

bert_model.embeddings.word_embeddings.weight            (28996, 768)
bert_model.embeddings.position_embeddings.weight          (512, 768)
bert_model.embeddings.token_type_embeddings.weight          (2, 768)
bert_model.embeddings.LayerNorm.weight                        (768,)
bert_model.embeddings.LayerNorm.bias                          (768,)

==== First Transformer (of possible twelve) ====

bert_model.encoder.layer.0.attention.self.query.weight    (768, 768)
bert_model.encoder.layer.0.attention.self.query.bias          (768,)
bert_model.encoder.layer.0.attention.self.key.weight      (768, 768)
bert_model.encoder.layer.0.attention.self.key.bias            (768,)
bert_model.encoder.layer.0.attention.self.value.weight    (768, 768)
bert_model.encoder.layer.0.attention.self.value.bias          (768,)
bert_model.encoder.layer.0.attention.output.dense.weight   (768, 768)
bert_model.encode

In [9]:
get_model_parameters(model_burt_ltsm)

model has 209 different named parameters.
trainable parameters: 109233422 

==== Embedding Layer ====

bert_model.embeddings.word_embeddings.weight            (28996, 768)
bert_model.embeddings.position_embeddings.weight          (512, 768)
bert_model.embeddings.token_type_embeddings.weight          (2, 768)
bert_model.embeddings.LayerNorm.weight                        (768,)
bert_model.embeddings.LayerNorm.bias                          (768,)

==== First Transformer (of possible twelve) ====

bert_model.encoder.layer.0.attention.self.query.weight    (768, 768)
bert_model.encoder.layer.0.attention.self.query.bias          (768,)
bert_model.encoder.layer.0.attention.self.key.weight      (768, 768)
bert_model.encoder.layer.0.attention.self.key.bias            (768,)
bert_model.encoder.layer.0.attention.self.value.weight    (768, 768)
bert_model.encoder.layer.0.attention.self.value.bias          (768,)
bert_model.encoder.layer.0.attention.output.dense.weight   (768, 768)
bert_model.encode

 ### Tokenizer

In [10]:
def get_tokenizer():
    tokenizer = transformers.BertTokenizer.from_pretrained(bert_path)
    vocab_size = tokenizer.vocab_size
    print("vocab_size ->", vocab_size)
    print("testing access to vocabulary")
    print("  tokenizer.convert_tokens_to_ids['vision'] ->", tokenizer.convert_tokens_to_ids(['vision']))
    print("  tokenizer.convert_ids_to_tokens(4152] ->" , tokenizer.convert_ids_to_tokens([4152]))
    return tokenizer, vocab_size

tokenizer, vocab_size = get_tokenizer()

vocab_size -> 28996
testing access to vocabulary
  tokenizer.convert_tokens_to_ids['vision'] -> [4152]
  tokenizer.convert_ids_to_tokens(4152] -> ['vision']


## clinical_notes Data Preperation

In [11]:
svrcm_file = './data/clinical_notes.csv'

In [12]:
SEQUENCE_LEN = 200
assert SEQUENCE_LEN < 512, "The maximun permissible sequence length for BERT is 512"

CATEGORIES = ['allergies',
 'chief_complaint',
 'cpt_code',
 'current_medication',
 'diag_code',
 'examination',
 'fam_hist',
 'hosp_hist',
 'illness_hist',
 'med_hist',
 'modifier',
 'social_hist',
 'surg_hist',
 'uncategorized']

In [13]:
def validate_categories(df):
    values = df.values
    values = [line.strip() for line in values]
    arr_binary = np.in1d(values, CATEGORIES)
    bad_rows = np.flatnonzero(arr_binary == False)
    excel_bad_rows = bad_rows+2
    if excel_bad_rows.size>0 :
        print("DATA ERRORS -> clinical_notes.csv file as following excel rows with bad categories:\n",
              excel_bad_rows)
    return

In [14]:
def validate_clinical_notes(df):
    len_arr = [len(note) for note in df]
    len_arr.sort()
    print("df_notes -> shape {}, min_len {}, max_len {}, mean {}, median {}".format(
        df.shape, len_arr[0], len_arr[-1],
        int(statistics.mean(len_arr)), int(statistics.median(len_arr))))
    assert  len_arr[-1] < SEQUENCE_LEN, "notes include a line greater than max allowed"
    return

In [15]:
def download_clinical_notes(svrcm_file):
    df_notes = pd.read_csv(svrcm_file, engine='python')
    validate_categories(df_notes['category'])
    validate_clinical_notes(df_notes['notes'])
    
    return df_notes

df_notes = download_clinical_notes(svrcm_file)

df_notes -> shape (210,), min_len 2, max_len 175, mean 59, median 54


In [16]:
def vectorize_batch(patient_id, sentence):
    encoded_dict = tokenizer.encode_plus(
        sentence, add_special_tokens = True, max_length = SEQUENCE_LEN, pad_to_max_length = True,
        return_attention_mask = True, return_tensors = 'pt',)        
    return encoded_dict

In [17]:
def vectorize_dataset(df):
    input_ids = []
    
    for i, notes in enumerate(df['notes']):
        patient_id = df.id[i]
        encoded_dict = vectorize_batch(patient_id, notes)
        input_ids.append(encoded_dict)
   
    print("input_ids -> {}, {}\ninput_ids[0] -> {}, {}".format(
        type(input_ids), len(input_ids), type(input_ids[0]), len(input_ids[0])))

    return input_ids

input_ids = vectorize_dataset(df_notes)

input_ids -> <class 'list'>, 210
input_ids[0] -> <class 'dict'>, 3


In [18]:
df_notes.category

0      chief_complaint
1      chief_complaint
2      chief_complaint
3         illness_hist
4      chief_complaint
            ...       
205        examination
206        examination
207        examination
208          diag_code
209           modifier
Name: category, Length: 210, dtype: object

In [19]:
CATEGORIES

['allergies',
 'chief_complaint',
 'cpt_code',
 'current_medication',
 'diag_code',
 'examination',
 'fam_hist',
 'hosp_hist',
 'illness_hist',
 'med_hist',
 'modifier',
 'social_hist',
 'surg_hist',
 'uncategorized']

In [20]:
def idx_encode_category(df):
    labels = []
    for cat in df.category:
        labels.append(CATEGORIES.index(cat))
    print("labels -> {} {}".format(type(labels), len(labels)))
    return labels


In [21]:
labels = idx_encode_category(df_notes)

labels -> <class 'list'> 210


In [22]:
print("input_ids          : ", type(input_ids), len(input_ids))
print("input_ids[7]       : ", type(input_ids[7]), len(input_ids[7]))
print("labels             : ", type(labels), len(labels))

input_ids          :  <class 'list'> 210
input_ids[7]       :  <class 'dict'> 3
labels             :  <class 'list'> 210


In [23]:
def create_patient_dataset(df):
    df['input_ids'] = input_ids
    df['labels'] = labels

    df_1 = df.groupby(['id'])['notes'].apply(list)
    df_2 = df.groupby(['id'])['category'].apply(list)
    df_3 = df.groupby(['id'])['input_ids'].apply(list)
    df_4 = df.groupby(['id'])['labels'].apply(list)
    
    df_dataset = pd.concat([df_1, df_2], axis=1)
    df_dataset = pd.concat([df_dataset, df_3], axis=1)
    df_dataset = pd.concat([df_dataset, df_4], axis=1)
    df_dataset.reset_index(inplace=True)

    assert (df_dataset.shape[0]==len(df.id.unique()) and
            df_dataset.shape[1]==5 ), "df_dataset shape mismatch"
    print("df_dataset (patients, col)->", df_dataset.shape)
    return df_dataset

df_dataset = create_patient_dataset(df_notes)

df_dataset (patients, col)-> (9, 5)


### Device to run on

In [24]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_burt_only = model_burt_only.to(device)
model_burt_ltsm = model_burt_ltsm.to(device)

### Predictions with no training

In [25]:
def predict(model):
    preds = []
    preds_max_idx = []
    preds_cat = []
    
    for i, sample in enumerate(input_ids):
        pred = model(
            input_ids[i]['input_ids'],
            input_ids[i]['token_type_ids'],
            input_ids[i]['attention_mask'])
        
        pred = pred.cpu().detach().numpy().flatten()
        pred_idx = np.argmax(pred)
        pred_cat = CATEGORIES[pred_idx]
        
        softmax_sum = np.around(np.sum(pred), decimals=0)
        assert  softmax_sum == 1, "softmax does not addup to 1"
        preds.append(pred)
        preds_max_idx.append(pred_idx)
        preds_cat.append(pred_cat)

    return preds, preds_max_idx, preds_cat

                                                                                       

In [26]:
preds_0, preds_max_idx_0, preds_cat_0 = predict(model_burt_only)

In [27]:
preds_1, preds_max_idx_1, preds_cat_1 = predict(model_burt_ltsm)

In [28]:
def result():
    dict_result = {"id" : df_notes.id,
            "true_cat": df_notes.category,
            "burt_only_pred" : preds_cat_0,
            "burt_only_score": None,
            "burt_ltsm_pred" : preds_cat_1,
            "burt_ltsm_score" : None}
    
    df_result = pd.DataFrame(dict_result)

    df_result.burt_only_score = np.where(
        df_result.true_cat == df_result.burt_only_pred, 100, 0)
    df_result.burt_ltsm_score = np.where(
        df_result.true_cat == df_result.burt_ltsm_pred, 100, 0)
    
    burt_only_cumsum = (df_result.burt_only_score==100).sum()
    burt_only_pct = burt_only_cumsum/len(df_result.burt_only_score)
    
    burt_ltsm_cumsum = (df_result.burt_ltsm_score==100).sum()
    burt_ltsm_pct = burt_ltsm_cumsum/len(df_result.burt_ltsm_score)

    
    print("burt_only_pct success {:.0%} with following rows# that passed".format(burt_only_pct))
    [print("    ", i, ":", row) for i, row in enumerate(
        df_result.burt_only_score) if row==100]

    print("\nburt_ltsm_pct success {:.0%} with following row# that passed\n".format(burt_ltsm_pct))
    [print("    ", i, ":", row) for i, row in enumerate(
        df_result.burt_ltsm_score) if row==100]

    return df_result

df_result = result()


burt_only_pct success 7% with following rows# that passed
     4 : 100
     9 : 100
     43 : 100
     44 : 100
     51 : 100
     78 : 100
     116 : 100
     119 : 100
     140 : 100
     141 : 100
     148 : 100
     161 : 100
     162 : 100
     193 : 100

burt_ltsm_pct success 1% with following row# that passed

     51 : 100
     120 : 100
     155 : 100


In [29]:
df_result

Unnamed: 0,id,true_cat,burt_only_pred,burt_only_score,burt_ltsm_pred,burt_ltsm_score
0,11111,chief_complaint,social_hist,0,social_hist,0
1,11111,chief_complaint,modifier,0,social_hist,0
2,11111,chief_complaint,uncategorized,0,uncategorized,0
3,11111,illness_hist,diag_code,0,social_hist,0
4,11111,chief_complaint,chief_complaint,100,uncategorized,0
...,...,...,...,...,...,...
205,81118,examination,social_hist,0,uncategorized,0
206,81118,examination,uncategorized,0,uncategorized,0
207,81118,examination,uncategorized,0,hosp_hist,0
208,81118,diag_code,med_hist,0,uncategorized,0


## Training

In [30]:
optim_dict = { MODEL_NAMES[0] : AdamW(model_burt_only.parameters(), lr=2e-5),
                   MODEL_NAMES[1] : AdamW(model_burt_ltsm.parameters(), lr=5e-3)}

optim_dict

{'burt_only': AdamW (
 Parameter Group 0
     betas: (0.9, 0.999)
     correct_bias: True
     eps: 1e-06
     lr: 2e-05
     weight_decay: 0.0
 ),
 'burt_ltsm': AdamW (
 Parameter Group 0
     betas: (0.9, 0.999)
     correct_bias: True
     eps: 1e-06
     lr: 0.005
     weight_decay: 0.0
 )}

In [31]:
loss_fn = nn.CrossEntropyLoss()

In [32]:
def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs==labels)/float(labels.size)

metrics_dict = { 'accuracy': accuracy, 'loss' : 'loss'}

In [33]:
metrics_dict

{'accuracy': <function __main__.accuracy(out, labels)>, 'loss': 'loss'}

In [34]:
device = "cpu"

In [35]:
def get_prediction(dic):
    pred = model_burt_only(
        dic['input_ids'],
        dic['token_type_ids'],
        dic['attention_mask'])
    return pred


In [36]:
def train(df, model=model_burt_only, optimizer=optim_dict["burt_only"], num_epochs=5):
    model = model.to(device)
    print('training on', device)
    X = df.input_ids
    Y = df.labels
    batch_count = 0

    for epoch in range(num_epochs):
        print('starting epoch#', epoch+1, '...')
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        
        for i in range(len(X)):
            Y_hat_batch = []
            for j in range(len(X[i])): 
                Y_hat = get_prediction(X[i][j])
                Y_hat = Y_hat.flatten().tolist()
                Y_hat_batch.append(Y_hat)
            
            Y_hat_batch = torch.tensor(Y_hat_batch)
            labels_batch = Y[i]
            labels_batch = torch.tensor(np.array(labels_batch), dtype=torch.int64)

            loss = loss_fn(Y_hat_batch, labels_batch)
            loss.requires_grad = True
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            num_sample_passed = (Y_hat_batch.argmax(dim=1) == labels_batch).sum().cpu().item()
            pct_success = 100*num_sample_passed/labels_batch.shape[0]
            
            print("{} -> num sampled {} avg_loss per sample: {:.3f},   pct succeeded: {:.1f}%".format(
            i, list(labels_batch.size()), loss/labels_batch.shape[0], pct_success))

            train_l_sum += loss.cpu().item()
            train_acc_sum += (Y_hat_batch.argmax(dim=1) == labels_batch).sum().cpu().item()
            n += labels_batch.shape[0]
            batch_count += 1 
         
        print("epoch {}, avg loss per sample {:.4f}, train acc {:.1f}%, time {:.1f} sec\n".format(
            epoch + 1, train_l_sum / n, 100*train_acc_sum / n, time.time() - start))
        
train(df_dataset)

training on cpu
starting epoch# 1 ...
0 -> num sampled [52] avg_loss per sample: 0.051,   pct succeeded: 1.9%
1 -> num sampled [29] avg_loss per sample: 0.091,   pct succeeded: 0.0%
2 -> num sampled [36] avg_loss per sample: 0.074,   pct succeeded: 0.0%
3 -> num sampled [34] avg_loss per sample: 0.078,   pct succeeded: 5.9%
4 -> num sampled [12] avg_loss per sample: 0.221,   pct succeeded: 0.0%
5 -> num sampled [17] avg_loss per sample: 0.156,   pct succeeded: 0.0%
6 -> num sampled [14] avg_loss per sample: 0.190,   pct succeeded: 0.0%
7 -> num sampled [5] avg_loss per sample: 0.528,   pct succeeded: 0.0%
8 -> num sampled [11] avg_loss per sample: 0.241,   pct succeeded: 0.0%
epoch 1, avg loss per sample 0.1136, train acc 1.4%, time 50.6 sec

starting epoch# 2 ...
0 -> num sampled [52] avg_loss per sample: 0.051,   pct succeeded: 3.8%
1 -> num sampled [29] avg_loss per sample: 0.091,   pct succeeded: 6.9%
2 -> num sampled [36] avg_loss per sample: 0.074,   pct succeeded: 2.8%
3 -> num 

In [None]:
'''
# BioClinicalBert Test Results
* All test were conducted with same data of 9 patients with 210 lines in 14 categories
## Untrained Raw Predictions without any training
* burt_only_pct success 0% , second time pct success 7% 
* burt_ltsm_pct success 33%, second time pct success 1%

## Trained on burt_only ##
training on cpu
starting epoch# 1 ...
0 -> num sampled [52] avg_loss per sample: 0.051,   pct succeeded: 1.9%
1 -> num sampled [29] avg_loss per sample: 0.091,   pct succeeded: 0.0%
2 -> num sampled [36] avg_loss per sample: 0.074,   pct succeeded: 0.0%
3 -> num sampled [34] avg_loss per sample: 0.078,   pct succeeded: 5.9%
4 -> num sampled [12] avg_loss per sample: 0.221,   pct succeeded: 0.0%
5 -> num sampled [17] avg_loss per sample: 0.156,   pct succeeded: 0.0%
6 -> num sampled [14] avg_loss per sample: 0.190,   pct succeeded: 0.0%
7 -> num sampled [5] avg_loss per sample: 0.528,   pct succeeded: 0.0%
8 -> num sampled [11] avg_loss per sample: 0.241,   pct succeeded: 0.0%
epoch 1, avg loss per sample 0.1136, train acc 1.4%, time 50.6 sec

starting epoch# 2 ...
0 -> num sampled [52] avg_loss per sample: 0.051,   pct succeeded: 3.8%
1 -> num sampled [29] avg_loss per sample: 0.091,   pct succeeded: 6.9%
2 -> num sampled [36] avg_loss per sample: 0.074,   pct succeeded: 2.8%
3 -> num sampled [34] avg_loss per sample: 0.078,   pct succeeded: 5.9%
4 -> num sampled [12] avg_loss per sample: 0.221,   pct succeeded: 8.3%
5 -> num sampled [17] avg_loss per sample: 0.156,   pct succeeded: 5.9%
6 -> num sampled [14] avg_loss per sample: 0.190,   pct succeeded: 0.0%
7 -> num sampled [5] avg_loss per sample: 0.526,   pct succeeded: 0.0%
8 -> num sampled [11] avg_loss per sample: 0.241,   pct succeeded: 18.2%
epoch 2, avg loss per sample 0.1135, train acc 5.2%, time 52.6 sec

starting epoch# 3 ...
0 -> num sampled [52] avg_loss per sample: 0.051,   pct succeeded: 5.8%
1 -> num sampled [29] avg_loss per sample: 0.091,   pct succeeded: 0.0%
2 -> num sampled [36] avg_loss per sample: 0.074,   pct succeeded: 0.0%
3 -> num sampled [34] avg_loss per sample: 0.078,   pct succeeded: 2.9%
4 -> num sampled [12] avg_loss per sample: 0.221,   pct succeeded: 0.0%
5 -> num sampled [17] avg_loss per sample: 0.156,   pct succeeded: 5.9%
6 -> num sampled [14] avg_loss per sample: 0.189,   pct succeeded: 0.0%
7 -> num sampled [5] avg_loss per sample: 0.527,   pct succeeded: 0.0%
8 -> num sampled [11] avg_loss per sample: 0.241,   pct succeeded: 9.1%
epoch 3, avg loss per sample 0.1135, train acc 2.9%, time 52.5 sec

starting epoch# 4 ...
0 -> num sampled [52] avg_loss per sample: 0.051,   pct succeeded: 7.7%
1 -> num sampled [29] avg_loss per sample: 0.091,   pct succeeded: 3.4%
2 -> num sampled [36] avg_loss per sample: 0.074,   pct succeeded: 2.8%
3 -> num sampled [34] avg_loss per sample: 0.078,   pct succeeded: 2.9%
4 -> num sampled [12] avg_loss per sample: 0.220,   pct succeeded: 8.3%
5 -> num sampled [17] avg_loss per sample: 0.156,   pct succeeded: 5.9%
6 -> num sampled [14] avg_loss per sample: 0.189,   pct succeeded: 0.0%
7 -> num sampled [5] avg_loss per sample: 0.531,   pct succeeded: 0.0%
8 -> num sampled [11] avg_loss per sample: 0.240,   pct succeeded: 27.3%
epoch 4, avg loss per sample 0.1135, train acc 5.7%, time 50.1 sec

starting epoch# 5 ...
0 -> num sampled [52] avg_loss per sample: 0.051,   pct succeeded: 7.7%
1 -> num sampled [29] avg_loss per sample: 0.091,   pct succeeded: 0.0%
2 -> num sampled [36] avg_loss per sample: 0.074,   pct succeeded: 0.0%
3 -> num sampled [34] avg_loss per sample: 0.078,   pct succeeded: 5.9%
4 -> num sampled [12] avg_loss per sample: 0.220,   pct succeeded: 0.0%
5 -> num sampled [17] avg_loss per sample: 0.156,   pct succeeded: 5.9%
6 -> num sampled [14] avg_loss per sample: 0.190,   pct succeeded: 0.0%
7 -> num sampled [5] avg_loss per sample: 0.527,   pct succeeded: 20.0%
8 -> num sampled [11] avg_loss per sample: 0.240,   pct succeeded: 9.1%
epoch 5, avg loss per sample 0.1135, train acc 4.3%, time 50.9 sec

## Trained on burt_ltsm ##
training on cpu
starting epoch# 1 ...
0 -> num sampled [52] avg_loss per sample: 0.051,   pct succeeded: 3.8%
1 -> num sampled [29] avg_loss per sample: 0.092,   pct succeeded: 0.0%
2 -> num sampled [36] avg_loss per sample: 0.074,   pct succeeded: 2.8%
3 -> num sampled [34] avg_loss per sample: 0.078,   pct succeeded: 0.0%
4 -> num sampled [12] avg_loss per sample: 0.221,   pct succeeded: 8.3%
5 -> num sampled [17] avg_loss per sample: 0.157,   pct succeeded: 0.0%
6 -> num sampled [14] avg_loss per sample: 0.190,   pct succeeded: 0.0%
7 -> num sampled [5] avg_loss per sample: 0.527,   pct succeeded: 0.0%
8 -> num sampled [11] avg_loss per sample: 0.242,   pct succeeded: 0.0%
epoch 1, avg loss per sample 0.1140, train acc 1.9%, time 47.9 sec

starting epoch# 2 ...
0 -> num sampled [52] avg_loss per sample: 0.051,   pct succeeded: 0.0%
1 -> num sampled [29] avg_loss per sample: 0.092,   pct succeeded: 3.4%
2 -> num sampled [36] avg_loss per sample: 0.074,   pct succeeded: 2.8%
3 -> num sampled [34] avg_loss per sample: 0.078,   pct succeeded: 5.9%
4 -> num sampled [12] avg_loss per sample: 0.220,   pct succeeded: 8.3%
5 -> num sampled [17] avg_loss per sample: 0.157,   pct succeeded: 0.0%
6 -> num sampled [14] avg_loss per sample: 0.190,   pct succeeded: 0.0%
7 -> num sampled [5] avg_loss per sample: 0.526,   pct succeeded: 0.0%
8 -> num sampled [11] avg_loss per sample: 0.242,   pct succeeded: 0.0%
epoch 2, avg loss per sample 0.1139, train acc 2.4%, time 49.1 sec

starting epoch# 3 ...
0 -> num sampled [52] avg_loss per sample: 0.051,   pct succeeded: 1.9%
1 -> num sampled [29] avg_loss per sample: 0.092,   pct succeeded: 0.0%
2 -> num sampled [36] avg_loss per sample: 0.074,   pct succeeded: 2.8%
3 -> num sampled [34] avg_loss per sample: 0.078,   pct succeeded: 0.0%
4 -> num sampled [12] avg_loss per sample: 0.221,   pct succeeded: 0.0%
5 -> num sampled [17] avg_loss per sample: 0.157,   pct succeeded: 0.0%
6 -> num sampled [14] avg_loss per sample: 0.190,   pct succeeded: 0.0%
7 -> num sampled [5] avg_loss per sample: 0.526,   pct succeeded: 0.0%
8 -> num sampled [11] avg_loss per sample: 0.242,   pct succeeded: 0.0%
epoch 3, avg loss per sample 0.1140, train acc 1.0%, time 51.1 sec

starting epoch# 4 ...
0 -> num sampled [52] avg_loss per sample: 0.051,   pct succeeded: 0.0%
1 -> num sampled [29] avg_loss per sample: 0.092,   pct succeeded: 0.0%
2 -> num sampled [36] avg_loss per sample: 0.074,   pct succeeded: 2.8%
3 -> num sampled [34] avg_loss per sample: 0.078,   pct succeeded: 0.0%
4 -> num sampled [12] avg_loss per sample: 0.221,   pct succeeded: 8.3%
5 -> num sampled [17] avg_loss per sample: 0.156,   pct succeeded: 0.0%
6 -> num sampled [14] avg_loss per sample: 0.190,   pct succeeded: 0.0%
7 -> num sampled [5] avg_loss per sample: 0.526,   pct succeeded: 0.0%
8 -> num sampled [11] avg_loss per sample: 0.243,   pct succeeded: 0.0%
epoch 4, avg loss per sample 0.1139, train acc 1.0%, time 50.4 sec

starting epoch# 5 ...
0 -> num sampled [52] avg_loss per sample: 0.051,   pct succeeded: 0.0%
1 -> num sampled [29] avg_loss per sample: 0.092,   pct succeeded: 0.0%
2 -> num sampled [36] avg_loss per sample: 0.074,   pct succeeded: 2.8%
3 -> num sampled [34] avg_loss per sample: 0.078,   pct succeeded: 5.9%
4 -> num sampled [12] avg_loss per sample: 0.221,   pct succeeded: 8.3%
5 -> num sampled [17] avg_loss per sample: 0.157,   pct succeeded: 0.0%
6 -> num sampled [14] avg_loss per sample: 0.190,   pct succeeded: 0.0%
7 -> num sampled [5] avg_loss per sample: 0.526,   pct succeeded: 0.0%
8 -> num sampled [11] avg_loss per sample: 0.242,   pct succeeded: 9.1%
epoch 5, avg loss per sample 0.1139, train acc 2.4%, time 50.7 sec

'''