In [27]:
import argparse
import numpy as np
import os
import pandas as pd
import sys
import torch
import warnings
import lime
import shap
import torch.nn.functional as F
import seaborn as sns
import matplotlib.pyplot as plt

# Suppress all Python warnings for the remainder of the notebook session.
warnings.filterwarnings("ignore")

# code_dir: parent of the current working directory.
# base_dir: code_dir with its last '/'-separated component removed.
code_dir = os.path.dirname(os.getcwd())
base_dir = "/".join(code_dir.split('/')[:-1])

# Make the project packages importable (order matches the original appends).
for _search_path in (
    code_dir,
    f"{base_dir}/code/turing/examples-raw/gluesst_finetune/",
    f"{base_dir}/code/turing/src/",
):
    sys.path.append(_search_path)

from argparse import Namespace
from methods.bag_of_ngrams.processing import cleanReports, cleanSplit, stripChars
from pyfunctions.general import extractListFromDic, readJson
from pyfunctions.pathology import extract_synoptic, fixProstateLabels, fixLabel, exclude_labels
from sklearn import preprocessing
from sklearn.metrics import f1_score
from turing.pathology.run_classifier import MODEL_CLASSES, processors, train_path as train
from methods.torch.processing import make_weights_for_balanced_classes
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from tqdm import tqdm, trange
from transformers import AutoTokenizer, AutoModel
from transformers import BertTokenizer, BertForSequenceClassification
from turing.pathology.path_utils import evaluate, extract_features, load_tnlr_base, load_tnlr_tokenizer, path_dataset
from sklearn.model_selection import train_test_split
from lime.lime_text import LimeTextExplainer

# Run settings, grouped by purpose. Later cells read these through `args[...]`.
args = dict(
    # Model / task identification
    model_name_or_path='',
    task_name='sst-2',
    config_name='',
    tokenizer_name='',
    # Phases to run
    do_train=True,
    do_eval=True,
    evaluate_during_training=True,
    # Input handling
    max_seq_length=512,
    do_lower_case=True,
    # Batching
    per_gpu_train_batch_size=8,
    per_gpu_eval_batch_size=8,
    gradient_accumulation_steps=1,
    # Optimisation
    learning_rate=7.6e-6,
    weight_decay=0.01,
    adam_epsilon=1e-8,
    max_grad_norm=1,
    max_steps=-1,
    warmup_ratio=0.2,
    # Logging / checkpointing
    logging_steps=50,
    eval_all_checkpoints=True,
    no_cuda=False,
    overwrite_output_dir=True,
    seed=42,
    overwrite_cache=True,
    metric_for_choose_best_checkpoint=None,
    # Mixed precision / distributed
    fp16=False,
    fp16_opt_level='O1',
    local_rank=-1,
    # Schedule and hardware
    num_train_epochs=25,
    n_gpu=1,
    device='cuda',
    # Selects the tokenizer branch and the checkpoint directory name
    model_type='clinical_biobert',
)

# Read in data
# The dataset is loaded through the project helper readJson. The code below
# indexes the result by split name ('train', 'val', 'test', 'dev_test'), so it
# is presumably a dict of lists of patient records — confirm against readJson.
path = f"{base_dir}/path_nlp_turing/data/prostate.json"
data = readJson(path)

# Clean reports
# cleanSplit is applied to the whole dataset; the 'dev_test' entry is then
# cleaned separately with cleanReports. Both receive stripChars as an argument.
# NOTE(review): presumably cleanSplit does not touch 'dev_test' — confirm.
data = cleanSplit(data, stripChars)
data['dev_test'] = cleanReports(data['dev_test'], stripChars)

# Post-process labels with the project helper fixLabel (exact rules are not
# visible here; see pyfunctions.pathology).
data = fixLabel(data)

# Attribute-style view of the settings in `args`. This is a snapshot taken
# now: keys added to `args` afterwards are not reflected in `kwargs`.
kwargs = Namespace(**args)

# Tokenizer
# Select the tokenizer that matches args['model_type'].
#   - Hugging Face models record their hub checkpoint id in `bert_path`.
#   - 'tnlr' instead defines local `checkpoint_file`, `config_file` and
#     `vocab_file` paths and loads its tokenizer from the vocab file.
# Each checkpoint id is written once so `bert_path` and the tokenizer that is
# actually loaded cannot drift apart.
HF_AUTO_CHECKPOINTS = {
    'pubmed_bert': "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
    'biobert': "dmis-lab/biobert-v1.1",
    'clinical_biobert': "emilyalsentzer/Bio_ClinicalBERT",
}

if args['model_type'] == 'bert':
    bert_path = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(bert_path)
elif args['model_type'] in HF_AUTO_CHECKPOINTS:
    bert_path = HF_AUTO_CHECKPOINTS[args['model_type']]
    tokenizer = AutoTokenizer.from_pretrained(bert_path, local_files_only=False)
elif args['model_type'] == 'tnlr':
    checkpoint_file = f'{base_dir}/path_nlp_turing/turing/src/tnlr/checkpoints/tnlrv3-base.pt'
    config_file = f'{base_dir}/path_nlp_turing/turing/src/tnlr/config/tnlr-base-uncased-config.json'
    vocab_file = f'{base_dir}/path_nlp_turing/turing/src/tnlr/tokenizer/tnlr-uncased-vocab.txt'
    tokenizer = load_tnlr_tokenizer(vocab_file)
else:
    # Fail fast. Without this branch an unrecognised model_type leaves
    # `tokenizer` undefined on a fresh kernel or, worse, silently reuses a
    # tokenizer left over from an earlier execution of this cell.
    raise ValueError(
        f"Unsupported model_type {args['model_type']!r}; expected one of "
        f"{['bert', *HF_AUTO_CHECKPOINTS, 'tnlr']}"
    )

# A) Evaluate on best epoch

In [28]:
# this is working evaluation code -- Mar 17 2023
#
# For every run index and every field:
#   1. build the label <-> integer-id mapping from the original train split
#      (labels seen only in val/test get the next free ids),
#   2. pool all documents and re-split them with random_state = run index,
#   3. load the fine-tuned checkpoint saved for that run/field,
#   4. evaluate on the re-split test set, store micro/macro scores in
#      `results`, and print a confusion matrix.

fields = ['SeminalVesicleNone']#'PrimaryGleason', 'SecondaryGleason', 'MarginStatusNone', 'SeminalVesicleNone']
results = {field: {'macro': [], 'micro': []} for field in fields}

# The extracted text depends on neither the run nor the field, so compute it
# once per split instead of once per (run, field) iteration.
# NOTE(review): assumes extract_synoptic is deterministic and side-effect free — confirm.
synoptic_documents = {
    split: [extract_synoptic(patient['document'].lower(), tokenizer) for patient in data[split]]
    for split in ('train', 'val', 'test')
}

# Evaluation historically ran on GPU 1; fall back to GPU 0 when the machine
# exposes a single device so the cell does not crash there.
eval_gpu_index = 1 if torch.cuda.device_count() > 1 else 0

for i in range(3):
    args['run'] = i

    for field in fields:
        # Fresh list copies per iteration so the cached lists are never
        # modified by the filtering below.
        train_documents = list(synoptic_documents['train'])
        train_labels = [patient['labels'][field] for patient in data['train']]

        val_documents = list(synoptic_documents['val'])
        val_labels = [patient['labels'][field] for patient in data['val']]

        test_documents = list(synoptic_documents['test'])
        test_labels = [patient['labels'][field] for patient in data['test']]

        if field in ['PrimaryGleason', 'SecondaryGleason']:
            train_documents, train_labels = exclude_labels(train_documents, train_labels)
            val_documents, val_labels = exclude_labels(val_documents, val_labels)
            test_documents, test_labels = exclude_labels(test_documents, test_labels)

        print(len(train_documents), len(val_documents), len(test_documents))
        le = preprocessing.LabelEncoder()
        le.fit(train_labels)

        # Map raw label to processed label
        le_dict = dict(zip(le.classes_, le.transform(le.classes_)))
        le_dict = {str(key): le_dict[key] for key in le_dict}

        # Labels that never occur in the train split get the next free ids.
        for label in val_labels + test_labels:
            if str(label) not in le_dict:
                le_dict[str(label)] = len(le_dict)

        train_labels = [le_dict[str(label)] for label in train_labels]
        val_labels = [le_dict[str(label)] for label in val_labels]
        test_labels = [le_dict[str(label)] for label in test_labels]

        # Map processed label back to raw label
        inv_le_dict = {v: k for k, v in le_dict.items()}
        print(le_dict)

        # Pool everything and re-split with the run index as the seed, keeping
        # the original test and validation proportions.
        documents_full = train_documents + val_documents + test_documents
        labels_full = train_labels + val_labels + test_labels

        p_test = len(test_labels) / len(labels_full)
        p_val = len(val_labels) / (len(train_labels) + len(val_labels))

        train_documents, test_documents, train_labels, test_labels = train_test_split(
            documents_full,
            labels_full,
            test_size=p_test,
            random_state=args['run'],
        )

        # The train/val split is not used by the evaluation below; it is kept
        # so these notebook-level variables hold the same values as before.
        train_documents, val_documents, train_labels, val_labels = train_test_split(
            train_documents,
            train_labels,
            test_size=p_val,
            random_state=args['run'],
        )

        model_path = f"{base_dir}/path_nlp_turing/output/fine_tuning/{args['model_type']}_{args['run']}/{field}"
        checkpoint_file = f"{model_path}/save_output"
        config_file = f"{model_path}/save_output/config.json"

        if args['model_type'] != 'tnlr':
            model = BertForSequenceClassification.from_pretrained(checkpoint_file, num_labels=len(le_dict))
        else:
            model = load_tnlr_base(checkpoint_file, config_file, model_type='tnlrv3_classification', num_labels=len(le_dict))

        with torch.cuda.device(eval_gpu_index):
            model = model.cuda()

            test_dataset = path_dataset(test_documents, test_labels, model, tokenizer)
            # Evaluation uses the eval batch size (previously the train
            # batch size was passed; both settings are currently 8).
            test_dataloader = DataLoader(test_dataset, batch_size=kwargs.per_gpu_eval_batch_size)
            output, labels, preds = evaluate(test_dataloader, kwargs, model, tokenizer, prefix="")

        results[field]['micro'].append(output['micro'])
        results[field]['macro'].append(output['macro'])

        # Translate integer ids back to raw labels for a readable confusion matrix.
        preds = [inv_le_dict[pred] for pred in preds]
        labels = [inv_le_dict[label] for label in labels]

        y_actu = pd.Series(labels, name='Actual')
        y_pred = pd.Series(preds, name='Predicted')
        df_confusion = pd.crosstab(y_actu, y_pred)

        print(f"{field}_{args['run']}")
        print(df_confusion)
        print("\n")

        # Drop references before the next checkpoint is loaded.
        del model
        del test_dataloader

2066 517 324
{'0': 0, '1': 1}


Evaluating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41/41 [01:07<00:00,  1.65s/it]


SeminalVesicleNone_0
Predicted   0    1
Actual            
0          33    9
1           1  281


2066 517 324
{'0': 0, '1': 1}


Evaluating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41/41 [01:07<00:00,  1.65s/it]


SeminalVesicleNone_1
Predicted   0    1
Actual            
0          30   10
1           1  283


2066 517 324
{'0': 0, '1': 1}


Evaluating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41/41 [00:54<00:00,  1.33s/it]

SeminalVesicleNone_2
Predicted   0    1
Actual            
0          28    6
1           3  287







In [29]:
# Summarise the evaluation: average the scores across fields within each run,
# then report the mean and standard deviation (np.std default, ddof=0) of
# those per-run averages.
#
# The number of runs is taken from `results` itself rather than repeating the
# literal 3 from the evaluation cell, so this cell stays correct if the run
# count changes or only some runs have completed.
n_runs = min((len(scores['micro']) for scores in results.values()), default=0)

averages_mi, averages_ma = [], []

for run in range(n_runs):
    averages_mi.append(np.mean([results[field]['micro'][run] for field in results]))
    averages_ma.append(np.mean([results[field]['macro'][run] for field in results]))

print(f"Average micro {np.mean(averages_mi)} {np.std(averages_mi)}")
print(f"Average macro {np.mean(averages_ma)} {np.std(averages_ma)}")

Average micro 0.969 0.00244948974278318
Average macro 0.9203333333333333 0.005249338582674546
