# Model inference

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from fast_bert.prediction import BertClassificationPredictor
from sklearn.metrics import roc_auc_score, accuracy_score, precision_recall_fscore_support
import torch
from scipy import interp

In [2]:
def compute_auc(target, predictions):
    """Return the mean of the per-class ROC AUC scores for the 'N' and 'Y' labels.

    Args:
        target: DataFrame with a '30d_unplan_readmit' column holding the gold
            labels as the strings 'N' / 'Y'. It is not modified.
        predictions: DataFrame with one score column per label, named 'N'
            and 'Y'. The columns may appear in any order.

    Returns:
        The unweighted average of the two per-class AUC values.

    Raises:
        KeyError: if `target` lacks '30d_unplan_readmit' or `predictions`
            lacks an 'N' or 'Y' column.
    """
    label_order = ['N', 'Y']
    gold_labels = target['30d_unplan_readmit']

    # One indicator column per label, in the fixed order N, Y.
    np_gold = np.column_stack(
        [(gold_labels == label).to_numpy().astype(int) for label in label_order]
    )

    # Select the score columns by name so they line up with the indicator
    # columns; relying on the frame's positional order would silently score
    # each column against the wrong class whenever the frame arrives as Y, N.
    np_preds = predictions[label_order].to_numpy()

    # average=None keeps one score per label column; they are averaged below.
    roc_auc = roc_auc_score(np_gold, np_preds, average=None)

    return np.average(roc_auc)

In [3]:
def compute_accuracy(target, predictions):
    """Score thresholded predictions against the gold labels with accuracy_score.

    A row is predicted 'Y' only when its 'Y' score is above 0.5 and is at
    least as large as its 'N' score; every other row is predicted 'N'.

    Args:
        target: DataFrame of gold labels; the whole frame is handed to the
            scorer as an array.
        predictions: DataFrame with 'N' and 'Y' score columns. It is not
            modified.

    Returns:
        Whatever accuracy_score returns for the two arrays.
    """
    readmit_score = predictions['Y']
    is_readmit = (readmit_score > 0.5) & (readmit_score >= predictions['N'])

    # assign() returns a new frame, so the caller's predictions stay untouched.
    predicted_labels = predictions.assign(
        **{'30d_unplan_readmit': np.where(is_readmit, 'Y', 'N')}
    )[['30d_unplan_readmit']]

    return accuracy_score(target.to_numpy(), predicted_labels.to_numpy())

In [4]:
def compute_f1(target, predictions):
    """Score thresholded predictions with precision_recall_fscore_support.

    Gold labels are encoded as 'Y' -> 1 and 'N' -> 0. A row is predicted 1
    only when its 'Y' score is above 0.5 and is at least as large as its 'N'
    score; every other row is predicted 0.

    Args:
        target: DataFrame with a '30d_unplan_readmit' column of 'Y' / 'N'
            strings. It is not modified.
        predictions: DataFrame with 'N' and 'Y' score columns. It is not
            modified.

    Returns:
        The result of precision_recall_fscore_support(..., average='binary');
        the caller reads precision, recall and F1 from positions 0, 1 and 2.
    """
    label_to_int = {'Y': 1, 'N': 0}
    encoded_gold = target.assign(
        **{'30d_unplan_readmit': target['30d_unplan_readmit'].map(label_to_int)}
    )

    readmit_score = predictions['Y']
    is_readmit = (readmit_score > 0.5) & (readmit_score >= predictions['N'])
    encoded_preds = predictions.assign(
        **{'30d_unplan_readmit': np.where(is_readmit, 1, 0)}
    )[['30d_unplan_readmit']]

    return precision_recall_fscore_support(
        encoded_gold.to_numpy(),
        encoded_preds.to_numpy(),
        average='binary',
    )

In [5]:
# Root folder of the low-resource readmission-prediction data.
BASE = Path('data') / 'readmission_prediction' / 'low_resource'
# The label file location handed to the predictor is the same root folder.
LABEL_PATH = BASE
# Sub-folder of BASE that holds the per-dataset transformer inputs and outputs.
TRANSFORMER = 'transformer'

# Example arguments for infer(): path_to_directory="combined", model="biobert"

def infer(path_to_directory, model):
    """Evaluate one fine-tuned model on one dataset's test split.

    Reads `test.csv` from BASE/TRANSFORMER/<path_to_directory>, runs the
    predictor stored under `output/<model>/model_out` on its 'text' column,
    and scores the predictions against the remaining columns of the file.

    Side effects: writes `predictions.csv` (raw predictor output) and
    `metrics.csv` (the returned frame) into `output/<model>`.

    Args:
        path_to_directory: Name of the dataset folder (str), e.g. 'combined'.
        model: Name of the model folder under `output` (str), e.g. 'biobert'.

    Returns:
        A one-row DataFrame with the columns Model, AUC, accuracy, precision,
        recall and F1.
    """
    DATA_PATH = BASE/TRANSFORMER/path_to_directory
    OUTPUT_DIR = DATA_PATH/'output'/model
    MODEL_PATH = OUTPUT_DIR/'model_out'

    test_dataset = pd.read_csv(DATA_PATH/'test.csv')
    test_text = test_dataset['text'].tolist()

    # Everything except the text is treated as gold labels, columns sorted by name.
    gold = test_dataset.drop(columns=['text'])
    gold = gold.reindex(sorted(gold.columns), axis=1)

    predictor = BertClassificationPredictor(model_path=MODEL_PATH,
                                            label_path=LABEL_PATH,
                                            multi_label=True,
                                            model_type='bert',
                                            do_lower_case=True)

    predictions = predictor.predict_batch(test_text)
    pd.DataFrame(predictions).to_csv(OUTPUT_DIR/'predictions.csv')

    # Each prediction is a sequence of (label, score) items; turn it into one
    # row keyed by label.
    preds = pd.DataFrame([{item[0]: item[1] for item in pred} for pred in predictions])
    # The frame's column order follows the key order of the prediction rows,
    # which need not be alphabetical. Sort it the same way as `gold` so that
    # positional consumers (the AUC computation) see the labels in one fixed
    # order. NOTE(review): fast-bert appears to order each row's labels by
    # score, which would make the unsorted order data-dependent — confirm.
    preds = preds.reindex(sorted(preds.columns), axis=1)

    # Drop the model before scoring so its GPU memory can be reused by the
    # next call.
    del predictor
    del predictions
    torch.cuda.empty_cache()

    auc = compute_auc(gold, preds)
    accuracy = compute_accuracy(gold, preds)
    precision, recall, f1 = compute_f1(gold, preds)[:3]

    metrics = pd.DataFrame([{
        'Model': path_to_directory + '_' + model,
        'AUC': auc,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'F1': f1,
    }])

    metrics.to_csv(OUTPUT_DIR/'metrics.csv', index=False)

    return metrics

In [6]:
# Evaluate every (dataset, model) pair, all models of one dataset before the next.
# Full directory list from the earlier, commented-out loop:
# ['original','original_2x','synthetic','combined','original_eda']
for directory, model in [(d, m) for d in ['synthetic','combined'] for m in ['biobert','bert']]:
    print(infer(directory, model))

  'precision', 'predicted', average, warn_for)


               Model       AUC  accuracy  precision  recall   F1
0  synthetic_biobert  0.527269  0.816216        0.0     0.0  0.0


  'precision', 'predicted', average, warn_for)


            Model       AUC  accuracy  precision  recall   F1
0  synthetic_bert  0.504772  0.816216        0.0     0.0  0.0
              Model       AUC  accuracy  precision    recall        F1
0  combined_biobert  0.527464  0.616216   0.196721  0.352941  0.252632
           Model       AUC  accuracy  precision  recall   F1
0  combined_bert  0.459875  0.816216        0.0     0.0  0.0


  'precision', 'predicted', average, warn_for)


In [7]:
# Gather the per-run metrics.csv files written by infer() into one table.
# DataFrame.append was removed in pandas 2.0 and re-copied the whole table on
# every call, so the frames are collected first and concatenated once.
# Full directory list from the earlier, commented-out loop:
# ['original','original_2x','synthetic','combined','original_eda']
metric_frames = []
for directory in ['synthetic','combined']:
    for model in ['biobert','bert']:
        run_metrics = pd.read_csv(BASE/TRANSFORMER/directory/'output'/model/'metrics.csv')
        metric_frames.append(run_metrics)

# Columns come from the files themselves (Model, AUC, accuracy, precision,
# recall, F1); each file's own row index is kept, as it was before.
global_metrics = pd.concat(metric_frames)

In [8]:
# Write the combined table next to the dataset folders, then show it as the cell output.
global_metrics.to_csv(path_or_buf=BASE / TRANSFORMER / 'global_metrics.csv', index=False)
global_metrics

Unnamed: 0,Model,AUC,accuracy,precision,recall,F1
0,synthetic_biobert,0.527269,0.816216,0.0,0.0,0.0
0,synthetic_bert,0.504772,0.816216,0.0,0.0,0.0
0,combined_biobert,0.527464,0.616216,0.196721,0.352941,0.252632
0,combined_bert,0.459875,0.816216,0.0,0.0,0.0
