In [3]:
# base libraries
import numpy as np
import pandas as pd
import regex as re
import itertools
import sklearn.metrics as sk
from functools import reduce

In [None]:
# deep learning libraries
import torch
import transformers
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel, ClassificationArgs

# hyperparameter optimization
import optuna
from optuna.samplers import TPESampler
import optuna.visualization.matplotlib as oviz

# file system manipulation
import os
import shutil
from pathlib import Path
import pickle
import copy

# logging
import logging
import time

In [2]:
# Seed the PyTorch and NumPy global RNGs so random draws repeat across runs.
torch.manual_seed(42)
np.random.seed(42)

# Report whether PyTorch can see a CUDA device.
cuda_available = torch.cuda.is_available()
if cuda_available:
    print("Is CUDA available? ", "Yes")
else:
    print("Is CUDA available? ", "No")

Is CUDA available?  Yes


In [46]:
# Load the test-set CSV and give its three columns the names later cells use.
test_csv_path = r"Storage/Bert/test_8_11.csv"
final_test = pd.read_csv(test_csv_path)
final_test.columns = ["PatientID", "text", "labels"]

In [47]:
# Load the BERT classification model stored in the trial_0 directory.
best_model = ClassificationModel(
    model_type="bert",
    model_name="Storage/Bert/NoHyperParameterTuningResults/trial_0",
)

In [48]:
# Evaluate on the test frame and unpack the three values eval_model returns;
# test_outputs (one entry per row) is turned into probabilities further down.
test_results, test_outputs, test_wrong = best_model.eval_model(eval_df=final_test)

  0%|          | 0/293 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/74 [00:00<?, ?it/s]



In [49]:
# Turn each row's raw model output into class probabilities, the winning
# probability, and the predicted class index.
max_prob_list = []   # highest class probability for each test row
test_prob_list = []  # full per-class probability list for each test row
test_pred_list = []  # predicted class index for each test row
for row_idx in range(len(final_test)):
    # Softmax over this row's class scores, converted to plain Python floats.
    # (The inner iteration uses its own name so the row index is not shadowed.)
    row_probs = torch.softmax(torch.from_numpy(test_outputs[row_idx]), dim=0)
    class_probs = [float(prob) for prob in row_probs]

    # The prediction is the position of the largest probability. list.index
    # returns the first occurrence, so ties resolve to the lowest class index,
    # and the lookup works for any number of classes instead of assuming three.
    max_proba = max(class_probs)
    test_pred_list.append(class_probs.index(max_proba))

    max_prob_list.append(max_proba)
    test_prob_list.append(class_probs)

In [50]:
# Overall accuracy on the test set. Arguments follow scikit-learn's
# (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
sk.accuracy_score(final_test["labels"], test_pred_list)

0.856655290102389

In [51]:
# Attach the predicted class and its (maximum) probability to the test frame.
final_test["pred"], final_test["proba"] = test_pred_list, max_prob_list

In [52]:
# Preview the first five rows, now including the pred and proba columns.
final_test.head(5)

Unnamed: 0,PatientID,text,labels,pred,proba
0,Z9096104,ital status: married civil union spouse name: ...,1,1,0.999421
1,Z7410349,history narrative social history tobacco quit ...,1,1,0.999419
2,Z7754903,ncy-based suicidal statements. suicidal homici...,0,0,0.999304
3,Z11959548,js; bjshingleton li61ao +21.5d target plano • ...,1,1,0.99939
4,Z7580275,rts pedal edema is worsened recently. current ...,2,2,0.997088


In [53]:
# Weighted one-vs-rest ROC AUC computed from the per-class probability lists.
sk.roc_auc_score(
    y_true=final_test["labels"],
    y_score=test_prob_list,
    average="weighted",
    multi_class="ovr",
)

0.9569753382861237

In [6]:
# Load the saved per-row test results and keep only the misclassified rows.
# .copy() makes the subset an independent frame, so the column added and the
# label edits made in later cells modify it directly instead of triggering
# pandas' SettingWithCopyWarning on a filtered slice.
# NOTE(review): the original comment suggests the in-memory equivalent is
# final_test[final_test["labels"] != final_test["pred"]] — confirm the CSV
# holds the same rows.
test_set_results = pd.read_csv(r"Storage/Performance/test_set_results_trial_0_model.csv")
wrong_preds = test_set_results[test_set_results["pred"] != test_set_results["labels"]].copy()

In [35]:
# Number of misclassified rows.
wrong_preds.shape[0]

42

In [28]:
# Read the "Pattern" column and compile every entry as a case-insensitive regex.
pattern_strings = pd.read_csv(r"../EDA/Data/always_patterns_8_1.csv")["Pattern"].to_list()
always_pattern_regex = [re.compile(pattern, re.IGNORECASE) for pattern in pattern_strings]

In [41]:
def find_always_pattern_matches(df, col, always_pattern_regex):
    """Record, for every row, the text matched by each "always" pattern.

    Each pattern is searched once per row; when it matches, the matched
    substring (``match.group()``) is collected. Patterns are tried in the
    order given, so each row's list follows that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. It is modified in place.
    col : str
        Name of the column holding the text to search.
    always_pattern_regex : sequence of compiled patterns
        Objects exposing ``search(text)``, e.g. the result of ``re.compile``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with an ``"always_pattern_match"`` column
        holding one list of matched strings per row (empty when nothing
        matched or when the row's text is missing).
    """
    matches_per_row = []
    # Missing text (NaN/None, e.g. an empty CSV cell) cannot be searched and
    # would raise TypeError, so such rows get an empty match list instead.
    for seq, is_missing in zip(df[col], df[col].isna()):
        if is_missing:
            matches_per_row.append([])
            continue
        curr = []
        for regex_pattern in always_pattern_regex:
            match = regex_pattern.search(seq)
            if match is not None:
                curr.append(match.group())
        matches_per_row.append(curr)

    df["always_pattern_match"] = matches_per_row

    return df

In [42]:
# Add the always_pattern_match column to the misclassified rows.
wrong_preds = find_always_pattern_matches(
    df=wrong_preds,
    col="text",
    always_pattern_regex=always_pattern_regex,
)

In [53]:
# Row 213: labeled 1, predicted 0; judged on review to be class 0, so the label is corrected.
wrong_preds.loc[213, "text"]
wrong_preds.at[213, "labels"] = 0

In [44]:
# Row 209: labeled 2, predicted 0; the label 2 is judged correct — an interesting miss by the model.
wrong_preds.loc[209, "text"]

"------------------------------------------------------------------------------------------------------------- history no chief complaint on file. hpi patient is a 75-year-old female with extensive past medical history including htn, obesity, left breast ca, type ii dm complicated by neuropathy's, spinal stenosis, djd, rcc cognitive impairment, aki, presenting with 2 3 weeks of intermittent right-sided chest pain feels like air bubble, brief lasting 5 to 10 minutes, no shortness of breath but little cough, not productive, not exertional, not pleuritic, not positional, not worse with movement but occasionally feels like a food bolus stuck in her chest denies any nausea, vomiting, diarrhea. seen at outpatient clinic a day found to have a temp"

In [55]:
# Row 206: labeled 1, predicted 0; judged on review to be class 0, so the label is corrected.
wrong_preds.loc[206, "text"]
wrong_preds.at[206, "labels"] = 0

In [21]:
# Row 205: labeled 2, predicted 0; the correct class is still undecided.
wrong_preds.loc[205, "text"]

'c given diffuse intracranial atherosclerosis. stroke rehabilitation: the patient is continued on her current rehabilitation program with pt, ot, and slp. the patient is participating with therapy. secondary prevention: actively treating with aspirin, plavix x 21 days (started 12 18), optimal bp control, bs control and statin adl dysfunction motor function: actively treating with pt and ot cognition, dysarthria, expressive aphasia: actively treating with slp. pain: monitoring and offering tylenol as needed safety: actively treating with education and monitoring skin: actively assessing given risk of pressure sores from immobility elevated bun assessment & plan likely pre-renal due to volume depletion. elevated bun on 12 28, now downtrending s p ivfs. - encourage adequate po fluid int'

In [57]:
# Row 204: labeled 1, predicted 2; judged on review to be class 2, so the label is corrected.
wrong_preds.loc[204, "text"]
wrong_preds.at[204, "labels"] = 2

In [58]:
# Row 232: labeled 0, predicted 1; judged on review to be class 1 (did the model recognize context?).
wrong_preds.loc[232, "text"]
wrong_preds.at[232, "labels"] = 1

In [18]:
# Row 233: labeled 1, predicted 0; the label 1 is judged correct.
wrong_preds.loc[233, "text"]

'ain and falls. hematology allergic immunologic: negative for easy bruise bleed, and environmental allergies. endocrine: negative for polydipsia. neurological: negative for dizziness, tingling, tremors, sensory changes, speech changes, weakness, and seizure. psychiatric behavioral: negative for depression, suicidal ideas, drug and alcohol abuse, hallucinations, nervous or anxious, insomnia, and memory loss problem list patient active problem list diagnosis • urinary incontinence • deep vein thrombosis • benign prostatic hyperplasia • sleep apnea • obesity • hypertensive disorder • factor v leiden mutation • healthcare maintenance • hypertension • obstructive sleep apnea syndrome • neck pain • insomnia past surgical history past surgical history procedure laterality date • uncoded surgical h'

In [59]:
# Row 234: labeled 1, predicted 0; judged on review to be class 0, so the label is corrected.
wrong_preds.loc[234, "text"]
wrong_preds.at[234, "labels"] = 0

In [60]:
# Row 236: labeled 0, predicted 1; judged on review to be class 1 (did the model pick up on context?).
wrong_preds.loc[236, "text"]
wrong_preds.at[236, "labels"] = 1

In [61]:
# Row 238: labeled 1, predicted 0; judged on review to be class 0, so the label is corrected.
wrong_preds.loc[238, "text"]
wrong_preds.at[238, "labels"] = 0

In [62]:
# Row 239: labeled 0, predicted 2; judged on review to be class 2, so the label is corrected.
wrong_preds.loc[239, "text"]
wrong_preds.at[239, "labels"] = 2

In [63]:
# Row 240: labeled 1, predicted 0; judged on review to be class 2 (neither the label nor the prediction).
wrong_preds.loc[240, "text"]
wrong_preds.at[240, "labels"] = 2

In [64]:
# Row 241: labeled 0, predicted 2; judged on review to be class 2, so the label is corrected.
wrong_preds.loc[241, "text"]
wrong_preds.at[241, "labels"] = 2

In [65]:
# Row 248: labeled 0, predicted 2; judged on review to be class 2, so the label is corrected.
wrong_preds.loc[248, "text"]
wrong_preds.at[248, "labels"] = 2

In [66]:
# Row 253: labeled 0, predicted 2; judged on review to be class 2, so the label is corrected.
wrong_preds.loc[253, "text"]
wrong_preds.at[253, "labels"] = 2

In [67]:
# Row 254: labeled 1, predicted 0; judged on review to be class 0, so the label is corrected.
wrong_preds.loc[254, "text"]
wrong_preds.at[254, "labels"] = 0

In [59]:
# Row 260: labeled 1, predicted 0; no verdict recorded yet.
wrong_preds.loc[260, "text"]

"------- impression: no definite evidence of pneumonia or pulmonary edema. 2012 06 18 00:00:00 - ctbrw oc: impression: no evidence of intracranial hemorrhage, cerebral infarction, or mas lesion. several small scattered white matter hypodensities in the bilateral cerebral periventricular and subcortical white matter, which are likely related to patient's history of multiple sclerosis and grossly uncha ------- ulmonary edema. 2012 06 18 00:00:00 - ctbrw oc: impression: no evidence of intracranial hemorrhage, cerebral infarction, or mas lesion. several small scattered white matter hypodensities in the bilateral cerebral periventricular and subcortical white matter, which are likely related to patient's history of multiple sclerosis and grossly unchanged compared to the prior mri accounting for diff"

In [68]:
# Row 256: labeled 1, predicted 0; judged on review to be class 1.
# NOTE(review): per the note the label is already 1, so this assignment looks like a no-op — confirm.
wrong_preds.loc[256, "text"]
wrong_preds.at[256, "labels"] = 1

In [69]:
# Row 275: labeled 2, predicted 0; judged on review to be class 1 (neither the label nor the prediction).
wrong_preds.loc[275, "text"]
wrong_preds.at[275, "labels"] = 1

In [70]:
# Row 262: labeled 1, predicted 0; judged on review to be class 0 (reviewer's note: "no CI AP").
wrong_preds.loc[262, "text"]
wrong_preds.at[262, "labels"] = 0

In [71]:
# Row 264: labeled 1, predicted 0; judged on review to be class 0 (reviewer's note: "no memory always AP").
wrong_preds.loc[264, "text"]
wrong_preds.at[264, "labels"] = 0

In [72]:
# Row 265: labeled 2, predicted 0; judged on review to be class 1 (neither the label nor the prediction).
wrong_preds.loc[265, "text"]
wrong_preds.at[265, "labels"] = 1

In [45]:
# Row 272: labeled 2, predicted 1; no verdict recorded yet.
wrong_preds.loc[272, "text"]

'r an evaluation of multiple medical problems including the following: seronegative erosive rheumatoid arthritis-now on cimzia and methotrexate history of ischemic colitis– we have avoided jak inhibitors and tocilizumab status post bilateral hip replacements with revisions status post bilateral knee replacements severe oa both shoulders carpal tunnel right greater than left hypertension early memory loss hypercholesterolemia spinal stenosis she comes in with a wheelchair with her daughter. the wheelchair is only for convenience. does not because of ongoing pain. she actually is doing reasonably well. she had 400 mg of cimzia only a few few weeks ago at most. she has been on and off of this and stopped it for a number of weeks–months because of expense. is now requesting a $200 a month w'

In [73]:
# Row 280: labeled 0, predicted 2; relabeled to 2 (the original note records no explicit verdict).
wrong_preds.loc[280, "text"]
wrong_preds.at[280, "labels"] = 2

In [74]:
# Row 288: labeled 0, predicted 2; judged on review to be class 1. Interesting case: the model
# mistook the patient for having short-term memory issues when it was actually the patient's wife.
wrong_preds.loc[288, "text"]
wrong_preds.at[288, "labels"] = 1

In [75]:
# Row 290: labeled 0, predicted 2; judged on review to be class 2, so the label is corrected.
wrong_preds.loc[290, "text"]
wrong_preds.at[290, "labels"] = 2

In [76]:
# Row 292: labeled 1, predicted 0; judged on review to be class 0, so the label is corrected.
wrong_preds.loc[292, "text"]
wrong_preds.at[292, "labels"] = 0

In [79]:
# Row 215: labeled 2, predicted 0; the label 2 is judged correct, so no relabel is applied.
# (A disabled assignment for row 292 previously sat in this cell; row 292 is handled in its own cell.)
wrong_preds.loc[215, "text"]

'62 acct: 3012038213 location: u4 adm date: 10 08 14 status: dis in fc: 01 unit #: 00273466 disch date: 10 09 14 attend md: gange,meghan e. m.d. discharge summary part 2 please refer to discharge summary part 1 for all other details. history of present illness: please refer to the full dictated h&p for complete details. briefly, stephanie barton is a 78-year-old female with a history of mild dementia, aortic stenosis, status post bioprosthetic valve, hypercholesterolemia, and hypertension, who presented with transient left-sided facial droop. hospital course by problem: 1. left facial droop. this completely resolved prior to admission to the hospital and was most suggestive of a tia. the patient had an mri mra of the brain, head, and neck, which were negative for evidence of stroke or a'

In [80]:
# Row 216: labeled 1, predicted 0; the label 1 is judged correct.
wrong_preds.loc[216, "text"]

'midline with protrusion. no dysarthria. shoulder shrugs normal bilaterally. motor: no pronator drift or orbiting. strength 5 5 throughout bilateral upper and lower extremities. there is some slowing left finger sensation: diffusely intact to light touch reflexes: deep tendon reflexes 1+ and symmetric bilaterally at the triceps, biceps, brachioradialis, quadriceps and gastrocnemius soleus. cerebellar: finger-to-nose normal labs: 12.07* \\ 13.1* 293 41.1 \\ 10 24 1559 \\ no bmp past 72hr lab results component value date time pt 14.3 03 27 2018 04:00 pm inr 1.1 03 27 2018 04:00 pm aptt 31.5 03 27 2018 04:00 pm imaging: mri brain 10 24 2019: impression 1. early subacute small right frontoparietal infarct along the central sulcus just above the omega area. 2. no significant intracranial'

In [82]:
# Row 218: labeled 0, predicted 2; judged on review to be class 2, so the label is corrected.
wrong_preds.loc[218, "text"]
wrong_preds.at[218, "labels"] = 2

In [85]:
# Row 220: labeled 0, predicted 2; the label 0 is judged correct.
wrong_preds.loc[220, "text"]

'------- with use of grab bars meal preparation supervision moderate good kitchen mobility and safety awareness around the kitchen. pt able to prepare meal with good effect and mobility. cognition: functional communication appears able to comprehend verbal written information, requires additiona ------- ntation level oriented x3 ability to follow commands follows 2-step commands, verbal cues attention memory comment requires assist to recall events however can recall basics of hosp course awareness judgmen ------- gth rue : able to perform adl tasks overall strength lue: able to perform adl tasks cognitive test: moca total score out of 30 (>=26 is normal) 18 impression: charlene mullins has made good progress in ot this week as evidenced by increased participation in adl routine, increased ac'

In [87]:
# Row 225: labeled 1, predicted 0; judged on review to be class 0, so the label is corrected.
wrong_preds.loc[225, "text"]
wrong_preds.at[225, "labels"] = 0

In [89]:
# Row 226: labeled 1, predicted 0; relabeled to 0 (the original note records no explicit verdict).
wrong_preds.loc[226, "text"]
wrong_preds.at[226, "labels"] = 0

In [91]:
# Row 227: labeled 2, predicted 0; the label is set to 2.
# NOTE(review): per the note the label is already 2, so this assignment looks like a no-op — confirm.
wrong_preds.loc[227, "text"]
wrong_preds.at[227, "labels"] = 2

In [93]:
# Row 228: labeled 0, predicted 1; undecided how this case should be handled.
wrong_preds.loc[228, "text"]

"-------------------------------------------------------------------------------------------------------------patient's daughter lilian called requesting to speak with someone regarding her mother, she states she is concerned because her mother has been extremely forgetful for the past couple of months, she says its getting worse with time and she is concerned her mother might have alzheimer's, please call. ------------------------------------------------------------------------------------------------------------"

In [94]:
# Row 231: labeled 1, predicted 0; judged on review to be class 2.
# NOTE(review): no relabel is applied here (only a disabled assignment for row 226 was present) —
# confirm whether row 231 should be set to 2.
wrong_preds.loc[231, "text"]

'------- gib (gastrointestinal bleeding) • rectal bleeding • ischemic colitis, enteritis, or enterocolitis • status post bilateral knee replacements • carpal tunnel syndrome of right wrist • other chest pain • corns and callosities • disease of nail • bilateral bunions • hammertoe, bilateral • memory loss • chronic otitis media medications reviewed include the following: scheduled meds: • certolizu ------- md review of systems except for overall pain but especially in her neck and shoulders unremarkable. memory is an issue. she has mild shortness of breath but rare. no edema. very uncomfortable woman sitting in a walker with her daughter present. bp 166 72 | pulse 80 | temp 37 °c (98.6 °f) (oral) | ht 154.9 cm (5\' 1") | wt 53.1 kg (117 lb) | spo2 99% | bmi 22.11 kg m² no nodules but synoviti'

In [95]:
# Display the current state of the wrong_preds frame.
wrong_preds

Unnamed: 0,PatientID,text,labels,pred,proba,always_pattern_match
15,Z10612901,------- 25 mg (65 mg elemental) tablet take 32...,2,1,0.997182,[namenda take 1 tablet (10 mg total) by mouth ...
54,Z8642778,----------------------------------------------...,1,2,0.997195,[will do moca]
64,Z12253259,------- sents with • flank pain left sided pai...,2,1,0.999346,[father • type 2 diabetes father • hypertensio...
89,Z11408813,"cation, transient ischemic attack. plan on win...",2,1,0.999233,"[family medical history: alzheimer, history: a..."
171,Z7878411,------- yme-abc results viewable i... ct brain...,2,0,0.555855,[impression and plan- bilateral vestibular fai...
204,Z8805927,------- osephine has been having some increasi...,2,2,0.997208,[]
205,Z9974098,c given diffuse intracranial atherosclerosis. ...,2,0,0.994733,[]
206,Z6623096,ty · not sleeping well because pain wakes her ...,0,0,0.833552,[]
209,Z7633862,----------------------------------------------...,2,0,0.483224,[]
213,Z9912403,eted. social history reviewed: living situatio...,0,0,0.996028,[]


In [96]:
# Write the reviewed rows to disk. index=False leaves the row index — the
# values used with .loc/.at throughout this notebook — out of the file.
wrong_preds.to_csv(
    path_or_buf=r"Storage/Predictions/BertWrongPreds.csv",
    index=False,
)

In [97]:
# Fraction of rows in wrong_preds whose prediction matches the current label.
sk.accuracy_score(y_true=wrong_preds["labels"], y_pred=wrong_preds["pred"])

0.47619047619047616