In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import time
from IPython.display import clear_output
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint as MC
from tensorflow.keras import backend as K


In [2]:
# Load the training table and report its shape.
print('Reading train data...')
train = pd.read_csv(
    "../input/rsna-str-pulmonary-embolism-detection/train.csv"
)
print(train.shape)
train.head()  # not the cell's last expression, so this preview is not rendered

# Load the test table and report its shape; its head is the cell's displayed value.
print('Reading test data...')
test = pd.read_csv(
    "../input/rsna-str-pulmonary-embolism-detection/test.csv"
)
print(test.shape)
test.head()

Reading train data...
(1790594, 17)
Reading test data...
(146853, 3)


Unnamed: 0,StudyInstanceUID,SeriesInstanceUID,SOPInstanceUID
0,df06fad17bc3,857e3d760445,c3163725fcf6
1,df06fad17bc3,857e3d760445,d54a8daaf836
2,df06fad17bc3,857e3d760445,bdc531b699cd
3,df06fad17bc3,857e3d760445,9e6a68e27df0
4,df06fad17bc3,857e3d760445,25e3307ba7da


In [3]:
print('Reading sample data...')
ss = pd.read_csv("../input/rsna-str-pulmonary-embolism-detection/sample_submission.csv")
print(ss.shape)
ss.head()

# Tally how many submission ids share each suffix (the text after the first
# underscore; '' when an id contains no underscore). A dict is used instead of
# a fixed 10-slot list so the tally works for any number of distinct suffixes.
ids = ss.id
key_counts = {}
for i in ids:
    n = i.partition('_')[2]
    key_counts[n] = key_counts.get(n, 0) + 1

# Dicts preserve insertion order, so both lists are in first-seen order.
mapper = list(key_counts)
counter = list(key_counts.values())
print("List of keys:")
print(mapper, sep='\n')
print()
print("Count of items per key:")
print(counter)

Reading sample data...
(152703, 2)
List of keys:
['negative_exam_for_pe', 'rv_lv_ratio_gte_1', 'rv_lv_ratio_lt_1', 'leftsided_pe', 'chronic_pe', 'rightsided_pe', 'acute_and_chronic_pe', 'central_pe', 'indeterminate', '']

Count of items per key:
[650, 650, 650, 650, 650, 650, 650, 650, 650, 146853]


In [4]:
import vtk
from vtk.util import numpy_support
import cv2

# One reader object is created here and reconfigured by every get_img call.
reader = vtk.vtkDICOMImageReader()
def get_img(path):
    """Read a DICOM file through the module-level VTK reader and resize it to 512x512.

    Parameters
    ----------
    path : str
        Location of the ``.dcm`` file handed to ``reader.SetFileName``.

    Returns
    -------
    numpy.ndarray
        The result of ``cv2.resize(pixels, (512, 512))`` applied to the
        reader's first point-data array.

    Notes
    -----
    Each call points the shared ``reader`` at a new file, so calls must not
    overlap. NOTE(review): no check is made that the file was actually read;
    confirm how the reader behaves on a missing or corrupt path.
    """
    reader.SetFileName(path)
    reader.Update()

    # The extent is consumed as three (min, max) pairs; each dimension is max - min + 1.
    _extent = reader.GetDataExtent()
    ConstPixelDims = [_extent[1]-_extent[0]+1, _extent[3]-_extent[2]+1, _extent[5]-_extent[4]+1]

    imageData = reader.GetOutput()
    pointData = imageData.GetPointData()
    arrayData = pointData.GetArray(0)
    ArrayDicom = numpy_support.vtk_to_numpy(arrayData)
    # Column-major reshape of the flat VTK buffer into the extent-derived dimensions.
    ArrayDicom = ArrayDicom.reshape(ConstPixelDims, order='F')
    ArrayDicom = cv2.resize(ArrayDicom,(512,512))
    return ArrayDicom

In [5]:
def convert_to_rgb(array):
    """Replicate a single-channel 512x512 image into three identical channels.

    Parameters
    ----------
    array : numpy.ndarray
        Any array holding exactly 512*512 elements.

    Returns
    -------
    numpy.ndarray
        A new ``(512, 512, 3)`` array in which every channel equals the input plane.
    """
    plane = array.reshape((512, 512))
    return np.repeat(plane[:, :, np.newaxis], 3, axis=2)
    
def custom_dcom_image_generator(batch_size, dataset, test=False, debug=False):
    """Yield batches of 3-channel DICOM images read from disk, in dataset row order.

    Parameters
    ----------
    batch_size : int
        Number of images per yielded batch; the final batch may be smaller.
    dataset : pandas.DataFrame
        Must contain ``StudyInstanceUID``, ``SeriesInstanceUID`` and
        ``SOPInstanceUID``. When ``test`` is False it must also contain the
        nine label columns selected below.
    test : bool, default False
        False: read from the ``train`` directory and yield ``(images, labels)``.
        True: read from the ``test`` directory and yield ``images`` only.
    debug : bool, default False
        Print the path of every file before it is read.

    Yields
    ------
    numpy.ndarray or (numpy.ndarray, numpy.ndarray)
        ``np.array`` of the converted images, plus the matching label rows
        when ``test`` is False.

    Raises
    ------
    KeyError
        On first iteration, if ``dataset`` lacks a required column.

    Notes
    -----
    A trailing batch is emitted only when it is non-empty, so a dataset whose
    length is an exact multiple of ``batch_size`` no longer produces a final
    zero-length batch.
    """
    fnames = dataset[['StudyInstanceUID', 'SeriesInstanceUID', 'SOPInstanceUID']]

    if not test:
        Y = dataset[['pe_present_on_image', 'rv_lv_ratio_gte_1', 'rv_lv_ratio_lt_1', 'leftsided_pe',
                     'chronic_pe', 'rightsided_pe', 'acute_and_chronic_pe', 'central_pe', 'indeterminate'
                    ]]
        prefix = 'input/rsna-str-pulmonary-embolism-detection/train'
    else:
        prefix = 'input/rsna-str-pulmonary-embolism-detection/test'

    X = []
    batch = 0
    for st, sr, so in fnames.values:
        path = f"../{prefix}/{st}/{sr}/{so}.dcm"
        if debug:
            print(f"Current file: {path}")

        X.append(convert_to_rgb(get_img(path)))

        if len(X) == batch_size:
            if test:
                yield np.array(X)
            else:
                # Labels are sliced positionally so they line up with the images of this batch.
                yield np.array(X), Y.iloc[batch*batch_size:(batch+1)*batch_size].values

            # Drop the finished batch before starting the next one.
            X = []
            gc.collect()
            batch += 1

    # Flush the remainder; skipped when the last full batch consumed every row.
    if X:
        if test:
            yield np.array(X)
        else:
            yield np.array(X), Y.iloc[batch*batch_size:(batch+1)*batch_size].values
    del X
    gc.collect()
    return

In [6]:
import random
# Group the training rows by study; the commented-out sampling cell below iterates this.
grouped = train.groupby(by='StudyInstanceUID')

In [7]:
#sample_df=pd.DataFrame()
#for _,d in grouped:
#    sample_df=pd.concat([sample_df,d.loc[random.sample(list(d.index),15)]])

In [8]:
#sample_df.shape

(109185, 17)

In [9]:
#sample_df.to_csv('test_data.csv')

In [10]:
#del sample_df

In [30]:
# Reload the previously saved sample; its first CSV column becomes the index.
sample_df = pd.read_csv(
    '../input/sampled-test-data/test_data (1).csv',
    index_col=0,
)
sample_df.head()  # not the last expression, so only the shape below is shown
sample_df.shape

(109185, 17)

In [12]:
# Run the saved model over `sample_df` batch by batch, accumulating one flat
# list of scores per model output in `predictions` and the matching ground-truth
# label rows in `all_preds_true`. Stops early once the time budget is exceeded.
predictions = {}
stopper = 3600 * 4 #4 hours limit for prediction
pred_start_time = time.time()
test=sample_df
p, c = time.time(), time.time()
batch_size = 1000

l = 0  # rows predicted so far
n = test.shape[0]

all_preds_true = []

for x in custom_dcom_image_generator(batch_size, test, False, False):
    # x is (images, labels) because the generator is called with test=False.
    if len(x[0]) == 0:
        # Nothing to predict on a zero-length batch.
        continue

    clear_output(wait=True)
    # NOTE(review): the model is reloaded for every batch and released below
    # together with K.clear_session(); presumably a memory workaround, so
    # confirm before hoisting it out of the loop.
    model = load_model('../input/inceptionweights/pe_detection_model_inception.h5')
    all_preds_true.extend(x[1])
    preds = model.predict(x[0], batch_size=16, verbose=1)

    # Create each output's list on first sight instead of relying on a caught
    # exception, which printed a spurious error and could overwrite keys that
    # had already been accumulated.
    for key in preds.keys():
        predictions.setdefault(key, []).extend(preds[key].flatten().tolist())

    # Count the rows actually processed (no modulo wrap) so the ETA reaches 0
    # on the final, possibly shorter, batch.
    l = l + len(x[0])
    print('Total predicted:', len(predictions['indeterminate']),'/', n)
    p, c = c, time.time()
    print("One batch time: %.2f seconds" %(c-p))
    print("ETA: %.2f" %((n-l)*(c-p)/batch_size))
    if c - pred_start_time >= stopper:
        print("Time's up!")
        break

    del model
    K.clear_session()

    del x, preds
    gc.collect()
    

KeyError: "None of [Index(['pe_present_on_image', 'rv_lv_ratio_gte_1', 'rv_lv_ratio_lt_1',\n       'leftsided_pe', 'chronic_pe', 'rightsided_pe', 'acute_and_chronic_pe',\n       'central_pe', 'indeterminate'],\n      dtype='object')] are in the [columns]"

In [None]:
# Tabulate the accumulated per-output score lists, one column per output.
predictions_df = pd.DataFrame(data=predictions)

In [None]:
# Persist the raw image-level scores next to the notebook.
predictions_df.to_csv(path_or_buf='inception_output_test.csv')

In [None]:
# For each output: its name, the number of distinct scores, and the total number of scores.
for name, values in predictions.items():
    print(name, len(set(values)), len(values))

In [None]:
# Dump every ground-truth label row collected by the prediction loop.
# NOTE(review): this prints the whole list; consider slicing it if the output gets unwieldy.
print(all_preds_true)

In [None]:
# First label column of every collected ground-truth row, then the row count.
first_labels = [row[0] for row in all_preds_true]
print(first_labels)
print(len(all_preds_true))

In [None]:
# All image-level PE scores followed by their minimum and maximum.
pe_scores = predictions['pe_present_on_image']
print(pe_scores, min(pe_scores), max(pe_scores))

In [None]:
# Midpoint between 0.5 and the mean image-level PE score.
(0.5 + np.mean(predictions['pe_present_on_image'])) / 2

In [None]:
# Binarise every output at 0.28 and show how many scores fall on each side.
for key in predictions:
    print(key)
    arr = (np.asarray(predictions[key]) > 0.28).astype(int)
    unique, counts = np.unique(arr, return_counts=True)
    print(unique)
    print(counts)

In [None]:
# Distinct study ids in first-seen order. dict.fromkeys de-duplicates in one
# pass, avoiding a linear list scan for every row.
test_ids = list(dict.fromkeys(test.StudyInstanceUID))

In [None]:
# Keep only the columns outside positions 3..16. drop() returns a new frame,
# so `test` (and the `sample_df` it aliases) keeps its label columns and the
# prediction cell can still be re-run afterwards.
cols = list(range(3,17))
test_preds = test.drop(columns=test.columns[cols])

In [None]:
# Preview the first five rows of the trimmed frame.
test_preds.head(n=5)

In [None]:
# Turn the score lists into a frame with a fresh 0..N-1 index.
predictions = pd.DataFrame(predictions).reset_index(drop=True)
predictions.head()

In [None]:
# Give the id frame a fresh 0..N-1 index. Rebinding the name (no inplace=True)
# avoids mutating any other variable that may reference the same frame.
test_preds = test_preds.reset_index(drop=True)
test_preds.head()

In [None]:
# Place the score columns beside the id columns, aligned on the index.
test_preds = pd.concat(objs=[test_preds, predictions], axis='columns')
test_preds.head()

In [None]:
from scipy.special import softmax

# Collapse the image-level scores in `test_preds` into one value per study for
# each exam-level label. Every list in `label_agg` receives exactly one entry
# per study id in `test_ids`.
label_agg = {key:[] for key in 
             ['id', 'negative_exam_for_pe', 'rv_lv_ratio_gte_1',
              'rv_lv_ratio_lt_1', 'leftsided_pe', 'chronic_pe',
              'rightsided_pe', 'acute_and_chronic_pe',
              'central_pe', 'indeterminate']
            }

# Group once instead of building a boolean mask over the whole frame per study.
exam_groups = test_preds.groupby('StudyInstanceUID', sort=False)

for uid in test_ids:
    temp = exam_groups.get_group(uid)
    label_agg['id'].append(uid)
    
    #Check for any image level presence of PE of high confidence
    positive = any(temp.pe_present_on_image >= 0.5) #50% threshold
    
    #Only one from positive, negative and indeterminate should have value>0.5
    #per exam
    if positive: 
        label_agg['indeterminate'].append(temp.indeterminate.min()/2)
        label_agg['negative_exam_for_pe'].append(0)
    else:
        if any(temp.indeterminate >= 0.5):
            # NOTE(review): this branch emits both indeterminate >= 0.5 and
            # negative_exam_for_pe = 1, which contradicts the rule stated
            # above; left unchanged pending confirmation of the intent.
            label_agg['indeterminate'].append(temp.indeterminate.max())
            label_agg['negative_exam_for_pe'].append(1)
        else:
            label_agg['indeterminate'].append(temp.indeterminate.min()/2)
            label_agg['negative_exam_for_pe'].append(1)
    
    #I decided that the total ratio should be equal to 1, so I used softmax
    #We modify the weights by multiplying the bigger by 2 and dividing the smaller by 2
    a, b = temp[['rv_lv_ratio_gte_1', 'rv_lv_ratio_lt_1']].mean().values
    if a > b:
        a, b = a*2, b/2
    elif a < b:
        a, b = a/2, b*2
    a, b = softmax([a, b])
    if positive:
        label_agg['rv_lv_ratio_gte_1'].append(a)
        label_agg['rv_lv_ratio_lt_1'].append(b)
    else:
        label_agg['rv_lv_ratio_gte_1'].append(a/2)
        label_agg['rv_lv_ratio_lt_1'].append(b/2)
    
    # The two sections below run for every exam. They used to sit inside the
    # `else` branch above, so positive exams got no values for these labels
    # and the lists in label_agg ended up with different lengths.

    #Next is for Chronic (C), Acute-Chronic (AC) and Acute (A) PE
    #We need to see if we got a high confidence value from either C or AC
    #If there is, we add it to a 50% based score for high confidence
    #and half weight for low confidence score
    if any(temp['acute_and_chronic_pe'] > 0.5): #50% confidence level
        label_agg['acute_and_chronic_pe'].append(0.5 + temp['acute_and_chronic_pe'].mean()/2)
        label_agg['chronic_pe'].append(temp['chronic_pe'].mean()/2)

    elif any(temp['chronic_pe'] > 0.5):
        label_agg['acute_and_chronic_pe'].append(temp['acute_and_chronic_pe'].mean()/2)
        label_agg['chronic_pe'].append(0.5 + temp['chronic_pe'].mean()/2)

    else: #Else, we set both to half values, as we declare the A as the value
        label_agg['acute_and_chronic_pe'].append(temp['acute_and_chronic_pe'].mean()/2)
        label_agg['chronic_pe'].append(temp['chronic_pe'].mean()/2)

    #for right, left, central, we use the same metric above
    for key in ['leftsided_pe', 'rightsided_pe', 'central_pe']:
        if positive:
            label_agg[key].append(0.5 + temp[key].mean()/2)
        else:
            label_agg[key].append(temp[key].mean()/2)

# Every label must have exactly one value per study.
assert len({len(values) for values in label_agg.values()}) == 1, "label_agg lists have unequal lengths"