# This file views metadata and saves malignancy labels to a numpy array
# It is organised as follows:
* view metadata as pandas dataframes
* do train/validation/test split, keeping nodules from same patient in same dataset (change models later to use this)
* find and omit slices with a bounding box that overlaps the image border
* find number of slices with each label
* save labels for malignant/beign and malignant/non-malignant

In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os

In [23]:
meta = pd.read_csv(r"C:\Users\mm17b2k\Documents\ARCANE\Python\MSc\Data\Meta\meta_info.csv")
meta = meta.drop("is_clean",axis=1)

In [24]:
x = 130
meta[x:x+10]

Unnamed: 0,patient_id,nodule_no,slice_no,original_image,mask_image,malignancy,is_cancer
130,11,2,1,0011_NI002_slice001,0011_MA002_slice001,2,False
131,11,3,1,0011_NI003_slice001,0011_MA003_slice001,1,False
132,11,4,1,0011_NI004_slice001,0011_MA004_slice001,2,False
133,11,4,2,0011_NI004_slice002,0011_MA004_slice002,2,False
134,11,5,0,0011_NI005_slice000,0011_MA005_slice000,1,False
135,11,5,1,0011_NI005_slice001,0011_MA005_slice001,1,False
136,11,6,1,0011_NI006_slice001,0011_MA006_slice001,1,False
137,11,6,2,0011_NI006_slice002,0011_MA006_slice002,1,False
138,11,7,0,0011_NI007_slice000,0011_MA007_slice000,2,False
139,11,7,1,0011_NI007_slice001,0011_MA007_slice001,2,False


# NI= Nodule Image, MA = Mask Original , CN = Clean Nodule , CM = Clean Mask

In [25]:
def is_nodule(row):
    if row[5:7] == 'NI':
        return True
    else:
        return False

In [26]:
meta['is_nodule'] = meta['original_image'].apply(lambda row: is_nodule(row))

In [27]:
meta

Unnamed: 0,patient_id,nodule_no,slice_no,original_image,mask_image,malignancy,is_cancer,is_nodule
0,1,0,0,0001_NI000_slice000,0001_MA000_slice000,5,True,True
1,1,0,1,0001_NI000_slice001,0001_MA000_slice001,5,True,True
2,1,0,2,0001_NI000_slice002,0001_MA000_slice002,5,True,True
3,1,0,3,0001_NI000_slice003,0001_MA000_slice003,5,True,True
4,1,0,4,0001_NI000_slice004,0001_MA000_slice004,5,True,True
...,...,...,...,...,...,...,...,...
13911,1011,3,0,1011_NI003_slice000,1011_MA003_slice000,2,False,True
13912,1011,3,1,1011_NI003_slice001,1011_MA003_slice001,2,False,True
13913,1012,0,0,1012_NI000_slice000,1012_MA000_slice000,2,False,True
13914,1012,0,1,1012_NI000_slice001,1012_MA000_slice001,2,False,True


In [28]:
print(np.sum(meta.is_nodule == False))

0


In [29]:
meta = meta[meta['is_nodule']==True]
meta.reset_index(inplace=True)

In [30]:
meta["patient_id"].nunique()

875

In [31]:
def is_train(row,train,val,test):
    if row in train:
        return 'Train'
    elif row in val:
        return 'Validation'
    else:
        return 'Test'

In [32]:
#clean_patient_id = list(np.unique(clean_meta['patient_id']))
meta_patient_id = list(np.unique(meta['patient_id']))
len(meta_patient_id)

875

In [33]:
def create_label_segmentation(meta):
    patient_id = list(np.unique(meta['patient_id']))
    train_patient , test_patient = train_test_split(patient_id,test_size= 0.2)
    train_patient, val_patient = train_test_split(train_patient,test_size= 0.25)
    print(len(train_patient),len(val_patient),len(test_patient))
    
    meta['data_split']= meta['patient_id'].apply(lambda row : is_train(row,train_patient,val_patient,test_patient))
    
    return meta

In [34]:
# We need to train/test split independently for clean_meta, meta
meta = create_label_segmentation(meta)
print('total_patients', len(meta_patient_id))

525 175 175
total_patients 875


# Clean Meta only stores meta information of patients without nodules.

In [35]:
meta.to_csv(r"C:\Users\mm17b2k\Documents\ARCANE\Python\MSc\Data\Meta\meta.csv")

In [36]:
meta

Unnamed: 0,index,patient_id,nodule_no,slice_no,original_image,mask_image,malignancy,is_cancer,is_nodule,data_split
0,0,1,0,0,0001_NI000_slice000,0001_MA000_slice000,5,True,True,Train
1,1,1,0,1,0001_NI000_slice001,0001_MA000_slice001,5,True,True,Train
2,2,1,0,2,0001_NI000_slice002,0001_MA000_slice002,5,True,True,Train
3,3,1,0,3,0001_NI000_slice003,0001_MA000_slice003,5,True,True,Train
4,4,1,0,4,0001_NI000_slice004,0001_MA000_slice004,5,True,True,Train
...,...,...,...,...,...,...,...,...,...,...
13911,13911,1011,3,0,1011_NI003_slice000,1011_MA003_slice000,2,False,True,Train
13912,13912,1011,3,1,1011_NI003_slice001,1011_MA003_slice001,2,False,True,Train
13913,13913,1012,0,0,1012_NI000_slice000,1012_MA000_slice000,2,False,True,Train
13914,13914,1012,0,1,1012_NI000_slice001,1012_MA000_slice001,2,False,True,Train


In [37]:
IMAGE_DIR = r"C:\Users\mm17b2k\Documents\ARCANE\Python\MSc\Data\Images"
all_files_list = [f[:-4] for f in os.listdir(IMAGE_DIR)]
all_files_list.sort()       
print(len(all_files_list))
print(all_files_list[0:10])

13852
['0001_NI000_slice000', '0001_NI000_slice001', '0001_NI000_slice002', '0001_NI000_slice003', '0001_NI000_slice004', '0001_NI000_slice005', '0001_NI000_slice006', '0001_NI000_slice007', '0002_NI000_slice000', '0002_NI000_slice001']


In [38]:
omitted_indices = []
for i, file_name in enumerate(meta["original_image"]):
    if file_name not in all_files_list:
        omitted_indices.append(i)
print(len(omitted_indices))
omitted_indices

64


[4035,
 4037,
 4038,
 5401,
 5402,
 5403,
 5819,
 5820,
 5821,
 5822,
 5823,
 5824,
 5825,
 5826,
 5827,
 5828,
 5829,
 5830,
 6200,
 6201,
 6202,
 6203,
 6204,
 6205,
 6206,
 6207,
 6208,
 6209,
 6210,
 6211,
 6212,
 7293,
 7322,
 7323,
 8548,
 8549,
 8550,
 8551,
 8552,
 8553,
 8554,
 8555,
 8715,
 8716,
 8717,
 8718,
 9377,
 9378,
 9379,
 9380,
 9381,
 9382,
 9383,
 9384,
 13040,
 13041,
 13042,
 13043,
 13055,
 13056,
 13570,
 13571,
 13572,
 13573]

In [39]:
np.mean(meta.is_cancer == 'True')

0.3771917217591262

In [40]:
len(meta)

13916

In [41]:
np.sum(meta.is_cancer == 'True')

5249

In [42]:
np.sum(meta.is_cancer == 'False')

3274

In [43]:
meta

Unnamed: 0,index,patient_id,nodule_no,slice_no,original_image,mask_image,malignancy,is_cancer,is_nodule,data_split
0,0,1,0,0,0001_NI000_slice000,0001_MA000_slice000,5,True,True,Train
1,1,1,0,1,0001_NI000_slice001,0001_MA000_slice001,5,True,True,Train
2,2,1,0,2,0001_NI000_slice002,0001_MA000_slice002,5,True,True,Train
3,3,1,0,3,0001_NI000_slice003,0001_MA000_slice003,5,True,True,Train
4,4,1,0,4,0001_NI000_slice004,0001_MA000_slice004,5,True,True,Train
...,...,...,...,...,...,...,...,...,...,...
13911,13911,1011,3,0,1011_NI003_slice000,1011_MA003_slice000,2,False,True,Train
13912,13912,1011,3,1,1011_NI003_slice001,1011_MA003_slice001,2,False,True,Train
13913,13913,1012,0,0,1012_NI000_slice000,1012_MA000_slice000,2,False,True,Train
13914,13914,1012,0,1,1012_NI000_slice001,1012_MA000_slice001,2,False,True,Train


In [69]:
len(data_3classes)

0

In [70]:
pat_ids = []
for item in data_3classes:
    if item[1] not in pat_ids:
        pat_ids.append(item[1])
#print(len(pat_ids))
len(pat_ids)

0

In [71]:
data_2classes[-10:]

[]

In [72]:
patient_ids = list(meta['patient_id'])
len(patient_ids)

data_2classes = []
data_3classes = []
for i in range(13916):
    if i not in omitted_indices:
        data_2classes.append([i, patient_ids[i], labels2[i]])
        data_3classes.append([i, patient_ids[i], labels[i]])



np.save(r"C:\Users\mm17b2k\Documents\ARCANE\Python\MSc\LIDC\Python Scripts\latent vectors" + '/' + 'data_2classes', data_2classes)
print(data_2classes[0:10])

np.save(r"C:\Users\mm17b2k\Documents\ARCANE\Python\MSc\LIDC\Python Scripts\latent vectors" + '/' + 'data_3classes', data_3classes)
print(data_3classes[0:10])

len(data_2classes)

IndexError: list index out of range

In [73]:
meta.is_cancer.unique()

array(['True', 'False', 'Ambiguous'], dtype=object)

In [74]:
meta = meta[meta.original_image.isin(all_files_list)]
meta

Unnamed: 0,index,patient_id,nodule_no,slice_no,original_image,mask_image,malignancy,is_cancer,is_nodule,data_split
0,0,1,0,0,0001_NI000_slice000,0001_MA000_slice000,5,True,True,Train
1,1,1,0,1,0001_NI000_slice001,0001_MA000_slice001,5,True,True,Train
2,2,1,0,2,0001_NI000_slice002,0001_MA000_slice002,5,True,True,Train
3,3,1,0,3,0001_NI000_slice003,0001_MA000_slice003,5,True,True,Train
4,4,1,0,4,0001_NI000_slice004,0001_MA000_slice004,5,True,True,Train
...,...,...,...,...,...,...,...,...,...,...
13911,13911,1011,3,0,1011_NI003_slice000,1011_MA003_slice000,2,False,True,Train
13912,13912,1011,3,1,1011_NI003_slice001,1011_MA003_slice001,2,False,True,Train
13913,13913,1012,0,0,1012_NI000_slice000,1012_MA000_slice000,2,False,True,Train
13914,13914,1012,0,1,1012_NI000_slice001,1012_MA000_slice001,2,False,True,Train


In [75]:
meta_cancer = meta[meta['is_cancer']=='True']
meta_cancer

Unnamed: 0,index,patient_id,nodule_no,slice_no,original_image,mask_image,malignancy,is_cancer,is_nodule,data_split
0,0,1,0,0,0001_NI000_slice000,0001_MA000_slice000,5,True,True,Train
1,1,1,0,1,0001_NI000_slice001,0001_MA000_slice001,5,True,True,Train
2,2,1,0,2,0001_NI000_slice002,0001_MA000_slice002,5,True,True,Train
3,3,1,0,3,0001_NI000_slice003,0001_MA000_slice003,5,True,True,Train
4,4,1,0,4,0001_NI000_slice004,0001_MA000_slice004,5,True,True,Train
...,...,...,...,...,...,...,...,...,...,...
13906,13906,1011,2,0,1011_NI002_slice000,1011_MA002_slice000,5,True,True,Train
13907,13907,1011,2,1,1011_NI002_slice001,1011_MA002_slice001,5,True,True,Train
13908,13908,1011,2,2,1011_NI002_slice002,1011_MA002_slice002,5,True,True,Train
13909,13909,1011,2,3,1011_NI002_slice003,1011_MA002_slice003,5,True,True,Train


In [76]:
print(np.sum(meta_cancer.is_cancer == 'True'))  # 5249
meta_cancer["patient_id"].nunique()             # 394

5230


394

In [77]:
meta_noncancer = meta[meta['is_cancer']=='False'] 
meta_noncancer

Unnamed: 0,index,patient_id,nodule_no,slice_no,original_image,mask_image,malignancy,is_cancer,is_nodule,data_split
36,36,3,0,0,0003_NI000_slice000,0003_MA000_slice000,2,False,True,Train
37,37,3,0,1,0003_NI000_slice001,0003_MA000_slice001,2,False,True,Train
38,38,3,0,2,0003_NI000_slice002,0003_MA000_slice002,2,False,True,Train
39,39,3,0,3,0003_NI000_slice003,0003_MA000_slice003,2,False,True,Train
40,40,3,0,4,0003_NI000_slice004,0003_MA000_slice004,2,False,True,Train
...,...,...,...,...,...,...,...,...,...,...
13911,13911,1011,3,0,1011_NI003_slice000,1011_MA003_slice000,2,False,True,Train
13912,13912,1011,3,1,1011_NI003_slice001,1011_MA003_slice001,2,False,True,Train
13913,13913,1012,0,0,1012_NI000_slice000,1012_MA000_slice000,2,False,True,Train
13914,13914,1012,0,1,1012_NI000_slice001,1012_MA000_slice001,2,False,True,Train


In [78]:
print(np.sum(meta_noncancer.is_cancer == 'False')) # 3274
meta_noncancer["patient_id"].nunique()             # 454

3264


454

In [79]:
meta_ambig = meta[meta['is_cancer']=='Ambiguous']
meta_ambig

Unnamed: 0,index,patient_id,nodule_no,slice_no,original_image,mask_image,malignancy,is_cancer,is_nodule,data_split
64,64,5,0,0,0005_NI000_slice000,0005_MA000_slice000,3,Ambiguous,True,Train
65,65,5,0,1,0005_NI000_slice001,0005_MA000_slice001,3,Ambiguous,True,Train
66,66,5,1,0,0005_NI001_slice000,0005_MA001_slice000,3,Ambiguous,True,Train
67,67,5,1,1,0005_NI001_slice001,0005_MA001_slice001,3,Ambiguous,True,Train
68,68,5,1,2,0005_NI001_slice002,0005_MA001_slice002,3,Ambiguous,True,Train
...,...,...,...,...,...,...,...,...,...,...
13863,13863,1008,1,1,1008_NI001_slice001,1008_MA001_slice001,3,Ambiguous,True,Train
13864,13864,1008,1,2,1008_NI001_slice002,1008_MA001_slice002,3,Ambiguous,True,Train
13870,13870,1008,4,0,1008_NI004_slice000,1008_MA004_slice000,3,Ambiguous,True,Train
13871,13871,1008,4,1,1008_NI004_slice001,1008_MA004_slice001,3,Ambiguous,True,Train


In [80]:
print(np.sum(meta_ambig.is_cancer == 'Ambiguous')) # 5393
meta_ambig["patient_id"].nunique()                 # 580

5358


579

In [81]:
cancer = list(meta_cancer['index'])
non_cancer = list(meta_noncancer['index'])
ambiguous = list(meta_ambig['index'])

In [82]:
mus = np.load(r"C:\Users\mm17b2k\Documents\ARCANE\Python\MSc\LIDC\Python Scripts\latent vectors\mu.npy")
len(mus)

13852

In [83]:
len(cancer+non_cancer+ambiguous)

13852

In [84]:
labels = []
for i in range(13916):
    if i in cancer:
        labels.append(2) #'cancer'
    if i in ambiguous:
        labels.append(1) #'ambiguous'
    if i in non_cancer:
        labels.append(0) #'non_cancer'
len(labels)

13852

In [85]:
np.save(r"C:\Users\mm17b2k\Documents\ARCANE\Python\MSc\LIDC\Python Scripts\latent vectors" + '/' + 'labels', labels)

In [86]:
labels2 = []
for i in range(13916):
    if i in cancer:
        labels2.append(1) #'cancer'
    if i in ambiguous:
        labels2.append(0) #'ambiguous'
    if i in non_cancer:
        labels2.append(0) #'non_cancer'
len(labels2)

13852

In [87]:
np.save(r"C:\Users\mm17b2k\Documents\ARCANE\Python\MSc\LIDC\Python Scripts\latent vectors" + '/' + 'labels2', labels2)

In [88]:
patient_ids = list(meta['patient_id'])
len(patient_ids)

13852

In [89]:
len(np.unique(patient_ids))

875

In [90]:
labels3 = []
for i in range(13916):
    if i in cancer:
        labels3.append(1) #'cancer'
    if i in non_cancer:
        labels3.append(0) #'non_cancer'

np.save(r"C:\Users\mm17b2k\Documents\ARCANE\Python\MSc\LIDC\Python Scripts\latent vectors" + '/' + 'labels3', labels3)

len(labels3)

8494

In [91]:
meta_new = meta.reset_index()
meta_new

Unnamed: 0,level_0,index,patient_id,nodule_no,slice_no,original_image,mask_image,malignancy,is_cancer,is_nodule,data_split
0,0,0,1,0,0,0001_NI000_slice000,0001_MA000_slice000,5,True,True,Train
1,1,1,1,0,1,0001_NI000_slice001,0001_MA000_slice001,5,True,True,Train
2,2,2,1,0,2,0001_NI000_slice002,0001_MA000_slice002,5,True,True,Train
3,3,3,1,0,3,0001_NI000_slice003,0001_MA000_slice003,5,True,True,Train
4,4,4,1,0,4,0001_NI000_slice004,0001_MA000_slice004,5,True,True,Train
...,...,...,...,...,...,...,...,...,...,...,...
13847,13911,13911,1011,3,0,1011_NI003_slice000,1011_MA003_slice000,2,False,True,Train
13848,13912,13912,1011,3,1,1011_NI003_slice001,1011_MA003_slice001,2,False,True,Train
13849,13913,13913,1012,0,0,1012_NI000_slice000,1012_MA000_slice000,2,False,True,Train
13850,13914,13914,1012,0,1,1012_NI000_slice001,1012_MA000_slice001,2,False,True,Train


In [92]:
meta_ambig = meta_new[meta_new['is_cancer']=='Ambiguous']
meta_ambig

Unnamed: 0,level_0,index,patient_id,nodule_no,slice_no,original_image,mask_image,malignancy,is_cancer,is_nodule,data_split
64,64,64,5,0,0,0005_NI000_slice000,0005_MA000_slice000,3,Ambiguous,True,Train
65,65,65,5,0,1,0005_NI000_slice001,0005_MA000_slice001,3,Ambiguous,True,Train
66,66,66,5,1,0,0005_NI001_slice000,0005_MA001_slice000,3,Ambiguous,True,Train
67,67,67,5,1,1,0005_NI001_slice001,0005_MA001_slice001,3,Ambiguous,True,Train
68,68,68,5,1,2,0005_NI001_slice002,0005_MA001_slice002,3,Ambiguous,True,Train
...,...,...,...,...,...,...,...,...,...,...,...
13799,13863,13863,1008,1,1,1008_NI001_slice001,1008_MA001_slice001,3,Ambiguous,True,Train
13800,13864,13864,1008,1,2,1008_NI001_slice002,1008_MA001_slice002,3,Ambiguous,True,Train
13806,13870,13870,1008,4,0,1008_NI004_slice000,1008_MA004_slice000,3,Ambiguous,True,Train
13807,13871,13871,1008,4,1,1008_NI004_slice001,1008_MA004_slice001,3,Ambiguous,True,Train


In [93]:
ambiguous = meta_ambig.index.values.tolist()
np.save(r"C:\Users\mm17b2k\Documents\ARCANE\Python\MSc\LIDC\Python Scripts\latent vectors" + '/' + 'ambiguous', ambiguous)
ambiguous[-10:]

[13774, 13795, 13796, 13797, 13798, 13799, 13800, 13806, 13807, 13808]