In [40]:
import numpy as np
import pandas as pd
import os
import nibabel as nib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot, plot
from tqdm.notebook import tqdm
from pathlib import Path
from collections import Counter

tqdm.pandas()

## Vertebal Column

The spine is made up of bones, muscles, tendons, nerves, and other tissues that reach from the base of the skull near the spinal cord (clivus) to the coccyx (tailbone). The vertebrae (back bones) of the spine include the cervical spine (C1-C7), thoracic spine (T1-T12), lumbar spine (L1-L5), sacral spine (S1-S5), and the tailbone. Each vertebra is separated by a disc. The vertebrae surround and protect the spinal cord.

In [45]:
from IPython.display import display, HTML
display(HTML("<center><img src='https://upload.wikimedia.org/wikipedia/commons/thumb/5/54/Gray_111_-_Vertebral_column-coloured.png/174px-Gray_111_-_Vertebral_column-coloured.png'></center>"))


## Lumbar Vertebra
The lumbar vertebrae are, in human anatomy, the five vertebrae between the rib cage and the pelvis. They are the largest segments of the vertebral column and are characterized by the absence of the foramen transversarium within the transverse process (since it is only found in the cervical region) and by the absence of facets on the sides of the body (as found only in the thoracic region). They are designated L1 to L5, starting at the top. The lumbar vertebrae help support the weight of the body, and permit movement.

## Tests and Imaging
Computed tomography (CT) scan: This scan uses X-rays and computers to produce images that are very thin “slices” of the area under examination. A CT scan can show the shape and size of your spinal canal, its contents and the bone around it. It helps diagnose bone spurs, osteophytes, bone fusion and bone destruction from infection or tumor.

## Dataset - VerSe
Spine or vertebral segmentation is a crucial step in all applications regarding automated quantification of spinal morphology and pathology. With the advent of deep learning, for such a task on computed tomography (CT) scans, a big and varied data is a primary sought-after resource. However, a large-scale, public dataset is currently unavailable.

We believe VerSe can help here. VerSe is a large scale, multi-detector, multi-site, CT spine dataset consisting of 374 scans from 355 patients. The challenge was held in two iterations in conjunction with MICCAI 2019 and 2020. The tasks evaluated for include: vertebral labelling and segmentation.

In [47]:
from IPython.display import display, HTML
display(HTML("<center><img src='https://github.com/anjany/verse/blob/main/assets/dataset_snapshot.png?raw=true'></center>"))


## Uderstanding Derivatives
1) seg-subreg-ctd.json
2) seg-subreg-mask.nii.gz
3) seg-total-mask.nii.gz
4) seg-vertsac-ctd.json
5) seg-vertsac-mask.nii.gz

In [48]:
v_idx2name = {
     1: "C1",     2: "C2",     3: "C3",     4: "C4",     5: "C5",     6: "C6",     7: "C7", 
     8: "T1",     9: "T2",    10: "T3",    11: "T4",    12: "T5",    13: "T6",    14: "T7",    15: "T8",    16: "T9",    17: "T10",   18: "T11",   19: "T12", 28: "T13",
    20: "L1",    21: "L2",    22: "L3",    23: "L4",    24: "L5",    25: "L6",    
    26: "S1",    29: "S2",    30: "S3",    31: "S4",    32: "S5",    33: "S6",
    27: "Cocc",
}

In [81]:
master_df = pd.read_excel('VerSe_masterlist.xlsx')
master_df
master_df = master_df[['Id', 'Full_Id', 'Castellvi', '2a/3a Side',	'Sacrum Seg', 'Last_L']]

In [82]:
master_df.head()

Unnamed: 0,Id,Full_Id,Castellvi,2a/3a Side,Sacrum Seg,Last_L
0,4,sub-verse004,0.0,,1.0,L5
1,5,sub-verse005,0.0,,1.0,L5
2,6,sub-verse006,0.0,,1.0,L5
3,7,,,,,
4,8,sub-verse008,0.0,,1.0,L5


In [85]:
print('missing image derivatives: ', master_df.Full_Id.isna().sum())

missing image derivatives:  40


In [86]:
master_df = master_df.dropna(subset=['Full_Id'])

In [87]:
master_df

Unnamed: 0,Id,Full_Id,Castellvi,2a/3a Side,Sacrum Seg,Last_L
0,4,sub-verse004,0,,1.0,L5
1,5,sub-verse005,0,,1.0,L5
2,6,sub-verse006,0,,1.0,L5
4,8,sub-verse008,0,,1.0,L5
5,9,sub-verse009,0,,1.0,L5
...,...,...,...,...,...,...
320,824,sub-verse824,0,,1.0,L5
321,825,sub-verse825,0,,1.0,L5
322,826,sub-verse826,0,,1.0,L5
323,833,sub-verse833,2b,,1.0,L5


In [88]:
verse19 = '../dataset-verse19/'
verse20 = '../dataset-verse20/'
def get_full_idx(root):
    root = root + 'derivatives/'    
    full_idx = os.listdir(root)
    return full_idx

dataset_19 = get_full_idx(verse19)
dataset_20 = get_full_idx(verse20)
full_idx = get_full_idx(verse19) + get_full_idx(verse20)


In [89]:
print('dataset size:', len(full_idx))

dataset size: 323


In [100]:
def find_missing_derivatives(list1, list2):
    # Find elements present in list1 but not in list2
    missing_in_list2 = [item for item in list1 if item not in list2]

    # Find elements present in list2 but not in list1
    missing_in_list1 = [item for item in list2 if item not in list1]

    return missing_in_list1, missing_in_list2

In [103]:
list1 = list(master_df['Full_Id'])
list2 = full_idx

missing_in_list1, missing_in_list2 = find_missing_derivatives(list1, list2)
print("Missing in list1:", len(missing_in_list1))
print("Missing in list2:", len(missing_in_list2))


Missing in list1: 56
Missing in list2: 18


In [105]:
missing_in_list1

['sub-verse410',
 'sub-verse205',
 'sub-verse105',
 'sub-verse081',
 'sub-verse403',
 'sub-verse221',
 'sub-verse404',
 'sub-verse007',
 'sub-verse412',
 'sub-verse401',
 'sub-verse402',
 'sub-verse405',
 'sub-verse400',
 'sub-verse112',
 'sub-verse415',
 'sub-verse413',
 'sub-verse075',
 'sub-verse018',
 'sub-verse012',
 'sub-verse416',
 'sub-verse119',
 'sub-verse125',
 'sub-verse242',
 'sub-verse217',
 'sub-verse207',
 'sub-verse225',
 'sub-verse411',
 'sub-verse409',
 'sub-verse407',
 'sub-verse100',
 'sub-verse406',
 'sub-verse408',
 'sub-verse011',
 'sub-verse150',
 'sub-verse414',
 'sub-verse059',
 'sub-verse250',
 'sub-verse417',
 'sub-verse230',
 'sub-verse643',
 'sub-verse814',
 'sub-verse645',
 'sub-verse650',
 'sub-verse803',
 'sub-verse641',
 'sub-verse647',
 'sub-verse767',
 'sub-verse757',
 'sub-verse710',
 'sub-verse596',
 'sub-verse544',
 'sub-verse648',
 'sub-verse764',
 'sub-verse651',
 'sub-verse640',
 'sub-verse754']

In [106]:
missing_in_list2

['sub-verse400_split-verse155',
 'sub-verse401_split-verse253',
 'sub-verse402_split-verse251',
 'sub-verse403_split-verse255',
 'sub-verse404_split-verse256',
 'sub-verse405_split-verse259',
 'sub-verse406_split-verse261',
 'sub-verse407_split-verse262',
 'sub-verse408_split-verse265',
 'sub-verse409_split-verse266',
 'sub-verse410_split-verse267',
 'sub-verse411_split-verse270',
 'sub-verse412_split-verse290',
 'sub-verse413_split-verse272',
 'sub-verse414_split-verse273',
 'sub-verse415_split-verse275',
 'sub-verse416_split-verse279',
 'sub-verse417_split-verse278']

In [116]:
# Uncomment to make it optimized

# start_of_20 = sorted(dataset_20)
# idx_arr = start_of_20[0].split('verse')
# first_idx_20 = idx_arr[-1]
# print(first_idx_20)

In [115]:
for i in master_df.Full_Id:
    # TODO : Ask Hendrick about those missing images. 

    # idx_arr = i.split('verse')
    # idx = idx_arr[-1]
    # print(i)
    
    if i in dataset_19:
        continue
    elif i in dataset_20:
        continue
    else:
        raise Exception('Missing images!!')
        

Exception: Missing images!!