# XML Feature Extraction - 2D and 3D

#### Note: run inside an environment with numpy==1.20.3 and pylidc==0.2.2

In [1]:
import os
import six
import pickle
import pandas as pd
import SimpleITK as sitk
from functools import reduce
import numpy as np
from scipy.stats import mode
from skimage.measure import find_contours
from skimage import io
import pydicom
import pylidc as pl
from pylidc.utils import consensus
import matplotlib.pyplot as plt

#### std_limit = 0.5

In [5]:
# Extract the radiologist-assigned nodule characteristics from the LIDC-IDRI
# annotations (via pylidc), keep only the nodules on which the annotators agree
# (per-feature standard deviation <= std_limit), save a 50%-consensus mask for
# each kept nodule and write one CSV row per kept nodule.

# Path where the images are stored (one sub-folder per patient)
input_directory = "/home/cmonteiro/Aulas/Laboratório de IA e CD/Projeto 1/Imagens/LIDC-IDRI"

# Path to the setup file from radiomics (not used by this cell)
params_file = "/home/cmonteiro/pyradiomics-master/pyradiomics-master/examples/exampleSettings/Params.yaml"

# Output directory for the consensus masks produced with this threshold
path = "/home/cmonteiro/Aulas/Laboratório de IA e CD/Projeto 1/std_limit_0.5"

# np.save does not create missing directories; make sure the target exists
# before spending time on the dataset.
os.makedirs(path, exist_ok=True)

# The CSV used to be written to `external_drive_path`, which this cell never
# defines. Keep honouring it when an earlier cell did define it, otherwise fall
# back to the mask directory instead of failing with NameError at the very end.
try:
    csv_directory = external_drive_path
except NameError:
    csv_directory = path

# Ordered list of all the patient sub-folders
patient_folders = sorted(os.listdir(input_directory))

# Running counter over every clustered nodule (kept or rejected), so the ids
# stay comparable between runs with different thresholds.
nodule_id = 0

# Column-oriented accumulator that becomes the output DataFrame
nodule_data = {
    'Nodule_id': [],
    'Patient_id': [],
    'Subtlety': [],
    'Internalstructure': [],
    'Calcification': [],
    'Sphericity': [],
    'Margin': [],
    'Lobulation': [],
    'Spiculation': [],
    'Texture': [],
    'Malignancy': []
}

# Human-readable labels for the 1-5 malignancy rating
malignancy_names = {
    1: "1-Highly Unlikely",
    2: "2-Moderately Unlikely",
    3: "3-Indeterminate",
    4: "4-Moderately Suspicious",
    5: "5-Highly Suspicious"
}

# Maximum standard deviation tolerated between annotators for any feature
std_limit = 0.5

for patient_folder in patient_folders:
    patient_folder_path = os.path.join(input_directory, patient_folder)

    # Id of the patient - 'LIDC-IDRI-xxxx'
    patient_id = os.path.basename(patient_folder_path)
    print(patient_id)

    # Only scans that have at least one annotation
    patient_scans = (
        pl.query(pl.Scan)
        .filter(pl.Scan.patient_id == patient_id, pl.Scan.annotations.any())
        .all()
    )

    for scan in patient_scans:
        # Each element groups the annotations believed to mark the same nodule
        nods = scan.cluster_annotations()

        for anns in nods:
            nodule_id += 1

            # One vector of ratings per annotation of this nodule.
            # NOTE(review): assumes feature_vals() yields the nine ratings in
            # the order used for the columns below - confirm against pylidc.
            nodule_characteristics = [ann.feature_vals() for ann in anns]

            # Without any ratings the nodule is not included
            if not nodule_characteristics:
                continue

            # Standard deviation of each feature across the annotators
            std_deviations = np.std(np.asarray(nodule_characteristics), axis=0)

            # Reject the nodule when the annotators disagree too much on any feature
            if np.any(std_deviations > std_limit):
                print(f"Nodule ID {nodule_id}")
                continue

            # 50% consensus mask. Computed only for nodules that passed the
            # filter, since rejected nodules never use it.
            cmask, cbbox, masks = consensus(anns, clevel=0.5, pad=[(20, 20), (20, 20), (0, 0)])

            # Save the consensus mask so pyradiomics can use it later.
            # The full scan volume (anns[0].scan.to_volume()) is deliberately
            # not saved: there is no storage available for it.
            filename = os.path.join(path, f"{nodule_id}_cmask.npy")
            np.save(filename, cmask)

            # NOTE(review): the row uses the FIRST annotator's ratings rather
            # than an aggregate (mean/mode) of all annotators - confirm intent.
            first_rating = nodule_characteristics[0]
            feature_values = {
                'Nodule_id': nodule_id,
                'Patient_id': patient_id,
                'Subtlety': first_rating[0],
                'Internalstructure': first_rating[1],
                'Calcification': first_rating[2],
                'Sphericity': first_rating[3],
                'Margin': first_rating[4],
                'Lobulation': first_rating[5],
                'Spiculation': first_rating[6],
                'Texture': first_rating[7],
                'Malignancy': malignancy_names.get(first_rating[8])
            }

            # Append the feature values to the nodule_data dictionary
            for key, value in feature_values.items():
                nodule_data[key].append(value)

features = pd.DataFrame(nodule_data)
csv_filename = os.path.join(csv_directory, 'features_pylidc_0.5.csv')
features.to_csv(csv_filename, index=False)

LIDC-IDRI-0001
Nodule ID 1
LIDC-IDRI-0002
Nodule ID 2
LIDC-IDRI-0003
Nodule ID 4
Nodule ID 5
Nodule ID 6
LIDC-IDRI-0004
Nodule ID 7
LIDC-IDRI-0005
Nodule ID 8
Nodule ID 9
LIDC-IDRI-0006
Nodule ID 12
Nodule ID 14
LIDC-IDRI-0007
Nodule ID 15
LIDC-IDRI-0008
Nodule ID 17
Nodule ID 18
LIDC-IDRI-0009
LIDC-IDRI-0010
Nodule ID 22
Nodule ID 23
LIDC-IDRI-0011
Nodule ID 24
Nodule ID 25
Nodule ID 29
Nodule ID 33
LIDC-IDRI-0012
Nodule ID 35
Nodule ID 36
Nodule ID 37
Nodule ID 39
Nodule ID 40
Nodule ID 42
Nodule ID 43
Nodule ID 44
LIDC-IDRI-0013
Nodule ID 46
Nodule ID 47
Nodule ID 48
LIDC-IDRI-0014
Nodule ID 49
LIDC-IDRI-0015
Nodule ID 50
LIDC-IDRI-0016
Nodule ID 51
Nodule ID 52
Nodule ID 54
Nodule ID 55
Nodule ID 56
LIDC-IDRI-0017
Nodule ID 57
LIDC-IDRI-0018
Nodule ID 59
Nodule ID 60
Nodule ID 61
LIDC-IDRI-0019
Nodule ID 62
LIDC-IDRI-0020
Nodule ID 64
LIDC-IDRI-0021
Nodule ID 67
LIDC-IDRI-0022
LIDC-IDRI-0023
Nodule ID 69
LIDC-IDRI-0024
Nodule ID 70
Nodule ID 71
Nodule ID 72
Nodule ID 73
LIDC-IDRI-0

Nodule ID 554
LIDC-IDRI-0183
Nodule ID 555
Nodule ID 556
LIDC-IDRI-0184
Nodule ID 557
LIDC-IDRI-0185
Nodule ID 558
Nodule ID 559
Nodule ID 560
Nodule ID 561
Nodule ID 562
Nodule ID 563
LIDC-IDRI-0186
Nodule ID 564
Nodule ID 566
Nodule ID 568
LIDC-IDRI-0187
Nodule ID 570
Nodule ID 572
LIDC-IDRI-0188
Nodule ID 576
Nodule ID 577
Nodule ID 578
Nodule ID 579
Nodule ID 580
Nodule ID 581
LIDC-IDRI-0190
Nodule ID 583
Nodule ID 584
LIDC-IDRI-0191
Nodule ID 585
LIDC-IDRI-0192
Nodule ID 588
LIDC-IDRI-0193
Nodule ID 591
LIDC-IDRI-0194
Nodule ID 592
Nodule ID 593
Nodule ID 594
LIDC-IDRI-0195
Nodule ID 595
Nodule ID 596
Nodule ID 597
Nodule ID 598
Nodule ID 599
Nodule ID 600
Nodule ID 601
LIDC-IDRI-0196
Nodule ID 602
LIDC-IDRI-0198
Nodule ID 603
LIDC-IDRI-0199
Nodule ID 605
Nodule ID 607
LIDC-IDRI-0200
Nodule ID 609
LIDC-IDRI-0201
Nodule ID 611
Nodule ID 613
LIDC-IDRI-0202
Nodule ID 618
LIDC-IDRI-0203
Nodule ID 619
Nodule ID 620
LIDC-IDRI-0204
Failed to reduce all groups to <= 4 Annotations.
Some no

Nodule ID 1057
LIDC-IDRI-0404
Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.
Nodule ID 1058
Nodule ID 1059
Nodule ID 1060
Nodule ID 1061
Nodule ID 1062
Nodule ID 1063
LIDC-IDRI-0405
Nodule ID 1064
Nodule ID 1065
LIDC-IDRI-0406
Nodule ID 1069
Nodule ID 1070
LIDC-IDRI-0407
Nodule ID 1071
Nodule ID 1072
Nodule ID 1073
LIDC-IDRI-0408
Nodule ID 1075
Nodule ID 1076
LIDC-IDRI-0409
Nodule ID 1078
Nodule ID 1079
Nodule ID 1081
LIDC-IDRI-0411
Nodule ID 1085
Nodule ID 1086
LIDC-IDRI-0412
Nodule ID 1087
Nodule ID 1088
Nodule ID 1089
LIDC-IDRI-0413
Nodule ID 1090
Nodule ID 1091
LIDC-IDRI-0414
Nodule ID 1093
LIDC-IDRI-0415
Nodule ID 1094
Nodule ID 1096
Nodule ID 1097
Nodule ID 1098
Nodule ID 1099
Nodule ID 1100
Nodule ID 1101
LIDC-IDRI-0416
Nodule ID 1102
Nodule ID 1103
LIDC-IDRI-0419
Nodule ID 1104
Nodule ID 1105
LIDC-IDRI-0420
Nodule ID 1106
Nodule ID 1107
LIDC-IDRI-0421
Nodule ID 1108
Nodule ID 1109
LIDC-IDRI-0423
Nodule ID 1110
LIDC-IDRI-

Nodule ID 1615
Nodule ID 1616
Nodule ID 1617
LIDC-IDRI-0617
Nodule ID 1619
Nodule ID 1620
LIDC-IDRI-0618
Nodule ID 1621
LIDC-IDRI-0619
Nodule ID 1622
Nodule ID 1623
LIDC-IDRI-0620
Nodule ID 1624
Nodule ID 1625
Nodule ID 1626
Nodule ID 1627
LIDC-IDRI-0621
LIDC-IDRI-0624
Nodule ID 1632
Nodule ID 1633
LIDC-IDRI-0625
Nodule ID 1638
LIDC-IDRI-0626
Nodule ID 1640
LIDC-IDRI-0628
Nodule ID 1644
Nodule ID 1646
Nodule ID 1648
LIDC-IDRI-0629
LIDC-IDRI-0630
LIDC-IDRI-0631
Nodule ID 1652
LIDC-IDRI-0633
Nodule ID 1654
LIDC-IDRI-0634
Nodule ID 1656
LIDC-IDRI-0635
Nodule ID 1657
LIDC-IDRI-0636
Nodule ID 1660
Nodule ID 1661
Nodule ID 1662
LIDC-IDRI-0637
Nodule ID 1663
Nodule ID 1664
Nodule ID 1666
Nodule ID 1668
Nodule ID 1669
Nodule ID 1670
Nodule ID 1672
Nodule ID 1673
LIDC-IDRI-0638
Nodule ID 1674
LIDC-IDRI-0639
Nodule ID 1676
Nodule ID 1677
LIDC-IDRI-0640
Nodule ID 1678
Nodule ID 1679
LIDC-IDRI-0641
Nodule ID 1681
Nodule ID 1683
Nodule ID 1684
Nodule ID 1685
LIDC-IDRI-0642
Nodule ID 1688
Nodule ID 

Nodule ID 2219
Nodule ID 2220
Nodule ID 2221
Nodule ID 2222
Nodule ID 2223
Nodule ID 2225
LIDC-IDRI-0844
Nodule ID 2226
Nodule ID 2227
Nodule ID 2228
LIDC-IDRI-0845
Nodule ID 2229
LIDC-IDRI-0846
Nodule ID 2231
Nodule ID 2232
LIDC-IDRI-0847
LIDC-IDRI-0848
Nodule ID 2234
LIDC-IDRI-0849
Nodule ID 2236
Nodule ID 2237
Nodule ID 2239
LIDC-IDRI-0850
Nodule ID 2242
Nodule ID 2244
LIDC-IDRI-0851
Nodule ID 2245
Nodule ID 2246
LIDC-IDRI-0852
Nodule ID 2249
Nodule ID 2250
LIDC-IDRI-0854
Nodule ID 2252
Nodule ID 2253
Nodule ID 2254
LIDC-IDRI-0855
Nodule ID 2257
Nodule ID 2258
Nodule ID 2259
Nodule ID 2261
Nodule ID 2262
Nodule ID 2263
Nodule ID 2264
Nodule ID 2265
Nodule ID 2266
Nodule ID 2268
Nodule ID 2270
Nodule ID 2271
Nodule ID 2272
LIDC-IDRI-0856
LIDC-IDRI-0857
Nodule ID 2274
LIDC-IDRI-0858
Nodule ID 2278
Nodule ID 2279
Nodule ID 2280
Nodule ID 2282
Nodule ID 2286
Nodule ID 2288
LIDC-IDRI-0859
Nodule ID 2289
LIDC-IDRI-0860
Nodule ID 2290
Nodule ID 2291
LIDC-IDRI-0861
Nodule ID 2292
Nodule ID 

#### std_limit = 1.0

In [8]:
# Extract the radiologist-assigned nodule characteristics from the LIDC-IDRI
# annotations (via pylidc), keep only the nodules on which the annotators agree
# (per-feature standard deviation <= std_limit), save a 50%-consensus mask for
# each kept nodule and write one CSV row per kept nodule.

# Path where the images are stored (one sub-folder per patient)
input_directory = "/home/cmonteiro/Aulas/Laboratório de IA e CD/Projeto 1/Imagens/LIDC-IDRI"

# Path to the setup file from radiomics (not used by this cell)
params_file = "/home/cmonteiro/pyradiomics-master/pyradiomics-master/examples/exampleSettings/Params.yaml"

# Output directory for the consensus masks produced with this threshold
path = "/home/cmonteiro/Aulas/Laboratório de IA e CD/Projeto 1/std_limit_1.0"

# np.save does not create missing directories; make sure the target exists
# before spending time on the dataset.
os.makedirs(path, exist_ok=True)

# The CSV used to be written to `external_drive_path`, which this cell never
# defines. Keep honouring it when an earlier cell did define it, otherwise fall
# back to the mask directory instead of failing with NameError at the very end.
try:
    csv_directory = external_drive_path
except NameError:
    csv_directory = path

# Ordered list of all the patient sub-folders
patient_folders = sorted(os.listdir(input_directory))

# Running counter over every clustered nodule (kept or rejected), so the ids
# stay comparable between runs with different thresholds.
nodule_id = 0

# Column-oriented accumulator that becomes the output DataFrame
nodule_data = {
    'Nodule_id': [],
    'Patient_id': [],
    'Subtlety': [],
    'Internalstructure': [],
    'Calcification': [],
    'Sphericity': [],
    'Margin': [],
    'Lobulation': [],
    'Spiculation': [],
    'Texture': [],
    'Malignancy': []
}

# Human-readable labels for the 1-5 malignancy rating
malignancy_names = {
    1: "1-Highly Unlikely",
    2: "2-Moderately Unlikely",
    3: "3-Indeterminate",
    4: "4-Moderately Suspicious",
    5: "5-Highly Suspicious"
}

# Maximum standard deviation tolerated between annotators for any feature
std_limit = 1.0

for patient_folder in patient_folders:
    patient_folder_path = os.path.join(input_directory, patient_folder)

    # Id of the patient - 'LIDC-IDRI-xxxx'
    patient_id = os.path.basename(patient_folder_path)
    print(patient_id)

    # Only scans that have at least one annotation
    patient_scans = (
        pl.query(pl.Scan)
        .filter(pl.Scan.patient_id == patient_id, pl.Scan.annotations.any())
        .all()
    )

    for scan in patient_scans:
        # Each element groups the annotations believed to mark the same nodule
        nods = scan.cluster_annotations()

        for anns in nods:
            nodule_id += 1

            # One vector of ratings per annotation of this nodule.
            # NOTE(review): assumes feature_vals() yields the nine ratings in
            # the order used for the columns below - confirm against pylidc.
            nodule_characteristics = [ann.feature_vals() for ann in anns]

            # Without any ratings the nodule is not included
            if not nodule_characteristics:
                continue

            # Standard deviation of each feature across the annotators
            std_deviations = np.std(np.asarray(nodule_characteristics), axis=0)

            # Reject the nodule when the annotators disagree too much on any feature
            if np.any(std_deviations > std_limit):
                print(f"Nodule ID {nodule_id}")
                continue

            # 50% consensus mask. Computed only for nodules that passed the
            # filter, since rejected nodules never use it.
            cmask, cbbox, masks = consensus(anns, clevel=0.5, pad=[(20, 20), (20, 20), (0, 0)])

            # Save the consensus mask so pyradiomics can use it later.
            # The full scan volume (anns[0].scan.to_volume()) is deliberately
            # not saved: there is no storage available for it.
            filename = os.path.join(path, f"{nodule_id}_cmask.npy")
            np.save(filename, cmask)

            # NOTE(review): the row uses the FIRST annotator's ratings rather
            # than an aggregate (mean/mode) of all annotators - confirm intent.
            first_rating = nodule_characteristics[0]
            feature_values = {
                'Nodule_id': nodule_id,
                'Patient_id': patient_id,
                'Subtlety': first_rating[0],
                'Internalstructure': first_rating[1],
                'Calcification': first_rating[2],
                'Sphericity': first_rating[3],
                'Margin': first_rating[4],
                'Lobulation': first_rating[5],
                'Spiculation': first_rating[6],
                'Texture': first_rating[7],
                'Malignancy': malignancy_names.get(first_rating[8])
            }

            # Append the feature values to the nodule_data dictionary
            for key, value in feature_values.items():
                nodule_data[key].append(value)

features = pd.DataFrame(nodule_data)
csv_filename = os.path.join(csv_directory, 'features_pylidc_1.0.csv')
features.to_csv(csv_filename, index=False)

LIDC-IDRI-0001
Nodule ID 1
LIDC-IDRI-0002
LIDC-IDRI-0003
Nodule ID 4
Nodule ID 5
Nodule ID 6
LIDC-IDRI-0004
Nodule ID 7
LIDC-IDRI-0005
Nodule ID 8
LIDC-IDRI-0006
Nodule ID 14
LIDC-IDRI-0007
Nodule ID 15
LIDC-IDRI-0008
Nodule ID 17
Nodule ID 18
LIDC-IDRI-0009
LIDC-IDRI-0010
Nodule ID 23
LIDC-IDRI-0011
Nodule ID 25
Nodule ID 29
Nodule ID 33
LIDC-IDRI-0012
Nodule ID 35
Nodule ID 37
Nodule ID 44
LIDC-IDRI-0013
Nodule ID 47
Nodule ID 48
LIDC-IDRI-0014
Nodule ID 49
LIDC-IDRI-0015
Nodule ID 50
LIDC-IDRI-0016
Nodule ID 54
Nodule ID 55
Nodule ID 56
LIDC-IDRI-0017
LIDC-IDRI-0018
Nodule ID 59
Nodule ID 61
LIDC-IDRI-0019
Nodule ID 62
LIDC-IDRI-0020
Nodule ID 64
LIDC-IDRI-0021
Nodule ID 67
LIDC-IDRI-0022
LIDC-IDRI-0023
Nodule ID 69
LIDC-IDRI-0024
Nodule ID 73
LIDC-IDRI-0025
LIDC-IDRI-0026
LIDC-IDRI-0027
Nodule ID 80
Nodule ID 81
LIDC-IDRI-0029
LIDC-IDRI-0030
Nodule ID 85
Nodule ID 86
LIDC-IDRI-0031
Nodule ID 91
LIDC-IDRI-0033
Nodule ID 92
LIDC-IDRI-0034
LIDC-IDRI-0035
Nodule ID 95
LIDC-IDRI-0036
LI

Nodule ID 707
LIDC-IDRI-0254
Nodule ID 710
LIDC-IDRI-0255
LIDC-IDRI-0256
Nodule ID 713
LIDC-IDRI-0257
LIDC-IDRI-0258
Nodule ID 716
Nodule ID 717
LIDC-IDRI-0259
Nodule ID 718
LIDC-IDRI-0260
Nodule ID 721
Nodule ID 722
LIDC-IDRI-0262
LIDC-IDRI-0263
Nodule ID 724
LIDC-IDRI-0264
Nodule ID 725
Nodule ID 726
LIDC-IDRI-0265
Nodule ID 727
LIDC-IDRI-0266
Nodule ID 728
LIDC-IDRI-0267
LIDC-IDRI-0268
LIDC-IDRI-0269
LIDC-IDRI-0270
Nodule ID 733
LIDC-IDRI-0271
Nodule ID 734
LIDC-IDRI-0272
LIDC-IDRI-0273
LIDC-IDRI-0274
Nodule ID 738
LIDC-IDRI-0275
LIDC-IDRI-0276
Nodule ID 740
LIDC-IDRI-0277
Nodule ID 742
LIDC-IDRI-0278
LIDC-IDRI-0280
LIDC-IDRI-0281
Nodule ID 746
LIDC-IDRI-0282
LIDC-IDRI-0283
LIDC-IDRI-0284
Nodule ID 750
Nodule ID 752
LIDC-IDRI-0285
Nodule ID 753
LIDC-IDRI-0286
Nodule ID 755
LIDC-IDRI-0287
LIDC-IDRI-0288
LIDC-IDRI-0289
Nodule ID 760
Nodule ID 761
LIDC-IDRI-0290
LIDC-IDRI-0291
LIDC-IDRI-0292
LIDC-IDRI-0294
Nodule ID 769
LIDC-IDRI-0296
LIDC-IDRI-0297
LIDC-IDRI-0298
Nodule ID 773
Nodule 

LIDC-IDRI-0543
Nodule ID 1441
LIDC-IDRI-0545
LIDC-IDRI-0546
Nodule ID 1444
LIDC-IDRI-0547
LIDC-IDRI-0549
Nodule ID 1448
Nodule ID 1449
Nodule ID 1452
LIDC-IDRI-0550
LIDC-IDRI-0551
LIDC-IDRI-0552
LIDC-IDRI-0553
LIDC-IDRI-0554
LIDC-IDRI-0555
Nodule ID 1460
LIDC-IDRI-0556
LIDC-IDRI-0557
LIDC-IDRI-0558
LIDC-IDRI-0559
LIDC-IDRI-0560
LIDC-IDRI-0562
Nodule ID 1470
Nodule ID 1471
LIDC-IDRI-0563
LIDC-IDRI-0565
Nodule ID 1475
Nodule ID 1476
Nodule ID 1477
Nodule ID 1478
Nodule ID 1479
LIDC-IDRI-0566
Nodule ID 1480
LIDC-IDRI-0567
LIDC-IDRI-0568
Nodule ID 1484
Nodule ID 1485
Nodule ID 1486
LIDC-IDRI-0569
Nodule ID 1487
Nodule ID 1488
LIDC-IDRI-0570
LIDC-IDRI-0571
LIDC-IDRI-0572
LIDC-IDRI-0574
LIDC-IDRI-0575
LIDC-IDRI-0576
Nodule ID 1500
Nodule ID 1501
LIDC-IDRI-0577
Nodule ID 1504
Nodule ID 1505
Nodule ID 1506
LIDC-IDRI-0578
Nodule ID 1508
LIDC-IDRI-0579
LIDC-IDRI-0580
LIDC-IDRI-0581
LIDC-IDRI-0582
Nodule ID 1514
Nodule ID 1517
Nodule ID 1519
LIDC-IDRI-0583
Nodule ID 1522
Nodule ID 1534
Nodule ID 

LIDC-IDRI-0855
Nodule ID 2257
Nodule ID 2258
Nodule ID 2259
Nodule ID 2261
Nodule ID 2262
Nodule ID 2265
Nodule ID 2266
Nodule ID 2268
Nodule ID 2270
Nodule ID 2272
LIDC-IDRI-0856
LIDC-IDRI-0857
Nodule ID 2274
LIDC-IDRI-0858
Nodule ID 2278
LIDC-IDRI-0859
Nodule ID 2289
LIDC-IDRI-0860
Nodule ID 2290
LIDC-IDRI-0861
Nodule ID 2293
LIDC-IDRI-0863
Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.
LIDC-IDRI-0864
LIDC-IDRI-0865
Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.
Nodule ID 2299
LIDC-IDRI-0866
Nodule ID 2300
Nodule ID 2302
Nodule ID 2308
LIDC-IDRI-0867
Nodule ID 2309
Nodule ID 2310
LIDC-IDRI-0868
LIDC-IDRI-0869
Nodule ID 2314
Nodule ID 2315
LIDC-IDRI-0870
Nodule ID 2318
Nodule ID 2319
Nodule ID 2320
Nodule ID 2321
Nodule ID 2326
Nodule ID 2327
LIDC-IDRI-0871
Nodule ID 2330
Nodule ID 2331
LIDC-IDRI-0872
LIDC-IDRI-0873
LIDC-IDRI-0874
Nodule ID 2340
Nodule ID 2342
LIDC-IDRI-

#### std_limit = 1.5

In [9]:
# Extract the radiologist-assigned nodule characteristics from the LIDC-IDRI
# annotations (via pylidc), keep only the nodules on which the annotators agree
# (per-feature standard deviation <= std_limit), save a 50%-consensus mask for
# each kept nodule and write one CSV row per kept nodule.

# Path where the images are stored (one sub-folder per patient)
input_directory = "/home/cmonteiro/Aulas/Laboratório de IA e CD/Projeto 1/Imagens/LIDC-IDRI"

# Path to the setup file from radiomics (not used by this cell)
params_file = "/home/cmonteiro/pyradiomics-master/pyradiomics-master/examples/exampleSettings/Params.yaml"

# Output directory for the consensus masks produced with this threshold
path = "/home/cmonteiro/Aulas/Laboratório de IA e CD/Projeto 1/std_limit_1.5"

# np.save does not create missing directories; make sure the target exists
# before spending time on the dataset.
os.makedirs(path, exist_ok=True)

# The CSV used to be written to `external_drive_path`, which this cell never
# defines. Keep honouring it when an earlier cell did define it, otherwise fall
# back to the mask directory instead of failing with NameError at the very end.
try:
    csv_directory = external_drive_path
except NameError:
    csv_directory = path

# Ordered list of all the patient sub-folders
patient_folders = sorted(os.listdir(input_directory))

# Running counter over every clustered nodule (kept or rejected), so the ids
# stay comparable between runs with different thresholds.
nodule_id = 0

# Column-oriented accumulator that becomes the output DataFrame
nodule_data = {
    'Nodule_id': [],
    'Patient_id': [],
    'Subtlety': [],
    'Internalstructure': [],
    'Calcification': [],
    'Sphericity': [],
    'Margin': [],
    'Lobulation': [],
    'Spiculation': [],
    'Texture': [],
    'Malignancy': []
}

# Human-readable labels for the 1-5 malignancy rating
malignancy_names = {
    1: "1-Highly Unlikely",
    2: "2-Moderately Unlikely",
    3: "3-Indeterminate",
    4: "4-Moderately Suspicious",
    5: "5-Highly Suspicious"
}

# Maximum standard deviation tolerated between annotators for any feature
std_limit = 1.5

for patient_folder in patient_folders:
    patient_folder_path = os.path.join(input_directory, patient_folder)

    # Id of the patient - 'LIDC-IDRI-xxxx'
    patient_id = os.path.basename(patient_folder_path)
    print(patient_id)

    # Only scans that have at least one annotation
    patient_scans = (
        pl.query(pl.Scan)
        .filter(pl.Scan.patient_id == patient_id, pl.Scan.annotations.any())
        .all()
    )

    for scan in patient_scans:
        # Each element groups the annotations believed to mark the same nodule
        nods = scan.cluster_annotations()

        for anns in nods:
            nodule_id += 1

            # One vector of ratings per annotation of this nodule.
            # NOTE(review): assumes feature_vals() yields the nine ratings in
            # the order used for the columns below - confirm against pylidc.
            nodule_characteristics = [ann.feature_vals() for ann in anns]

            # Without any ratings the nodule is not included
            if not nodule_characteristics:
                continue

            # Standard deviation of each feature across the annotators
            std_deviations = np.std(np.asarray(nodule_characteristics), axis=0)

            # Reject the nodule when the annotators disagree too much on any feature
            if np.any(std_deviations > std_limit):
                print(f"Nodule ID {nodule_id}")
                continue

            # 50% consensus mask. Computed only for nodules that passed the
            # filter, since rejected nodules never use it.
            cmask, cbbox, masks = consensus(anns, clevel=0.5, pad=[(20, 20), (20, 20), (0, 0)])

            # Save the consensus mask so pyradiomics can use it later.
            # The full scan volume (anns[0].scan.to_volume()) is deliberately
            # not saved: there is no storage available for it.
            filename = os.path.join(path, f"{nodule_id}_cmask.npy")
            np.save(filename, cmask)

            # NOTE(review): the row uses the FIRST annotator's ratings rather
            # than an aggregate (mean/mode) of all annotators - confirm intent.
            first_rating = nodule_characteristics[0]
            feature_values = {
                'Nodule_id': nodule_id,
                'Patient_id': patient_id,
                'Subtlety': first_rating[0],
                'Internalstructure': first_rating[1],
                'Calcification': first_rating[2],
                'Sphericity': first_rating[3],
                'Margin': first_rating[4],
                'Lobulation': first_rating[5],
                'Spiculation': first_rating[6],
                'Texture': first_rating[7],
                'Malignancy': malignancy_names.get(first_rating[8])
            }

            # Append the feature values to the nodule_data dictionary
            for key, value in feature_values.items():
                nodule_data[key].append(value)

features = pd.DataFrame(nodule_data)
csv_filename = os.path.join(csv_directory, 'features_pylidc_1.5.csv')
features.to_csv(csv_filename, index=False)

LIDC-IDRI-0001
LIDC-IDRI-0002
LIDC-IDRI-0003
LIDC-IDRI-0004
LIDC-IDRI-0005
LIDC-IDRI-0006
LIDC-IDRI-0007
LIDC-IDRI-0008
Nodule ID 17
LIDC-IDRI-0009
LIDC-IDRI-0010
LIDC-IDRI-0011
LIDC-IDRI-0012
LIDC-IDRI-0013
LIDC-IDRI-0014
LIDC-IDRI-0015
LIDC-IDRI-0016
Nodule ID 54
Nodule ID 55
LIDC-IDRI-0017
LIDC-IDRI-0018
Nodule ID 61
LIDC-IDRI-0019
LIDC-IDRI-0020
LIDC-IDRI-0021
LIDC-IDRI-0022
LIDC-IDRI-0023
LIDC-IDRI-0024
LIDC-IDRI-0025
LIDC-IDRI-0026
LIDC-IDRI-0027
Nodule ID 81
LIDC-IDRI-0029
LIDC-IDRI-0030
LIDC-IDRI-0031
LIDC-IDRI-0033
LIDC-IDRI-0034
LIDC-IDRI-0035
LIDC-IDRI-0036
LIDC-IDRI-0037
LIDC-IDRI-0038
LIDC-IDRI-0039
Nodule ID 104
LIDC-IDRI-0040
LIDC-IDRI-0041
LIDC-IDRI-0042
LIDC-IDRI-0043
LIDC-IDRI-0044
LIDC-IDRI-0045
Nodule ID 132
LIDC-IDRI-0046
Nodule ID 144
Nodule ID 145
LIDC-IDRI-0047
LIDC-IDRI-0048
Nodule ID 150
Nodule ID 152
LIDC-IDRI-0049
Nodule ID 154
Nodule ID 155
Nodule ID 164
LIDC-IDRI-0050
Nodule ID 168
LIDC-IDRI-0051
LIDC-IDRI-0052
LIDC-IDRI-0053
LIDC-IDRI-0054
LIDC-IDRI-0055


Nodule ID 1162
LIDC-IDRI-0444
Nodule ID 1164
LIDC-IDRI-0445
LIDC-IDRI-0447
LIDC-IDRI-0448
LIDC-IDRI-0449
LIDC-IDRI-0450
LIDC-IDRI-0451
LIDC-IDRI-0452
Nodule ID 1190
LIDC-IDRI-0453
LIDC-IDRI-0454
LIDC-IDRI-0456
Nodule ID 1200
LIDC-IDRI-0457
LIDC-IDRI-0458
LIDC-IDRI-0459
LIDC-IDRI-0460
LIDC-IDRI-0461
LIDC-IDRI-0462
LIDC-IDRI-0463
LIDC-IDRI-0464
LIDC-IDRI-0466
Nodule ID 1232
LIDC-IDRI-0467
LIDC-IDRI-0468
LIDC-IDRI-0469
Nodule ID 1237
LIDC-IDRI-0470
LIDC-IDRI-0471
LIDC-IDRI-0473
Nodule ID 1249
LIDC-IDRI-0474
LIDC-IDRI-0475
Nodule ID 1257
LIDC-IDRI-0476
LIDC-IDRI-0477
LIDC-IDRI-0478
LIDC-IDRI-0479
Nodule ID 1271
LIDC-IDRI-0480
LIDC-IDRI-0481
LIDC-IDRI-0483
LIDC-IDRI-0484
LIDC-IDRI-0485
LIDC-IDRI-0486
LIDC-IDRI-0487
LIDC-IDRI-0488
LIDC-IDRI-0489
Nodule ID 1306
LIDC-IDRI-0490
LIDC-IDRI-0491
Nodule ID 1323
LIDC-IDRI-0492
LIDC-IDRI-0493
LIDC-IDRI-0494
LIDC-IDRI-0495
LIDC-IDRI-0496
Nodule ID 1336
LIDC-IDRI-0497
LIDC-IDRI-0498
LIDC-IDRI-0499
LIDC-IDRI-0500
LIDC-IDRI-0501
LIDC-IDRI-0502
LIDC-IDRI-

LIDC-IDRI-0943
LIDC-IDRI-0944
Nodule ID 2486
LIDC-IDRI-0945
LIDC-IDRI-0946
LIDC-IDRI-0947
LIDC-IDRI-0949
Nodule ID 2496
LIDC-IDRI-0950
LIDC-IDRI-0951
LIDC-IDRI-0953
Nodule ID 2506
LIDC-IDRI-0955
LIDC-IDRI-0956
Nodule ID 2509
Nodule ID 2510
LIDC-IDRI-0957
LIDC-IDRI-0958
LIDC-IDRI-0959
LIDC-IDRI-0961
Nodule ID 2517
Nodule ID 2523
LIDC-IDRI-0962
LIDC-IDRI-0963
LIDC-IDRI-0965
LIDC-IDRI-0966
LIDC-IDRI-0968
Nodule ID 2533
LIDC-IDRI-0969
Nodule ID 2536
LIDC-IDRI-0971
LIDC-IDRI-0972
LIDC-IDRI-0973
LIDC-IDRI-0974
LIDC-IDRI-0976
LIDC-IDRI-0977
LIDC-IDRI-0978
LIDC-IDRI-0980
Nodule ID 2556
LIDC-IDRI-0981
LIDC-IDRI-0982
LIDC-IDRI-0983
LIDC-IDRI-0984
Nodule ID 2564
LIDC-IDRI-0985
LIDC-IDRI-0986
LIDC-IDRI-0987
Nodule ID 2571
LIDC-IDRI-0989
LIDC-IDRI-0990
LIDC-IDRI-0991
LIDC-IDRI-0993
LIDC-IDRI-0994
Nodule ID 2581
Nodule ID 2582
LIDC-IDRI-0996
LIDC-IDRI-0997
LIDC-IDRI-0998
Nodule ID 2597
LIDC-IDRI-0999
LIDC-IDRI-1000
Nodule ID 2606
Nodule ID 2608
Nodule ID 2609
LIDC-IDRI-1001
LIDC-IDRI-1002
LIDC-IDRI-