# MR-Longitudinal Radiomics
### Radiomics pipeline created for longitudinal images collected at subsequent fractions of treatment.
##### Full model: Feature Extraction, Feature Reduction via volume correlation & test-retest stability, Feature Selection via Euclidean distance between feature pair trajectories and hierarchical clustering.
##### Compares the results of the longitudinal model with a standard delta-radiomics approach to illustrate the importance of accounting for the full feature trajectory over treatment.

###
#### Below specify certain variables and options for customising the notebook

In [1]:
import os
import pandas as pd
from tqdm import tqdm

# Specify the output path
# specify the tag to use - could be anything, helps to identify the output if running multiple models
# default is "Test"
tag = "HM-FSTP"
cwd = os.getcwd()

# All results of this run are written below <cwd>/Output/<tag>/.
output_path = cwd + "/Output/" + tag + "/"

# makedirs(exist_ok=True) creates any missing parent folders and is a no-op for
# folders that already exist, so a partially created output tree (e.g. the tag
# folder is present but "Plots" is missing) is completed instead of skipped.
for subfolder in ("Features", "Extraction", "Plots"):
    os.makedirs(output_path + subfolder, exist_ok=True)

# Set to True to also compare the results to a delta-radiomics model.
# Default is False.
# NOTE(review): this flag is not read by any cell visible in this notebook - confirm where it is used.
Delta_Model = False

# Set to True to visualise the results in plots.
# Default is False. The reduction cells below pass plot=False explicitly,
# so plotting can be switched on per stage there.
plot = False

# Set to True to extract features from the images (done in the Feature Extraction cell below).
# If features are already extracted, set to False; the combined feature file is then
# read from the Features folder of the output path instead.
# NOTE: currently enabled (True), although the documented default is False.
extract = True


## Feature Extraction
#### If you want to extract features, provide a csv containing the following:
####               | PatID | Fraction | Contour | ContourType | ImagePath | MaskPath |
#### Specify the root of the csv in the Input dir.
#### Calculates features based on the parameter file specified. Default setting is currently set at PyRadiomics base extraction parameters - Fixed bin size (FBS) of 25, no resampling, no normalisation, 107 features (IBSI compliant) and no wavelet/laplacian filters applied. 
#### Features are then calculated and saved in the Extraction folder of the Output dir - one CSV file per patient. Columns will be:
#### PatID | Fraction | Contour | ContourType | Feature | FeatureValue |

In [2]:
from Functions import Extraction as FE

if extract:
    # Key table with one row per image/mask pair to extract from.
    # Columns read below: PatID, Fraction, Contour, ContourType, ImagePath, MaskPath.
    key_extraction = pd.read_csv(cwd + "/Input/Default/PepKey_Man.csv")

    extraction_path = output_path + "/Extraction/"
    os.makedirs(extraction_path, exist_ok=True)

    # Parameter file handed to the feature extractor for every image.
    params_extractor = cwd + "/Input/Default/Default_ExtractionParams.yaml"

    # Loop over all patients
    print("Extracting features for patients...")
    for pat in key_extraction["PatID"].unique():
        print("Processing patient: " + str(pat) + "...")

        # Get the patient's key
        key_pat = key_extraction[key_extraction["PatID"] == pat]
        if key_pat.empty:
            # Happens for a missing PatID (NaN never compares equal to itself).
            print("No key rows found for patient " + str(pat) + ", skipping.")
            continue

        # The output file is named after the ContourType of the patient's last key row
        # (same naming as before). The "already extracted" check now uses that same
        # name, so it also works for contour types other than "Manual".
        ContourType = key_pat["ContourType"].iloc[-1]
        pat_file = extraction_path + ContourType + "_" + str(pat) + "_" + tag + ".csv"
        if os.path.exists(pat_file):
            print("Features already extracted for patient " + str(pat) + ", skipping.")
            continue

        # Collect the per-row results and concatenate once; DataFrame.append was
        # removed in pandas 2.0 and re-copies the whole frame on every call.
        # NOTE(review): assumes FE.ExtractFeatures returns a DataFrame - confirm.
        frames = []
        for _, row in key_pat.iterrows():
            Features = FE.ExtractFeatures(
                row["PatID"],
                row["Fraction"],
                row["Contour"],
                row["ContourType"],
                row["ImagePath"],
                row["MaskPath"],
                params_extractor,
            )
            frames.append(Features)

        Features_pat = pd.concat(frames)
        Features_pat.to_csv(pat_file, index=False)


#### Once all features have been extracted, combine in to one dataframe
#### Or
#### Specify the path of the feature values. 
##### Features are currently read from a CSV file with pd.read_csv - switch to pd.read_parquet if reading in a parquet file (smaller file sizes - so quicker), and make sure to change the path if needed.

In [3]:
if extract:
    # Combine the per-patient extraction files into one table.
    # Only .csv entries are read, in sorted order, so stray files or folders in the
    # directory do not break the cell and the row order is reproducible.
    extraction_dir = output_path + "/Extraction/"
    frames = [
        pd.read_csv(extraction_dir + file)
        for file in sorted(os.listdir(extraction_dir))
        if file.endswith(".csv")
    ]
    # pd.concat replaces DataFrame.append (removed in pandas 2.0); it raises on an
    # empty list, so fall back to an empty table as the original loop did.
    df_all = pd.concat(frames) if frames else pd.DataFrame()

    # Save the combined features to CSV. The index is written as well, which is why
    # the file shows an "Unnamed: 0" column when it is read back in.
    os.makedirs(output_path + "/Features/", exist_ok=True)
    df_all.to_csv(output_path + "/Features/Features_" + tag + ".csv")

else:
    # Features were extracted in an earlier run: load the combined table.
    df_all = pd.read_csv(output_path + "/Features/Features_" + tag + ".csv")

In [4]:
# "Unnamed: 0" is the index column written out by to_csv, so it is present when
# df_all was re-loaded from the saved file but may be absent when df_all was just
# built in memory; errors="ignore" makes the cell work in both cases.
df_all = df_all.drop(columns=["Unnamed: 0"], errors="ignore")
# Exclude the first-order minimum and maximum features from all further steps.
df_all = df_all[~df_all["Feature"].isin(["firstorder_Minimum", "firstorder_Maximum"])]
df_all.head()

Unnamed: 0,PatID,Fraction,Contour,ContourType,Feature,FeatureValue
0,1642,1,RP,Manual,shape_Elongation,0.839757
1,1642,1,RP,Manual,shape_Flatness,0.679305
2,1642,1,RP,Manual,shape_LeastAxisLength,23.488558
3,1642,1,RP,Manual,shape_MajorAxisLength,34.57734
4,1642,1,RP,Manual,shape_Maximum2DDiameterColumn,40.050273


In [5]:
# Split the feature table by contour type: manual contours vs. automatic ones.
is_manual = df_all["ContourType"] == "Manual"
is_auto = df_all["ContourType"] == "Auto"
df_man = df_all[is_manual]
df_limbus = df_all[is_auto]

# Feature Reduction
#### Due to the high dimensionality of radiomics values, it is vital that some of the features are removed if they offer no unique information. 
#### Since features are calculated by applying different formulas to images, many of these formulas are similar and so some features can be quite similar. We aim to remove all redundant features - redundant features in this model are those that are strongly correlated to volume (Spearman rank coefficient rho > 0.6) or unstable due to contour differences (as measured by an ICC value < 0.5).

In [6]:
from Functions import Reduction as FR

## Volume Correlation
#### Previous studies have shown that radiomic feature values have a strong correlation with the volume of the mask



In [7]:
# Preview the first rows of the manual-contour feature table.
df_man.head()

Unnamed: 0,PatID,Fraction,Contour,ContourType,Feature,FeatureValue
0,1642,1,RP,Manual,shape_Elongation,0.839757
1,1642,1,RP,Manual,shape_Flatness,0.679305
2,1642,1,RP,Manual,shape_LeastAxisLength,23.488558
3,1642,1,RP,Manual,shape_MajorAxisLength,34.57734
4,1642,1,RP,Manual,shape_Maximum2DDiameterColumn,40.050273


In [8]:
# Preview the first rows of the automatic-contour ("Auto") feature table.
df_limbus.head()

Unnamed: 0,PatID,Fraction,Contour,ContourType,Feature,FeatureValue
535,1029,1,Limbus,Auto,shape_Elongation,0.941881
536,1029,1,Limbus,Auto,shape_Flatness,0.650781
537,1029,1,Limbus,Auto,shape_LeastAxisLength,17.42983
538,1029,1,Limbus,Auto,shape_MajorAxisLength,26.782922
539,1029,1,Limbus,Auto,shape_Maximum2DDiameterColumn,31.032698


In [9]:
# Run the volume-correlation step separately for each contour type.
# Plotting is switched off for both calls through the explicit plot=False.
# NOTE(review): both calls receive the same output_path - confirm that the second
# call does not overwrite results written by the first.
print("Manual")
FR.Volume(df_man, output_path, plot=False)
print("Limbus")
FR.Volume(df_limbus, output_path, plot=False)

Manual
------------------------------
Volume Correlation
Correlating features to volume...


100%|██████████| 105/105 [00:00<00:00, 485.15it/s]
100%|██████████| 105/105 [00:00<00:00, 545.23it/s]
100%|██████████| 105/105 [00:00<00:00, 562.02it/s]
100%|██████████| 105/105 [00:00<00:00, 598.62it/s]
100%|██████████| 105/105 [00:00<00:00, 564.89it/s]


Volume redundant features: 22/105
------------------------------
Limbus
------------------------------
Volume Correlation
Correlating features to volume...


100%|██████████| 105/105 [00:00<00:00, 547.34it/s]
100%|██████████| 105/105 [00:00<00:00, 605.32it/s]
100%|██████████| 105/105 [00:00<00:00, 516.00it/s]
100%|██████████| 105/105 [00:00<00:00, 565.82it/s]
100%|██████████| 105/105 [00:00<00:00, 525.66it/s]

Volume redundant features: 26/105
------------------------------





## ICC Stability
#### Intra-class correlation coefficient is used as a statistical measure of how much two observed quantities within a group tend to agree with each other. 
#### It has been used widely within radiomics studies as a test-retest stability measure between two delineations.



In [10]:
# Run the ICC stability step on the full feature table (manual and automatic
# contours together), with plotting switched off.
FR.ICC(df_all, output_path, plot=False)


------------------------------
Stability Test
Calculating ICC...


100%|██████████| 105/105 [00:04<00:00, 22.13it/s]
100%|██████████| 105/105 [00:05<00:00, 17.74it/s]
100%|██████████| 105/105 [00:04<00:00, 22.39it/s]
100%|██████████| 105/105 [00:05<00:00, 18.46it/s]
100%|██████████| 105/105 [00:05<00:00, 17.78it/s]


ICC redudant features: 16/105
------------------------------


#### Remove redundant features

#### Still need to do further feature reduction

In [None]:
# Replace df_all with the table returned by FR.RemoveFts.
# NOTE(review): df_man and df_limbus were sliced from df_all before this call, so
# they are not updated by this reassignment, and the clustering cells below are
# passed df_man - confirm whether the Cl functions apply the reduction themselves
# (e.g. from files under output_path) or whether df_man must be re-derived here.
df_all = FR.RemoveFts(df_all, output_path)

------------------------------
Removing redundant features...
Number of features removed: 33
Number of features remaining: 72
------------------------------


## Clustering I - Distance between Feature Trajectories
#### Calculate the Euclidean distance between feature pairs.
#### Distance values can then be used to visualise the relationship between features.
#### Can also be used to group features together.

In [13]:
from Functions import Clustering as Cl
# Run the distance step on the manual-contour features.
# NOTE(review): the third positional argument (True) is not named here - presumably
# a plot/output flag; confirm against the signature of Cl.DistanceMatrix.
Cl.DistanceMatrix(df_man, output_path, True)

Calculating Euclidean distance between feature pair trajectories...


100%|██████████| 20/20 [01:39<00:00,  5.00s/it]


## Clustering II - Grouping Features
#### Hierarchical clustering using SciPy
####   - Weighted linkage (Refers to the algorithm by which clusters are formed)
####   - Starting T-val = 2 (Refers to the threshold value for which to go to a different cluster, i.e. how far away a value is to a cluster before a new cluster is created/put in another cluster.)
##### Clusters with < 3 features discarded as deemed too unstable.
##### Clusters with > 10 features re-clustered to subclusters. 


In [15]:
# Cluster the manual-contour features.
# NOTE(review): the literal 2 is presumably the starting t-value described in the
# text above - confirm against the signature of Cl.ClusterFeatures.
Cl.ClusterFeatures(df_man, output_path, 2)

------------------------------
Clustering Feature Trajectories...


100%|██████████| 20/20 [00:01<00:00, 11.80it/s]

------------------------------





## Clustering III - Feature Selection
#### Cross-correlation value between feature trajectories within each cluster performed to determine most “representative” feature
####    - Highest mean cross-correlation passed through.
####    - Top 20% of features.
#### Each patient passes through a set of features.
#### Features then tallied up and the top 10 ranked features are selected.


In [16]:
# Run the final feature-selection step on the manual-contour features.
Cl.FeatureSelection(df_man, output_path)

------------------------------
Feature Selection
Calculating Cross-Correlation values...


100%|██████████| 20/20 [00:02<00:00,  6.96it/s]

------------------------------
Selected Features: 
glrlm_GrayLevelVariance
gldm_LargeDependenceEmphasis
glrlm_RunLengthNonUniformityNormalized
firstorder_Uniformity
firstorder_Mean
firstorder_RootMeanSquared
glcm_Idn
glcm_JointEntropy
glcm_Correlation
shape_MeshVolume
glcm_DifferenceEntropy
shape_MinorAxisLength
glcm_SumSquares
glcm_JointEnergy
gldm_LowGrayLevelEmphasis
------------------------------
Number of Selected Features: 15
------------------------------





In [28]:
# Overwrite df_all with the feature table saved by a different run (the "Long-Test" output).
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell fails
# on any other machine; it also discards the reduced df_all produced above - confirm
# this is intended before re-running the notebook top to bottom.
df_all = pd.read_csv("C:\\Users\\b01297ar\\Documents\\ProstateMRL-local\\ProstateMRL-Radiomics\\Paper1-Release\\Output\\Long-Test\\Features\\Features_Long-Test.csv")

In [31]:
def _delta_features(df, first_fraction=1, last_fraction=5):
    """Return the change in FeatureValue between two fractions.

    For every (PatID, ContourType, Feature) combination in ``df`` the value at
    ``first_fraction`` is subtracted from the value at ``last_fraction``. The
    result keeps the rows of ``last_fraction`` with ``FeatureValue`` replaced
    by that difference.

    Combinations that lack either fraction are skipped and reported, instead
    of raising an IndexError part-way through the loop.

    Parameters:
        df: long-format feature table with the columns PatID, ContourType,
            Feature, Fraction and FeatureValue.
        first_fraction: fraction used as the baseline (default 1).
        last_fraction: fraction compared against the baseline (default 5).

    Returns:
        DataFrame with the same columns as ``df``.
    """
    frames = []
    skipped = []
    # sort=False keeps the order of first appearance, matching a loop over unique().
    for pat, df_pat in df.groupby("PatID", sort=False):
        for contour, df_contour in df_pat.groupby("ContourType", sort=False):
            # Group within the contour subset, so only features that actually
            # exist for this contour type are visited.
            for ft, df_ft in df_contour.groupby("Feature", sort=False):
                first_rows = df_ft[df_ft["Fraction"] == first_fraction]
                last_rows = df_ft[df_ft["Fraction"] == last_fraction]
                if first_rows.empty or last_rows.empty:
                    skipped.append((pat, contour, ft))
                    continue

                delta = last_rows["FeatureValue"].values[0] - first_rows["FeatureValue"].values[0]

                df_temp = last_rows.copy()
                df_temp["FeatureValue"] = delta
                frames.append(df_temp)

    if skipped:
        print("Skipped " + str(len(skipped)) + " patient/contour/feature combinations without fraction "
              + str(first_fraction) + " or " + str(last_fraction))

    # One concat at the end replaces DataFrame.append inside the loop, which was
    # removed in pandas 2.0 and re-copied the growing frame on every iteration.
    if not frames:
        return pd.DataFrame(columns=df.columns)
    return pd.concat(frames)


df_delta = _delta_features(df_all, first_fraction=1, last_fraction=5)

print(df_delta.shape)
print(df_all.shape)

# NOTE(review): absolute, machine-specific output path - adjust before running elsewhere.
df_delta.to_csv("C:/Users/b01297ar/Documents/ProstateMRL-local/ProstateMRL-Radiomics/P1-TempData/Features_Delta.csv")

(4280, 7)
(21400, 7)


In [None]:
# Load the selected-feature tables of the longitudinal run and of the delta run.
# NOTE(review): the delta path is absolute and machine-specific.
longitudinal_table = pd.read_csv(output_path + "/Features/Features_Selected.csv")
delta_table = pd.read_csv("/home/arn/Radiomics/Paper1-Release/Output/Delta-Test/Features/Features_Selected.csv")

# Keep only the feature names of each table.
L_fts = longitudinal_table["Feature"].values
D_fts = delta_table["Feature"].values

# get the features that are in both (order follows the longitudinal selection)
fts = []
for name in L_fts:
    if name in D_fts:
        fts.append(name)

print(f"Number of features in both: {len(fts)}")
for ft in fts:
    print(ft)

Number of features in both: 1
glrlm_GrayLevelVariance
