# Analyze FMCIB features for original and READII negative control CT images

This notebook is set up to use outputs from the `run_fmcib.ipynb` notebook.

Image features extracted from CT images cropped to a Gross Tumour Volume (GTV) undergo correlation analysis. Results are compared across READII negative control image types.

## Set up pixi environment kernel

1. Run the following commands in the terminal:

    ```bash
    $ pixi install

    $ pixi run make_kernel
    ```

2. In the `Select Kernel` menu at the top right of the notebook, select `Jupyter Kernel` as the source. 

3. Refresh the options and one called `readii-fmcib` should appear. Select this option.

## Imports

In [1]:
import itertools
from pandas import DataFrame

from pathlib import Path
from readii.analyze import getFeatureCorrelations, getHorizontalSelfCorrelations
from readii.data.label import setPatientIdAsIndex
from readii.io.loaders import loadImageDatasetConfig, loadFeatureFilesFromImageTypes
from readii.io.writers import base_writer

## Initialize dataset name and load config 

In [2]:
config = loadImageDatasetConfig("RADCURE", Path("config"))

DATASET_NAME = config["dataset_name"]
PAT_ID_PATTERN = config['patient_id_pattern']
NEG_CONTROL_REGIONS = config["negative_control_regions"]
NEG_CONTROL_TYPES = config["negative_control_types"]

# Get full list of image types to run FMCIB on
negative_control_list = [f"{negative_control[0]}_{negative_control[1]}" for negative_control in itertools.product(NEG_CONTROL_TYPES, NEG_CONTROL_REGIONS)]

## Set up data directories

In [3]:
correlations_dir = Path("results", DATASET_NAME, "analysis", "correlations")

# Set features input directory
features_dir = Path("results", DATASET_NAME, "fmcib_features")

# Make correlation results output directory
for combo in itertools.product([correlations_dir], ["matrix", "heatmap", "histogram"]):
    Path(*combo).mkdir(parents=True, exist_ok=True)

## Load all extracted feature sets

In [4]:
# Load the extracted feature data
# This makes a dictionary of feature sets, with the image type as the key
extracted_feature_sets = loadFeatureFilesFromImageTypes(extracted_feature_dir=features_dir,
                                                        image_types = (["original"] + negative_control_list), 
                                                        drop_labels = False)

# Run correlation analysis for each image type

In [5]:
def prepPatientIndex(feature_df:DataFrame, file_path_column:str, pat_id_pattern:str) -> DataFrame:
    """Extract patient ID from a DataFrame column of file paths based on a provided regex pattern."""
    # Get patient ID from file path name and make a column for this
    feature_df['patient_ID'] = feature_df[file_path_column].str.findall(pat_id_pattern)
    
    # Set the patient ID column as the index for the dataframe
    feature_df = setPatientIdAsIndex(feature_df, 'patient_ID')

    # Remove the image_path column
    feature_df.drop(labels="image_path", axis=1, inplace=True)

    return feature_df

In [11]:
# Flag to make the original self-correlation plots only once
make_original_plots = True

# Name of the column used to extract the patient ID for a row of features
file_path_column = 'image_path'

# Correlation method to apply
correlation_method = "pearson"

# Get and set up the feature dataframe for the original features once
vertical_feature_type = "original"
vertical_features_df = prepPatientIndex(extracted_feature_sets[vertical_feature_type],
                                        file_path_column,
                                        PAT_ID_PATTERN)


# Iterate over each negative control feature set and perform correlation analysis
for horizontal_feature_type in ["shuffled_roi"]:
    # Get extracted features for this image type, extract set the patient ID as the dataframe index, remove image_path column
    horizontal_features_df = prepPatientIndex(extracted_feature_sets[horizontal_feature_type], 
                                              file_path_column = file_path_column, 
                                              pat_id_pattern = PAT_ID_PATTERN)
    
    # Calculate correlations between original image features and image type features
    feature_correlation_df = getFeatureCorrelations(vertical_features=vertical_features_df,
                                                        horizontal_features=horizontal_features_df,
                                                        vertical_feature_name=vertical_feature_type,
                                                        horizontal_feature_name=horizontal_feature_type,
                                                        method = correlation_method)
    
    # save out the correlation dataframe
    
    


In [9]:
horizontal_features_df.head()

Unnamed: 0_level_0,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9,...,pred_4086,pred_4087,pred_4088,pred_4089,pred_4090,pred_4091,pred_4092,pred_4093,pred_4094,pred_4095
patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
[RADCURE-0300],2.721314,1.364399,1.997742,0.650702,1.195705,0.716645,1.324016,0.112905,1.206633,0.63562,...,0.56184,0.474169,0.947855,2.580508,0.348267,1.326199,0.134461,0.519743,1.144534,1.126188
[RADCURE-0315],1.34199,0.840205,2.217888,0.287051,0.051961,0.86621,1.714801,0.891389,1.025357,1.115861,...,0.952476,0.483258,0.726106,1.557243,0.50195,1.203163,0.770946,0.206626,1.505077,1.221517
[RADCURE-0323],2.165563,1.064253,2.534633,0.748814,0.322149,1.7691,1.015384,0.909927,0.384469,2.083093,...,0.996737,0.403293,0.329809,3.033765,0.173414,1.839332,0.030697,0.58956,1.437691,1.932057
[RADCURE-0327],2.094261,1.532087,0.89496,0.279018,0.426449,1.55125,1.034708,1.366443,0.750966,1.704991,...,0.345646,0.562048,0.400464,2.186393,0.682144,0.669017,0.478197,1.178036,1.414909,1.386635
[RADCURE-0342],3.785098,1.516655,1.338776,0.429899,1.140034,1.05449,0.951989,0.315031,1.293111,0.764472,...,0.468186,0.968482,1.295323,2.815835,0.694796,1.459,0.710509,0.957506,1.573271,1.068126
