In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

%matplotlib inline

data_root = "E:/Aaron/ProstateMRL/Data/Paper2/Extraction/"
out_root = "E:/Aaron/ProstateMRL/Data/Paper2/Features/"

In [2]:
df_key = pd.DataFrame()

treatment_root = "E:/prostateMR_radiomics/patientData/"

for treatment in os.listdir(treatment_root):
    temp = {}
    temp['Folder'] = treatment
    temp['Treatment'] = treatment.split("_")[0]

    for pat in os.listdir(treatment_root + treatment):
        temp["PatID"] = pat
        df_key = df_key.append(temp, ignore_index=True)

df_key.to_csv(out_root + "/df_treatmentkey.csv", index=False)

In [3]:
df_all = pd.DataFrame()

folders = os.listdir(data_root)

for folder in folders:
    files = os.listdir(data_root + folder)

    for file in files:
        df = pd.read_parquet(data_root + folder + '/' + file)
        df["Region"] = folder
        df["PatID"] = file.split("_")[0]
        
        # pivot wide to long
        df.rename(columns={"Unnamed: 0": "Feature"}, inplace=True)
        # drop rows with diagnotix features
        df = df[~df["Feature"].str.contains("diagnostic")]  
        dates = df.loc[0,:][1:6].values
        df = df.iloc[1:,:]

        # pivot wide to long
        df = df.melt(id_vars=["PatID", "Region", "Feature"])
        df.rename(columns={"variable": "Fraction"}, inplace=True)
        df["Fraction"] = df["Fraction"].str.replace(folder + "_", "").astype(int)
        df["Date"] = df["Fraction"].map(dict(zip(range(1,6), dates)))
        
        df["Treatment"] = df["PatID"].map(dict(zip(df_key["PatID"], df_key["Treatment"])))
        df["Folder"] = df["PatID"].map(dict(zip(df_key["PatID"], df_key["Folder"])))

        # reorder columns
        df = df[["PatID", "Treatment", "Folder", "Region", "Fraction", "Date","Feature", "value"]]

        df_all = df_all.append(df)

In [4]:
df_all["Feature"] = df_all["Feature"].str.replace("original_", "")
df_all["FeatureGroup"] = df_all["Feature"].str.split("_", expand=True)[0]
df_all.rename(columns={"value": "FeatureValue"}, inplace=True)
df_all["FeatureValue"] = df_all["FeatureValue"].astype(float)


df_all.to_parquet(out_root + "/df_all.parquet")

df_SABR = df_all[df_all["Treatment"] == "SABR"]
df_SABR.to_parquet(out_root + "/df_SABR.parquet")

In [5]:
df_SABR = pd.read_parquet(out_root + "/df_SABR.parquet")

PatIDs = df_SABR["PatID"].unique()  

# shape features will not change over time since they are extracted from the same mask
df_vol = df_SABR[df_SABR["Feature"] == "shape_MeshVolume"]
df_vol = df_vol[["PatID", "Region", "FeatureValue"]]
df_vol.rename(columns={"FeatureValue": "Volume"}, inplace=True)
df_vol.drop_duplicates(inplace=True)
df_vol["Volume"] = df_vol["Volume"].astype(float)
df_vol.to_parquet(out_root + "/df_vol.parquet")

In [6]:
df_SABR = df_SABR[df_SABR["Region"].isin(["dipl", "normProstate"]) ]
df_SABR = df_SABR[["PatID", "Fraction", "Region", "Feature", "FeatureValue"]]

# split into dipl and normProstate
df_dipl = df_SABR[df_SABR["Region"] == "dipl"]
df_normProstate = df_SABR[df_SABR["Region"] == "normProstate"]

In [None]:
from matplotlib.lines import Line2D

# plot random features over time
features = df_SABR["Feature"].unique()
features = np.random.choice(features, 10, replace=False)

for feature in features:
    df_ft = df_SABR[df_SABR["Feature"] == feature]

    # subplots
    # make subplot for each patient
    sns.set_style("darkgrid")
    sns.set_context("paper", font_scale=1.5)
    fig, axs = plt.subplots(4, 5, figsize=(15, 15), sharey=False)
    fig.suptitle(feature, fontsize=20)
    # chnage stye of plots
    
    for i, pat in enumerate(PatIDs):
        df_pat = df_ft[df_ft["PatID"] == pat]
        df_dipl = df_pat[df_pat["Region"] == "dipl"]
        df_norm = df_pat[df_pat["Region"] == "normProstate"]

        df_pat = df_pat.sort_values(by="Fraction")
        axs[i//5, i%5].plot(df_dipl["Fraction"], df_dipl["FeatureValue"], marker="o", label="dipl", color="blue")
        axs[i//5, i%5].plot(df_norm["Fraction"], df_norm["FeatureValue"], marker="o", label="normProstate", color="orange")

        axs[i//5, i%5].set_title(pat)
        axs[i//5, i%5].set_xticks(range(1,6))
        #axs[i//5, i%5].set_xticklabels(df_pat["Date"].values, rotation=45)
        #axs[i//5, i%5].set_ylim([0, df_pat["FeatureValue"].max() * 1.1])
        axs[i//5, i%5].set_xlim([0.5, 5.5])
        axs[i//5, i%5].set_xlabel("Fraction")
        axs[i//5, i%5].set_ylabel("Feature Value")

    # remove empty subplots
    for i in range(len(PatIDs), 20):
        axs[i//5, i%5].axis('off')
    
    # add legend in to empty subplot (bottom right)
    # Create a legend for the first line.
    legend_elements = [Line2D([0], [0], marker='o', color='w', label='dipl', markerfacecolor='blue', markersize=10),
                          Line2D([0], [0], marker='o', color='w', label='normProstate', markerfacecolor='orange', markersize=10)]
    
    
    axs[3, 4].legend(handles=legend_elements, loc='center', fontsize=20)
    

    plt.tight_layout()
    plt.subplots_adjust(top=0.9)

    
    plt.show()


In [8]:
from tqdm import tqdm
from scipy import stats

def Volume(df, out_dir, tag, output=False):
    '''
    Remove features that correlate with volume
    df: dataframe with all feature values across treatment for one region
    out_dir: output
    tag: tag for output to denote any changes
    output: print output
    '''
    # mask is constant so get volume for Fraction 1
    df_vol = df[df["Fraction"] == 1]
    df_vol = df_vol[df_vol["Feature"] == "shape_MeshVolume"]
    vals_vol = df_vol["FeatureValue"].values

    fractions = df["Fraction"].unique()
    features = df["Feature"].unique()

    df_res = pd.DataFrame()

    print("Tag: " + tag)
    print("Correlating features to volume...")
    for fr in fractions:
        
        for ft in tqdm(features):
            # get feature values for each fraction
            df_ft = df[df["Fraction"] == fr]
            df_ft = df_ft[df_ft["Feature"] == ft]
            vals_ft = df_ft["FeatureValue"].values
            # if vals are all the same, skip
            if len(np.unique(vals_ft)) == 1:
               rho = 1 

            else:
                # get spearman correlation
                rho = stats.spearmanr(vals_vol, vals_ft)[0]

            df_temp = pd.DataFrame({"Fraction": [fr], "Feature": [ft], "rho": [rho]})
            df_res = df_res.append(df_temp)

    # calculate mean rho for each feature
    df_mean = df_res.groupby("Feature").mean().reset_index()

    # remove features
    fts_remove = df_mean[abs(df_mean["rho"]) > 0.6]["Feature"].values

    if output == True:
        print("\nVolume redundant features: " + str(len(fts_remove)) + "/" + str(len(features)) )
        print("Remaining features: " + str(len(df["Feature"].unique()) - len(fts_remove)) + "/" + str(len(features)) + "\n")
    
    fts_remove = pd.DataFrame({"Feature": fts_remove})
    fts_remove.to_parquet(out_dir + "/FeaturesRemoved_Volume_" + tag + ".parquet", index=False)


In [9]:
outdir = out_root + "/Test/"
Volume(df_dipl, outdir, "dipl_Test", True)
Volume(df_normProstate, outdir, "normProstate_Test", True)

Tag: dipl_Test
Correlating features to volume...


100%|██████████| 1/1 [00:00<00:00, 333.04it/s]
100%|██████████| 1/1 [00:00<00:00, 499.98it/s]
100%|██████████| 1/1 [00:00<00:00, 348.05it/s]
100%|██████████| 1/1 [00:00<00:00, 333.38it/s]
100%|██████████| 1/1 [00:00<00:00, 500.51it/s]


Volume redundant features: 1/1
Remaining features: 0/1






Tag: normProstate_Test
Correlating features to volume...


100%|██████████| 107/107 [00:00<00:00, 446.16it/s]
100%|██████████| 107/107 [00:00<00:00, 497.93it/s]
100%|██████████| 107/107 [00:00<00:00, 538.84it/s]
100%|██████████| 107/107 [00:00<00:00, 460.34it/s]
100%|██████████| 107/107 [00:00<00:00, 480.57it/s]



Volume redundant features: 17/107
Remaining features: 90/107



In [10]:
from scipy.spatial import distance

def DistanceMatrix(df, outdir, tag, output=False):
    '''
    Calculates the Euclidean distance between feature pair trajectories
    df: dataframe with all feature values across treatment for one region
    out_dir: output
    tag: tag for output to denote any changes
    output: print output
    '''
    
    features = df["Feature"].unique()
    PatIDs = df["PatID"].unique()

    df_res = pd.DataFrame()

    print("Tag: " + tag)
    print("Calculating Euclidean distance between feature pair trajectories...")

    if os.path.isdir(outdir + "/DM/") == False:
        os.mkdir(outdir + "/DM/")
        os.mkdir(outdir + "/DM/data/")
        os.mkdir(outdir + "/DM/figs/")
    
    for pat in tqdm(PatIDs):
        df_pat = df[df["PatID"] == pat]

        matrix = np.zeros((len(features), len(features)))

        for i, ft1 in enumerate(features):
            df_ft = df_pat[df_pat["Feature"] == ft1]
            vals1 = df_ft["FeatureValue"].values
            if vals1[0] == 0:
                vals1[0] = 1
            vals1_ch = (vals1 - vals1[0]) / vals1[0]
            for j, ft2 in enumerate(features):
                df_ft2 = df_pat[df_pat["Feature"] == ft2]
                vals2 = df_ft2["FeatureValue"].values
                if vals2[0] == 0:
                    vals2[0] = 1
                
                vals2_ch = (vals2 - vals2[0]) / vals2[0]
                

                # get euclidean distance
                # fill nan with 0
                if np.isnan(vals1_ch).any() == True:
                    print(pat)
                    print(ft1, vals1)
                if np.isnan(vals2_ch).any() == True:
                    print(pat)
                    print(ft2, vals2)
                
                matrix[i,j] = distance.euclidean(vals1_ch, vals2_ch)
    
        df_dist = pd.DataFrame(matrix, columns=features, index=features)
        df_dist.to_parquet(outdir + "/DM/data/" + pat + "_" + tag + ".parquet")

        if output == True:
            plt.figure(figsize=(10,10))
            sns.heatmap(df_dist, cmap="viridis")
            plt.title("{} - {}".format(pat, tag), fontsize=20)
            # make sure all ticks show
            plt.xticks(np.arange(len(features)) + 0.5, features, fontsize=6)
            plt.yticks(np.arange(len(features)) + 0.5, features, fontsize=6)
            

            plt.savefig(outdir + "/DM/figs/" + pat + "_" + tag + ".png")
            plt.close()



In [11]:
outdir = out_root + "/Test/"
volume_fts = pd.read_parquet(outdir + "/FeaturesRemoved_Volume_dipl_Test.parquet")["Feature"].values
df_dipl = df_SABR[df_SABR["Region"] == "dipl"]
df_dipl = df_dipl[~df_dipl["Feature"].isin(volume_fts)]
df_dipl = df_dipl[df_dipl["Feature"] != "firstorder_Minimum"]
DistanceMatrix(df_dipl, outdir, "dipl_Test", False)     

Tag: dipl_Test
Calculating Euclidean distance between feature pair trajectories...


100%|██████████| 19/19 [04:20<00:00, 13.71s/it]


In [12]:
volume_fts = pd.read_parquet(outdir + "/FeaturesRemoved_Volume_normProstate_Test.parquet")["Feature"].values
df_norm = df_SABR[df_SABR["Region"] == "normProstate"]
df_norm = df_norm[~df_norm["Feature"].isin(volume_fts)]
df_norm = df_norm[df_norm["Feature"] != "firstorder_Minimum"]
DistanceMatrix(df_norm, outdir, "normProstate_Test", False)


Tag: normProstate_Test
Calculating Euclidean distance between feature pair trajectories...


100%|██████████| 19/19 [03:09<00:00,  9.96s/it]


In [None]:
####################################################

def ClusterCheck(df, fts, t_val, tries, df_DM):
        '''
        If cluster has more than 10 features, re-cluster with smaller t_val
        '''
        df_c = df
        df_new = pd.DataFrame()
        # feature names
        df_new["FeatureName"] = fts
        # cluster labels
        c = df_c["Cluster"].values[0]
        
        # need to filter distance matrix to only include features in cluster
        df_DM_c = df_DM[fts]
        # only keep features in cluster
        df_DM_c = df_DM_c[df_DM_c.index.isin(fts)]
        
        # convert to numpy array
        arr_DM_c = df_DM_c.to_numpy()
        
        # cluster
        df_new["Cluster"] = spch.fclusterdata(arr_DM_c, t=t_val, criterion="distance", method="ward")
        df_new["Cluster"] = str(c*100) + str(tries) + df_new["Cluster"].astype(str)
        df_new["Cluster"] = df_new["Cluster"].astype(int)
        df_new["NumFts"] = df_new.groupby("Cluster")["Cluster"].transform("count")
        number_fts = df_new["NumFts"].unique()
        fts_check = df_new.loc[df_new["NumFts"] > 10]["FeatureName"].values
        #print(t_val, number_fts)#, df_new)
        return number_fts, df_new, fts_check

####################################################

def ClusterFeatures(DataRoot, Norm, s_t_val, tag):
    '''
    Cluster features using distance matrix, 
    t_val is threshold for clustering, 
    method is clustering forumula
    performs clustering until all clusters have less than 10 features
    '''
    root = DataRoot
    DM_dir = root + "\\Aaron\\ProstateMRL\\Data\\Paper1\\" + Norm + "\\Longitudinal\\DM\\csvs\\"
    out_dir = root + "\\Aaron\\ProstateMRL\\Data\\Paper1\\"+ Norm + "\\Longitudinal\\ClusterLabels\\"

    # patIDs = UF.SABRPats()

    cluster_method = "weighted"
    patIDs = df_all["PatID"].unique()

    for pat in tqdm(patIDs):
        df_DM = pd.read_csv(DM_dir + pat + "_" + tag + ".csv")
        df_DM.set_index("Unnamed: 0", inplace=True)
        arr_DM = df_DM.to_numpy()
        fts = df_DM.columns

        # create temp df to hold ft name and label
        df_labels = pd.DataFrame()
        df_labels["FeatureName"] = fts

        # cluster function using DM, need to experiment with t_val and method
        df_labels["Cluster"] = spch.fclusterdata(arr_DM, t=s_t_val, criterion="distance", method=cluster_method)
        df_labels.set_index("FeatureName", inplace=True)
        
        # check number of features in each cluster
        df_labels["NumFts"] = df_labels.groupby("Cluster")["Cluster"].transform("count")
        df_labels["Cluster"] = df_labels["Cluster"].astype(int)
        #print("---------------------------")
        #print("Patient: {}".format(pat))
        #print(df_labels.loc[df_labels["NumFts"] > 10])
        # loop through clusters 
        for c in df_labels["Cluster"].unique():
                df_c = df_labels[df_labels["Cluster"] == c]
                number_fts = len(df_c)
                # check numnber of features in cluster
                if number_fts > 10:
                        # if more than 10 features in cluster, reduce t_val and recluster
                        t_val = s_t_val - 0.2
                        check_fts = df_c.index.values
                        tries = 1
                        number_fts, df_labels2, check_fts = ClusterCheck(df_c, check_fts, t_val, tries, df_DM)
                        new_fts = df_labels2["FeatureName"].unique()
                        df_labels.loc[new_fts, "Cluster"] = df_labels2["Cluster"].values
                        df_labels["NumFts"] = df_labels.groupby("Cluster")["Cluster"].transform("count")

                        while number_fts.max() > 10:
                                t_val = t_val - 0.2
                                tries += 1
                                #print("Cluster: {} Tries: {} T_val: {}".format(c, tries, t_val))
                                number_fts, df_labels2, check_fts = ClusterCheck(df_c, check_fts, t_val, tries, df_DM)
                                new_fts = df_labels2["FeatureName"].unique()
                                df_labels.loc[new_fts, "Cluster"] = df_labels2["Cluster"].values
                        
        df_labels["NumFts"] = df_labels.groupby("Cluster")["Cluster"].transform("count")

        # read in df with ft vals and merge
        ft_vals = pd.read_csv(root +"Aaron\\ProstateMRL\\Data\\Paper1\\"+ Norm + "\\Features\\Longitudinal_All_fts_" + tag + ".csv")
        ft_vals["PatID"] = ft_vals["PatID"].astype(str)
        pat_ft_vals = ft_vals[ft_vals["PatID"] == pat]
        pat_ft_vals = pat_ft_vals.merge(df_labels, left_on="Feature", right_on="FeatureName")

        # output is feature values w/ cluster labels
        pat_ft_vals.to_csv(out_dir + pat + "_" + tag + ".csv")

####################################################


In [None]:
def ClusterCC(Cluster_ft_df):
    '''
    Input - df filtered for norm, patient, cluster
    Output - performs cross-correlation within clustered fts and returns ft most strongly correlated with the rest, if more than 2 fts present
    '''
    fts = Cluster_ft_df.Feature.unique()
    num_fts = len(fts)
   
    if num_fts > 2:
        vals = {} # stores fts and values
        ccfs = {} # stores cc values for each feature
        mean_ccfs = {} # stores the mean cc value for every feature
        num_sel = np.rint(len(fts) * 0.2)
        
        for f in fts:
            ft_df = Cluster_ft_df[Cluster_ft_df["Feature"] == f]
            ft_vals = ft_df.FeatureChange.values
            vals[f] = ft_vals
        
        for v in vals:
            ft_1 = vals[v]
            ccfs[v] = v
            ccfs_vals = []

            for u in vals:
                ft_2 = vals[u]
                corr = sts.ccf(ft_1, ft_2)[0] # cross correlation value, index [0] for for 0 lag in csc function
                ccfs_vals.append(corr)
            
            mean_ccfs[v] = np.array(ccfs_vals).mean() # get mean across all cc values for each ft

        s_mean_ccfs = sorted(mean_ccfs.items(), key=lambda x:x[1], reverse=True)
        sorted_temp = s_mean_ccfs[0:int(num_sel)]
        ft_selected = [seq[0] for seq in sorted_temp]

    else: 
        ft_selected = 0

    return ft_selected

####################################################

def ClusterSelection(DataRoot, Norm, tag, output):
    '''
    Loops through each patient  to select the 'best' feature for each cluster by performing cross-correlation
    Discards clusters with less than 3 features
    Selects features which are ranked in top 10 across all patients
    '''
    root = DataRoot
    patIDs = UF.SABRPats()

    labels_dir = root + "\\Aaron\\ProstateMRL\\Data\\Paper1\\" + Norm + "\\Longitudinal\\ClusterLabels\\"
    out_dir = root + "\\Aaron\\ProstateMRL\\Data\\Paper1\\"+ Norm +"\\Features\\"
    
    df_result = pd.DataFrame()
    for pat in tqdm(patIDs):
        # read in feature vals and associated cluster labels
        df_pat = pd.read_csv(labels_dir + pat + "_" + tag + ".csv")

        cluster_num = df_pat["Cluster"].unique()
        fts_selected = []
        df_result_pat = pd.DataFrame()

        # for each patient loop through each cluster to get 'best' feature
        for c in cluster_num:
            df_cluster = df_pat[df_pat["Cluster"] == c]

            # function loops through each cluster and gets feature values
            # performs cross-correlation and returns feature with highest mean correlation to all other features
            # returns NULL if < 3 features in cluster 
            ft_selected = ClusterCC(df_cluster)

            if ft_selected != 0:
                for f in ft_selected:
                    fts_selected.append(f)
        
        # filter through all feature values and select only new features
            row = {}

        for f in fts_selected:
            row["patID"] = pat
            row["Feature"] = f
            df_result_pat = df_result_pat.append(row, ignore_index=True)
        
        df_result = df_result.append(df_result_pat, ignore_index=True)

    df_result = df_result.Feature.value_counts().rename_axis("Feature").reset_index(name="Counts")
    # get number of counts at 10th row
    counts = df_result.iloc[10]["Counts"]
    #print(df_result)
    # get features with counts >= counts
    fts = df_result[df_result["Counts"] >= counts]["Feature"].values
    if output == True:
        print("\nSelected Features: ({})".format(len(fts)))
        for f in fts:
            print(f)
    df_result = df_result[df_result["Counts"] >= counts]

    # drop counts
    df_result.drop(columns=["Counts"], inplace=True)
    df_result.to_csv(out_dir + "Longitudinal_SelectedFeatures_" + tag + ".csv")

####################################################
