In [32]:
import pandas as pd
import plotly.express as px
import re

In [33]:
# Define the results you want to explore
csv_path = "./results_cellpdia30_sigma6_dilrad4_dnad_obj_seg_v3_gliaero6_glia_sem_seg_v1.csv"

# Read the CSV file and expose the row index as an 'index' column
# (later cells use it as hover information and to list QC-failed rows).
# Chaining reset_index() instead of inplace=True keeps the cell a single,
# re-runnable expression.
df = pd.read_csv(csv_path).reset_index()

# Extract 'tissue_location': the text after the last '_40X_' in the filename
df['tissue_location'] = df['filename'].str.split('_40X_').str[-1]

# Extract 'staining_id': the digits immediately before '_40X'.
# - Raw string: '\d' inside a plain literal is an invalid escape sequence
#   and triggers a warning on recent Python versions.
# - expand=False returns a Series (one capture group) instead of a
#   one-column DataFrame, which is what a single-column assignment expects.
df['staining_id'] = df['filename'].str.extract(r'(\d+)_40X', expand=False)

# Display the first few rows of the DataFrame
df.head()


Unnamed: 0,index,filename,avg_dna_damage_foci/glia_+,avg_dna_damage_foci/glia_+_damage_+,avg_dna_damage_foci/all_nuclei,avg_dna_damage_foci/all_nuclei_damage_+,nr_+_dna_damage_glia_nuclei,nr_-_dna_damage_glia_nuclei,nr_glia_+_nuclei,nr_total_nuclei,%_dna_damage_signal,%_glia+_signal,tissue_location,staining_id
0,0,DSB Iba1 16_40X_CA1,0.189655,1.0,0.308081,1.033898,11,47,58,198,0.170803,29.991817,CA1,16
1,1,DSB Iba1 16_40X_CA3,0.688312,1.65625,0.699454,1.542169,32,45,77,183,0.41399,51.081562,CA3,16
2,2,DSB Iba1 16_40X_CTX1,0.085714,1.0,0.164609,1.025641,3,32,35,243,0.10004,5.70879,CTX1,16
3,3,DSB Iba1 16_40X_CTX2,0.564885,1.298246,0.425703,1.232558,57,74,131,249,0.21801,46.037292,CTX2,16
4,4,DSB Iba1 16_40X_CTX3,0.54386,1.148148,0.524664,1.392857,27,30,57,223,0.296593,13.168621,CTX3,16


In [34]:
# Extract analysis parameters from the CSV file name.
# Only the file name is parsed (not the directories) so that digits in a
# parent folder cannot shift the positional parameters below.
csv_file_name = str(csv_path).replace("\\", "/").rsplit("/", 1)[-1]
extracted_values = re.findall(r'\d+', csv_file_name)

# The file name must encode six integers, in this order: cellpose diameter,
# gaussian sigma, dilation radius, DNA damage segmenter version, glia erosion,
# and either the glia segmenter version or the glia channel threshold.
# Fail early with a clear message instead of a NameError further down.
if len(extracted_values) < 6:
    raise ValueError(
        f"Expected at least 6 numeric parameters in '{csv_file_name}', "
        f"found {len(extracted_values)}: {extracted_values}"
    )

cellpose_nuclei_diameter = int(extracted_values[0])
gaussian_sigma = int(extracted_values[1])
dilation_radius_nuclei = int(extracted_values[2])
dna_damage_segmenter_version = int(extracted_values[3])
glia_nuclei_colocalization_erosion = int(extracted_values[4])

# True when glia were detected with the semantic segmenter, False when a
# channel threshold was used instead
glia_segmenter = "_glia_sem_seg_v" in csv_file_name

# Part of the run identifier shared by both glia detection modes
title_prefix = (
    f"cellpdia{cellpose_nuclei_diameter}_sigma{gaussian_sigma}"
    f"_dilrad{dilation_radius_nuclei}_dnad_obj_seg_v{dna_damage_segmenter_version}"
    f"_gliaero{glia_nuclei_colocalization_erosion}"
)

# Dynamically adjust plot titles. Both variables are always assigned (the
# unused one is None) so the prints below work for either file name format.
if glia_segmenter:
    glia_segmenter_version = int(extracted_values[5])
    glia_channel_threshold = None
    title = f"{title_prefix}_glia_sem_seg_v{glia_segmenter_version}"
else:
    glia_segmenter_version = None
    glia_channel_threshold = int(extracted_values[5])
    title = f"{title_prefix}_gliathr{glia_channel_threshold}"

# Print the assigned analysis parameters
print(f"Cellpose nuclei diameter: {cellpose_nuclei_diameter}")
print(f"Gaussian sigma: {gaussian_sigma}")
print(f"Dilation radius nuclei: {dilation_radius_nuclei}")
print(f"Dna damage segmenter version: {dna_damage_segmenter_version}")
print(f"Glia erosion: {glia_nuclei_colocalization_erosion}")
print(f"Glia threshold: {glia_channel_threshold}")
print(f"Glia semantic segmentation version: {glia_segmenter_version}")



Cellpose nuclei diameter: 30
Gaussian sigma: 6
Dilation radius nuclei: 4
Dna damage segmenter version: 3
Glia erosion: 6
Glia threshold: None
Glia semantic segmentation version: 1


In [35]:
# Location of the table that lists the animal metadata for each staining_id
mouse_id_csv_path = "./mouse_ids.csv"

# Load the semicolon-separated, UTF-8 encoded file
df_mouse_id = pd.read_csv(
    mouse_id_csv_path,
    sep=";",
    encoding="UTF-8",
)

# Preview the first rows
df_mouse_id.head()

Unnamed: 0,staining_id,animal_id,sex,genotype,dna_damage_stain_quality_manual,manual_qc
0,34,885,male,APP/PS1,poor,passed
1,53,885,male,APP/PS1,good,passed
2,23,2042,male,APP/PS1,good,passed
3,31,2042,male,APP/PS1,good,passed
4,43,2042,male,APP/PS1,good,passed


In [36]:
# Make the join key numeric in both tables; unparseable values become NaN
for frame in (df, df_mouse_id):
    frame['staining_id'] = pd.to_numeric(frame['staining_id'], errors='coerce')

# Join the analysis results with the mouse metadata on staining_id
merged_df = df.merge(df_mouse_id, on="staining_id")

# Preview the first rows of the merged table
merged_df.head()

Unnamed: 0,index,filename,avg_dna_damage_foci/glia_+,avg_dna_damage_foci/glia_+_damage_+,avg_dna_damage_foci/all_nuclei,avg_dna_damage_foci/all_nuclei_damage_+,nr_+_dna_damage_glia_nuclei,nr_-_dna_damage_glia_nuclei,nr_glia_+_nuclei,nr_total_nuclei,%_dna_damage_signal,%_glia+_signal,tissue_location,staining_id,animal_id,sex,genotype,dna_damage_stain_quality_manual,manual_qc
0,0,DSB Iba1 16_40X_CA1,0.189655,1.0,0.308081,1.033898,11,47,58,198,0.170803,29.991817,CA1,16,887,male,APP/PS1,poor,failed
1,1,DSB Iba1 16_40X_CA3,0.688312,1.65625,0.699454,1.542169,32,45,77,183,0.41399,51.081562,CA3,16,887,male,APP/PS1,poor,failed
2,2,DSB Iba1 16_40X_CTX1,0.085714,1.0,0.164609,1.025641,3,32,35,243,0.10004,5.70879,CTX1,16,887,male,APP/PS1,poor,failed
3,3,DSB Iba1 16_40X_CTX2,0.564885,1.298246,0.425703,1.232558,57,74,131,249,0.21801,46.037292,CTX2,16,887,male,APP/PS1,poor,failed
4,4,DSB Iba1 16_40X_CTX3,0.54386,1.148148,0.524664,1.392857,27,30,57,223,0.296593,13.168621,CTX3,16,887,male,APP/PS1,poor,failed


In [37]:
# Scatter: number of DNA damage+ glia nuclei for each tissue location
fig = px.scatter(
    data_frame=df,
    x='tissue_location',
    y='nr_+_dna_damage_glia_nuclei',
    hover_data=['staining_id', 'index'],
    title=f"Number of DNA damage+ Glia+ Nuclei by Tissue Location - {title}",
)

# Render the figure
fig.show()

In [38]:
# Scatter: average DNA damage foci per glia+ nucleus for each tissue location
fig = px.scatter(
    data_frame=df,
    x='tissue_location',
    y='avg_dna_damage_foci/glia_+',
    hover_data=['staining_id'],
    title=f"Average DNA damage foci in Glia Nuclei by Tissue Location - {title}",
)

# Render the figure
fig.show()

In [39]:
# Scatter: average DNA damage foci over all nuclei for each tissue location
fig = px.scatter(
    data_frame=df,
    x='tissue_location',
    y='avg_dna_damage_foci/all_nuclei',
    hover_data=['staining_id'],
    title=f"Average DNA damage foci in All Nuclei by Tissue Location - {title}",
)

# Render the figure
fig.show()

In [40]:
# Scatter: number of glia+ nuclei for each tissue location
fig = px.scatter(
    data_frame=df,
    x='tissue_location',
    y='nr_glia_+_nuclei',
    hover_data=['staining_id', 'index'],
    title=f"Nr of glia+ nuclei by Tissue Location - {title}",
)

# Render the figure
fig.show()

In [41]:
# Scatter: total number of nuclei for each tissue location
fig = px.scatter(
    data_frame=df,
    x='tissue_location',
    y='nr_total_nuclei',
    hover_data=['staining_id', 'index'],
    title=f'Nr of total nuclei by Tissue Location - {title}',
)

# Render the figure
fig.show()

In [42]:
# Scatter: number of glia+ nuclei for each staining_id (sample)
fig = px.scatter(
    data_frame=df,
    x='staining_id',
    y='nr_glia_+_nuclei',
    hover_data=['tissue_location', 'index'],
    title=f'Nr of glia+ nuclei by Sample - {title}',
)

# Render the figure
fig.show()

In [43]:
# Scatter (QC): DNA damage mask area for each staining_id
fig = px.scatter(
    data_frame=df,
    x='staining_id',
    y='%_dna_damage_signal',
    hover_data=['tissue_location', 'index'],
    title=f'Dna damage mask area (QC) - {title}',
)

# Render the figure
fig.show()

In [44]:
# Scatter (QC): glia mask area for each staining_id
fig = px.scatter(
    data_frame=df,
    x='staining_id',
    y='%_glia+_signal',
    hover_data=['tissue_location', 'index'],
    title=f'Glia mask area (QC) - {title}',
)

# Render the figure
fig.show()

We can observe that there are a number of outliers in the glia and DNA damage mask detection, because the staining is suboptimal in some of the samples. I will filter the data to remove those suboptimal stains and plot only the optimal ones, where the automated image analysis offers reliable results.

In [45]:
# Mean of the glia+ signal area column across all rows of the results table
glia_mask_area_mean = df.loc[:, '%_glia+_signal'].mean()

# Mean of the DNA damage signal area column across all rows of the results table
dna_damage_mask_area_mean = df.loc[:, '%_dna_damage_signal'].mean()

# Report both reference values
print(f"Glia_mask_area_%_mean: {glia_mask_area_mean}, Dna_damage_mask_area_%_mean: {dna_damage_mask_area_mean}")

Glia_mask_area_%_mean: 7.065792689247737, Dna_damage_mask_area_%_mean: 0.39151206849113346


In [46]:
# Classify a mask-area value against the mean of its column.
def determine_stain_quality(value, mean_value, tolerance_factor=3):
    """Label a value as "optimal" or "suboptimal" relative to a mean.

    The cut-off is ``mean_value + mean_value * tolerance_factor``. With the
    default factor of 3 this is 4 times the mean, i.e. a value is an outlier
    when it exceeds the mean by 3 or more additional means.

    Parameters
    ----------
    value : float
        The measurement to classify.
    mean_value : float
        Reference mean the measurement is compared against.
    tolerance_factor : float, optional
        How many extra multiples of the mean are tolerated above the mean
        itself. Defaults to 3, which reproduces the original hard-coded rule.

    Returns
    -------
    str
        "optimal" if ``value`` is strictly below the cut-off, otherwise
        "suboptimal" (this includes NaN, since NaN comparisons are False).
    """
    # Same arithmetic as the original expression so results are bit-identical
    threshold = mean_value + mean_value * tolerance_factor
    return "optimal" if value < threshold else "suboptimal"

# Label each row's glia mask area as "optimal" or "suboptimal" against the glia mean
merged_df['glia_stain_quality_auto'] = merged_df['%_glia+_signal'].apply(
    determine_stain_quality, args=(glia_mask_area_mean,)
)

# Label each row's DNA damage mask area the same way against the DNA damage mean
merged_df['dna_damage_stain_quality_auto'] = merged_df['%_dna_damage_signal'].apply(
    determine_stain_quality, args=(dna_damage_mask_area_mean,)
)

# A row passes only when both automatic labels are "optimal"
glia_is_optimal = merged_df['glia_stain_quality_auto'].eq('optimal')
dna_damage_is_optimal = merged_df['dna_damage_stain_quality_auto'].eq('optimal')
merged_df['staining_qc_passed'] = glia_is_optimal & dna_damage_is_optimal

# Propagate failures within each staining_id: the flag stays True only if
# every row sharing that staining_id passed, otherwise all of them become False
merged_df['staining_qc_passed'] = (
    merged_df
    .groupby('staining_id')['staining_qc_passed']
    .transform('all')
)

merged_df.head()

Unnamed: 0,index,filename,avg_dna_damage_foci/glia_+,avg_dna_damage_foci/glia_+_damage_+,avg_dna_damage_foci/all_nuclei,avg_dna_damage_foci/all_nuclei_damage_+,nr_+_dna_damage_glia_nuclei,nr_-_dna_damage_glia_nuclei,nr_glia_+_nuclei,nr_total_nuclei,...,tissue_location,staining_id,animal_id,sex,genotype,dna_damage_stain_quality_manual,manual_qc,glia_stain_quality_auto,dna_damage_stain_quality_auto,staining_qc_passed
0,0,DSB Iba1 16_40X_CA1,0.189655,1.0,0.308081,1.033898,11,47,58,198,...,CA1,16,887,male,APP/PS1,poor,failed,suboptimal,optimal,False
1,1,DSB Iba1 16_40X_CA3,0.688312,1.65625,0.699454,1.542169,32,45,77,183,...,CA3,16,887,male,APP/PS1,poor,failed,suboptimal,optimal,False
2,2,DSB Iba1 16_40X_CTX1,0.085714,1.0,0.164609,1.025641,3,32,35,243,...,CTX1,16,887,male,APP/PS1,poor,failed,optimal,optimal,False
3,3,DSB Iba1 16_40X_CTX2,0.564885,1.298246,0.425703,1.232558,57,74,131,249,...,CTX2,16,887,male,APP/PS1,poor,failed,suboptimal,optimal,False
4,4,DSB Iba1 16_40X_CTX3,0.54386,1.148148,0.524664,1.392857,27,30,57,223,...,CTX3,16,887,male,APP/PS1,poor,failed,optimal,optimal,False


In [47]:
# Pick the output file name that matches the glia detection mode of this run.
# Only the selected f-string is evaluated, so the variable belonging to the
# other mode is never read.
# NOTE(review): the threshold variant ends in "_.csv" (trailing underscore)
# while the segmenter variant does not — confirm whether downstream readers
# rely on that exact name before normalizing it.
qc_csv_name = (
    f"qc_cellpdia{cellpose_nuclei_diameter}_sigma{gaussian_sigma}_dilrad{dilation_radius_nuclei}_dnad_obj_seg_v{dna_damage_segmenter_version}_gliaero{glia_nuclei_colocalization_erosion}_glia_sem_seg_v{glia_segmenter_version}.csv"
    if glia_segmenter
    else f"qc_cellpdia{cellpose_nuclei_diameter}_sigma{gaussian_sigma}_dilrad{dilation_radius_nuclei}_dnad_obj_seg_v{dna_damage_segmenter_version}_gliaero{glia_nuclei_colocalization_erosion}_gliathr{glia_channel_threshold}_.csv"
)

# Write the merged table, including the QC columns, without the pandas index
merged_df.to_csv(qc_csv_name, index=False)

In [48]:
# Keep only the rows whose staining passed the automated QC
passed_auto_qc = merged_df['staining_qc_passed'].eq(True)
auto_filtered_df = merged_df[passed_auto_qc]

In [49]:
# Box plot: average DNA damage foci per glia+ nucleus, per tissue location
fig = px.box(
    data_frame=auto_filtered_df,
    x='tissue_location',
    y='avg_dna_damage_foci/glia_+',
    color='genotype',  # one colour per genotype
    title=f'DNA Damage in Glia Nuclei by Tissue Location and Genotype (sex-aggregated) - Auto stain QC - {title}',
)

# Render the figure
fig.show()





In [50]:
# Box plot: average DNA damage foci in damage+ glia nuclei, per tissue location
fig = px.box(
    data_frame=auto_filtered_df,
    x='tissue_location',
    y='avg_dna_damage_foci/glia_+_damage_+',
    color='genotype',  # one colour per genotype
    title=f'DNA Damage in Damaged Glia Nuclei by Tissue Location and Genotype (sex-aggregated) - Auto stain QC - {title}',
)

# Render the figure
fig.show()





In [51]:
# Box plot: average DNA damage foci over all nuclei, per tissue location
fig = px.box(
    data_frame=auto_filtered_df,
    x='tissue_location',
    y='avg_dna_damage_foci/all_nuclei',
    color='genotype',  # one colour per genotype
    title=f'DNA Damage in All Nuclei by Tissue Location and Genotype (sex-aggregated) - Auto stain QC - {title}',
)

# Render the figure
fig.show()





In [52]:
# Box plot: average DNA damage foci in damage+ nuclei, per tissue location
fig = px.box(
    data_frame=auto_filtered_df,
    x='tissue_location',
    y='avg_dna_damage_foci/all_nuclei_damage_+',
    color='genotype',  # one colour per genotype
    title=f'DNA Damage in Damage+ Nuclei by Tissue Location and Genotype (sex-aggregated) - Auto stain QC - {title}',
)

# Render the figure
fig.show()





Show the rows that failed the automated staining QC

In [54]:
# Rows whose staining did not pass the automated QC
qc_failed_df = merged_df[merged_df['staining_qc_passed'] == False]

# QC is decided per staining_id (all rows of a staining fail together), so
# the row count alone overstates the number of failed stainings. Report both
# the number of discarded images (rows) and the number of distinct stainings.
n_failed_images = qc_failed_df.shape[0]
n_failed_stainings = qc_failed_df['staining_id'].nunique()
print(
    f"{n_failed_images} images from {n_failed_stainings} stainings "
    "have not passed QC and have been discarded"
)

# Values of the 'index' column for the discarded rows
qc_failed_list = qc_failed_df['index'].tolist()

qc_failed_df


12 stains have not passed QC and have been discarded


Unnamed: 0,index,filename,avg_dna_damage_foci/glia_+,avg_dna_damage_foci/glia_+_damage_+,avg_dna_damage_foci/all_nuclei,avg_dna_damage_foci/all_nuclei_damage_+,nr_+_dna_damage_glia_nuclei,nr_-_dna_damage_glia_nuclei,nr_glia_+_nuclei,nr_total_nuclei,...,tissue_location,staining_id,animal_id,sex,genotype,dna_damage_stain_quality_manual,manual_qc,glia_stain_quality_auto,dna_damage_stain_quality_auto,staining_qc_passed
0,0,DSB Iba1 16_40X_CA1,0.189655,1.0,0.308081,1.033898,11,47,58,198,...,CA1,16,887,male,APP/PS1,poor,failed,suboptimal,optimal,False
1,1,DSB Iba1 16_40X_CA3,0.688312,1.65625,0.699454,1.542169,32,45,77,183,...,CA3,16,887,male,APP/PS1,poor,failed,suboptimal,optimal,False
2,2,DSB Iba1 16_40X_CTX1,0.085714,1.0,0.164609,1.025641,3,32,35,243,...,CTX1,16,887,male,APP/PS1,poor,failed,optimal,optimal,False
3,3,DSB Iba1 16_40X_CTX2,0.564885,1.298246,0.425703,1.232558,57,74,131,249,...,CTX2,16,887,male,APP/PS1,poor,failed,suboptimal,optimal,False
4,4,DSB Iba1 16_40X_CTX3,0.54386,1.148148,0.524664,1.392857,27,30,57,223,...,CTX3,16,887,male,APP/PS1,poor,failed,optimal,optimal,False
5,5,DSB Iba1 16_40X_DG,0.227848,1.5,0.267281,1.234043,12,67,79,217,...,DG,16,887,male,APP/PS1,poor,failed,suboptimal,optimal,False
18,18,DSB Iba1 19_40X_CA1,1.5,1.916667,1.680851,2.135135,36,10,46,47,...,CA1,19,154,female,APP/PS1 x Neil3 KO,poor,failed,suboptimal,suboptimal,False
19,19,DSB Iba1 19_40X_CA3,1.675,2.09375,1.688889,2.111111,32,8,40,45,...,CA3,19,154,female,APP/PS1 x Neil3 KO,poor,failed,suboptimal,suboptimal,False
20,20,DSB Iba1 19_40X_CTX1,0.757009,1.557692,0.792308,1.688525,52,55,107,130,...,CTX1,19,154,female,APP/PS1 x Neil3 KO,poor,failed,suboptimal,optimal,False
21,21,DSB Iba1 19_40X_CTX2,0.606061,1.333333,0.471698,1.25,15,18,33,53,...,CTX2,19,154,female,APP/PS1 x Neil3 KO,poor,failed,suboptimal,optimal,False
