In [1]:
from utils import read_results_csv, extract_analysis_parameters
import pandas as pd
import plotly.express as px

In [2]:
# Dataset to analyze: either "microglia" or "astrocyte"
dataset = "microglia"

# Results .csv to explore and quality-check
csv_path = "./results/microglia_results_cellpdia30_sigma1_dilrad4_dnad_obj_seg_v1_gliaero6_gliathr20_dnadero2.csv"

# Load the results table and the mouse_id table, one DataFrame each
df, df_mouse_id = read_results_csv(dataset, csv_path)

# Recover the analysis settings from the results path (they are also reported
# in the cell output) and bind each one to its own variable
analysis_parameters = extract_analysis_parameters(csv_path)
(
    cellpose_nuclei_diameter,
    gaussian_sigma,
    dilation_radius_nuclei,
    dna_damage_segmenter_version,
    glia_nuclei_colocalization_erosion,
    glia_channel_threshold,
    glia_segmenter,
    glia_segmenter_version,
    dna_damage_erosion,
    parameters_title,
) = analysis_parameters

# Preview the first rows of the results table
df.head()


The following dataset will be analyzed: microglia
Cellpose nuclei diameter: 30
Gaussian sigma: 1
Dilation radius nuclei: 4
Dna damage segmenter version: 1
Glia erosion: 6
Glia threshold: 20
Glia semantic segmentation version: None
DNA damage foci erosion: 2


Unnamed: 0,index,filename,avg_dna_damage_foci/glia_+,avg_dna_damage_foci/glia_+_damage_+,avg_dna_damage_foci/all_nuclei,avg_dna_damage_foci/all_nuclei_damage_+,nr_+_dna_damage_glia_nuclei,nr_+_dna_damage_all_nuclei,nr_-_dna_damage_glia_nuclei,nr_glia_+_nuclei,nr_total_nuclei,%_dna_damage_signal,%_glia+_signal,damage_load_ratio,tissue_location,staining_id
0,0,DSB Iba1 101_40X_CA1,1.0,1.333333,0.573333,1.409836,9,61,3,12,150,0.783348,1.67799,0.406667,CA1,101
1,1,DSB Iba1 101_40X_CA3,0.777778,1.0,0.934959,1.513158,7,76,2,9,123,1.286697,2.135658,0.617886,CA3,101
2,2,DSB Iba1 101_40X_CTX1,1.1,1.375,0.958084,1.415929,24,113,6,30,167,2.621174,5.073738,0.676647,CTX1,101
3,3,DSB Iba1 101_40X_CTX2,1.363636,1.666667,0.898374,1.407643,9,157,2,11,246,1.908875,5.266762,0.638211,CTX2,101
4,4,DSB Iba1 101_40X_CTX3,0.533333,1.333333,0.759657,1.301471,6,136,9,15,233,1.623058,3.178596,0.583691,CTX3,101


In [None]:
# Make the join key numeric in both tables; values that cannot be parsed become NaN
for frame in (df, df_mouse_id):
    frame['staining_id'] = pd.to_numeric(frame['staining_id'], errors='coerce')

# Join the results with the mouse_id table on staining_id
# (default join type: only staining_ids present in both tables are kept)
merged_df = df.merge(df_mouse_id, on="staining_id")

# Preview the first rows of the merged table
merged_df.head()

In [None]:
# QC scatter of 'nr_+_dna_damage_glia_nuclei' per tissue location;
# hovering a point shows its staining_id, index and filename
fig = px.scatter(
    data_frame=df,
    x='tissue_location',
    y='nr_+_dna_damage_glia_nuclei',
    hover_data=['staining_id','index','filename'],
    title=f"Number of DNA damage+ {dataset.capitalize()}+ Nuclei by Tissue Location - {parameters_title}",
)

# Render the figure
fig.show()

In [None]:
# QC scatter of 'avg_dna_damage_foci/glia_+' per tissue location;
# hovering a point shows its staining_id, index and filename
fig = px.scatter(
    data_frame=df,
    x='tissue_location',
    y='avg_dna_damage_foci/glia_+',
    hover_data=['staining_id','index','filename'],
    title=f"Average DNA damage foci in {dataset.capitalize()} Nuclei by Tissue Location - {parameters_title}",
)

# Render the figure
fig.show()

In [None]:
# QC scatter of 'avg_dna_damage_foci/all_nuclei' per tissue location;
# hovering a point shows its staining_id, index and filename
fig = px.scatter(
    data_frame=df,
    x='tissue_location',
    y='avg_dna_damage_foci/all_nuclei',
    hover_data=['staining_id','index','filename'],
    title=f"Average DNA damage foci in All Nuclei by Tissue Location - {parameters_title}",
)

# Render the figure
fig.show()

In [None]:
# QC scatter of 'nr_glia_+_nuclei' per tissue location;
# hovering a point shows its staining_id, index and filename
fig = px.scatter(
    data_frame=df,
    x='tissue_location',
    y='nr_glia_+_nuclei',
    hover_data=['staining_id','index','filename'],
    title=f"Nr of {dataset.capitalize()}+ nuclei by Tissue Location - {parameters_title}",
)

# Render the figure
fig.show()

In [None]:
# QC scatter of 'nr_total_nuclei' per tissue location;
# hovering a point shows its staining_id, index and filename
fig = px.scatter(
    data_frame=df,
    x='tissue_location',
    y='nr_total_nuclei',
    hover_data=['staining_id','index','filename'],
    title=f'Nr of total nuclei by Tissue Location - {parameters_title}',
)

# Render the figure
fig.show()

In [None]:
# QC scatter of 'nr_glia_+_nuclei' per staining_id (one x position per sample);
# hovering a point shows its staining_id, index and filename
fig = px.scatter(
    data_frame=df,
    x='staining_id',
    y='nr_glia_+_nuclei',
    hover_data=['staining_id','index','filename'],
    title=f'Nr of {dataset.capitalize()}+ nuclei by Sample - {parameters_title}',
)

# Render the figure
fig.show()

In [None]:
# QC scatter of '%_dna_damage_signal' per staining_id;
# hovering a point shows its staining_id, index and filename
fig = px.scatter(
    data_frame=df,
    x='staining_id',
    y='%_dna_damage_signal',
    hover_data=['staining_id','index','filename'],
    title=f'Dna damage mask area (QC) - {parameters_title}',
)

# Render the figure
fig.show()

In [None]:
# QC scatter of '%_glia+_signal' per staining_id;
# hovering a point shows its staining_id, index and filename
fig = px.scatter(
    data_frame=df,
    x='staining_id',
    y='%_glia+_signal',
    hover_data=['staining_id','index','filename'],
    title=f'{dataset.capitalize()} mask area (QC) - {parameters_title}',
)

# Render the figure
fig.show()

We can observe that there are a number of outliers in the glia and DNA damage mask detection, because the staining is suboptimal in some of the samples. I will filter the data to remove those suboptimal stains and plot only the optimal ones, where the automated image analysis offers reliable results.

In [None]:
# Mean of the '%_glia+_signal' column over all rows of the results table
glia_mask_area_mean = df.loc[:, '%_glia+_signal'].mean()

# Mean of the '%_dna_damage_signal' column over all rows of the results table
dna_damage_mask_area_mean = df.loc[:, '%_dna_damage_signal'].mean()

# Report both reference means; they are used as the baseline for the automatic stain QC
print(f"Glia_mask_area_%_mean: {glia_mask_area_mean}, Dna_damage_mask_area_%_mean: {dna_damage_mask_area_mean}")

In [None]:
# Rate a stain by comparing a measurement against a reference mean.
# NOTE: the cut-off is the mean PLUS `outlier_factor` further means, so the
# default of 3 flags values at or above FOUR times the mean (not three times).
def determine_stain_quality(value, mean_value, outlier_factor=3):
    """Label a measurement as "optimal" or "suboptimal" relative to a mean.

    The cut-off is ``mean_value + mean_value * outlier_factor``. With the
    default ``outlier_factor=3`` that equals four times ``mean_value``.

    Args:
        value: Measurement to rate (the notebook passes the per-image mask
            area percentage).
        mean_value: Reference mean the measurement is compared against.
        outlier_factor: Number of additional multiples of ``mean_value``
            tolerated above the mean before a value counts as an outlier.
            Defaults to 3, which reproduces the original hard-coded cut-off.

    Returns:
        "optimal" if ``value`` is strictly below the cut-off, otherwise
        "suboptimal". A value for which the comparison is false (for example
        NaN) is therefore rated "suboptimal".
    """
    cutoff = mean_value + mean_value * outlier_factor
    if value < cutoff:
        return "optimal"
    return "suboptimal"

# Rate every row's glia mask area against the glia reference mean
merged_df['glia_stain_quality_auto'] = merged_df['%_glia+_signal'].apply(
    determine_stain_quality, args=(glia_mask_area_mean,)
)

# Rate every row's DNA damage mask area against the DNA damage reference mean
merged_df['dna_damage_stain_quality_auto'] = merged_df['%_dna_damage_signal'].apply(
    determine_stain_quality, args=(dna_damage_mask_area_mean,)
)

# A row passes QC only when both of its stains were rated "optimal"
merged_df['staining_qc_passed'] = (
    merged_df['glia_stain_quality_auto'].eq('optimal')
    & merged_df['dna_damage_stain_quality_auto'].eq('optimal')
)

# Propagate failures to the whole sample: within each staining_id the flag stays
# True only if every row of that staining_id passed; otherwise all become False
merged_df['staining_qc_passed'] = merged_df.groupby('staining_id')['staining_qc_passed'].transform('all')

merged_df.head()

In [None]:
# Write the QC-annotated table to ./results. The file name encodes the analysis
# settings; its glia part carries the segmenter version when glia_segmenter is
# truthy and the channel threshold otherwise.
qc_csv_path = (
    f"./results/qc_{dataset}_cellpdia{cellpose_nuclei_diameter}_sigma{gaussian_sigma}_dilrad{dilation_radius_nuclei}_dnad_obj_seg_v{dna_damage_segmenter_version}_gliaero{glia_nuclei_colocalization_erosion}_glia_sem_seg_v{glia_segmenter_version}_dnadero{dna_damage_erosion}.csv"
    if glia_segmenter
    else f"./results/qc_{dataset}_cellpdia{cellpose_nuclei_diameter}_sigma{gaussian_sigma}_dilrad{dilation_radius_nuclei}_dnad_obj_seg_v{dna_damage_segmenter_version}_gliaero{glia_nuclei_colocalization_erosion}_gliathr{glia_channel_threshold}_dnadero{dna_damage_erosion}.csv"
)
merged_df.to_csv(qc_csv_path, index=False)

In [None]:
# Keep only the rows whose sample cleared the automatic stain QC
auto_filtered_df = merged_df.loc[merged_df['staining_qc_passed'].eq(True)]

In [None]:
# Box plot of 'avg_dna_damage_foci/glia_+' per tissue location, using only the
# rows that passed the automatic stain QC.
fig = px.box(
    auto_filtered_df,
    x='tissue_location',
    y='avg_dna_damage_foci/glia_+',
    color='genotype',  # one box colour per genotype
    # The settings string is bound to `parameters_title`; no variable called
    # `title` exists in this notebook, so `{title}` raised NameError on a fresh run.
    title=f'DNA Damage in Glia Nuclei by Tissue Location and Genotype (sex-aggregated) - Auto stain QC - {parameters_title}',
)

# Render the figure
fig.show()

In [None]:
# Box plot of 'avg_dna_damage_foci/glia_+_damage_+' per tissue location, using
# only the rows that passed the automatic stain QC.
fig = px.box(
    auto_filtered_df,
    x='tissue_location',
    y='avg_dna_damage_foci/glia_+_damage_+',
    color='genotype',  # one box colour per genotype
    # The settings string is bound to `parameters_title`; no variable called
    # `title` exists in this notebook, so `{title}` raised NameError on a fresh run.
    title=f'DNA Damage in Damaged Glia Nuclei by Tissue Location and Genotype (sex-aggregated) - Auto stain QC - {parameters_title}',
)

# Render the figure
fig.show()

In [None]:
# Box plot of 'avg_dna_damage_foci/all_nuclei' per tissue location, using only
# the rows that passed the automatic stain QC.
fig = px.box(
    auto_filtered_df,
    x='tissue_location',
    y='avg_dna_damage_foci/all_nuclei',
    color='genotype',  # one box colour per genotype
    # The settings string is bound to `parameters_title`; no variable called
    # `title` exists in this notebook, so `{title}` raised NameError on a fresh run.
    title=f'DNA Damage in All Nuclei by Tissue Location and Genotype (sex-aggregated) - Auto stain QC - {parameters_title}',
)

# Render the figure
fig.show()

In [None]:
# Box plot of 'avg_dna_damage_foci/all_nuclei_damage_+' per tissue location,
# using only the rows that passed the automatic stain QC.
fig = px.box(
    auto_filtered_df,
    x='tissue_location',
    y='avg_dna_damage_foci/all_nuclei_damage_+',
    color='genotype',  # one box colour per genotype
    # The settings string is bound to `parameters_title`; no variable called
    # `title` exists in this notebook, so `{title}` raised NameError on a fresh run.
    title=f'DNA Damage in Damage+ Nuclei by Tissue Location and Genotype (sex-aggregated) - Auto stain QC - {parameters_title}',
)

# Render the figure
fig.show()

In [None]:
# Box plot of 'damage_load_ratio' per tissue location, using only the rows that
# passed the automatic stain QC.
fig = px.box(
    auto_filtered_df,
    x='tissue_location',
    y='damage_load_ratio',
    color='genotype',  # one box colour per genotype
    # The settings string is bound to `parameters_title`; no variable called
    # `title` exists in this notebook, so `{title}` raised NameError on a fresh run.
    title=f'Damage load ratio by Tissue Location and Genotype (sex-aggregated) - Auto stain QC - {parameters_title}',
)

# Render the figure
fig.show()

Show the DataFrame of rows that failed QC

In [None]:
# Rows whose sample did not clear the automatic stain QC
qc_failed_df = merged_df.loc[merged_df['staining_qc_passed'].eq(False)]

# Report how many rows were discarded
print(f"{qc_failed_df.shape[0]} stains have not passed QC and have been discarded")

# Values of the 'index' column for the discarded rows, as a plain Python list
qc_failed_list = qc_failed_df['index'].tolist()

# Display the discarded rows
qc_failed_df
