In [90]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

In [91]:
csv_path = "./test_results.csv"

# Load the per-image analysis results and expose the row index as an 'index'
# column (reassigned rather than mutated in place).
df = pd.read_csv(csv_path).reset_index()

# 'tissue_location' is whatever follows the last '_40X_' marker in the filename
df['tissue_location'] = df['filename'].str.split('_40X_').str[-1]

# 'staining_id' is the run of digits immediately preceding '_40X'.
# A raw string keeps '\d' as a regex token instead of an invalid string escape,
# and expand=False returns a Series (one capture group) rather than a DataFrame.
df['staining_id'] = df['filename'].str.extract(r'(\d+)_40X', expand=False)

# Display the first few rows of the DataFrame
df.head()


Unnamed: 0,index,filename,avg_dna_damage_foci/glia_+,avg_dna_damage_foci/glia_+_damage_+,avg_dna_damage_foci/all_nuclei,nr_+_dna_damage_glia_nuclei,nr_-_dna_damage_glia_nuclei,nr_glia_+_nuclei,nr_total_nuclei,%_dna_damage_signal,%_glia+_signal,tissue_location,staining_id
0,0,DSB Iba1 16_40X_CA1,0.564286,1.338983,0.738636,59,81,140,176,1.09663,88.633728,CA1,16
1,1,DSB Iba1 16_40X_CA3,2.718954,3.014493,3.6875,138,15,153,160,18.227673,96.349525,CA3,16
2,2,DSB Iba1 16_40X_CTX1,0.72973,1.588235,0.699115,17,20,37,226,0.844288,13.598442,CTX1,16
3,3,DSB Iba1 16_40X_CTX2,1.69869,1.776256,2.296296,219,10,229,243,39.405441,94.930744,CTX2,16
4,4,DSB Iba1 16_40X_CTX3,1.732283,1.864407,1.822967,118,9,127,209,9.070396,54.768944,CTX3,16


In [92]:
mouse_id_csv_path = "./mouse_ids.csv"

# Load the staining_id -> animal lookup table (semicolon-separated, UTF-8)
df_mouse_id = pd.read_csv(
    mouse_id_csv_path,
    sep=";",
    encoding="UTF-8",
)

# Preview the first rows
df_mouse_id.head()

Unnamed: 0,staining_id,animal_id,sex,genotype,dna_damage_stain_quality_manual
0,34,885,male,APP/PS1,poor
1,53,885,male,APP/PS1,good
2,23,2042,male,APP/PS1,good
3,31,2042,male,APP/PS1,good
4,43,2042,male,APP/PS1,good


In [93]:
# Coerce the join key to a numeric dtype on both sides; values that cannot be
# parsed become NaN instead of raising.
df['staining_id'] = pd.to_numeric(df['staining_id'], errors='coerce')
df_mouse_id['staining_id'] = pd.to_numeric(df_mouse_id['staining_id'], errors='coerce')

# Merge the image results with the mouse metadata on staining_id.
# pandas matches null keys against each other (unlike SQL), so rows whose
# staining_id is NaN on either side are excluded from the join to avoid
# pairing unparseable filenames with blank metadata rows.
merged_df = pd.merge(
    df.dropna(subset=['staining_id']),
    df_mouse_id.dropna(subset=['staining_id']),
    on="staining_id",
)

# Display the first few rows of the DataFrame
merged_df.head()

Unnamed: 0,index,filename,avg_dna_damage_foci/glia_+,avg_dna_damage_foci/glia_+_damage_+,avg_dna_damage_foci/all_nuclei,nr_+_dna_damage_glia_nuclei,nr_-_dna_damage_glia_nuclei,nr_glia_+_nuclei,nr_total_nuclei,%_dna_damage_signal,%_glia+_signal,tissue_location,staining_id,animal_id,sex,genotype,dna_damage_stain_quality_manual
0,0,DSB Iba1 16_40X_CA1,0.564286,1.338983,0.738636,59,81,140,176,1.09663,88.633728,CA1,16,887,male,APP/PS1,poor
1,1,DSB Iba1 16_40X_CA3,2.718954,3.014493,3.6875,138,15,153,160,18.227673,96.349525,CA3,16,887,male,APP/PS1,poor
2,2,DSB Iba1 16_40X_CTX1,0.72973,1.588235,0.699115,17,20,37,226,0.844288,13.598442,CTX1,16,887,male,APP/PS1,poor
3,3,DSB Iba1 16_40X_CTX2,1.69869,1.776256,2.296296,219,10,229,243,39.405441,94.930744,CTX2,16,887,male,APP/PS1,poor
4,4,DSB Iba1 16_40X_CTX3,1.732283,1.864407,1.822967,118,9,127,209,9.070396,54.768944,CTX3,16,887,male,APP/PS1,poor


In [94]:
# Scatter of the DNA-damage-positive glia nuclei count per tissue location;
# hovering a point shows its staining_id and row index.
fig = px.scatter(
    data_frame=df,
    x='tissue_location',
    y='nr_+_dna_damage_glia_nuclei',
    hover_data=['staining_id', 'index'],
    title='Nr of DNA_damage+ glia+ nuclei by Tissue Location',
)

fig.show()

In [95]:
# Scatter of the 'avg_dna_damage_foci/glia_+' column per tissue location;
# hovering a point shows its staining_id.
fig = px.scatter(
    data_frame=df,
    x='tissue_location',
    y='avg_dna_damage_foci/glia_+',
    hover_data=['staining_id'],
    title='DNA Damage in Glia Nuclei by Tissue Location',
)

fig.show()

In [96]:
# Scatter of the 'avg_dna_damage_foci/all_nuclei' column per tissue location;
# hovering a point shows its staining_id.
fig = px.scatter(
    data_frame=df,
    x='tissue_location',
    y='avg_dna_damage_foci/all_nuclei',
    hover_data=['staining_id'],
    title='DNA Damage in All Nuclei by Tissue Location',
)

fig.show()

In [97]:
# Scatter of the glia-positive nuclei count per tissue location;
# hovering a point shows its staining_id and row index.
fig = px.scatter(
    data_frame=df,
    x='tissue_location',
    y='nr_glia_+_nuclei',
    hover_data=['staining_id', 'index'],
    title='Nr of glia+ nuclei by Tissue Location',
)

fig.show()

In [98]:
# Scatter of the total nuclei count per tissue location;
# hovering a point shows its staining_id and row index.
fig = px.scatter(
    data_frame=df,
    x='tissue_location',
    y='nr_total_nuclei',
    hover_data=['staining_id', 'index'],
    title='Nr of total nuclei by Tissue Location',
)

fig.show()

In [99]:
# Scatter of the glia-positive nuclei count per staining_id (sample);
# hovering a point shows its tissue location and row index.
fig = px.scatter(
    data_frame=df,
    x='staining_id',
    y='nr_glia_+_nuclei',
    hover_data=['tissue_location', 'index'],
    title='Nr of glia+ nuclei by Sample',
)

fig.show()

In [100]:
# QC view: '%_dna_damage_signal' per staining_id;
# hovering a point shows its tissue location and row index.
fig = px.scatter(
    data_frame=df,
    x='staining_id',
    y='%_dna_damage_signal',
    hover_data=['tissue_location', 'index'],
    title='Dna damage mask area (QC)',
)

fig.show()

In [101]:
# QC view: '%_glia+_signal' per staining_id;
# hovering a point shows its tissue location and row index.
fig = px.scatter(
    data_frame=df,
    x='staining_id',
    y='%_glia+_signal',
    hover_data=['tissue_location', 'index'],
    title='Glia mask area (QC)',
)

fig.show()

We can observe that there are a number of outliers in the glial and DNA damage mask detection, given that the staining is suboptimal in some of the samples. I will filter the data to remove those suboptimal stains and plot only the optimal ones, where the automated image analysis offers reliable results.

In [102]:
# Mean of the '%_glia+_signal' column over all rows of df; used as the glia QC threshold.
# NOTE(review): computed on df rather than merged_df, so the two can differ if
# the merge dropped rows — confirm this is intended.
glia_mask_area_mean = df.loc[:, '%_glia+_signal'].mean()

# Mean of the '%_dna_damage_signal' column over all rows of df; used as the DNA damage QC threshold.
dna_damage_mask_area_mean = df.loc[:, '%_dna_damage_signal'].mean()

In [103]:
def determine_stain_quality(value, mean_value):
    """Classify a value against a reference mean.

    Args:
        value: The measurement to classify.
        mean_value: The threshold to compare against.

    Returns:
        "optimal" when ``value`` is strictly below ``mean_value``;
        "suboptimal" otherwise (including when the two are equal).
    """
    return "optimal" if value < mean_value else "suboptimal"

# Label every image's glia stain by comparing its glia mask area with the mean threshold
merged_df['glia_stain_quality_auto'] = merged_df['%_glia+_signal'].apply(
    determine_stain_quality, args=(glia_mask_area_mean,)
)

# Same labelling for the DNA damage stain, against its own mean threshold
merged_df['dna_damage_stain_quality_auto'] = merged_df['%_dna_damage_signal'].apply(
    determine_stain_quality, args=(dna_damage_mask_area_mean,)
)

# An image passes QC only when both of its stains were labelled 'optimal'
glia_is_optimal = merged_df['glia_stain_quality_auto'].eq('optimal')
dna_damage_is_optimal = merged_df['dna_damage_stain_quality_auto'].eq('optimal')
merged_df['staining_qc_passed'] = glia_is_optimal & dna_damage_is_optimal

# Propagate failures within a staining: every row sharing a staining_id is True
# only if all rows of that staining_id passed; otherwise all become False.
merged_df['staining_qc_passed'] = (
    merged_df.groupby('staining_id')['staining_qc_passed'].transform('all')
)

merged_df.head()

Unnamed: 0,index,filename,avg_dna_damage_foci/glia_+,avg_dna_damage_foci/glia_+_damage_+,avg_dna_damage_foci/all_nuclei,nr_+_dna_damage_glia_nuclei,nr_-_dna_damage_glia_nuclei,nr_glia_+_nuclei,nr_total_nuclei,%_dna_damage_signal,%_glia+_signal,tissue_location,staining_id,animal_id,sex,genotype,dna_damage_stain_quality_manual,glia_stain_quality_auto,dna_damage_stain_quality_auto,staining_qc_passed
0,0,DSB Iba1 16_40X_CA1,0.564286,1.338983,0.738636,59,81,140,176,1.09663,88.633728,CA1,16,887,male,APP/PS1,poor,suboptimal,optimal,False
1,1,DSB Iba1 16_40X_CA3,2.718954,3.014493,3.6875,138,15,153,160,18.227673,96.349525,CA3,16,887,male,APP/PS1,poor,suboptimal,suboptimal,False
2,2,DSB Iba1 16_40X_CTX1,0.72973,1.588235,0.699115,17,20,37,226,0.844288,13.598442,CTX1,16,887,male,APP/PS1,poor,suboptimal,optimal,False
3,3,DSB Iba1 16_40X_CTX2,1.69869,1.776256,2.296296,219,10,229,243,39.405441,94.930744,CTX2,16,887,male,APP/PS1,poor,suboptimal,suboptimal,False
4,4,DSB Iba1 16_40X_CTX3,1.732283,1.864407,1.822967,118,9,127,209,9.070396,54.768944,CTX3,16,887,male,APP/PS1,poor,suboptimal,optimal,False


In [104]:
# Manual QC: keep only rows whose DNA damage stain was rated 'good'
manual_qc_ok = merged_df['dna_damage_stain_quality_manual'].eq('good')
filtered_df = merged_df.loc[manual_qc_ok]

In [105]:
# Auto QC: keep only rows flagged as having passed the automated staining QC
auto_qc_ok = merged_df['staining_qc_passed'].eq(True)
auto_filtered_df = merged_df.loc[auto_qc_ok]

In [106]:
# Box plot of 'avg_dna_damage_foci/glia_+' per tissue location on the
# manually QC-filtered rows, with one colored box group per genotype.
fig = px.box(
    data_frame=filtered_df,
    x='tissue_location',
    y='avg_dna_damage_foci/glia_+',
    color='genotype',
    title='DNA Damage in Glia Nuclei by Tissue Location and Genotype (sex-aggregated) - Manual stain QC',
)

fig.show()

In [107]:
# Box plot of 'avg_dna_damage_foci/glia_+' per tissue location on the
# auto QC-filtered rows, with one colored box group per genotype.
fig = px.box(
    data_frame=auto_filtered_df,
    x='tissue_location',
    y='avg_dna_damage_foci/glia_+',
    color='genotype',
    title='DNA Damage in Glia Nuclei by Tissue Location and Genotype (sex-aggregated) - Auto stain QC',
)

fig.show()

In [108]:
# Box plot of 'avg_dna_damage_foci/glia_+_damage_+' per tissue location on the
# manually QC-filtered rows, with one colored box group per genotype.
fig = px.box(
    data_frame=filtered_df,
    x='tissue_location',
    y='avg_dna_damage_foci/glia_+_damage_+',
    color='genotype',
    title='DNA Damage in Damaged Glia Nuclei by Tissue Location and Genotype (sex-aggregated) - Manual stain QC',
)

fig.show()

In [109]:
# Box plot of 'avg_dna_damage_foci/glia_+_damage_+' per tissue location on the
# auto QC-filtered rows, with one colored box group per genotype.
fig = px.box(
    data_frame=auto_filtered_df,
    x='tissue_location',
    y='avg_dna_damage_foci/glia_+_damage_+',
    color='genotype',
    title='DNA Damage in Damaged Glia Nuclei by Tissue Location and Genotype (sex-aggregated) - Auto stain QC',
)

fig.show()

In [110]:
# Box plot of 'avg_dna_damage_foci/all_nuclei' per tissue location on the
# manually QC-filtered rows, with one colored box group per genotype.
fig = px.box(
    data_frame=filtered_df,
    x='tissue_location',
    y='avg_dna_damage_foci/all_nuclei',
    color='genotype',
    title='DNA Damage in All Nuclei by Tissue Location and Genotype (sex-aggregated) - Manual stain QC',
)

fig.show()


In [111]:
# Box plot of 'avg_dna_damage_foci/all_nuclei' per tissue location on the
# auto QC-filtered rows, with one colored box group per genotype.
fig = px.box(
    data_frame=auto_filtered_df,
    x='tissue_location',
    y='avg_dna_damage_foci/all_nuclei',
    color='genotype',
    title='DNA Damage in All Nuclei by Tissue Location and Genotype (sex-aggregated) - Auto stain QC',
)

fig.show()

Now that we see that the automated QC offers very similar results to the manual QC of the stainings, I will plot the distribution of the analyzed parameters in the filtered dataset.

In [112]:
# One entry per scatter plot drawn from the auto QC-filtered rows:
# (x column, y column, columns shown on hover, plot title).
# Driving the plots from this table replaces eight copy-pasted blocks that
# differed only in these four values.
scatter_specs = [
    ('tissue_location', 'nr_+_dna_damage_glia_nuclei', ['staining_id', 'index'],
     'Nr of DNA_damage+ glia+ nuclei by Tissue Location'),
    ('tissue_location', 'avg_dna_damage_foci/glia_+', ['staining_id'],
     'DNA Damage in Glia Nuclei by Tissue Location'),
    ('tissue_location', 'avg_dna_damage_foci/all_nuclei', ['staining_id'],
     'DNA Damage in All Nuclei by Tissue Location'),
    ('tissue_location', 'nr_glia_+_nuclei', ['staining_id', 'index'],
     'Nr of glia+ nuclei by Tissue Location'),
    ('tissue_location', 'nr_total_nuclei', ['staining_id', 'index'],
     'Nr of total nuclei by Tissue Location'),
    ('staining_id', 'nr_glia_+_nuclei', ['tissue_location', 'index'],
     'Nr of glia+ nuclei by Sample'),
    ('staining_id', '%_dna_damage_signal', ['tissue_location', 'index'],
     'Dna damage mask area (QC)'),
    ('staining_id', '%_glia+_signal', ['tissue_location', 'index'],
     'Glia mask area (QC)'),
]

# Draw the plots in the order listed above, coloring points by genotype.
# `fig` is left bound to the last figure, as in the unrolled version.
for x_col, y_col, hover_cols, plot_title in scatter_specs:
    fig = px.scatter(auto_filtered_df, x=x_col, y=y_col,
                     hover_data=hover_cols, color='genotype', title=plot_title)
    fig.show()

In [113]:
# Rows that did not pass the automated staining QC
qc_failed_mask = merged_df['staining_qc_passed'].eq(False)
qc_failed_df = merged_df.loc[qc_failed_mask]

# NOTE(review): this is the number of rows in qc_failed_df (one per image),
# not the number of distinct staining_id values — confirm the wording.
n_qc_failed = len(qc_failed_df)
print(f"{n_qc_failed} stains have not passed QC and will need reanalysis")

#TODO: Iterate over the indexes stored in qc_failed_list and display the images to manually check why qc was not passed

# Values of the 'index' column for the failed rows, as a plain Python list
qc_failed_list = qc_failed_df['index'].to_list()

qc_failed_df


78 stains have not passed QC and will need reanalysis


Unnamed: 0,index,filename,avg_dna_damage_foci/glia_+,avg_dna_damage_foci/glia_+_damage_+,avg_dna_damage_foci/all_nuclei,nr_+_dna_damage_glia_nuclei,nr_-_dna_damage_glia_nuclei,nr_glia_+_nuclei,nr_total_nuclei,%_dna_damage_signal,%_glia+_signal,tissue_location,staining_id,animal_id,sex,genotype,dna_damage_stain_quality_manual,glia_stain_quality_auto,dna_damage_stain_quality_auto,staining_qc_passed
0,0,DSB Iba1 16_40X_CA1,0.564286,1.338983,0.738636,59,81,140,176,1.09663,88.633728,CA1,16,887,male,APP/PS1,poor,suboptimal,optimal,False
1,1,DSB Iba1 16_40X_CA3,2.718954,3.014493,3.6875,138,15,153,160,18.227673,96.349525,CA3,16,887,male,APP/PS1,poor,suboptimal,suboptimal,False
2,2,DSB Iba1 16_40X_CTX1,0.72973,1.588235,0.699115,17,20,37,226,0.844288,13.598442,CTX1,16,887,male,APP/PS1,poor,suboptimal,optimal,False
3,3,DSB Iba1 16_40X_CTX2,1.69869,1.776256,2.296296,219,10,229,243,39.405441,94.930744,CTX2,16,887,male,APP/PS1,poor,suboptimal,suboptimal,False
4,4,DSB Iba1 16_40X_CTX3,1.732283,1.864407,1.822967,118,9,127,209,9.070396,54.768944,CTX3,16,887,male,APP/PS1,poor,suboptimal,optimal,False
5,5,DSB Iba1 16_40X_DG,0.679715,1.458015,0.782313,131,150,281,294,1.130867,87.153625,DG,16,887,male,APP/PS1,poor,suboptimal,optimal,False
6,6,DSB Iba1 17_40X_CA1,1.3,2.166667,2.538961,6,4,10,154,2.456284,5.763817,CA1,17,892,female,APP/PS1,poor,optimal,optimal,False
7,7,DSB Iba1 17_40X_CA3,1.238095,1.733333,2.945783,15,6,21,166,5.110073,6.432056,CA3,17,892,female,APP/PS1,poor,optimal,optimal,False
8,8,DSB Iba1 17_40X_CTX1,2.058824,2.5,1.265487,14,3,17,226,1.327896,2.645302,CTX1,17,892,female,APP/PS1,poor,optimal,optimal,False
9,9,DSB Iba1 17_40X_CTX2,2.509434,2.66,2.543796,50,3,53,274,29.742718,11.802101,CTX2,17,892,female,APP/PS1,poor,suboptimal,suboptimal,False
