<h2>Data Analysis - Batch Processing - Quantification of cell populations</h2>

The following notebook is able to process the .csv files resulting from Batch Processing (Average Intensity or Colocalization) and:

1. Define cell populations based on single or multiple markers (positive, negative or a combination of both)
2. Plot resulting data using Plotly.
3. Extract numbers of cells positive for a marker based on colocalization (using a user-defined threshold).
4. Save the classified results of all processed files in a single .csv file inside the results folder.

In [41]:
from pathlib import Path
import pandas as pd
import os 
# from utils_data_analysis

In [42]:
# ---------------------------------------------------------------------------
# User configuration
# ---------------------------------------------------------------------------

# Folder holding the Batch Processing results to analyze
results_path = Path("./results/test_data/3D/MEC0.1")

# How cells were scored as positive for a marker: "avg_int" or "coloc"
# TODO: add support for "pixel_class"
method = "avg_int"

# Channels to analyze, one tuple per channel:
#   (channel_name, channel_nr, cellular_location)
markers = [
    ("ki67", 0, "nucleus"),
    ("neun", 1, "nucleus"),
    ("calbindin", 2, "cytoplasm"),
]

# (min, max) average-intensity window that selects each population of interest.
# The same marker may appear more than once to split it into several populations
# (e.g. "neun" -> "neun_low" and "neun_high").
# The upper bound is 255 because the test images are 8-bit; images with a higher
# bit depth can yield larger avg_int values, so adjust the bound accordingly.
min_max_per_marker = [
    {
        "marker": "ki67",
        "min_max": (110, 255),
        "population": "ki67",
    },
    {
        "marker": "neun",
        "min_max": (20, 80),
        "population": "neun_low",
    },
    {
        "marker": "neun",
        "min_max": (80, 255),
        "population": "neun_high",
    },
    {
        "marker": "calbindin",
        "min_max": (10, 255),
        "population": "calbindin",
    },
]

# Cell populations built by combining the populations declared above.
# Each subpopulation is a (population, status) pair: status True means the cell
# must be positive for that population, False means it must be negative.
# Always refer to the "population" names of min_max_per_marker (not the marker
# names), since one marker can own several populations, as "neun" does.
# When a cell_pop relies on a single population, append a "+" so its name does
# not clash with the population column of the same name.
cell_populations = [
    {
        "cell_pop": "neun_high+",
        "subpopulations": [("neun_high", True)],
    },
    {
        "cell_pop": "neun_low+",
        "subpopulations": [("neun_low", True)],
    },
    {
        "cell_pop": "non_prolif",
        "subpopulations": [("ki67", False)],
    },
    {
        "cell_pop": "prolif_neun_high",
        "subpopulations": [("neun_high", True), ("ki67", True)],
    },
    {
        "cell_pop": "prolif_neun_low",
        "subpopulations": [("neun_low", True), ("ki67", True)],
    },
    {
        "cell_pop": "non_prolif_neun_high",
        "subpopulations": [("neun_high", True), ("ki67", False)],
    },
    {
        "cell_pop": "non_prolif_neun_low",
        "subpopulations": [("neun_low", True), ("ki67", False)],
    },
    {
        "cell_pop": "neun_high_+_calbindin_+",
        "subpopulations": [("neun_high", True), ("calbindin", True)],
    },
    {
        "cell_pop": "neun_low_+_calbindin_+",
        "subpopulations": [("neun_low", True), ("calbindin", True)],
    },
]

In [43]:
def classify_cells(df, method, min_max_per_marker, cell_populations):
    """Add boolean population columns to ``df`` and return it.

    Two groups of columns are written to ``df`` (the frame is modified in place):

    1. One column per entry of ``min_max_per_marker``, named after the entry's
       "population", True where the marker's measurement lies strictly between
       the entry's ``(min, max)`` bounds (both bounds are exclusive).
    2. One column per entry of ``cell_populations``, named after the entry's
       "cell_pop", True where every listed subpopulation has the requested
       status (True = positive, False = negative).

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one measurement column per marker. The measurement column
        of a marker is the column whose name contains both ``method`` and the
        marker name; if several columns qualify, the last one is used.
    method : str
        Substring identifying the measurement columns (i.e. "avg_int").
    min_max_per_marker : list of dict
        Each dict holds the keys "marker", "min_max" (a ``(min, max)`` pair)
        and "population".
    cell_populations : list of dict
        Each dict holds the keys "cell_pop" and "subpopulations", the latter a
        list of ``(population, status)`` pairs referring to columns that exist
        in ``df`` by the time the entry is processed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.

    Raises
    ------
    ValueError
        If no column of ``df`` contains both ``method`` and a requested marker.
    KeyError
        Raised by pandas if a subpopulation names a column missing from ``df``.
    """
    # Candidate measurement columns: every column whose name contains `method`.
    # str() keeps the substring test working for non-string column labels.
    method_columns = [col for col in df.columns if method in str(col)]

    for marker_analysis in min_max_per_marker:

        marker = marker_analysis["marker"]
        min_value = marker_analysis["min_max"][0]
        max_value = marker_analysis["min_max"][1]
        population = marker_analysis["population"]

        # Look the column up afresh for every marker. Without this check a marker
        # lacking a column would silently be classified with the column found for
        # the previous marker (or fail with an obscure unbound-variable error).
        matching_columns = [col for col in method_columns if marker in str(col)]
        if not matching_columns:
            raise ValueError(
                f"No '{method}' column found for marker '{marker}' "
                f"(available '{method}' columns: {method_columns})"
            )
        # When several columns match, the last one wins
        column_name = matching_columns[-1]

        # Positive (True) when the value lies strictly inside the (min, max) window
        df[population] = (df[column_name] > min_value) & (df[column_name] < max_value)

    # Define populations based on subpopulations
    for cell_population in cell_populations:
        cell_pop_name = cell_population["cell_pop"]
        subpopulations = cell_population["subpopulations"]

        # Start with every row included, then narrow down one subpopulation at a time
        df[cell_pop_name] = True

        for subpop, status in subpopulations:
            if status:
                # Row must be positive for this subpopulation
                df[cell_pop_name] &= df[subpop]
            else:
                # Row must be negative for this subpopulation
                df[cell_pop_name] &= ~df[subpop]

    return df

In [44]:
# Extract model_name and segmentation type from the results path
# (its last and second-to-last folder names, respectively)

model_name = results_path.parts[-1]
segmentation_type = results_path.parts[-2]

# Extract a list of filenames from the results path
per_label_csvs = []

# Sort the glob results so the files are processed in a reproducible order
for file_path in sorted(results_path.glob("*.csv")):
    # Filter on the file name only: testing the full path would wrongly include or
    # exclude every file whenever a parent folder contains the method, "BP" or "SP"
    file_name = file_path.name
    if method in file_name and "BP" not in file_name and "SP" not in file_name:
        per_label_csvs.append(file_path)

per_label_csvs

[WindowsPath('results/test_data/3D/MEC0.1/HI 1  Contralateral Mouse 8  slide 6 Neun Red Calb Green KI67 Magenta 40x technical replica 1_per_label_avg_int.csv'),
 WindowsPath('results/test_data/3D/MEC0.1/HI 1  Ipsilateral Mouse 8  slide 6 Neun Red Calb Green KI67 Magenta 40x technical replica 1_per_label_avg_int.csv')]

In [45]:
# Define the .csv path (named after the analysis method in use)
csv_path = results_path / f"BP_populations_marker_+_label_{method}.csv"

# Classified table of every per_label .csv, gathered before writing
classified_dfs = []

for csv in per_label_csvs:

    # Read the original per_label .csv
    df = pd.read_csv(csv, index_col=0)

    # Classify cells based on subpopulations
    df = classify_cells(df, method, min_max_per_marker, cell_populations)

    classified_dfs.append(df)

if not classified_dfs:
    raise FileNotFoundError(f"No per-label '{method}' .csv files found in {results_path}")

# Write everything in one go, replacing any previous output. Appending to an existing
# file would duplicate all rows each time this cell is re-run, and pd.concat keeps the
# columns aligned even if the input files do not share exactly the same columns.
pd.concat(classified_dfs, ignore_index=True).to_csv(csv_path, index=False)


In [46]:
# Extract ROIs
# NOTE(review): `df` is not created in this cell; it is the variable left behind by the
# per-file loop of the previous cell, so presumably only the last processed file
# contributes ROIs here - confirm this is intended.
rois = list(df["ROI"].unique())