In [25]:
from pathlib import Path
import os
from tqdm import tqdm
import czifile
import tifffile
import pyclesperanto_prototype as cle
import numpy as np
import pandas as pd
from utils import check_filenames, segment_nuclei_2d, simulate_cytoplasm
from skimage.measure import regionprops_table
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px

# Choose the GPU device that pyclesperanto (cle) will run on. "RTX" is the
# device name requested; the output recorded below shows it resolved to an
# NVIDIA GeForce RTX 4090 on this machine.
# NOTE(review): behaviour on machines without a matching device is not visible
# here — confirm before running elsewhere.
cle.select_device("RTX")

<NVIDIA GeForce RTX 4090 on Platform: NVIDIA CUDA (2 refs)>

In [27]:
# Folder holding the raw .czi images (ideally inside raw_data) and the folder
# holding the .tif regions of interest drawn for them
directory_path = Path("./raw_data/Reelin")
roi_directory_path = Path("./raw_data/Reelin/ROI")

# Subfolders of directory_path that contain the images to analyse
subdirectories = ["Contra", "Ipsi", "Sham"]

# Per-image measurement tables get collected in this list later on
stats = []

# Files whose path contains any of these tags are left out of the dataset
excluded_tags = ("AWT", "BWT")

# Every .czi file found in the listed subfolders, minus the excluded ones
images = [
    str(czi_file)
    for group in subdirectories
    for czi_file in (directory_path / group).glob("*.czi")
    if not any(tag in str(czi_file) for tag in excluded_tags)
]

# Every .tif file found in the ROI folder, minus the excluded ones
rois = [
    str(tif_file)
    for tif_file in roi_directory_path.glob("*.tif")
    if not any(tag in str(tif_file) for tag in excluded_tags)
]

# Report any image without a ROI (or ROI without an image)
check_filenames(images, rois)

# Order both lists by filename (extension stripped) so that entries at the
# same position refer to the same acquisition
images_sorted = sorted(images, key=lambda path: Path(path).stem)
rois_sorted = sorted(rois, key=lambda path: Path(path).stem)

No files missing in images list.
No files missing in rois list.


In [28]:
def _parse_descriptors(filename):
    """Split a space-separated filename into the experimental descriptors.

    The fields are read by position: 0 -> condition, 1 -> condition_nr,
    2 -> brain_location, 4 -> mouse_id, last character of 5 -> slide_nr,
    last field -> tech_replica.
    NOTE(review): only the last character of field 5 is used, so a two-digit
    slide number would be truncated — confirm against the naming scheme.

    Returns:
        dict with keys filename, condition, condition_nr, brain_location,
        mouse_id, slide_nr and tech_replica. The four numeric fields are
        ints when all of them parse; if any one fails they are all kept as
        strings so that a misnamed file is still recorded.

    Raises:
        IndexError: if the filename has too few fields.
    """
    descriptors = filename.split(" ")

    try:
        condition = descriptors[0]
        brain_location = descriptors[2]
        numeric_fields = {
            "condition_nr": descriptors[1],
            "mouse_id": descriptors[4],
            "slide_nr": descriptors[5][-1],
            "tech_replica": descriptors[-1],
        }
    except IndexError as err:
        # Same exception type as before, but now it names the offending file
        raise IndexError(
            f"Filename '{filename}' has too few space-separated fields "
            "to extract the experimental descriptors."
        ) from err

    try:
        numeric_fields = {key: int(value) for key, value in numeric_fields.items()}
    except ValueError:
        # Erroneous file naming: keep every field as the raw string
        pass

    return {
        "filename": filename,
        "condition": condition,
        "condition_nr": numeric_fields["condition_nr"],
        "brain_location": brain_location,
        "mouse_id": numeric_fields["mouse_id"],
        "slide_nr": numeric_fields["slide_nr"],
        "tech_replica": numeric_fields["tech_replica"],
    }


def _channel_props(label_image, intensity_image, marker, area_column=None):
    """Measure one marker channel per label and return it as a dataframe.

    Columns are "label" and "<marker>_intensity_mean", plus the label area
    under the name `area_column` when one is given.
    """
    properties = ["label", "intensity_mean"]
    renames = {"intensity_mean": f"{marker}_intensity_mean"}
    if area_column is not None:
        properties.append("area")
        renames["area"] = area_column

    table = regionprops_table(label_image=label_image,
                              intensity_image=intensity_image,
                              properties=properties)
    return pd.DataFrame(table).rename(columns=renames)


# zip() stops at the shorter list, which would silently drop files and pair
# the remaining images with the wrong ROIs, so fail loudly instead
if len(images_sorted) != len(rois_sorted):
    raise ValueError(
        f"Found {len(images_sorted)} images but {len(rois_sorted)} ROIs; "
        "every image needs exactly one ROI."
    )

# Start from an empty list so that re-running this cell does not append the
# same images to the results a second time
stats = []

# total= lets tqdm display percentage and ETA (zip() has no length)
for image_path, roi_path in tqdm(zip(images_sorted, rois_sorted), total=len(images_sorted)):

    # Extract the filename and collapse any run of whitespace into one space,
    # so that repeated spaces do not shift the positional fields
    filename = " ".join(Path(image_path).stem.split())

    # Extract experimental conditions from the filename
    descriptor_dict = _parse_descriptors(filename)

    # Read image and ROI files into Numpy arrays
    img = czifile.imread(image_path)
    roi = tifffile.imread(roi_path)

    # Remove singleton dimensions and perform MIP on input image
    # NOTE(review): assumes the squeezed image is (channel, z, y, x) — axis 0 is
    # indexed as channels below; confirm axis 1 is the z-stack
    img = img.squeeze()
    img_mip = np.max(img, axis=1)

    # Perform MIP for the region of interest
    roi_mip = np.max(roi, axis=0)

    # Keep only the pixels inside the ROI (ROI value >= 1); everything else is
    # set to 0 in every channel
    mask = roi_mip >= 1
    masked_img = np.where(mask, img_mip, 0)

    # Extract each of the channels separately
    neun_mip = masked_img[0, :, :]
    reelin_mip = masked_img[1, :, :]
    gad67_mip = masked_img[2, :, :]
    nuclei_mip = masked_img[3, :, :]

    # Segment nuclei inside the ROI
    nuclei_labels = segment_nuclei_2d(nuclei_mip)

    # Simulate a cytoplasm by dilating the nuclei and subtracting the nuclei mask afterwards
    cytoplasm = simulate_cytoplasm(nuclei_labels, dilation_radius = 2, erosion_radius = 0)

    # NeuN is measured inside the nuclei, Reelin and GAD67 in the simulated cytoplasm
    neun_props_df = _channel_props(nuclei_labels, neun_mip, "neun", area_column="nuclei_area")
    reelin_props_df = _channel_props(cytoplasm, reelin_mip, "reelin", area_column="cyto_area")
    gad67_props_df = _channel_props(cytoplasm, gad67_mip, "gad67")

    # Merge all dataframes on the "label" column. merge() defaults to an inner
    # join, so a label absent from any of the tables is dropped from the result
    props_df = (
        neun_props_df
        .merge(reelin_props_df, on="label")
        .merge(gad67_props_df, on="label")
    )

    # Add each image descriptor as a constant column
    props_df = props_df.assign(**descriptor_dict)

    stats.append(props_df)

    

In [30]:
# Concatenate list of dataframes into a single final df. Each per-image table
# carries its own 0..n index, so rebuild it to give every row a unique index
final_df = pd.concat(stats, ignore_index=True)

# Perform k-means clustering with 3 clusters on the mean NeuN intensity
kmeans = KMeans(n_clusters=3, random_state=0)
raw_labels = kmeans.fit_predict(final_df[['neun_intensity_mean']])

# KMeans numbers its clusters arbitrarily. Renumber them by ascending centroid
# intensity so the ids always mean 0 = no_signal, 1 = low_neun, 2 = high_neun.
# Note: kmeans.labels_ and kmeans.cluster_centers_ keep the raw numbering.
centroid_order = np.argsort(kmeans.cluster_centers_.ravel())
ordered_label = np.empty_like(centroid_order)
ordered_label[centroid_order] = np.arange(centroid_order.size)
final_df['neun+_cluster'] = ordered_label[raw_labels].astype(raw_labels.dtype)

In [None]:
# Create a 'results' folder in the root directory
results_folder = 'results'
results_dir = Path(results_folder)

if results_dir.is_dir():
    print(f"'{results_folder}' folder already exists.")
else:
    # Raises FileExistsError if a regular file is in the way, rather than
    # reporting that the folder exists and failing later on save
    results_dir.mkdir(parents=True)
    print(f"'{results_folder}' folder created successfully.")

# Save the df containing per_label results into a CSV file. The path is built
# from results_folder so that changing the folder name above is enough
final_df.to_csv(results_dir / 'Reelin_per_label_results.csv')