In [1]:
import polars as pl
import os

In [2]:
# Marker whose phenotype results and coordinate tables the rest of the notebook uses.
marker = "Sac6"
phenotype_dir = f"/home/alex/alex_files/markerproject_redux/phenotypes/{marker}"
# Image-path tables for overlay and raw (FP) images; later cells use their
# "Cell_ID" and "Image_Path" columns.
overlay_coords = pl.read_csv(f"/home/alex/alex_files/markerproject_redux/coordinates/{marker}/all_overlay_paths.csv")
fp_coords = pl.read_csv(f"/home/alex/alex_files/markerproject_redux/coordinates/{marker}/all_raw_paths.csv")
output_dir = f"/mnt/c/Users/peree/OneDrive/Desktop/CompBio_Code/markerproject_redux/phenotypes/{marker}/sct_inputs"

# exist_ok=True creates the directory (and any missing parents) only when needed,
# avoiding a separate exists() check that can race with another process.
os.makedirs(output_dir, exist_ok=True)

In [12]:
dad2_table = pl.read_csv("/mnt/c/Users/peree/OneDrive/Desktop/dad2_cells.csv")
# Optional strain restriction, currently disabled:
# dad2_table = dad2_table.filter(pl.col("Strain_ID") == "tsa269-37C")
cells = dad2_table["Cell_ID"].to_list()

# NOTE: both coordinate tables are overwritten in place, so every cell run after
# this one only sees the Cell_IDs listed in `cells`.
listed_only = pl.col("Cell_ID").is_in(cells)
overlay_coords = overlay_coords.filter(listed_only)
fp_coords = fp_coords.filter(listed_only)

In [13]:
# Stack overlay and FP rows, ordered so that each cell's image paths are adjacent.
stacked_rows = pl.concat(items=[overlay_coords, fp_coords], how="vertical").sort(["Cell_ID", "Image_Path"])

# Collapse to one row per cell (the other columns become lists) so that the
# shuffle reorders whole cells rather than individual image rows.
rows_per_cell = stacked_rows.group_by("Cell_ID").agg(pl.all())
shuffled_cells = rows_per_cell.sample(fraction=1, with_replacement=False, shuffle=True)

# Expand the lists back into one row per image and save.
shuffled_cells.explode(pl.all().exclude("Cell_ID")).write_csv(f"{output_dir}/test_cells.csv")

In [None]:
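# Scratch note (meaning not recorded; kept as a comment because bare numbers are a SyntaxError on Run All): 844 571 1373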

In [3]:
def sct_input_maker_phenotype(
    outlier_cells_path: str,
    output_file: str,
    overlay_coords: "pl.DataFrame",
    fp_coords: "pl.DataFrame",
    cc_stage: str = "",
    strain: str = "",
    seed: "int | None" = None,
) -> None:
    """
    Creates an input for singlecelltool to view outlier cells for a given phenotype.

    The outlier table is optionally narrowed by cell cycle stage and/or strain.
    The matching rows of both coordinate tables are stacked and sorted by
    Cell_ID and Image_Path. The cells are then shuffled as whole units, so each
    cell's rows stay together, and the result is written to ``output_file``.

    Args:
        outlier_cells_path (str): path to csv with outlier Cell_IDs for a given phenotype;
            must contain the columns Cell_ID, Predicted_Label and Strain_ID
        output_file (str): where and what to save file as
        overlay_coords (pl.DataFrame): dataframe with all cells' x/y coordinates and image paths for overlay images
        fp_coords (pl.DataFrame): dataframe with all cells' x/y coordinates and image paths for FP (GFP/RFP) images
        cc_stage (str): if specified, filters outlier cells to only include those whose Predicted_Label
            equals this cell cycle stage; defaults to no filtering
        strain (str): if specified, filters outlier cells to only include those whose Strain_ID
            equals this value; defaults to no filtering
        seed (int | None): seed for the shuffle of cells; pass an integer to get the same
            cell order on every run, or leave as None for a different random order each time

    Returns:
        None. The result is written to ``output_file`` as a csv.
    """
    subset = pl.read_csv(outlier_cells_path).select(["Cell_ID", "Predicted_Label", "Strain_ID"])

    if cc_stage:
        subset = subset.filter(pl.col("Predicted_Label") == cc_stage)
    if strain:
        subset = subset.filter(pl.col("Strain_ID") == strain)

    # Build the membership test once and reuse it for both coordinate tables.
    in_subset = pl.col("Cell_ID").is_in(subset["Cell_ID"].to_list())

    (
        pl
        .concat(items=[overlay_coords.filter(in_subset), fp_coords.filter(in_subset)], how="vertical")
        .sort(["Cell_ID", "Image_Path"])
        # maintain_order keeps the groups in sorted Cell_ID order, so the seeded
        # shuffle below starts from a deterministic arrangement.
        .group_by("Cell_ID", maintain_order=True)
        .agg(pl.all())
        # Shuffle whole cells (one row per cell at this point), not single images.
        .sample(fraction=1, with_replacement=False, shuffle=True, seed=seed)
        .explode(pl.all().exclude("Cell_ID"))
        .write_csv(output_file)
    )

### Whole-Cell Phenotypes

In [None]:
# One entry per whole-cell phenotype: (outlier table, output file, strain to keep).
whole_cell_requests = [
    # large
    (
        f"{phenotype_dir}/Cells/abnormal_cell_size/abnormally_large_cells/outlier_cells/all_strain_outlier_cells.csv",
        f"{output_dir}/large_tsa150-37C_outliers.csv",
        "tsa150-37C",
    ),
    # small
    (
        f"{phenotype_dir}/Cells/abnormal_cell_size/abnormally_small_cells/outlier_cells/all_strain_outlier_cells.csv",
        f"{output_dir}/small_dma4490_outliers.csv",
        "dma4490",
    ),
    # apolar
    (
        f"{phenotype_dir}/Cells/abnormal_cell_eccentricity/abnormally_round/outlier_cells/all_strain_outlier_cells.csv",
        f"{output_dir}/apolar_tsa939-37C_outliers.csv",
        "tsa939-37C",
    ),
    # elongated
    (
        f"{phenotype_dir}/Cells/abnormal_cell_eccentricity/abnormally_elongated/outlier_cells/all_strain_outlier_cells.csv",
        f"{output_dir}/elongated_dma5103_outliers.csv",
        "dma5103",
    ),
]

# Same call for every phenotype; no cell cycle stage filtering is applied.
for outlier_table, destination, strain_id in whole_cell_requests:
    sct_input_maker_phenotype(
        outlier_cells_path=outlier_table,
        output_file=destination,
        overlay_coords=overlay_coords,
        fp_coords=fp_coords,
        cc_stage="",
        strain=strain_id,
    )

### Compartment Phenotypes

In [23]:
# too few
low_outer_table = f"{phenotype_dir}/APs/abnormal_radial_distribution/abnormal_outer_distribution/low_outer_distribution/outlier_cells/DMA_Plate05_APs_outlier_cells.csv"
low_outer_output = f"{output_dir}/low_outer_distribution_outliers.csv"

# Empty cc_stage and strain: every outlier cell in the table is kept.
sct_input_maker_phenotype(
    low_outer_table,
    low_outer_output,
    overlay_coords,
    fp_coords,
    cc_stage="",
    strain="",
)

### Get inliers

In [None]:
# Cell_IDs to exclude: outliers in the Cells table, outliers in the SPBs table,
# and cells listed in the quality-check "filtered cells" table.
cells_outlier_csv = f"{phenotype_dir}/Cells/aggregated_cell_outlier_data/all_aggregated_cell_outlier_data.csv"
spb_outlier_csv = f"{phenotype_dir}/SPBs/aggregated_cell_outlier_data/all_aggregated_cell_outlier_data.csv"
filtered_cells_csv = f"/home/alex/alex_files/markerproject_redux/quality_check/{marker}/cell_and_nuclei/filtered_cells/all_filtered_cells.csv"

outlier_cells = pl.read_csv(cells_outlier_csv)["Cell_ID"].to_list()
outlier_spb = pl.read_csv(spb_outlier_csv)["Cell_ID"].to_list()
filtered_cells = pl.read_csv(filtered_cells_csv)["Cell_ID"].to_list()

cell_ids = {*outlier_cells, *outlier_spb, *filtered_cells}

# A cell counts as an inlier when its Cell_ID is in none of the three lists.
not_excluded = ~pl.col("Cell_ID").is_in(cell_ids)
overlay_coords_inliers = overlay_coords.filter(not_excluded)
fp_coords_inliers = fp_coords.filter(not_excluded)

# Stack and sort, collapse to one row per cell, shuffle whole cells, expand, save.
inlier_rows = pl.concat(items=[overlay_coords_inliers, fp_coords_inliers], how="vertical").sort(["Cell_ID", "Image_Path"])
inliers_per_cell = inlier_rows.group_by("Cell_ID").agg(pl.all())
inliers_shuffled = inliers_per_cell.sample(fraction=1, with_replacement=False, shuffle=True)
inliers_shuffled.explode(pl.all().exclude("Cell_ID")).write_csv(f"{output_dir}/inlier_examples.csv")

### Cells with multiple defects

In [None]:
# phenotype_dir already ends in /{marker}, so the marker is not repeated in these
# paths. The inlier cell above reads the same two files without the extra
# segment; repeating it would point at .../{marker}/{marker}/...
outlier_cells = (
    pl.read_csv(f"{phenotype_dir}/Cells/aggregated_cell_outlier_data/all_aggregated_cell_outlier_data.csv")
    .select(["Cell_ID", "Cell_Phenotype", "Num_Cell_Phenotypes", "Predicted_Label"])
)
# The SPB columns are renamed so they do not collide with the cell-level columns in the join.
outlier_spb = (
    pl.read_csv(f"{phenotype_dir}/SPBs/aggregated_cell_outlier_data/all_aggregated_cell_outlier_data.csv")
    .select(["Cell_ID", "Cell_Phenotype", "Num_Cell_Phenotypes"])
    .rename({"Cell_Phenotype": "SPB_Phenotype", "Num_Cell_Phenotypes": "Num_SPB_Phenotypes"})
)
# Left join: every cell-level outlier row is kept; SPB columns are null where there is no match.
combined_outliers = outlier_cells.join(outlier_spb, on="Cell_ID", how="left")


In [None]:
# Single cell exported as the multi-defect example.
example_cell_id = "2TS13701016008001472"

example_rows = (
    pl.concat(items=[overlay_coords, fp_coords], how="vertical")
    .filter(pl.col("Cell_ID") == example_cell_id)
    .sort(["Cell_ID", "Image_Path"])
)

# Same collapse / shuffle / expand steps as the other exports in this notebook.
example_per_cell = example_rows.group_by("Cell_ID").agg(pl.all())
example_shuffled = example_per_cell.sample(fraction=1, with_replacement=False, shuffle=True)
example_shuffled.explode(pl.all().exclude("Cell_ID")).write_csv(f"{output_dir}/multi_defect_example.csv")