In [40]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.display import display

from pathlib import Path
import pandas as pd
import numpy as np
import dask.dataframe as dd
import seaborn as sns
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
from pycytominer import aggregate, normalize, feature_select
from pycytominer.cyto_utils import load_profiles

In [42]:
def aggregate_cellprofiler_outputs(data_dir):
    """
    Aggregate CellProfiler outputs from multiple compartments into a single combined file.
    Optimized to reduce DataFrame fragmentation.
    
    Parameters:
    -----------
    data_dir : str
        Path to directory containing CellProfiler output CSV files
        
    Returns:
    --------
    pandas.DataFrame
        Combined DataFrame with all compartment measurements
    """
    compartment_files = {
        "cells": "Cells.csv",
        "cytoplasm": "Cytoplasm.csv",
        "golgi": "GolgiSpots.csv",
        "mito": "MitoObjects.csv",
        "mito_child": "MitoChildObjects.csv",
        "nuclei": "Nuclei.csv",
        "nucleoli": "Nucleoli.csv",
        "nucleoli_child": "NucleoliChildObjects.csv"
    }
    
    def custom_aggregation(df, strata, features, operations):
        """
        Perform multiple aggregation operations efficiently
        
        Parameters:
        -----------
        df : pandas.DataFrame
            Input DataFrame
        strata : list
            Columns to group by
        features : list
            Columns to aggregate
        operations : dict
            Dictionary mapping operation names to functions
            
        Returns:
        --------
        pandas.DataFrame
            Aggregated results
        """
        # Create a copy to avoid fragmentation
        df = df.copy()
        
        # Perform all aggregations at once
        grouped = df.groupby(strata)[features]
        results = []
        
        for op_name, op_func in operations.items():
            result = grouped.agg(op_func).reset_index()
            
            # Rename non-strata columns
            if op_name != 'mean':
                for col in result.columns:
                    if col not in strata:
                        result = result.rename(columns={col: f"{col}_{op_name}"})
            
            results.append(result)
        
        # Merge all results using reduce
        final = reduce(lambda left, right: pd.merge(left, right, on=strata, how='outer'), 
                      results)
        
        return final
    
    def mad(x):
        """Calculate Median Absolute Deviation"""
        median = np.median(x)
        return np.median(np.abs(x - median))
    
    # Define operations
    operations = {
        'mean': 'mean',
        'std': 'std',
        'median': 'median',
        'mad': mad
    }
    
    # Define strata columns
    strata_columns = ["Metadata_Plate", "Metadata_Well"]
    
    # Initialize empty list to store all compartment results
    all_results = []
    
    # Process each compartment
    for compartment, file_path in compartment_files.items():
        try:
            # Load data with copy to avoid fragmentation
            df = pd.read_csv(f"{data_dir}/{file_path}").copy()
            # print(f"Loaded {compartment} data with {len(df)} rows")
            
            # Get feature columns
            feature_cols = [col for col in df.columns 
                          if not col.startswith(("Image_", "Metadata_", "ObjectNumber"))]
            
            # Calculate all aggregations at once
            result = custom_aggregation(df, strata_columns, feature_cols, operations)
            
            # Add object counts
            counts = df.groupby(strata_columns).size().reset_index(name=f'{compartment}_count')
            result = result.merge(counts, on=strata_columns, how='left')
            
            # Add compartment prefix to all non-strata columns
            prefix_cols = {col: f"{compartment}_{col}" 
                         for col in result.columns 
                         if col not in strata_columns}
            result = result.rename(columns=prefix_cols)
            
            all_results.append(result)
            # print(f"Processed {compartment} data")
            
        except FileNotFoundError:
            print(f"Warning: Could not find {file_path}")
            continue
    
    # Merge all compartment results
    final_result = reduce(lambda left, right: pd.merge(left, right, on=strata_columns, how='outer'), 
                         all_results)
    
    # Save combined results
    output_file = f"{data_dir}/combined_measurements.csv"
    final_result.to_csv(output_file, index=False)
    # print(f"\nSaved combined measurements to {output_file}")
    
    return final_result

# Example usage
result = aggregate_cellprofiler_outputs('/mnt/data/cellprofiler-output/inhibitors/compound-001_Plate13_compound-001_13_AA24_conc1.0')

In [52]:
inhibitor_results = []
for plate in Path('/mnt/data/cellprofiler-output/inhibitors').glob('*'):
    print(plate)
    result = aggregate_cellprofiler_outputs(plate)
    inhibitor_results.append(result)
inhibitor_results_df = pd.concat(inhibitor_results, ignore_index=True)
inhibitor_results_df.to_csv('inhibitor_results.csv', index=False)

In [55]:
ko_results = []
for plate in Path('/mnt/data/cellprofiler-output/ko').glob('*'):
    print(plate)
    try:
        result = aggregate_cellprofiler_outputs(plate)
        ko_results.append(result)
    except Exception as e:
        print(e)
        continue
ko_results_df = pd.concat(ko_results, ignore_index=True)
ko_results_df.to_csv('ko_results.csv', index=False)

/mnt/data/cellprofiler-output/ko/gene-157_Plate5_gene-157_5_F04
/mnt/data/cellprofiler-output/ko/gene-081_Plate6_gene-081_6_AC34
/mnt/data/cellprofiler-output/ko/gene-157_Plate1_gene-157_1_W18
/mnt/data/cellprofiler-output/ko/gene-157_Plate6_gene-157_6_W09
/mnt/data/cellprofiler-output/ko/gene-157_Plate1_gene-157_1_S32
/mnt/data/cellprofiler-output/ko/gene-157_Plate5_gene-157_5_F41
/mnt/data/cellprofiler-output/ko/gene-157_Plate2_gene-157_2_AD07
/mnt/data/cellprofiler-output/ko/gene-157_Plate4_gene-157_4_X02
/mnt/data/cellprofiler-output/ko/gene-081_Plate2_gene-081_2_AD02
/mnt/data/cellprofiler-output/ko/gene-081_Plate5_gene-081_5_D46
/mnt/data/cellprofiler-output/ko/gene-081_Plate7_gene-081_7_Q32
/mnt/data/cellprofiler-output/ko/gene-081_Plate9_gene-081_9_J12
/mnt/data/cellprofiler-output/ko/gene-157_Plate7_gene-157_7_AD24
/mnt/data/cellprofiler-output/ko/gene-157_Plate5_gene-157_5_E22
/mnt/data/cellprofiler-output/ko/gene-081_Plate4_gene-081_4_AC39
/mnt/data/cellprofiler-output/ko/ge

```
$ csvtk stats *.csv
file                   num_cols  num_rows
inhibitor_results.csv     6,538       892
ko_results.csv            6,538       256
```