## Count LINCS Cell Painting data

This notebook provides count statistics for the two batches of LINCS Cell Painting data in this repository.

* Count profiles
* Count perturbation treatments
* Count single cells (with an output cell count summary file)

In [1]:
import pathlib
import pandas as pd

In [2]:
# Names of the data batches summarized in this notebook
batches = [
    "2016_04_01_a549_48hr_batch1",
    "2017_12_05_Batch2",
]

# Relative directories that the cells below read from (and, for cell counts, write to)
consensus_dir = pathlib.Path("consensus")
batch_effect_dir = pathlib.Path("spherized_profiles")
cell_count_dir = pathlib.Path("profiles", "cell_count")
platemap_dir = pathlib.Path("metadata", "platemaps")

# Suffixes appended to the batch name to form each profile file name
consensus_suffix = "_consensus_modz_feature_select_dmso.csv.gz"
batch_effect_suffix = (
    "_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz"
)

## Profile and perturbation count

In [3]:
# For every batch, report the number of profiles, consensus profiles, and the
# distinct compounds / time points / doses / cell lines / perturbations.
for batch in batches:
    # Build the paths to this batch's spherized and consensus profile files
    batch_effect_file = batch_effect_dir / "profiles" / f"{batch}{batch_effect_suffix}"
    consensus_file = consensus_dir / batch / f"{batch}{consensus_suffix}"

    profiles_df = pd.read_csv(batch_effect_file, low_memory=False)
    print(f"There are a total of {len(profiles_df):,} image-based profiles assayed in {batch}")

    consensus_df = pd.read_csv(consensus_file)
    print(f"There are a total of {len(consensus_df):,} image-based consensus profiles assayed in {batch}")

    # dropna=False keeps a missing value counted as its own category,
    # which is what len(Series.unique()) reports
    num_compounds = consensus_df["Metadata_broad_sample"].nunique(dropna=False)
    print(f"There are {num_compounds:,} unique compounds assayed in {batch}")

    time_points = consensus_df["Metadata_time_point"].nunique(dropna=False)
    print(f"There are {time_points} unique time points assayed in {batch}")

    doses = consensus_df["Metadata_dose_recode"].nunique(dropna=False)
    print(f"There are {doses} unique doses assayed in {batch}")

    cell_lines = consensus_df["Metadata_cell_id"].unique()
    print(f"There are {len(cell_lines)} unique cell lines assayed in {batch} ({cell_lines})")

    # One row per (cell line, compound, time point, dose) combination, with the
    # number of non-null Metadata_pert_well entries stored as num_replicates
    perturbation_columns = [
        "Metadata_cell_id",
        "Metadata_broad_sample",
        "Metadata_time_point",
        "Metadata_dose_recode",
    ]
    unique_perturbations = (
        consensus_df
        .groupby(perturbation_columns)["Metadata_pert_well"]
        .count()
        .rename("num_replicates")
        .reset_index()
    )
    print(f"There is a total of {len(unique_perturbations):,} unique perturbations assayed in {batch}\n")

There are a total of 52,223 image-based profiles assayed in 2016_04_01_a549_48hr_batch1
There are a total of 10,752 image-based consensus profiles assayed in 2016_04_01_a549_48hr_batch1
There are 1,571 unique compounds assayed in 2016_04_01_a549_48hr_batch1
There are 1 unique time points assayed in 2016_04_01_a549_48hr_batch1
There are 7 unique doses assayed in 2016_04_01_a549_48hr_batch1
There are 1 unique cell lines assayed in 2016_04_01_a549_48hr_batch1 (['A549'])
There is a total of 9,395 unique perturbations assayed in 2016_04_01_a549_48hr_batch1

There are a total of 51,447 image-based profiles assayed in 2017_12_05_Batch2
There are a total of 10,368 image-based consensus profiles assayed in 2017_12_05_Batch2
There are 349 unique compounds assayed in 2017_12_05_Batch2
There are 3 unique time points assayed in 2017_12_05_Batch2
There are 6 unique doses assayed in 2017_12_05_Batch2
There are 3 unique cell lines assayed in 2017_12_05_Batch2 (['A549' 'MCF7' 'U2OS'])
There is a total 

## Cell count

In [4]:
# Count cells
# Maps each batch name to one DataFrame: the per-plate cell count tables, each
# merged with its platemap, concatenated across all plates of that batch.
all_cell_count_df = {}
for batch in batches:
    # Load barcode platemap (used to look up the platemap name for each plate)
    barcode_platemap_file = platemap_dir / batch / "barcode_platemap.csv"
    barcode_platemap_df = pd.read_csv(barcode_platemap_file)

    # Setup other path variables
    batch_count_dir = pathlib.Path(cell_count_dir, batch)
    # iterdir() yields entries in arbitrary order; sort so the row order of the
    # combined summary (and of the file written from it) is reproducible
    plate_dirs = sorted(
        x for x in batch_count_dir.iterdir() if ".DS_Store" not in x.name
    )

    plate_cell_counts = []
    for plate_dir in plate_dirs:
        plate_name = plate_dir.name

        # Find the platemap assigned to this plate; fail with a clear message
        # instead of a bare index error when the plate is not listed
        platemap_matches = barcode_platemap_df.query(
            "Assay_Plate_Barcode == @plate_name"
        ).Plate_Map_Name.values
        if len(platemap_matches) == 0:
            raise IndexError(
                f"Plate {plate_name} has no Assay_Plate_Barcode entry in {barcode_platemap_file}"
            )
        platemap_name = platemap_matches[0]

        platemap_file = platemap_dir / batch / "platemap" / f"{platemap_name}.txt"
        platemap_df = pd.read_csv(platemap_file, sep="\t")

        cell_count_file = plate_dir / f"{plate_name}_cell_count.csv"
        cell_count_df = (
            pd.read_csv(cell_count_file)
            .assign(batch=batch)
            .rename({
                "Image_Metadata_Well": "Metadata_Well",
                "Image_Metadata_Plate": "Metadata_Plate"
            }, axis="columns")
        )

        # Inner join: only wells present in both the cell count table and the
        # platemap are kept
        cell_count_df = (
            cell_count_df
            .merge(platemap_df, left_on="Metadata_Well", right_on="well_position")
        )

        plate_cell_counts.append(cell_count_df)

    # Combine batch-specific cell count summary
    all_cell_count_df[batch] = pd.concat(plate_cell_counts).reset_index(drop=True)

In [5]:
# Output metadata summary files
# For each batch: print the summed cell count, write the merged cell count
# table to disk, and accumulate the total across batches.
total_cells = 0
for batch in batches:
    batch_metadata_df = all_cell_count_df[batch]
    batch_cell_count = batch_metadata_df["cell_count"].sum()

    print(f"In batch {batch}, we profiled {batch_cell_count:,} cells")

    output_cell_count_summary_file = cell_count_dir / f"{batch}_metadata_cell_count_summary.tsv.gz"
    # The file name declares tab-separated values, but to_csv defaults to a comma
    # separator; pass sep="\t" so the content matches the ".tsv" extension.
    # Compression is left to pandas' default inference from the ".gz" suffix.
    # NOTE(review): readers of this file must load it with sep="\t".
    batch_metadata_df.to_csv(output_cell_count_summary_file, sep="\t", index=False)

    total_cells += batch_cell_count

print(f"\nWe profiled a total of {total_cells:,} cells in the LINCS Cell Painting dataset")

In batch 2016_04_01_a549_48hr_batch1, we profiled 110,012,425 cells
In batch 2017_12_05_Batch2, we profiled 49,705,063 cells

We profiled a total of 159,717,488 cells in the LINCS Cell Painting dataset
