# Generate Morphological Profiles

**Gregory Way, 2019**

Here, I use [pycytominer](https://github.com/cytomining/pycytominer) to aggregate, annotate, normalize, and apply feature selection to Cell Painting profiles.

In [1]:
import os
import pandas as pd

from pycytominer.aggregate import AggregateProfiles
from pycytominer.annotate import annotate
from pycytominer.normalize import normalize
from pycytominer.feature_select import feature_select
from pycytominer.audit import audit

In [2]:
# Name of the project folder; used below to build the workspace path
project_name = "2018_05_30_ResistanceMechanisms_Kapoor"

In [3]:
# Absolute path to the workspace directory of this project
workspace_dir = os.path.join(
    os.path.abspath(os.sep),
    "home",
    "ubuntu",
    "bucket",
    "projects",
    project_name,
    "workspace",
)

# Every entry under `backend` is treated as one batch.
# os.listdir returns entries in arbitrary order, so sort to keep the
# processing order (and the displayed output) reproducible across runs.
batches = sorted(os.listdir(os.path.join(workspace_dir, "backend")))
batches

['2019_02_15_Batch1_20X',
 '2019_02_15_Batch1_40X',
 '2019_03_20_Batch2',
 '2019_06_25_Batch3']

In [4]:
# Map each batch to the entries found in its backend folder (one entry per plate).
# Built with a dict comprehension instead of a list comprehension run only for
# its `append` side effects; sorted because os.listdir order is arbitrary.
plates = {
    batch: sorted(os.listdir(os.path.join(workspace_dir, "backend", batch)))
    for batch in batches
}

plates

{'2019_02_15_Batch1_20X': ['HCT116bortezomib'],
 '2019_02_15_Batch1_40X': ['HCT116bortezomib'],
 '2019_03_20_Batch2': ['207106_exposure320'],
 '2019_06_25_Batch3': ['MutClones', 'WTClones']}

In [5]:
# Feature selection steps passed to pycytominer's `feature_select`, in this order
feature_select_operations = [
    "drop_na_columns",
    "blacklist",
    "variance_threshold",
    "correlation_threshold",
]

In [6]:
# Batches that this notebook deliberately does not process.
# NOTE(review): the reason for skipping them is not recorded here - confirm before removing.
skip_batches = {"2019_02_15_Batch1_20X", "2019_02_15_Batch1_40X", "2019_03_20_Batch2"}

# Cell count tables are written to this folder; create it up front so that
# `to_csv` below cannot fail because the directory is missing.
results_dir = "results"
os.makedirs(results_dir, exist_ok=True)

for batch in batches:
    # Skip before touching any metadata so that ignored batches cost nothing
    # and cannot fail on a missing or malformed metadata folder.
    if batch in skip_batches:
        continue

    batch_dir = os.path.join(workspace_dir, "backend", batch)
    metadata_dir = os.path.join(workspace_dir, "metadata", batch)

    # NOTE(review): assumes the barcode-to-platemap CSV is the alphabetically
    # first entry of the metadata folder - TODO confirm.
    barcode_plate_map_file = os.path.join(metadata_dir, sorted(os.listdir(metadata_dir))[0])
    barcode_plate_map_df = pd.read_csv(barcode_plate_map_file)

    for plate in plates[batch]:
        print("Now processing... batch: {}, plate: {}".format(batch, plate))

        # Look up which platemap file describes this plate
        matching_plate_maps = (
            barcode_plate_map_df
            .query("Assay_Plate_Barcode == @plate")
            .Plate_Map_Name
        )
        if matching_plate_maps.empty:
            # Fail with a message naming the plate instead of a bare IndexError
            raise ValueError(
                "No Plate_Map_Name found for batch: {}, plate: {} in {}".format(
                    batch, plate, barcode_plate_map_file
                )
            )
        plate_map_name = matching_plate_maps.values[0]

        plate_map_file = os.path.join(metadata_dir, "platemap", "{}.txt".format(plate_map_name))
        plate_map_df = pd.read_csv(plate_map_file, sep="\t")

        sql_dir = os.path.join(batch_dir, plate)
        sqlite_file = "sqlite:////{}/{}.sqlite".format(sql_dir, plate)

        # Initialize sql file for processing
        ap = AggregateProfiles(sqlite_file,
                               strata=["Metadata_Plate", "Metadata_Well"],
                               features="all",
                               operation="median")

        # Count cells per well, attach the platemap annotations, and write to file
        cell_count_file = os.path.join(results_dir, "{}_{}_cell_count.tsv".format(batch, plate))
        cell_count_df = ap.count_cells()
        cell_count_df = (
            cell_count_df.merge(
                plate_map_df,
                left_on="Metadata_Well",
                right_on="well_position"
            )
            .drop(["well_position"], axis="columns")
        )
        cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

        # Begin processing profiles
        output_dir = os.path.join("data", "profiles", batch, plate)
        os.makedirs(output_dir, exist_ok=True)

        # Aggregate single cells into well profiles
        out_file = os.path.join(output_dir, "{}.csv.gz".format(plate))
        ap.aggregate_profiles(output_file=out_file, compression="gzip")

        # Drop the reference to the aggregator so it can be released before
        # the remaining steps, which only read the files written above.
        del ap

        # Annotate Profiles
        anno_file = os.path.join(output_dir, "{}_augmented.csv.gz".format(plate))
        annotate(
            profiles=out_file,
            platemap=plate_map_df,
            join_on=["Metadata_well_position", "Metadata_Well"],
            output_file=anno_file,
            compression="gzip",
        )

        # Normalize Profiles
        norm_file = os.path.join(output_dir, "{}_normalized.csv.gz".format(plate))
        normalize(
            profiles=anno_file,
            features="infer",
            samples="all",
            output_file=norm_file,
            compression="gzip",
        )

        # Perform feature selection
        feat_file = os.path.join(
            output_dir, "{}_normalized_feature_select.csv.gz".format(plate)
        )
        feature_select(
            profiles=norm_file,
            features="infer",
            samples="none",
            operation=feature_select_operations,
            output_file=feat_file,
            compression="gzip",
            corr_threshold=0.9,
            corr_method="pearson"
        )

Now processing... batch: 2019_06_25_Batch3, plate: MutClones


MemoryError: 