## Generate Morpheus Input Data

**Gregory Way, 2019**

Use this script to concatenate all of the cell painting data into one `.gct` file for input into morpheus.

In [1]:
import os
import pandas as pd

from pycytominer import write_gct
from pycytominer.cyto_utils import infer_cp_features

In [2]:
%load_ext rpy2.ipython

In [3]:
batch_id = "CRISPR_PILOT_B1"
backend_dir = os.path.join("..", "1.generate-profiles", "data", "profiles")

plate_dirs = [os.path.join(backend_dir, x) for x in os.listdir(backend_dir) if x != ".DS_Store"]
plate_dirs

['../1.generate-profiles/data/profiles/SQ00014618',
 '../1.generate-profiles/data/profiles/SQ00014611',
 '../1.generate-profiles/data/profiles/SQ00014616',
 '../1.generate-profiles/data/profiles/SQ00014617',
 '../1.generate-profiles/data/profiles/SQ00014610',
 '../1.generate-profiles/data/profiles/SQ00014615',
 '../1.generate-profiles/data/profiles/SQ00014612',
 '../1.generate-profiles/data/profiles/SQ00014613',
 '../1.generate-profiles/data/profiles/SQ00014614']

In [4]:
# Build full cell painting dataset
df_list = []
all_plate_files = {}
for plate_dir in plate_dirs:
    plate_files = os.listdir(plate_dir)
    for plate_file in plate_files:
        if "normalized_feature_select.csv" in plate_file:
            full_plate_file = os.path.join(plate_dir, plate_file)
            
            plate = plate_dir.split("/")[-1]
            all_plate_files[plate] = full_plate_file
            
            df = pd.read_csv(full_plate_file)
            print("reading {} with profile count: {}".format(plate_file, df.shape[0]))
            df_list.append(df)

reading SQ00014618_normalized_feature_select.csv.gz with profile count: 384
reading SQ00014611_normalized_feature_select.csv.gz with profile count: 384
reading SQ00014616_normalized_feature_select.csv.gz with profile count: 384
reading SQ00014617_normalized_feature_select.csv.gz with profile count: 384
reading SQ00014610_normalized_feature_select.csv.gz with profile count: 384
reading SQ00014615_normalized_feature_select.csv.gz with profile count: 384
reading SQ00014612_normalized_feature_select.csv.gz with profile count: 384
reading SQ00014613_normalized_feature_select.csv.gz with profile count: 384
reading SQ00014614_normalized_feature_select.csv.gz with profile count: 384


In [5]:
# Combine into a single file
cp_df = pd.concat(df_list, sort=True)#.reset_index(drop=True)
cp_features = infer_cp_features(cp_df)
meta_features = cp_df.drop(cp_features, axis="columns").columns.tolist()

cp_df = cp_df.loc[:, meta_features + cp_features]

print(cp_df.shape)
cp_df.head()

(3456, 1633)


Unnamed: 0,Image_Metadata_Plate,Image_Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_broad_sample,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Cells_AreaShape_Area,Cells_AreaShape_Center_X,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,SQ00014618,A01,1,A,,HCC44,EMPTY,EMPTY,-1.177142,,...,-1.308729,-0.375032,-0.180865,-0.956427,,,,,,0.919439
1,SQ00014618,A02,2,A,,HCC44,MCL1,MCL1-5,0.975853,,...,0.715469,2.078659,1.895482,1.578239,,,,,,0.84604
2,SQ00014618,A03,3,A,BRDN0001054908,HCC44,AKT1,AKT1-1,-0.325094,,...,-1.396879,-1.223689,-0.972473,-1.323873,,,,,,0.778623
3,SQ00014618,A04,4,A,,HCC44,KRAS,KRAS-2B,0.800868,,...,-1.681831,-0.72616,-0.466001,-0.941686,,,,,,0.949607
4,SQ00014618,A05,5,A,BRDN0001055115,HCC44,AKT1,AKT1-2,0.624168,,...,-0.547497,-1.31732,-0.807424,-1.581907,,,,,,0.394535


# Extract Data for the Most Replicable Genes

The genes include: *ITGAV*, *KIF11*, *MYC*, *POLR2D*, and *PSMA1*. (Plus one control LacZ)

In [6]:
# We are interested here in the most replicable CRISPR'd genes
genes = ['ITGAV', 'KIF11', 'MYC', 'POLR2D', 'PSMA1', 'LacZ']
cp_genes_df = cp_df.query("Metadata_gene_name in @genes").reset_index(drop=True)

cp_genes_df = (
    cp_genes_df
    .groupby(
        ['Metadata_cell_line', 'Metadata_gene_name', 'Metadata_pert_name']
    )
    .mean()
    .reset_index()
)

print(cp_genes_df.shape)
cp_genes_df.head(2)

(36, 1629)


Unnamed: 0,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Metadata_WellCol,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,A549,ITGAV,ITGAV-1,12.5,2.965886,0.236253,1.796267,1.313601,0.703083,-1.877379,...,2.747207,3.23764,2.837458,3.693132,3.603531,3.735676,4.013545,1.016681,1.90638,1.152944
1,A549,ITGAV,ITGAV-2,12.5,3.11251,-0.393902,-1.180283,1.247236,0.282965,-1.917632,...,3.193628,3.517475,3.25049,3.904908,3.805202,4.348153,4.399,0.772239,1.752225,0.683402


## Create `.gct` files for Morpheus heatmap inputs

In [7]:
# Build and output gct file for all genes
output_file = os.path.join(
    "results", "morpheus", "full_genes_morpheus.gct"
)

write_gct(
    profiles=cp_df,
    output_file=output_file,
    features=cp_features
)

In [8]:
# Build and output gct file for select genes
output_file = os.path.join(
    "results", "morpheus", "reproducible_genes.gct"
)

write_gct(
    profiles=cp_genes_df,
    output_file=output_file,
    features=cp_features
)

In [9]:
# Write a gct file for all plates
for plate in all_plate_files:

    df = pd.read_csv(all_plate_files[plate])
    
    output_file = os.path.join(
        "results", "morpheus", "{}_morpheus.gct".format(plate)
    )
    
    write_gct(
        profiles=df,
        output_file=output_file,
        features=cp_features
    )

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)
