## Generate Morpheus Input Data

**Gregory Way, 2019**

Use this script to concatenate all of the cell painting data into one `.gct` file for input into morpheus.

In [1]:
import os
import pandas as pd

from pycytominer import write_gct
from pycytominer.cyto_utils import infer_cp_features

In [2]:
%load_ext rpy2.ipython

In [3]:
batch_id = "CRISPR_PILOT_B1"
backend_dir = os.path.join("..", "1.generate-profiles", "data", "profiles")

plate_dirs = [os.path.join(backend_dir, x) for x in os.listdir(backend_dir) if x != ".DS_Store"]
plate_dirs

['../1.generate-profiles/data/profiles/SQ00014618',
 '../1.generate-profiles/data/profiles/SQ00014611',
 '../1.generate-profiles/data/profiles/SQ00014616',
 '../1.generate-profiles/data/profiles/SQ00014617',
 '../1.generate-profiles/data/profiles/SQ00014610',
 '../1.generate-profiles/data/profiles/SQ00014615',
 '../1.generate-profiles/data/profiles/SQ00014612',
 '../1.generate-profiles/data/profiles/SQ00014613',
 '../1.generate-profiles/data/profiles/SQ00014614']

In [4]:
# Build full cell painting dataset
df_list = []
all_plate_files = {}
for plate_dir in plate_dirs:
    plate_files = os.listdir(plate_dir)
    for plate_file in plate_files:
        if "normalized_feature_select.csv" in plate_file:
            full_plate_file = os.path.join(plate_dir, plate_file)
            
            plate = plate_dir.split("/")[-1]
            all_plate_files[plate] = full_plate_file
            
            df = pd.read_csv(full_plate_file)
            print("reading {} with profile count: {}".format(plate_file, df.shape[0]))
            df_list.append(df)

reading SQ00014618_normalized_feature_select.csv.gz with profile count: 384
reading SQ00014611_normalized_feature_select.csv.gz with profile count: 384
reading SQ00014616_normalized_feature_select.csv.gz with profile count: 384
reading SQ00014617_normalized_feature_select.csv.gz with profile count: 384
reading SQ00014610_normalized_feature_select.csv with profile count: 384
reading SQ00014610_normalized_feature_select.csv.gz with profile count: 384
reading SQ00014615_normalized_feature_select.csv.gz with profile count: 384
reading SQ00014612_normalized_feature_select.csv.gz with profile count: 384
reading SQ00014613_normalized_feature_select.csv.gz with profile count: 384
reading SQ00014614_normalized_feature_select.csv.gz with profile count: 384


In [5]:
# Combine into a single file
cp_df = pd.concat(df_list, sort=True)#.reset_index(drop=True)
cp_features = infer_cp_features(cp_df)
meta_features = cp_df.drop(cp_features, axis="columns").columns.tolist()

cp_df = cp_df.loc[:, meta_features + cp_features]

print(cp_df.shape)
cp_df.head()

(3840, 1642)


Unnamed: 0,Image_Metadata_Plate,Image_Metadata_Well,Metadata_WellCol,Metadata_WellRow,Metadata_broad_sample,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Cells_AreaShape_Area,Cells_AreaShape_Center_X,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,SQ00014618,A01,1,A,,HCC44,EMPTY,EMPTY,-0.891801,0.870588,...,-0.888368,-0.273388,-0.136467,-0.709549,-0.754013,-0.834143,-0.937134,0.829446,0.69203,0.691189
1,SQ00014618,A02,2,A,,HCC44,MCL1,MCL1-5,0.739305,1.247059,...,0.485662,1.515287,1.43019,1.170855,0.48787,0.880912,0.231417,0.937816,1.829701,0.636011
2,SQ00014618,A03,3,A,BRDN0001054908,HCC44,AKT1,AKT1-1,-0.24629,0.352941,...,-0.948205,-0.892036,-0.733756,-0.982148,-0.949812,-0.236291,-0.943733,1.497683,2.65479,0.58533
3,SQ00014618,A04,4,A,,HCC44,KRAS,KRAS-2B,0.606737,0.188235,...,-1.14163,-0.529351,-0.35161,-0.698613,-1.072429,-0.947348,-1.086496,1.222672,1.554963,0.713868
4,SQ00014618,A05,5,A,BRDN0001055115,HCC44,AKT1,AKT1-2,0.472869,1.905882,...,-0.371642,-0.960291,-0.609222,-1.173577,-0.730081,-0.547583,-0.670532,1.083936,1.909282,0.296592


# Extract Data for the Most Replicable Genes

The genes include: *ITGAV*, *KIF11*, *MYC*, *POLR2D*, and *PSMA1*. (Plus one control LacZ)

In [6]:
# We are interested here in the most replicable CRISPR'd genes
genes = ['ITGAV', 'KIF11', 'MYC', 'POLR2D', 'PSMA1', 'LacZ']
cp_genes_df = cp_df.query("Metadata_gene_name in @genes").reset_index(drop=True)

cp_genes_df = (
    cp_genes_df
    .groupby(
        ['Metadata_cell_line', 'Metadata_gene_name', 'Metadata_pert_name']
    )
    .mean()
    .reset_index()
)

print(cp_genes_df.shape)
cp_genes_df.head(2)

(36, 1638)


Unnamed: 0,Metadata_cell_line,Metadata_gene_name,Metadata_pert_name,Metadata_WellCol,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_Extent,...,Nuclei_Texture_Variance_DNA_5_0,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_20_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_20_0,Nuclei_Texture_Variance_RNA_5_0
0,A549,ITGAV,ITGAV-1,12.5,2.052464,-0.299906,0.934997,1.026237,0.582357,-1.473252,...,1.90635,2.031859,2.016701,2.474035,2.431481,2.531894,2.525777,0.628726,1.229263,0.698393
1,A549,ITGAV,ITGAV-2,12.5,2.405602,0.012288,-0.652569,0.758848,0.068491,-1.252881,...,2.496538,2.529136,2.695503,2.898689,3.005495,3.449669,3.009599,0.756017,1.493282,0.748728


## Create `.gct` files for Morpheus heatmap inputs

In [7]:
# Build and output gct file for all genes
output_file = os.path.join(
    "results", "morpheus", "full_genes_morpheus.gct"
)

write_gct(
    profiles=cp_df,
    output_file=output_file,
    features=cp_features
)

In [8]:
# Build and output gct file for select genes
output_file = os.path.join(
    "results", "morpheus", "reproducible_genes.gct"
)

write_gct(
    profiles=cp_genes_df,
    output_file=output_file,
    features=cp_features
)

In [9]:
# Write a gct file for all plates
for plate in all_plate_files:

    df = pd.read_csv(all_plate_files[plate])
    
    output_file = os.path.join(
        "results", "morpheus", "{}_morpheus.gct".format(plate)
    )
    
    write_gct(
        profiles=df,
        output_file=output_file,
        features=cp_features
    )