# Process LINCS Cell Painting Data into Training and Testing Data

Split all available A549 profiles into 85% training and 15% testing data.
The split is stratified by perturbation (`Metadata_pert_id`), so each perturbation is proportionally represented in both sets.

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Seed NumPy's global random number generator so stochastic steps are repeatable
np.random.seed(seed=123)

In [3]:
# Fraction of all profiles held out as testing data; the rest is used for training
test_proportion: float = 0.15

In [4]:
# Identifiers of the project and of the A549 batch processed in this notebook
project_name = "2015_10_05_DrugRepurposing_AravindSubramanian_GolubLab_Broad"
batch_name = "2016_04_01_a549_48hr_batch1"

# Directory layout: ~/bucket/projects/<project_name>/workspace/backend/<batch_name>
project_dir = os.path.join("~", "bucket", "projects", project_name)
data_dir = os.path.join(project_dir, "workspace", "backend", batch_name)

In [5]:
# The profile CSV is named after the batch and lives in the batch's backend folder
profile_file = os.path.join(data_dir, batch_name + ".csv")
df = pd.read_csv(profile_file, low_memory=False)

# Some perturbation names are missing from the profile metadata; fill them in
# by perturbation ID.
missing_pert_inames = {
    "BRD-K60230970": "MG-132",
    "BRD-K50691590": "bortezomib",
}
for pert_id, pert_iname in missing_pert_inames.items():
    df.loc[df["Metadata_pert_id"] == pert_id, "Metadata_pert_iname"] = pert_iname

# DMSO rows get "DMSO" as both the perturbation name and the perturbation ID
is_dmso = df["Metadata_broad_sample"] == "DMSO"
df.loc[is_dmso, ["Metadata_pert_iname", "Metadata_pert_id"]] = "DMSO"

print(df.shape)
df.head(2)

(10752, 426)


Unnamed: 0,Metadata_broad_sample,Metadata_mmoles_per_liter,Metadata_moa,Metadata_pert_id,Metadata_pert_id_vendor,Metadata_pert_idose,Metadata_pert_iname,Metadata_pert_mfc_desc,Metadata_pert_mfc_id,Metadata_pert_vehicle,...,Nuclei_Texture_InverseDifferenceMoment_Mito_5_0,Nuclei_Texture_InverseDifferenceMoment_RNA_10_0,Nuclei_Texture_InverseDifferenceMoment_RNA_20_0,Nuclei_Texture_SumAverage_DNA_20_0,Nuclei_Texture_SumAverage_Mito_5_0,Nuclei_Texture_SumAverage_RNA_20_0,Nuclei_Texture_SumEntropy_DNA_10_0,Nuclei_Texture_SumEntropy_DNA_5_0,Nuclei_Texture_Variance_ER_20_0,Nuclei_Texture_Variance_RNA_10_0
0,BRD-A00147595-001-01-5,0.041152,insulin sensitizer|PPAR receptor partial agonist,BRD-A00147595,,0.041152,balaglitazone,balaglitazone,BRD-A00147595-001-01-5,DMSO,...,0.081673,0.650381,0.804025,-0.143705,0.595971,-0.874259,0.058965,-0.245046,-0.012409,-1.354363
1,BRD-A00147595-001-01-5,0.123457,insulin sensitizer|PPAR receptor partial agonist,BRD-A00147595,,0.123457,balaglitazone,balaglitazone,BRD-A00147595-001-01-5,DMSO,...,0.335284,0.827858,0.998955,-0.621075,0.773121,0.098758,0.650768,0.303148,0.588129,-1.264081


In [6]:
# Load Additional Annotations
# Load the additional annotations (tab-separated column metadata) stored in the
# project's metadata workspace
folder = "2016_04_01_a549_48hr_batch1_CellPainting_CPfeats_whitened"
meta_dir = os.path.join(project_dir, "workspace", "metadata", folder)

annot_file = os.path.join(meta_dir, "level_4_col_meta_n52223.txt")
annot_df = pd.read_csv(annot_file, sep="\t")

In [7]:
# Count the annotation rows per cell line
annot_df["cell_id"].value_counts()

A549    52223
Name: cell_id, dtype: int64

In [8]:
# Hold out `test_proportion` of the profiles for testing. Stratifying on
# Metadata_pert_id keeps every perturbation proportionally represented in both
# partitions.
# random_state is passed explicitly (same value as the notebook's NumPy seed) so
# the split does not depend on the global NumPy RNG state at the moment this
# cell runs, e.g. after re-running it or executing cells out of order.
train_x, test_x = train_test_split(
    df,
    test_size=test_proportion,
    stratify=df["Metadata_pert_id"],
    random_state=123,
)

## Output Training and Testing Data

In [9]:
print(train_x.shape)

# Create the output folder if it is missing; to_csv does not create directories.
output_dir = "data"
os.makedirs(output_dir, exist_ok=True)

# Tab-separated output; pandas infers gzip compression from the ".gz" suffix.
file = os.path.join(output_dir, "cp_train_data.tsv.gz")
train_x.to_csv(file, sep='\t', index=False)

(9139, 426)


In [10]:
print(test_x.shape)

# Create the output folder if it is missing; to_csv does not create directories.
output_dir = "data"
os.makedirs(output_dir, exist_ok=True)

# Tab-separated output; pandas infers gzip compression from the ".gz" suffix.
file = os.path.join(output_dir, "cp_test_data.tsv.gz")
test_x.to_csv(file, sep='\t', index=False)

(1613, 426)
