In [1]:
import pathlib
import sqlite3
import sys

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features

from utils.single_cell_utils import process_sites, normalize_sc
sys.path.append("../0.generate-profiles")
from scripts.profile_util import load_config

In [2]:
# Seed numpy's global random state so the well sampling (np.random.choice) and
# the row shuffles (DataFrame.sample without random_state) are reproducible.
# numpy is called directly: the `pd.np` alias was removed in pandas 2.0.
np.random.seed(1234)

In [3]:
# --- Which plate to process ---
batch = "2019_03_20_Batch2"
plate = "207106_exposure320"

# --- Options handed to process_sites / normalize_sc ---
feature_filter = ["Object", "Location", "Count", "Parent"]
scaler_method = "standard"
test_split_prop = 0.15
seed = 123

# --- Options handed to pycytominer's feature_select ---
feature_select_opts = ["variance_threshold", "drop_na_columns", "blacklist", "drop_outliers"]
na_cutoff = 0
corr_threshold = 0.8

In [4]:
# Read the profiling config; load_config returns the pipeline settings together
# with the single cell file locations
config = pathlib.Path("../0.generate-profiles/profile_config.yaml")
loaded_config = load_config(config, append_sql_prefix=False)
pipeline, single_cell_files = loaded_config

In [5]:
# Batch-specific backend and metadata directories live under the workspace root
workspace_dir = pipeline["workspace_dir"]
workspace_root = pathlib.Path(workspace_dir)
batch_dir = workspace_root / "backend" / batch
metadata_dir = workspace_root / "metadata" / batch

# The barcode platemap links each assay plate barcode to its plate map name
barcode_plate_map_file = metadata_dir / "barcode_platemap.csv"
barcode_plate_map_df = pd.read_csv(barcode_plate_map_file)

barcode_plate_map_df

Unnamed: 0,Assay_Plate_Barcode,Plate_Map_Name,Batch_Number
0,207106_exposure320,PlateMap_207106_exposure320,2


In [6]:
# Find the plate map name registered for the selected plate barcode
is_selected_plate = barcode_plate_map_df["Assay_Plate_Barcode"] == plate
plate_map_name = barcode_plate_map_df.loc[is_selected_plate, "Plate_Map_Name"].values[0]

plate_map_file = metadata_dir / "platemap" / f"{plate_map_name}.txt"
plate_map_df = pd.read_csv(plate_map_file, sep="\t")

# Prefix every platemap column with "Metadata_" unless it already carries it
plate_map_df.columns = [
    column if column.startswith("Metadata_") else f"Metadata_{column}"
    for column in plate_map_df.columns
]
plate_map_df.head()

Unnamed: 0,Metadata_plate_map_name,Metadata_well_position,Metadata_CellLine,Metadata_Dosage
0,PlateMap_207106_exposure320,B02,WT,0.0
1,PlateMap_207106_exposure320,B03,WT,0.0
2,PlateMap_207106_exposure320,B04,WT,0.0
3,PlateMap_207106_exposure320,B05,CloneA,0.0
4,PlateMap_207106_exposure320,B06,CloneA,0.0


## Setup Single Cell Connection

In [7]:
# Names of the plate and well columns, as declared in the config's aggregate section
aggregate_config = pipeline["aggregate"]
plate_column = aggregate_config["plate_column"]
well_column = aggregate_config["well_column"]

In [8]:
# Open the sqlite file registered for this batch/plate in the config
plate_sqlite_files = single_cell_files[batch]["plates"]
single_cell_sqlite = plate_sqlite_files[plate]
conn = sqlite3.connect(single_cell_sqlite)

In [9]:
# Pull the image identifiers plus plate/well labels from the sqlite image table
image_cols = f"TableNumber, ImageNumber, {plate_column}, {well_column}"
image_query = f"select {image_cols} from image"
sql_image_df = pd.read_sql_query(image_query, conn)

# Annotate every image with its platemap row (matched on well); the platemap's
# own well column duplicates the image table's well column, so drop it
annotated_image_df = pd.merge(
    sql_image_df,
    plate_map_df,
    left_on=well_column,
    right_on="Metadata_well_position",
)
image_df = annotated_image_df.drop(columns=["Metadata_well_position"])

print(image_df.shape)
image_df.head()

(324, 7)


Unnamed: 0,TableNumber,ImageNumber,Metadata_Plate,Metadata_Well,Metadata_plate_map_name,Metadata_CellLine,Metadata_Dosage
0,35063824554719371464203761079871132620,1,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0
1,55321079170264086709741025581452256546,37,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0
2,156575971115499494274828396611545171867,73,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0
3,177833145266349265724759827001312244688,109,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0
4,322613752142964989790892398074175721670,145,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0


## Identify Representative Wells

In [10]:
# Every row of image_df must carry a distinct ImageNumber
assert image_df["ImageNumber"].is_unique

In [11]:
# Number of images per cell line
image_df["Metadata_CellLine"].value_counts()

WT        108
CloneE    108
CloneA    108
Name: Metadata_CellLine, dtype: int64

In [12]:
# Number of images per well
image_df["Metadata_Well"].value_counts()

E09    9
E03    9
D05    9
E02    9
D08    9
D04    9
E10    9
C03    9
B09    9
E05    9
D10    9
D03    9
B02    9
C06    9
B03    9
B06    9
C07    9
C04    9
E06    9
B08    9
B04    9
B05    9
D07    9
B10    9
B07    9
C05    9
E07    9
C08    9
D06    9
D02    9
C09    9
E04    9
D09    9
C10    9
C02    9
E08    9
Name: Metadata_Well, dtype: int64

In [13]:
def sample_wells(cell_line, n_wells, exclude_wells=()):
    """Randomly draw dosage-0 wells of one cell line, without replacement.

    Parameters
    ----------
    cell_line : str
        Value of Metadata_CellLine to select from image_df.
    n_wells : int
        Number of distinct wells to draw.
    exclude_wells : iterable of str, optional
        Wells that must not be drawn (used to keep holdout wells disjoint from
        the wells already assigned to training).

    Returns
    -------
    numpy.ndarray
        The sampled Metadata_Well values.

    Raises
    ------
    ValueError
        Raised by np.random.choice when fewer than n_wells candidates remain.
    """
    is_candidate = (
        (image_df.Metadata_CellLine == cell_line)
        & (image_df.Metadata_Dosage == 0)
        & ~image_df.Metadata_Well.isin(list(exclude_wells))
    )
    candidate_wells = image_df.loc[is_candidate, "Metadata_Well"].unique()
    # numpy is called directly: the `pd.np` alias was removed in pandas 2.0
    return np.random.choice(candidate_wells, size=n_wells, replace=False)


# The order of these draws is significant: each call consumes the global numpy
# random state, so reordering them changes which wells are selected.
clone_e_wells = sample_wells("CloneE", n_wells=2)
wt_wells = sample_wells("WT", n_wells=2)
clone_e_holdout_wells = sample_wells("CloneE", n_wells=1, exclude_wells=clone_e_wells)
wt_holdout_wells = sample_wells("WT", n_wells=1, exclude_wells=wt_wells)
clone_a_wells = sample_wells("CloneA", n_wells=1)

# One group per line (the original print was missing a newline before the
# Clone E holdout entry)
print(
    f"Clone E Wells: {clone_e_wells}",
    f"WT Wells: {wt_wells}",
    f"Clone E Holdout Wells: {clone_e_holdout_wells}",
    f"WT Holdout Wells: {wt_holdout_wells}",
    f"Clone A Wells: {clone_a_wells}",
    sep="\n",
)

Clone E Wells: ['B08' 'B09'] 
WT Wells: ['B03' 'B04'] Clone E Holdout Wells: ['B10'] 
WT Holdout Wells: ['B02'] 
Clone A Wells: ['B07']


## Load Cells

In [14]:
# Sampled wells for each group, in the order the groups are processed downstream
wells_by_group = {
    "clone_e": clone_e_wells,
    "wt": wt_wells,
    "clone_a": clone_a_wells,
    "clone_e_holdout": clone_e_holdout_wells,
    "wt_holdout": wt_holdout_wells,
}

# Collect the ImageNumbers of every image taken in each group's wells
imagenumber_dict = {
    group: image_df.loc[image_df["Metadata_Well"].isin(wells), "ImageNumber"].tolist()
    for group, wells in wells_by_group.items()
}

imagenumber_dict

{'clone_e': [7,
  43,
  79,
  115,
  151,
  187,
  223,
  259,
  295,
  8,
  44,
  80,
  116,
  152,
  188,
  224,
  260,
  296],
 'wt': [2,
  38,
  74,
  110,
  146,
  182,
  218,
  254,
  290,
  3,
  39,
  75,
  111,
  147,
  183,
  219,
  255,
  291],
 'clone_a': [6, 42, 78, 114, 150, 186, 222, 258, 294],
 'clone_e_holdout': [9, 45, 81, 117, 153, 189, 225, 261, 297],
 'wt_holdout': [1, 37, 73, 109, 145, 181, 217, 253, 289]}

In [15]:
train_dict_df = {}
test_dict_df = {}
holdout_dict_df = {}

# Settings shared by every process_sites call; normalization is deferred to a
# later cell, hence normalize=False here
site_options = dict(
    connection=conn,
    image_df=image_df,
    feature_filter=feature_filter,
    scaler_method=scaler_method,
    seed=seed,
    test_split_prop=test_split_prop,
    normalize=False,
)

for clone_type, clone_imagenumbers in imagenumber_dict.items():
    print(f"Now processing clone: {clone_type}")
    clone_train_df, clone_test_df = process_sites(imagenumbers=clone_imagenumbers, **site_options)
    print(clone_train_df.shape)
    print(clone_test_df.shape)

    if clone_type not in ("clone_e", "wt"):
        # Holdout groups store only the train portion returned by process_sites;
        # the test portion is not kept.
        # NOTE(review): presumably intentional, but it leaves part of each
        # holdout well unused - confirm.
        holdout_dict_df[clone_type] = clone_train_df
        continue

    train_dict_df[clone_type] = clone_train_df.reset_index(drop=True)
    test_dict_df[clone_type] = clone_test_df.reset_index(drop=True)

Now processing clone: clone_e
ImageNumber: 7
ImageNumber: 43
ImageNumber: 79
ImageNumber: 115
ImageNumber: 151
ImageNumber: 187
ImageNumber: 223
ImageNumber: 259
ImageNumber: 295
ImageNumber: 8
ImageNumber: 44
ImageNumber: 80
ImageNumber: 116
ImageNumber: 152
ImageNumber: 188
ImageNumber: 224
ImageNumber: 260
ImageNumber: 296
(17816, 3422)
(3153, 3422)
Now processing clone: wt
ImageNumber: 2
ImageNumber: 38
ImageNumber: 74
ImageNumber: 110
ImageNumber: 146
ImageNumber: 182
ImageNumber: 218
ImageNumber: 254
ImageNumber: 290
ImageNumber: 3
ImageNumber: 39
ImageNumber: 75
ImageNumber: 111
ImageNumber: 147
ImageNumber: 183
ImageNumber: 219
ImageNumber: 255
ImageNumber: 291
(12205, 3422)
(2164, 3422)
Now processing clone: clone_a
ImageNumber: 6
ImageNumber: 42
ImageNumber: 78
ImageNumber: 114
ImageNumber: 150
ImageNumber: 186
ImageNumber: 222
ImageNumber: 258
ImageNumber: 294
(9236, 3422)
(1635, 3422)
Now processing clone: clone_e_holdout
ImageNumber: 9
ImageNumber: 45
ImageNumber: 81
Image

In [16]:
# Stack the per-clone frames and normalize; train and test are each passed
# through normalize_sc on their own
stacked_train_df = pd.concat(train_dict_df).reset_index(drop=True)
train_df = normalize_sc(stacked_train_df, scaler_method=scaler_method)
stacked_test_df = pd.concat(test_dict_df).reset_index(drop=True)
test_df = normalize_sc(stacked_test_df, scaler_method=scaler_method)

# Shuffle the row order (frac=1 keeps every row) and renumber the index
shuffled_train_df = train_df.sample(frac=1)
train_df = shuffled_train_df.reset_index(drop=True)
shuffled_test_df = test_df.sample(frac=1)
test_df = shuffled_test_df.reset_index(drop=True)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


## Apply Feature Selection

In [17]:
# Shapes before feature selection (train first, then test)
for split_df in (train_df, test_df):
    print(split_df.shape)

(30021, 3422)
(5317, 3422)


In [18]:
# Ask pycytominer for the metadata columns of the training frame
meta_features = infer_cp_features(
    train_df,
    metadata=True,
)
meta_features

['Metadata_TableNumber',
 'Metadata_ImageNumber',
 'Metadata_Plate',
 'Metadata_Well',
 'Metadata_plate_map_name',
 'Metadata_CellLine',
 'Metadata_Dosage']

In [19]:
# Run the configured feature selection operations on the training data only
selection_options = dict(
    operation=feature_select_opts,
    na_cutoff=na_cutoff,
    corr_threshold=corr_threshold,
)
train_df = feature_select(train_df, **selection_options)

# Columns to keep everywhere: the metadata columns followed by whichever
# features remain in the training frame after selection
selected_features = infer_cp_features(train_df)
reindex_features = meta_features + selected_features

# Give the test and train frames the same column set and order
test_df = test_df.reindex(columns=reindex_features)
train_df = train_df.reindex(columns=reindex_features)

In [20]:
# Shapes once feature selection has been applied (train first, then test)
for split_df in (train_df, test_df):
    print(split_df.shape)

(30021, 916)
(5317, 916)


## Output Files

In [21]:
# Create the output directory if it is missing so a run from a fresh checkout
# does not fail at the very last step
output_dir = pathlib.Path("data")
output_dir.mkdir(parents=True, exist_ok=True)

# Write the training and testing sets as gzip-compressed, tab-separated files
out_file = output_dir / "example_train.tsv.gz"
train_df.to_csv(out_file, sep="\t", compression="gzip", index=False)

out_file = output_dir / "example_test.tsv.gz"
test_df.to_csv(out_file, sep="\t", compression="gzip", index=False)

In [22]:
# Normalize, shuffle row order, and output for holdout sets

# Create the output directory if it is missing so the writes below cannot fail
# on a fresh checkout
holdout_output_dir = pathlib.Path("data")
holdout_output_dir.mkdir(parents=True, exist_ok=True)

for clone_type in holdout_dict_df:
    # Each holdout group is passed through normalize_sc on its own
    holdout_df = normalize_sc(
        holdout_dict_df[clone_type].reset_index(drop=True),
        scaler_method=scaler_method,
    )
    # Shuffle rows, then restrict/order columns to those chosen from the training data
    holdout_df = (
        holdout_df
        .sample(frac=1)
        .reset_index(drop=True)
        .reindex(reindex_features, axis="columns")
    )
    print(clone_type)
    print(holdout_df.shape)
    out_file = holdout_output_dir / f"example_holdout_{clone_type}.tsv.gz"
    holdout_df.to_csv(out_file, sep="\t", compression="gzip", index=False)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


clone_a
(9236, 916)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


clone_e_holdout
(7023, 916)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


wt_holdout
(6453, 916)
