In [1]:
import sys
import pathlib
import sqlite3
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features

from utils.single_cell_utils import process_sites, normalize_sc
sys.path.append("../0.generate-profiles")
from scripts.profile_util import load_config

In [2]:
# Seed NumPy's global random number generator, which drives the random well
# selection later in the notebook. NumPy is used directly because the
# `pd.np` alias was deprecated in pandas 1.0 and removed in pandas 2.0.
np.random.seed(1234)

In [14]:
# Experiment selection: the batch and plate to read, and the metadata column
# that identifies the cell line of each well
batch, plate = "2020_07_02_Batch8", "218360"
cell_line_column = "Metadata_clone_number"

# Single cell processing and train/test split settings
# (feature_filter is handed to process_sites; presumably feature name
# categories to exclude - confirm against utils.single_cell_utils)
seed = 123
scaler_method = "standard"
test_split_prop = 0.15
feature_filter = ["Object", "Location", "Count", "Parent"]

# Feature selection settings handed to pycytominer's feature_select
# NOTE(review): "correlation_threshold" is not among the operations below, so
# corr_threshold is presumably unused - confirm
na_cutoff = 0
corr_threshold = 0.8
feature_select_opts = ["variance_threshold", "drop_na_columns", "blacklist", "drop_outliers"]

In [4]:
# Load locations of single cell files
# The profiling config lives in the sibling 0.generate-profiles module;
# load_config returns the pipeline options and the single cell file lookup
# that the following cells index by batch and plate.
config = pathlib.Path("../0.generate-profiles/profile_config.yaml")
pipeline, single_cell_files = load_config(config, append_sql_prefix=False)

In [5]:
# Resolve the batch-specific backend and metadata directories
workspace_dir = pipeline["workspace_dir"]
batch_dir = pathlib.Path(workspace_dir) / "backend" / batch
metadata_dir = pathlib.Path(workspace_dir) / "metadata" / batch

# The barcode platemap links each assay plate barcode to its platemap name
barcode_plate_map_file = metadata_dir / "barcode_platemap.csv"
barcode_plate_map_df = pd.read_csv(barcode_plate_map_file)

barcode_plate_map_df

Unnamed: 0,Assay_Plate_Barcode,Plate_Map_Name
0,218360,218360
1,218361,218361
2,218362,218362
3,218363,218363


In [6]:
# Look up the platemap assigned to the plate of interest. Barcodes are compared
# as strings because read_csv may infer a numeric dtype for all-digit barcodes
# while `plate` is defined as a string.
plate_matches = barcode_plate_map_df.loc[
    barcode_plate_map_df.Assay_Plate_Barcode.astype(str) == str(plate),
    "Plate_Map_Name",
]
if plate_matches.empty:
    # Fail with a readable message instead of an IndexError from `.values[0]`
    raise ValueError(f"Plate {plate} not found in {barcode_plate_map_file}")
plate_map_name = plate_matches.values[0]

# Load the platemap itself (tab-separated, one row per well)
plate_map_file = pathlib.Path(metadata_dir, "platemap", f"{plate_map_name}.txt")
plate_map_df = pd.read_csv(plate_map_file, sep="\t")

# Prefix every column with "Metadata_" unless it already carries the prefix
plate_map_df.columns = [
    x if x.startswith("Metadata_") else f"Metadata_{x}" for x in plate_map_df.columns
]
plate_map_df.head()

Unnamed: 0,Metadata_plate_map_name,Metadata_well_position,Metadata_clone_number,Metadata_plate_ID,Metadata_plate_filename,Metadata_treatment,Metadata_treatment_time,Metadata_cell_density
0,218360,B02,WT parental,218360,20200626-WTpAE-Lo,0.1% DMSO,13 hr,2.5x10^3 cells/well
1,218360,B03,WT parental,218360,20200626-WTpAE-Lo,2.1 nM bortezomib,13 hr,2.5x10^3 cells/well
2,218360,B04,WT parental,218360,20200626-WTpAE-Lo,21 nM bortezomib,13 hr,2.5x10^3 cells/well
3,218360,B05,WT parental,218360,20200626-WTpAE-Lo,210 nM bortezomib,13 hr,2.5x10^3 cells/well
4,218360,B06,WT parental,218360,20200626-WTpAE-Lo,0.1% DMSO,13 hr,2.5x10^3 cells/well


## Setup Single Cell Connection

In [7]:
# Names of the plate and well columns, as configured for the aggregate step
aggregate_config = pipeline["aggregate"]
plate_column = aggregate_config["plate_column"]
well_column = aggregate_config["well_column"]

In [8]:
# Establish connection to sqlite file
single_cell_sqlite = single_cell_files[batch]["plates"][plate]

# sqlite3.connect() silently creates an empty database when the path does not
# exist, which would only surface later as a confusing "no such table" error,
# so fail early with an explicit message instead
if not pathlib.Path(single_cell_sqlite).is_file():
    raise FileNotFoundError(f"Single cell sqlite file not found: {single_cell_sqlite}")

conn = sqlite3.connect(single_cell_sqlite)

In [9]:
# Read the image-level identifiers from the sqlite file and annotate every
# image with the platemap metadata of its well
image_cols = f"TableNumber, ImageNumber, {plate_column}, {well_column}"
image_query = f"select {image_cols} from image"
image_df = pd.merge(
    pd.read_sql_query(image_query, conn),
    plate_map_df,
    left_on=well_column,
    right_on="Metadata_well_position",
).drop(columns=["Metadata_well_position"])  # duplicate of the well column after merging

print(image_df.shape)
image_df.head()

(984, 11)


Unnamed: 0,TableNumber,ImageNumber,Metadata_Plate,Metadata_Well,Metadata_plate_map_name,Metadata_clone_number,Metadata_plate_ID,Metadata_plate_filename,Metadata_treatment,Metadata_treatment_time,Metadata_cell_density
0,337567195667998632376188922851239700308,61,218360,B02,218360,WT parental,218360,20200626-WTpAE-Lo,0.1% DMSO,13 hr,2.5x10^3 cells/well
1,207799439167560665032536330184604846742,121,218360,B02,218360,WT parental,218360,20200626-WTpAE-Lo,0.1% DMSO,13 hr,2.5x10^3 cells/well
2,335293908648249163110744433032854670027,181,218360,B02,218360,WT parental,218360,20200626-WTpAE-Lo,0.1% DMSO,13 hr,2.5x10^3 cells/well
3,193238743777114473610479773238969539214,241,218360,B02,218360,WT parental,218360,20200626-WTpAE-Lo,0.1% DMSO,13 hr,2.5x10^3 cells/well
4,248707456829043659205101342022339991799,301,218360,B02,218360,WT parental,218360,20200626-WTpAE-Lo,0.1% DMSO,13 hr,2.5x10^3 cells/well


## Identify Representative Wells

In [10]:
# Every row of the image table must carry a distinct ImageNumber
assert image_df.ImageNumber.is_unique

In [15]:
# Number of images available per cell line
image_df[cell_line_column].value_counts()

Clone A        329
Clone E        328
WT parental    327
Name: Metadata_clone_number, dtype: int64

In [16]:
# Number of images available per well
image_df["Metadata_Well"].value_counts()

F06    17
B09    17
G03    17
F11    17
F05    17
F09    17
G05    17
E04    17
C02    17
F10    17
D09    17
G07    17
E10    17
E11    17
C05    17
G08    17
D06    17
G09    17
G10    17
B06    17
G06    17
C03    17
G04    17
D05    17
D03    17
B08    17
C04    17
C11    17
E08    17
F03    17
G02    17
E03    17
G11    17
C10    17
D02    17
E09    17
E05    17
B11    16
F08    16
C08    16
E02    16
F07    16
B10    16
D11    16
B04    16
C09    16
B03    16
B02    16
D08    16
C07    16
B07    16
D07    16
E06    16
F04    15
F02    15
C06    15
B05    15
E07    14
D10    14
D04    11
Name: Metadata_Well, dtype: int64

In [22]:
def sample_dmso_wells(cell_line, size, exclude=()):
    """Randomly pick distinct 0.1% DMSO wells belonging to one cell line.

    Parameters
    ----------
    cell_line : str
        Value of `cell_line_column` to select (e.g. "Clone E").
    size : int
        Number of wells to draw without replacement.
    exclude : iterable of str, optional
        Wells that must not be drawn (e.g. wells already picked for training).

    Returns
    -------
    numpy.ndarray
        The selected well names.

    Raises
    ------
    ValueError
        Raised by numpy when fewer than `size` candidate wells are available.
    """
    candidate_wells = image_df.loc[
        (image_df[cell_line_column] == cell_line)
        & (image_df["Metadata_treatment"] == "0.1% DMSO")
        & ~image_df["Metadata_Well"].isin(list(exclude)),
        "Metadata_Well",
    ].unique()
    # NumPy is used directly; the `pd.np` alias was removed in pandas 2.0
    return np.random.choice(candidate_wells, size=size, replace=False)


# The order of these draws is kept fixed because every call advances the
# global NumPy random state
clone_e_wells = sample_dmso_wells("Clone E", size=2)
wt_wells = sample_dmso_wells("WT parental", size=2)

# Holdout wells must not overlap with the wells selected above
clone_e_holdout_wells = sample_dmso_wells("Clone E", size=1, exclude=clone_e_wells)
wt_holdout_wells = sample_dmso_wells("WT parental", size=1, exclude=wt_wells)

# Clone A only contributes to the holdout set
clone_a_wells = sample_dmso_wells("Clone A", size=1)

holdout_wells = list(clone_e_holdout_wells) + list(wt_holdout_wells) + list(clone_a_wells)

# One item per line; sep="\n" replaces the hand-written "\n" prefixes, one of
# which was missing its separating comma
print(
    f"Clone E Wells: {clone_e_wells}",
    f"WT Wells: {wt_wells}",
    f"All Holdout Wells: {holdout_wells}",
    f"Holdout: Clone E Wells: {clone_e_holdout_wells}",
    f"Holdout: WT Wells: {wt_holdout_wells}",
    f"Holdout: Clone A Wells: {clone_a_wells}",
    sep="\n",
)

Clone E Wells: ['G08' 'D02'] 
WT Wells: ['E02' 'B08'] 
All Holdout Wells: ['D06', 'E08', 'C06']
Holdout: Clone E Wells: ['D06'] 
Holdout: WT Wells: ['E08'] 
Holdout: Clone A Wells: ['C06']


## Load Cells

In [23]:
# Collect the ImageNumbers that belong to each group of selected wells
imagenumber_dict = {
    group: image_df.loc[image_df["Metadata_Well"].isin(wells), "ImageNumber"].tolist()
    for group, wells in (
        ("clone_e", clone_e_wells),
        ("wt", wt_wells),
        ("holdout", holdout_wells),
    )
}

imagenumber_dict

{'clone_e': [21,
  81,
  141,
  201,
  261,
  321,
  381,
  441,
  501,
  561,
  621,
  681,
  741,
  801,
  861,
  921,
  981,
  57,
  117,
  177,
  237,
  297,
  357,
  417,
  477,
  537,
  597,
  657,
  717,
  777,
  837,
  897,
  957,
  1017],
 'wt': [7,
  67,
  127,
  187,
  247,
  307,
  367,
  427,
  487,
  547,
  607,
  667,
  727,
  787,
  847,
  907,
  967,
  31,
  91,
  151,
  211,
  271,
  391,
  451,
  511,
  571,
  631,
  691,
  751,
  811,
  871,
  931,
  991],
 'holdout': [15,
  75,
  135,
  195,
  315,
  375,
  435,
  495,
  555,
  615,
  675,
  795,
  855,
  915,
  975,
  25,
  85,
  145,
  205,
  265,
  325,
  385,
  445,
  505,
  565,
  625,
  685,
  745,
  805,
  865,
  925,
  985,
  37,
  97,
  157,
  217,
  277,
  337,
  397,
  457,
  517,
  577,
  637,
  697,
  757,
  817,
  877,
  937,
  997]}

In [24]:
# Process the images of every group; clone_e and wt feed the training data,
# any other group (here: holdout) is kept separately
training_dict_df, holdout_dict_df = {}, {}
training_groups = ("clone_e", "wt")

for clone_type, clone_imagenumbers in imagenumber_dict.items():
    print(f"Now processing clone: {clone_type}")
    train_df = process_sites(
        connection=conn,
        imagenumbers=clone_imagenumbers,
        image_df=image_df,
        feature_filter=feature_filter,
        seed=seed,
        normalize=False,
    )
    print(train_df.shape)

    destination = training_dict_df if clone_type in training_groups else holdout_dict_df
    destination[clone_type] = train_df.reset_index(drop=True)

Now processing clone: clone_e
ImageNumber: 21
ImageNumber: 81
ImageNumber: 141
ImageNumber: 201
ImageNumber: 261
ImageNumber: 321
ImageNumber: 381
ImageNumber: 441
ImageNumber: 501
ImageNumber: 561
ImageNumber: 621
ImageNumber: 681
ImageNumber: 741
ImageNumber: 801
ImageNumber: 861
ImageNumber: 921
ImageNumber: 981
ImageNumber: 57
ImageNumber: 117
ImageNumber: 177
ImageNumber: 237
ImageNumber: 297
ImageNumber: 357
ImageNumber: 417
ImageNumber: 477
ImageNumber: 537
ImageNumber: 597
ImageNumber: 657
ImageNumber: 717
ImageNumber: 777
ImageNumber: 837
ImageNumber: 897
ImageNumber: 957
ImageNumber: 1017
(3782, 3426)
Now processing clone: wt
ImageNumber: 7
ImageNumber: 67
ImageNumber: 127
ImageNumber: 187
ImageNumber: 247
ImageNumber: 307
ImageNumber: 367
ImageNumber: 427
ImageNumber: 487
ImageNumber: 547
ImageNumber: 607
ImageNumber: 667
ImageNumber: 727
ImageNumber: 787
ImageNumber: 847
ImageNumber: 907
ImageNumber: 967
ImageNumber: 31
ImageNumber: 91
ImageNumber: 151
ImageNumber: 211
Imag

In [25]:
# Combine the training groups and shuffle the row order. The shuffle is seeded
# explicitly so the result does not depend on how often earlier cells advanced
# the global random state.
train_df = (
    pd.concat(training_dict_df)
    .sample(frac=1, random_state=seed)
    .reset_index(drop=True)
)

# NOTE(review): normalization runs before the split, so the scaler presumably
# also sees the rows that end up in the test set - confirm this is intended
train_df = normalize_sc(train_df, scaler_method=scaler_method)

# Hold out a fraction of the shuffled, normalized cells as the test set
train_df, test_df = train_test_split(train_df, test_size=test_split_prop, random_state=seed)

print(train_df.shape)
print(test_df.shape)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


(5468, 3426)
(965, 3426)


In [26]:
# Combine the holdout groups and shuffle the row order with an explicit seed so
# the result does not depend on the state of the global random generator
holdout_df = (
    pd.concat(holdout_dict_df)
    .sample(frac=1, random_state=seed)
    .reset_index(drop=True)
)

# NOTE(review): the holdout set is normalized on its own, so its scaling is
# presumably fit independently of the training data - confirm this is intended
holdout_df = normalize_sc(holdout_df, scaler_method=scaler_method)
print(holdout_df.shape)

  return self.partial_fit(X, y)


(5416, 3426)


  return self.fit(X, **fit_params).transform(X)


## Apply Feature Selection

In [27]:
# Record the metadata columns of the training data before feature selection;
# they are placed in front of the selected features when the columns of every
# data split are reordered afterwards
meta_features = infer_cp_features(train_df, metadata=True)
meta_features

['Metadata_TableNumber',
 'Metadata_ImageNumber',
 'Metadata_Plate',
 'Metadata_Well',
 'Metadata_plate_map_name',
 'Metadata_clone_number',
 'Metadata_plate_ID',
 'Metadata_plate_filename',
 'Metadata_treatment',
 'Metadata_treatment_time',
 'Metadata_cell_density']

In [28]:
# Run feature selection on the training data only
train_df = feature_select(
    train_df,
    operation=feature_select_opts,
    na_cutoff=na_cutoff,
    corr_threshold=corr_threshold,
)

# Column layout shared by all splits: metadata first, then the features that
# survived selection on the training data
selected_features = infer_cp_features(train_df)
reindex_features = meta_features + selected_features

train_df = train_df.reindex(columns=reindex_features)
test_df = test_df.reindex(columns=reindex_features)
holdout_df = holdout_df.reindex(columns=reindex_features)

In [29]:
# Shapes of the train, test and holdout splits after feature selection
print(train_df.shape, test_df.shape, holdout_df.shape, sep="\n")

(5468, 2180)
(965, 2180)
(5416, 2180)


## Output Files

In [30]:
# Write each split as a gzipped, tab-separated file under data/
output_dir = pathlib.Path("data")
# to_csv does not create missing directories, so make sure data/ exists first
output_dir.mkdir(parents=True, exist_ok=True)

for split_name, split_df in [
    ("train", train_df),
    ("test", test_df),
    ("holdout", holdout_df),
]:
    out_file = pathlib.Path(output_dir, f"example_{split_name}.tsv.gz")
    split_df.to_csv(out_file, sep="\t", compression="gzip", index=False)