In [1]:
import sys
import pathlib
import sqlite3

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features

from utils.single_cell_utils import process_sites
sys.path.append("../0.generate-profiles")
from scripts.profile_util import load_config

In [2]:
# Seed NumPy's global RNG so the random well selection further down is
# reproducible. Call numpy directly: the `pd.np` alias was deprecated in
# pandas 1.0 and removed in pandas 2.0. It pointed at the same module, so the
# random stream is unchanged.
np.random.seed(123)

In [3]:
# Notebook configuration: everything a reader might want to tune lives here.

# Which batch / plate to draw the example single cells from
batch = "2019_03_20_Batch2"
plate = "207106_exposure320"

# Options forwarded to `process_sites`
# NOTE(review): `feature_filter` presumably lists feature-name fragments to
# exclude - confirm against utils.single_cell_utils.process_sites
feature_filter = ["Object", "Location", "Count", "Parent"]
scaler_method = "standard"
test_split_prop = 0.15
seed = 123

# Options forwarded to pycytominer's `feature_select`
feature_select_opts = [
    "variance_threshold",
    "drop_na_columns",
    "blacklist",
    "drop_outliers",
]
na_cutoff = 0
# NOTE(review): no correlation-based operation is listed in
# `feature_select_opts`, so this threshold may have no effect - confirm
corr_threshold = 0.8

In [4]:
# Load locations of single cell files
# `load_config` comes from ../0.generate-profiles/scripts/profile_util.py and
# returns two objects: `pipeline` (indexed below by "workspace_dir" and
# "aggregate") and `single_cell_files` (indexed below by batch, then "plates",
# then plate).
# NOTE(review): append_sql_prefix=False presumably leaves the sqlite locations
# as plain paths rather than "sqlite:///" style URLs - confirm in profile_util
config = pathlib.Path("../0.generate-profiles/profile_config.yaml")
pipeline, single_cell_files = load_config(config, append_sql_prefix=False)

In [5]:
# Resolve the per-batch backend and metadata directories inside the workspace
workspace_dir = pipeline["workspace_dir"]
batch_dir = pathlib.Path(workspace_dir) / "backend" / batch
metadata_dir = pathlib.Path(workspace_dir) / "metadata" / batch

# The barcode plate map links each assay plate barcode to its plate map name
barcode_plate_map_file = metadata_dir / "barcode_platemap.csv"
barcode_plate_map_df = pd.read_csv(barcode_plate_map_file)

barcode_plate_map_df

Unnamed: 0,Assay_Plate_Barcode,Plate_Map_Name,Batch_Number
0,207106_exposure320,PlateMap_207106_exposure320,2


In [6]:
# Find the plate map name registered for the selected plate barcode
# (raises IndexError if the barcode is absent from the barcode plate map)
is_selected_plate = barcode_plate_map_df["Assay_Plate_Barcode"] == plate
plate_map_name = barcode_plate_map_df.loc[is_selected_plate, "Plate_Map_Name"].values[0]

# Load the tab-separated plate map for that plate
plate_map_file = pathlib.Path(metadata_dir) / "platemap" / f"{plate_map_name}.txt"
plate_map_df = pd.read_csv(plate_map_file, sep="\t")

# Ensure every plate map column carries the "Metadata_" prefix exactly once
plate_map_df.columns = [
    f"Metadata_{column}" if not column.startswith("Metadata_") else column
    for column in plate_map_df.columns
]
plate_map_df.head()

Unnamed: 0,Metadata_plate_map_name,Metadata_well_position,Metadata_CellLine,Metadata_Dosage
0,PlateMap_207106_exposure320,B02,WT,0.0
1,PlateMap_207106_exposure320,B03,WT,0.0
2,PlateMap_207106_exposure320,B04,WT,0.0
3,PlateMap_207106_exposure320,B05,CloneA,0.0
4,PlateMap_207106_exposure320,B06,CloneA,0.0


## Set Up Single Cell Connection

In [7]:
# Names of the plate and well columns, taken from the "aggregate" section of
# the pipeline configuration
aggregate_opts = pipeline["aggregate"]
plate_column = aggregate_opts["plate_column"]
well_column = aggregate_opts["well_column"]

In [8]:
# Establish connection to sqlite file
# The file location is looked up per batch and plate in the mapping returned
# by `load_config`.
# NOTE(review): `conn` is not closed in any of the visible cells - consider
# calling conn.close() once all single cells have been read.
single_cell_sqlite = single_cell_files[batch]["plates"][plate]
conn = sqlite3.connect(single_cell_sqlite)

In [9]:
# Read the identifying columns of every row in the `image` table
image_cols = f"TableNumber, ImageNumber, {plate_column}, {well_column}"
image_query = f"select {image_cols} from image"

# Annotate each image with the plate map metadata of its well. This is an
# inner join (the pandas default), so images whose well is missing from the
# plate map are dropped. The join key from the plate map side is redundant
# with the well column afterwards and is removed.
image_df = pd.merge(
    pd.read_sql_query(image_query, conn),
    plate_map_df,
    left_on=well_column,
    right_on="Metadata_well_position",
).drop(columns=["Metadata_well_position"])

print(image_df.shape)
image_df.head()

(324, 7)


Unnamed: 0,TableNumber,ImageNumber,Metadata_Plate,Metadata_Well,Metadata_plate_map_name,Metadata_CellLine,Metadata_Dosage
0,35063824554719371464203761079871132620,1,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0
1,55321079170264086709741025581452256546,37,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0
2,156575971115499494274828396611545171867,73,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0
3,177833145266349265724759827001312244688,109,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0
4,322613752142964989790892398074175721670,145,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0


## Identify Representative Wells

In [10]:
# Number of images available per cell line
image_df["Metadata_CellLine"].value_counts()

CloneE    108
CloneA    108
WT        108
Name: Metadata_CellLine, dtype: int64

In [11]:
# Randomly pick one representative well per cell line. The draws use NumPy's
# global RNG, so the selection is reproducible given the seed set at the top of
# the notebook. `np.random` is called directly because the `pd.np` alias was
# removed in pandas 2.0. The draw order (CloneE first, then WT) is kept so the
# selected wells do not change.
clone_e_candidates = image_df.query("Metadata_CellLine == 'CloneE'").Metadata_Well.unique()
wt_candidates = image_df.query("Metadata_CellLine == 'WT'").Metadata_Well.unique()

clone_e_well = np.random.choice(clone_e_candidates)
wt_well = np.random.choice(wt_candidates)

print(f"Clone E Well: {clone_e_well}", f"\nWT Well: {wt_well}")

Clone E Well: B10 
WT Well: B04


## Load Cells

In [12]:
# Image numbers acquired in each selected well, keyed by clone type
imagenumber_dict = {
    clone_type: image_df.loc[image_df["Metadata_Well"] == well, "ImageNumber"].tolist()
    for clone_type, well in [("clone_e", clone_e_well), ("wt", wt_well)]
}
imagenumber_dict

{'clone_e': [9, 45, 81, 117, 153, 189, 225, 261, 297],
 'wt': [3, 39, 75, 111, 147, 183, 219, 255, 291]}

In [13]:
# Build the train/test splits for each clone from its selected images.
# The per-clone results are accumulated in dedicated dicts so that `train_df`
# and `test_df` are only ever bound to DataFrames (previously the same names
# held dicts first and DataFrames afterwards).
train_parts = {}
test_parts = {}
for clone_type, clone_imagenumbers in imagenumber_dict.items():
    print(f"Now processing clone: {clone_type}")
    # `process_sites` returns a (train, test) pair for the given image numbers
    train_parts[clone_type], test_parts[clone_type] = process_sites(
        connection=conn,
        imagenumbers=clone_imagenumbers,
        image_df=image_df,
        feature_filter=feature_filter,
        scaler_method=scaler_method,
        seed=seed,
        test_split_prop=test_split_prop,
        normalize=True,
    )

# Stack all clones into one frame per split. Concatenating a dict adds the
# clone key as an outer index level, which reset_index(drop=True) discards.
train_df = pd.concat(train_parts).reset_index(drop=True)
test_df = pd.concat(test_parts).reset_index(drop=True)

Now processing clone: clone_e
ImageNumber: 9
ImageNumber: 45
ImageNumber: 81
ImageNumber: 117
ImageNumber: 153
ImageNumber: 189
ImageNumber: 225
ImageNumber: 261
ImageNumber: 297


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Now processing clone: wt
ImageNumber: 3
ImageNumber: 39
ImageNumber: 75
ImageNumber: 111
ImageNumber: 147
ImageNumber: 183
ImageNumber: 219
ImageNumber: 255
ImageNumber: 291


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


## Apply Feature Selection

In [14]:
# Shapes of the train and test splits before feature selection
print(train_df.shape, test_df.shape, sep="\n")

(13029, 3422)
(2310, 3422)


In [15]:
# Record the metadata column names of the training split before feature
# selection; they are prepended to the selected features when the test split
# is re-indexed in the next cell.
meta_features = infer_cp_features(train_df, metadata=True)
meta_features

['Metadata_TableNumber',
 'Metadata_ImageNumber',
 'Metadata_Plate',
 'Metadata_Well',
 'Metadata_plate_map_name',
 'Metadata_CellLine',
 'Metadata_Dosage']

In [16]:
# Run feature selection on the training split only.
# NOTE(review): this cell overwrites `train_df`, so re-running it applies the
# selection to the already-filtered frame - restart from the loading cell to
# reproduce.
train_df = feature_select(
    train_df,
    operation=feature_select_opts,
    na_cutoff=na_cutoff,
    corr_threshold=corr_threshold,
)

# Keep in the test split exactly the metadata columns plus the features that
# survived selection on the training split
selected_features = infer_cp_features(train_df)
reindex_features = meta_features + selected_features

test_df = test_df.reindex(columns=reindex_features)

In [17]:
# Shapes of the train and test splits once feature selection has been applied
print(train_df.shape, test_df.shape, sep="\n")

(13029, 1440)
(2310, 1440)


## Output Files

In [18]:
# Write both splits as gzip-compressed, tab-separated files under ./data.
# Create the output directory first so the notebook also runs where `data/`
# does not exist yet; `exist_ok=True` makes this a no-op when it already does.
output_dir = pathlib.Path("data")
output_dir.mkdir(parents=True, exist_ok=True)

out_file = pathlib.Path(output_dir, "example_train.tsv.gz")
train_df.to_csv(out_file, sep="\t", compression="gzip", index=False)

out_file = pathlib.Path(output_dir, "example_test.tsv.gz")
test_df.to_csv(out_file, sep="\t", compression="gzip", index=False)