In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib notebook
import black
import jupyter_black

jupyter_black.load(
    lab=False,
    line_length=79,
    verbosity="DEBUG",
    target_version=black.TargetVersion.PY310,
)
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn.preprocessing as sp
import pickle
import matplotlib.pyplot as plt
import scipy
import os
import time
from datetime import date
import dask.dataframe as dd
import dask.config
dask.config.set({"distributed.scheduler.allowed-failures": 10})
today = date.today()
import os
import sys
sys.path.insert(0, '/home/ubuntu/workspace_SingleCell/SingleCell_Morphological_Analysis/') 
from singlecell.read import read_single_cell_sql
from singlecell.preprocess import handle_nans, extract_cpfeature_names,find_highly_correlated_features
from singlecell.visualize import visualize_n_SingleCell
from singlecell.process import statistical_tests,precision_recall
from singlecell.preprocess.filter_out_edge_single_cells import edgeCellFilter
from singlecell.save.save_pandas_dfs import saveDF_to_CSV_GZ_no_timestamp
from singlecell.preprocess.control_for_cellcount import control_feature_y_for_variable_x
from singlecell.process.replicate_correlation import replicate_null_corr_coefs
from singlecell.process import normalize_funcs

ModuleNotFoundError: No module named 'black'

In [None]:
# Project root on the mounted bucket, and the folder the result files of this
# notebook are written to.
mito_project_root_dir = (
    "/home/ubuntu/bucket/projects/"
    "2016_08_01_RadialMitochondriaDistribution_donna/"
)
# The root already ends with "/", so the joined path contains "//".
save_results_dir = f"{mito_project_root_dir}/workspace/results/jump_fq/"

In [None]:
# Build and save one annotation table per JUMP perturbation set
# (jump_orf / jump_crispr / jump_compound) by joining the well table with the
# perturbation table and the plate table.

# ---- raw JUMP metadata tables
plates = pd.read_csv(
    f"{mito_project_root_dir}/workspace/metadata/JUMP/plate.csv.gz"
)
wells = pd.read_csv(
    f"{mito_project_root_dir}/workspace/metadata/JUMP/well.csv.gz"
)
compound = pd.read_csv(
    f"{mito_project_root_dir}/workspace/metadata/JUMP/compound.csv.gz"
)
orf = pd.read_csv(
    f"{mito_project_root_dir}/workspace/metadata/JUMP/orf.csv.gz"
)
crispr = pd.read_csv(
    f"{mito_project_root_dir}/workspace/metadata/JUMP/crispr.csv.gz"
)

# Compound wells are only joined against plates whose type is COMPOUND.
compound_plates = plates.loc[
    plates["Metadata_PlateType"] == "COMPOUND"
].reset_index(drop=True)

# ---- ORF: wells x orf x plates, controls flagged by symbol
dataset = "jump_orf"
annot_orf = pd.merge(
    pd.merge(wells, orf, on=["Metadata_JCP2022"]),
    plates,
    on=["Metadata_Plate", "Metadata_Source"],
).assign(
    Batch=lambda t: t["Metadata_Batch"],
    batch_plate=lambda t: t["Metadata_Batch"] + "-" + t["Metadata_Plate"],
    ctrl_well=lambda t: t["Metadata_Symbol"].isin(
        ["LacZ", "BFP", "HcRed", "LUCIFERASE"]
    ),
)
annot_orf.to_csv(
    f"{mito_project_root_dir}/workspace/metadata/preprocessed/"
    f"annot_{dataset}.csv"
)

# ---- CRISPR: wells x crispr x plates, controls are the non-targeting guides
dataset = "jump_crispr"
annot_crispr = pd.merge(
    pd.merge(wells, crispr, on=["Metadata_JCP2022"]),
    plates,
    on=["Metadata_Plate", "Metadata_Source"],
).assign(
    Batch=lambda t: t["Metadata_Batch"],
    batch_plate=lambda t: t["Metadata_Batch"] + "-" + t["Metadata_Plate"],
    ctrl_well=lambda t: t["Metadata_Symbol"].isin(["non-targeting"]),
)
annot_crispr.to_csv(
    f"{mito_project_root_dir}/workspace/metadata/preprocessed/"
    f"annot_{dataset}.csv"
)

# ---- Compound: wells x compound x COMPOUND plates.
# No ctrl_well column is added for this table.
dataset = "jump_compound"
annot_compound = pd.merge(
    pd.merge(wells, compound, on=["Metadata_JCP2022"]),
    compound_plates,
    on=["Metadata_Plate", "Metadata_Source"],
).assign(
    batch_plate=lambda t: t["Metadata_Batch"] + "-" + t["Metadata_Plate"],
    Batch=lambda t: t["Metadata_Batch"],
)
annot_compound.to_csv(
    f"{mito_project_root_dir}/workspace/metadata/preprocessed/"
    f"annot_{dataset}.csv"
)

In [None]:
import pandas as pd
from sqlalchemy import create_engine
from functools import reduce
import gc


def read_per_well_data_parquet(input_data_dir, annot):
    """Load and stack the per-plate parquet profiles listed in ``annot``.

    For each batch in ``annot["Batch"]`` the profiles are looked up at
    ``<profiles>/<batch>/<plate>/<plate>.parquet`` where ``<profiles>`` is
    ``input_data_dir + <source> + "/workspace/profiles/"`` when ``annot`` has
    a ``Metadata_Source`` column (the first source value found for the batch
    is used) and ``input_data_dir + "/workspace/profiles/"`` otherwise.
    Only plates that are listed in ``annot`` for the batch *and* exist as
    entries of the batch folder are read.

    Parameters
    ----------
    input_data_dir : str
        Path prefix of the profiles tree; it is string-concatenated, so the
        caller controls the trailing separator.
    annot : pandas.DataFrame
        Needs the columns ``Batch`` and ``Metadata_Plate``;
        ``Metadata_Source`` is optional.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of every frame read, with ``Metadata_Batch``
        and ``Metadata_Plate`` set to the batch / plate the rows came from.
        An empty DataFrame when no profile could be read (missing batch
        folders and missing parquet files are reported and skipped).
    """
    batches = annot["Batch"].unique()

    df_agg_all_batches_ls = []
    for b in batches:
        print(b)
        if "Metadata_Source" in annot.columns:
            source_str = annot.loc[
                annot["Batch"] == b, "Metadata_Source"
            ].unique()[0]
            print(source_str)
            profile_path = input_data_dir + source_str + "/workspace/profiles/"
        else:
            profile_path = input_data_dir + "/workspace/profiles/"

        # A batch listed in the metadata may have no folder on disk; skip it
        # instead of letting os.listdir abort the whole load.
        batch_dir = profile_path + b
        if not os.path.isdir(batch_dir):
            print(batch_dir, " not exists")
            continue

        plates_meta = annot.loc[annot["Batch"] == b, "Metadata_Plate"].unique()
        # Sorted so the row order of the result is reproducible between runs
        # (plain set iteration order is not).
        plates = sorted(set(plates_meta) & set(os.listdir(batch_dir)))

        df_sag_ls = []
        for p in plates:
            print(p)

            fileName = profile_path + b + "/" + p + "/" + p + ".parquet"
            if os.path.exists(fileName):
                sc_df = pd.read_parquet(fileName)
                sc_df["Metadata_Batch"] = b
                sc_df["Metadata_Plate"] = p
                df_sag_ls.append(sc_df)
                # The frame stays referenced by the list; this only drops the
                # local name and collects whatever else is unreachable.
                del sc_df
                gc.collect()
            else:
                print(fileName, " not exists")

        if df_sag_ls:
            df_sag = pd.concat(df_sag_ls, axis=0)
            df_agg_all_batches_ls.append(df_sag)

    # pd.concat raises on an empty list; hand back an empty frame so the
    # caller can test `.empty`.
    if not df_agg_all_batches_ls:
        print("no parquet profiles found under", input_data_dir)
        return pd.DataFrame()

    df_agg_all_batches = pd.concat(df_agg_all_batches_ls, axis=0)
    return df_agg_all_batches

In [None]:
# ---- Metadata columns associated with each dataset
lincs_meta_cols = [
    "Metadata_broad_sample", "Metadata_dose_recode",
    "Metadata_pert_id", "Metadata_pert_mfc_id",
    "Metadata_InChIKey14", "Metadata_pert_type",
    "Metadata_moa", "Metadata_target",
    "Metadata_pert_id_dose", "Metadata_pert_name",
]
# Shorter alternative that was used at some point:
# lincs_meta_cols=['Metadata_broad_sample','Metadata_dose_recode','Metadata_pert_id','Metadata_pert_mfc_id',\
# 'Metadata_InChIKey14','Metadata_pert_type','Metadata_pert_id_dose']

cdrp_meta_cols = [
    "Metadata_broad_sample", "Metadata_mmoles_per_liter2",
    "Metadata_pert_id", "Metadata_Sample_Dose", "Metadata_moa",
]
jumporf_meta_cols = ["Symbol", "broad_sample"]
jumpcrispr_meta_cols = ["Metadata_NCBI_Gene_ID", "Metadata_Symbol"]
jumpcompound_meta_cols = ["Metadata_InChIKey", "Metadata_InChI"]
taorf_meta_cols = [
    "Metadata_gene_name", "Metadata_pert_name",
    "Metadata_broad_sample", "Metadata_moa",
]

# ---- Per-dataset settings
#   profiles_path     : prefix of the profile files on disk
#   meta_cols         : the metadata column list defined above
#   pert_col          : column that identifies a perturbation
#   untreated_key_val : [column, value] pair that marks untreated/control rows
#
# Earlier location of the jump_orf profiles:
#   /home/ubuntu/jumpbucket/projects/2021_04_26_Production/workspace/backend/
jump_orf_params = dict(
    profiles_path="/home/ubuntu/gallery/cpg0016-jump/",
    meta_cols=jumporf_meta_cols,
    pert_col="Metadata_JCP2022",
    untreated_key_val=["Metadata_pert_type", "negcon"],
)

# Note: CDRP has no untreated_key_val entry.
cdrp_params = dict(
    profiles_path="/home/ubuntu/gallery/cpg0012-wawer-bioactivecompoundprofiling/broad/workspace/backend/",
    meta_cols=cdrp_meta_cols,
    pert_col="Metadata_Sample_Dose",
)

# Related locations noted by the author:
# /home/ubuntu/bucket/projects/2015_10_05_DrugRepurposing_AravindSubramanian_GolubLab_Broad/workspace/backend/
# https://cellpainting-gallery.s3.amazonaws.com/cpg0004-lincs/broad/workspace/backend/2016_04_01_a549_48hr_batch1/SQ00014812
lincs_params = dict(
    profiles_path="/home/ubuntu/gallery/cpg0004-lincs/",
    meta_cols=lincs_meta_cols,
    pert_col="Metadata_pert_id_dose",
    untreated_key_val=["Metadata_pert_type", "control"],
)

jump_crispr_params = dict(
    profiles_path="/home/ubuntu/gallery/cpg0016-jump/",
    meta_cols=jumpcrispr_meta_cols,
    pert_col="Metadata_JCP2022",
    untreated_key_val=["Metadata_Symbol", "non-targeting"],
)

jump_compound_params = dict(
    profiles_path="/home/ubuntu/gallery/cpg0016-jump/",
    meta_cols=jumpcompound_meta_cols,
    pert_col="Metadata_JCP2022",
    untreated_key_val=["Metadata_JCP2022", "JCP2022_999999"],
)

ta_orf_params = dict(
    profiles_path="/home/ubuntu/gallery/cpg0017-rohban-pathways/broad/workspace/backend/",
    meta_cols=taorf_meta_cols,
    pert_col="Metadata_broad_sample",
    untreated_key_val=["Metadata_pert_type", "Untreated"],
)

# Lookup used by the analysis cells: dataset tag -> settings
ds_info_dict = dict(
    jump_orf=jump_orf_params,
    CDRP=cdrp_params,
    lincs=lincs_params,
    jump_crispr=jump_crispr_params,
    jump_compound=jump_compound_params,
    taorf=ta_orf_params,
)
# 'broad_sample', 'pert_type', 'control_type'

# results=annot[['Symbol','broad_sample', 'pert_type', 'control_type']].drop_duplicates().reset_index(drop=True)

# dataset='CDRP';dataset_meta_hue='Metadata_moa'
# dataset='lincs';dataset_meta_hue='Metadata_moa'
# dataset='jump_orf';dataset_meta_hue='Symbol'

In [None]:
# Inspection only: distinct Metadata_Symbol values in the CRISPR annotation
# table ("non-targeting" is the value used to flag its control wells).
annot_crispr["Metadata_Symbol"].unique()

In [None]:
# annot_crispr.groupby(['Metadata_Batch','Metadata_Source']).size()

In [None]:
# annot_compound.groupby(['Metadata_Batch','Metadata_Source']).size()

In [None]:
# Pick ONE dataset tag; the alternatives stay commented out so only a single
# assignment takes effect.
# dataset = "jump_crispr"
# dataset = "jump_compound"
dataset = "lincs"
# dataset="taorf"

##################### Read preprocessed metadata
# Metadata_Plate is forced to str so plate names are not parsed as numbers.
annot = pd.read_csv(
    mito_project_root_dir
    + "/workspace/metadata/preprocessed/annot_"
    + dataset
    + ".csv",
    dtype={"Metadata_Plate": str},
)
annot['Metadata_pert_type']

In [None]:
# Inspection only: distinct Metadata_pert_type values in the currently loaded
# `annot` table.
annot['Metadata_pert_type'].unique()

In [None]:
# Inspection only: number of annotation rows per source whose JCP2022 id is
# JCP2022_999999 (the value configured as untreated for jump_compound).
jcp_999999_rows = annot.loc[annot["Metadata_JCP2022"] == "JCP2022_999999"]
jcp_999999_rows.groupby("Metadata_Source").size()

## Some useful stats
- jump-orf
   - cp_features: 3578
   - cp_features_analysis/nan cols/low vars: 3419 0 159
   - before dropping nan rows:  (91001, 4607)
   - after dropping nan rows:  (90997, 4607)
   
- jump-compound source 1
   - cp_features: 3673
   - cp_features_analysis/nan cols/low vars: 3392 0 186
   - before dropping nan rows:  (74843, 4580)
   - after dropping nan rows:  (74049, 4580) 
   

In [None]:
%time

# dataset = "jump_orf"
# dataset='CDRP'
dataset='lincs'
# dataset="jump_crispr"
# dataset = "jump_compound"
# dataset="taorf"

##################### Read preprocessed metadata
annot = pd.read_csv(
    mito_project_root_dir
    + "/workspace/metadata/preprocessed/annot_"
    + dataset
    + ".csv",
    dtype={"Metadata_Plate": str},
)

# annot = annot[annot["Metadata_Batch"] == "2021_04_26_Batch1"].reset_index(
#     drop=True
# )

# target_features_list = ds_info_dict[dataset]["target_features_list"]

sources = annot["Metadata_Source"].unique()
for si in sources:

    annot_source = annot[annot["Metadata_Source"] == si].reset_index()

    ##################### Read per well data
    if (
        dataset == "lincs"
    ):  # for this batch of lincs data we dont have well level profiles
        df_agg_all_batches = sample_single_cells_from_sql(
            ds_info_dict[dataset]["profiles_path"], annot_source
        )
    else:
        #     df_agg_all_batches=read_per_well_data_csvs(ds_info_dict[dataset]["profiles_path"],annot);
        df_agg_all_batches = read_per_well_data_parquet(
            ds_info_dict[dataset]["profiles_path"], annot_source
        )

    ##################### Clean and shrink features
    (
        cp_features,
        cp_features_analysis_0,
    ) = extract_cpfeature_names.extract_cpfeature_names(df_agg_all_batches)
    df_sag, cp_features_analysis = handle_nans.handle_nans(
        df_agg_all_batches,
        cp_features_analysis_0,
        thrsh_null_ratio=0.05,
        thrsh_std=0.001,
        fill_na_method="drop-rows",
    )

    ##################### merge all aggregated profiles with annot
    common_cols_2merge = list(set(annot_source.columns) & set(df_sag.columns))
    df_sag["Metadata_Plate"] = df_sag["Metadata_Plate"].astype(str)
    df_sag = pd.merge(df_sag, annot_source, how="inner", on=common_cols_2merge)

    ##################### normalize to negative control
    if 0:
        df_sag_zscored = normalize_funcs.zscore_df_columns_by_control_perPlate(
            df_sag,
            cp_features_analysis,
            "Standard",
            "batch_plate",
            ds_info_dict[dataset]["untreated_key_val"],
        )

        df_sag_zscored = df_sag_zscored[
            ds_info_dict[dataset]["untreated_key_val"][0]
            != ds_info_dict[dataset]["untreated_key_val"][1]
        ].reset_index()
    else:
        df_sag_zscored = df_sag.copy()

    ##################### per plate standardization
    df_rep_level_scaled = normalize_funcs.standardize_per_catX(
        df_sag_zscored, "batch_plate", cp_features_analysis
    ).copy()

    df_rep_level_scaled = df_rep_level_scaled[
        df_rep_level_scaled[ds_info_dict[dataset]["untreated_key_val"][0]]
        != ds_info_dict[dataset]["untreated_key_val"][1]
    ].reset_index()

    ##################### calculate and save replication correlatiopn curves
    pert_col = ds_info_dict[dataset]["pert_col"]
    nOfReps = df_rep_level_scaled.groupby([pert_col]).size().reset_index()
    pairWithReplicates = nOfReps.loc[nOfReps[0] != 1, :].reset_index()[
        pert_col
    ]  # .groupby([0]).size()

    df_rep_level_scaled[cp_features_analysis] = df_rep_level_scaled[
        cp_features_analysis
    ].astype("float32")

    df_rep_level_scaled.to_csv(
        save_results_dir
        + "/"
        + dataset
        + "_df_rep_level_scaled_"
        + si
        + ".csv",
        index=False,
    )

    if 1:
        fh_2save, repCorrDf = replicate_null_corr_coefs(
            df_rep_level_scaled[
                df_rep_level_scaled[pert_col].isin(pairWithReplicates)
            ].reset_index(drop=True),
            pert_col,
            cp_features_analysis,
            1,
            title="",
            hist_bins=50,
        )

        fh_2save.savefig(
            save_results_dir + "/" + dataset + "_corr_curves_" + si + ".png"
        )
        repCorrDf.to_csv(
            save_results_dir + "/" + dataset + "_corr_df_" + si + ".csv",
            index=False,
        )

    ##################### calculate and save feature quality rannkings
    #     df_rep_level_scaled2 = (
    #         df_rep_level_scaled.groupby([pert_col, "batch_plate"])
    #         .mean()
    #         .reset_index()
    #     )

    df_rep_level_scaled2 = df_rep_level_scaled[
        df_rep_level_scaled[pert_col].isin(pairWithReplicates)
    ].reset_index(drop=True)

    DS_featureQ_rank = pd.DataFrame(
        data=df_rep_level_scaled2.groupby(pert_col)
        .std()
        .mean()[cp_features_analysis],
        index=cp_features_analysis,
        columns=["average_std_score"],
    )

    batches = annot_source["Batch"].unique()
    for b in batches:
        df_rep_per_batch = df_rep_level_scaled2[
            df_rep_level_scaled2["Batch"] == b
        ].reset_index(drop=True)
        nOfReps = df_rep_per_batch.groupby([pert_col]).size().reset_index()
        pairWithReplicates_batch = nOfReps.loc[
            nOfReps[0] != 1, :
        ].reset_index()[pert_col]
        df_rep_per_batch2 = df_rep_per_batch[
            df_rep_per_batch[pert_col].isin(pairWithReplicates_batch)
        ].reset_index(drop=True)

        DS_featureQ_rank["average_std_score_" + b] = (
            df_rep_per_batch2.groupby(pert_col)
            .std()
            .mean()[cp_features_analysis]
        )

    DS_featureQ_rank2 = (
        DS_featureQ_rank.sort_values(by="average_std_score", ascending=True)
        .reset_index()
        .rename(columns={"index": "features"})
    )

    DS_featureQ_rank2.to_csv(
        save_results_dir + "/" + dataset + "_feature_quality_" + si + ".csv",
        index=False,
    )

In [None]:
# Inspection only: rows of the last scaled table that still carry the
# dataset's untreated marker.
untreated_col, untreated_val = ds_info_dict[dataset]["untreated_key_val"]
df_rep_level_scaled[df_rep_level_scaled[untreated_col] == untreated_val]

In [None]:
# Inspection only: number of rows per perturbation among the perturbations
# that have replicates, smallest first. Grouped by `pert_col` (the same column
# the filter uses) instead of a hard-coded "Metadata_JCP2022", so the cell
# also works for datasets whose perturbation column is a different one.
df_rep_level_scaled[
    df_rep_level_scaled[pert_col].isin(pairWithReplicates)
].reset_index(drop=True).groupby(pert_col).size().sort_values()

In [None]:
# Dask variant of the feature-quality ranking.
# NOTE(review): `reshaped_feature_rep_level_df` is not defined anywhere in the
# visible notebook, so this cell fails on a fresh kernel - confirm where that
# frame is supposed to be built (presumably one row per feature/replicate with
# a "features" column; verify).
# Split the pandas frame into 10 dask partitions.
reshaped_feature_rep_level_ddf = dd.from_pandas(
    reshaped_feature_rep_level_df, npartitions=10
)


# Per-feature std of every column, averaged across columns (axis=1), sorted
# ascending. This rebinds DS_featureQ_rank, replacing the pandas table of the
# same name built by the main analysis cell.
DS_featureQ_rank = (
    reshaped_feature_rep_level_ddf.groupby("features")
    .std()
    .mean(axis=1)
    .reset_index()
    .sort_values(by=0, ascending=True)
    .reset_index(drop=True)
)

# Nothing is evaluated until compute() is called.
DS_featureQ_rank.compute()