In [None]:
import logging
import stats
import plotly.express as px
import pandas as pd
import numpy as np
from utils import find_meta_cols, find_feat_cols
# Root logging setup: INFO and above, formatted as
# LEVEL:timestamp:logger-name:message.
LOG_FORMAT = '%(levelname)s:%(asctime)s:%(name)s:%(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logger = logging.getLogger(__name__)

In [None]:
# Load every profile, drop unusable feature columns, and compute statistics
# of the negative controls for all the plates.
logger.info('Loading data')
dframe = stats.load_profiles()
logger.info('Removing nan and inf columns')
dframe = stats.remove_nan_infs_columns(dframe)
# Negative controls are the rows whose Metadata_JCP2022 matches stats.DMSO.
negcon = dframe.query('Metadata_JCP2022 == @stats.DMSO')
logger.info('computing stats for negcons')
all_neg_stats = stats.get_stats(negcon)
logger.info('stats done.')
# NOTE(review): the return value is discarded, so add_metadata presumably
# mutates all_neg_stats in place; later cells rely on Metadata_Plate and
# Metadata_Source being present in it -- confirm in stats.py.
stats.add_metadata(all_neg_stats, dframe[find_meta_cols(dframe)])

In [None]:
# variant_features: features whose negative-control stats satisfy
# mad != 0 and abs_coef_var > 1e-3 on every plate that still has rows after
# the filter. neg_stats is then restricted to those features.
neg_stats = all_neg_stats.query('mad!=0 and abs_coef_var>1e-3')
features_per_plate = neg_stats.groupby('Metadata_Plate')['feature'].agg(set)
if features_per_plate.empty:
    # set.intersection() with no arguments fails with an unhelpful TypeError.
    raise ValueError(
        'No negative-control statistics pass the mad/abs_coef_var filter')
# Sorted so the feature order (and every downstream column order) is the same
# on each run; the iteration order of a set of strings is not.
variant_features = sorted(set.intersection(*features_per_plate))
neg_stats = neg_stats.query('feature in @variant_features')
len(variant_features)

In [None]:
# MAD vs. median of the negative-control stats, restricted to source 6.
scatter = neg_stats.query('Metadata_Source=="source_6"')
# Families ordered by their largest mad/median value, descending.
# Built-in aggregations replace groupby.apply(lambda ...): same result, no
# per-group Python call, and no dependence on how apply treats the grouping
# column.
family_order = (
    scatter
    .groupby('family')[['mad', 'median']]
    .max()
    .max(axis=1)
    .sort_values(ascending=False)
    .index
    .tolist()
)
fig = px.scatter(scatter,
                 x='median',
                 y='mad',
                 symbol='family',
                 hover_name='feature',
                 hover_data=['Metadata_Plate'],
                 color='family',
                 size='abs_coef_var',
                 category_orders={'family': family_order})
fig.update_layout(
    coloraxis_colorbar=dict(yanchor="top", y=1, x=0, ticks="outside"))
# Keep the points with the smallest abs_coef_var visible.
fig.update_traces(marker_sizemin=6)

In [None]:
# Parameters for MAD normalization: one row per plate, one column per
# feature, for both the MAD and the median of the negative controls.
plate_by_feature = dict(index='Metadata_Plate', columns='feature')
mads = neg_stats.pivot(values='mad', **plate_by_feature)
medians = neg_stats.pivot(values='median', **plate_by_feature)

In [None]:
# Get normalized features with epsilon = 0 for all plates that have MAD stats:
# (value - plate median) / plate MAD, using negative-control statistics.
feats = dframe.query('Metadata_Plate in @mads.index')
feat_cols = mads.columns
plates = feats['Metadata_Plate'].to_numpy()
# Look up each row's plate parameters explicitly. Subtracting the per-plate
# frames directly relies on index alignment over duplicated plate labels,
# whose resulting row order is not guaranteed to match `feats`; the metadata
# below is attached by position, so any reordering would mislabel rows.
plate_medians = medians.loc[plates, feat_cols].to_numpy()
plate_mads = mads.loc[plates, feat_cols].to_numpy()
fnorm = pd.DataFrame(
    (feats[feat_cols].to_numpy() - plate_medians) / plate_mads,
    columns=feat_cols)
# Rows of fnorm are in the same order as feats, so positional assignment of
# the metadata columns is safe.
for c in find_meta_cols(feats):
    fnorm[c] = feats[c].values

In [None]:
# Summary statistics (describe) of the normalized variant features
desc = fnorm.loc[:, variant_features].describe()

In [None]:
# Number of features in each of 50 quantile bins of the per-feature maximum
max_per_feature = desc.loc['max']
pd.qcut(max_per_feature, q=50).value_counts().sort_index()

In [None]:
# The 40 features with the largest maximum normalized value
desc.transpose().sort_values('max', ascending=False).iloc[:40]

In [None]:
# Row of fnorm holding the largest normalized value of the outlier column,
# then its metadata.
outlier_col = 'Nuclei_Texture_SumVariance_DNA_5_00_256'
# idxmax avoids sorting the whole frame to fetch one row and skips NaNs,
# whereas sort_values places NaNs last, where iloc[-1] would pick them up.
outlier = fnorm.loc[fnorm[outlier_col].idxmax()]
outlier[find_meta_cols(outlier.index)]

In [None]:
# Distribution (30 quantile bins) of the normalized variant features of the
# sample that holds the highest value
outlier_values = outlier[variant_features].astype(np.float32)
pd.qcut(outlier_values, q=30).value_counts().sort_index()

In [None]:
# Raw (un-normalized) value of the outlier column for the outlier's
# source / plate / well
is_outlier_well = (
    (dframe['Metadata_Source'] == outlier['Metadata_Source'])
    & (dframe['Metadata_Plate'] == outlier['Metadata_Plate'])
    & (dframe['Metadata_Well'] == outlier['Metadata_Well'])
)
dframe.loc[is_outlier_well, outlier_col]

In [None]:
# Summary of the raw outlier-column values of the negative controls on the
# outlier's plate
negcon.loc[negcon['Metadata_Plate'] == outlier['Metadata_Plate'],
           outlier_col].describe()

In [None]:
# Summary of the raw outlier-column values of every well on the outlier's plate
dframe.loc[dframe['Metadata_Plate'] == outlier['Metadata_Plate'],
           outlier_col].describe()

In [None]:
# The last ten raw values of the column on the outlier's plate after an
# ascending sort
on_outlier_plate = dframe['Metadata_Plate'] == outlier['Metadata_Plate']
dframe.loc[on_outlier_plate, outlier_col].sort_values().tail(10)

In [None]:
# Metadata of the sample sitting in the middle of fnorm once it is ordered
# by the outlier column
middle_position = len(fnorm) // 2
ranked_by_outlier_col = fnorm.sort_values(by=outlier_col)
median_sample = ranked_by_outlier_col.iloc[middle_position]
median_sample[find_meta_cols(median_sample.index)]

In [None]:
# MAD vs. median scatter (source 6 stats) limited to the 40 features with the
# largest maximum normalized value
features_max_1e5 = (
    desc.transpose()
    .sort_values('max', ascending=False)
    .iloc[:40]
    .index
)
outlier_feature_stats = scatter[scatter['feature'].isin(features_max_1e5)]
fig = px.scatter(
    outlier_feature_stats,
    x='median',
    y='mad',
    color='family',
    symbol='family',
    size='abs_coef_var',
    hover_name='feature',
    hover_data=['Metadata_Plate'],
    category_orders={'family': family_order},
)
fig.update_layout(
    coloraxis_colorbar={'yanchor': "top", 'y': 1, 'x': 0, 'ticks': "outside"})
fig.update_traces(marker_sizemin=6)

In [None]:
# Negative-control stats of the outlier column on the outlier's plate,
# transposed so that each statistic is a row
outlier_stats_mask = (
    (neg_stats['Metadata_Plate'] == outlier['Metadata_Plate'])
    & (neg_stats['feature'] == outlier_col)
)
neg_stats[outlier_stats_mask].transpose()