## Fit a linear model on cell morphology features

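We aim to determine which features are significantly impacted by drug treatment, after adjusting for the number of cells per well. For each CellProfiler feature we fit a linear model of the feature on cell count per well and numeric treatment dose, and record the fitted coefficients and the model $R^2$.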

In [1]:
import pathlib
import pandas as pd

from sklearn.linear_model import LinearRegression

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features

In [2]:
# Inputs: the plate to analyze and the file name suffix of its feature file
plate = "localhost220513100001_KK22-05-198_FactinAdjusted"  # Focusing on plate 2
file_suffix = "_sc_norm_fs_cellprofiler_ic.csv.gz"

# Feature files live in the processing module's data directory,
# three levels above this notebook
data_dir = pathlib.Path("..") / ".." / ".." / "3.process_cfret_features" / "data"
cp_file = data_dir / f"{plate}{file_suffix}"

# Outputs: one tab-separated table of linear model results per plate
output_dir = pathlib.Path("results")
output_cp_file = output_dir / f"{plate}_linear_model_cp_features.tsv"

In [3]:
# Load data
cp_df = pd.read_csv(cp_file)

# Drop NA columns
# NOTE(review): na_cutoff=0 presumably drops every feature column that contains
# at least one missing value - confirm against the pycytominer documentation
cp_df = feature_select(
    cp_df,
    operation="drop_na_columns",
    na_cutoff=0
)

# Count number of cells (rows) per well and add to dataframe as metadata.
# size() counts rows directly, so the count does not depend on whether any
# particular column happens to contain missing values.
cell_count_df = (
    cp_df.groupby("Metadata_Well")
    .size()
    .reset_index(name="Metadata_cell_count_per_well")
)

# Merging with the counts on the left puts the well and its cell count
# in the first two columns of the resulting dataframe
cp_df = cell_count_df.merge(cp_df, on=["Metadata_Well"])

# Clean the dose column to extract numeric value
# (str.strip removes leading/trailing "u" and "M" characters, e.g. "5uM" -> "5")
cp_df = cp_df.assign(Metadata_dose_numeric=cp_df.Metadata_dose.str.strip("uM").astype(float))

# Define CellProfiler features
cp_features = infer_cp_features(cp_df)

print(f"We are testing {len(cp_features)} CellProfiler features")
print(cp_df.shape)
cp_df.head()

We are testing 505 CellProfiler features
(17995, 521)


Unnamed: 0.1,Metadata_Well,Metadata_cell_count_per_well,Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_number_of_singlecells,Metadata_heart_number,Metadata_treatment,Metadata_dose,Metadata_ImageNumber,...,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumEntropy_ER_3_03_256,Nuclei_Texture_SumEntropy_Hoechst_3_00_256,Nuclei_Texture_SumEntropy_Mitochondria_3_03_256,Nuclei_Texture_SumEntropy_PM_3_01_256,Nuclei_Texture_SumVariance_ER_3_03_256,Nuclei_Texture_SumVariance_Hoechst_3_01_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_dose_numeric
0,A09,357,0,A,9,357,9,drug_x,5uM,1,...,-0.889765,0.584458,-0.756811,1.646285,1.627744,-0.130277,-0.32954,0.168744,0.223935,5.0
1,A09,357,1,A,9,357,9,drug_x,5uM,1,...,0.177754,-1.035615,-0.488188,-0.927906,-0.37853,-0.45207,-0.30925,-0.179711,-0.193299,5.0
2,A09,357,2,A,9,357,9,drug_x,5uM,1,...,-1.693589,0.386622,0.999544,1.192634,1.117077,-0.112725,0.048069,-0.020985,0.010248,5.0
3,A09,357,3,A,9,357,9,drug_x,5uM,1,...,-1.901882,1.253318,0.376866,1.911982,1.972299,1.347963,-0.199487,0.628766,0.704861,5.0
4,A09,357,4,A,9,357,9,drug_x,5uM,1,...,-0.113685,-0.535773,-0.974978,-0.045518,0.206396,-0.412969,-0.336288,-0.155509,-0.156673,5.0


## Fit linear model

In [4]:
# Build the design matrix shared by every per-feature model:
# one column per covariate, in the order listed here
variables = [
    "Metadata_cell_count_per_well",
    "Metadata_dose_numeric",
]
X = cp_df[variables]

print(X.shape)
X.head()

(17995, 2)


Unnamed: 0,Metadata_cell_count_per_well,Metadata_dose_numeric
0,357,5.0
1,357,5.0
2,357,5.0
3,357,5.0
4,357,5.0


In [5]:
# Fit one linear model per CellProfiler feature, using the covariates in X
# (cell count per well and numeric dose) as predictors of that feature
lm_records = []
for cp_feature in cp_features:
    # Subset CP data to each individual feature (the response of this model)
    cp_subset_df = cp_df.loc[:, cp_feature]

    # Fit linear model
    lm = LinearRegression(fit_intercept=True)
    lm.fit(X=X, y=cp_subset_df)

    # Extract Beta coefficients
    # (contribution of each X covariate to the feature, in X column order)
    coef = lm.coef_

    # Estimate fit (R^2) on the same data the model was fit to
    r2_score = lm.score(X=X, y=cp_subset_df)

    # Add results to a growing list
    lm_records.append([cp_feature, r2_score] + list(coef))

# Convert results to a pandas DataFrame
# (the two coefficient column names follow the column order of X)
lm_results = pd.DataFrame(
    lm_records,
    columns=["feature", "r2_score", "cell_count_coef", "treatment_dose_coef"]
)

# Output file (create the results directory first so a fresh checkout can write)
output_cp_file.parent.mkdir(parents=True, exist_ok=True)
lm_results.to_csv(output_cp_file, sep="\t", index=False)

print(lm_results.shape)
lm_results.head()

(505, 4)


Unnamed: 0,feature,r2_score,cell_count_coef,treatment_dose_coef
0,Cytoplasm_AreaShape_Compactness,0.02647,0.000403,-0.050196
1,Cytoplasm_AreaShape_FormFactor,0.070085,-0.000828,0.074347
2,Cytoplasm_AreaShape_MajorAxisLength,0.05939,-0.001345,0.025609
3,Cytoplasm_AreaShape_MinorAxisLength,0.061127,-0.001351,0.027655
4,Cytoplasm_AreaShape_Orientation,8.6e-05,6e-06,0.003491
