## Fit a linear model on cell morphology features

We aim to determine which features are significantly impacted by drug treatment after adjusting for the number of cells per well.

In [1]:
import pathlib
import pandas as pd

from sklearn.linear_model import LinearRegression

from pycytominer import feature_select
from pycytominer.cyto_utils import infer_cp_features

In [2]:
# Define inputs and outputs
plate = "localhost230405150001"  # Focusing on plate 3
file_suffix = "_sc_norm_fs_cellprofiler.csv.gz"

# Input: single-cell profile file for the selected plate
data_dir = pathlib.Path("..", "..", "..", "3.process_cfret_features", "data")
cp_file = pathlib.Path(data_dir, f"{plate}{file_suffix}")

# Output: linear model results table.
# Create the directory up front so that writing the results file does not
# fail on a fresh checkout where "results" does not exist yet.
output_dir = pathlib.Path("results")
output_dir.mkdir(parents=True, exist_ok=True)
output_cp_file = pathlib.Path(output_dir, f"{plate}_linear_model_cp_features.tsv")

In [7]:
# Load data
cp_df = pd.read_csv(cp_file)

# Drop NA columns
cp_df = feature_select(
    cp_df,
    operation="drop_na_columns",
    na_cutoff=0
)

# Count number of cells per well and add to dataframe as metadata.
# Each row is one cell, so the group size is the cell count. Using size()
# (instead of counting non-null values of an unrelated column) keeps the
# count correct even when a well has missing treatment labels, and avoids
# computing counts for every column of the dataframe.
cell_count_df = (
    cp_df.groupby("Metadata_Well")
    .size()
    .reset_index(name="Metadata_cell_count_per_well")
)
# Merge with the counts on the left so the well and count columns come first
cp_df = cell_count_df.merge(cp_df, on=["Metadata_Well"])

# Clean the dose column to extract numeric value
# NOTE(review): the modeling cell below selects "Metadata_dose_numeric" as a
# covariate; while this line stays commented out that column is not created
# anywhere in this notebook. Confirm whether this plate has a Metadata_dose
# column before re-enabling.
# cp_df = cp_df.assign(Metadata_dose_numeric=cp_df.Metadata_dose.str.strip("uM").astype(float))

# Define CellProfiler features
cp_features = infer_cp_features(cp_df)

print(f"We are testing {len(cp_features)} CellProfiler features")
print(cp_df.shape)
cp_df.head()

  cp_df = pd.read_csv(cp_file)


We are testing 552 CellProfiler features
(26992, 566)


Unnamed: 0,Metadata_Well,Metadata_cell_count_per_well,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_ImageNumber,Metadata_Plate,...,Nuclei_Texture_InfoMeas2_PM_3_01_256,Nuclei_Texture_InfoMeas2_PM_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Actin_3_02_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_Hoechst_3_03_256,Nuclei_Texture_SumVariance_Mitochondria_3_01_256,Nuclei_Texture_SumVariance_PM_3_01_256
0,B02,564,B,2,9,failing,rejected,DMSO,1,localhost230405150001,...,0.191918,-0.032872,0.292288,-0.604487,0.888165,0.429366,0.39953,-0.366829,-0.258781,-0.310718
1,B02,564,B,2,9,failing,rejected,DMSO,1,localhost230405150001,...,0.608292,0.29013,-0.069668,-0.411109,0.182782,0.44762,0.350265,-0.380608,2.405688,-0.190068
2,B02,564,B,2,9,failing,rejected,DMSO,1,localhost230405150001,...,-0.140377,-0.314924,-0.217099,-0.612188,0.524718,0.798054,0.552916,-0.401958,-0.232218,-0.316777
3,B02,564,B,2,9,failing,rejected,DMSO,1,localhost230405150001,...,1.021672,0.81369,0.616922,-0.400131,0.260481,0.714237,0.41196,-0.367868,-0.152203,-0.217675
4,B02,564,B,2,9,failing,rejected,DMSO,1,localhost230405150001,...,-0.170594,0.078071,0.048193,0.768125,-0.580192,0.982025,0.97974,-0.395945,0.034628,-0.348407


In [None]:
# Split the single-cell profiles into healthy and failing subsets,
# based on the value recorded in the Metadata_cell_type column
healthy_df = cp_df.loc[cp_df["Metadata_cell_type"].eq("healthy")]
failing_df = cp_df.loc[cp_df["Metadata_cell_type"].eq("failing")]

## Fit linear model

In [4]:
# Setup linear modeling framework
# Covariates used as predictors for every per-feature model
variables = ["Metadata_cell_count_per_well", "Metadata_dose_numeric"]

# Fail early with an actionable message when a covariate column was never
# derived (for example, the numeric dose column), instead of surfacing a
# bare KeyError from the column selection below.
missing_variables = [column for column in variables if column not in cp_df.columns]
if missing_variables:
    raise KeyError(
        f"cp_df is missing covariate column(s) {missing_variables}; "
        "derive them in the data loading cell before fitting the linear models"
    )

X = cp_df.loc[:, variables]

print(X.shape)
X.head()

(17352, 2)


Unnamed: 0,Metadata_cell_count_per_well,Metadata_dose_numeric
0,342,5.0
1,342,5.0
2,342,5.0
3,342,5.0
4,342,5.0


In [5]:
# Fit one linear model per CellProfiler feature
coef_columns = ["cell_count_coef", "treatment_dose_coef"]
lm_records = []
for cp_feature in cp_features:
    # Response vector: a single morphology feature (univariate test)
    feature_values = cp_df[cp_feature]

    # Regress the feature on the covariates in X, including an intercept
    lm = LinearRegression(fit_intercept=True).fit(X=X, y=feature_values)

    # Goodness of fit (R^2), evaluated on the same data used for fitting
    r2 = lm.score(X=X, y=feature_values)

    # One row per feature: name, R^2, then one beta coefficient per covariate
    # (the contribution of each X covariate to the feature)
    lm_records.append([cp_feature, r2, *lm.coef_])

# Assemble the per-feature rows into a pandas DataFrame
lm_results = pd.DataFrame(
    lm_records,
    columns=["feature", "r2_score", *coef_columns],
)

# Write the results as a tab-separated file
lm_results.to_csv(output_cp_file, sep="\t", index=False)

print(lm_results.shape)
lm_results.head()

(585, 4)


Unnamed: 0,feature,r2_score,cell_count_coef,treatment_dose_coef
0,Cytoplasm_AreaShape_Compactness,0.044215,-2.5e-05,-0.078005
1,Cytoplasm_AreaShape_Extent,0.069271,-7.7e-05,0.096291
2,Cytoplasm_AreaShape_FormFactor,0.095343,-0.00016,0.111922
3,Cytoplasm_AreaShape_MajorAxisLength,0.083236,-0.001751,0.014357
4,Cytoplasm_AreaShape_Perimeter,0.045651,-0.001369,-0.011158
