#### Yu Han, August 2022.

We used logistic regression to classify neurons with the deletion vs. without the deletion (control). The feature importance scores (model coefficients) are printed.

The Mann-Whitney U test was used for a feature-by-feature comparison between the deletion group and the control group to find significant features.

Cell morphology features only.

In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import math
import statistics
import random

from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_rand_score, homogeneity_score

from collections import Counter
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupShuffleSplit

from pycytominer import aggregate
import scipy.stats as ss

import warnings

# NOTE(review): this silences every warning for the rest of the notebook, so
# any diagnostic raised by later cells (for example solver-convergence or
# pandas chained-assignment warnings) will not be displayed.
warnings.filterwarnings("ignore")

### logistic regression analysis

In [2]:
# Load the batch-level normalized / feature-selected profile of each plate
# (BR00132672 and BR00132673).
path_p72 = "../2019_05_28_Neuronal_Cell_Painting/profiles/2022_03_03_NCP_NEURONS_2_20x/BR00132672/BR00132672_normalized_feature_select_batch.csv.gz"
path_p73 = "../2019_05_28_Neuronal_Cell_Painting/profiles/2022_03_03_NCP_NEURONS_2_20x/BR00132673/BR00132673_normalized_feature_select_batch.csv.gz"
df_p72 = pd.read_csv(path_p72)
df_p73 = pd.read_csv(path_p73)

# Stack the two plates row-wise; each plate keeps its own row labels.
both_plates = [df_p72, df_p73]
df_neuron = pd.concat(both_plates)
df_neuron.head(2)

Unnamed: 0,Metadata_plate_map_name,Metadata_EXPERIMENT_NAME,Metadata_DENSITY,Metadata_LINE_ID,Metadata_GENOTYPE,Metadata_Plate,Metadata_Well,Metadata_Site_Count,Metadata_Object_Count,Cells_AreaShape_Compactness,...,Nuclei_Texture_InverseDifferenceMoment_AGP_20_00,Nuclei_Texture_InverseDifferenceMoment_AGP_5_02,Nuclei_Texture_InverseDifferenceMoment_Brightfield_20_01,Nuclei_Texture_InverseDifferenceMoment_DNA_20_03,Nuclei_Texture_InverseDifferenceMoment_Mito_20_02,Nuclei_Texture_InverseDifferenceMoment_RNA_20_01,Nuclei_Texture_InverseDifferenceMoment_RNA_5_01,Nuclei_Texture_SumAverage_AGP_20_03,Nuclei_Texture_SumAverage_ER_20_02,Nuclei_Texture_SumVariance_Brightfield_20_00
0,NCP_2022_03_03_METADATA,2022_03_03_NCP_NEURONS_2,3500,1,DELETION,BR00132672,A01,3,144,-0.17229,...,-0.065279,1.0098,-1.3223,0.083103,0.41479,0.29334,1.2459,-0.54579,-0.98414,2.7363
1,NCP_2022_03_03_METADATA,2022_03_03_NCP_NEURONS_2,3500,1,DELETION,BR00132672,A02,3,83,-0.33451,...,0.59669,1.5056,-0.60581,0.86209,0.25006,1.0278,2.3084,-0.82466,-1.7576,-0.3397


In [3]:
# Code deletion as 1 and control as 0, in place on df_neuron.
# A single .loc[row_mask, column] assignment is used instead of chained
# indexing (df.col[mask] = value): the chained form may write to a temporary
# object and leave df_neuron unchanged (it is a no-op under pandas
# copy-on-write).
df_neuron.loc[df_neuron["Metadata_GENOTYPE"] == "DELETION", "Metadata_GENOTYPE"] = 1
df_neuron.loc[df_neuron["Metadata_GENOTYPE"] == "CONTROL", "Metadata_GENOTYPE"] = 0

In [4]:
# Keep only the feature columns, i.e. every column whose name does not start
# with "Metadata".
is_metadata_column = df_neuron.columns.str.startswith("Metadata")
df_neuron_feature = df_neuron.loc[:, ~is_metadata_column]

In [5]:
### logistic regression
# Define X (feature matrix) and y (genotype labels: 1 = deletion, 0 = control).
X = df_neuron_feature.values
# astype(int) raises if any genotype label has not been recoded to 0/1,
# instead of silently fitting the model on string or mixed labels.
y0 = df_neuron["Metadata_GENOTYPE"].astype(int).to_numpy()
y = y0.tolist()

# Split training and testing.
# GroupShuffleSplit keeps all rows that share a Metadata_LINE_ID on the same
# side of the split, so the same line (patient) never appears in both the
# train and the test set.
gss = GroupShuffleSplit(n_splits=2, train_size=0.8, random_state=42)
# Two splits are generated but only one is used for the analysis: the last
# one. It is selected explicitly here rather than by a loop that overwrites
# the train/test variables on every iteration.
all_splits = list(gss.split(X, y0, groups=df_neuron["Metadata_LINE_ID"]))
train_idx, test_idx = all_splits[-1]
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y0[train_idx], y0[test_idx]

# Fit the model.
# NOTE(review): default LogisticRegression settings are used and warnings are
# ignored globally in this notebook, so a convergence warning would not be
# shown -- confirm convergence (e.g. via logisticRegr.n_iter_).
logisticRegr = LogisticRegression()
logisticRegr.fit(X_train, y_train)
predictions = logisticRegr.predict(X_test)

# Mean accuracy on the held-out lines (cell output).
score = logisticRegr.score(X_test, y_test)
score

0.39375

In [6]:
# Feature importance: one fitted logistic-regression coefficient per feature
# column, labelled with the feature name.
neuron_feature_weights = pd.DataFrame(
    {"weight": logisticRegr.coef_[0]}, index=df_neuron_feature.columns
)

# Order features by absolute coefficient (largest first) and show the top 10.
abs_weight_ranking = neuron_feature_weights["weight"].abs().sort_values(ascending=False)
neuron_feature_weights.reindex(abs_weight_ranking.index).head(10)

Unnamed: 0,weight
Nuclei_Correlation_Costes_RNA_Brightfield,0.075574
Cytoplasm_RadialDistribution_RadialCV_Brightfield_3of4,0.05398
Cytoplasm_RadialDistribution_RadialCV_Brightfield_4of4,0.05164
Nuclei_Texture_DifferenceEntropy_RNA_10_02,-0.051537
Nuclei_AreaShape_Zernike_6_4,-0.05062
Nuclei_Correlation_Costes_Brightfield_Mito,-0.050581
Nuclei_Correlation_Costes_Mito_ER,0.050152
Cells_Correlation_Correlation_DNA_ER,-0.049888
Nuclei_Texture_DifferenceVariance_RNA_5_01,0.04927
Cells_Granularity_7_Mito,0.04926


### Mann-Whitney U test

In [7]:
# Collapse the profiles to one mean profile per (line ID, genotype) pair.
strata_columns = ["Metadata_LINE_ID", "Metadata_GENOTYPE"]
df_neuron_mean = aggregate(
    population_df=df_neuron,
    operation="mean",
    features="infer",
    strata=strata_columns,
)
df_neuron_mean.head(2)

Unnamed: 0,Metadata_LINE_ID,Metadata_GENOTYPE,Cells_AreaShape_Compactness,Cells_AreaShape_MedianRadius,Cells_AreaShape_Zernike_1_1,Cells_AreaShape_Zernike_2_0,Cells_AreaShape_Zernike_2_2,Cells_AreaShape_Zernike_3_1,Cells_AreaShape_Zernike_3_3,Cells_AreaShape_Zernike_4_0,...,Nuclei_Texture_InverseDifferenceMoment_AGP_20_00,Nuclei_Texture_InverseDifferenceMoment_AGP_5_02,Nuclei_Texture_InverseDifferenceMoment_Brightfield_20_01,Nuclei_Texture_InverseDifferenceMoment_DNA_20_03,Nuclei_Texture_InverseDifferenceMoment_Mito_20_02,Nuclei_Texture_InverseDifferenceMoment_RNA_20_01,Nuclei_Texture_InverseDifferenceMoment_RNA_5_01,Nuclei_Texture_SumAverage_AGP_20_03,Nuclei_Texture_SumAverage_ER_20_02,Nuclei_Texture_SumVariance_Brightfield_20_00
0,1,1,0.051249,-0.623481,-0.012366,0.1316,0.390005,-0.640063,0.378335,0.733289,...,0.638041,1.097832,0.079206,0.509392,0.533359,-0.263681,0.341041,-0.751126,-0.444946,-0.485764
1,2,1,-0.232959,0.745598,-0.024121,0.244952,0.223665,-0.008189,0.085162,0.351442,...,0.287171,0.517993,-0.325573,0.365159,-0.391601,0.754717,1.302866,0.00383,-0.58746,0.822778


In [8]:
# Keep only the feature columns of the mean profiles, i.e. every column whose
# name does not start with "Metadata".
is_metadata_mean_column = df_neuron_mean.columns.str.startswith("Metadata")
df_neuron_feature_mean = df_neuron_mean.loc[:, ~is_metadata_mean_column]

In [9]:
# Feature names, in the column order of the mean-profile feature table.
feature_list = df_neuron_feature_mean.columns.tolist()

In [10]:
# One-row placeholder table (row label 0) with a column per feature; the
# cells start out empty and are filled with p values later.
df_p_values_feature = pd.DataFrame(columns=feature_list, index=[0])

In [11]:
# Run a two-sided Mann-Whitney U test for every feature: deletion (1) vs.
# control (0), on the per-line mean profiles.

# Split the two genotype groups once, outside the loop.
deletion_profiles = df_neuron_mean.query("Metadata_GENOTYPE == 1")
control_profiles = df_neuron_mean.query("Metadata_GENOTYPE == 0")

list_p = []
list_u = []
for feat in feature_list:
    # `alternative` is given explicitly because the default differs between
    # SciPy releases (older versions return a halved, non-two-sided p value
    # when it is omitted); pinning it keeps the p values comparable across
    # environments.
    u, p = ss.mannwhitneyu(
        deletion_profiles[feat],
        control_profiles[feat],
        alternative="two-sided",
    )
    list_p.append(p)
    list_u.append(u)

In [12]:
# store p value to each feature
# list_p was filled by iterating over feature_list, which is also the column
# order of df_p_values_feature, so this positional assignment of the first
# row lines each p value up with its feature column.
df_p_values_feature.iloc[0] = list_p
df_p_values_feature

Unnamed: 0,Cells_AreaShape_Compactness,Cells_AreaShape_MedianRadius,Cells_AreaShape_Zernike_1_1,Cells_AreaShape_Zernike_2_0,Cells_AreaShape_Zernike_2_2,Cells_AreaShape_Zernike_3_1,Cells_AreaShape_Zernike_3_3,Cells_AreaShape_Zernike_4_0,Cells_AreaShape_Zernike_4_2,Cells_AreaShape_Zernike_4_4,...,Nuclei_Texture_InverseDifferenceMoment_AGP_20_00,Nuclei_Texture_InverseDifferenceMoment_AGP_5_02,Nuclei_Texture_InverseDifferenceMoment_Brightfield_20_01,Nuclei_Texture_InverseDifferenceMoment_DNA_20_03,Nuclei_Texture_InverseDifferenceMoment_Mito_20_02,Nuclei_Texture_InverseDifferenceMoment_RNA_20_01,Nuclei_Texture_InverseDifferenceMoment_RNA_5_01,Nuclei_Texture_SumAverage_AGP_20_03,Nuclei_Texture_SumAverage_ER_20_02,Nuclei_Texture_SumVariance_Brightfield_20_00
0,0.404975,0.357197,0.421262,0.365028,0.275589,0.198514,0.289731,0.318991,0.21646,0.318991,...,0.365028,0.1067,0.0548238,0.487488,0.0265336,0.421262,0.0168944,0.429458,0.282617,0.341715


In [13]:
# Select features with significant p values.
# NOTE: the raw per-feature p values are compared with the cutoff; no
# multiple-testing correction is applied in this cell.
P_VALUE_CUTOFF = 0.005

# The p values live in the last row of an object-dtype table; convert them to
# floats so that missing or non-numeric entries become NaN.
p_values = pd.to_numeric(df_p_values_feature.iloc[-1, :], errors="coerce")

# Keep a feature only when its p value is at or below the cutoff. Selecting
# with "<=" (rather than dropping "p > cutoff") means a NaN p value is never
# reported as significant.
is_significant = (p_values <= P_VALUE_CUTOFF).to_numpy()
df_p_values_feature_sig = df_p_values_feature.loc[:, is_significant]
df_p_values_feature_sig.columns

Index(['Cytoplasm_Correlation_K_DNA_Brightfield',
       'Cytoplasm_Granularity_2_Mito',
       'Cytoplasm_RadialDistribution_FracAtD_Mito_2of4',
       'Cytoplasm_RadialDistribution_FracAtD_Mito_3of4',
       'Cytoplasm_RadialDistribution_FracAtD_Mito_4of4',
       'Cytoplasm_RadialDistribution_RadialCV_Brightfield_3of4',
       'Cytoplasm_RadialDistribution_RadialCV_Brightfield_4of4',
       'Nuclei_Granularity_7_Mito'],
      dtype='object')