#### Yu Han, Sep 2022.

We used logistic regression to classify neurons carrying the deletion versus controls without it. The feature importance scores (model coefficients) are printed.

The Mann–Whitney U test was used to compare the deletion group and the control group feature by feature, in order to find significantly different features.

Four branching features only.

In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import statistics
import random
import scipy.stats as ss

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupShuffleSplit

import warnings

warnings.filterwarnings("ignore")
from pycytominer import aggregate

### logistic regression analysis

In [2]:
# Load the normalized profiles of both plates and stack them into one table.
profile_paths = [
    "../2019_05_28_Neuronal_Cell_Painting/profiles/2022_03_03_NCP_NEURONS_2_20x/BR00132672/BR00132672_normalized.csv.gz",
    "../2019_05_28_Neuronal_Cell_Painting/profiles/2022_03_03_NCP_NEURONS_2_20x/BR00132673/BR00132673_normalized.csv.gz",
]
df_p72, df_p73 = (pd.read_csv(path) for path in profile_paths)

# Rows of plate BR00132672 come first, followed by plate BR00132673.
df_neuron = pd.concat([df_p72, df_p73])
df_neuron.head(2)

Unnamed: 0,Metadata_plate_map_name,Metadata_EXPERIMENT_NAME,Metadata_DENSITY,Metadata_LINE_ID,Metadata_GENOTYPE,Metadata_Plate,Metadata_Well,Metadata_Site_Count,Metadata_Object_Count,Cells_AreaShape_Area,...,Nuclei_Texture_Variance_RNA_10_02,Nuclei_Texture_Variance_RNA_10_03,Nuclei_Texture_Variance_RNA_20_00,Nuclei_Texture_Variance_RNA_20_01,Nuclei_Texture_Variance_RNA_20_02,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
0,NCP_2022_03_03_METADATA,2022_03_03_NCP_NEURONS_2,3500,1,DELETION,BR00132672,A01,3,144,-0.6597,...,-0.56724,-0.69918,-0.72463,-0.83127,-0.61628,-0.95575,-0.6134,-0.58717,-0.58301,-0.62436
1,NCP_2022_03_03_METADATA,2022_03_03_NCP_NEURONS_2,3500,1,DELETION,BR00132672,A02,3,83,-0.49612,...,-1.401,-1.3796,-1.1928,-1.386,-1.0686,-0.98057,-1.3489,-1.3409,-1.3489,-1.3968


In [3]:
# extract four branching features
# Keep every metadata column plus the columns whose name contains one of the
# skeleton/branching measurement names below.
branch_pattern = "Metadata|NumberTrunks|NumberNonTrunkBranches|NumberBranchEnd|TotalObjectSkeletonLength"
is_branch_column = df_neuron.columns.str.contains(branch_pattern)

# .copy() makes df_branch an independent table: recoding its columns later
# cannot touch df_neuron and does not depend on pandas view-vs-copy rules.
df_branch = df_neuron.loc[:, is_branch_column].copy()

# Create the output folder if needed so the export also works on a fresh checkout.
Path("profile").mkdir(parents=True, exist_ok=True)
df_branch.to_csv("profile/neuron_branch_normalized.csv", index=False)
df_branch.head(2)

Unnamed: 0,Metadata_plate_map_name,Metadata_EXPERIMENT_NAME,Metadata_DENSITY,Metadata_LINE_ID,Metadata_GENOTYPE,Metadata_Plate,Metadata_Well,Metadata_Site_Count,Metadata_Object_Count,Nuclei_ObjectSkeleton_NumberBranchEnds_CellImageSkel,Nuclei_ObjectSkeleton_NumberNonTrunkBranches_CellImageSkel,Nuclei_ObjectSkeleton_NumberTrunks_CellImageSkel,Nuclei_ObjectSkeleton_TotalObjectSkeletonLength_CellImageSkel
0,NCP_2022_03_03_METADATA,2022_03_03_NCP_NEURONS_2,3500,1,DELETION,BR00132672,A01,3,144,-0.95647,-0.88478,-0.73272,-0.81801
1,NCP_2022_03_03_METADATA,2022_03_03_NCP_NEURONS_2,3500,1,DELETION,BR00132672,A02,3,83,-0.64866,-0.46213,-0.02024,-0.61935


In [4]:
# code deletion as 1 and control as 0.
# Work on an explicit copy and assign through .loc: chained assignment
# (df.col[mask] = value) may write to a temporary object and is a no-op under
# pandas Copy-on-Write, which would leave the genotype labels as strings.
df_branch = df_branch.copy()
df_branch.loc[df_branch["Metadata_GENOTYPE"] == "DELETION", "Metadata_GENOTYPE"] = 1
df_branch.loc[df_branch["Metadata_GENOTYPE"] == "CONTROL", "Metadata_GENOTYPE"] = 0

In [5]:
# Feature columns are all columns whose name does not start with "Metadata".
is_metadata = df_branch.columns.str.startswith("Metadata")
df_branch_feature = df_branch.loc[:, ~is_metadata]

In [6]:
### logistic regression
# Feature matrix and genotype labels (1 = deletion, 0 = control).
X = df_branch_feature.values
y = df_branch["Metadata_GENOTYPE"].tolist()
y0 = np.array(y)

# Split by Metadata_LINE_ID so that all profiles of one patient line end up on
# the same side of the train/test split.
groups = df_branch["Metadata_LINE_ID"].to_numpy()
gss = GroupShuffleSplit(n_splits=2, train_size=0.8, random_state=42)

# Two splits are generated with this seed; the second one is used for the
# single train/test evaluation below.
train_idx, test_idx = list(gss.split(X, y0, groups))[-1]

# Guard against leakage: no line ID may appear in both partitions.
assert not set(groups[train_idx]) & set(groups[test_idx]), (
    "the same Metadata_LINE_ID appears in both the train and the test set"
)

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y0[train_idx], y0[test_idx]

# fit the model
logisticRegr = LogisticRegression()
logisticRegr.fit(X_train, y_train)
predictions = logisticRegr.predict(X_test)

# accuracy on the held-out lines
score = logisticRegr.score(X_test, y_test)
score

0.5125

In [7]:
# Feature importance: one logistic-regression coefficient per feature,
# displayed from the largest to the smallest absolute value.
neuron_feature_weights = pd.DataFrame(
    {"weight": logisticRegr.coef_[0]}, index=df_branch_feature.columns
)
ranked_features = (
    neuron_feature_weights["weight"].abs().sort_values(ascending=False).index
)
neuron_feature_weights.reindex(ranked_features).head(10)

Unnamed: 0,weight
Nuclei_ObjectSkeleton_NumberBranchEnds_CellImageSkel,-0.77153
Nuclei_ObjectSkeleton_NumberNonTrunkBranches_CellImageSkel,0.409082
Nuclei_ObjectSkeleton_TotalObjectSkeletonLength_CellImageSkel,0.270561
Nuclei_ObjectSkeleton_NumberTrunks_CellImageSkel,0.247203


### Mann–Whitney U test

In [8]:
# Average the profiles within each (line ID, genotype) combination so that
# every combination contributes a single mean profile.
strata_columns = ["Metadata_LINE_ID", "Metadata_GENOTYPE"]
df_branch_mean = aggregate(
    population_df=df_branch,
    strata=strata_columns,
    features="infer",
    operation="mean",
)

# Save the aggregated profiles, then preview them.
df_branch_mean.to_csv("profile/neuron_feat_branch_mean.csv", index=False)
df_branch_mean.head(2)

Unnamed: 0,Metadata_LINE_ID,Metadata_GENOTYPE,Nuclei_ObjectSkeleton_NumberBranchEnds_CellImageSkel,Nuclei_ObjectSkeleton_NumberNonTrunkBranches_CellImageSkel,Nuclei_ObjectSkeleton_NumberTrunks_CellImageSkel,Nuclei_ObjectSkeleton_TotalObjectSkeletonLength_CellImageSkel
0,1,1,-0.576519,-0.49064,-0.049856,-0.717221
1,2,1,0.244571,0.299023,0.392656,0.279753


In [9]:
# Drop the metadata columns of the aggregated table; only features remain.
metadata_mask = df_branch_mean.columns.str.startswith("Metadata")
df_branch_feature_mean = df_branch_mean.loc[:, ~metadata_mask]

In [10]:
# Names of the features to test, in column order.
feature_list = df_branch_feature_mean.columns.tolist()

In [11]:
# One-row placeholder table (row label 0) with one column per feature;
# the p-values are filled in after the tests have run.
df_p_values_feature = pd.DataFrame(columns=feature_list, index=[0])

In [12]:
# run mann-whitney-u test
# Compare the mean profiles of the deletion group (genotype 1) with those of
# the control group (genotype 0), one feature at a time.
# The two groups do not change between features, so select them once.
deletion_profiles = df_branch_mean.query("Metadata_GENOTYPE == 1")
control_profiles = df_branch_mean.query("Metadata_GENOTYPE == 0")

list_p = []
list_u = []
for feat in feature_list:
    # Ask for the two-sided test explicitly: the default of `alternative`
    # differs between SciPy releases (older ones return a deprecated
    # one-sided p-value), so relying on it makes the p-values
    # environment-dependent.
    u, p = ss.mannwhitneyu(
        deletion_profiles[feat],
        control_profiles[feat],
        alternative="two-sided",
    )
    list_p.append(p)
    list_u.append(u)

In [13]:
# Write the p-values into the single row of the table; list_p follows the
# order of feature_list, which is also the column order.
df_p_values_feature.iloc[0, :] = list_p
df_p_values_feature

Unnamed: 0,Nuclei_ObjectSkeleton_NumberBranchEnds_CellImageSkel,Nuclei_ObjectSkeleton_NumberNonTrunkBranches_CellImageSkel,Nuclei_ObjectSkeleton_NumberTrunks_CellImageSkel,Nuclei_ObjectSkeleton_TotalObjectSkeletonLength_CellImageSkel
0,0.404975,0.404975,0.388851,0.495829


In [14]:
# select features with p value not larger than .05
alpha = 0.05

# The table was created without a numeric dtype, so compare as floats.
p_values = df_p_values_feature.iloc[-1, :].astype(float)

# Keep columns through a positive mask: a NaN p-value compares False and is
# therefore excluded, instead of slipping through a "drop if > alpha" filter
# and being reported as significant.
is_significant = (p_values <= alpha).to_numpy()
df_p_values_feature = df_p_values_feature.loc[:, is_significant]
df_p_values_feature.columns

Index([], dtype='object')