#### Yu Han, August 2022.

We use logistic regression to classify neuron cells with the deletion vs. without the deletion (control). The feature importance scores (the model coefficients) are printed at the end.

In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns 
import math
import statistics
import random

from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_rand_score, homogeneity_score

from collections import Counter
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the normalized, feature-selected profiles of the two plates
# (BR00132672 and BR00132673) and stack them into a single frame.
plate_profile_csvs = (
    '../2019_05_28_Neuronal_Cell_Painting/profiles/2022_03_03_NCP_NEURONS_2_20x/BR00132672/BR00132672_normalized_feature_select_batch.csv.gz',
    '../2019_05_28_Neuronal_Cell_Painting/profiles/2022_03_03_NCP_NEURONS_2_20x/BR00132673/BR00132673_normalized_feature_select_batch.csv.gz',
)
df_p72, df_p73 = (pd.read_csv(csv_path) for csv_path in plate_profile_csvs)
# Row labels are kept as read from each file, so they repeat across plates.
df_neuron = pd.concat((df_p72, df_p73))
df_neuron.head(2)

Unnamed: 0,Metadata_plate_map_name,Metadata_EXPERIMENT_NAME,Metadata_DENSITY,Metadata_LINE_ID,Metadata_GENOTYPE,Metadata_Plate,Metadata_Well,Metadata_Site_Count,Metadata_Object_Count,Cells_AreaShape_Compactness,...,Nuclei_Texture_InverseDifferenceMoment_AGP_20_00,Nuclei_Texture_InverseDifferenceMoment_AGP_5_02,Nuclei_Texture_InverseDifferenceMoment_Brightfield_20_01,Nuclei_Texture_InverseDifferenceMoment_DNA_20_03,Nuclei_Texture_InverseDifferenceMoment_Mito_20_02,Nuclei_Texture_InverseDifferenceMoment_RNA_20_01,Nuclei_Texture_InverseDifferenceMoment_RNA_5_01,Nuclei_Texture_SumAverage_AGP_20_03,Nuclei_Texture_SumAverage_ER_20_02,Nuclei_Texture_SumVariance_Brightfield_20_00
0,NCP_2022_03_03_METADATA,2022_03_03_NCP_NEURONS_2,3500,1,DELETION,BR00132672,A01,3,144,-0.17229,...,-0.065279,1.0098,-1.3223,0.083103,0.41479,0.29334,1.2459,-0.54579,-0.98414,2.7363
1,NCP_2022_03_03_METADATA,2022_03_03_NCP_NEURONS_2,3500,1,DELETION,BR00132672,A02,3,83,-0.33451,...,0.59669,1.5056,-0.60581,0.86209,0.25006,1.0278,2.3084,-0.82466,-1.7576,-0.3397


In [3]:
# Encode the genotype label in place: DELETION -> 1, CONTROL -> 0.
# A single .loc[row_mask, column] assignment writes to df_neuron itself. The
# chained form (df.col[mask] = value) may write to a temporary copy instead and
# is a silent no-op under pandas Copy-on-Write; the resulting warning would be
# hidden here because warnings are globally suppressed in the import cell.
# Rows holding any other genotype value are left unchanged.
df_neuron.loc[df_neuron['Metadata_GENOTYPE'] == 'DELETION', 'Metadata_GENOTYPE'] = 1
df_neuron.loc[df_neuron['Metadata_GENOTYPE'] == 'CONTROL', 'Metadata_GENOTYPE'] = 0

In [4]:
# Keep only the feature columns, i.e. every column whose name does not
# start with 'Metadata'.
is_metadata_column = df_neuron.columns.str.startswith('Metadata')
df_neuron_feature = df_neuron.loc[:, ~is_metadata_column]

In [5]:
### logistic regression
# Define X (feature matrix) and y (genotype labels).
X = df_neuron_feature.to_numpy()
y = df_neuron.Metadata_GENOTYPE.tolist()

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Fit a logistic regression with default settings on the training rows
# (fit returns the estimator itself), then predict the held-out rows.
logisticRegr = LogisticRegression().fit(X_train, y_train)
predictions = logisticRegr.predict(X_test)

# Accuracy score on the held-out rows, shown as the cell output.
score = logisticRegr.score(X_test, y_test)
score

0.6428571428571429

In [6]:
# Feature importance: one fitted coefficient per feature column,
# labelled with the feature name.
neuron_feature_weights = pd.DataFrame(
    {'weight': logisticRegr.coef_[0]},
    index=df_neuron_feature.columns,
)
# Order the features by absolute weight (largest first) and show the top 15.
abs_weight_ranking = neuron_feature_weights['weight'].abs().sort_values(ascending=False)
neuron_feature_weights.reindex(abs_weight_ranking.index).head(15)

Unnamed: 0,weight
Cells_Intensity_MinIntensityEdge_Brightfield,-0.090791
Cytoplasm_Intensity_IntegratedIntensity_DNA,0.088187
Nuclei_Texture_AngularSecondMoment_Brightfield_20_01,-0.084419
Nuclei_Correlation_Correlation_RNA_AGP,0.080209
Nuclei_Correlation_Costes_RNA_Brightfield,0.079236
Cytoplasm_Intensity_MassDisplacement_Brightfield,0.076245
Cytoplasm_RadialDistribution_RadialCV_Brightfield_3of4,0.07361
Cytoplasm_RadialDistribution_RadialCV_Brightfield_4of4,0.072241
Nuclei_Correlation_Correlation_ER_AGP,0.068618
Cells_Correlation_RWC_Mito_AGP,-0.066996
