Following the previous notebook, we investigate what happens when the low cell count wells are removed.

Conclusion: Removing the low cell count wells vastly improves the signal and accuracy of logistic regression. Looking at the images, it's clear that those wells were mostly empty or had cells missing.

In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns 
import math
import statistics
import random

from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_rand_score, homogeneity_score

from collections import Counter

# Functions

In [2]:
def id_to_cluster(linkage_data, agg_features_df):
    """
    linkage_data: array of cluster numbers
    agg_features_df: df of aggregated features to merge, index must match order of linkage_data
    Returns a df with aggregated cp data and cluster number for each id
    """
    # Get cluster number with the aggregated feature data
    clusters_hierarchal_df = pd.DataFrame(data=linkage_data, index=agg_features_df.index)
    clusters_hierarchal_df.rename(columns={0:'cluster_num'}, inplace=True)
    clusters_hierarchal_df = clusters_hierarchal_df.merge(agg_features_df, how='left', left_index=True, right_index=True)

    # Get the cell profiler features by cluster
    cp_features_by_cluster = clusters_hierarchal_df.groupby(by='cluster_num').mean()
    
    return clusters_hierarchal_df

def hierarchical_cluster(df, threshold, show=False):
    """
    Returns df of cluster features
    """
    threshold=threshold
    Z1 = linkage(df, 'ward')
    flat_linkage1 = fcluster(Z1, t=threshold, criterion='distance')
    cluster_features1 = id_to_cluster(flat_linkage1, df)
    
    if show:
        plt.figure(figsize=(12, df.shape[0]/4))
        plt.axvline(x=threshold)
        label = [str(i) + ' ' + j for i, j in zip(flat_linkage1.tolist(), df.index.tolist())]
        plt.title('Clusters based on cell profiler features')
        dend1 = dendrogram(Z1, color_threshold=threshold, orientation='left', leaf_font_size=10, labels=label)
    return cluster_features1

def km_cluster(df, num_clusters, random_state=2):
    """
    returns df of cluster features
    """
    km = KMeans(n_clusters=num_clusters, random_state=random_state).fit(df)
    return id_to_cluster(km.labels_, df)

# Load data

In [3]:
# Progenitors data
from pathlib import Path
path = os.getcwd()
base_dir = str(Path(path).parent)

switch_isogenic_labels = True
human_only=True

#FS data
progenitors = pd.read_csv(base_dir + '/3.analysis/feature_sets/Progenitors/0714_stdev_corr_fs.csv', index_col=0)

exclude = ['5', '6', '33', '12', '16']
try:
    idx = [i for i in progenitors.index.tolist() if i.split('_')[2] not in exclude] # Exclude patient number
    progenitors = progenitors[progenitors.index.isin(idx)]
except: pass

if human_only:
    progenitors = progenitors[progenitors.index.str.contains('human')]

print ('shape: {}'.format(progenitors.shape))

labels = progenitors.index.tolist()
if switch_isogenic_labels:
    for i in range(len(labels)):
        if 'isogenic_deletion' in labels[i]:
            labels[i] = labels[i].replace('isogenic_deletion', 'temp')
    for i in range(len(labels)):
        if 'isogenic_control' in labels[i]:
            labels[i] = labels[i].replace('isogenic_control', 'isogenic_deletion')            
        if 'temp' in labels[i]:
            labels[i] = labels[i].replace('temp', 'isogenic_control')                    
progenitors.index = labels
progenitors_orig = progenitors.copy()

shape: (312, 508)


In [4]:
# STEM data
# Or use my FS data
stem = pd.read_csv(base_dir + '/3.analysis/feature_sets/STEM01/0621_stdev_corr_fs.csv', index_col=0)


try:
    idx = [i for i in stem.index.tolist() if i.split('_')[-1] not in exclude]
    stem = stem[stem.index.isin(idx)]
except: pass

if human_only:
    stem = stem[stem.index.str.contains('human')]
    
print ('shape: {}'.format(stem.shape))

labels = stem.index.tolist()
if switch_isogenic_labels:
    for i in range(len(labels)):
        if 'isogenic_deletion' in labels[i]:
            labels[i] = labels[i].replace('isogenic_deletion', 'temp')
    for i in range(len(labels)):
        if 'isogenic_control' in labels[i]:
            labels[i] = labels[i].replace('isogenic_control', 'isogenic_deletion')            
        if 'temp' in labels[i]:
            labels[i] = labels[i].replace('temp', 'isogenic_control')                    
stem.index = labels
stem_orig = stem.copy()

shape: (352, 559)


# Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans

In [6]:
# SPLIT DATA BASED ON PATIENT NUMBER STEM
random.seed(3)
stem['label'] = stem.index.str.split('_').map(lambda x: x[1])
stem['patient_num'] = stem.index.str.split('_').map(lambda x: x[2])

patients = list(set(stem['patient_num']))
random.shuffle(patients)
train_nums = patients[0:31]
test_nums = patients[31:]

train_df = stem[stem['patient_num'].isin(train_nums)]
test_df = stem[stem['patient_num'].isin(test_nums)]

stem_xtrain = train_df.drop(['label', 'patient_num'], axis=1)
stem_ytrain = train_df['label']
stem_xtest = test_df.drop(['label', 'patient_num'], axis=1)
stem_ytest = test_df['label']
                             
print(len(stem_xtrain) + len(stem_xtest))

logr1 = LogisticRegression(random_state=2, penalty='l2', C=1).fit(stem_xtrain, stem_ytrain)
print(logr1.score(stem_xtrain, stem_ytrain), logr1.score(stem_xtest, stem_ytest))

352
1.0 0.7596153846153846


In [7]:
# SPLIT DATA BASED ON PATIENT NUMBER PROGENITORS
random.seed(3)
progenitors['label'] = progenitors.index.str.split('_').map(lambda x: x[1])
progenitors['patient_num'] = progenitors.index.str.split('_').map(lambda x: x[2])

patients = list(set(progenitors['patient_num']))
random.shuffle(patients)
train_nums = patients[0:28]
test_nums = patients[28:]

train_df = progenitors[progenitors['patient_num'].isin(train_nums)]
test_df = progenitors[progenitors['patient_num'].isin(test_nums)]

progenitors_xtrain = train_df.drop(['label', 'patient_num'], axis=1)
progenitors_ytrain = train_df['label']
progenitors_xtest = test_df.drop(['label', 'patient_num'], axis=1)
progenitors_ytest = test_df['label']
                             
print(len(progenitors_xtrain) + len(progenitors_xtest))

logr1 = LogisticRegression(random_state=2, penalty='l2', C=1, max_iter=150).fit(progenitors_xtrain, progenitors_ytrain)
print(logr1.score(progenitors_xtrain, progenitors_ytrain), logr1.score(progenitors_xtest, progenitors_ytest))

312
1.0 0.6818181818181818


# Removing under 200 ct

In [8]:
# Progenitors data
from pathlib import Path
path = os.getcwd()
base_dir = str(Path(path).parent)

switch_isogenic_labels = True
human_only=True

#FS data
progenitors = pd.read_csv(base_dir + '/3.analysis/feature_sets/Progenitors/0714_stdev_corr_fs.csv', index_col=0)

if human_only:
    progenitors = progenitors[progenitors.index.str.contains('human')]

print ('shape: {}'.format(progenitors.shape))

labels = progenitors.index.tolist()
if switch_isogenic_labels:
    for i in range(len(labels)):
        if 'isogenic_deletion' in labels[i]:
            labels[i] = labels[i].replace('isogenic_deletion', 'temp')
    for i in range(len(labels)):
        if 'isogenic_control' in labels[i]:
            labels[i] = labels[i].replace('isogenic_control', 'isogenic_deletion')            
        if 'temp' in labels[i]:
            labels[i] = labels[i].replace('temp', 'isogenic_control')                    
progenitors.index = labels
progenitors_orig = progenitors.copy()

# Merge progenitors with cell_number_object_number
prog_cts = pd.read_csv(base_dir + '/1.run-workflows/profiles/NCP_PROGENITORS_1/BR_NCP_PROGENITORS_1.csv.gz', index_col=0)
prog_cts = prog_cts[['Metadata_Well', 'Cells_Number_Object_Number']]

progenitors['well'] = progenitors.index.str.split('_').map(lambda x: x[-1])
progenitors['well'] = progenitors.index.str.split('_').map(lambda x: x[-1])
progenitors['idx'] = progenitors.index
progenitors = progenitors.merge(prog_cts, how='inner', left_on='well', right_on='Metadata_Well').drop(['well', 'Metadata_Well'], axis=1)
progenitors.set_index('idx', inplace=True)

shape: (348, 508)


In [9]:
progenitors_over200 = progenitors[progenitors['Cells_Number_Object_Number']>200]

# SPLIT DATA BASED ON PATIENT NUMBER progenitors_over200
random.seed(2)
progenitors_over200['label'] = progenitors_over200.index.str.split('_').map(lambda x: x[1])
progenitors_over200['patient_num'] = progenitors_over200.index.str.split('_').map(lambda x: x[2])

patients = list(set(progenitors_over200['patient_num']))
random.shuffle(patients)
train_nums = patients[0:24]
test_nums = patients[24:]

train_df = progenitors_over200[progenitors_over200['patient_num'].isin(train_nums)]
test_df = progenitors_over200[progenitors_over200['patient_num'].isin(test_nums)]

progenitors_over200_xtrain = train_df.drop(['label', 'patient_num', 'Cells_Number_Object_Number'], axis=1)
progenitors_over200_ytrain = train_df['label']
progenitors_over200_xtest = test_df.drop(['label', 'patient_num', 'Cells_Number_Object_Number'], axis=1)
progenitors_over200_ytest = test_df['label']
                             
print(len(progenitors_over200_xtrain) + len(progenitors_over200_xtest))

logr1 = LogisticRegression(random_state=2, penalty='l2', C=1, max_iter=150).fit(progenitors_over200_xtrain, progenitors_over200_ytrain)
print(logr1.score(progenitors_over200_xtrain, progenitors_over200_ytrain), logr1.score(progenitors_over200_xtest, progenitors_over200_ytest))

269
1.0 0.7415730337078652


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  progenitors_over200['label'] = progenitors_over200.index.str.split('_').map(lambda x: x[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  progenitors_over200['patient_num'] = progenitors_over200.index.str.split('_').map(lambda x: x[2])


In [10]:
progenitors_over200['label'] = progenitors_over200.index.str.split('_').map(lambda x: x[1])
progenitors_over200_xtrain, progenitors_over200_xtest, progenitors_over200_ytrain, progenitors_over200_ytest = train_test_split(progenitors_over200.drop(['label', 'Cells_Number_Object_Number', 'patient_num'], axis=1), progenitors_over200['label'], 
                                                                    test_size=0.3, random_state=4)
# print(len(progenitors_over200_xtrain) + len(progenitors_over200_xtest))

logr2 = LogisticRegression(random_state=2, penalty='l2', C=1, max_iter=130).fit(progenitors_over200_xtrain, progenitors_over200_ytrain)
print(logr2.score(progenitors_over200_xtrain, progenitors_over200_ytrain), logr2.score(progenitors_over200_xtest, progenitors_over200_ytest))

0.9946808510638298 0.9506172839506173


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  progenitors_over200['label'] = progenitors_over200.index.str.split('_').map(lambda x: x[1])
