# Background subtraction notebook

By: Megan Grout (groutm@ohsu.edu)

Adapted from code written by Dr. Marilyne Labrie and Nick Kendsersky


Last updated: 20191219

Import external libraries.

In [None]:
import os
import random
import re
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import matplotlib.colors as mplc
import subprocess


from scipy import signal

import plotly.figure_factory as ff
import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 
import plotly.express as px
init_notebook_mode(connected = True)

Import function written for this project.

In [None]:
from cycif_modules import *

Define function to change header names. Not encapsulated in `cycif_modules`, so that the user can change it on the fly as necessary.

In [None]:
# This may change for each experiment, so I have not sequestered
# this code in the cycif_modules.py file

# This function takes in a dataframe, changes the names
# of the column in various ways, and returns the dataframe.
# For best accuracy and generalizability, the code uses
# regular expressions (regex) to find strings for replacement.
def apply_header_changes(df):
    """Normalize the column names of ``df`` and return the same dataframe.

    The dataframe is modified in place (its ``columns`` are reassigned) and
    then returned. Changes are applied in this order:

    1. remove a lowercase ``x`` at the very beginning of a name
    2. remove a single space at the very beginning of a name
    3. replace every remaining space with an underscore
    4. fix known typos (``CKD1`` -> ``CDK1``, ``GAG3`` -> ``GATA3``)
    """
    # The two anchored patterns are regular expressions; say so explicitly,
    # because the default of `regex` differs between pandas versions and
    # a literal match on "^x" would silently change nothing.
    df.columns = df.columns.str.replace("^x", "", regex=True)
    df.columns = df.columns.str.replace("^ ", "", regex=True)
    # The remaining replacements are plain substring substitutions
    df.columns = df.columns.str.replace(" ", "_", regex=False)
    # fix typos
    df.columns = df.columns.str.replace("CKD1", "CDK1", regex=False)
    df.columns = df.columns.str.replace("GAG3", "GATA3", regex=False)
    return df

## Begin Workflow

### Get directories

In [None]:
# Base directory for project.
# Locations used previously are kept as comments for reference; only one
# assignment should be active. (The Windows path needs a raw string so the
# backslashes are not read as escape sequences.)
#base_dir = '/Users/groutm/Desktop/weewin'
#base_dir = '/Users/groutm/Desktop/reproducibility'
#base_dir = r'Z:\Marilyne\Axioscan\Gao_Zhang\Segmentation'
base_dir = '/Users/groutm/Desktop/gz_new'

# Set name of project
# for use in directory creation
#project_name = 'ww'
#project_name = 'repro'
project_name = 'gz_new'

# Set string for current step, and for previous step
# for use in file and directory naming
step_suffix = 'bs'
previous_step_suffix_long = "_qc_eda"

# Initial input data directory
#input_data_dir = r'/Users/groutm/Desktop/TMAdata'
#input_data_dir = r'/Users/groutm/Desktop/ww_data'
input_data_dir = os.path.join(base_dir, project_name + previous_step_suffix_long)

# BS directory
#output_data_dir = r'/Users/groutm/Desktop/TMAoutputdata'
#output_data_dir = r'/Users/groutm/Desktop/ww_outputdata'
output_data_dir = os.path.join(base_dir, project_name + "_" + step_suffix)

# BS images subdirectory
#output_images_dir = r'/Users/groutm/Desktop/TMAimages'
#output_images_dir = r'/Users/groutm/Desktop/wwimages'
output_images_dir = os.path.join(output_data_dir,"images")

# Metadata directories
metadata_dir = os.path.join(base_dir, project_name + "_metadata")
metadata_images_dir = os.path.join(metadata_dir,"images")

# Create necessary directories for this step, if they don't already exist.
# exist_ok=True makes this safe to re-run.
for d in [base_dir, input_data_dir, output_data_dir, output_images_dir,
          metadata_dir, metadata_images_dir]:
    os.makedirs(d, exist_ok = True)

# Change directory to location of input files; later cells open the input
# files by bare file name and rely on this.
os.chdir(input_data_dir)



Create list of samples for use in this step of workflow. Do not include file extensions or step labels.

In [None]:
# Base names of the samples whose files we want to read in.
# Must be a list of strings. Input file names are derived from
# each base sample name, the previous step substring, and the
# filetype extension.

# Earlier sample lists; each assignment is superseded by the next one.
ls_samples = [
    'TMA', 'ww1', 'ww10', 'ww11', 'ww12', 'ww13', 'ww15', 'ww16',
    'ww17', 'ww19', 'ww2', 'ww20', 'ww21', 'ww22', 'ww23', 'ww3',
    'ww4', 'ww5', 'ww6', 'ww7', 'ww8', 'ww9',
]

ls_samples = [
    'TMA1.1', 'TMA1.2', 'TMA1.3',
    'TMA2.1', 'TMA2.2', 'TMA2.3',
]

ls_samples = [
    'GZ10.1', 'GZ10.2', 'GZ10.3', 'TMA',
    'GZ7.1', 'GZ6', 'GZ7.2',
]

# Sample list in effect (the last assignment wins).
ls_samples = [
    'A_GZ2', 'B_GZ1', 'C_GZ5', 'D_GZ4', 'E_GZ3',
    'F_GZ6', 'G_GZ7', 'H_GZ9', 'I_GZ10', 'TMA',
]

## Import all metadata we need from the QC/EDA chapter

### Metadata

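This file contains the marker intensity metadata. It is expected to have the columns `Round`, `Target`, `Channel`, `target_lower`, `full_column`, `marker`, and `location`.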

In [None]:
# Full path of the marker metadata file inside the metadata directory
filename = os.path.join(metadata_dir, "marker_intensity_metadata.csv")

# Only warn if the file is missing; reading it happens in the next cell
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: "+filename)

In [None]:
# Open, read in information
metadata = pd.read_csv(filename)

# Verify size (data rows + 1 header line).
# The verification is best-effort because it did not work on every
# machine during testing; a failure is reported rather than silently
# ignored, and the workflow continues.
try:
    verify_line_no(filename, metadata.shape[0] + 1)
    print("Ran file length verification.")
except Exception as err:
    print("Skipped file length verification: " + str(err))

# Verify headers are expected
# NOTE(review): the exposure-time step later in this notebook reads an
# 'Exp' column from this table, which is not listed here - confirm.
exp_cols = ['Round','Target','Channel','target_lower','full_column','marker','location']
compare_headers(exp_cols, metadata.columns.values, "Marker metadata file")

In [None]:
# Show the first rows of the marker metadata dataframe - FYI only,
# nothing is modified here
metadata.head()

### not_intensities

This file should be a csv with the name of a single "not intensity" or "not marker" column (e.g., ROI_index, Nuc_X_Inv) on each line. No need for each item to actually be present in any dataframe.

In [None]:
# Full path of the "not intensities" list inside the metadata directory
filename = os.path.join(metadata_dir, "not_intensities.csv")

# Only warn if the file is missing; reading it happens in the next cell
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: "+filename)

In [None]:
# Open, read in information: one column name per line
with open(filename, 'r') as fh:
    # strip surrounding whitespace from the file and from each entry
    not_intensities = [line.strip() for line in fh.read().strip().splitlines()]

# Verify size. not_intensities is a plain list, so its length is len(...)
# (this file has no header line, so nothing is added).
# The verification is best-effort because it did not work on every
# machine during testing; a failure is reported and the workflow continues.
try:
    verify_line_no(filename, len(not_intensities))
    print("Ran file length verification.")
except Exception as err:
    print("Skipped file length verification: " + str(err))


# Print to console
print("not_intensities = ")
print(not_intensities)

### full_to_short_column names

In [None]:
# Full path of the full-to-short column name mapping
filename = os.path.join(metadata_dir, "full_to_short_column_names.csv")

# Only warn if the file is missing; reading it happens in the next cell
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)

In [None]:
# Open, read in information
df = pd.read_csv(filename, header = 0)

# Verify size (data rows + 1 header line).
# The verification is best-effort because it did not work on every
# machine during testing; a failure is reported and the workflow continues.
try:
    verify_line_no(filename, df.shape[0] + 1)
    print("Ran file length verification.")
except Exception as err:
    print("Skipped file length verification: " + str(err))


# Turn into dictionary: index by 'full_name', transpose so that every
# full name is a column, and take the first remaining row as the values.
full_to_short_names = df.set_index('full_name').T.to_dict('records')[0]

# Print information
print('full_to_short_names =')
print(full_to_short_names)

### short_to_full_column_names

In [None]:
# Full path of the short-to-full column name mapping
filename = os.path.join(metadata_dir, "short_to_full_column_names.csv")

# Only warn if the file is missing; reading it happens in the next cell
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)

In [None]:
# Open, read in information
df = pd.read_csv(filename, header = 0)


# Verify size (data rows + 1 header line).
# The verification is best-effort because it did not work on every
# machine during testing; a failure is reported and the workflow continues.
try:
    verify_line_no(filename, df.shape[0] + 1)
    print("Ran file length verification.")
except Exception as err:
    print("Skipped file length verification: " + str(err))


# Turn into dictionary: index by 'short_name', transpose so that every
# short name is a column, and take the first remaining row as the values.
short_to_full_names = df.set_index('short_name').T.to_dict('records')[0]

# Print information
print('short_to_full_names =')
print(short_to_full_names)

### Color information

#### Samples

In [None]:
# Full path of the per-sample color table
filename = os.path.join(metadata_dir, "sample_color_data.csv")

# Only warn if the file is missing; reading it happens in the next cell
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)

In [None]:
# Open, read in information
df = pd.read_csv(filename, header = 0)
df = df.drop(columns = ['hex'])

# our tuple of float values for rgb, (r, g, b) was read in
# as a string '(r, g, b)'. We need to extract the r-, g-, and b-
# substrings and convert them back into floats
df['rgb'] = df['rgb'].map(rgb_tuple_from_str)


# Verify size (data rows + 1 header line).
# The verification is best-effort because it did not work on every
# machine during testing; a failure is reported and the workflow continues.
try:
    verify_line_no(filename, df.shape[0] + 1)
    print("Ran file length verification.")
except Exception as err:
    print("Skipped file length verification: " + str(err))


# Turn into dictionary {Sample_ID: rgb}. Selecting the 'rgb' column by
# name avoids passing an invalid orient ('rgb') to DataFrame.to_dict.
sample_color_dict = df.set_index('Sample_ID')['rgb'].to_dict()

# Print information
print('sample_color_dict =')
print(sample_color_dict)


#### Channels

In [None]:
# Full path of the per-channel color table
filename = os.path.join(metadata_dir, "channel_color_data.csv")

# Only warn if the file is missing; reading it happens in the next cell
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: "+filename)

In [None]:
# Open, read in information
df = pd.read_csv(filename, header = 0)
df = df.drop(columns = ['hex'])

# our tuple of float values for rgb, (r, g, b) was read in
# as a string '(r, g, b)'. We need to extract the r-, g-, and b-
# substrings and convert them back into floats
df['rgb'] = df['rgb'].map(rgb_tuple_from_str)


# Verify size (data rows + 1 header line).
# The verification is best-effort because it did not work on every
# machine during testing; a failure is reported and the workflow continues.
try:
    verify_line_no(filename, df.shape[0] + 1)
    print("Ran file length verification.")
except Exception as err:
    print("Skipped file length verification: " + str(err))

# Turn into dictionary {Channel: rgb}. Selecting the 'rgb' column by
# name avoids passing an invalid orient ('rgb') to DataFrame.to_dict.
channel_color_dict = df.set_index('Channel')['rgb'].to_dict()

# Print information
print('channel_color_dict =')
print(channel_color_dict)


#### Round

In [None]:
# Full path of the per-round color table
filename = os.path.join(metadata_dir, "round_color_data.csv")

# Only warn if the file is missing; reading it happens in the next cell
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: "+filename)

In [None]:
# Open, read in information
df = pd.read_csv(filename, header = 0)
df = df.drop(columns = ['hex'])

# our tuple of float values for rgb, (r, g, b) was read in
# as a string '(r, g, b)'. We need to extract the r-, g-, and b-
# substrings and convert them back into floats
df['rgb'] = df['rgb'].map(rgb_tuple_from_str)



# Verify size (data rows + 1 header line).
# The verification is best-effort because it did not work on every
# machine during testing; a failure is reported and the workflow continues.
try:
    verify_line_no(filename, df.shape[0] + 1)
    print("Ran file length verification.")
except Exception as err:
    print("Skipped file length verification: " + str(err))

# Turn into dictionary {Round: rgb}. Selecting the 'rgb' column by
# name avoids passing an invalid orient ('rgb') to DataFrame.to_dict.
round_color_dict = df.set_index('Round')['rgb'].to_dict()

# Print information
print('round_color_dict =')
print(round_color_dict)


## Import data

In [None]:
# Determine the expected headers from the file belonging to the first
# sample (index = 0) in ls_samples.

# No directory is needed in the file name, because the current working
# directory was earlier changed to the one containing these files.
filename = ls_samples[0] + previous_step_suffix_long + ".csv"

# Read a single data row (the header is all we need) and apply the
# header changes specified in apply_header_changes
df = apply_header_changes(
    pd.read_csv(filename, index_col = 0, nrows = 1))

# Default header values that all other files are compared against
expected_headers = df.columns.values

print("df index name is currently",df.index.name)

In [None]:
# Display the single row that was read in, with its corrected headers
df.head()

In [None]:
# Report which file supplied the reference headers, then list them
header_source = ls_samples[0] + previous_step_suffix_long + ".csv"
print("Used " + header_source +
      " to determine the expected, corrected headers for all files.")
print("There headers are: \n" + ", ".join(expected_headers) + ".")

In [None]:
# Set dictionary to hold all individual sample data
dfs = {}

# Iterate over a snapshot of the sample list: samples may be removed from
# ls_samples inside the loop, and removing from a list while iterating
# over that same list would skip the following sample.
for sample in list(ls_samples):
    # Input file for this sample; the same name is used for the existence
    # check, the read, and the length verification.
    sample_file = sample + previous_step_suffix_long + ".csv"

    # Check for existence of file
    if not os.path.exists(sample_file):
        print("File " + sample_file +
              " does not exist. Removing from analysis...")
        # Remove from list if not found, so further steps won't be
        # looking for data on this sample
        ls_samples.remove(sample)
        continue

    # open the file
    # set the index to be the first (0-based indexing, so 0th)
    # column in input file.
    # use nrows = # to specify number of input rows if you want
    df = pd.read_csv(sample_file, index_col = 0)

    # Check for empty df
    # if so, don't continue trying to process df
    if df.shape[0] == 0:
        print('Zero content lines detected in ' + sample + ' file. '
              'Removing from analysis...')
        # list.remove() works in place and returns nothing, so the
        # list is not re-assigned.
        ls_samples.remove(sample)
        continue

    # Verify that the loaded df is the right length.
    # Adding 1 because we expect the header was detected during file
    # import and not counted towards the length of df.
    # Best-effort: this check did not work on all machines during testing,
    # so a failure is reported and the workflow continues.
    try:
        verify_line_no(sample_file, df.shape[0] + 1)
    except Exception as err:
        print("Skipped file length verification for " + sample_file +
              ": " + str(err))

    # Manipulations necessary for concatenation
    df = apply_header_changes(df)
    # sort the columns alphabetically
    df = df[sorted(df.columns.values)]

    # Compare headers of new df against what is expected
    compare_headers(expected_headers, df.columns.values, sample)

    # For cases where we have samples called TMA1.1, TMA1.2, TMA1.3, etc.
    # Using regular expressions (regex) to extract the characters in the
    # sample name from TMA to the following digits, stopping at the period
    #if 'ROI_index' in df.columns.values:
     #   df['ROI_slide'] = re.findall(r'(TMA\d+)',sample)[0]

    # Add to dictionary of dfs
    dfs[sample] = df


# Merge dfs into one big df
if not dfs:
    # pd.concat would raise a less helpful ValueError on an empty input
    raise ValueError("No sample files could be loaded; nothing to concatenate.")
df = pd.concat(list(dfs.values()), ignore_index=False, sort = False)
# remove dfs from memory, since it's big (relatively) and we
# don't need a data structure of all samples' data separated
# individually when we can extract information from the big
# df using the Sample_ID column
del dfs

Let's take a look at a few features to make sure our dataframe is as expected

In [None]:
# (number of rows, number of columns) of the combined dataframe
df.shape

In [None]:
# Inspect the index of the combined dataframe. The concatenation above used
# ignore_index=False, so each sample keeps the index values from its own file.
df.index

Check for NaN entries (should not be any unless columns do not align), which can result from stitching together dfs with different values in their headers.

In [None]:
# Collect the names of all columns that contain at least one null value
# and print them; nothing is printed when there are none.
null_columns = df.columns[df.isnull().any()]
if len(null_columns) > 0:
    print(null_columns)

# No output from this cell means no NaN entries were found.

Check that all expected files were imported into final dataframe by comparing our sample names to the unique values in the Sample_ID column.

In [None]:
# Check that all expected files were imported into final dataframe:
# the unique Sample_ID values must match our list of samples exactly.

if sorted(df.Sample_ID.unique()) == sorted(ls_samples):
    print("All expected filenames present in big df Sample_ID column.")
else:
    # Report the mismatch against the samples we actually expected
    # (previously a placeholder list was passed as the expected values).
    compare_headers(ls_samples, df.Sample_ID.unique(), "big df Sample_ID column")

## Filtering

#### Filter rows with 0 mean intensity

In [None]:
# Delete rows whose mean over the intensity columns is not greater than 0.
# Intensity columns are all columns NOT listed in not_intensities.
print("df.shape: ", df.shape)

# Row-wise mean computed in one vectorized call instead of a Python-level
# function call per row.
intensity_means = df.loc[:, ~df.columns.isin(not_intensities)].mean(axis = 1)
df = df.loc[intensity_means > 0, :]

print("df.shape: ", df.shape)
# The condition was changed from "!= 0" to "> 0".
# original line of code:
#df = df.loc[((df.iloc[:,3:] != 0).all(1) )]

## R Shiny Filtering App

Save file for filtering app.

I found that the R Shiny apps work better with a smaller dataset. Here, I create a dataframe of only 10,000 rows, where the proportion of rows attributed to each sample is maintained, by setting 'original' in `create_subset`.

In [None]:
# Requested number of rows for the subset handed to the filtering app
subset_row_count = 10000

In [None]:
# Build the subset for the filtering app, grouped on Sample_ID, with the
# requested row count and the 'original' ratio setting.
subset_df = create_subset(df, 'Sample_ID', subset_row_count, 'original')

How many lines for each sample ID are in our subset df?

In [None]:
# Number of subset rows per Sample_ID, ordered by Sample_ID
subset_df['Sample_ID'].value_counts().sort_index()

How do the proportions of cells in the original and subset dfs compare?

In [None]:
# Fraction of rows per Sample_ID in the full dataframe
df['Sample_ID'].value_counts().sort_index()/df.shape[0]

In [None]:
# Fraction of rows per Sample_ID in the subset, for comparison with the
# full dataframe
subset_df['Sample_ID'].value_counts().sort_index()/subset_df.shape[0]

In [None]:
# Write the subset to the output data directory, without the index column
filename = os.path.join(output_data_dir, "for_filtering_app.csv")
subset_df.to_csv(filename, index = False)

## Perform filtering

In [None]:
# As a reminder to us, here are the features (column names) available on
# which we can filter...
df.columns.values

### Filter on nuc size and AF

In [None]:
# Keep only cells whose nucleus size lies strictly between 60 and 600,
# then drop objects with a high AF555 signal (RBCs)
df = df.loc[(df['Nucleus_Size'] > 60) & (df['Nucleus_Size'] < 600)]
print("Number of cells after filtering on nucleus size:", df.shape[0])

# Alternative AF555 columns that could be filtered on instead:
#df = df.loc[(df['AF555_Cytoplasm_Intensity_Average'] < 2500)]
#df = df.loc[(df['AF555_Nucleus'] < 3000)]
af555_is_low = df['AF555_Cell_Intensity_Average'] < 2500
df = df.loc[af555_is_low]
print("Number of cells after filtering on AF555 ___ intensity:", df.shape[0])

## Assign Cell types

In [None]:
#assign tumor cells

def assign_cell_type(row):
    """Classify one cell (a dataframe row) from marker intensity thresholds.

    Returns 'CANCER' if any of the CKs / Ecad / NCad cytoplasm averages
    exceeds its threshold, otherwise 'IMMUNE' if any of the CD45 / CD4 /
    CD68 cytoplasm averages exceeds its threshold, otherwise 'STROMA'.
    The labels match the ones used by the rest of this notebook.
    """
    if (row['CKs_Cytoplasm_Intensity_Average'] > 2000) \
        or (row['Ecad_Cytoplasm_Intensity_Average'] > 1800) \
        or (row['NCad_Cytoplasm_Intensity_Average'] > 2200):
        return 'CANCER'
    if (row['CD45_Cytoplasm_Intensity_Average'] > 4500) \
        or (row['CD4_Cytoplasm_Intensity_Average'] > 3000) \
        or (row['CD68_Cytoplasm_Intensity_Average'] > 3200):
        return 'IMMUNE'
    return 'STROMA'

###############
### WARNING ###
###############
# randomly assigning here just for development purposes
# comment out this version of assign_cell_type() when
# running the code 'for real'.
# As long as it is present, this definition REPLACES the threshold-based
# one above, because it is defined later under the same name. No random
# seed is set, so the assignment differs from run to run.
def assign_cell_type(row):
    """Development placeholder: return a random cell type, ignoring ``row``.

    Draws an integer from 0-9: 0-1 -> 'IMMUNE', 2-4 -> 'CANCER',
    5-9 -> 'STROMA'.
    """
    n = np.random.randint(0,10)
    if n < 2:
        return 'IMMUNE'
    if n < 5:
        return 'CANCER'
    return 'STROMA'

# First create a cell_type column and make it blank
df['cell_type'] = ''

# Then iterate through each sample and perform the cell type assignment as necessary
my_list = [] # put sample names here
my_list = ls_samples.copy() # comment this line out
for sample in my_list:
    in_sample = df['Sample_ID'] == sample
    # Nothing to assign if no rows of this sample are left
    if not in_sample.any():
        continue
    # Classify only this sample's rows (not the whole dataframe once per
    # sample). The results are in the same row order as the selection, so
    # they are assigned positionally via .values instead of by index label.
    df.loc[in_sample, 'cell_type'] = \
        df.loc[in_sample].apply(assign_cell_type, axis = 1).values

For saving parameters to file:

In [None]:
# Create empty dataframe to hold parameters.
# 'a' and 'b' are placeholder column names; the next cells add a
# placeholder row and write the table to file.

my_cols = ['Sample_ID','a','b']
filtering_params = pd.DataFrame(columns = my_cols)

In [None]:
# Display the (still empty) parameter table
filtering_params

In [None]:
# Add your param info.
# pd.concat is used because DataFrame.append is not available in
# newer pandas versions.
new_params = pd.DataFrame(
    {'Sample_ID':['my sample'],
     'a':['parameter value'],
     'b':['parameter value']})
filtering_params = pd.concat([filtering_params, new_params], sort = True)

filtering_params

In [None]:
# Save to file

# The file name must be assigned to `filename` itself: otherwise the path
# left over from an earlier cell is reused and that file gets overwritten.
filename = "filtering_params.csv"
filename = os.path.join(output_data_dir, filename)
filtering_params.to_csv(filename, index = False)

Save counts of each cell type to file

In [None]:
# Counts of each cell type within each Sample_ID: write to file, then print
cell_series = df.groupby('Sample_ID')['cell_type'].value_counts()
filename = os.path.join(output_data_dir, "counts_by_sample_ID_by_cell_type.csv")
cell_series.to_csv(filename, header = True)
print("Cell counts by Sample_ID, by cell type:")
print(cell_series)

# Counts of each cell type over all samples: write to file, then print
cell_series = df['cell_type'].value_counts()
filename = os.path.join(output_data_dir, "counts_by_cell_type.csv")
cell_series.to_csv(filename, header = True)
print("Cell counts by cell type:")
print(cell_series)

### Establish colors to use throughout workflow

Cell type - want colors that are categorical, since Cell Type is a non-ordered category. A categorical color palette will have dissimilar colors.

In [None]:
# One categorical color per cell type, taken from seaborn's "hls" palette.
# Each color value is a tuple of three values: (R, G, B)
cell_types = ['STROMA','CANCER','IMMUNE']
color_values = sb.color_palette("hls", n_colors = len(cell_types))

# For comparison, the cell types actually present in the data
print("Unique cell types are:",df['cell_type'].unique())

# Display the chosen colors
sb.palplot(sb.color_palette(color_values))

# TODO: allow for user-input of named colors

Store in a dictionary

In [None]:
# Map each cell type to its color; pairs are formed by position in
# cell_types and color_values
celltype_color_dict = dict(zip(cell_types, color_values))

In [None]:
# Display the cell type -> color mapping
celltype_color_dict

In [None]:
## Example of how you might specify the color values yourself,
# derived from the QC/EDA chapter

#celltype_color_dict['CANCER'] = mplc.to_rgb('xkcd:dark sky blue')
#celltype_color_dict['IMMUNE'] = mplc.to_rgb('xkcd:reddish orange')
#celltype_color_dict['STROMA'] = mplc.to_rgb('xkcd:jungle green')

# Show the colors currently in the dictionary, in the order
# immune, stroma, cancer
sb.palplot(sb.color_palette(
    [celltype_color_dict[cell_type]
     for cell_type in ('IMMUNE', 'STROMA', 'CANCER')]))


#### Save color information (mapping and legend) to metadata directory

In [None]:
# Turn the cell type color dictionary into a dataframe
celltype_color_df = color_dict_to_df(celltype_color_dict, "cell_type")

# Write it to the metadata directory, without the index column
filename = os.path.join(metadata_dir, "celltype_color_data.csv")
celltype_color_df.to_csv(filename, index = False)

In [None]:
# Legend of cell type info only

# A tiny figure with its axes switched off: only the legend is of interest
g  = plt.figure(figsize = (1,1)).add_subplot(111)
g.axis('off')

# Zero-size bars serve purely as colored legend handles, one per cell type
handles = []
for item in celltype_color_dict.keys():
    h = g.bar(0,0, color = celltype_color_dict[item],
              label = item, linewidth =0)
    handles.append(h)

# (no trailing comma here, so first_legend is the legend itself, not a tuple)
first_legend = plt.legend(handles=handles, loc='upper right', title = 'Cell type')


filename = "Celltype_legend.png"
filename = os.path.join(metadata_images_dir, filename)
plt.savefig(filename, bbox_inches = 'tight')

#### Plot the post-filtering nucleus sizes, colored by cell type

In [None]:
# dfs, df_names and colors are parallel lists: entry i of each must refer
# to the same cell type (here: stroma, immune, cancer).
make_distr_plot_per_sample(title = "Post-filtering nucleus sizes",
                           dfs = [df.loc[df['cell_type'] == 'STROMA',:],
                                  df.loc[df['cell_type'] == 'IMMUNE',:],
                                  df.loc[df['cell_type'] == 'CANCER',:]],
                           df_names = ['Stroma','Immune','Cancer'],
                           colors = [celltype_color_dict['STROMA'],
                                     celltype_color_dict['IMMUNE'],
                                     celltype_color_dict['CANCER']],
                           x_label = "Nucleus Size",
                           legend = False,
                           markers = ['Nucleus_Size'],
                           location = output_images_dir)

# note that the traces are layered on top of each other, so if the tallest trace is in front,
# then the other two will be tinted accordingly. You can change around the order of the data
# in the parameter dfs = [] above, but don't forget to also change the order of the
# df_names = [] and colors = [] parameters accordingly!!

#### Bar plots

In [None]:
# Get counts for each Sample_ID, sorted by Sample_ID.
# The column is named explicitly, because the name pandas gives to the
# result of value_counts() differs between versions.
sample_counts = pd.DataFrame(
    {'counts': df['Sample_ID'].value_counts()}).sort_index()
# Drop any index name so it cannot clash with the Sample_ID column below
sample_counts = sample_counts.rename_axis(None)
sample_counts['Sample_ID'] = sample_counts.index
#counts['color'] = counts.apply(lambda row: color_dict[row['Sample_ID']], axis = 1)

# Per-sample counts of each cell type
# There should be a better way to do this with 'groupby' or something
stroma_counts = pd.DataFrame({'stroma':
    df.loc[
        df['cell_type'] == 'STROMA', 'Sample_ID'].value_counts()}).sort_index()

immune_counts = pd.DataFrame({'immune':
    df.loc[
        df['cell_type'] == 'IMMUNE', 'Sample_ID'].value_counts()}).sort_index()

cancer_counts = pd.DataFrame({'cancer':
    df.loc[
        df['cell_type'] == 'CANCER', 'Sample_ID'].value_counts()}).sort_index()

counts = pd.concat([sample_counts, stroma_counts,cancer_counts,immune_counts],
                   axis = 1, sort = False)

# A sample without any cells of a given type is missing from that type's
# counts and would show up as NaN here; it really has zero such cells.
type_cols = ['stroma', 'cancer', 'immune']
counts[type_cols] = counts[type_cols].fillna(0).astype(int)
counts.head()

Establish the proportional breakdown of each cell type in the dataframe for relevant plots.

In [None]:
def get_perc(row, cell_type):
    """Return the share of ``cell_type`` in ``row`` as a percentage.

    ``row`` must provide the keys 'stroma', 'immune' and 'cancer';
    ``cell_type`` is one of those keys. The result is the value for
    ``cell_type`` divided by the sum of the three, times 100, rounded
    to one decimal place.
    """
    all_cells = sum(row[kind] for kind in ('stroma', 'immune', 'cancer'))
    share = row[cell_type] / all_cells
    return round(share * 100, 1)

# Add one '<type>_perc' column per cell type, computed row by row
for kind in ('stroma', 'immune', 'cancer'):
    counts[kind + '_perc'] = counts.apply(
        lambda row: get_perc(row, kind), axis = 1)

In [None]:
# By sample ID only

# Establish figure
fig = go.Figure()
title = 'Post-filtering BS Cell counts by Sample ID'

# One bar trace per sample, so that each bar gets that sample's color
for sample in ls_samples:
    sample_rows = counts.loc[counts['Sample_ID'] == sample]
    bar_color = 'rgb' + str(sample_color_dict[sample])
    fig.add_trace(go.Bar(
        x = sample_rows['Sample_ID'],
        y = sample_rows['counts'],
        text = sample_rows['counts'],
        textposition = 'outside',
        marker = dict(color = bar_color),
        showlegend = False))

# Update aesthetic parameters
fig.update_layout(title = title, plot_bgcolor = 'white')
fig.update_xaxes(title_text = "Sample ID", linecolor = 'black')
fig.update_yaxes(title_text = "Cell count", linecolor = "black")

# Figure output: the file name is the title with spaces replaced by underscores
#plot(fig)
fig.write_image(output_images_dir + "/" + title.replace(" ","_") + ".png")


In [None]:
# By sample ID and cell type - proportion

title = 'BS Cell proportions by Sample ID and tissue type'

# Establish figure and plot all three cell types in one go.
# y holds the '<type>_perc' columns, i.e. percentages, not counts.
fig = go.Figure(data=[
    go.Bar(name='Stroma', x=counts['Sample_ID'], y=counts['stroma_perc'],
           text = counts['stroma_perc'], textposition='auto',
           marker_color = 'rgb' + str(celltype_color_dict['STROMA'])),
    go.Bar(name='Immune', x=counts['Sample_ID'], y=counts['immune_perc'],
           text = counts['immune_perc'], textposition='auto',
           marker_color = 'rgb' + str(celltype_color_dict['IMMUNE'])),
    go.Bar(name='Cancer',x=counts['Sample_ID'], y=counts['cancer_perc'],
           text = counts['cancer_perc'], textposition='auto',
           marker_color = 'rgb' + str(celltype_color_dict['CANCER']))
])

# Adjust aesthetic parameters; bars of one sample are stacked
fig.update_layout(title = title, plot_bgcolor = 'white',barmode ='stack')
fig.update_xaxes(title = "Sample", linecolor = 'black')
# The axis shows percentages, so label it accordingly
fig.update_yaxes(title = "Cell proportion (%)", linecolor = 'black')

# Figure output: the file name is the title with spaces replaced by underscores
#plot(fig)
fig.write_image(output_images_dir + "/" + title.replace(" ","_") + ".png")


In [None]:
# By sample ID and cell type - count

title = 'BS Cell counts by Sample ID and tissue type'

# One bar trace per cell type. For a color-dictionary key such as 'STROMA',
# the trace is named 'Stroma' and reads the 'stroma' column of counts.
bars = []
for type_key in ('STROMA', 'IMMUNE', 'CANCER'):
    type_col = type_key.lower()
    bars.append(go.Bar(
        name = type_key.capitalize(),
        x = counts['Sample_ID'],
        y = counts[type_col],
        text = counts[type_col],
        textposition = 'auto',
        marker_color = 'rgb' + str(celltype_color_dict[type_key])))

# Establish figure with all three cell types in one go
fig = go.Figure(data = bars)

# Adjust aesthetic parameters; bars of one sample are stacked
fig.update_layout(title = title, plot_bgcolor = 'white',barmode ='stack')
fig.update_xaxes(title = "Sample", linecolor = 'black')
fig.update_yaxes(title = "Cell count", linecolor = 'black')

# Figure output: the file name is the title with spaces replaced by underscores
#plot(fig)
fig.write_image(output_images_dir + "/" + title.replace(" ","_") + ".png")


### Continue with BS

#### Divide each marker (and its location) by the right exposure setting for each group of samples

In [None]:
def divide_exp_time(col, exp_col):
    """Divide one intensity column by its exposure time.

    ``col`` is a dataframe column (a Series); its name is looked up in
    the 'full_column' column of the global ``metadata`` table, and the
    first matching value in the metadata column ``exp_col`` is used as
    the divisor. Raises IndexError when the column name has no match
    in ``metadata``.
    """
    is_this_column = metadata['full_column'] == col.name
    exposure = metadata.loc[is_this_column, exp_col].values[0]
    return col / exposure

# Earlier variant, kept for reference: separate exposure columns
# ('Exp_TMA1', 'Exp_TMA2') applied per slide via an 'ROI_slide' column.
#df.loc[df['ROI_slide'] == 'TMA1',~df.columns.isin(not_intensities)] = \
#    df.loc[df['ROI_slide'] == 'TMA1',~df.columns.isin(not_intensities)].apply(
#    lambda column: divide_exp_time(column, 'Exp_TMA1'), axis = 0)

#df.loc[df['ROI_slide'] == 'TMA2',~df.columns.isin(not_intensities)] = \
#    df.loc[df['ROI_slide'] == 'TMA2',~df.columns.isin(not_intensities)].apply(
#    lambda column: divide_exp_time(column, 'Exp_TMA2'), axis = 0)

# Operate only on the intensity columns, i.e. the columns that are NOT
# listed in not_intensities (note the ~ in the column selection).
# Divide each of these columns by the appropriate exposure time, taken from
# the 'Exp' column of the metadata dataframe.
# NOTE(review): 'Exp' is not among the headers checked when the metadata file
# was read in - confirm the metadata file actually provides it.
df.loc[:, ~df.columns.isin(not_intensities)] = \
    df.loc[:, ~df.columns.isin(not_intensities)].apply(
    lambda column: divide_exp_time(column, 'Exp'), axis = 0)

#### Do background subtraction

In [None]:
# Reminder of what the marker metadata looks like; the background
# subtraction below looks up 'full_column', 'location', 'Channel' and
# 'target_lower' in this table.
metadata.head()


In [None]:
#metadata.to_csv("/Users/groutm/Desktop/metadata_for_PCA_test.csv", index = False)

### Perform background subtraction

In [None]:
def do_background_sub(col):
    """Subtract the matching autofluorescence (AF) column from ``col``.

    The name of ``col`` is looked up in the 'full_column' column of the
    global ``metadata`` table to get its location and channel. The AF
    baseline is the metadata entry with the same channel and location
    whose 'target_lower' value matches the pattern ``af`` followed by
    three digits. That column of the global ``df`` is subtracted from
    ``col``. An AF column is returned unchanged, so that it is not
    subtracted from itself. Raises IndexError when a lookup finds no
    matching metadata entry.
    """
    is_this_column = metadata['full_column'] == col.name
    location = metadata.loc[is_this_column, 'location'].values[0]
    channel = metadata.loc[is_this_column, 'Channel'].values[0]

    # right channel, right location, right marker (AF baseline)
    is_af_baseline = (
        (metadata['Channel'] == channel)
        & (metadata['location'] == location)
        & (metadata['target_lower'].str.contains(r'^af\d{3}$')))
    af_target = metadata.loc[is_af_baseline, 'full_column'].values[0]

    if af_target != col.name:
        return col - df.loc[:, af_target]
    # col is itself the AF baseline: leave it as it is
    return col


# Operate only on the intensity columns, i.e. the columns that are NOT
# listed in not_intensities (note the ~ in the column selection).
# Subtract from non-AF columns the appropriate AF column value for that cell;
# AF columns themselves are returned unchanged by do_background_sub.
df.loc[:,~df.columns.isin(not_intensities)] = \
    df.loc[:,~df.columns.isin(not_intensities)].apply(
    lambda column: do_background_sub(column), axis = 0)

### Adjust for outliers

In [None]:
# Cap outliers: any value above the (1 - outlier_percent) quantile of its own
# column is replaced by that quantile.
outlier_percent = 0.005

# Work on the intensity columns only (those NOT listed in not_intensities)
intensity_values = df.loc[:, ~df.columns.isin(not_intensities)]

# Allowable ceiling value for each feature
upper_lim = intensity_values.quantile(1 - outlier_percent)

# True wherever a value exceeds the ceiling of its column
upper_outliers = intensity_values > upper_lim

# Write the frame back with every exceeding value set to its ceiling
df.loc[:, ~df.columns.isin(not_intensities)] = intensity_values.mask(
    upper_outliers, upper_lim, axis = 1)



In [None]:
# Display the upper limit (ceiling value) computed for each marker
upper_lim

#### Set values < 0 to 0

In [None]:
# Set anything that is below 0 to 0, so that we can do the log transform.
# Only the intensity columns (those NOT listed in not_intensities) are touched.
# clip(lower = 0) floors all of them in one vectorized call instead of looping
# over the columns one by one; missing values are left as they are.
intensity_mask = ~df.columns.isin(not_intensities)
df.loc[:, intensity_mask] = df.loc[:, intensity_mask].clip(lower = 0)


### Drop AF columns

We have no use for AF columns after background subtraction.

In [None]:
# Keep every column whose name does NOT start with 'AF' followed by three digits.
# Regex breakdown:
#   ^            anchor at the start of the column name
#   (?!AF\d{3})  negative lookahead: reject names beginning with 'AF' + 3 digits
#   .*           otherwise match (and therefore keep) the column
# The pattern is a raw string so that '\d' reaches the regex engine as written
# instead of being read as an invalid Python string escape.
df = df.filter(regex=r'^(?!AF\d{3}).*')

## annotate the regex

## Further background subtraction visualizations

### Heatmap

We will only be plotting ~10k cells in the interest of time/computing resources. We want these 10k lines in our original df to be sampled randomly, without replacement, with the caveat that the proportions of all samples in the data remain the same in this subset. If the dataframe has fewer than 10k rows, then we will proceed with the entire dataset.

In [None]:
# Row count handed to create_subset when building the heatmap subset
subset_row_count = 10000

In [None]:
# Build the plotting subset of df with create_subset, keyed on Sample_ID,
# sized by subset_row_count and using the 'equal' mode.
# NOTE(review): the text above asks for sample proportions to be preserved, and the
# XY-plot section calls create_subset with 'original' — confirm 'equal' is intended here.
subset_df = create_subset(df, 'Sample_ID', subset_row_count, 'equal')

How many lines for each sample ID are in our subset df?

In [None]:
# Number of subset rows per Sample_ID, ordered by Sample_ID
subset_df['Sample_ID'].value_counts().sort_index()

How do the proportions of cells in the original and subset dfs compare?

In [None]:
# Share of all rows in the full df that belongs to each Sample_ID
df['Sample_ID'].value_counts().sort_index()/df.shape[0]

In [None]:
# Share of all rows in the subset that belongs to each Sample_ID
subset_df['Sample_ID'].value_counts().sort_index()/subset_df.shape[0]

### Get data structures to map colors to columns and rows...

## Row colors

For the row colors, we essentially just need to map the information in a given feature to the colors that correspond to that value in the right color dictionary. For example, the values might be sample_3, sample_3, sample_4, so we need the row colors to be (1, 1, 1), (1, 1, 1), (0, 0.25, 0.6). These are the initial colors--if we are clustering rows or columns, the labels will still match the data with which they're associated.

In [None]:
# Translate each row's Sample_ID into its color from sample_color_dict
sample_row_colors = subset_df['Sample_ID'].map(sample_color_dict)

# Peek at a few of the mapped colors
sample_row_colors[1:5]

## Column colors

For column colors, matching up the information in each column with the appropriate color is more difficult. 

In [None]:
# Goal: give every intensity (marker) column the color of its channel,
# just as each row was given the color of its sample.

# Names of the intensity columns, i.e. the columns NOT listed in not_intensities
intensity_col_names = subset_df.loc[:,~subset_df.columns.isin(not_intensities)].columns.values

# Left table (L): the column names as a one-column dataframe; that column is named 0.
# Right table (R): the metadata df, matched on "full_column" (aka df header name).
# how = 'left' keeps every L row, matched or not. 'inner' would keep only the
# matched cases, and 'right' the matched cases plus any unmatched R rows.
channel_lookup = pd.merge(
    pd.DataFrame(pd.Series(intensity_col_names)),
    metadata, how = 'left', left_on = 0, right_on = 'full_column')

# Channel of each column, translated into its color
column_channel_colors = channel_lookup['Channel'].map(channel_color_dict)

# Index the resulting series of colors by the column names they belong to
column_channel_colors.index = intensity_col_names

column_channel_colors.head()

In [None]:
# Goal: give every intensity (marker) column the color of its round,
# just as each row was given the color of its sample.

# Names of the intensity columns, i.e. the columns NOT listed in not_intensities
intensity_col_names = subset_df.loc[:,~subset_df.columns.isin(not_intensities)].columns.values

# Left table (L): the column names as a one-column dataframe; that column is named 0.
# Right table (R): the metadata df, matched on "full_column" (aka df header name).
# how = 'left' keeps every L row, matched or not. 'inner' would keep only the
# matched cases, and 'right' the matched cases plus any unmatched R rows.
round_lookup = pd.merge(
    pd.DataFrame(pd.Series(intensity_col_names)),
    metadata, how = 'left', left_on = 0, right_on = 'full_column')

# Round of each column, translated into its color
column_round_colors = round_lookup['Round'].map(round_color_dict)

# Index the resulting series of colors by the column names they belong to
column_round_colors.index = intensity_col_names

column_round_colors.head()

### Annotations data structure

In [None]:
# Create the data structure that holds everything needed for row/column
# annotations. annotations is a dictionary.
## IMPORTANT - if you use 'annotations', it MUST have both 'rows' and 'cols'
## objects inside. These can be empty lists, but they must be there!
annotations = {}

# Row annotations: a list with one dictionary per annotation, each holding all
# of the data pertaining to that annotation. The list order (e.g., Sample, then
# Cluster) is the order anticipated on the figure.
row_annotations = [
    {'label':'Sample','type':'row','mapping':sample_row_colors,'dict':sample_color_dict,
     'location':'center left','bbox_to_anchor':(0, 0.5)},
]
# Add all row information into the annotations dictionary
annotations['rows'] = row_annotations

# Column annotations follow the same layout.
# Each label names what its colors encode: the second entry maps columns to
# channel colors, so it is labelled 'Channel'.
col_annotations = [
    {'label':'Round','type':'column','mapping':column_round_colors,'dict':round_color_dict,
     'location':'upper right','bbox_to_anchor':(1,0.50)},
    {'label':'Channel','type':'column','mapping':column_channel_colors,'dict':channel_color_dict,
     'location':'upper right','bbox_to_anchor':(1,0.75)},
]
annotations['cols'] = col_annotations

#### Actually plot the heatmap

In [None]:
# Only the intensity columns of the subset (those NOT listed in
# not_intensities) are plotted.
heatmap_data = subset_df.loc[:,~subset_df.columns.isin(not_intensities)]

heatmap_function(
    data = heatmap_data,
    title = "Background substraction heatmap",
    # clustering method, distance metric, and color map
    method = 'ward', metric = 'euclidean',cmap = 'coolwarm',
    # colorbar (legend coloring of main plot)
    cbar_kws = {'label':'BS Intens.'},
    # x tick labels: the short nickname of every plotted column, in the same
    # order as the columns handed over under 'data'
    xticklabels = [full_to_short_names[name] for name in heatmap_data.columns.values],
    # location handed to heatmap_function for saving its output
    save_loc = output_images_dir,
    # Boolean values for clustering
    row_cluster = True, col_cluster = True,
    # row/column annotations established above
    annotations = annotations
          )

   #### XY plot - one per sample, colored by cell type

I needed to make a subset of the original df, since the original was too big for my computer to plot

In [None]:
# Row count handed to create_subset when building the XY-plot subset
subset_row_count = 10000

In [None]:
# Rebuild subset_df for the XY plots, this time with the 'original' mode of create_subset
# (presumably keeps each sample's original share of cells — confirm against create_subset)
subset_df = create_subset(df, 'Sample_ID', subset_row_count, 'original')

How many lines for each sample ID are in our subset df?

In [None]:
# Number of subset rows per Sample_ID, ordered by Sample_ID
subset_df['Sample_ID'].value_counts().sort_index()

How do the proportions of cells in the original and subset dfs compare?

In [None]:
# Share of all rows in the full df that belongs to each Sample_ID
df['Sample_ID'].value_counts().sort_index()/df.shape[0]

In [None]:
# Share of all rows in the subset that belongs to each Sample_ID
subset_df['Sample_ID'].value_counts().sort_index()/subset_df.shape[0]

#### Perform plotting

In [None]:
# One XY scatter map per sample; every cell type gets its own colored trace.

x_feature = 'Nuc_X'
y_feature = 'Nuc_Y_Inv'

for sample in ls_samples:
    # Extract x/y and cell type info for all subset cells of this sample
    location_colors = subset_df.loc[subset_df['Sample_ID'] == sample,[x_feature,y_feature,'cell_type']]

    # Establish figure
    fig = go.Figure()
    title = sample + " Background Subtracted XY Map"

    # Iterate through the cell types present in this sample and plot each trace.
    # location_colors already holds only this sample's rows, so there is no
    # need to filter subset_df a second time.
    for celltype in location_colors['cell_type'].unique():
        is_celltype = location_colors['cell_type'] == celltype
        fig.add_scatter(
            mode = 'markers',
            marker = dict(
                # size is dot size; opacity runs from 0 (transparent) to 1 (fully opaque)
                size = 2, opacity = 0.70,
                color = 'rgb' + str(celltype_color_dict[celltype])
                # optional line around each marker:
                #, line = dict(width = 2, color = 'gray')
            ),
            x = location_colors.loc[is_celltype,x_feature],
            y = location_colors.loc[is_celltype,y_feature],
            name = celltype)

    # Update aesthetic parameters
    fig.update_layout(title = title, plot_bgcolor = 'white',
                     legend= {'itemsizing': 'constant'}) # make the legend dots a bit bigger
    fig.update_xaxes(title_text = x_feature, linecolor = 'black')
    fig.update_yaxes(title_text = y_feature, linecolor = 'black')

    # Figure output
    #plot(fig)
    # Build the path with os.path.join, like the other output cells do
    fig.write_image(os.path.join(output_images_dir, title.replace(" ","_") + ".png"))



### R Shiny - PCA

Here might be a good place to perform PCA. You can save an output file like so:

In [None]:
# Write the current subset to for_PCA.csv inside output_data_dir,
# leaving the row index out of the file
filename = os.path.join(output_data_dir, "for_PCA.csv")
subset_df.to_csv(filename, index = False)

### Distributions

#### One per sample, one per marker

In [None]:
# per-sample, per-marker distribution plots

# The plots use the short marker names, so df temporarily carries the short
# column names while plotting.
df = df.rename(columns = full_to_short_names)

try:
    # Marker columns = every column that is not listed in not_intensities
    markers_to_plot = [m for m in df.columns.values if m not in not_intensities]

    # Alternative sample selections:
    #for sample in [s for s in ls_samples if df.loc[df['Sample_ID'] == s,:].shape[0] > 0]:
    #for sample in ['ww10','TMA','ww12']:
    for sample in ls_samples:
        # Rows of this sample, selected once instead of once per marker
        sample_rows = df.loc[df['Sample_ID']==sample,:]
        for marker in markers_to_plot:
            make_distr_plot_per_sample(
                title = sample + " " + marker + " BS",
                # every call receives its own copy of the sample's rows
                dfs = [sample_rows.copy()],
                df_names = [sample],
                colors = [sample_color_dict[sample]],
                x_label = "Intensity",
                legend = False,
                markers = marker,
                location = output_images_dir)
finally:
    # Restore the full column names even when a plot call raised; otherwise
    # df would be left with short names and later cells would break.
    df = df.rename(columns = short_to_full_names)


#### Distributions - one per channel

All samples are represented individually on each plot

In [None]:
# Show Target and Channel of every metadata row whose target_lower is
# 'af' followed by exactly three digits (the AF entries)
metadata.loc[ (metadata['target_lower'].str.contains(r'^af\d{3}$')), ['Target','Channel']]

In [None]:
# One distribution plot per channel, overlaying every column of that channel
# that is still present in df.
for c in metadata.Channel.unique():
    in_channel = metadata['Channel'] == c

    # Column names the metadata lists for this channel
    my_cols = metadata.loc[in_channel, 'full_column']

    # Keep the ones that exist in df and switch them to their short names
    channel_data = df.loc[:,[f for f in my_cols if f in df.columns]].rename(
        columns = full_to_short_names)

    # 'Target' of this channel's AF entry, used in the plot title
    wavelength = metadata.loc[
        in_channel & (metadata['target_lower'].str.contains(r'^af\d{3}$')),
        'Target'].values[0]

    make_distr_plot_per_sample(
        title = "Distribution of " + wavelength + " markers across all BS cells",
        dfs = [channel_data],
        df_names = [''],
        # one color per plotted column
        colors = sb.color_palette("hls",n_colors = channel_data.shape[1]),
        x_label = "Intensity",
        legend = True,
        location = output_images_dir, not_intensities = not_intensities)

## make sure it's clear how to set xlims manually for these plots

### Scatterplot - cell size vs nucleus size, color = nucleus roundness

Useful now that we have performed filtering. Note that real workflow will use the appropriate features that are lacking in this dataset.

In [None]:
# Show which of df's columns are listed in not_intensities
df.columns[df.columns.isin(not_intensities)]

In [None]:
# Establish labeling
title = "Nucleus size by cell size for post-filtering data"
# The axis labels must follow the columns plotted below:
# Cell_Size is drawn on the x-axis and Nucleus_Size on the y-axis.
x_label = "Cell Size"
y_label = "Nucleus Size"

# Create plot from the subset dataframe; points are colored by Nucleus_Roundness
fig = px.scatter(subset_df, x="Cell_Size", y="Nucleus_Size",
                 color='Nucleus_Roundness')

fig.update_layout(title_text=title, font=dict(size=18),
        plot_bgcolor = 'white', showlegend = True )

# Adjust opacity of traces and size of marker
fig.update_traces(opacity=0.6, marker ={'size':5})
# Adjust x-axis parameters
fig.update_xaxes(title_text = x_label, showline=True, linewidth=2, linecolor='black',
        tickfont=dict(size=18))
# Adjust y-axis parameters
fig.update_yaxes(title_text = y_label, showline=True, linewidth=2, linecolor='black',
        tickfont=dict(size=18))

# Figure output
#plot(fig)
filename = title.replace(" ","_") + ".png"
filename = os.path.join(output_images_dir, filename)
fig.write_image(filename)

### Drop any other rows or columns we want to before saving data

In [None]:
# Let's take a look at the column names currently in df
df.columns.values

For the sake of example, I will operate on a copy of df, called df_copy

In [None]:
# You MUST call df.copy() here.
# A plain 'df_copy = df' would only
# give you a second name for the
# SAME dataframe, so operating on one
# would also operate on the other.
df_copy = df.copy()

#### Operate on entire rows or columns

In [None]:
# Drop columns
# List the names of the columns to remove in my_cols;
# the empty list used here drops nothing.
my_cols = []
df_copy = df_copy.drop(columns = my_cols)

In [None]:
# Keep only specific columns
# Put the names of the columns to keep in my_cols.
my_cols = []
# For this example we keep every column currently in df_copy, so nothing changes.
# The names are taken from df_copy rather than df: a column that has already
# been dropped from df_copy cannot be selected and would raise a KeyError.
my_cols = df_copy.columns.values
df_copy = df_copy.loc[:,my_cols]

#### Operate on rows and columns using filtering criteria

In [None]:
# Keep only the rows that meet a criterion

# df.loc[] does the filtering
# df.loc[rows,columns]
# df.loc[:,certain_cols] --> all rows ':', only certain cols
# df.loc[certain_rows,:] --> only certain rows, all cols ':'

# Example: keep only certain values of Sample_ID
print(df_copy.Sample_ID.unique())
keep = ['TMA1.1','TMA1.2','TMA1.3','TMA2.1','TMA2.2','TMA2.3']
# True for every row whose Sample_ID is in keep
is_kept = df_copy['Sample_ID'].isin(keep)
df_copy = df_copy.loc[is_kept,:]
print(df_copy.Sample_ID.unique())

In [None]:
# Filter on multiple criteria
# '&' combines two conditions with a logical AND
# '|' combines two conditions with a logical OR
# Conditions written inline MUST each be wrapped in parentheses;
# naming each condition first, as done here, makes that unnecessary.
in_tma1 = df_copy['Sample_ID'].isin(['TMA1.1','TMA1.2','TMA1.3'])
in_tma2 = df_copy['Sample_ID'].isin(['TMA2.1','TMA2.2','TMA2.3'])
df_copy = df_copy.loc[in_tma1 | in_tma2,:]
print(df_copy.Sample_ID.unique())

In [None]:
# Remove the rows that meet certain criteria
# The tilde '~' negates a condition: a row is kept only when it is
# in neither of the two groups.
in_tma1 = df_copy['Sample_ID'].isin(['TMA1.1','TMA1.2','TMA1.3'])
in_tma2 = df_copy['Sample_ID'].isin(['TMA2.1','TMA2.2','TMA2.3'])
df_copy = df_copy.loc[(~in_tma1) & (~in_tma2),:]
print(df_copy.Sample_ID.unique())

### Save the data by Sample_ID

In [None]:
# Check for existence of each output file first and report the ones that
# are already there.
for sample in ls_samples:
    # Same path construction as the save cell, so both cells always refer
    # to the same file
    filename = os.path.join(output_data_dir, sample + "_" + step_suffix + ".csv")
    if os.path.exists(filename):
        print("File by name "+filename+" already exists.")

In [None]:
# Write one csv per sample: <sample>_<step_suffix>.csv inside output_data_dir
for sample in ls_samples:
    filename = os.path.join(output_data_dir, "_".join([sample, step_suffix]) + ".csv")
    # Rows belonging to this sample; the row index is written to the file as well
    is_sample = df['Sample_ID'] == sample
    df_save = df.loc[is_sample,:]
    df_save.to_csv(filename, index = True)
