## KMeans Notebook

By: Megan Grout (groutm2020@alumni.ohsu.edu)

Adapted from code written by Dr. Marilyne Labrie and Nick Kendsersky


Last updated: 20200527

Import external libraries.

In [None]:
import os
import random
import re
import subprocess
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import matplotlib.colors as mplc



from scipy import signal

import plotly.figure_factory as ff
import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 
import plotly.express as px
init_notebook_mode(connected = True)

Import functions written for this project.

In [None]:
from cycif_modules import *

Define function to change header names. Not encapsulated in `cycif_modules`, so that the user can change it on the fly as necessary.

In [None]:
# This may change for each experiment, so I have not sequestered
# this code in the cycif_modules.py file

def apply_header_changes(df):
    """Clean up the column names of a dataframe and return the dataframe.

    The column names are changed in place on ``df`` (the same object is
    returned for convenience). The following changes are applied, in order:

    1. a lowercase ``x`` at the very beginning of a name is removed,
    2. a single space at the very beginning of a name is removed,
    3. every remaining space is replaced with an underscore.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe whose column names should be cleaned.

    Returns
    -------
    pandas.DataFrame
        The same dataframe object, with updated column names.
    """
    # The first two patterns are anchored regular expressions, so regex
    # matching must be requested explicitly: since pandas 2.0 the default
    # for str.replace is a literal (non-regex) match, under which "^x"
    # would never match anything.
    # remove lowercase x at beginning of name
    df.columns = df.columns.str.replace(r"^x", "", regex=True)
    # remove space at beginning of name
    df.columns = df.columns.str.replace(r"^ ", "", regex=True)
    # replace space with underscore (plain literal replacement)
    df.columns = df.columns.str.replace(" ", "_", regex=False)
    # fix typos
    #df.columns = df.columns.str.replace("typo","correct_name", regex=False)
    return df

## Begin Workflow

### Get directories

In [None]:
# Base directory for project
base_dir = ''


# Set name of project
# for use in directory creation
project_name = ''

# Set string for current step, and for previous step
# for use in file and directory naming
step_suffix = 'kmeans'
previous_step_suffix_long = "_zscore"

# Resolve the base directory to an absolute path. The working directory is
# changed at the end of this cell; if base_dir were relative, every path
# derived from it (metadata, output, images) would afterwards be resolved
# against the input directory instead of the project directory.
# An empty base_dir resolves to the current working directory.
base_dir = os.path.abspath(base_dir)

# Initial input data directory
input_data_dir = os.path.join(base_dir, project_name + previous_step_suffix_long)


# KMeans directory
output_data_dir = os.path.join(base_dir, project_name + "_" + step_suffix)

# KMeans images subdirectory
output_images_dir = os.path.join(output_data_dir,"images")

# Metadata directories
metadata_dir = os.path.join(base_dir, project_name + "_metadata")
metadata_images_dir = os.path.join(metadata_dir,"images")

# Create necessary directories for this step, if they don't already exist.
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
for d in [base_dir, input_data_dir, output_data_dir, output_images_dir,
          metadata_dir, metadata_images_dir]:
    os.makedirs(d, exist_ok = True)

# Change directory to location of input files
os.chdir(input_data_dir)

Create list of samples for use in this step of workflow. Do not include file extensions or step labels.

In [None]:
## Comment for final workflow

# Names of the samples to analyse. Each entry is combined with
# previous_step_suffix_long and ".csv" further down to build the name of
# the input file for that sample, so list bare sample names only.
ls_samples = []

## Import all metadata we need from the QC/EDA chapter

### metadata

In [None]:
# Full path to the marker metadata file inside the metadata directory
filename = os.path.join(metadata_dir, "marker_intensity_metadata.csv")

# Only warn here; the following cell attempts to read the file
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)

In [None]:
# Open, read in information
metadata = pd.read_csv(filename)

# Verify size (+1 because the header line is not counted in the df length).
# The verification is best-effort: it did not work on every machine during
# testing, so a failure is reported but does not stop the workflow.
try:
    verify_line_no(filename, metadata.shape[0] + 1)
    print("Ran file length verification.")
except Exception as err:
    print("Skipped file length verification: " + str(err))


# Verify headers
exp_cols = ['Round','Target','Channel','target_lower','full_column','marker','location']
compare_headers(exp_cols, metadata.columns.values, "Marker metadata file")

In [None]:
# Show the first rows of the marker metadata dataframe - FYI
metadata.head()

### not_intensities

In [None]:
# Full path to the list of non-intensity column names
filename = os.path.join(metadata_dir, "not_intensities.csv")

# Only warn here; the following cell attempts to read the file
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)

In [None]:
# Open, read in information:
# take the file content as one str, strip surrounding whitespace and
# split on the new line character, giving one list entry per line
with open(filename, 'r') as fh:
    not_intensities = fh.read().strip().split("\n")

# Verify size. not_intensities is a plain list, so its length comes from
# len() (a list has no .shape attribute). There is no header line in this
# file, so nothing is added to the expected line count.
# The verification is best-effort: it did not work on every machine during
# testing, so a failure is reported but does not stop the workflow.
try:
    verify_line_no(filename, len(not_intensities))
    print("Ran file length verification.")
except Exception as err:
    print("Skipped file length verification: " + str(err))

# Print to console
print("not_intensities = ")
print(not_intensities)

### full_to_short_column names

In [None]:
# Full path to the full-name -> short-name mapping file
filename = os.path.join(metadata_dir, "full_to_short_column_names.csv")

# Only warn here; the following cell attempts to read the file
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)

In [None]:
# Open, read in information
df = pd.read_csv(filename, header = 0)

# Verify size (+1 because the header line is not counted in the df length).
# The verification is best-effort: it did not work on every machine during
# testing, so a failure is reported but does not stop the workflow.
try:
    verify_line_no(filename, df.shape[0] + 1)
    print("Ran file length verification.")
except Exception as err:
    print("Skipped file length verification: " + str(err))

# Turn into dictionary: after transposing, the first record maps every
# 'full_name' value to the entry of the first remaining column
full_to_short_names = df.set_index('full_name').T.to_dict('records')[0]

# Print information
print('full_to_short_names =')
print(full_to_short_names)

### short_to_full_column_names

In [None]:
# Full path to the short-name -> full-name mapping file
filename = os.path.join(metadata_dir, "short_to_full_column_names.csv")

# Only warn here; the following cell attempts to read the file
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)

In [None]:
# Open, read in information
df = pd.read_csv(filename, header = 0)

# Verify size (+1 because the header line is not counted in the df length).
# The verification is best-effort: it did not work on every machine during
# testing, so a failure is reported but does not stop the workflow.
try:
    verify_line_no(filename, df.shape[0] + 1)
    print("Ran file length verification.")
except Exception as err:
    print("Skipped file length verification: " + str(err))

# Turn into dictionary: after transposing, the first record maps every
# 'short_name' value to the entry of the first remaining column
short_to_full_names = df.set_index('short_name').T.to_dict('records')[0]

# Print information
print('short_to_full_names =')
print(short_to_full_names)

### Color information

#### Samples

In [None]:
# Full path to the sample color mapping file
filename = os.path.join(metadata_dir, "sample_color_data.csv")

# Only warn here; the following cell attempts to read the file
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)

In [None]:
# Open, read in information
df = pd.read_csv(filename, header = 0)
df = df.drop(columns = ['hex'])

# our tuple of float values for rgb, (r, g, b) was read in
# as a string '(r, g, b)'. We need to extract the r-, g-, and b-
# substrings and convert them back into floats
df['rgb'] = df['rgb'].apply(rgb_tuple_from_str)

# Verify size (+1 because the header line is not counted in the df length).
# The verification is best-effort: it did not work on every machine during
# testing, so a failure is reported but does not stop the workflow.
try:
    verify_line_no(filename, df.shape[0] + 1)
    print("Ran file length verification.")
except Exception as err:
    print("Skipped file length verification: " + str(err))

# Turn into dictionary. The orient must be spelled out as 'records':
# 'rgb' is not a valid orient and was only accepted by older pandas
# versions because any string starting with 'r' was read as 'records'.
sample_color_dict = df.set_index('Sample_ID').T.to_dict('records')[0]

# Print information
print('sample_color_dict =')
print(sample_color_dict)


#### Channels

In [None]:
# Full path to the channel color mapping file
filename = os.path.join(metadata_dir, "channel_color_data.csv")

# Only warn here; the following cell attempts to read the file
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)

In [None]:
# Open, read in information
df = pd.read_csv(filename, header = 0)
df = df.drop(columns = ['hex'])

# our tuple of float values for rgb, (r, g, b) was read in
# as a string '(r, g, b)'. We need to extract the r-, g-, and b-
# substrings and convert them back into floats
df['rgb'] = df['rgb'].apply(rgb_tuple_from_str)

# Verify size (+1 because the header line is not counted in the df length).
# The verification is best-effort: it did not work on every machine during
# testing, so a failure is reported but does not stop the workflow.
try:
    verify_line_no(filename, df.shape[0] + 1)
    print("Ran file length verification.")
except Exception as err:
    print("Skipped file length verification: " + str(err))

# Turn into dictionary. The orient must be spelled out as 'records':
# 'rgb' is not a valid orient and was only accepted by older pandas
# versions because any string starting with 'r' was read as 'records'.
channel_color_dict = df.set_index('Channel').T.to_dict('records')[0]

# Print information
print('channel_color_dict =')
print(channel_color_dict)


#### Round

In [None]:
# Full path to the round color mapping file
filename = os.path.join(metadata_dir, "round_color_data.csv")

# Only warn here; the following cell attempts to read the file
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)

In [None]:
# Open, read in information
df = pd.read_csv(filename, header = 0)
df = df.drop(columns = ['hex'])

# our tuple of float values for rgb, (r, g, b) was read in
# as a string '(r, g, b)'. We need to extract the r-, g-, and b-
# substrings and convert them back into floats
df['rgb'] = df['rgb'].apply(rgb_tuple_from_str)

# Verify size (+1 because the header line is not counted in the df length).
# The verification is best-effort: it did not work on every machine during
# testing, so a failure is reported but does not stop the workflow.
try:
    verify_line_no(filename, df.shape[0] + 1)
    print("Ran file length verification.")
except Exception as err:
    print("Skipped file length verification: " + str(err))

# Turn into dictionary. The orient must be spelled out as 'records':
# 'rgb' is not a valid orient and was only accepted by older pandas
# versions because any string starting with 'r' was read as 'records'.
round_color_dict = df.set_index('Round').T.to_dict('records')[0]

# Print information
print('round_color_dict =')
print(round_color_dict)

#### Cell Type

In [None]:
# Full path to the cell type color mapping file
filename = os.path.join(metadata_dir, "celltype_color_data.csv")

# Only warn here; the following cell attempts to read the file
if not os.path.exists(filename):
    print("WARNING: Could not find desired file: " + filename)

In [None]:
# Open, read in information
df = pd.read_csv(filename, header = 0)
df = df.drop(columns = ['hex'])

# our tuple of float values for rgb, (r, g, b) was read in
# as a string '(r, g, b)'. We need to extract the r-, g-, and b-
# substrings and convert them back into floats
df['rgb'] = df['rgb'].apply(rgb_tuple_from_str)

# Verify size (+1 because the header line is not counted in the df length).
# The verification is best-effort: it did not work on every machine during
# testing, so a failure is reported but does not stop the workflow.
try:
    verify_line_no(filename, df.shape[0] + 1)
    print("Ran file length verification.")
except Exception as err:
    print("Skipped file length verification: " + str(err))

# Turn into dictionary. The orient must be spelled out as 'records':
# 'rgb' is not a valid orient and was only accepted by older pandas
# versions because any string starting with 'r' was read as 'records'.
celltype_color_dict = df.set_index('cell_type').T.to_dict('records')[0]

# Print information
print('celltype_color_dict =')
print(celltype_color_dict)

## Import data

Prompt user for any files they would like excluded from analysis.

In [None]:
# Use the file of the first sample (index 0 of ls_samples) to establish
# which headers every sample file is expected to have.

# No directory is needed in the name: the current working directory was
# set to the input data directory earlier on.
filename = "".join([ls_samples[0], previous_step_suffix_long, ".csv"])

# A single data row is enough, since only the header is of interest;
# the headers are corrected with the function defined above
df = apply_header_changes(pd.read_csv(filename, index_col = 0, nrows = 1))

# Corrected headers that serve as the reference for all files
expected_headers = df.columns.values

print("df index name is currently",df.index.name)

In [None]:
# Display the single row read from the first sample file
df.head()

In [None]:
# Report which file supplied the reference headers, then list them
print("Used " + ls_samples[0] + previous_step_suffix_long
      + ".csv to determine the expected, corrected headers for all files.")
print("There headers are: \n" + ", ".join(expected_headers) + ".")

In [None]:
# Set dictionary to hold all individual sample data
dfs = {}

# Iterate over a snapshot of the sample list. Samples that cannot be used
# are removed from ls_samples inside the loop, and removing items from a
# list while iterating over that same list makes the loop silently skip
# the item that follows each removed one.
for sample in list(ls_samples):
    sample_file = sample + previous_step_suffix_long + ".csv"

    # Check for existence of file
    if not os.path.exists(sample_file):
        print("File " + sample_file +
              " does not exist. Removing from analysis...")
        # Remove from list if not found, so further steps won't be
        # looking for data on this sample
        ls_samples.remove(sample)
        continue

    # open the file
    # set the index to be the first (0-based indexing, so 0th)
    # column in input file.
    df = pd.read_csv(sample_file, index_col = 0) #,  nrows = 500)
    # use nrows to specify the number of rows you want

    # Check for empty df
    # if so, don't continue trying to process df
    if df.shape[0] == 0:
        print('Zero content lines detected in ' + sample + ' file. '
              'Removing from analysis...')
        # list.remove() works in place and returns nothing, so the
        # list is not re-assigned here
        ls_samples.remove(sample)
        continue

    # Verify that the loaded df is the right length, adding 1 because
    # the header was detected during file import and is not counted
    # towards the length of df. The check is run against the file that
    # was actually read. It is best-effort: it did not work on every
    # machine during testing, so a failure is reported but not fatal.
    try:
        verify_line_no(sample_file, df.shape[0] + 1)
    except Exception as err:
        print("Skipped file length verification for " + sample_file +
              ": " + str(err))

    # Manipulations necessary for concatenation
    df = apply_header_changes(df)
    # sort the columns alphabetically
    df = df[sorted(df.columns.values)]

    # Compare headers of new df against what is expected
    compare_headers(expected_headers, df.columns.values, sample)

    # For cases where we have samples called TMA1.1, TMA1.2, TMA1.3, etc.
    # Using regular expressions (regex) to extract the characters in the
    # sample name from TMA to the following digits, stopping at the period
    #if 'ROI_index' in df.columns.values:
    #    df['ROI_slide'] = re.findall(r'(TMA\d+)',sample)[0]

    # Add to dictionary of dfs
    dfs[sample] = df


# Nothing to merge means nothing to cluster; fail with a clear message
if not dfs:
    raise ValueError("No sample data could be loaded; check ls_samples "
                     "and the files in the input data directory.")

# Merge dfs into one big df
df = pd.concat(dfs.values(), ignore_index=False , sort = False)
# remove dfs from memory, since it's big (relatively) and we
# don't need a data structure of all samples' data separated
# individually when we can extract information from the big
# df using the Sample_ID column
del dfs

Let's take a look at a few features to make sure our dataframe is as expected

In [None]:
# (number of rows, number of columns) of the merged dataframe
df.shape

In [None]:
# Index of the merged dataframe (kept from the individual files, since
# the concatenation above used ignore_index=False)
df.index 

Check for NaN entries (should not be any unless columns do not align), which can result from stitching together dfs with different values in their headers.

In [None]:
# Flag, per column, whether it contains at least one null value
columns_with_nulls = df.isnull().any()

# Only when some column is affected, print the names of those columns.
# No output therefore means there are no NaN entries.
if columns_with_nulls.any():
    print(df.columns[columns_with_nulls])

Check that all expected files were imported into final dataframe by comparing our sample names to the unique values in the Sample_ID column.

In [None]:
# Check that all expected files were imported into final dataframe

observed_sample_ids = df.Sample_ID.unique()

if sorted(observed_sample_ids) != sorted(ls_samples):
    # NOTE(review): the expected list passed here is the placeholder
    # ['no samples'] rather than ls_samples - confirm this is intended.
    compare_headers(['no samples'], observed_sample_ids, "big df Sample_ID column")
else:
    print("All expected filenames present in big df Sample_ID column.")

# K-Means Clustering

First, we will determine the best number of clusters, K, to use in our KMeans clustering. We may decide to operate on a random subset of our data (with Sample_ID proportions the same), in order to save time. Then, once the 'best' K is determined using the elbow method and chosen algorithm, we will perform KMeans clustering, using that K, on the entire dataset.

## Elbow method

Check out the following references for more information and methods on finding the elbow/knee in our KMeans clustering:

- https://github.com/arvkevi/kneed
- https://raghavan.usc.edu//papers/kneedle-simplex11.pdf
- https://github.com/arvkevi/kneed/blob/master/notebooks/decreasing_function_walkthrough.ipynb
- https://www.scikit-yb.org/en/latest/api/cluster/elbow.html#elbow-method
- https://www.geeksforgeeks.org/elbow-method-for-optimal-value-of-k-in-kmeans/

## Automated

### Find elbow and plot all at once

This resource is a library that will take your data, find the 'best' K, and supply your data using this best 'K'. It can also be used to plot the performance of different K values. Since--as far as I can tell--it does not allow the user to tweak enough parameters in the K determination or the plotting, I have left it in 'raw' format in the code so that it will not run. However, since it makes the process so automated, it has the potential to be very useful for the user, and so I am leaving it in here.

But this doesn't allow us to modify/stylize the plot the way we want, and doesn't lend itself well to a dashboard. We also cannot determine the metric passed to `cdist()` to calculate distortion (Euclidean, sqeuclidean, etc.), just that the metric for the visualization *overall* is distortion. It looks like the value for every *K* is the `inertia_` object from our `KMeans()` model...

## Manual

Use the above references to calculate the inertia and distortion scores manually. Here, we will use the `k_scores_` values from our `viz` object (created below) to create a custom plot that matches the style of the rest of the workflow and can be edited as seen fit. The `k_scores_` values are essentially equivalent to the "manual" inertia calculations, despite the fact the metric used will be `"distortion"`. But we can still use this to find our inflection point.

In [None]:
# Number of rows requested for the subset created in the next cell
subset_row_count = 10000

In [None]:
# Draw the subset of df used for determining K, grouped on Sample_ID.
# NOTE(review): create_subset is not defined in this notebook; presumably
# 'original' keeps the per-sample proportions of df - confirm.
subset_df = create_subset(df, 'Sample_ID', subset_row_count, 'original')

How many lines for each sample ID are in our subset df?

In [None]:
# Number of subset rows per Sample_ID, ordered by Sample_ID
subset_df['Sample_ID'].value_counts().sort_index()

How do the proportions of cells in the original and subset dfs compare?

In [None]:
# Fraction of all rows in the full df contributed by each Sample_ID
df['Sample_ID'].value_counts().sort_index()/df.shape[0]

#### Create model and calculate inertias

In [None]:
# KMeans is needed here already, before the clustering cell further down
# that also imports it
from sklearn.cluster import KMeans

# Candidate cluster counts to evaluate. The visualizer receives them as a
# (start, stop) tuple, which covers the same values as this range, and the
# plotting cells below use K for their x axis.
K = range(1, 20)

# Fixed random_state (same value as the final clustering below) so that
# the scores, and therefore the detected elbow, are reproducible
model = KMeans(random_state = 20)
# NOTE(review): KElbowVisualizer is not imported in this notebook;
# presumably it is provided through cycif_modules - confirm.
viz = KElbowVisualizer(model, k=(K.start, K.stop), timings = False,
                             metric = 'distortion')

# Fit the data to the visualizer, using only the marker intensity columns,
# i.e. every column that is NOT listed in not_intensities (the same
# selection that is used for the final clustering)
viz.fit(subset_df.loc[:,~subset_df.columns.isin(not_intensities)])
inertias = viz.k_scores_

### Plot inertia for each K

In [None]:
title = 'K-Means cluster count determination'

# One point per evaluated K, joined by lines
inertia_trace = go.Scatter(
    x=list(K), y=inertias,
    mode='lines+markers',
    marker=dict(color='LightSkyBlue', size=15,
                line=dict(color='MediumPurple', width=2)))

# Establish figure and add the data to it
fig = go.Figure()
fig.add_trace(inertia_trace)

# General aesthetics
fig.update_layout(title = title, plot_bgcolor = 'white')

# Axes: the y axis starts at zero and leaves 10% headroom above the
# largest score
fig.update_xaxes(title_text = 'Number of clusters', linecolor = 'black')
fig.update_yaxes(title_text = 'Inertia', linecolor = 'black',
                 range = [0, max(inertias)+0.1*max(inertias)])

# Figure output: save as png in the images directory of this step
#plot(fig)
filename = os.path.join(output_images_dir, title.replace(" ","_") + ".png")
fig.write_image(filename)

### Find elbow in above

In [None]:
from kneed import KneeLocator

In [None]:
# Locate the elbow of the decreasing, convex score curve
best_kn = KneeLocator(
    x=list(K),
    y=inertias,
    S=1.0,
    curve='convex',
    direction='decreasing',
)

What was determined to be the best 'knee' using the KneeLocator() function?

In [None]:
# K value at which the elbow was located
best_kn.knee

Plot the same figure as above, but include a vertical line at the `best_kn.knee` K.

In [None]:
# Establish figure
fig = go.Figure()
title = 'K-Means cluster count determination with elbow'

# Upper end of the y axis: 10% headroom above the largest score.
# Also used as the height of the elbow line, so the line cannot distort
# the plot by extending beyond the axis.
y_upper = max(inertias)+0.1*max(inertias)

# Add data to figure
fig.add_trace(
    go.Scatter(
        # Plot score by K value
        x=list(K), y=inertias,
        # We want points and connecting lines
        mode='lines+markers',
        # Marker/line parameters
        marker=dict(
            color='LightSkyBlue', size=15,
            line=dict(
                color='MediumPurple',width=2
                        ))))

# Add in vertical line at the elbow. The knee is None when no elbow
# could be located, in which case there is nothing to mark.
if best_kn.knee is not None:
    fig.add_shape(
        go.layout.Shape(
            type="line",
            xref = "x",
            yref = "y",
            x0=best_kn.knee,
            y0=0,
            x1=best_kn.knee,
            y1=y_upper,
            # Line aesthetics
            line=dict(
                color="black", width=2, dash = 'dot'
            ),
    ))

# Update figure aesthetics
fig.update_layout(title = title, plot_bgcolor = 'white')

# Update axes
fig.update_xaxes(title_text = 'Number of clusters', linecolor = 'black')
fig.update_yaxes(title_text = 'Inertia', linecolor = 'black',
                 range = [0, y_upper])

# Plot output, building the path the same way as the other figures
#plot(fig)
filename = title.replace(" ","_") + ".png"
filename = os.path.join(output_images_dir, filename)
fig.write_image(filename)

### Perform KMean clustering

Here, we are using the K value determined by the `KneeLocator`. We are going to operate on the full `df`, not a subset, since we want to establish cluster IDs for each cell.

In [None]:
# Start from the K found by the KneeLocator
n_clusters = best_kn.knee
# To override the detected value, set it manually on the next line
#n_clusters = 

# The knee is None when no elbow could be located; stop here with a clear
# message instead of failing later inside the clustering
if n_clusters is None:
    raise ValueError("No elbow was found by KneeLocator; "
                     "set n_clusters manually in this cell.")

# Plain int, since the value is used as a cluster count, as a range
# bound and in file names further down
n_clusters = int(n_clusters)
n_clusters

In [None]:
# KMeans clustering on the marker intensities of the full df

from sklearn.cluster import KMeans

# Boolean selector for the columns that are NOT listed in not_intensities
is_intensity_column = ~df.columns.isin(not_intensities)

# The number of clusters is the n_clusters value chosen above; the fixed
# random_state makes the result repeatable
cluster = KMeans(random_state = 20, n_clusters = n_clusters)

# Fit the model; the cluster labels are read from the fitted model later
cluster.fit_predict(df.loc[:, is_intensity_column])

Update `df` to include cluster information for each cell.

In [None]:
# Create a new column with the cluster number of each row.
# The labels of the fitted model are shifted up by one so that the
# cluster numbering does not start at 0.
df['cluster'] = cluster.labels_ + 1

Save the full dataset with the clustering column.

In [None]:
# Save into the output directory of this step. A bare file name would be
# written to the current working directory, which was set to the INPUT
# data directory at the start of the notebook.
df.to_csv(os.path.join(output_data_dir, "Kmeans_full_df.csv"))

## Visualizations

#### Create color dictionary for clusters

In theory, we want to choose colors that are categorical, since Cluster is actually a non-ordered category. However, since we could conceivably be working with > 10-11 clusters, we do not want a color palette that will just cycle back through the same limited colors, so we are going to take this continuous color palette and get the number of unique colors we need.

In [None]:
# Cluster ids present in the data, in ascending order
cluster_ids = sorted(df.cluster.unique())

# One "hls" palette color per cluster id
cluster_color_values = sb.color_palette("hls", n_colors = len(cluster_ids))

print(cluster_ids)
# Display those unique colors
sb.palplot(sb.color_palette(cluster_color_values))


Store in a dictionary

In [None]:
# Pair every cluster id (ascending) with the palette color at the
# same position
cluster_color_dict = {
    cluster_id: color
    for cluster_id, color in zip(sorted(df.cluster.unique()),
                                 cluster_color_values)
}
cluster_color_dict

In [None]:
## Here is an example of how you might specify the values yourself,
# derived from the QC/EDA chapter. Note that in this case, our keys
# are not strings (e.g., '1'), but are actually ints (integers, e.g., 1).

#cluster_color_dict[1] = mplc.to_rgb('xkcd:dark sky blue')
#cluster_color_dict[2] = mplc.to_rgb('xkcd:reddish orange')
#cluster_color_dict[3] = mplc.to_rgb('xkcd:jungle green')


# Show the color of every cluster, in ascending cluster order. The keys
# are taken from the dictionary itself so that this works for any number
# of clusters, not only for exactly five.
sb.palplot(sb.color_palette(
    [cluster_color_dict[key] for key in sorted(cluster_color_dict)]))


#### Save color information (mapping and legend) to metadata directory

In [None]:
# Turn the cluster -> color mapping into a dataframe
cluster_color_df = color_dict_to_df(cluster_color_dict, "cluster")

# Write it, without the index, to the metadata directory
filename = os.path.join(metadata_dir, "cluster_color_data.csv")
cluster_color_df.to_csv(filename, index = False)

In [None]:
# Legend of cluster info only

# Tiny figure with hidden axes; only the legend is of interest
g  = plt.figure(figsize = (1,1)).add_subplot(111)
g.axis('off')

# One zero-sized bar per cluster, drawn only to obtain a legend handle
# with the right color and label
handles = []
for item in sorted(cluster_color_dict.keys()):
    h = g.bar(0,0, color = cluster_color_dict[item],
              label = item, linewidth =0)
    handles.append(h)

# No trailing comma here, so first_legend is the legend object itself
# rather than a 1-tuple wrapping it
first_legend = plt.legend(handles=handles, loc='upper right', title = 'Cluster')


filename = "Clustertype_legend.png"
filename = os.path.join(metadata_images_dir, filename)
plt.savefig(filename, bbox_inches = 'tight')

### Heatmap

Here, I assume we have clustering data for the full df loaded in for this chapter, not just a subset of ~10k cells, regardless of how many rows of our df were used to determine the best K. If we *are* working with a subsetted df at this point, then the below method to get a subset appropriate for the heatmap will still work fine.

We will only be plotting ~10k cells in the interest of time/computing resources. We want these 10k lines in our original df to be sampled randomly, without replacement, with the caveat that the proportions of all samples in the data remains the same in this subset. If the size of the dataframe is < 10k rows, then we will proceed with the entire dataset.

In [None]:
# Number of rows requested for the subset that is plotted in the heatmap
subset_row_count = 10000

In [None]:
# Draw the subset of df used for the heatmap, grouped on Sample_ID.
# NOTE(review): this call passes 'equal' whereas the subset for the elbow
# method passed 'original'; the accompanying text describes keeping the
# sample proportions - confirm that 'equal' is intended here.
subset_df = create_subset(df, 'Sample_ID', subset_row_count, 'equal')

How many lines for each sample ID are in our subset df?

In [None]:
# Number of subset rows per Sample_ID, ordered by Sample_ID
subset_df['Sample_ID'].value_counts().sort_index()

How do the proportions of cells in the original and subset dfs compare?

In [None]:
# Fraction of all rows in the full df contributed by each Sample_ID
df['Sample_ID'].value_counts().sort_index()/df.shape[0]

*Unlike other heatmaps in the workflow, this one will not use row clustering. We want to arrange the rows first by cluster number and then by Sample_ID.*

In [None]:
# Order the rows by cluster number first and by Sample_ID second; the
# heatmap below is drawn without row clustering, so this is the row order
subset_df = subset_df.sort_values(by = ['cluster','Sample_ID'])

### Get data structures to map colors to columns and rows...

## Row colors

For the row colors, we essentially just need to map the information in a given feature to the colors that correspond to that value in the right color dictionary. For example, it might be sample_3, sample_3, sample_4, so we need the row colors to be (1, 1, 1), (1, 1, 1), (0, 0.25, 0.6). These are the initial colors--if we are clustering rows or columns, the labels will still match the data with which they're associated.

In [None]:
# Color of every subset row according to its Sample_ID
sample_row_colors = subset_df.Sample_ID.map(sample_color_dict)

# Peek at a few entries
sample_row_colors[1:5]

In [None]:
# Color of every subset row according to its cell_type
row_celltype_colors = subset_df.cell_type.map(celltype_color_dict)

# Peek at a few entries
row_celltype_colors[1:5]

In [None]:
# Color of every subset row according to its cluster number
row_cluster_colors = subset_df.cluster.map(cluster_color_dict)

# Peek at a few entries
row_cluster_colors[1:5]

## Column colors

For column colors, matching up the information in each column with the appropriate color is more difficult. 

In [None]:
# Translate every marker (intensity) column into its channel and then into
# the color belonging to that channel.

# Names of the columns that are NOT listed in not_intensities
intensity_column_names = subset_df.loc[
    :, ~subset_df.columns.isin(not_intensities)].columns.values

# Left-merge those names (a one-column dataframe whose only column is
# called 0) with the metadata on its 'full_column' (aka df header name)
# column. A left merge keeps every column name, including names without
# a match in the metadata.
names_with_metadata = pd.merge(
    pd.DataFrame(pd.Series(intensity_column_names)),
    metadata,
    how = 'left', left_on = 0, right_on = 'full_column')

# Channel of every column -> color of that channel
column_channel_colors = names_with_metadata['Channel'].map(channel_color_dict)

# Label each color with the name of the column it belongs to
column_channel_colors.index = intensity_column_names

column_channel_colors.head()

In [None]:
# Translate every marker (intensity) column into its round and then into
# the color belonging to that round.

# Names of the columns that are NOT listed in not_intensities
intensity_column_names = subset_df.loc[
    :, ~subset_df.columns.isin(not_intensities)].columns.values

# Left-merge those names (a one-column dataframe whose only column is
# called 0) with the metadata on its 'full_column' (aka df header name)
# column. A left merge keeps every column name, including names without
# a match in the metadata.
names_with_metadata = pd.merge(
    pd.DataFrame(pd.Series(intensity_column_names)),
    metadata,
    how = 'left', left_on = 0, right_on = 'full_column')

# Round of every column -> color of that round
column_round_colors = names_with_metadata['Round'].map(round_color_dict)

# Label each color with the name of the column it belongs to
column_round_colors.index = intensity_column_names

column_round_colors.head()

### Annotations data structure

In [None]:
# Everything needed for the row/column annotations of the heatmap.
# Every annotation is a dictionary holding all the data that belongs to
# it; the order inside each list is the order in which the annotations
# were meant to appear on the figure.

# Row annotations: Sample, then Cell type, then Cluster
row_annotations = [
    {'label':'Sample', 'type':'row',
     'mapping':sample_row_colors, 'dict':sample_color_dict,
     'location':'center left', 'bbox_to_anchor':(0, 0.5)},
    {'label':'Cell type', 'type':'row',
     'mapping':row_celltype_colors, 'dict':celltype_color_dict,
     'location':'lower left', 'bbox_to_anchor':(0, 0.65)},
    {'label':'Cluster', 'type':'row',
     'mapping':row_cluster_colors, 'dict':cluster_color_dict,
     'location':'lower left', 'bbox_to_anchor':(0, 0.20)},
]

# Column annotations: Round, then channel
# NOTE(review): the channel colors are labelled 'Column' - confirm that
# 'Channel' was not intended.
col_annotations = [
    {'label':'Round', 'type':'column',
     'mapping':column_round_colors, 'dict':round_color_dict,
     'location':'upper right', 'bbox_to_anchor':(1,0.50)},
    {'label':'Column', 'type':'column',
     'mapping':column_channel_colors, 'dict':channel_color_dict,
     'location':'upper right', 'bbox_to_anchor':(1,0.75)},
]

## IMPORTANT - if you use 'annotations', it MUST have both 'rows' and 'cols'
## objects inside. These can be empty lists, but they must be there!
anns = {'rows': row_annotations, 'cols': col_annotations}

#### Actually plot the heatmap

In [None]:
# Data to plot: only the columns that are NOT listed in not_intensities
heatmap_data = subset_df.loc[:,~subset_df.columns.isin(not_intensities)]

# x tick labels: the short nickname of every plotted column, in the same
# order as the columns of the data
short_column_labels = [full_to_short_names[name]
                       for name in heatmap_data.columns.values]

heatmap_function(
    data = heatmap_data,
    title = "KMeans heatmap",
    # method, metric, and color map
    method = 'ward', metric = 'correlation', cmap = 'coolwarm',
    # colorbar (legend coloring of main plot)
    cbar_kws = {'label':'Correlation'},
    xticklabels = short_column_labels,
    # Location where we want to save the output image
    save_loc = output_images_dir,
    # Rows keep their sorted order; only the columns are clustered
    row_cluster = False, col_cluster = True,
    # annotations established above
    annotations = anns
)

### XY Maps

In [None]:
# Plot one XY map for each sample, where each cluster is a different color

x_feature = 'Nuc_X'
y_feature = 'Nuc_Y_Inv'

# iterate through all samples
for sample in ls_samples:
    # Extract x/y coordinates and cluster ID for all cells of this sample
    location_colors = df.loc[df['Sample_ID'] == sample,[x_feature,y_feature,'cluster']]

    # Establish figure
    fig = go.Figure()
    title = sample

    # Iterate through all unique cluster values associated with this sample
    # We do this because not every cluster may be present in every sample.
    # The loop variable is deliberately not called 'cluster', since that
    # name holds the fitted KMeans model and would be overwritten.
    for cluster_id in sorted(location_colors['cluster'].unique()):
        # Rows of this sample that belong to the current cluster
        in_cluster = location_colors['cluster'] == cluster_id

        # Plot cells for a particular cluster ID
        fig.add_scatter(
            # We only want points, not points and lines, or just lines
            mode = 'markers',
            # Marker aesthetics
            marker=dict(
                size=5, opacity=0.4, # size is dot size, lower opacity = more transparent
                color='rgb' + str(cluster_color_dict[cluster_id])#,
                #line = dict(width = 2, color = 'gray') # line around each marker
                ),
            # X/Y data
            x = location_colors.loc[in_cluster,x_feature],
            y = location_colors.loc[in_cluster,y_feature],
            name = "Cluster " + str(cluster_id))

    # Update general plot aesthetics
    fig.update_layout(title = title, plot_bgcolor = 'white', showlegend = True,
                     legend= {'itemsizing': 'constant'}) # make the legend dots a bit bigger

    # Update axes
    fig.update_xaxes(title_text = x_feature, linecolor = 'black')
    fig.update_yaxes(title_text = y_feature, linecolor = 'black')

    # Plot output
    #plot(fig)
    filename = sample + " KMeans XY Map with " + str(n_clusters) + " clusters.png"
    filename = filename.replace(" ", "_")
    filename = os.path.join(output_images_dir, filename)
    fig.write_image(filename)



### Bar plots

Bar plots will be counts of cells in each cluster. First, we need to create a subset of cluster info to work with. This will make things a little easier moving forward.

In [None]:
# One row per cluster: its number, a short and a full display name, and
# the number of cells assigned to it.
# Cluster IDs run from 1 to n_clusters (hence range(1, n_clusters+1)),
# because all cluster IDs were shifted up by one to avoid a cluster 0.
# All rows are collected first and the dataframe is built in one go;
# DataFrame.append is no longer available in current pandas versions.
cluster_counts = pd.DataFrame(
    [{'clusternum': c,
      'clustername': 'Cl ' + str(c),
      'clustername_full': 'Cluster ' + str(c),
      'count': df.loc[df['cluster'] == c,:].shape[0]}
     for c in range(1,n_clusters+1)],
    columns = ['clusternum','clustername','clustername_full','count'])

# The index is the 0-based position of each cluster row
cluster_counts.index = range(cluster_counts.shape[0])

cluster_counts.head()

In [None]:
# By Cluster only

title = 'KMeans cluster cell counts'

# Establish figure
fig = go.Figure()

# One bar per cluster, each drawn as its own trace so that it can carry
# the color of its cluster
for c in sorted(df.cluster.unique()):
    # Row(s) of the count table that describe this cluster
    rows_for_cluster = cluster_counts.loc[cluster_counts['clusternum']==c]
    bar_color = 'rgb' + str(cluster_color_dict[c])

    fig.add_trace(go.Bar(
        x = rows_for_cluster['clustername'],
        y = rows_for_cluster['count'],
        # Show the count above the bar
        text = rows_for_cluster['count'], textposition = 'outside',
        marker = dict(color = bar_color),
        showlegend = False
    ))

# Update figure aesthetics
fig.update_layout(title = title, plot_bgcolor = 'white')

# Update axes
fig.update_xaxes(linecolor = 'black')
fig.update_yaxes(title_text = "Cells", linecolor = 'black')

# Figure output: save as png in the images directory of this step
#plot(fig)
filename = os.path.join(output_images_dir, title.replace(" ","_") + ".png")
fig.write_image(filename)


#### Bar plot - Proportional and count breakdowns of tissue type within each cluster

In [None]:
cluster_counts

Now, we add columns to hold information on how many of each cell type we have.

In [None]:
# Cell type counts
# For every cluster (row of cluster_counts), count the cells in df that belong
# to that cluster and have the given cell type. Each count is stored in a new
# column named after the cell type in lowercase.
for count_col, cell_type in [('stroma', 'STROMA'),
                             ('immune', 'IMMUNE'),
                             ('cancer', 'CANCER')]:
    is_cell_type = df['cell_type'] == cell_type
    cluster_counts[count_col] = cluster_counts.apply(
        lambda row, is_cell_type = is_cell_type:
            df.loc[(df['cluster'] == row['clusternum']) & is_cell_type, :].shape[0],
        axis = 1)

In [None]:
# Cell type proportions
# For each cell type, express its count within a cluster as a percentage of
# that cluster's total cell count, rounded to one decimal place. The lambda
# operates on one row (cluster) at a time.
for cell_type in ['stroma', 'immune', 'cancer']:
    cluster_counts[cell_type + '_perc'] = cluster_counts.apply(
        lambda row, col = cell_type: round(row[col]/row['count']*100,1),
        axis = 1)

In [None]:
# What does our dataframe look like now?
cluster_counts

In [None]:
# Cell type within cluster - count

title = 'KMeans cell types count within clusters'

# Each cell type needs three spellings: the legend name, the column name in
# cluster_counts, and the key in celltype_color_dict. Listing them together
# in one small table keeps the three capitalization styles side by side and
# makes it easy to add, remove or reorder cell types.
celltype_traces = [
    # (legend name, cluster_counts column, celltype_color_dict key)
    ('Stroma', 'stroma', 'STROMA'),
    ('Immune', 'immune', 'IMMUNE'),
    ('Cancer', 'cancer', 'CANCER'),
]

# One bar trace per cell type; the traces are stacked per cluster below
fig = go.Figure(data = [
    go.Bar(name = legend_name,
           x = cluster_counts['clustername'],
           y = cluster_counts[column],
           text = cluster_counts[column], textposition = 'auto',
           marker_color = 'rgb' + str(celltype_color_dict[color_key]))
    for legend_name, column, color_key in celltype_traces
])

# Figure and axis aesthetics
fig.update_layout(title = title, plot_bgcolor = 'white',barmode ='stack')
fig.update_xaxes(linecolor = 'black')
fig.update_yaxes(title = "Cell count", linecolor = 'black')

# Plot output
#plot(fig)
filename = os.path.join(output_images_dir, title.replace(" ","_") + ".png")
fig.write_image(filename)

In [None]:
# Cell type within cluster - proportional

title = 'KMeans cell types proportions within clusters'

# Legend name, percentage column in cluster_counts, and key in
# celltype_color_dict for each cell type
celltype_traces = [
    ('Stroma', 'stroma_perc', 'STROMA'),
    ('Immune', 'immune_perc', 'IMMUNE'),
    ('Cancer', 'cancer_perc', 'CANCER'),
]

# One bar trace per cell type; the traces are stacked per cluster below
fig = go.Figure(data = [
    go.Bar(name = legend_name,
           x = cluster_counts['clustername'],
           y = cluster_counts[column],
           text = cluster_counts[column], textposition = 'auto',
           marker_color = 'rgb' + str(celltype_color_dict[color_key]))
    for legend_name, column, color_key in celltype_traces
])

# Figure and axis aesthetics
fig.update_layout(title = title, plot_bgcolor = 'white',barmode ='stack')
fig.update_xaxes(linecolor = 'black')
fig.update_yaxes(title = "Cell proportion of total", linecolor = 'black')

# Plot output
#plot(fig)
filename = os.path.join(output_images_dir, title.replace(" ","_") + ".png")
fig.write_image(filename)

### Distributions


Boxplots distributions
On Kmeans clusters – one plot per marker, each box is a cluster, all samples in there


In [None]:
# Marker distributions by cluster

# Plot the distribution of one marker as box plots, one box per cluster,
# and save the figure as a png in output_images_dir.
#   my_marker: string, the header of the column in df holding the values
#              to plot
#   df:        pandas dataframe containing that column as well as a
#              'cluster' column
# Box colors are looked up in cluster_color_dict by cluster ID.
# Returns nothing.
def make_cluster_boxplot(my_marker, df):
    title = 'KMeans ' + my_marker + ' Distributions by cluster'

    # One box per cluster, in ascending cluster order
    boxes = [
        go.Box(
            # Display name of this box
            name = "Cl "+str(cluster_id),
            # Marker values of the cells in this cluster
            y = df.loc[df['cluster']==cluster_id, my_marker],
            # Color assigned to this cluster
            marker = dict(color = 'rgb'+str(cluster_color_dict[cluster_id])))
        for cluster_id in sorted(df.cluster.unique())
    ]

    fig = go.Figure(data=boxes)

    # Figure and axis aesthetics
    fig.update_layout(title = title, plot_bgcolor = 'white',barmode ='stack', showlegend = False)
    fig.update_xaxes(linecolor = 'black')
    fig.update_yaxes(title_text = 'Z-Scored Intensity', linecolor = 'black')

    # Plot output: the file name is the title with spaces replaced by underscores
    #plot(fig)
    filename = os.path.join(output_images_dir, title.replace(" ","_") + ".png")
    fig.write_image(filename)
    return None

In [None]:
# Perform the plotting

# Temporarily switch to the short column names so that they appear in the
# plot titles and file names
df = df.rename(columns = full_to_short_names)

# Every column that is not listed in not_intensities is treated as a marker.
# To plot only a subset, replace this list, e.g.:
# markers_to_plot = ['marker1', 'marker2', 'marker3']
markers_to_plot = [col for col in df.columns.values if col not in not_intensities]

# One box plot figure per marker
for m in markers_to_plot:
    make_cluster_boxplot(m, df)

# Reinstate original column names
df = df.rename(columns = short_to_full_names)

### Calculate the percentage/proportion of each cluster for each sample

This information will be used in a few subsequent visualizations, as well as saved for future reference. One important thing to note about these dataframes is that Sample_ID is the name of the index, and the values of the index are our sample names. This is how the dfs are generated, and I've chosen to keep them that way because it will make it a bit easier for the median value heatmap later on. Note that the bar plot will gather its x-axis data from the index of a dataframe. Both the index (Sample_ID) and columns (cluster) are going to be named, which might make the dfs look a little unusual when printed to the screen.

First, we create a dataframe of the counts for each sample/cluster combination.

In [None]:
# Count the cells per sample/cluster combination: group by Sample_ID, count
# the occurrences of each cluster, and pivot the clusters into columns.
# Sample/cluster combinations with no cells are filled in with 0 and the
# counts are stored as ints.
cluster_sample_counts = (
    df.groupby('Sample_ID')['cluster']
      .value_counts()
      .unstack(fill_value = 0)
      .astype(int)
)

## If 'Sample_ID' should be a column in its own right instead of the index,
## it could be added with:
#cluster_sample_counts['Sample_ID'] = cluster_sample_counts.index
## in which case only the remaining columns should be cast to int:
#cluster_sample_counts.loc[:,cluster_sample_counts.columns != 'Sample_ID'] = \
#    cluster_sample_counts.loc[:,cluster_sample_counts.columns != 'Sample_ID'].astype(int)

In [None]:
cluster_sample_counts.head()

# Note 'Sample_ID' is name of index, 'cluster' is name of columns.

We are also interested in the proportion values.

In [None]:
## Proportions
# For each sample (row), express every cluster count as a percentage of that
# sample's total cell count, rounded to one decimal place.
cluster_sample_props = cluster_sample_counts.apply(
    lambda counts: round(counts/counts.sum()*100,1), axis = 1)

# If 'Sample_ID' had been made a column in its own right, it would have to
# be excluded from the calculation:
#cluster_sample_props.loc[:,
#    cluster_sample_props.columns != 'Sample_ID'] = \
#        cluster_sample_props.loc[:,cluster_sample_props.columns != 'Sample_ID'].apply(
#            lambda row: round(row/row.sum()*100,1), axis =1)

In [None]:
cluster_sample_props

# Note 'Sample_ID' is name of index, 'cluster' is name of columns.

Save files of these two dataframes.

In [None]:
# We want the header of the output files to have the format 'cluster_#'
# instead of '#'. Both tables have the same cluster columns, so the header is
# built once from cluster_sample_counts.
## If 'Sample_ID' had been made a column in its own right, its name should not
## be altered:
#cluster_header = ["cluster_" + str(c) for c in cluster_sample_counts.columns if c != 'Sample_ID'] +\
#        ['Sample_ID']
cluster_header = ["cluster_" + str(c) for c in cluster_sample_counts.columns]

# Write the counts first, then the proportions. The index (Sample_ID) is
# written as the first column of each file.
for table, basename in [(cluster_sample_counts, "sample_cluster_counts.csv"),
                        (cluster_sample_props, "sample_cluster_counts_perc.csv")]:
    filename = os.path.join(output_data_dir, basename)
    table.to_csv(filename, index = True, header = cluster_header)

### Bar plot - Proportional breakdown of clusters within sample

Now that we have enough information established to create this plot, we will take a detour from the cluster breakdown process to generate this plot. Then, we will continue with the above, as it is necessary for the more complicated visualizations.

In [None]:
# Stacked bar plot: for each sample, the percentage of its cells that fall
# into each cluster

title = 'Proportional breakdown of clusters within sample'

# Get list of clusters, in order.
# Remember that the range is shifted up by 1 (1 to n_clusters, inclusive),
# since the cluster IDs were shifted to avoid having a Cluster 0
clusters = list(range(1, n_clusters + 1))

# Iterate through each cluster and extract its data.
# The values come from cluster_sample_props (per-sample percentages) rather
# than from cluster_sample_counts (raw cell counts), so that the bars agree
# with the plot title and the y-axis label.
data = []
for c in clusters:
    data.append(go.Bar(
        # Display name for plot
        name = "Cl "+str(c),
        # x-values are the sample names, which are stored in the index
        # (if 'Sample_ID' were a column, we would have said cluster_sample_props['Sample_ID'])
        x = cluster_sample_props.index,
        # y-values are this cluster's percentage within each sample
        y = cluster_sample_props[c],
        # Marker aesthetics
        marker = dict(
            # cluster color
            color = 'rgb' + str(cluster_color_dict[c])
        )
    ))

# Plot all of the data
fig = go.Figure(data=data)

# Update figure aesthetics
fig.update_layout(title = title, plot_bgcolor = 'white',barmode ='stack')

# Update axes
fig.update_xaxes(linecolor = 'black')
fig.update_yaxes(title_text = "Cluster proportion", linecolor = 'black')

# Plot output
#plot(fig)
filename = title.replace(" ","_") + ".png"
filename = os.path.join(output_images_dir, filename)
fig.write_image(filename)

### Prepare for median value heatmap

For this, we will want to combine rows in `cluster_sample_counts` at will, using whichever criteria we choose. The combined rows will be replaced with one sum row. Then, we will determine the proportions and only plot the samples/data we are interested in.

First, let's create a copy of the `cluster_sample_counts` to work with. We want to maintain the original, so that we can create as many different combinations of rows as we want without extra work to revert back to the original.

In [None]:
# Work on a copy so that cluster_sample_counts itself stays intact, and
# rename the index from 'Sample_ID' to 'comb_id' (combination ID)
counts_subset = cluster_sample_counts.copy().rename_axis('comb_id', axis = 'index')

counts_subset.head()



In [None]:
# This function takes in a pandas dataframe and information on how to combine
# rows. It sums the designated rows, removes them from the dataframe, adds the
# summed row in their place (as the last row), and returns the result. The
# dataframe passed in is not modified.
#   df:        pandas dataframe whose rows are to be combined
#   drop_rows: list of index identifiers of the rows to be summed and dropped
#   new_name:  string, index identifier for the new, summed row. This should
#              be the identifier you want to see on the median value heatmap.
# If drop_rows is empty, or if any of its items is not found in the index of
# df, a message is printed and df is returned unaltered.
def row_combiner(df, drop_rows, new_name):
    drop_rows = list(drop_rows)

    # Check that ALL drop_rows are in the index. Proceeding with only some of
    # them present would sum an incomplete set of rows and then fail when
    # trying to drop the missing ones.
    missing = [row for row in drop_rows if row not in df.index]
    if len(drop_rows) == 0 or missing:
        print("1+ item(s) specified for dropping not found in dataframe's index.")
        print("Returning unaltered dataframe.")
        return df

    # Sum the chosen rows of the dataframe that was passed in (all columns),
    # along the 0th axis, i.e. down the rows. The result is a Series with one
    # value per column, so it is converted to a one-column dataframe and
    # transposed to get a row instead of a column.
    new_row = df.loc[df.index.isin(drop_rows), :].sum(axis = 0).to_frame().T

    # Assign the new row its index identifier
    new_row.index = [new_name]

    # Give the new row's index the same name as the index of df, or else the
    # result would lose its index name when the new row is added
    new_row = new_row.rename_axis(df.index.name, axis = 'index')

    # Drop the original rows BEFORE adding the summed row, so that the new
    # row survives even if new_name is the same as one of the dropped rows.
    # pd.concat is used since DataFrame.append was removed in pandas 2.0.
    df = pd.concat([df.drop(drop_rows), new_row])

    return df

Combine specified rows.

In [None]:
# drop_rows = the rows that are going to be replaced with the one combined row
# Example:
#counts_subset = row_combiner(
#    df = counts_subset, drop_rows = ['patient1_data1','patient1_data2'], new_name = 'Patient_1')

# Fill in the identifiers of the rows to combine in drop_rows
counts_subset = row_combiner(
    df = counts_subset, drop_rows = [''], new_name = 'Patient_X')

counts_subset

You can repeat the above process as many times as you would like to add on as many rows as you would like. There are a few other manipulations to consider.

What if we want to drop some additional rows, without replacing them with a sum? The `drop()` function is fed a list of index identifiers to drop from the dataframe.

In [None]:
# Example: counts_subset.drop(['row_identifier1','row_identifier2'])

# Fill in the index identifiers of the rows to drop.
# NOTE: by default drop() raises a KeyError for identifiers that are not in
# the index, so the placeholder must be replaced before running this cell.
rows_to_drop = ['']
counts_subset = counts_subset.drop(rows_to_drop)
counts_subset

What if we want to rename the index name? I will not write over `cluster_sample_counts`, but here is what you would do.

In [None]:
counts_subset.rename_axis('new_index_name',axis = 'index')

What if we made a typo, and Patient_1 should be Patient_10? Again, I am not overwriting `counts_subset` unless I set it equal to this new expression.

In [None]:
counts_subset.rename(index={'Patient_1':'Patient_10'})

What if we want to include one or more of our previously dropped rows? We can append those from the intact `cluster_sample_counts`. Here is what the resulting dataframe would look like.

In [None]:
#pd.concat([counts_subset, cluster_sample_counts.loc[['row_identifier1'],:]])

We also want to create a copy of our original dataframe to use for finding the medians. As before, we will be exchanging original sample labels with other labels. Unlike before, we will not be combining rows until the very end. Row renaming will occur more simply.

In [None]:
# Create a copy of the original df, so that the relabeling and dropping of
# rows done below on for_medians leaves df itself untouched

for_medians = df.copy()

In [None]:
# Rename rows as necessary: every cell whose Sample_ID is in the given list
# gets the new, combined label instead
# Example:
#for_medians.loc[for_medians['Sample_ID'].isin(['patient1_data1','patient1_data2']),['Sample_ID']] = \
#    'Patient_1'

# Fill in the Sample_IDs that should be relabeled
to_relabel = for_medians['Sample_ID'].isin([''])
for_medians.loc[to_relabel, ['Sample_ID']] = 'Patient_X'

# Which labels are present now?
for_medians.Sample_ID.unique()

In [None]:
# Lets take a look at some rows that WERE NOT just renamed

for_medians.loc[~for_medians['Sample_ID'].isin(['Patient_X']),['Sample_ID']].head()

In [None]:
## We ALSO need to drop rows from the `for_medians` df as necessary, so that it matches our `counts_subset` item

# Rows are selected with a boolean mask rather than dropped by index label:
# dropping by label would also remove rows we want to keep whenever index
# labels are not unique (e.g. if cell IDs repeat across samples).

# Keep only the rows labeled 'Patient_X'
keep_rows = for_medians['Sample_ID'] == 'Patient_X'

# Or maybe we want to keep only 'Patient_1' and 'Patient_2'
#keep_rows = for_medians['Sample_ID'].isin(['Patient_2', 'Patient_1'])

# .copy() makes for_medians an independent dataframe rather than a selection
# of the previous one
for_medians = for_medians.loc[keep_rows, :].copy()

Remember `for_medians` is just our regular df with the 'Sample_ID' label changed to reflect our groupings for the visualization.

#### *In order to proceed, both `counts_subset` and `for_medians` should have the same, and only the same, identifiers for sample id (data column)/comb id (index)*

In [None]:
sorted(for_medians.Sample_ID.unique()) == sorted(counts_subset.index.unique())

Now let's incorporate our cutoff. We want to avoid plotting any cluster that makes up less than *x*% of that subset of data. Now that we have counts across all samples we want to group together, we can calculate the proportion of each cluster's contribution to that grouping.

In [None]:
# Create a dataframe of the proportions.
# Strictly speaking these are percentages, not proportions, because of the
# multiplication by 100, so the name is a bit of a misnomer.

# Convert one row of counts into percentages of the row total, rounded to
# one decimal place
def _to_percent_of_row(row):
    return round(row/row.sum()*100,1)

props_subset = counts_subset.apply(_to_percent_of_row, axis = 1)

props_subset

In [None]:
# cutoff should be a percentage, not a proportion
cutoff = 5

# Flag, for one row of percentages, which values meet the inclusion criteria
def _meets_cutoff(row):
    return row >= cutoff

# Dataframe of boolean True/False values with the same layout as
# props_subset: one row per grouping, one column per cluster
props_subset_bool = props_subset.apply(_meets_cutoff, axis = 1)

props_subset_bool

Let's go ahead and create our `medians` dataframe now. This will be a dataframe where each row represents the medians values for markers for each sample-cluster combination.

In [None]:
#medians.columns[medians.columns.isin(not_intensities)].values

In [None]:
# First group by our features of interest and take the median of every
# numeric column within each sample/cluster combination.
# numeric_only = True is required because df also holds non-numeric columns
# (e.g. cell_type): since pandas 2.0 the median of such a column raises an
# error instead of the column being silently left out.
medians = for_medians.groupby(['Sample_ID','cluster']).median(numeric_only = True)

# Then drop all remaining columns that are not markers
medians = medians.drop(columns = medians.columns[medians.columns.isin(not_intensities)].values)


In [None]:
# Visualize medians
medians.head()

What we have here is a multiindex dataframe. We have two layers of row indexing, `Sample_ID` and `cluster`.

Using `medians` and `props_subset_bool`, drop all indices in `medians` where the sample-cluster combination does not meet our proportion threshold cutoff.

In [None]:
medians

In [None]:
# Collect the (Sample_ID, cluster) index entries of `medians` whose
# sample-cluster combination does not meet the proportion cutoff
drop_indices = []
for sample, cluster in medians.index:
    # Look inside props_subset_bool at the row for this sample and the column
    # for this cluster. The True/False value found there indicates whether
    # this cluster should be used with this sample.
    sample_rows = props_subset_bool.index == sample
    cluster_cols = props_subset_bool.columns == cluster
    use = props_subset_bool.loc[sample_rows, cluster_cols].values[0][0]
    if not use:
        drop_indices.append((sample, cluster))

print("Dropping the following patient/cluster data from plotting due to proportion threshold failure: " + str(drop_indices))
medians = medians.drop(drop_indices)

In [None]:
# Save the medians dataframe to a csv file. The index (Sample_ID and cluster)
# is written along with the data.

filename = os.path.join(output_data_dir, "medians_patient_1_patient_2.csv")
medians.to_csv(filename, index = True)

### Median value heatmap

In [None]:
# Rename the `medians` columns for plotting, using the full_to_short_names
# mapping. Columns without an entry in the mapping keep their names.
medians = medians.rename(columns = full_to_short_names)
medians

What if we want the columns in a different order? Simply reorder them by asking for a df with the columns in a given order.

In [None]:
medians[['AXL','53BP1']].head()

In [None]:
medians[['53BP1','AXL']].head()

#### Do the plotting!

In [None]:
# Anchor values for the color scale of the plot
vmin = -2.5
vmax = 5

sb.set()

# Only the columns not listed in not_intensities are plotted. The same values
# are used for the cell colors and for the text printed in the cells.
heatmap_values = medians.loc[:,~medians.columns.isin(not_intensities)]

# Set x- and y-axis labels: column names along x, and one
# "<sample> - Cluster <cluster>" label per row of `medians` along y
x_axis_labels = medians.columns.values.tolist()
y_axis_labels = [sample_id + " - Cluster " + str(cluster_id)
                 for sample_id, cluster_id in medians.index]

ax = sb.heatmap(
    # Data for plotting by color
    heatmap_values,
    # Anchor values are those we determined above
    vmin = vmin, vmax = vmax,
    # Annotations are the text displayed on the plot
    annot = heatmap_values,
    # Annotation keywords - here we determine font size
    annot_kws = {"size": 4},
    # Format of annotations - .2f means a float (decimal) value to the hundredths place
    fmt = '.2f',
    # Add lines of specified color and width between boxes
    linewidths = 0.33, linecolor = 'black',
    # x- and y-axis labels are as specified above
    xticklabels = x_axis_labels, yticklabels = y_axis_labels,
    # Color bar keywords - provide label on color scale bar
    cbar_kws = {'label':'Median value'},
    # Color scheme for plotting
    cmap = 'coolwarm')

# Set size of axis tick markers to 0
ax.tick_params(length=0)

# Adjust y-axis
plt.yticks(rotation=0, size = 8)

# Adjust x-axis: labels on top, rotated and left-aligned
ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')
plt.xticks(rotation=45, size = 8)
plt.setp(ax.xaxis.get_majorticklabels(), ha='left')

# Adjust general plot aesthetics
ax.set_title(label = "Median values", fontsize = 20)
plt.tight_layout()

# Plot output
filename = os.path.join(output_images_dir, "median_values.png")
plt.savefig(filename,dpi=300)

### Drop any other rows or columns we want to before saving data

In [None]:
# Let's take a look
df.columns.values

For the sake of example, I will operate on a copy of df, called df_copy

In [None]:
# You MUST do df.copy()
# 'df_copy = df' would essentially
# give you two different names for the
# SAME dataframe, so operating on one
# would also operate on the other.
# df.copy() creates a separate dataframe instead.
df_copy = df.copy()

#### Operate on entire rows or columns

In [None]:
# Drop columns
# Fill in the names of the columns to drop in my_cols.
# With an empty list, nothing is dropped.
my_cols = []
df_copy = df_copy.drop(columns = my_cols)

In [None]:
# Keep only specific columns (explained below)
# Set my_cols to the list of columns to keep, in the desired order, e.g.
# my_cols = ['Sample_ID', 'cluster']
# By default, all columns currently present in df_copy are kept, so this
# cell changes nothing until my_cols is edited. The columns are taken from
# df_copy rather than df so that the cell still works after columns have been
# dropped from df_copy above.
my_cols = df_copy.columns.values
df_copy = df_copy.loc[:,my_cols]

#### Operate on rows and columns using filtering criteria

In [None]:
# Keep only certain rows based off of criteria

# Filtering is done with df.loc[rows, columns]:
#   df.loc[:, certain_cols] --> keep all rows ':', only certain columns
#   df.loc[certain_rows, :] --> keep only certain rows, all columns ':'

# Say we only want certain values for Sample_ID
print(df_copy.Sample_ID.unique())
keep = ['TMA1.1','TMA1.2','TMA1.3','TMA2.1','TMA2.2','TMA2.3']
# True for every row whose Sample_ID is in the list
wanted_rows = df_copy['Sample_ID'].isin(keep)
df_copy = df_copy.loc[wanted_rows, :]
print(df_copy.Sample_ID.unique())

In [None]:
# Filter on multiple criteria
# Combine criteria with '&' (and) or '|' (or).
# When the logic expressions are written inline, each one MUST be wrapped in
# parentheses. Here each criterion gets its own name instead, which also
# keeps the lines short.
in_tma1 = df_copy['Sample_ID'].isin(['TMA1.1','TMA1.2','TMA1.3'])
in_tma2 = df_copy['Sample_ID'].isin(['TMA2.1','TMA2.2','TMA2.3'])

# Keep the rows that meet either criterion
df_copy = df_copy.loc[in_tma1 | in_tma2, :]
print(df_copy.Sample_ID.unique())

In [None]:
# Remove rows based off of certain criteria
# note the negating tilde '~'!

in_tma1 = df_copy['Sample_ID'].isin(['TMA1.1','TMA1.2','TMA1.3'])
in_tma2 = df_copy['Sample_ID'].isin(['TMA2.1','TMA2.2','TMA2.3'])

# Keep only the rows that meet neither criterion
df_copy = df_copy.loc[(~in_tma1) & (~in_tma2), :]
print(df_copy.Sample_ID.unique())

### Save the data by Sample_ID

In [None]:
# Check for existence of output files first: report every sample whose
# output file is already present in the output data directory
for sample in ls_samples:
    filename = os.path.join(output_data_dir, sample + "_" + step_suffix + ".csv")
    if os.path.exists(filename):
        print("File by name "+filename+" already exists.")

In [None]:
# Save output files: one csv per sample, holding only that sample's rows.
# The index is written along with the data.
for sample in ls_samples:
    filename = os.path.join(output_data_dir, sample + "_" + step_suffix + ".csv")
    df_save = df.loc[df['Sample_ID'] == sample,:]
    df_save.to_csv(filename, index = True)
