In [None]:
import pandas as pd
%reload_ext watermark
%matplotlib inline

import os
from scipy.stats import mannwhitneyu
from metapool.metapool import *
from metapool import (SAMPLE_DNA_CONC_KEY)
from metapool.mp_strings import (
    PM_SAMPLE_KEY, PM_LIB_WELL_KEY)
from metapool.util import warn_if_fp_exists
%watermark -i -v -iv -m -h -p metapool,sample_sheet,openpyxl -u

In [None]:
# Print every package installed in the active conda environment.
! conda list

# Knight Lab TellSeq pipeline notebook B

## Part 3 (of 5): Library concentration estimation

This portion of the notebook takes in fluorescent quantification values and
produces visual outputs for interpreting and checking those values.

As inputs, this workflow requires:
1. A tab-delimited `*_plate_df_A.txt` file containing the plate map info from the compression/barcoding step
2. MiniPico output (tab-delimited text format with columns 'Concentration' and 'Well')

The workflow:
1. reads in the specified input files
2. calculates estimated library concentration
3. visualizes concentration
4. produces a tab-delimited `*_plate_df_B.txt` file containing the updated plate map info

### Part 3 of 5, Step 1 of 5: Read in the 384-well plate data and the experiment info

In [None]:
## INPUT
# Path to the tab-delimited plate map produced by the compression/barcoding
# step; the following cell requires the name to end with "plate_df_A.txt".
full_plate_fp = './test_output/QC/Tellseq_plate_df_A.txt'

In [None]:
# The input must be the "A" plate-map file: fail fast with a ValueError if
# full_plate_fp does not end with the expected suffix, and name the offending
# path so the user can see what was actually supplied.
expected_suffix = "plate_df_A.txt"
if not full_plate_fp.endswith(expected_suffix):
    raise ValueError(
        f"Expected file ending with '{expected_suffix}', "
        f"got '{full_plate_fp}'")

In [None]:
# Stop here with a clear error when the input file is missing, instead of
# printing a message and letting execution continue into the cell that
# reads the file. This matches the suffix check, which also raises.
if not os.path.isfile(full_plate_fp):
    raise FileNotFoundError(
        "Problem! %s is not a path to a valid file" % full_plate_fp)

In [None]:
# Load the tab-delimited plate map into a DataFrame and preview its first rows.
plate_df = pd.read_table(full_plate_fp)
plate_df.head()

In [None]:
# NOTE(review): is_absquant is not imported by name in this notebook, so it
# presumably comes from `from metapool.metapool import *`; judging by its name
# it reports whether this plate map is from an absolute-quantification run —
# confirm against metapool.
is_absquant(plate_df)

### Part 3 of 5, Step 2 of 5: read in MiniPico library concentration
Enter path to MiniPico file:

In [None]:
## INPUT
# Path to the MiniPico output file (tab-delimited text with 'Concentration'
# and 'Well' columns).
lib_concs_fp = './test_data/Quant/MiniPico/Tellseq_clean_lib_quant.txt'

In [None]:
# 'Well' differs from 'Library Well' because the former specifies the
# gDNA source well while the latter specifies the destination well that
# will contain the sequencing library for the sample. These contain the same
# info when replicates are not used, but differ when replicates ARE used,
# so it is safer to use 'Library Well' in both cases.
# (Careful! well_col is a global variable used throughout the rest of the
# notebook.)
well_col = PM_LIB_WELL_KEY

In [None]:
# Name of the column that holds the raw MiniPico readings.
lib_conc_col = 'MiniPico Library DNA Concentration'

# Read the plate-reader export and key it by the library (destination) well
# so it can be joined to the plate map on well_col.
lib_concs = read_pico_csv(lib_concs_fp, plate_reader='SpectraMax_i3x',
                          conc_col_name=lib_conc_col)
lib_concs = lib_concs.rename(columns={'Well': well_col})

# Make this cell safe to re-run: if plate_df already carries the reading
# column (from a previous execution), a second merge would emit
# '<col>_x' / '<col>_y' columns and later lookups of lib_conc_col would fail.
plate_df = plate_df.drop(columns=[lib_conc_col], errors='ignore')

# pd.merge defaults to an inner join, so plate-map wells with no reading are
# dropped. Report that instead of letting it happen silently.
n_wells_before = len(plate_df)
plate_df = pd.merge(plate_df, lib_concs, on=well_col)
n_wells_dropped = n_wells_before - len(plate_df)
if n_wells_dropped > 0:
    print("Warning: %d well(s) in the plate map had no MiniPico reading "
          "and were dropped by the merge" % n_wells_dropped)

plate_df.head()

### Part 3 of 5, Step 3 of 5: calculate sample concentration from MiniPico

You will want to make sure that 'size' is correct for your average library size.

In [None]:
# Convert the raw MiniPico readings into library concentrations.
# `size` must match the average library size for this prep.
plate_df['MiniPico Library Concentration'] = compute_pico_concentration(
    plate_df['MiniPico Library DNA Concentration'],
    size=500,
)
plate_df.head()

### Part 3 of 5, Step 4 of 5: visualize MiniPico values

This step will present visuals of the results, including:
1. Scatter plot of DNA concentrations by Library concentration
2. Plate-wise heatmap and histogram showing library concentrations
3. per-96-well plate heatmaps and histograms showing library concentrations and sample names
4. Plate-wise heatmap showing pooling values

#### Library concentration by sample DNA concentration:

In [None]:
# One row of three panels relating library yield to the sample input.
f, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(14, 4))

# Derived column: sample concentration times normalized volume, divided by 1000.
plate_df['Input DNA'] = (
    plate_df['Sample DNA Concentration']
    * plate_df['Normalized DNA volume']
    / 1000
)

# Panel 1: regression of library reading on sample DNA concentration.
sns.regplot(
    x="Sample DNA Concentration",
    y="MiniPico Library DNA Concentration",
    data=plate_df,
    ax=ax1,
)

# Panel 2: library reading split by the 'Blank' flag, with the individual
# wells overlaid on the boxes.
sns.boxplot(
    x="Blank",
    y="MiniPico Library DNA Concentration",
    data=plate_df,
    ax=ax2,
)
sns.swarmplot(
    x="Blank",
    y="MiniPico Library DNA Concentration",
    data=plate_df,
    ax=ax2,
    size=3,
    color='black',
    alpha=0.5,
)

# Panel 3: library reading against input DNA, coloured by sample concentration;
# the legend is placed outside the axes on the right.
sns.scatterplot(
    x="Input DNA",
    y="MiniPico Library DNA Concentration",
    hue='Sample DNA Concentration',
    data=plate_df,
    ax=ax3,
)
ax3.legend(
    title='Sample DNA Concentration',
    loc='center left',
    bbox_to_anchor=(1, 0.5),
)

In [None]:
# Mann-Whitney U test: sample gDNA concentrations vs. blank gDNA concentrations.
blanks_gdna_concs = plate_df.loc[
    plate_df['Blank'].eq(True), 'Sample DNA Concentration']
samples_gdna_concs = plate_df.loc[
    plate_df['Blank'].eq(False), 'Sample DNA Concentration']
mannwhitneyu(samples_gdna_concs, blanks_gdna_concs)

In [None]:
# Mann-Whitney U test: sample library concentrations vs. blank library
# concentrations.
blanks_lib_concs = plate_df.loc[
    plate_df['Blank'].eq(True), 'MiniPico Library Concentration']
samples_lib_concs = plate_df.loc[
    plate_df['Blank'].eq(False), 'MiniPico Library Concentration']
mannwhitneyu(samples_lib_concs, blanks_lib_concs)

#### Library concentration heatmap, whole plate

In [None]:
# Get library concentration and sample DNA concentration values, keyed by
# well_col and cast to float, for plotting.
# NOTE(review): make_2D_array presumably arranges the values by well position
# into a plate-shaped array — confirm against metapool.
concs = make_2D_array(plate_df, data_col="MiniPico Library Concentration", well_col=well_col).astype(float)
dna = make_2D_array(plate_df, data_col=SAMPLE_DNA_CONC_KEY, well_col=well_col).astype(float)

# Get sample names in the same arrangement; they are used to annotate the
# per-plate heatmaps.
names = make_2D_array(plate_df, data_col=PM_SAMPLE_KEY, well_col=well_col)

In [None]:
# Plot the library concentrations for the whole plate using the viridis colour map.
plot_plate_vals(concs, color_map='viridis')

#### Plate maps for individual constituent plates

In [None]:
# CONSTANTS: Users, DO NOT CHANGE THESE
# values without consulting with tech team

# Row and column index lists over a 16 x 24 grid, split by parity:
# every second index starting at 0 (even) or at 1 (odd).
EVEN_ROWS = list(range(0, 16, 2))
ODD_ROWS = list(range(1, 16, 2))
EVEN_COLS = list(range(0, 24, 2))
ODD_COLS = list(range(1, 24, 2))

##### Library concentration heatmap, Plate 1

In [None]:
# Plate 1: even rows and even columns of the full-plate arrays, annotated
# with the sample names.
plate_1_wells = np.ix_(EVEN_ROWS, EVEN_COLS)
plot_plate_vals(concs[plate_1_wells],
                annot_str=names[plate_1_wells],
                color_map='viridis',
                annot_fmt='')

##### Library concentration heatmap, Plate 2

In [None]:
# Plate 2: even rows and odd columns of the full-plate arrays, annotated
# with the sample names.
plate_2_wells = np.ix_(EVEN_ROWS, ODD_COLS)
plot_plate_vals(concs[plate_2_wells],
                annot_str=names[plate_2_wells],
                color_map='viridis',
                annot_fmt='')

##### Library concentration heatmap, Plate 3

In [None]:
# Plate 3: odd rows and even columns of the full-plate arrays, annotated
# with the sample names.
plate_3_wells = np.ix_(ODD_ROWS, EVEN_COLS)
plot_plate_vals(concs[plate_3_wells],
                annot_str=names[plate_3_wells],
                color_map='viridis',
                annot_fmt='')

##### Library concentration heatmap, Plate 4

In [None]:
# Plate 4: odd rows and odd columns of the full-plate arrays, annotated
# with the sample names.
plate_4_wells = np.ix_(ODD_ROWS, ODD_COLS)
plot_plate_vals(concs[plate_4_wells],
                annot_str=names[plate_4_wells],
                color_map='viridis',
                annot_fmt='')

### Part 3 of 5, Step 5 of 5: Write plate info to file

We want to keep all that useful information together in one place so that
it can be easily parsed later. Enter the base (without extension) of the output file name; the code will provide the extension.

In [None]:
## INPUT
# Base of the output file name, without extension; "_plate_df_B.txt" is
# appended to it when the output path is built.
plate_df_fbase = './test_output/QC/Tellseq'

Add final column to plate df, then save to a file.

In [None]:
# Add the 'sample sheet Sample_ID' column by mapping bcl_scrub_name over the
# sample names, then preview the result.
plate_df['sample sheet Sample_ID'] = (
    plate_df[PM_SAMPLE_KEY].map(bcl_scrub_name))
plate_df.head()

In [None]:
# Build the output path from the user-supplied base name.
plate_df_fp = f"{plate_df_fbase}_plate_df_B.txt"
# NOTE(review): judging by its name, warn_if_fp_exists warns (rather than
# stops) when a file already exists at this path — confirm in metapool.util.
warn_if_fp_exists(plate_df_fp)

In [None]:
# Write the updated plate map as tab-delimited text, omitting the DataFrame index.
plate_df.to_csv(plate_df_fp, sep='\t', index=False)