In [None]:
import pandas as pd
%reload_ext watermark
%matplotlib inline

import os
from scipy.stats import mannwhitneyu
import yaml
from metapool.metapool import *
from metapool.util import (
    join_dfs_from_files, extend_sample_accession_df,
    extend_compression_layout_info, QIITA_STUDY_ID_KEY)
from metapool.plate import PlateReplication, record_gdna_dilution
from metapool import (add_controls, compress_plates, 
                      TUBECODE_KEY, SAMPLE_NAME_KEY, SAMPLE_DNA_CONC_KEY, 
                      NORMALIZED_DNA_VOL_KEY)
from metapool.mp_strings import (
    PM_SAMPLE_KEY, PM_WELL_KEY, PM_LIB_WELL_KEY, TELLSEQ_BARCODE_ID_KEY, 
    TELLSEQ_BARCODE_SET_ID_KEY)
from metapool.util import warn_if_fp_exists
%watermark -i -v -iv -m -h -p metapool,sample_sheet,openpyxl -u

In [None]:
! conda list

# Knight Lab TellSeq pipeline notebook

## Part 1 (of 5): Workflow for normalizing DNA

This portion of the notebook will read in the output of the mini-Pico quantification assay and construct an Echo normalization picklist file. 

As inputs, it requires:
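1. A tab- or comma-delimited row-wise sample accession file (the separator is inferred from the file extension: `.csv` for comma, `.tsv`/`.txt` for tab) that indicates the sample name (`sample_name`) and its associated matrix tube barcode (`TubeCode`)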
2. A tab-delimited metadata file downloaded from Qiita
3. An accurate plate compression form, with appropriate VisionMate barcode scanner files (`Plate map file`)
4. **TWO** DNA concentration files: one for the undiluted plate and one for the 1:10 dilution plate

The workflow then:
1. reads in the specified input files and constructs a dataframe
2. calculates volumes to be added via echo to reach desired input DNA quantity, with info on which samples need to be pulled from the diluted plate and which from the original plate
3. produces an Echo-formatted pick list file

### Part 1 of 5, Step 0 of 8: Provide inputs

In [None]:
## INPUT
expt_name = "RKLtest"

In [None]:
## INPUT
# One dictionary per study included in the samples on this run.
studies_info = [
    # EVERY entry in the dictionary must be specifically updated 
    # *every* time this notebook is run--none of these have defaults!
    {
    'Project Name': 'Wellcome_Leap_15538', # PROJECTNAME_QIITAID
    'Project Abbreviation': 'WellcomeLeap', # PROJECTNAME
    'sample_accession_fp': './test_data/Plate_Maps/Tellseq_Wellcome Leap - 15538 - Sample Accession.csv',
    'qiita_metadata_fp': './test_data/Plate_Maps/15538_20241004-110731.txt',
    'experiment_design_description': 'plasma sequencing',
    'HumanFiltering': 'True', 
    'Email': 'r@gmail.com'
    }  
]

In [None]:
## INPUT
# TODO: ask what you put in here when doing replicates
compression_layout = [
    {
        # top left plate
        'Plate Position': 1, # as int
        'Plate map file': './test_data/Plate_Maps/Tellseq_Test_Plate_1.tsv',
        'Project Name': 'Wellcome_Leap_15538', # PROJECTNAME_QIITAID
        'Project Plate': 'Plate_1', # Plate_#
        'Plate elution volume': 70
    },
    {
        # top right plate
        'Plate Position': 2, # as int
        'Plate map file': './test_data/Plate_Maps/Tellseq_Test_Plate_2.tsv',
        'Project Name': 'Wellcome_Leap_15538', # PROJECTNAME_QIITAID
        'Project Plate': 'Plate_2', # Plate_#
        'Plate elution volume': 70
    },
    {
        # bottom left plate
        'Plate Position': 3, # as int
        'Plate map file': './test_data/Plate_Maps/Tellseq_Test_Plate_3.tsv',
        'Project Name': 'Wellcome_Leap_15538', # PROJECTNAME_QIITAID
        'Project Plate': 'Plate_3', # Plate_#
        'Plate elution volume': 70
    },
    {
        # bottom right plate
        'Plate Position': 4, # as int
        'Plate map file': './test_data/Plate_Maps/Tellseq_Test_Plate_4.tsv',
        'Project Name': 'Wellcome_Leap_15538', # PROJECTNAME_QIITAID 
        'Project Plate': 'Plate_4',  # Plate_#
        'Plate elution volume': 70
    },
]

In [None]:
# CONSTANTS: Users, DO NOT CHANGE THESE
# values without consulting with tech team

# Zero-based index masks picking out alternating rows (of 16) and
# alternating columns (of 24)
EVEN_ROWS = list(range(0, 16, 2))
ODD_ROWS = list(range(1, 16, 2))
EVEN_COLS = list(range(0, 24, 2))
ODD_COLS = list(range(1, 24, 2))

In [None]:
def get_studies_attr_list(studies_dict, desired_key):
    """Collect the value stored under ``desired_key`` in every study entry.

    Parameters
    ----------
    studies_dict : iterable of dict
        The per-study entries to read from (iterated in order).
    desired_key : hashable
        Key to look up in each entry.

    Returns
    -------
    list
        One value per entry, in the same order as ``studies_dict``.

    Raises
    ------
    KeyError
        If any entry lacks ``desired_key``.
    """
    attr_values = []
    for curr_study in studies_dict:
        attr_values.append(curr_study[desired_key])
    return attr_values

def pick_expected_separator(fps_list):
    """Guess the column separator for a group of files from their extensions.

    Parameters
    ----------
    fps_list : list of str
        File paths whose extensions are inspected. Matching is
        case-insensitive.

    Returns
    -------
    tuple of (str, str)
        The separator and a printable name for it: ``(",", "comma")`` when
        the list is non-empty and every path ends in ``.csv``; otherwise
        ``("\\t", "tab")``.

    Warns
    -----
    UserWarning
        When the list is empty, or when the paths are not all ``.csv`` and
        not all ``.tsv``/``.txt``; the tab default is returned in that case.
    """
    # Imported locally so the fallback branch does not depend on `warnings`
    # happening to be in the notebook namespace via a wildcard import.
    import warnings

    sep = "\t"
    visible_sep = "tab"

    lowered_fps = [str(x).lower() for x in fps_list]
    num_fps = len(lowered_fps)
    num_csv = sum(x.endswith('.csv') for x in lowered_fps)
    num_tab_like = sum(x.endswith(('.tsv', '.txt')) for x in lowered_fps)

    # An empty list must not count as "all csv" (0 == 0), so require at
    # least one path before switching away from the tab default.
    if num_fps > 0 and num_csv == num_fps:
        sep = ','
        visible_sep = "comma"
    elif num_fps == 0 or num_tab_like != num_fps:
        warnings.warn(
            "Could not determine separator; defaulting to " + visible_sep)

    return sep, visible_sep

### Part 1 of 5, Step 1 of 8: Read in sample accession files

In [None]:
# read in the sample accession files
sample_accession_fps = get_studies_attr_list(
    studies_info, 'sample_accession_fp')
sample_acc_sep, sa_sep_name = pick_expected_separator(sample_accession_fps)
print(f"Expected sample accession separator: {sa_sep_name}")

In [None]:
sample_accession_df = join_dfs_from_files(
    sample_accession_fps, [SAMPLE_NAME_KEY, TUBECODE_KEY], sep=sample_acc_sep)
sample_accession_df.shape

In [None]:
sample_accession_df.head()

### Part 1 of 5, Step 2 of 8: Read in the sample info from Qiita

In [None]:
# read in the qiita metadata files
qiita_metadata_fps = get_studies_attr_list(studies_info, 'qiita_metadata_fp')
qiita_metadata_sep, qm_sep_name = pick_expected_separator(qiita_metadata_fps)
print(f"Expected qiita metadata separator: {qm_sep_name}")

In [None]:
metadata_df = join_dfs_from_files(
    qiita_metadata_fps, [SAMPLE_NAME_KEY, QIITA_STUDY_ID_KEY], 
    opt_cols_to_extract=['tube_id'], unique_cols=[SAMPLE_NAME_KEY],
    sep=qiita_metadata_sep)
metadata_df.shape

In [None]:
metadata_df.head()

Now use the metadata to link the study info into the sample accession dataframe:

In [None]:
extended_sample_accession_df = extend_sample_accession_df(
    sample_accession_df, studies_info, metadata_df)
extended_sample_accession_df.head()

### Part 1 of 5, Step 3 of 8: Assign the compression layout and add controls

In [None]:
## INPUT
blanks_dir = './test_data/BLANKS_for_tellseq'

## INPUT
# ATTENTION: Does your plate include katharoseq controls?
# If *yes*, replace the None below with the path to the directory they are in, such as
# katharoseq_dir = './test_data/katharoseq'
katharoseq_dir = None

In [None]:
# copy study info into the compression layout dictionary (so that it doesn't 
# have to be entered manually in both places)
extended_compression_layout = extend_compression_layout_info(
    compression_layout, studies_info)

In [None]:
plate_df = compress_plates(extended_compression_layout, 
                           extended_sample_accession_df, well_col=PM_WELL_KEY)
plate_df.head()

Check for samples with missing names; at this point, we expect all blanks
and katharoseq controls WON'T have names.

In [None]:
def check_nan_samples(a_plate_df, a_blanks_dir=None):
    """Report how many rows of the plate dataframe have no sample name.

    Always prints the count of rows whose PM_SAMPLE_KEY value is missing.
    If that count is non-zero and ``a_blanks_dir`` is truthy, also prints
    instructions for resolving the unnamed samples.

    Parameters
    ----------
    a_plate_df : pandas.DataFrame
        Plate dataframe containing a PM_SAMPLE_KEY column.
    a_blanks_dir : str, optional
        Blanks directory named in the printed instructions; when omitted
        (or otherwise falsy) only the count is printed.
    """
    missing_name_mask = a_plate_df[PM_SAMPLE_KEY].isna()
    missing_name_count = len(a_plate_df.loc[missing_name_mask])
    print("Number of samples with missing names: %d" % missing_name_count)

    # Nothing more to say unless there is a problem AND we know which
    # blanks directory to point the user at.
    if missing_name_count == 0 or not a_blanks_dir:
        return

    print(f"""
By now, all samples should have names, so **do not continue** before fixing this!

"Unofficial" blanks are the most likely issue.
Determine if the tube codes for the problem samples (shown below) are blanks.
If they are, add them to the missing_blanks.csv file in the {a_blanks_dir} directory.
Then re-run from 'Part 1 of 5, Step 3 of 8: Assign the compression layout and add controls'.""")

In [None]:
check_nan_samples(plate_df)

In [None]:
plate_df = add_controls(plate_df, blanks_dir, katharoseq_dir)

After adding controls, check again for samples with missing names; 
at this point, we expect all blanks and katharoseq controls WILL have names, 
so if there are any remaining samples without names, 
stop processing and fix them!

In [None]:
## DECISION -- stop if there are still samples without names
check_nan_samples(plate_df, a_blanks_dir=blanks_dir)
plate_df[plate_df[PM_SAMPLE_KEY].isna()]

### Part 1 of 5, Step 4 of 8: Validate plate dataframe

In [None]:
# note that this function does not *need* the extended sample accession df,
# but it is easier to use it just to keep things consistent
validate_plate_df(plate_df,metadata_df, extended_sample_accession_df, 
                  blanks_dir, katharoseq_dir)

### Part 1 of 5, Step 5 of 8: read in DNA concentrations and add to plate map

Enter the path to each of the Pico DNA concentration output files. Each one should be
 a tab-separated file produced by the MiniPico assay on the condensed, 
 384-well plate, and should have a format like the below:

```
##BLOCKS= 1
Group: Unknowns
Sample	Wells	RFU_Values	Concentration	Mean_Conc	SD	CV	Dilution	AdjConc	
01	A1	528791.000	2.472	2.472	0.000	0.0			
02	C1	481728.000	2.282	2.282	0.000	0.0			
03	E1	462964.000	2.206	2.206	0.000	0.0			
04	G1	556609.000	2.585	2.585	0.000	0.0			
05	I1	710679.000	3.207	3.207	0.000	0.0			
06	K1	655693.000	2.985	2.985	0.000	0.0		
```

In [None]:
## INPUT
# ORIGINAL (undiluted) gDNA concentration file
sample_concs_fp =  './test_data/Quant/MiniPico/Tellseq_gDNA_Original_Quant.txt'

# 1:10 diluted gDNA concentration file
diluted_sample_concs_fp = './test_data/Quant/MiniPico/Tellseq_gDNA_diluted_10_to_1_Quant.txt'

In [None]:
for curr_fp in [sample_concs_fp, diluted_sample_concs_fp]:
    if not os.path.isfile(curr_fp):
        print("Problem! %s is not a path to a valid file" % curr_fp)

In [None]:
DILUTED_SUFFIX = "_diluted"
UNDILUTED_SUFFIX = "_undiluted"
UNDILUTED_CONC_KEY = f"{SAMPLE_DNA_CONC_KEY}{UNDILUTED_SUFFIX}"
DILUTED_CONC_KEY = f"{SAMPLE_DNA_CONC_KEY}{DILUTED_SUFFIX}"

def read_agnostic_pico_csv(a_fp, name_suffix, plate_reader):
    """Read a pico concentration file and suffix every column except the well.

    Parameters
    ----------
    a_fp : str
        Path passed through to ``read_pico_csv``.
    name_suffix : str
        Text appended to the name of every column other than PM_WELL_KEY.
    plate_reader : str
        Plate reader identifier passed through to ``read_pico_csv``.

    Returns
    -------
    pandas.DataFrame
        The dataframe from ``read_pico_csv`` with its columns renamed.

    Raises
    ------
    KeyError
        If the dataframe that was read has no PM_WELL_KEY column.
    """
    pico_df = read_pico_csv(a_fp, plate_reader=plate_reader)

    renames = {}
    for curr_col in pico_df.columns:
        renames[curr_col] = f"{curr_col}{name_suffix}"

    # The well column keeps its original name; `del` (like the mapping's
    # pop) raises KeyError if that column is absent.
    del renames[PM_WELL_KEY]

    return pico_df.rename(columns=renames)

In [None]:
sample_concs = read_agnostic_pico_csv(
    sample_concs_fp, UNDILUTED_SUFFIX, plate_reader='SpectraMax_i3x')
sample_concs.head()

In [None]:
diluted_sample_concs = read_agnostic_pico_csv(
    diluted_sample_concs_fp, DILUTED_SUFFIX, 'SpectraMax_i3x')
diluted_sample_concs.head()

In [None]:
# Attach the undiluted and then the diluted concentration columns to the
# plate map, matching rows on the well column. pd.merge is called without
# `how`, so each merge is an inner join: any well that is absent from either
# concentration dataframe is dropped from plate_df.
# NOTE(review): plate_df is reassigned from itself here, so re-running this
# cell without first re-running the cells that build plate_df would merge the
# concentration columns in a second time -- confirm before re-executing.
plate_df = pd.merge(plate_df, sample_concs, on=PM_WELL_KEY)
plate_df = pd.merge(plate_df, diluted_sample_concs, on=PM_WELL_KEY)
# start from the undiluted concentration as the default working value
plate_df[SAMPLE_DNA_CONC_KEY] = plate_df[UNDILUTED_CONC_KEY]  # default
plate_df.head()

In [None]:
## INPUT -- verify default
# Add dilution info into the plate df; for every sample with a DILUTED 
# concentration greater than or equal to the min concentration threshold, we
# SHOULD use the diluted plate values.
min_conc_threshold = 1.5  # ng/ul

In [None]:
diluted_mask = plate_df[DILUTED_CONC_KEY] >= min_conc_threshold
plate_df = record_gdna_dilution(plate_df, diluted_mask, DILUTED_CONC_KEY)
plate_df.head()

**Visualize plate DNA concentrations and plate map:**

Undiluted concentrations

In [None]:
# get DNA concentration information
undiluted_dna_concs = make_2D_array(plate_df, data_col=UNDILUTED_CONC_KEY, 
                          well_col=PM_WELL_KEY).astype(float)

# get information for annotation
names = make_2D_array(plate_df, data_col=PM_SAMPLE_KEY, well_col=PM_WELL_KEY)

plot_plate_vals(undiluted_dna_concs,
                annot_str=names,
                color_map='viridis',
                annot_fmt='.5s')

Diluted concentrations

In [None]:
# get DNA concentration information
diluted_dna_concs = make_2D_array(plate_df, data_col=DILUTED_CONC_KEY, 
                                  well_col=PM_WELL_KEY).astype(float)

# get information for annotation
diluted_names = make_2D_array(plate_df, data_col=PM_SAMPLE_KEY, well_col=PM_WELL_KEY)

plot_plate_vals(diluted_dna_concs,
                annot_str=diluted_names,
                color_map='viridis',
                annot_fmt='.5s')

#### Make sample replicates

Set replicate dictionary, if needed.

In [None]:
# Replicate formats:
# replicate_dict = {source1_quadrant:destination1_quadrant}
# replicate_dict = {source1_quadrant:[destination1_quadrant,destination2_quadrant]}
# Replicate example: 
# replicate_dict = {1:[2,3]}
# for no replicates, use:
replicate_dict = None

# 'Well' differs from 'Library Well' because the former specifies the 
# gDNA source well while the latter specifies the well (destination well) that 
# will contain the sequencing library for the sample. These contain the same
# info when replicates are not used, but differ when replicates ARE used,
# so it is safer to use 'Library Well' in both cases.
# (Careful!  well_col is a global variable used throughout rest of notebook)
well_col = PM_LIB_WELL_KEY

In [None]:
# initialize new PlateReplication object to manage metadata, conversions, etc.
# initialize w/preferred well_col.
pr = PlateReplication(well_col)

# overwrite=True is what is passed below; set overwrite=False to detect any
# overwriting of source or destination quads and raise an Error.
plate_df = pr.make_replicates(
    plate_df, replicates=replicate_dict, overwrite=True)

# replicates overlapping sample_wells for other samples should raise warning,
# but will be allowed
# NOTE(review): this test looks for the *string* 'True' among the column's
# unique values. If make_replicates stores booleans rather than strings, the
# test would presumably never match and the else branch would overwrite the
# column with False -- confirm the dtype make_replicates produces.
if 'True' in plate_df['contains_replicates'].unique():
    # Replicates are not supported downstream in this notebook, so stop here.
    raise NotImplementedError("This notebook does not yet support replicates.")
    
    # The commented-out code below follows the raise and is not executed;
    # presumably it is kept as a starting point for future replicate support.
    # plate_df['contains_replicates'] = True
    # # get DNA concentration information
    # dna_concs = make_2D_array(plate_df, data_col='Sample DNA Concentration', 
    #                           well_col=well_col).astype(float)
    # 
    # # get information for annotation
    # names = make_2D_array(plate_df, data_col=PM_SAMPLE_KEY, well_col=well_col)
    # 
    # plot_plate_vals(dna_concs,
    #             annot_str=names,
    #             color_map='viridis',
    #             annot_fmt='.6s')
else:
    # no replicates found: set the column to the boolean False
    plate_df['contains_replicates'] = False
    
# show whether this plate contains replicates or not
f"Contains replicates: {plate_df['contains_replicates'].unique()}"

#### gDNA concentration heatmap, Plate 1

In [None]:
plot_plate_vals(undiluted_dna_concs[np.ix_(EVEN_ROWS,EVEN_COLS)],
                annot_str= names[np.ix_(EVEN_ROWS,EVEN_COLS)],
                color_map='viridis',
                annot_fmt='')

#### gDNA concentration heatmap, Plate 2

In [None]:
plot_plate_vals(undiluted_dna_concs[np.ix_(EVEN_ROWS,ODD_COLS)],
                    annot_str= names[np.ix_(EVEN_ROWS,ODD_COLS)],
                    color_map='viridis',
                    annot_fmt='')

#### gDNA concentration heatmap, Plate 3

In [None]:
plot_plate_vals(undiluted_dna_concs[np.ix_(ODD_ROWS,EVEN_COLS)],
                    annot_str= names[np.ix_(ODD_ROWS,EVEN_COLS)],
                    color_map='viridis',
                    annot_fmt='')


#### gDNA concentration heatmap, Plate 4

In [None]:
plot_plate_vals(undiluted_dna_concs[np.ix_(ODD_ROWS,ODD_COLS)],
                    annot_str= names[np.ix_(ODD_ROWS,ODD_COLS)],
                    color_map='viridis',
                    annot_fmt='')

### Part 1 of 5, Step 6 of 8: calculate normalization volumes and add to plate map

This step will calculate volumes for the DNA normalization pick list.

Check the desired values for:
 - **`ng`**: the desired quantity of DNA in normed plate, in ng
 - **`total_vol`**: the total volume of normalized DNA, in nL
 - **`min_vol`**: the minimum quantity of sample to add, in nL
 - **`resolution`**: the resolution of the Echo, in nL (usually 2.5)

In [None]:
## INPUT -- verify defaults
ng = 7.5
total_vol = 5000
min_vol = 25
resolution = 2.5

In [None]:
NORMALIZED_WATER_VOL_KEY = 'Normalized water volume'

dna_vols = calculate_norm_vol(
    plate_df[SAMPLE_DNA_CONC_KEY], ng=ng, min_vol=min_vol, 
    max_vol=total_vol, resolution=resolution)
water_vols = total_vol - dna_vols

plate_df[NORMALIZED_DNA_VOL_KEY] = dna_vols
plate_df[NORMALIZED_WATER_VOL_KEY] = water_vols
plate_df.head()

### Part 1 of 5, Step 7 of 8 (optional): Add synDNA spike-in

In [None]:
## INPUT
# Set syndna_pool_number to 1 if syndna is being used; otherwise, leave as None
syndna_pool_number = None
syndna_picklist_fp = './test_output/Input_Norm/Tellseq_matrix_syndna_absquant.txt'
# The below fp can be the same as the undiluted concentration file input at
# Part 1, Step 5, and generally will be EXCEPT if there was a dilution done
# between the elution and the concentration measurement at that step
# (e.g., for NPH).
undiluted_gdna_conc_fp = './test_data/Quant/MiniPico/Tellseq_gDNA_Original_Quant.txt'

In [None]:
plate_df = add_syndna(plate_df, 
                      syndna_pool_number=syndna_pool_number,
                      syndna_concentration=2.22)

In [None]:
f'For this plate, is_absquant = {is_absquant(plate_df)}'

In [None]:
if is_absquant(plate_df):
    # add undiluted gdna concentrations to plate_df
    plate_df = add_undiluted_gdna_concs(plate_df, undiluted_gdna_conc_fp)     

    # create syndna picklist    
    syndna_well='A1'
    syndna_plate = 'synDNA plate'
    syndna_picklist = \
        format_dna_norm_picklist(
            np.array(plate_df['synDNA volume']),
            np.zeros(plate_df.shape[0]),
            np.repeat(syndna_well,plate_df.shape[0]),
            dest_wells = np.array(plate_df[well_col]),
            sample_names = np.array(plate_df[PM_SAMPLE_KEY]),
            sample_plates = np.repeat(syndna_plate,plate_df.shape[0]))

In [None]:
if is_absquant(plate_df):
    if os.path.isfile(syndna_picklist_fp):
        print("Warning! This file exists already.")

In [None]:
if is_absquant(plate_df):
    with open(syndna_picklist_fp, 'w') as f:
        f.write(syndna_picklist)

    !head {syndna_picklist_fp}

### Part 1 of 5, Step 8 of 8: Make pick list and write to file

Format the Echo-compatible pick list.

In [None]:
## INPUT
norm_picklist_fp = './test_output/Input_Norm/Tellseq_inputnorm.txt'

In [None]:
# Build the DNA-normalization pick list from the per-sample DNA and water
# volumes computed earlier. The third positional argument (presumably the
# source wells -- TODO confirm against format_dna_norm_picklist) comes from
# the PM_WELL_KEY column, while destination wells come from the well_col
# column.
# NOTE(review): PM_COMPRESSED_PLATE_NAME_KEY and np are not among this
# notebook's explicit imports; presumably they arrive via
# `from metapool.metapool import *` -- confirm.
norm_picklist = format_dna_norm_picklist(
    np.array(plate_df[NORMALIZED_DNA_VOL_KEY]),
    np.array(plate_df[NORMALIZED_WATER_VOL_KEY]),
    np.array(plate_df[PM_WELL_KEY]),
    dest_wells = np.array(plate_df[well_col]),
    sample_names = np.array(plate_df[PM_SAMPLE_KEY]),
    sample_plates = np.array(plate_df[PM_COMPRESSED_PLATE_NAME_KEY]),
    dna_concs = np.array(plate_df[SAMPLE_DNA_CONC_KEY]))

In [None]:
# Write the picklist as .txt
warn_if_fp_exists(norm_picklist_fp)

In [None]:
with open(norm_picklist_fp, 'w') as f:
    f.write(norm_picklist)
    
!head {norm_picklist_fp}

## Part 2 (of 5): Workflow for assigning barcodes

This portion of the notebook will assign index values and construct an Echo picklist file for adding barcodes. 

As inputs, it requires:
1. A plate_df dataframe (from previous step)
2. A comma-delimited (CSV) tellseq barcode file, containing `Well` and `Barcode_ID` columns
3. The name of the tellseq barcode source plate

The workflow then:
1. reads in the tellseq barcode list
2. assigns indices per sample
3. produces an Echo-formatted pick list file

### Part 2 of 5, Step 1 of 3: Read in tellseq barcode list

This is a file that contains each unique tellseq barcode on a separate line,
along with plate and well location information. It should look something like this:

```
Well,Barcode_96_Well_Position,Barcode_ID
A1,A1,C501
B1,A2,C509
C1,B1,C502
D1,B2,C510
E1,C1,C503
F1,C2,C511
G1,D1,C504
H1,D2,C512
I1,E1,C505
J1,E2,C513
K1,F1,C506
```

In [None]:
## INPUT
barcodes_plate_name = 'TellSeq_Barcode_Plate_1_LN2409001_EXP052026'
barcodes_fp = './test_data/Tellseq/TELL-Seq_Barcodes_PP_Primer_Plate - PP_Primer_Position.csv'

In [None]:
if not os.path.isfile(barcodes_fp):
    print("Problem! %s is not a path to a valid file" % barcodes_fp)

In [None]:
barcodes = pd.read_csv(barcodes_fp, dtype=str)

# rename the columns to match what `format_index_picklist` expects
# and add the plate information
barcodes.rename(columns={'Well': 'i5 well', 'Barcode_ID': 'i5 name'}, inplace=True)
barcodes['i5 plate'] = barcodes_plate_name
barcodes.head()

### Part 2 of 5, Step 2 of 3: Assign tellseq barcodes

In [None]:
def sort_by_col_then_row(a_df, well_key='Well'):
    """Sort a dataframe of wells by plate column, then by plate row, in place.

    Two helper columns are added to ``a_df``: ``f"{well_key}_row"`` holds the
    first character of each well string and ``f"{well_key}_col"`` holds the
    remaining characters converted to int. The dataframe is then sorted by
    the column number first and the row character second.

    Parameters
    ----------
    a_df : pandas.DataFrame
        Dataframe to sort; it is modified in place.
    well_key : str, optional
        Name of the column holding well strings such as ``'A1'``.

    Returns
    -------
    pandas.DataFrame
        The same ``a_df`` object, now sorted and with the helper columns.
    """
    row_key = f"{well_key}_row"
    col_key = f"{well_key}_col"
    wells = a_df[well_key]

    # leading character -> row; everything after it -> integer column number
    a_df[row_key] = wells.str.slice(stop=1)
    a_df[col_key] = wells.str.slice(start=1).astype(int)

    # column-major ordering: column number is the primary sort key
    a_df.sort_values(by=[col_key, row_key], inplace=True)
    return a_df

def get_num_barcode_sets_needed(a_plate_df, barcodes_df):
    """Return how many whole copies of the barcode set cover the plate's rows.

    Parameters
    ----------
    a_plate_df : pandas.DataFrame
        Plate dataframe; its row count is the number of samples.
    barcodes_df : pandas.DataFrame
        Barcode dataframe; its row count is the number of barcodes in a set.

    Returns
    -------
    int
        Number of samples divided by number of barcodes.

    Raises
    ------
    ValueError
        If ``barcodes_df`` has no rows, or if the number of barcodes does
        not divide evenly into the number of samples.
    """
    num_samples = a_plate_df.shape[0]
    num_barcodes = barcodes_df.shape[0]

    # Fail with a clear message instead of a bare ZeroDivisionError.
    if num_barcodes == 0:
        raise ValueError(
            "Number of barcodes is 0, so the number of barcode sets "
            "needed cannot be determined")

    # Integer divmod keeps the check exact (no float division involved).
    num_barcode_sets_needed, num_leftover = divmod(num_samples, num_barcodes)
    if num_leftover != 0:
        raise ValueError(
            f"Number of barcodes ({num_barcodes}) "
            f"does not divide evenly into number of samples "
            f"({num_samples})")
    return num_barcode_sets_needed

In [None]:
# get the plate_df sorted by col then row
p_df = plate_df.copy()
p_df = sort_by_col_then_row(p_df, well_key=PM_LIB_WELL_KEY)
# make the existing index into a column and reindex
p_df.reset_index(inplace=True)

In [None]:
# get the barcodes sorted by col then row
b_df = barcodes.copy()
b_df = sort_by_col_then_row(b_df, well_key='i5 well')
b_df

In [None]:
# make a new barcode_sets_df that stacks num_barcode_sets copies of the
# (sorted) barcodes dataframe, labelling each copy with the range of columns
# it covers
concat_dfs = []
# largest column number present among the barcode wells
barcode_max_col = b_df['i5 well_col'].max()
num_barcode_sets = get_num_barcode_sets_needed(p_df, b_df)
curr_min_col = 0
curr_max_col = barcode_max_col
for i in range(num_barcode_sets):
    curr_set = b_df.copy()
    # label is "col<first>to<last>": the first copy gets col1to<max>, the
    # next col<max+1>to<2*max>, and so on
    curr_set[TELLSEQ_BARCODE_SET_ID_KEY] = \
        f"col{curr_min_col + 1}to{curr_max_col}"
    # advance the window by one barcode set's worth of columns
    curr_min_col = curr_max_col
    curr_max_col += barcode_max_col
    concat_dfs.append(curr_set)
# ignore_index=True gives the stacked result a fresh 0..n-1 index
barcode_sets_df = pd.concat(concat_dfs, ignore_index=True)
barcode_sets_df.head()

In [None]:
# merge the (sorted) plate_df and barcode_sets_df
p_df = pd.merge(p_df, barcode_sets_df, 
                    left_index=True, right_index=True)
p_df.set_index('index', inplace=True)
p_df

In [None]:
plate_df = p_df

### Part 2 of 5, Step 3 of 3: Make barcodes pick list and write to file

Format the Echo-compatible pick list.

In [None]:
## INPUT
barcode_picklist_fp = './test_output/Indices/Tellseq_barcode_matrix.txt'

## INPUT -- verify default
barcode_vol = 4000

In [None]:
barcode_picklist = format_index_picklist(
    plate_df[PM_SAMPLE_KEY], plate_df[well_col], barcode_sets_df,
    i5_vol=barcode_vol)

In [None]:
warn_if_fp_exists(barcode_picklist_fp)

In [None]:
with open(barcode_picklist_fp, 'w') as f:
    f.write(barcode_picklist)

!head {barcode_picklist_fp}

## Part 3 (of 5): Library concentration estimation

This portion of the notebook takes in fluorescent
 quantification values and produces visual outputs to interpret and check 
 values. 

As inputs, this workflow requires:
1. A plate map DataFrame (from previous step)
2. MiniPico output (tab-delimited text format with columns 'Concentration' and 'Well')

The workflow:
1. reads in MiniPico output and calculates estimated library concentration
2. visualizes concentration
3. outputs a plate file and a studies info file for later use in per-barcode-set pooling

### Part 3 of 5, Step 1 of 4: read in MiniPico library concentration
Enter path to MiniPico file:

In [None]:
## INPUT
lib_concs_fp = './test_data/Quant/MiniPico/Tellseq_clean_lib_quant.txt'

In [None]:
lib_concs = read_pico_csv(lib_concs_fp, plate_reader='SpectraMax_i3x',
                          conc_col_name='MiniPico Library DNA Concentration')
lib_concs.rename(columns={'Well':well_col},inplace=True)
plate_df = pd.merge(plate_df, lib_concs, on=well_col)

plate_df.head()

### Part 3 of 5, Step 2 of 4: calculate sample concentration from MiniPico

You will want to make sure that 'size' is correct for your average library size.

In [None]:
plate_df['MiniPico Library Concentration'] = \
    compute_pico_concentration(
        plate_df['MiniPico Library DNA Concentration'], size=500)
plate_df.head()

### Part 3 of 5, Step 3 of 4: visualize MiniPico values

This step will present visuals of the results, including:
1. Scatter plot of DNA concentrations by Library concentration
2. Plate-wise heatmap and histogram showing library concentrations
3. per-96-well plate heatmaps and histograms showing library concentrations and sample names
4. Plate-wise heatmap showing pooling values

#### Library concentration by sample DNA concentration:

In [None]:
f, (ax1,ax2,ax3) = plt.subplots(nrows=1, ncols=3, figsize=(14, 4))
plate_df['Input DNA'] = plate_df['Sample DNA Concentration']*plate_df['Normalized DNA volume']/1000
sns.regplot(x="Sample DNA Concentration", y="MiniPico Library DNA Concentration", data=plate_df, ax = ax1)
sns.boxplot(x="Blank", y="MiniPico Library DNA Concentration", data=plate_df, ax = ax2)
sns.swarmplot(x="Blank", y="MiniPico Library DNA Concentration", data=plate_df, ax = ax2,
              size=3,color='black',alpha=0.5)
sns.scatterplot( x="Input DNA",y="MiniPico Library DNA Concentration",hue='Sample DNA Concentration',data=plate_df ,ax = ax3)
ax3.legend(title='Sample DNA Concentration',loc='center left', bbox_to_anchor=(1, 0.5))

In [None]:
blanks_gdna_concs = plate_df.loc[plate_df['Blank']==True,'Sample DNA Concentration']
samples_gdna_concs = plate_df.loc[plate_df['Blank']==False,'Sample DNA Concentration']
mannwhitneyu(samples_gdna_concs, blanks_gdna_concs)

In [None]:
blanks_lib_concs = plate_df.loc[plate_df['Blank']==True,'MiniPico Library Concentration']
samples_lib_concs = plate_df.loc[plate_df['Blank']==False,'MiniPico Library Concentration']
mannwhitneyu(samples_lib_concs, blanks_lib_concs)

#### Library concentration heatmap, whole plate

In [None]:
# get concentration and pooling values for plotting
concs = make_2D_array(plate_df, data_col="MiniPico Library Concentration", well_col=well_col).astype(float)
dna = make_2D_array(plate_df, data_col=SAMPLE_DNA_CONC_KEY, well_col=well_col).astype(float)

# get information for annotation
names = make_2D_array(plate_df, data_col=PM_SAMPLE_KEY, well_col=well_col)

In [None]:
plot_plate_vals(concs, color_map='viridis')

#### Plate maps for individual constituent plates

##### Library concentration heatmap, Plate 1

In [None]:
plot_plate_vals(concs[np.ix_(EVEN_ROWS,EVEN_COLS)],
                    annot_str= names[np.ix_(EVEN_ROWS,EVEN_COLS)],
                    color_map='viridis',
                    annot_fmt='')

##### Library concentration heatmap, Plate 2

In [None]:
plot_plate_vals(concs[np.ix_(EVEN_ROWS,ODD_COLS)],
                    annot_str= names[np.ix_(EVEN_ROWS,ODD_COLS)],
                    color_map='viridis',
                    annot_fmt='')

##### Library concentration heatmap, Plate 3

In [None]:
plot_plate_vals(concs[np.ix_(ODD_ROWS,EVEN_COLS)],
                    annot_str= names[np.ix_(ODD_ROWS,EVEN_COLS)],
                    color_map='viridis',
                    annot_fmt='')

##### Library concentration heatmap, Plate 4

In [None]:
plot_plate_vals(concs[np.ix_(ODD_ROWS,ODD_COLS)],
                    annot_str= names[np.ix_(ODD_ROWS,ODD_COLS)],
                    color_map='viridis',
                    annot_fmt='')

### Part 3 of 5, Step 4 of 4: Write plate and study info to files

We want to keep all that useful information together in one place so that
it can be easily parsed later. Enter the base (without extension) of the two output file names; the code will provide the extensions.

In [None]:
## INPUT
plate_df_fbase = './test_output/QC/Tellseq_plate_df'
expt_info_fbase = './test_output/QC/Tellseq_expt_info'

Add final columns to plate df, then save to a file.

In [None]:
plate_df['sample sheet Sample_ID'] = \
    plate_df[PM_SAMPLE_KEY].map(bcl_scrub_name)
plate_df[TELLSEQ_BARCODE_ID_KEY] = plate_df['i5 name']
plate_df.head()

In [None]:
plate_df_fp = f"{plate_df_fbase}_A.txt"
if os.path.isfile(plate_df_fp):
    print("Warning! This file exists already.")

In [None]:
plate_df.to_csv(plate_df_fp, sep='\t')

Save the experiment and study info so it doesn't have to be re-entered by hand in the next notebook.

In [None]:
expt_info_fp = f"{expt_info_fbase}.yml"
if os.path.isfile(expt_info_fp):
    print("Warning! This file exists already.")

In [None]:
expt_info = {
    "experiment_name": expt_name,
    "studies": studies_info
}

In [None]:
with open(expt_info_fp, 'w') as file:
    yaml.dump(expt_info, file, default_flow_style=False)