In [None]:
%reload_ext watermark
%matplotlib inline

import os
import re 
from contextlib import suppress
from metapool.metapool import *
from metapool.util import (join_dfs_from_files, get_set_fp, warn_if_fp_exists, 
                           SET_SUFFIX)
from metapool.mp_strings import (
    PM_BLANK_KEY, MINIPICO_LIB_CONC_KEY, PM_LIB_WELL_KEY, 
    TELLSEQ_BARCODE_ID_KEY)
%watermark -i -v -iv -m -h -p metapool,sample_sheet,openpyxl -u

In [None]:
# Record the packages installed in the active conda environment alongside the run
! conda list

# Knight Lab TellSeq pipeline notebook D 

## Part 5 (of 5): Workflow for Read Distribution Summary and Pool Normalization

### Step 1 of 5: Import plate info for this barcode set

In [None]:
## INPUT
# Path to the tab-separated plate df file for the barcode set to process.
# The file name is checked below against the pattern "C<SET_SUFFIX>_<set id>.txt".
plate_df_set_fp = './test_output/QC/Tellseq__plate_df_C_set_col19to24.txt'

In [None]:
# The plate df file name must end with "C<SET_SUFFIX>_<set id>.txt"
# (e.g. "C_set_col19to24.txt"); otherwise an error is raised.
# SET_SUFFIX is a literal string (it is also used as a literal split token
# when extracting the set id), so escape it before embedding it in the regex.
expected_suffix = rf"C{re.escape(SET_SUFFIX)}_.+\.txt$"

# Check whether the file path matches the pattern; report the offending path
# so the problem can be fixed without re-inspecting the input cell.
if not re.search(expected_suffix, plate_df_set_fp):
    raise ValueError(
        f"Expected file path ending with a match for '{expected_suffix}', "
        f"but got '{plate_df_set_fp}'")

In [None]:
# Warn (without raising) when the plate df path does not point to an existing file
plate_df_file_found = os.path.isfile(plate_df_set_fp)
if not plate_df_file_found:
    problem_msg = "Problem! %s is not a path to a valid file" % plate_df_set_fp
    print(problem_msg)

In [None]:
# Load the tab-separated plate df for this barcode set and preview its first rows
plate_df = pd.read_csv(plate_df_set_fp, sep='\t')
plate_df.head()

Verify that there are no duplicate barcodes in the selected plate df. The cell below must evaluate to `True`.

In [None]:
## DECISION -- verify no duplicate barcodes
# True only when every barcode id occurs exactly once. Comparing the number of
# distinct occurrence counts to 1 is not sufficient: it is also satisfied when
# every barcode is duplicated the same number of times (e.g. all appear twice).
# Missing barcode ids are ignored here, as they were by value_counts().
plate_df[TELLSEQ_BARCODE_ID_KEY].dropna().is_unique

In [None]:
# Extract the set id from plate_df_set_fp: drop the file extension, take the
# text after the last occurrence of SET_SUFFIX, and strip the underscores
plate_df_set_fp_no_ext = os.path.splitext(plate_df_set_fp)[0]
_, set_str = plate_df_set_fp_no_ext.rsplit(SET_SUFFIX, 1)
current_set_id = set_str.replace("_", "")
current_set_id = current_set_id.replace(".txt", "")
current_set_id

In [None]:
# Names of the plate df columns that are passed below as the row_col and
# col_col arguments of make_compressed_2d_array
row_col_key = f"{PM_LIB_WELL_KEY}_row"
col_col_key = f"{PM_LIB_WELL_KEY}_col"

In [None]:
# Arrange the library well names by their row/column key columns; the result
# is passed later as source_well_names when formatting the pooling picklist
source_well_names = make_compressed_2d_array(
    plate_df, data_col=PM_LIB_WELL_KEY, 
    row_col=row_col_key, col_col=col_col_key)
source_well_names

### Step 2 of 5: Import and merge per_sample read distributions for this set

Import the TSV file(s) containing read counts from the per-sample FASTQ files and merge them into the growing plate_df.


In [None]:
## INPUT
# Enter the path(s) to the read counts file(s).
# Make sure they are for the same barcode set as the plate df file.
read_counts_fps = [
    './test_data/Demux/Tellseq_fastqc_sequence_counts.tsv',
]

In [None]:
# Column names used in the read counts file(s)
CATEGORY_KEY = 'Category'
UNIQUE_READS_KEY = 'Unique Reads'
DUPLICATE_READS_KEY = 'Duplicate Reads'

# Load the read counts from the file(s) into a single dataframe
read_counts_cols = [CATEGORY_KEY, UNIQUE_READS_KEY, DUPLICATE_READS_KEY]
read_counts_dtypes = {
    CATEGORY_KEY: str,
    UNIQUE_READS_KEY: int,
    DUPLICATE_READS_KEY: int,
}
read_counts_df = join_dfs_from_files(
    read_counts_fps,
    read_counts_cols,
    unique_cols=[CATEGORY_KEY],
    dtype=read_counts_dtypes)

# Rows whose category contains 'trimmed' are the filtered reads;
# all remaining rows are the raw reads
trimmed_reads_mask = read_counts_df[CATEGORY_KEY].str.contains('trimmed')
filtered_read_counts_df = read_counts_df.loc[trimmed_reads_mask].copy()
raw_read_counts_df = read_counts_df.loc[~trimmed_reads_mask].copy()

## Counts can also be imported from Qiita per_sample_FASTQ summaries:
# per_sample_fastq_counts_df = pd.read_csv('./test_data/Demux/YYYY_MM_DD_Celeste_Adaptation_16_17_18_21_per_sample_fastq.tsv',
#                                          sep='\t')

In [None]:
# Merge each read counts dataframe into the plate df, one reads column at a time
plate_df_w_reads = plate_df
for counts_df, counts_col_name in (
        (raw_read_counts_df, 'Raw Reads'),
        (filtered_read_counts_df, 'Filtered Reads')):
    plate_df_w_reads = merge_read_counts(
        plate_df_w_reads, counts_df,
        reads_column_name=counts_col_name)

# Optional: also merge the Qiita per-sample FASTQ counts, if they were loaded
# plate_df_w_reads = merge_read_counts(
#    plate_df_w_reads, per_sample_fastq_counts_df,
#    reads_column_name='Qiita Reads')

plate_df_w_reads.head()

In [None]:
# Column whose per-well read counts are summarized in the four panels below
reads_column = 'Raw Reads'

f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(8, 8))

# Evenness plot (top left): read survival for blanks vs. samples, both
# evaluated up to the same maximum (largest read count, rounded to hundreds)
rmax = int(round(plate_df_w_reads[reads_column].max(), -2))
# Explicit elementwise comparisons, so rows where the blank flag is neither
# True nor False fall into neither group
blank_mask = plate_df_w_reads[PM_BLANK_KEY] == True
sample_mask = plate_df_w_reads[PM_BLANK_KEY] == False
survival_df = pd.concat([
    read_survival(plate_df_w_reads.loc[blank_mask, reads_column],
                  label='Blanks', rmax=rmax),
    read_survival(plate_df_w_reads.loc[sample_mask, reads_column],
                  label='Samples', rmax=rmax)])
survival_df.plot(color=['coral', 'steelblue'], ax=ax1)
ax1.set_xlabel(reads_column)
ax1.set_ylabel('Samples')

# Histogram (bottom left); labels are set right after drawing so they stay
# next to the plot they belong to
sns.histplot(plate_df_w_reads[reads_column], ax=ax3)
ax3.set_xlabel(reads_column)
ax3.set_ylabel('Samples')

# Regression (top right): library concentration vs. reads. Uses the shared
# column-name constant rather than a hard-coded copy of the column name.
sns.regplot(x=MINIPICO_LIB_CONC_KEY, y=reads_column,
            data=plate_df_w_reads, ax=ax2)

# Boxplot (bottom right) of reads for blanks vs. samples, with the
# individual wells overlaid as points
sns.boxplot(x=PM_BLANK_KEY, y=reads_column, data=plate_df_w_reads, ax=ax4)
sns.stripplot(x=PM_BLANK_KEY, y=reads_column, data=plate_df_w_reads, ax=ax4,
              size=3, color='black', alpha=0.5)

plt.tight_layout()

### Step 3 of 5: Calculate iSeqnorm pooling volumes

In [None]:
## INPUT
# Passed as the dynamic_range argument of calculate_iseqnorm_pooling_volumes
dynamic_range = 5

In [None]:
# Calculate the iSeqnorm pooling volumes, normalizing on the 'Raw Reads' column
plate_df_normalized = calculate_iseqnorm_pooling_volumes(
    plate_df_w_reads,dynamic_range=dynamic_range, normalization_column='Raw Reads')

In [None]:
# Name of the pooling volume column read from plate_df_normalized
ISEQ_NORM_VOL_KEY = 'iSeq normpool volume'

# Arrange the pooling volumes by the same row/column key columns used for the
# source well names, converted to float
vols = make_compressed_2d_array(
    plate_df_normalized, data_col=ISEQ_NORM_VOL_KEY, 
    row_col=row_col_key, col_col=col_col_key).astype(float)
vols

In [None]:
# visualize
# Estimate the pool concentration and volume from the per-well pooling volumes
# and library concentrations, then report both to two decimal places
conc, vol = estimate_pool_conc_vol(
    plate_df_normalized[ISEQ_NORM_VOL_KEY], 
    plate_df_normalized[MINIPICO_LIB_CONC_KEY])
print("Pool concentration: {:.2f}".format(conc))
print("Pool volume: {:.2f}".format(vol))
# A LinAlgError raised while plotting is deliberately ignored, so the plot is
# best-effort. NOTE(review): presumably this occurs for degenerate volume
# data -- confirm which inputs trigger it.
with suppress(np.linalg.LinAlgError):
    plot_plate_vals(vols)

### Step 4 of 5: Estimate read depth

In [None]:
# Plots the estimated read depth proportion and returns a df with the estimates
plate_df_normalized_with_estimates = estimate_read_depth(plate_df_normalized)
plate_df_normalized_with_estimates.head()

### Step 5 of 5: Make pooling picklist and write to a file

In [None]:
## INPUT
# Base path for the pooling picklist; it is combined with the current set id
# by get_set_fp below to produce the output file path
iseqnormed_picklist_fbase = './test_output/Pooling/Tellseq_iSeqnormpool'

In [None]:
# Build the picklist text from the pooling volumes and the source well names.
# NOTE(review): max_vol_per_well=30000 is presumably in the same units as
# vols -- confirm the units and that this limit suits the destination plate.
iseqnormed_picklist = format_pooling_echo_pick_list(
    vols, max_vol_per_well=30000, source_well_names=source_well_names)

In [None]:
# Build the output path for this set and check whether it already exists,
# before the next cell opens that path for writing
iseqnormed_picklist_fp = get_set_fp(iseqnormed_picklist_fbase, current_set_id)
warn_if_fp_exists(iseqnormed_picklist_fp)

In [None]:
# Write the picklist text to the output file (an existing file is overwritten)
with open(iseqnormed_picklist_fp,'w') as fh:
    fh.write(iseqnormed_picklist)

# Show the first lines of the written file as a sanity check
!head {iseqnormed_picklist_fp}