In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import os
import re
from tqdm.notebook import tqdm, trange

# Globals

A few general variables describing where to find the input files. Adapt them to your own setup.

In [None]:
# Inputs
vpipe_working = 'working' # V-pipe's working directory
ww_samples_tsv = f"{vpipe_working}/samples.wastewateronly.tsv" # samples TSV file listing the wastewater samples

# alternative sample list (disabled):
# ww_samples_tsv = f"{vpipe_working}/samples.wastewateronly.lastweek.tsv"

# optional:
plant_name_tsv = 'ww_plants.tsv' # TSV with the names of the plants (or None)

# files generated by snv_count_wastewater3
muttable_tsv='mutlist.txt' # mutation list, loaded into `mut` below
tables_dir='snv_tables' # NOTE(review): not referenced by any cell visible in this notebook -- confirm it is still needed

# Outputs

In [None]:
# RegEx used to decode the plant code and the date from a sample name.
# A match exposes these named groups (via .groupdict()):
#  - plant: the code of the wastewater plant (if plant_name_tsv is provided, it will be looked up for a full name)
#  - year, month, day: used to make a time code for the time series
#  - repeat: set instead of month/day for names of the form <plant>_<year>_R_<nn>
#  - KLZH*, BA*: the alternative naming schemes handled in parse_samname()
# Raw strings are used so that \d and \w reach the regex engine untouched
# (in a plain string they are invalid escape sequences).
rxname = re.compile(
    # <plant>_<year>_<month>_<day>  or  <plant>_<year>_R_<repeat>
    r'(?:(?P<plant>\d+)_(?P<year>20\d{2})_(?:(?:(?P<month>[01]?\d)_(?P<day>[0-3]?\d))|(?:R_(?P<repeat>\d+))))'
    # KLZHCov<6 digits>[_<suffix>]
    r'|^(?P<KLZH>KLZHCo[vV])(?P<KLZHdate>\d{6})(?:_(?P<KLZHsuffix>\w+))?'
    # Ba<6 digits>[_<yyyy-mm-dd>]
    r'|^(?P<BA>B[aA])(?P<BAsam>\d{6})(?:_(?P<BAdate>20\d{2}-[01]?\d-[0-3]?\d))?'
)

# Helper tables

In [None]:
# Mutation list; beware that the positions in this table are *1*-based
mut = (
    pd.read_csv(muttable_tsv, sep='\t')
      .astype({'position': 'int'})
)
mut

In [None]:
# Sample list: the TSV carries no header row, so the column names are given here
lst = pd.read_csv(
    ww_samples_tsv,
    sep='\t',
    header=None,
    names=['sample', 'batch', 'reads'],
)
lst

In [None]:
# Optional lookup table of plant names, indexed by plant code (column 'Code').
if plant_name_tsv:
    plants = pd.read_csv(plant_name_tsv, sep='\t', header=0, index_col='Code')
else:
    # No table configured: fall back to an empty frame, so that any
    # `code in plants.index` test is simply False.
    plants = pd.DataFrame()
plants

# Processing

In [None]:
def tally_multicol(tsam,tbat):
    """Tally coverage and variant counts of one sample as a single column.

    Reads `<vpipe_working>/samples/<tsam>/<tbat>/alignments/basecnt.tsv.gz`
    and, for every row of the global mutation table `mut`, picks the total
    coverage and the count of the variant base at that position.

    Parameters
    ----------
    tsam : sample name (also used as the name of the returned column)
    tbat : batch name

    Returns
    -------
    pandas.DataFrame with a single column `tsam`; rows are labelled
    `<pos>_cov` and `<pos>_var`.
    """
    # warning that table is *0*-based
    basecount=pd.read_csv(f"{vpipe_working}/samples/{tsam}/{tbat}/alignments/basecnt.tsv.gz", sep='\t', header=[0,1],index_col=[0,1]).droplevel('ref').T.droplevel('sample').T
    # total coverage = sum over all count columns; skipna=False keeps the
    # NaN-propagating behaviour of a plain row-wise sum
    basecount['cov']=basecount.sum(axis=1, skipna=False)

    def _tally_row(x):
        # -1 : 1-based to 0-based
        counts=basecount.loc[x.position-1]
        return pd.Series([x.position, counts['cov'], counts[x.variant]],index=['pos','cov','var'])

    r=pd.DataFrame(data=mut.apply(_tally_row, axis=1)).set_index('pos').stack().T
    # flatten the (pos, cov|var) pairs into '<pos>_<cov|var>' labels
    r.index =  [f'{i}_{j}' for i, j in r.index]
    return pd.DataFrame(data={tsam: r})

In [None]:
# TODO: create a patchmap in automation once a format has been decided
# Manual overrides: Basel sample name -> sampling date (YYYY-MM-DD)
patchmap_basel_date = dict([
    ('Ba210461_24112021',   '2021-11-24'),
    ('Ba210449_2021-11-10', '2021-11-10'),
    ('Ba210429_20211027',   '2021-10-27'),
    ('Ba210417',            '2021-10-13'),
    ('Ba210411',            '2021-10-06'),
])

def parse_samname(tsam):
    """Decode (date, plantcode, plantname) from a sample name.

    The name is matched against the global `rxname`. Three naming schemes
    are told apart by the groups that matched:
      - KLZH samples: plant code 90, or 91 when the name carries a suffix;
      - BA samples: plant code 92, date taken from `patchmap_basel_date`
        when the sample is listed there, else from the name if present;
      - everything else: numeric plant code from the name, plant name looked
        up in the global `plants` table ('' when the code is not listed).

    Returns a tuple (date, plantcode, plantname). `date` is a 'YYYY-MM-DD'
    string; any element that could not be decoded is NaN.
    """
    iso = '%Y-%m-%d'
    found = rxname.search(tsam)
    if found is None:
        return (np.nan, np.nan, np.nan)
    groups = found.groupdict()

    if groups['KLZH']:
        when = datetime.datetime.strptime(groups['KLZHdate'], '%y%m%d').date().strftime(iso)
        # suffixed names (_Promega, _2) are kept apart under their own code
        if groups['KLZHsuffix']:
            return (when, 91, 'Kanton Zürich/Promega')
        return (when, 90, 'Kanton Zürich')

    if groups['BA']:
        when = np.nan
        if tsam in patchmap_basel_date:
            # a manual override wins over whatever the name contains
            when = patchmap_basel_date[tsam]
        elif groups['BAdate']:
            when = datetime.datetime.strptime(groups['BAdate'], iso).date().strftime(iso)
        return (when, 92, 'Basel (catchment area ARA Basel)')

    # numbered plant: <plant>_<year>_<month>_<day>, or a repeat without a date
    when = np.nan
    if groups['month'] and groups['day']:
        when = datetime.datetime(int(groups['year']), int(groups['month']), int(groups['day'])).strftime(iso)
    code = int(groups['plant'])
    label = plants.at[code, 'Plant'] if code in plants.index else ''
    return (when, code, label)

In [None]:
def tally_multiline(tsam,tbat):
    """Tally the mutations of one sample, one output row per mutation.

    Reads `<vpipe_working>/samples/<tsam>/<tbat>/alignments/basecnt.tsv.gz`
    and, for every row of the global mutation table `mut`, reports the total
    coverage, the count of the variant base and their ratio.

    Parameters
    ----------
    tsam : sample name (also decoded with parse_samname() into date / plant)
    tbat : batch name

    Returns
    -------
    pandas.DataFrame indexed by (sample, batch, pos) with the columns
    date, plantcode, plantname, gene, base, cov, var, frac, followed by the
    columns of `mut` from the fifth one on. `frac` is NaN where the coverage
    is zero.
    """
    (date,plantcode,plantname) = parse_samname(tsam)
    # warning that table is *0*-based
    basecount=pd.read_csv(f"{vpipe_working}/samples/{tsam}/{tbat}/alignments/basecnt.tsv.gz", sep='\t', header=[0,1],index_col=[0,1]).droplevel('ref').T.droplevel('sample').T
    # total coverage = sum over all count columns; skipna=False keeps the
    # NaN-propagating behaviour of a plain row-wise sum
    basecount['cov']=basecount.sum(axis=1, skipna=False)

    def _tally_row(x):
        # -1 : 1-based to 0-based
        counts=basecount.loc[x.position-1]
        cov=counts['cov']
        var=counts[x.variant]
        head=pd.Series([tsam, tbat,
                        date,plantcode,plantname,
                        x.gene,x.position,x.variant,
                        cov,
                        var,
                        var/cov if cov else np.nan
                       ],index=['sample','batch',
                                'date','plantcode','plantname',
                                'gene','pos','base','cov','var','frac'])
        # pd.concat replaces Series.append, which no longer exists in pandas >= 2.0
        return pd.concat([head, x.iloc[4:]])

    return pd.DataFrame(data=mut.apply(_tally_row, axis=1)).set_index(['sample','batch','pos'])

In [None]:
def tally_filter(tsam,tbat,fdirs):
    """Tally the mutations of one sample, masked by ShoRAH's SNV tables.

    For each filter name `f` in `fdirs`, the table
    `<f>_tables/<tsam>-<tbat>_<f>.csv` is merged with the global mutation
    table `mut`. Coverage is reported only where `candidate_windows` > 0 and
    the variant count only where `effective_windows` > 0 (0 otherwise).
    A missing ShoRAH table is reported with a warning and treated as empty.

    Parameters
    ----------
    tsam : sample name (also decoded with parse_samname() into date / plant)
    tbat : batch name
    fdirs : iterable of filter names

    Returns
    -------
    dict mapping each filter name to a pandas.DataFrame indexed by
    (sample, batch, pos) with the columns date, plantcode, plantname, gene,
    base, cov, var, frac, followed by the extra columns of `mut`. `frac` is
    NaN where the coverage is zero or there were no candidate windows.
    """
    (date,plantcode,plantname) = parse_samname(tsam)
    # warning that table is *0*-based
    basecount=pd.read_csv(f"{vpipe_working}/samples/{tsam}/{tbat}/alignments/basecnt.tsv.gz", sep='\t', header=[0,1],index_col=[0,1]).droplevel('ref').T.droplevel('sample').T
    # total coverage = sum over all count columns; skipna=False keeps the
    # NaN-propagating behaviour of a plain row-wise sum
    basecount['cov']=basecount.sum(axis=1, skipna=False)
    shorah_cols=['position','candidate_windows','effective_windows','ave_reads']

    def _tally_row(x):
        # -1 : 1-based to 0-based
        counts=basecount.loc[x.position-1]
        depth=counts['cov']
        cov=depth if x.candidate_windows > 0 else 0
        # the variant column is only looked up when ShoRAH had effective windows
        var=counts[x.variant] if x.effective_windows > 0 else 0
        frac=var/depth if depth and (x.candidate_windows > 0) else np.nan
        head=pd.Series([tsam, tbat,
                        date,plantcode,plantname,
                        x.gene,x.position,x.variant,
                        cov,
                        var,
                        frac
                       ],index=['sample','batch',
                                'date','plantcode','plantname',
                                'gene','pos','base','cov','var','frac'])
        # keep the extra columns of `mut` (fifth one on) but not the three
        # ShoRAH columns that the merge appended at the end;
        # pd.concat replaces Series.append, which no longer exists in pandas >= 2.0
        return pd.concat([head, x.iloc[4:-3]])

    r={}
    for fil_dir in fdirs:
        # load ShoRAH-called SNVs
        shorah_fname=f"{fil_dir}_tables/{tsam}-{tbat}_{fil_dir}.csv"
        if os.path.isfile(shorah_fname):
            shorah_snv=pd.read_csv(shorah_fname, sep=',', header=0, index_col=0)
        else:
            # if no table was generated, consider the whole file empty
            print(f"Warning!!! File {shorah_fname} is missing!!!")
            shorah_snv=pd.DataFrame(data=[], columns=shorah_cols)
        # combine ShoRAH-called SNVs and mutation list
        # NOTE(review): an outer join also keeps ShoRAH positions that are absent
        # from `mut` (their gene/variant become 0) -- confirm this is intended
        fil_snv=pd.merge(left=mut, right=shorah_snv[shorah_cols],
                         # outer: keep even the mutation not in ShoRAH and zero-fill
                         how='outer', left_on="position", right_on="position").fillna(0)
        # generate output
        r[fil_dir]=pd.DataFrame(data=fil_snv.apply(_tally_row, axis=1)).set_index(['sample','batch','pos'])

    return r

# Process ShoRAH-filtered data

In [None]:
# Names of the ShoRAH filters to process; each name <f> needs a `<f>_tables/` directory
filters_list=['snv'] # single file with all together

In [None]:
# Every requested filter must come with its directory of ShoRAH tables
for fdir in filters_list:
    assert os.path.isdir(f'{fdir}_tables/')

# Collect the per-sample tables first and concatenate once per filter:
# growing a DataFrame with concat inside the loop re-copies all previous rows each time.
filter_parts={ fdir: [] for fdir in filters_list }

for i,s in tqdm(list(lst.iterrows())):
    table=tally_filter(s['sample'],s['batch'],filters_list)
    for fdir in filters_list:
        filter_parts[fdir].append(table[fdir])

# An empty sample list yields empty frames (pd.concat refuses an empty list)
filter_tables={
    name: pd.concat(parts, axis=0, join='outer') if parts else pd.DataFrame()
    for name, parts in filter_parts.items()
}

In [None]:
# Inspect the combined table of the 'snv' filter
display(filter_tables['snv'])

In [None]:
# Write one tab-separated tally file per filter
for fdir in filters_list:
    # same directory check as before tallying
    assert os.path.isdir(f'{fdir}_tables/')
    fname = f"tallymut_line_{fdir}.tsv"
    print(f"Writing {fname}")
    filter_tables[fdir].to_csv(path_or_buf=fname, sep='\t')

# Process unfiltered data

In [None]:
# Tally every sample, then concatenate once: growing a DataFrame with concat
# inside the loop re-copies all previous rows on every iteration.
table_parts=[]
for i,s in tqdm(list(lst.iterrows())):
    table_parts.append(tally_multiline(s['sample'],s['batch']))
# an empty sample list yields an empty frame (pd.concat refuses an empty list)
table=pd.concat(table_parts, axis=0, join='outer') if table_parts else pd.DataFrame()
display(table)

In [None]:
# Spot-check the Basel rows. The label must match the one assigned by
# parse_samname(); 'Kanton Basel' is no longer assigned and never matched.
# (Use 'Kanton Zürich' to inspect the Zürich rows instead.)
table[table['plantname'] == 'Basel (catchment area ARA Basel)']

In [None]:
# Spot-check the dates assigned to the rows of sample 'KLZHCov210822'
t = table.reset_index()  # turn the index levels back into ordinary columns so 'sample' can be filtered on
t[t['sample'] == 'KLZHCov210822']['date']

In [None]:
# Persist the unfiltered tally as a tab-separated file
table.to_csv("tallymut_line.tsv", sep='\t')

# Single tests scrap-yard

In [None]:
# Scratch: unfiltered tally of a single, hard-coded sample/batch pair
tally_multiline('A1_12_2020_12_21_NA_NA','20201223_HWKGTDRXX')

In [None]:
# Scratch: 'snv'-filtered tally of the same hard-coded sample/batch pair
tally_filter('A1_12_2020_12_21_NA_NA','20201223_HWKGTDRXX',['snv'])['snv']

In [None]:
# Scratch: 'snv'-filtered tally of a second hard-coded sample
tally_filter('C1_10_2020_12_11_NA_NA','20201223_HWKGTDRXX',['snv'])['snv']

In [None]:
# Scratch: single-column tally, transposed to show variant count and coverage at position 23403
tally_multicol('A1_12_2020_12_21_NA_NA','20201223_HWKGTDRXX').T[['23403_var','23403_cov']]

In [None]:
# Scratch: reproduce by hand the merge of a ShoRAH table with the mutation list.
# NOTE(review): the file name built here (`<tsam>_<fil_dir>.csv`, filter 'sa_snv')
# differs from the `<tsam>-<tbat>_<fil_dir>.csv` layout used by tally_filter() --
# presumably left over from an earlier layout; confirm the file still exists.
fil_dir='sa_snv'
tsam='A1_12_2020_12_21'
shorah_snv=pd.read_csv(f"{fil_dir}_tables/{tsam}_{fil_dir}.csv", sep=',', header=0, index_col=0)
# outer merge on 'position', zero-filling whatever is missing on either side
pd.merge(left=mut, right=shorah_snv[['position','candidate_windows','effective_windows','ave_reads']], how='outer', left_on="position", right_on="position").fillna(0)

In [None]:
# Scratch: reduced <plant>_<year>_<month>_<day> pattern. It is kept under its
# own name so that running this cell does not overwrite the global `rxname`
# that parse_samname() relies on. Raw string: \d must reach the regex engine as-is.
rxname_simple = re.compile(r'(?P<plant>\d+)_(?P<year>20\d{2})_(?P<month>[01]?\d)_(?P<day>[0-3]?\d)')
rxname_simple.search('12_2020_12_21').groupdict()

In [None]:
# Scratch: look up the full plant name for the plant code parsed from a sample name
m=rxname.search('12_2020_12_21').groupdict()
plants.at[int(m['plant']),'Plant']

In [None]:
# Scratch: try a repeat-style sample name (<plant>_<year>_R_<nn>).
# NOTE(review): `search` returns None when the pattern currently bound to `rxname`
# has no alternative for this form, and `.groupdict()` then raises AttributeError.
rxname.search('F1_12_2021_R_02').groupdict()