In [None]:
# `display` and `HTML` live in the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated and fails on recent IPython.
from IPython.display import display, HTML

# Stretch the notebook container to the full browser width so wide plots fit.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Width used for the `figsize` of the heatmap figures drawn further down.
plotwidth = 40

In [None]:
import numpy as np
import pandas as pd
import scipy as sp

import seaborn as sns
import matplotlib.pyplot as plt

from natsort import natsort_keygen

import statsmodels.api as sm
import scipy as sp

import os
import json
import datetime
import psycopg2
import netrc
import re
from tqdm.notebook import tqdm, trange
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

# Globals

In [None]:
# --- Directories -----------------------------------------------------------
datadir = '.'
plotdir = 'plots'
vpipe_working = 'working'  # V-pipe's working directory

# --- Input files -----------------------------------------------------------
tally_mut = os.path.join(datadir, 'tallymut_line.tsv')
cooc_table = os.path.join(vpipe_working, 'ww-cooc.v3.csv')

# --- Selection -------------------------------------------------------------
start_date = '2020-12-08'
start_date_klzh = '2021-08-15'
todaydate = datetime.date.today().strftime("%Y-%m-%d")

cities_list = [
    'Altenrhein (SG)',
    'Chur (GR)',
    'Genève (GE)',
    'Laupen (BE)',
    'Lausanne (VD)',
    'Lugano (TI)',
    'Zürich (ZH)',
    'Kanton Zürich',
    'Basel (catchment area ARA Basel)',
]

# Variants that are displayed here AND uploaded.
variants_list_upload = ['UK', 'ZA', 'BR', 'C36', 'IN1', 'IN2', 'IN3', 'om']

# All displayed variants: the uploaded ones plus extras that are ONLY displayed
# here and NOT uploaded (typically variants that are not present yet).
variants_list = [*variants_list_upload, 'AY42']

# Short variant code -> Pangolin lineage name.
variants_pangolin = {
    'UK': 'B.1.1.7',
    'ZA': 'B.1.351',
    'BR': 'P.1',
    'C36': 'C.36.3',
    'IN1': 'B.1.617.1',
    'IN2': 'B.1.617.2',
    'IN3': 'B.1.617.3',
    'om': 'B.1.1.529',
    'AY42': 'AY.4.2',
}

# Variants for which the exclusive mutation filtering is applied ...
exclusive_list = ['ZA', 'BR']
# ... and the variants that the exclusive filter checks against.
exclude_from = ['UK', 'ZA', 'BR']

# --- Output ----------------------------------------------------------------
update_data_file = os.path.join(datadir, 'ww_update_data_heatmap.json')

# Load Data

## Load WWTP Sequencing Data

#### Tally of mutations in samples

This loads the TSV file generated by the [mut-table.ipynb](mut-table.ipynb) notebook.

In [None]:
# Read the tab-separated mutation tally and parse the 'date' column as datetimes.
df = pd.read_csv(tally_mut, sep='\t', parse_dates=['date'])

# Label every row with "<pos><base>", e.g. position 23063 and base T -> "23063T".
df = df.assign(mutation=df['pos'].astype(str) + df['base'])
df

In [None]:
# Mutations flagged as problematic by the mutation influence diagnostic.
# TODO: improve the code (the list is hard-coded for now).
problematic_mutations = ["28461G", "11201G", "26801C"]

is_problematic = df['mutation'].isin(problematic_mutations)
df = df[~is_problematic]

### Cooccurrences by cojac

This loads the CSV generated by the `ww.bsub` LSF job.

In [None]:
# Read the comma-separated co-occurrence table as-is (no index columns set here).
df_cooc_raw = pd.read_csv(
    cooc_table,
    sep=',',
)

df_cooc_raw.head()

### Handle duplicates

The pivots further down need each `(plantname, date, batch, mutation)` combination to be unique, so duplicated samples are detected and patched here first.

In [None]:
# Look for duplicated samples with suffixes: rows that have a plant name, fall
# on or after start_date, are not deletions ('-'), and share their
# (plantname, date, batch, mutation) key with at least one other row.
rows_in_scope = df['plantname'].notna() & (df['date'] >= start_date) & (df['base'] != '-')
rows_duplicated = df.duplicated(subset=['plantname', 'date', 'batch', 'mutation'], keep=False)
df[rows_in_scope & rows_duplicated]

In [None]:
# HACK workaround for a duplicated sample: give this one sample its own batch
# name so it no longer collides on the (plantname, date, batch, mutation) key.
#
# The assignment goes through a single `.loc[rows, column]` call. The chained
# form `frame['batch'].loc[rows] = ...` writes into a temporary Series, which
# raises SettingWithCopy warnings and is silently a no-op under pandas
# Copy-on-Write.
duplicated_sample = 'A1_05_2021_05_19_CATTCGGA-TTTCCATC'
dedicated_batch = '20210604_JN8TR_CATTCGGA-TTTCCATC'

df.loc[df['sample'] == duplicated_sample, 'batch'] = dedicated_batch
df_cooc_raw.loc[df_cooc_raw['sample'] == duplicated_sample, 'batch'] = dedicated_batch

In [None]:
# HACK workaround for duplicated samples whose name ends in "_P": move them
# into a separate 'Promega' batch.
#  - in `df` only rows that are actually duplicated on
#    (plantname, date, batch, mutation) are renamed;
#  - in `df_cooc_raw` every sample ending in "_P" is renamed.
#
# A single `.loc[rows, column]` assignment is used instead of the chained
# `frame['batch'].loc[rows] = ...`, which is unreliable (SettingWithCopy) and
# has no effect under pandas Copy-on-Write.
promega_rows = (
    df.duplicated(subset=['plantname', 'date', 'batch', 'mutation'], keep=False)
    & df['sample'].str.contains('_P$')
)
df.loc[promega_rows, 'batch'] = 'Promega'
df_cooc_raw.loc[df_cooc_raw['sample'].str.contains('_P$'), 'batch'] = 'Promega'

In [None]:
# Look again for duplicated samples with suffixes, this time counting the
# remaining duplicated rows per batch.
still_in_scope = df['plantname'].notna() & (df['date'] >= start_date) & (df['base'] != '-')
still_duplicated = df.duplicated(subset=['plantname', 'date', 'batch', 'mutation'], keep=False)
df[still_in_scope & still_duplicated].groupby('batch').count()

In [None]:
# Carry plant name and date over from the mutation table to the cooc table.

# One row per (sample, batch) with its plant name and date.
df_map = (
    df[['sample', 'batch', 'plantname', 'date']]
    .drop_duplicates(ignore_index=True)
    .set_index(['sample', 'batch'])
)

# Left-join on (sample, batch); `validate` makes the merge fail if a
# (sample, batch) pair appears more than once in df_map. The result is indexed
# by (plantname, date, batch).
df_cooc = (
    df_cooc_raw
    .merge(
        df_map[['plantname', 'date']],
        how='left',
        on=['sample', 'batch'],
        copy=False,
        validate='many_to_one',
    )
    .set_index(['plantname', 'date', 'batch'])
)

df_cooc.head()

In [None]:
# Write the annotated co-occurrence table (indexed by plantname/date/batch) to
# "df_cooc.csv" in the current working directory.
# NOTE(review): nothing in the visible notebook reads this file back — presumably a debugging export; confirm.
df_cooc.to_csv("df_cooc.csv")

In [None]:
# TODO: load actual amplicon information instead of hard-coding it.
#
# Layout: variant code -> {amplicon number -> mutations on that amplicon}.
# The last mutation in each list is where the "Amp <n>" co-occurrence column
# is inserted in the wide tables.
amplicons = {
    'IN1': {
        76: ['22917G', '23012C'],
    },
    'IN3': {
        76: ['22917G', '23012C'],
    },
    'IN2': {
        76: ['22917G', '22995A'],
        91: ['27638C', '27752T'],
    },
    'BR': {
        # '21614T' is a common mutation and is removed from the plot;
        # amplicon 95 ('28877T', '28878C') is not part of the signature mutations.
        71: ['21621A', '21638T'],
    },
    'AY42': {
        73: ['21995C', '22227T'],
    },
    'om': {
        75: ['22578A', '22673C', '22674T', '22679C', '22813T'],
        76: ['22882G', '22898A', '22992A', '23013C', '23040G', '23048A', '23055G'],
        78: ['23525T', '23599G'],
        88: ['26577G', '26709A'],
    },
}

In [None]:
def mutfilter(var, exclusive=False):
    """Build a boolean row mask over the global ``df`` for one variant.

    Parameters
    ----------
    var : str
        Variant code; must be a column of ``df``.
    exclusive : bool, default False
        If False, select rows where the ``var`` column equals ``"mut"``.
        If True, select only variant-specific rows: across the ``exclude_from``
        columns, the column named ``var`` must be ``"mut"`` and every other
        column must be empty (NaN or 0). This excludes mutations shared between
        variants, e.g. 23063T (N501Y).

    Returns
    -------
    pandas.Series
        Boolean mask aligned with the rows of ``df``.
    """
    if not exclusive:
        return df[var] == "mut"

    # Expected value in each `exclude_from` column: 'mut' for the requested
    # variant, 0 (i.e. originally missing) for all others.
    expected = ['mut' if v == var else 0 for v in exclude_from]
    return (df[exclude_from].fillna(0) == expected).all(axis=1)
# Quick visual check: display the exclusive row mask for the 'UK' variant.
mutfilter('UK', exclusive=True)

In [None]:
# Build one wide table per variant:
#   rows    = (plantname, date, batch)
#   columns = the variant's mutations in natural sort order, with an extra
#             "Amp <n>" column inserted right after the last mutation of each
#             amplicon
#   values  = 'frac'
df_wide = {}
amp_col = {}  # variant -> names of the amplicon columns added to its table
for var in tqdm(variants_list):
    df_wide[var] = (
        # keep rows with a plant name, on/after start_date, that are not
        # deletions and that belong to this variant
        df[(~df['plantname'].isna()) & (df['date'] >= start_date) & (df['base'] != '-') & (mutfilter(var, exclusive=(var in exclusive_list)))]
         .pivot(index=['plantname', 'date', 'batch'], columns=['mutation'], values='frac')
         .sort_index(axis=1, key=natsort_keygen())
    )
    # add amplicons
    if var in amplicons:
        amp_col[var] = []
        for amp, muts in amplicons[var].items():
            # transfer this column from the co-occurrence table...
            aname = f"Amp {amp}"
            amplicon_table = df_cooc.loc[(df_cooc['amplicon'] == amp) & (df_cooc[var] == 1), ['frac']]
            amp_col[var].append(aname)
            # (filled with NaN on samples for which we don't have co-occurrence
            # data, and subset to the target indices)
            missing_cooc = df_wide[var].index.difference(amplicon_table.index)
            # The filler is created as float NaN explicitly. An all-empty frame
            # is otherwise object-dtype, and the earlier `.fillna(np.nan)` only
            # became float through pandas' implicit downcasting, which is
            # deprecated.
            na_filler = pd.DataFrame(np.nan, index=missing_cooc, columns=amplicon_table.columns, dtype=float)
            na_filled_col = pd.concat([amplicon_table, na_filler])
            # ...into the position right after the amplicon's last mutation
            df_wide[var].insert(loc=1 + df_wide[var].columns.get_loc(muts[-1]), column=aname, value=na_filled_col.loc[df_wide[var].index], allow_duplicates=False)

df_wide['IN2'].head()

In [None]:
# Display the 'om' wide table restricted to the Basel catchment area.
df_wide['om'].loc['Basel (catchment area ARA Basel)']

In [None]:
# Wide tables of coverage ('cov') and variant read counts ('var') for the 'UK'
# mutations, laid out like the fraction tables in df_wide:
# rows = (plantname, date, batch), columns = mutations in natural sort order.
#
# The row selection is computed once and shared by both pivots. It uses the
# configured `start_date` rather than a second hard-coded copy of that date, so
# these tables cover the same period as df_wide.
uk_rows = df[(~df['plantname'].isna()) & (df['date'] >= start_date) & (df['base'] != '-') & (df['UK'] == "mut")]

df_wide_cov = (
    uk_rows
    .pivot(index=['plantname', 'date', 'batch'], columns=['mutation'], values='cov')
    .sort_index(axis=1, key=natsort_keygen())
)

df_wide_counts = (
    uk_rows
    .pivot(index=['plantname', 'date', 'batch'], columns=['mutation'], values='var')
    .sort_index(axis=1, key=natsort_keygen())
)

# Heatmaps

In [None]:
# One annotated heatmap per (city, variant): mutations on the y axis, samples
# (date, batch) on the x axis, cell value = fraction.
for city in tqdm(cities_list, desc='Cities', position=0):
    for var in tqdm(variants_list, desc='Variants', position=1):
        tmp = df_wide[var].loc[city]

        # drawing box decorations:
        #      22917G ┐
        #      22995A┤
        #      Amp 76┘
        if var in amplicons:
            rmap = {}
            # mutations
            for amp, muts in amplicons[var].items():
                box = ' ┐'
                for m in muts:
                    rmap[m] = f"{m}{box}"
                    box = '┤'
            # amplicons
            for a in amp_col[var]:
                rmap[a] = f"{a}┘"
            # Rename into a new frame instead of mutating a `.loc` slice in place.
            tmp = tmp.rename(columns=rmap)

        # Normalise missing values to NaN and force a float matrix explicitly,
        # rather than relying on fillna() to downcast object columns.
        trns = tmp.T.fillna(np.nan).astype(float)

        fig, ax = plt.subplots(figsize=(plotwidth, 7))  # 17
        sns.heatmap(
            data=trns,
            annot=trns,
            fmt='.1g',
            square=False, cbar=False,
            cmap=sns.color_palette("viridis", as_cmap=True),
            ax=ax,
        )
        ax.set_title(f"{city} - {variants_pangolin[var]}")
        plt.show()
        # Release the figure: this loop creates one per city/variant pair.
        plt.close(fig)

# Prepare Heatmap Data for covSPECTRUM

In [None]:
import warnings

# r_df[city][var]: that city's slice of the variant table, with all-NaN
# mutation columns dropped, transposed so mutations are rows and samples are
# columns. Warnings raised meanwhile are captured in `w` instead of being shown.
with warnings.catch_warnings(record=True) as w:
    r_df = {}
    for city in tqdm(cities_list, desc='Cities', position=0):
        per_variant = {}
        r_df[city] = per_variant
        for var in tqdm(variants_list, desc='Variants', position=1, leave=False):
            city_table = df_wide[var].loc[city]
            per_variant[var] = city_table.dropna(axis=1, how='all').T  # .loc[28111:28111]

# m_df[city][var]: uploaded variants only. Samples sharing a date are averaged,
# the result is put on a daily frequency, and it is transposed back to
# mutations x dates.
m_df = {}
for city in tqdm(cities_list, desc='Cities', position=0):
    m_df[city] = {}
    for var in tqdm(variants_list_upload, desc='Variants', position=1, leave=False):
        samples_by_row = r_df[city][var].T
        daily_mean = samples_by_row.groupby("date").agg("mean").asfreq('D')
        m_df[city][var] = daily_mean.T

In [None]:
# Sanity check: the top-level keys of m_df are the city names.
m_df.keys()

In [None]:
# Sanity check: the second-level keys are the uploaded variant codes.
m_df['Altenrhein (SG)'].keys()

In [None]:
# Example table: daily mean fractions (mutations x dates) for one city and variant.
m_df['Altenrhein (SG)']['UK']

# Make data for covSPECTRUM

In [None]:
# Build the JSON payload: {pangolin lineage: {city: {"mutationOccurrences": [...]}}}
update_data = {}
tdf = {city: {} for city in cities_list}  # not filled in this cell; kept so the name stays defined
tdf_mat = {city: {} for city in cities_list}

# add 'undetermined' variant
update_data['undetermined'] = {
    # 'undetermined' has no heatmap of its own (it is defined by the signatures
    # of all other variants being absent)
    city: {"mutationOccurrences": None} for city in cities_list
}

# process heatmaps normally for all other variants.
for var in tqdm(variants_list_upload, desc='Variants', position=0):
    update_data[variants_pangolin[var]] = {}
    for city in tqdm(cities_list, desc='Cities', position=1, leave=False):
        # Long format: one row per (date, mutation) with its proportion.
        occurrences = (
            m_df[city][var].T
            .melt(ignore_index=False, var_name="nucMutation", value_name="proportion")
            .reset_index()
        )
        occurrences["date"] = occurrences["date"].astype("str")

        # drawing box decorations:
        # ┌22917G
        # ├22995A
        # └Amp 76
        if var in amplicons:
            # mutations
            for amp, muts in amplicons[var].items():
                box = '┌'
                for m in muts:
                    occurrences.loc[occurrences["nucMutation"] == m, "nucMutation"] = f"{box}{m}"
                    box = '├'
            # amplicons
            for a in amp_col[var]:
                occurrences.loc[occurrences["nucMutation"] == a, "nucMutation"] = f"└{a}"

        tdf_mat[city][var] = occurrences

        update_data[variants_pangolin[var]][city] = {
            # One dict per row; to_dict builds them in a single pass instead of
            # one iloc lookup per row.
            "mutationOccurrences": occurrences.to_dict(orient="records")
        }

# NOTE(review): missing proportions are NaN and json writes them as the bare
# token NaN, which is not strict JSON — confirm the consumer accepts that.
with open(update_data_file, 'w') as file:
    json.dump(update_data, file)

In [None]:
# tdf_mat['Altenrhein (SG)'].keys()

In [None]:
# Inspect the placeholder entry: every city maps to {"mutationOccurrences": None}.
update_data['undetermined']

In [None]:
# Scratch calculation: 0.5 + (number of columns in the Zürich 'om' table) / 5.1.
# NOTE(review): presumably a figure-height estimate for the heatmaps — confirm or remove.
(0.5+len(df_wide['om'].loc['Zürich (ZH)'].columns)/5.1)

In [None]:
# Scratch calculation: 0.5 + (number of columns in the Zürich 'BR' table) / 5.1.
# NOTE(review): presumably a figure-height estimate for the heatmaps — confirm or remove.
(0.5+len(df_wide['BR'].loc['Zürich (ZH)'].columns)/5.1)