In [None]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
plotwidth = 40  # figure width (first `figsize` entry) used by the heatmap cell

In [None]:
import numpy as np
import pandas as pd
import scipy as sp

import seaborn as sns
import matplotlib.pyplot as plt

from natsort import natsort_keygen

import statsmodels.api as sm
import scipy as sp

import os
import json
import datetime
import psycopg2
import netrc
import re
from tqdm.notebook import tqdm, trange
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

# Globals

In [None]:
# ---- Directories ----
datadir = '.'
plotdir = 'plots'
vpipe_working = 'working'  # V-pipe's working directory

# ---- Input files ----
tally_mut = os.path.join(datadir, 'tallymut_line.tsv')
cooc_table = os.path.join(vpipe_working, 'ww-cooc.csv')

# ---- Selection ----
start_date = '2020-12-08'
start_date_klzh = '2021-08-15'
todaydate = datetime.date.today().strftime("%Y-%m-%d")

cities_list = [
    'Altenrhein (SG)',
    'Chur (GR)',
    'Genève (GE)',
    'Kanton Zürich',
    'Laupen (BE)',
    'Lausanne (VD)',
    'Lugano (TI)',
    'Zürich (ZH)',
]

# Short variant codes (column names in the tally table) and their Pangolin names.
variants_list = ['UK', 'ZA', 'BR', 'C36', 'IN1', 'IN2', 'IN3']
variants_pangolin = {
    'UK': 'B.1.1.7',
    'ZA': 'B.1.351',
    'BR': 'P.1',
    'C36': 'C.36.3',
    'IN1': 'B.1.617.1',
    'IN2': 'B.1.617.2',
    'IN3': 'B.1.617.3',
}

# Variants for which the exclusive filtering is applied ...
exclusive_list = ['ZA', 'BR']
# ... and the variants they are filtered against
# (other candidates, currently not used: 'C36', 'IN1', 'IN2', 'IN3').
exclude_from = ['UK', 'ZA', 'BR']

# ---- Output ----
update_data_file = os.path.join(datadir, 'ww_update_data_heatmap.json')

# Load Data

## Load WWTP Sequencing Data

In [None]:
# Read the tab-separated mutation tally and add a combined "<pos><base>" label
# (e.g. pos 23063 + base 'T' -> '23063T') used later as pivot column name.
df = (
    pd.read_csv(tally_mut, sep='\t', parse_dates=['date'])
    .assign(mutation=lambda tally: tally['pos'].astype(str) + tally['base'])
)
df.head()

In [None]:
# Read the comma-separated co-occurrence table as-is (default integer index).
df_cooc_raw = pd.read_csv(cooc_table)
df_cooc_raw.head()

In [None]:
# Look for duplicated samples with suffixes:
# rows that would collide on the pivot key used further below.
dup_key = ['plantname', 'date', 'batch', 'mutation']
rows_in_scope = df['plantname'].notna() & (df['date'] >= start_date) & (df['base'] != '-')
df[rows_in_scope & df.duplicated(subset=dup_key, keep=False)]

In [None]:
# HACK workaround for a duplicated sample:
# give it a batch name of its own in both tables so it no longer collides.
dup_sample = 'A1_05_2021_05_19_CATTCGGA-TTTCCATC'
dup_batch = '20210604_JN8TR_CATTCGGA-TTTCCATC'
for frame in (df, df_cooc_raw):
    frame.loc[frame['sample'] == dup_sample, 'batch'] = dup_batch

In [None]:
# Carry plant name and date over from the mutation table to the co-occurrence
# table, matching rows on (sample, batch).
sample_key = ['sample', 'batch']
df_map = (
    df[sample_key + ['plantname', 'date']]
    .drop_duplicates(ignore_index=True)
    .set_index(sample_key)
)
df_cooc = (
    df_cooc_raw
    .merge(
        df_map[['plantname', 'date']],
        how='left',
        on=sample_key,
        copy=False,
        validate='many_to_one',  # each (sample, batch) may map to one plant/date only
    )
    .set_index(['plantname', 'date', 'batch'])
)
df_cooc.head()

In [None]:
# TODO load actual amplicon information
#
# Layout: variant code -> {amplicon number: [mutations located on that amplicon]}.
# The last mutation of each list is the column after which the amplicon's own
# column gets inserted in the wide tables.
amplicons = {
    'IN1': {
        76: ['22917G', '23012C'],
    },
    'IN3': {
        76: ['22917G', '23012C'],
    },
    'IN2': {
        76: ['22917G', '22995A'],
        91: ['27638C', '27752T'],
    },
    'BR': {
        # '21614T' is left out: common mutations are removed from the plot
        71: ['21621A', '21638T'],
        # amplicon 95 ('28877T', '28878C') is not part of the signature mutations
    },
}

In [None]:
def mutfilter(var, exclusive=False, data=None, against=None):
    """Build a boolean row mask selecting the mutations of variant ``var``.

    Parameters
    ----------
    var : str
        Name of the variant column to select on (rows where it equals "mut").
    exclusive : bool, default False
        When True, additionally require that the columns of every *other*
        variant listed in ``against`` are empty (NaN or 0), i.e. keep only
        mutations that do not show up in those variants (e.g. to drop a
        mutation such as 23063T that several variants share).
    data : pandas.DataFrame, optional
        Table to filter. Defaults to the notebook-level ``df``.
    against : sequence of str, optional
        Variant columns to filter against. Defaults to the notebook-level
        ``exclude_from``.

    Returns
    -------
    pandas.Series
        Boolean mask aligned with the index of the filtered table.
    """
    frame = df if data is None else data

    # The row must be a mutation of `var` in every mode. Previously, with
    # exclusive=True and `var` missing from the exclusion list, this was never
    # checked and the mask also selected rows unrelated to `var`.
    keep = frame[var] == "mut"
    if not exclusive:
        return keep

    others = exclude_from if against is None else against
    for other in others:
        if other == var:
            continue
        column = frame[other]
        # "empty" means NaN or 0; tested directly rather than via fillna(0),
        # so text-typed columns need no integer fill value
        keep = keep & (column.isna() | (column == 0))
    return keep
# Example call: the exclusive mask for 'UK' (displayed as this cell's output).
mutfilter('UK', exclusive=True)

In [None]:
# One wide table per variant: rows are (plantname, date, batch), columns are
# mutations in natural sort order, values are the 'frac' column.
# For variants listed in `amplicons`, an extra "Amp <n>" column holding the
# co-occurrence 'frac' is inserted right after the amplicon's last mutation.
df_wide = {}
amp_col = {}

# Row selection shared by all variants: plant known, on/after start_date,
# and base not '-'.
rows_in_scope = df['plantname'].notna() & (df['date'] >= start_date) & (df['base'] != '-')
sample_index = ['plantname', 'date', 'batch']

for var in tqdm(variants_list):
    wide = (
        # for the remaining mutations
        df[rows_in_scope & mutfilter(var, exclusive=(var in exclusive_list))]
        .pivot(index=sample_index, columns=['mutation'], values='frac')
        .sort_index(axis=1, key=natsort_keygen())
    )
    df_wide[var] = wide

    # add amplicons
    if var not in amplicons:
        continue
    amp_col[var] = []
    for amp, muts in amplicons[var].items():
        amp_label = f"Amp {amp}"
        # position directly behind the last mutation of this amplicon
        insert_at = 1 + wide.columns.get_loc(muts[-1])
        # co-occurrence rows for this amplicon that are flagged (== 1) for the variant
        cooc_hits = df_cooc.loc[(df_cooc['amplicon'] == amp) & (df_cooc[var] == 1)]
        wide.insert(
            loc=insert_at,
            column=amp_label,
            value=cooc_hits.loc[wide.index, 'frac'],
            allow_duplicates=False,
        )
        amp_col[var].append(amp_label)

df_wide['IN2'].head()

In [None]:
# Inspect the IN2 table restricted to the 'Zürich (ZH)' rows.
df_wide['IN2'].loc['Zürich (ZH)']

In [None]:
# Names of the amplicon columns that were inserted into the IN2 table.
amp_col['IN2']

In [None]:
# Wide tables of the 'cov' and 'var' columns for the UK mutations, laid out
# like df_wide: rows are (plantname, date, batch), columns are mutations in
# natural sort order.
# The row selection is computed once and uses `start_date` (previously a
# hard-coded copy of the same date), so it stays in sync with the other filters.
uk_rows = df[
    df['plantname'].notna()
    & (df['date'] >= start_date)
    & (df['base'] != '-')
    & (df['UK'] == "mut")
]
sample_index = ['plantname', 'date', 'batch']

df_wide_cov = (
    uk_rows
    .pivot(index=sample_index, columns=['mutation'], values='cov')
    .sort_index(axis=1, key=natsort_keygen())
)

df_wide_counts = (
    uk_rows
    .pivot(index=sample_index, columns=['mutation'], values='var')
    .sort_index(axis=1, key=natsort_keygen())
)

# Heatmaps

In [None]:
# One heatmap per (city, variant): mutations on the y axis, samples on the x
# axis, cells annotated with the value.
for city in tqdm(cities_list, desc='Cities', position=0):
    for var in tqdm(variants_list, desc='Variants', position=1):
        city_wide = df_wide[var].loc[city]

        # drawing box decorations:
        #      22917G┐
        #      22995A┤
        # Amplicon 76┘
        if var in amplicons:
            rmap = {}
            # mutations
            for amp, muts in amplicons[var].items():
                box = ' ┐'
                for m in muts:
                    rmap[m] = f"{m}{box}"
                    box = '┤'
            # amplicons
            for a in amp_col[var]:
                rmap[a] = f"{a}┘"
            # rename into a new frame rather than in place: `city_wide` is a
            # slice of df_wide[var], which must keep its plain column names
            # for the later cells
            city_wide = city_wide.rename(columns=rmap)

        plot_df = city_wide.T

        # explicit figure/axes so that the figure can be released afterwards
        fig, ax = plt.subplots(figsize=(plotwidth, 5))  # 17
        sns.heatmap(
            # normalise every missing value to NaN (replaces the deprecated
            # element-wise DataFrame.applymap call)
            data=plot_df.where(plot_df.notna(), np.nan),
            annot=plot_df,
            fmt='.1g',
            square=False, cbar=False,
            cmap=sns.color_palette("viridis", as_cmap=True),
            ax=ax,
        )
        ax.set_title(f"{city} - {variants_pangolin[var]}")
        plt.show()
        # one figure per city x variant: close it so they do not pile up
        plt.close(fig)

# Prepare Heatmap Data for covSPECTRUM

In [None]:
import warnings

# r_df[city][var]: mutations (rows) x samples (columns) for one city, with
# mutations that have no value at all in that city dropped.
# Warnings raised while slicing are recorded in `w` instead of being shown.
with warnings.catch_warnings(record=True) as w:
    r_df = {}
    for city in tqdm(cities_list, desc='Cities', position=0):
        per_variant = {}
        r_df[city] = per_variant
        for var in tqdm(variants_list, desc='Variants', position=1, leave=False):
            city_wide = df_wide[var].loc[city]
            per_variant[var] = city_wide.dropna(axis=1, how='all').T  # .loc[28111:28111]

# m_df[city][var]: same layout, but averaged per date and expanded to a
# daily frequency (one column per calendar day).
m_df = {}
for city in tqdm(cities_list, desc='Cities', position=0):
    per_variant = {}
    m_df[city] = per_variant
    for var in tqdm(variants_list, desc='Variants', position=1, leave=False):
        by_sample = r_df[city][var].T
        daily = by_sample.groupby("date").agg("mean").asfreq('D')
        per_variant[var] = daily.T

# Make data for covSPECTRUM

In [None]:
# Build the export structure
#   update_data[<pangolin name>][<city>]["mutationOccurrences"] = [
#       {"date": ..., "nucMutation": ..., "proportion": ...}, ...
#   ]
# and write it to `update_data_file` as JSON.
update_data = {}
tdf = {city: {} for city in cities_list}
tdf_mat = {city: {} for city in cities_list}

for var in tqdm(variants_list, desc='Variants', position=0):
    pangolin = variants_pangolin[var]
    update_data[pangolin] = {}
    for city in tqdm(cities_list, desc='Cities', position=1, leave=False):
        # long format: one row per (date, mutation)
        occurrences = (
            m_df[city][var].T
            .melt(ignore_index=False, var_name="nucMutation", value_name="proportion")
            .reset_index()
        )
        occurrences["date"] = occurrences["date"].astype("str")

        # drawing box decorations:
        # ┌22917G
        # ├22995A
        # └Amplicon 76
        if var in amplicons:
            # mutations
            for amp, muts in amplicons[var].items():
                box = '┌'
                for m in muts:
                    occurrences.loc[occurrences["nucMutation"] == m, "nucMutation"] = f"{box}{m}"
                    box = '├'
            # amplicons
            for a in amp_col[var]:
                occurrences.loc[occurrences["nucMutation"] == a, "nucMutation"] = f"└{a}"

        tdf_mat[city][var] = occurrences
        update_data[pangolin][city] = {
            # one dict per row, built in a single call instead of one
            # positional row lookup per record
            "mutationOccurrences": occurrences.to_dict(orient="records"),
        }

# NOTE(review): missing proportions are NaN, and the json module writes them as
# bare `NaN` tokens by default, which strict JSON parsers reject - confirm that
# the consumer of this file tolerates that.
# `json` comes from the notebook's import cell.
with open(update_data_file, 'w') as file:
    json.dump(update_data, file)

In [None]:
# Inspect the generated records for one variant / city pair.
update_data['B.1.1.7']['Altenrhein (SG)']['mutationOccurrences']
