In [None]:
# Widen the notebook container to the full browser width so wide plots fit.
# `display`/`HTML` are imported from the public `IPython.display` module
# (`IPython.core.display` is a deprecated location for them).
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
# Figure width handed to `figsize` by the plotting cells further down.
plotwidth = 40

In [None]:
import numpy as np
import pandas as pd
import scipy as sp

import seaborn as sns
import matplotlib.pyplot as plt

from natsort import natsort_keygen

import statsmodels.api as sm
import scipy as sp

import os
import json
import datetime
import psycopg2
import netrc
import re
from tqdm.notebook import tqdm, trange
from tqdm.notebook import tqdm_notebook
# Hook tqdm into pandas (enables the `progress_apply` family on pandas objects).
tqdm_notebook.pandas()

# Globals

A few general variables that tell the notebook where to find its inputs and outputs. Adapt them to your own needs.

In [None]:
# --- Directories -----------------------------------------------------------
datadir = '.'
plotdir = 'plots'
vpipe_working = 'working'  # V-pipe's working directory

# --- Input files -----------------------------------------------------------
tally_mut = os.path.join(datadir, 'tallymut_line.tsv')
cooc_table = os.path.join(vpipe_working, 'ww-cooc.csv')
viollier_data = os.path.join(datadir, 'viollier_data.csv')
data_per_day_and_canton = os.path.join(datadir, 'data_per_day_and_canton2.csv')

# --- Selection -------------------------------------------------------------
start_date = '2020-12-08'
todaydate = datetime.date.today().isoformat()  # YYYY-MM-DD
cities_list = [
    'Altenrhein (SG)',
    'Chur (GR)',
    'Genève (GE)',
    'Laupen (BE)',
    'Lausanne (VD)',
    'Lugano (TI)',
    'Zürich (ZH)',
]
# short variant code -> pangolin lineage (insertion order defines variants_list)
variants_pangolin = {
    'UK': 'B.1.1.7',
    'ZA': 'B.1.351',
    'BR': 'P.1',
    'C36': 'C.36.3',
    'IN1': 'B.1.617.1',
    'IN2': 'B.1.617.2',
    'IN3': 'B.1.617.3',
}
variants_list = ['UK', 'ZA', 'BR', 'C36', 'IN1', 'IN2', 'IN3']
# variants for which mutations shared with other variants are filtered out
exclusive_list = ['ZA', 'BR']
# variants whose mutations are filtered against  (not: 'IN2', 'C36', 'IN1', 'IN3')
exclude_from = ['UK', 'ZA', 'BR']

# --- Output ----------------------------------------------------------------
update_data_file = os.path.join(datadir, 'ww_update_data.json')

In [None]:
# Resampling parameters: number of bootstrap replicates drawn per sample
num_resample = 100

In [None]:
# RegEx used to decode the plant code and the date from a sample name.
# A match exposes these named groups:
#  - plant: the code of the wastewater plant (if plant_name_tsv is provided, it will be looked up for a full name)
#  - year, month, day: used to build the time code of the time series
#  - repeat: set instead of month/day for names of the form <plant>_<year>_R_<n>
# Raw string: `\d` must reach the regex engine untouched (in a plain string it
# is an invalid escape sequence and triggers a warning).
rxname=re.compile(r'(?P<plant>\d+)_(?P<year>20\d{2})_(?:(?:(?P<month>[01]?\d)_(?P<day>[0-3]?\d))|(?:R_(?P<repeat>\d+)))')

# Load data

## Load WWTP sequencing data

In [None]:
# Read the mutation tally (tab separated, `date` parsed as datetime) and add a
# `mutation` label made of the position followed by the base.
df = (
    pd.read_csv(tally_mut, sep='\t', parse_dates=['date'])
      .assign(mutation=lambda tally: tally['pos'].astype(str) + tally['base'])
)

df.head()

In [None]:
# Read the co-occurrence table (plain comma-separated CSV, default index)
# and show it.
df_cooc_raw = pd.read_csv(cooc_table)

df_cooc_raw

In [None]:
# Look for duplicated samples with suffixes: rows (known plant, on/after
# start_date, not a deletion) sharing the same plant/date/batch/mutation key.
df[
    df['plantname'].notna()
    & (df['date'] >= start_date)
    & (df['base'] != '-')
    & df.duplicated(subset=['plantname','date','batch','mutation'], keep=False)
]

In [None]:
# HACK workaround for a duplicated sample: give it its own batch name in both
# the mutation tally and the co-occurrence table.
_dup_sample = 'A1_05_2021_05_19_CATTCGGA-TTTCCATC'
_dup_batch = '20210604_JN8TR_CATTCGGA-TTTCCATC'
for _table in (df, df_cooc_raw):
    _table.loc[_table['sample'] == _dup_sample, 'batch'] = _dup_batch

In [None]:
# Carry plant name and date over from the mutation table to the
# co-occurrence table, keyed by (sample, batch).
df_map = (
    df[['sample','batch','plantname','date']]
    .drop_duplicates(ignore_index=True)
    .set_index(['sample','batch'])
)
df_cooc = (
    df_cooc_raw
    .merge(
        df_map[['plantname','date']],
        how='left',
        on=['sample','batch'],
        copy=False,
        validate='many_to_one',  # each (sample, batch) must map to one plant/date
    )
    .set_index(['plantname','date','batch'])
)
df_cooc

In [None]:
# TODO load actual amplicon information
# variant code -> {amplicon number -> mutations carried together on it}
_amp76_in1_in3 = ['22917G', '23012C']

amplicons = {
    'IN1': {76: list(_amp76_in1_in3)},
    'IN3': {76: list(_amp76_in1_in3)},
    'IN2': {
        76: ['22917G', '22995A'],
        91: ['27638C', '27752T'],
    },
    'BR': {
        # '21614T' is left out: common mutations are removed from the plot
        71: ['21621A', '21638T'],
        # amplicon 95 ('28877T', '28878C') is not part of the signature mutations
    },
}

In [None]:
def mutfilter(var, exclusive=False):
    """Boolean row mask over the global `df` selecting the mutations of `var`.

    Parameters
    ----------
    var : str
        Variant column of `df` (e.g. 'UK'); a row belongs to the variant when
        that column holds "mut".
    exclusive : bool, default False
        When True, keep only mutations that are specific to `var`, i.e. rows
        where every *other* variant listed in the global `exclude_from` is
        empty (NaN or 0).
        e.g.: used to exclude 23063T (N501Y) as several variants have it.

    Returns
    -------
    pandas.Series of bool, aligned on `df.index`.
    """
    has_var = df[var] == "mut"
    if not exclusive:
        return has_var
    # Always require `var` itself to carry the mutation: comparing only the
    # `exclude_from` columns would, for a variant absent from that list,
    # select every row that merely lacks the other variants.
    others = [v for v in exclude_from if v != var]
    free_of_others = (df[others].fillna(0) == 0).all(axis=1)
    return has_var & free_of_others
mutfilter('UK', exclusive=True)

In [None]:
# Build one wide table per variant: rows are (plantname, date, batch) samples,
# columns are the variant's mutations (natural sort order), values are `frac`.
# amp_col[var] lists the amplicon columns added to df_wide[var].
df_wide = {}
amp_col = {}
for var in tqdm(variants_list):
    df_wide[var] = (
        # for the remaining mutations
        # (known plant, on/after start_date, deletions '-' dropped, variant filter applied)
        df[(~df['plantname'].isna()) & (df['date'] >= start_date) & (df['base'] != '-') & (mutfilter(var, exclusive=(var in exclusive_list)))]
         .pivot(index=['plantname', 'date', 'batch'], columns=['mutation'], values='frac')
         .sort_index(axis=1, key=natsort_keygen())
    )
    # add amplicons
    if var in amplicons:
        amp_col[var] = []
        for amp,muts in amplicons[var].items():
            aname =  f"Amp {amp}"
            # The amplicon column goes right after the last mutation of that amplicon.
            # Its values are the co-occurrence `frac` of the rows of df_cooc matching this
            # amplicon and flagged 1 for this variant, looked up on df_wide[var]'s index.
            # NOTE(review): `.loc[df_wide[var].index, ...]` presumably requires every sample
            # of df_wide[var] to be present in the filtered df_cooc — confirm.
            df_wide[var].insert(loc=1+df_wide[var].columns.get_loc(muts[-1]), column=aname, value=df_cooc.loc[(df_cooc['amplicon'] == amp) & (df_cooc[var]==1)].loc[df_wide[var].index,'frac'], allow_duplicates=False)
            amp_col[var] += [ aname ]

df_wide['IN2'].head()

In [None]:
# Display the IN2 table restricted to the 'Zürich (ZH)' plant.
df_wide['IN2'].loc['Zürich (ZH)']

In [None]:
# Display the names of the amplicon columns that were added for IN2.
amp_col['IN2']

In [None]:
# Rows shared by the two tables below: UK mutations from known plants, on or
# after `start_date` (the configured global, not a repeated literal), with
# deletions ('-') dropped.
_uk_rows = df[(~df['plantname'].isna()) & (df['date'] >= start_date) & (df['base'] != '-') & (df['UK'] == "mut")]

# Per-sample `cov` values of each UK mutation (wide format, natural column order).
df_wide_cov = (
    _uk_rows
    .pivot(index=['plantname', 'date', 'batch'], columns=['mutation'], values='cov')
    .sort_index(axis=1, key=natsort_keygen())
)

# Per-sample `var` values of each UK mutation, same layout as df_wide_cov.
df_wide_counts = (
    _uk_rows
    .pivot(index=['plantname', 'date', 'batch'], columns=['mutation'], values='var')
    .sort_index(axis=1, key=natsort_keygen())
)

# Heatmaps

In [None]:
# One heatmap per (city, variant): mutations as rows, samples as columns,
# each cell annotated with its fraction.
for city in tqdm(cities_list, desc='Cities', position=0):
    for var in tqdm(variants_list, desc='Variants', position=1):
        tmp = df_wide[var].loc[city]

        # Decorate the labels of mutations that share an amplicon with box
        # drawing characters so they read as one bracketed group:
        #      22917G┐
        #      22995A┤
        # Amplicon 76┘
        if var in amplicons:
            rmap = {}
            for amp, muts in amplicons[var].items():
                for rank, m in enumerate(muts):
                    corner = ' ┐' if rank == 0 else '┤'
                    rmap[m] = f"{m}{corner}"
            for a in amp_col[var]:
                rmap[a] = f"{a}┘"
            tmp = tmp.rename(columns=rmap)

        per_mutation = tmp.T
        plt.figure(figsize=(plotwidth,5)) # 17
        heat = sns.heatmap(
            # normalise any missing-value marker to np.nan for the colour scale
            data=per_mutation.applymap(lambda x: np.nan if pd.isna(x) else x),
            annot=per_mutation,
            fmt='.1g',
            square=False,
            cbar=False,
            cmap=sns.color_palette("viridis", as_cmap=True),
        )
        heat.set_title(f"{city} - {variants_pangolin[var]}")
        plt.show()

# Do resampling confint for lowess

In [None]:
def resample_fn(x, nsamples):
    """Bootstrap the mean of the non-NaN values of `x`.

    Parameters
    ----------
    x : array-like of float
        Observations; NaN entries are ignored.
    nsamples : int
        Number of bootstrap replicates to draw.

    Returns
    -------
    numpy.ndarray of shape (nsamples,)
        Each entry is the mean of a resample (with replacement, same size as
        the non-NaN data) drawn with the global NumPy random state. When `x`
        holds no valid value every entry is NaN (NumPy emits a
        "Mean of empty slice" RuntimeWarning).
    """
    values = np.asarray(x, dtype=float)
    # The valid observations do not change between replicates: filter once
    # instead of twice per replicate.
    valid = values[~np.isnan(values)]
    n_valid = valid.shape[0]
    return np.array([np.mean(np.random.choice(valid, n_valid, replace=True))
                     for _ in range(nsamples)])

# Short alias for statsmodels' LOWESS smoother, used by the cells below.
lowess = sm.nonparametric.lowess

In [None]:
import warnings

# Bootstrap + smoothing for every (city, variant). Results are kept in nested
# dicts indexed as name[city][var]:
#   r_df  : mutations x samples table (mutations without any value dropped)
#   yres  : per sample, the array of `num_resample` bootstrap means
#   yres2 : the same as a DataFrame (rows = samples, columns = replicates)
#   agg1  : per-date mean of each replicate, on a daily grid
#   agg2  : 7-day rolling mean of agg1
#   agg3  : lowess-smoothed agg1, one curve per replicate
# record=True captures warnings raised inside the block into `w` (which is not
# inspected afterwards) instead of displaying them.
with warnings.catch_warnings(record=True) as w:
    #warnings.simplefilter("ignore", category=RuntimeWarning)
    #warnings.filterwarnings(action='ignore', category=RuntimeWarning, message='Mean of empty slice')
    #warnings.simplefilter(action='once')

    r_df={}
    yres={}
    yres2={}
    agg1={}
    agg2={}
    agg3={}
    for city in tqdm(cities_list, desc='Cities', position=0):
        r_df[city]={}
        yres[city]={}
        yres2[city]={}
        agg1[city]={}
        agg2[city]={}
        agg3[city]={}
        for var in tqdm(variants_list, desc='Variants', position=1, leave=False):
            r_df[city][var] = df_wide[var].loc[city].dropna(axis=1, how='all').T  # .loc[28111:28111]
            # TODO be more clever with amplicons (for now we're just ignoring them)
            tmp = r_df[city][var].drop(index=amp_col[var],inplace=False,errors='ignore') if var in amp_col else r_df[city][var]
            # one row per sample: bootstrap the mean across that sample's mutations
            yres[city][var] = tmp.T.apply(lambda x: resample_fn(x, num_resample), 1)

            # unpack the per-sample arrays into a 2-D frame, keeping the sample index
            yres2[city][var] = pd.DataFrame(np.array([i for i in yres[city][var].values]))
            yres2[city][var].index = yres[city][var].index

            agg1[city][var] = yres2[city][var].groupby('date').agg('mean').asfreq('D')
            agg2[city][var] = agg1[city][var].apply(lambda x: x.rolling(7, min_periods=1).mean(), 0)
            # debug output: number of samples, raw and clipped lowess `frac`
            print(city, var, yres2[city][var].shape[0], 20./yres2[city][var].shape[0], np.clip(20./yres2[city][var].shape[0], 0, 2./3))
            # lowess `frac` is 20 / number of samples, capped at 2/3; x axis is the day index
            agg3[city][var] = agg1[city][var].apply(lambda x: lowess(x, np.arange(x.shape[0]).astype('float64'),
                                                    xvals = np.arange(x.shape[0]).astype('float64'),
                                                    frac= np.clip(20./yres2[city][var].shape[0], 0, 2./3), it=0), 0)



In [None]:
# Number of resampled samples available for each city/variant pair.
for city in cities_list:
    for var in variants_list:
        n_samples = len(yres2[city][var])
        print(f"{city}-{var}:\t{n_samples}")

# Curves

In [None]:
# One figure per (city, variant): the mean of the lowess-smoothed bootstrap
# curves (agg3) with a 5th-95th percentile band, clipped to [0, 1].
# m_df[city][var] holds the per-date mean of every mutation on a daily grid
# (mutations as rows, dates as columns); it is reused by the export cell.
m_df={}
# import matplotlib.gridspec as gridspec
for city in tqdm(cities_list, desc='Cities', position=0):
    m_df[city]={}
    for var in tqdm(variants_list, desc='Variants', position=1, leave=False):
        m_df[city][var] = r_df[city][var].T.groupby("date").agg("mean").asfreq('D').T

        fig, ax = plt.subplots(nrows=1, figsize=(plotwidth, 10), sharex=False)
        # wrapped in a list so the code below can keep addressing ax[0]
        ax = [ax]

        # TODO be more clever with amplicons (for now we're just ignoring them)
        tmp = m_df[city][var].drop(index=amp_col[var],inplace=False,errors='ignore') if var in amp_col else m_df[city][var]
        xvals = tmp.T.index#.astype("str")
        # NOTE(review): yvals is not used anywhere in this cell
        yvals = tmp.apply(np.mean, 0)

        ###

        sns.lineplot(x=xvals, y=np.clip(agg3[city][var].apply(np.mean, 1), 0., 1.), ax=ax[0], label="wastewater lowess smoothing")
        # band between the 5th and 95th percentile of the bootstrap replicates
        ax[0].fill_between(xvals,
                           np.clip(agg3[city][var].apply(lambda x: np.percentile(x, 5), 1).interpolate(), 0, 1),
                           np.clip(agg3[city][var].apply(lambda x: np.percentile(x, 95), 1).interpolate(), 0, 1),
                           alpha=.3)
        ##<-Here
        # Clinical overlay, currently disabled:
        #sns.lineplot(x=xvals[:66], y=np.nanmean(np.array(ZHclinlowess), axis=0),
        #             ax=ax[0], label="cantonal clinical lowess smoothing")
        #ax[0].fill_between(xvals[:66],
        #                     np.nanpercentile(np.array(ZHclinlowess), q=2.5, axis=0),
        #                     np.nanpercentile(np.array(ZHclinlowess), q=97.5, axis=0),
        #                     alpha=.3)
        #
        #sns.barplot(x=xvals[:66], y=viollierZH[viollierZH["date"].isin(xvals)]["frac"],
        #            ax=ax[0], label="cantonal clinical empirical frequencies", color="orange", alpha=0.5)


        # ax[0].set_ylim((0,0.125))
        ax[0].set_xlim((np.datetime64(start_date), np.datetime64(todaydate)))
        ax[0].set_ylabel(f"frac. {variants_pangolin[var]}")
        ax[0].legend(loc="upper left")
        # NOTE(review): the title announces a comparison with clinical samples, but the
        # clinical overlay above is commented out — confirm the wording is still wanted.
        ax[0].set_title(f"{city}: relative {variants_pangolin[var]} prevalence estimates from wastewater samples\n compared to relative prevalence estimates from cantonal clinical samples")
        # ax[0].set_xticks(["2020-12-15", "2021-01-01", "2021-01-15", "2021-02-01", "2021-02-15"])
        #ax[0].set_xticklabels(labels=xvals, rotation=90, ha='center')

        # plt.savefig("plots/ZurPlot2.pdf", bbox_inches='tight')
        plt.show()

# Make data for covSPECTRUM

In [None]:
# Assemble the payload for covSPECTRUM:
#   update_data[pangolin lineage][city] = {
#       "timeseriesSummary":   one record per day (date, proportion, proportionLower, proportionUpper),
#       "mutationOccurrences": one record per (day, mutation) (date, nucMutation, proportion),
#   }
# tdf / tdf_mat keep the intermediate tables as name[city][var].
update_data = {}
tdf = {city: {} for city in cities_list}
tdf_mat = {city: {} for city in cities_list}

for var in tqdm(variants_list, desc='Variants', position=0):
    update_data[variants_pangolin[var]] = {}
    for city in tqdm(cities_list, desc='Cities', position=1, leave=False):
        # per day: mean and 5th/95th percentiles over the smoothed bootstrap
        # replicates, clipped to the valid [0, 1] range
        tdf[city][var] = agg3[city][var].apply(lambda x: {"proportion":np.clip(np.mean(x), 0., 1.),
                                                          "proportionLower":np.clip(np.percentile(x, 5), 0., 1.),
                                                          "proportionUpper":np.clip(np.percentile(x, 95), 0., 1.)},
                                               axis=1, result_type ='expand')
        tdf[city][var] = tdf[city][var].reset_index()
        tdf[city][var]["date"] = tdf[city][var]["date"].astype("str")

        # long format: one row per (date, mutation)
        tdf_mat[city][var] = m_df[city][var].T.melt(ignore_index=False, var_name="nucMutation", value_name="proportion").reset_index()
        tdf_mat[city][var]["date"] = tdf_mat[city][var]["date"].astype("str")
        # drawing box decorations:
        # ┌22917G
        # ├22995A
        # └Amplicon 76
        if var in amplicons:
            # mutations
            for amp,muts in amplicons[var].items():
                box = '┌'
                for m in muts:
                    tdf_mat[city][var].loc[tdf_mat[city][var]["nucMutation"]==m,"nucMutation"]=f"{box}{m}"
                    box = '├'
            # amplicons
            for a in amp_col[var]:
                tdf_mat[city][var].loc[tdf_mat[city][var]["nucMutation"]==a,"nucMutation"]=f"└{a}"

        update_data[variants_pangolin[var]][city] = {
            #"updateDate": todaydate,
            # one dict per row, built in a single call instead of row-by-row iloc
            "timeseriesSummary": tdf[city][var].to_dict(orient="records"),
            "mutationOccurrences": tdf_mat[city][var].to_dict(orient="records"),
        }

# Write to the output path configured in the Globals section.
# NOTE: missing values are serialised as the non-standard JSON token NaN.
with open(update_data_file, 'w') as file:
    json.dump(update_data, file)

## New regression

If you want to use the new regression, this is the point where you need to switch to the [new `ww_smoothing_regression.ipynb` notebook](ww_smoothing_regression.ipynb).

## Upload to Cov-Spectrum

In [None]:
# Host name of the database server; also the key used to look up credentials in ~/.netrc
dbhost='id-hdb-psgr-cp61.ethz.ch'

In [None]:
# load from netrc
# `authenticators()` returns None when the host has no entry (and no default
# entry exists): fail with an explicit message instead of a TypeError.
_db_auth = netrc.netrc().authenticators(dbhost)
if _db_auth is None:
    raise LookupError(f"no credentials found for {dbhost} in the netrc file")
# the tuple is (login, account, password): keep login and password
dbuser,dbpass=_db_auth[0::2]

# alternative: input box
#dbuser = input(f"Enter username for database {dbhost}:\n")
#dbpass = input(f"Enter password for user {dbuser}:\n")

# alternative: enviro
# (no trailing comma: it would turn each value into a 1-tuple)
#dbuser = os.environ['DB_USERNAME']
#dbpass = os.environ['DB_PASSWORD']

dbuser

In [None]:
# Open the connection to the PostgreSQL database with the credentials
# obtained above, then show the connection object.
_db_params = {
    'host': dbhost,
    'database': 'sars_cov_2',
    'user': dbuser,
    'password': dbpass,
    'port': '5432',
}
dbconn = psycopg2.connect(**_db_params)
dbconn

In [None]:
# Cursor used by the upload cell below to run statements on `dbconn`.
cur = dbconn.cursor()
cur

In [None]:
# Upsert one row per (variant, city) into public.spectrum_waste_water_result:
# update `data` when a row with that variant_name/location exists, insert one
# otherwise. Nothing is persisted until `dbconn.commit()` is called.
for var in variants_list:
    for city in cities_list:
        pango=variants_pangolin[var]
        # The payload is serialised to JSON text; the 'NaN' tokens that json.dumps emits
        # for missing values are replaced by 'null'.
        # NOTE(review): the replacement is textual, so it would also alter any string
        # value containing 'NaN' — confirm no label can contain it.
        cur.execute("""
        DO $$
        BEGIN
         IF EXISTS (SELECT ww.data FROM public.spectrum_waste_water_result AS ww WHERE ww.variant_name=%(var)s AND ww.location=%(city)s) THEN
          UPDATE public.spectrum_waste_water_result AS ww SET data=%(data)s WHERE ww.variant_name=%(var)s AND ww.location=%(city)s;
         ELSE
          INSERT INTO public.spectrum_waste_water_result (variant_name, location, data)
          VALUES(%(var)s, %(city)s, %(data)s);
         END IF;
        END
        $$
        """, {'data': json.dumps(update_data[pango][city]).replace('NaN','null'), 'var': pango, 'city': city})

In [None]:
## Abort DB update !
# Run this cell OR the commit cell below, not both: after a rollback there is
# nothing left for the commit to save.
dbconn.rollback()

In [None]:
## Save to DB !
# Makes the statements executed through `cur` since the last commit/rollback permanent.
dbconn.commit()

In [None]:
# Release the cursor, then the database connection.
cur.close()
dbconn.close()

# Quick checks of data

In [None]:
# Distinct plant names present in the mutation tally
df['plantname'].unique()

In [None]:
# Distinct plant codes present in the mutation tally
df['plantcode'].unique()

In [None]:
# All tally rows of the "Bioggio (TI)" plant
df[df['plantname'] == "Bioggio (TI)"]

In [None]:
# `df_wide` is a dict of per-variant tables (it has no `.loc` itself): inspect
# the table of the first variant. After selecting the plant, index level 0 is
# the date.
df_wide[variants_list[0]].loc["Bioggio (TI)"].index.get_level_values(0).unique()

In [None]:
# `df_wide` is a dict of per-variant tables (it has no `.loc` itself): inspect
# the table of the first variant. After selecting the plant, index level 0 is
# the date.
df_wide[variants_list[0]].loc["Ski-resort"].index.get_level_values(0).unique()

In [None]:
# `df_wide` is a dict of per-variant tables (it has no `.index` itself):
# list the plant names (index level 0) of the first variant's table.
df_wide[variants_list[0]].index.get_level_values(0).unique()

## Load national viollier data

In [None]:
# National clinical data: one row per (year, week) with `n` samples of which
# `b117` are B.1.1.7.
viollier = pd.read_csv(viollier_data)
# Index each row by a date derived from its year and week number:
# 1 January of the year plus ((week-1)*7 + 3 - 7) days.
viollier.index = pd.to_datetime(viollier["year"].astype(str), format='%Y') +\
    pd.to_timedelta(((viollier["week"]-1).mul(7)+3-7).astype(str) + ' days')
viollier["frac_b117"] = viollier["b117"] / viollier["n"]
# Pseudo-count estimate (one extra success, two extra trials), same formula as
# the cantonal table: without the +2, p_pseudo exceeds 1 when b117 == n and the
# error below becomes NaN.
viollier["p_pseudo"] = (viollier["b117"] + 1) / (viollier["n"] + 2)
# half-width of the 95% normal-approximation interval
viollier["error"] = 1.96*np.sqrt(viollier["p_pseudo"]*(1-viollier["p_pseudo"])/(viollier["n"] + 2))

## Load cantonal viollier data and aggregate by week

In [None]:
# Cantonal clinical data, one row per day and canton.
viollier2 = pd.read_csv(data_per_day_and_canton)
viollier2["date"] = pd.DatetimeIndex(viollier2["date"])
# week of the year ('%W': weeks start on Monday) and year, both as strings
viollier2["week"] = viollier2.date.dt.strftime('%W')
viollier2["year"] = viollier2.date.dt.strftime('%Y')
# representative date of the week: 1 January of the year plus ((week-1)*7 + 3 - 7) days
viollier2["date_week"] = pd.to_datetime(viollier2["year"].astype(str), format='%Y') +\
    pd.to_timedelta(((viollier2["week"].astype("int")-1).mul(7)+3-7).astype(str) + ' days')

# weekly totals per canton
# NOTE(review): agg("sum") is applied to every remaining column, including the
# non-numeric ones (date, week, year) — confirm this behaves as intended with
# the pandas version in use.
viollier2_sum = viollier2.groupby(["date_week", "canton"], as_index=False).agg("sum")
viollier2_sum["frac_b117"] = viollier2_sum["b117"] / viollier2_sum["sequenced"]
# pseudo-count estimate (one extra success, two extra trials) and the half-width
# of its 95% normal-approximation interval
viollier2_sum["p_pseudo"] = (viollier2_sum["b117"] + 1) / (viollier2_sum["sequenced"] + 2)
viollier2_sum["error"] = 1.96*np.sqrt(viollier2_sum["p_pseudo"]*(1-viollier2_sum["p_pseudo"])/(viollier2_sum["sequenced"]+2))

In [None]:
# Daily B.1.1.7 fraction for the cantons ZH and VD, restricted to
# 2020-12-08 .. 2021-02-11 and smoothed with lowess (frac=2/3, 3 robustifying
# iterations).
viollier2["date"] = pd.to_datetime(viollier2["date"])

# TODO generalise to every entry of `cities_list`. The per-city loop was never
# written (a `for` statement without a body does not even parse), so only the
# two cantons below are handled and `viollier_city` stays empty.
viollier_city= {}

lowess = sm.nonparametric.lowess

# .copy(): work on independent frames rather than on slices of `viollier2`
# (avoids chained-assignment warnings when adding columns).
viollierZH = viollier2[viollier2["canton"]=="ZH"].sort_values("date").copy()
viollierZH["frac"] = viollierZH["b117"] / viollierZH["sequenced"]
viollierZH = viollierZH[(viollierZH["date"] >= np.datetime64("2020-12-08")) & (viollierZH["date"] <= np.datetime64("2021-02-11"))].copy()
# x axis of the smoother is the row number within the window
viollierZH["loess"] = lowess(endog=viollierZH["frac"],
                             exog=np.arange(viollierZH.shape[0]).astype("float64"),
                             xvals=np.arange(viollierZH.shape[0]).astype("float64"),
                             frac= 2./3, it=3)

viollierVD = viollier2[viollier2["canton"]=="VD"].sort_values("date").copy()
viollierVD["frac"] = viollierVD["b117"] / viollierVD["sequenced"]
viollierVD = viollierVD[(viollierVD["date"] >= np.datetime64("2020-12-08")) & (viollierVD["date"] <= np.datetime64("2021-02-11"))].copy()
viollierVD["loess"] = lowess(endog=viollierVD["frac"],
                             exog=np.arange(viollierVD.shape[0]).astype("float64"),
                             xvals=np.arange(viollierVD.shape[0]).astype("float64"),
                             frac= 2./3, it=3)


In [None]:
viollier2["date"] = pd.to_datetime(viollier2["date"])

lowess = sm.nonparametric.lowess


def _canton_b117_lowess(canton, last_date, first_date="2020-12-08", frac=1./3, it=3):
    """Daily B.1.1.7 fraction of one canton with a lowess-smoothed column.

    Parameters
    ----------
    canton : str
        Value of the `canton` column of the global `viollier2` to keep.
    last_date, first_date : str
        Inclusive bounds of the date window.
    frac, it : float, int
        Passed to lowess (span of the smoother, robustifying iterations).

    Returns
    -------
    pandas.DataFrame
        Independent copy of the canton's rows inside the window, sorted by
        date, with the added columns `frac` (b117 / sequenced) and `loess`
        (lowess of `frac` against the row number).
    """
    # .copy(): work on independent frames rather than on slices of
    # `viollier2` (avoids chained-assignment warnings when adding columns).
    rows = viollier2[viollier2["canton"] == canton].sort_values("date").copy()
    rows["frac"] = rows["b117"] / rows["sequenced"]
    in_window = (rows["date"] >= np.datetime64(first_date)) & (rows["date"] <= np.datetime64(last_date))
    rows = rows[in_window].copy()
    rows["loess"] = lowess(endog=rows["frac"],
                           exog=np.arange(rows.shape[0]).astype("float64"),
                           xvals=np.arange(rows.shape[0]).astype("float64"),
                           frac=frac, it=it)
    return rows


# Same smoothing for both cantons; the VD window ends two days later.
viollierZH = _canton_b117_lowess("ZH", last_date="2021-02-11")
viollierVD = _canton_b117_lowess("VD", last_date="2021-02-13")


## Resample clinical loess

# Remake clinical loess by resampling cases

In [None]:
# Clinical rows used for the case resampling, restricted to the comparison
# window of each canton.
# TODO generalise to every entry of `cities_list`. The unfinished per-city loop
# was dropped: `for city in cities_list` without colon or body does not parse,
# and rebinding `viollier2` to a dict would clobber the DataFrame that earlier
# cells read when re-run.
viollierZH2 = viollierZH[(viollierZH["date"] >= np.datetime64("2020-12-08")) & (viollierZH["date"] <= np.datetime64("2021-02-11"))]
viollierVD2 = viollierVD[(viollierVD["date"] >= np.datetime64("2020-12-08")) & (viollierVD["date"] <= np.datetime64("2021-02-13"))]

### Create extended df with one row per case

In [None]:
# Expand the daily ZH counts into one row per sequenced case:
# `b117` rows of (wt=0, b117=1) followed by `sequenced - b117` rows of
# (wt=1, b117=0) for each date, in date order.
# Plain records are collected and turned into a DataFrame once, instead of
# creating one single-row DataFrame per case and re-filtering the table for
# every date.
ZH1hotdfs = []
for d, n_b117, n_seq in zip(viollierZH2["date"], viollierZH2["b117"], viollierZH2["sequenced"]):
    ZH1hotdfs.extend([{"date": d, "wt": 0, "b117": 1}] * int(n_b117))
    ZH1hotdfs.extend([{"date": d, "wt": 1, "b117": 0}] * (int(n_seq) - int(n_b117)))
viollierZH2_1hot = pd.DataFrame(ZH1hotdfs, columns=["date", "wt", "b117"])

In [None]:
# Expand the daily VD counts into one row per sequenced case:
# `b117` rows of (wt=0, b117=1) followed by `sequenced - b117` rows of
# (wt=1, b117=0) for each date, in date order.
# Plain records are collected and turned into a DataFrame once, instead of
# creating one single-row DataFrame per case and re-filtering the table for
# every date.
VD1hotdfs = []
for d, n_b117, n_seq in zip(viollierVD2["date"], viollierVD2["b117"], viollierVD2["sequenced"]):
    VD1hotdfs.extend([{"date": d, "wt": 0, "b117": 1}] * int(n_b117))
    VD1hotdfs.extend([{"date": d, "wt": 1, "b117": 0}] * (int(n_seq) - int(n_b117)))
viollierVD2_1hot = pd.DataFrame(VD1hotdfs, columns=["date", "wt", "b117"])

### resample cases

In [None]:
# Bootstrap the ZH clinical cases: 1000 times, redraw all cases with
# replacement, recompute the daily B.1.1.7 frequency and smooth it with lowess.
# ZHclinlowess collects one smoothed curve per replicate.
# TODO generalise to every entry of `cities_list`. The half-written per-city
# version only resampled inside the inner loop (aggregation and smoothing sat
# outside of it) and appended to a list that was never created.
ZHclinlowess = []
np.random.seed(42)  # reproducible resampling
for i in trange(1000):
    resamp_df = viollierZH2_1hot.iloc[np.random.randint(0, viollierZH2_1hot.shape[0], viollierZH2_1hot.shape[0]),:]
    # daily counts of the resample, aligned on the original dates
    # (dates that drew no case become NaN)
    resamp_df = resamp_df.groupby("date").agg("sum").reindex(viollierZH2["date"])
    resamp_df["freq"] = resamp_df["b117"] / (resamp_df["b117"] + resamp_df["wt"])
    ZHclinlowess.append(lowess(resamp_df["freq"],
                               np.arange(resamp_df.shape[0]).astype("float64"),
                               xvals=np.arange(resamp_df.shape[0]).astype("float64"), frac=1./3))

In [None]:
# Bootstrap the VD clinical cases: each of the 1000 replicates redraws all
# cases with replacement, recomputes the daily B.1.1.7 frequency and smooths
# it with lowess. VDclinlowess collects one smoothed curve per replicate.
VDclinlowess = []
np.random.seed(42)
for i in trange(1000):
    drawn_rows = np.random.randint(0, viollierVD2_1hot.shape[0], viollierVD2_1hot.shape[0])
    resamp_df = (
        viollierVD2_1hot.iloc[drawn_rows, :]
        .groupby("date")
        .agg("sum")
        .reindex(viollierVD2["date"])  # align on the original dates
    )
    resamp_df["freq"] = resamp_df["b117"] / (resamp_df["b117"] + resamp_df["wt"])
    smoothed = lowess(
        resamp_df["freq"],
        np.arange(resamp_df.shape[0]).astype("float64"),
        xvals=np.arange(resamp_df.shape[0]).astype("float64"),
        frac=1./3,
    )
    VDclinlowess.append(smoothed)

In [None]:
# Show, without row truncation, the IN2 co-occurrence fraction on amplicon 91
# for every sample of df_wide['IN2'], blanking poorly supported values.
with pd.option_context('display.max_rows', None): #, 'display.max_columns', None):
    # .copy(): the values are edited below, df_cooc must stay untouched
    newcolumn_data = df_cooc.loc[(df_cooc['amplicon'] == 91) & (df_cooc['IN2']==1)].copy()
    # A single `.loc[rows, column]` assignment: chaining `.loc[rows].loc['frac'] = ...`
    # writes into a temporary copy (and addresses a row label), leaving
    # `newcolumn_data` unchanged.
    low_support = (newcolumn_data['mut_all'] < 5) | (newcolumn_data['frac'] < 0.001)
    newcolumn_data.loc[low_support, 'frac'] = np.nan
    display(newcolumn_data.loc[df_wide['IN2'].index,'frac'])