# OUTLIER NOTEBOOK

In [2]:
import sys
sys.path.append("../notebooks/scripts/")

In [3]:
import altair as alt
from altair_saver import save
from augur.utils import json_to_tree
import json
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import re
#from reportlab.graphics import renderPDF
import seaborn as sns
#from svglib.svglib import svg2rlg

from Helpers import linking_tree_with_plots_clickable, linking_tree_with_plots_brush, scatterplot_with_tooltip_interactive
from Helpers import get_y_positions

%matplotlib inline

In [4]:
# Tighten the padding that Altair puts around embedded charts.
alt.renderers.set_embed_options(padding=dict(left=0, right=0, bottom=1, top=1))

RendererRegistry.enable('default')

In [5]:
sns.set_style("ticks")

# Apply all matplotlib defaults in a single update.
mpl.rcParams.update({
    # No top/right spines.
    'axes.spines.top': False,
    'axes.spines.right': False,
    # Higher resolution for saved figures than for on-screen display.
    'savefig.dpi': 300,
    'figure.dpi': 100,
    # Text weights and sizes for presentations and manuscripts.
    'font.weight': "normal",
    'axes.labelweight': "normal",
    'font.size': 10,
    'axes.labelsize': 10,
    'legend.fontsize': 8,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'axes.titlesize': 8,
    # Render text with matplotlib's own engine rather than LaTeX.
    'text.usetex': False,
})

In [6]:
# Start a headless Chrome driver, but only when the notebook is executed by
# Snakemake (which injects a global named `snakemake`).
try:
    snakemake.input.node_df
except (NameError, AttributeError):
    # NameError: no `snakemake` global, i.e. an interactive session.
    # AttributeError: `snakemake` exists but this rule has no `node_df` input.
    print("not in Snakemake, imports unnecessary")
else:
    try:
        import selenium
        from selenium.webdriver import Chrome
        from selenium import webdriver

        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument("--remote-debugging-port=9222")

        browser = webdriver.Chrome(options=chrome_options)
    except Exception as error:
        # Keep the notebook running (as before), but report the real cause
        # instead of claiming we are outside Snakemake.
        print(f"in Snakemake, but headless Chrome could not be started: {error!r}")

not in Snakemake, imports unnecessary


## Pathogen-specific variables

Consider consolidating these into a single configuration file that can be passed to the notebook as a command line argument for more scriptable generation of these figures.

# Snakemake-specific variables

In [7]:
# Pick up the inputs that Snakemake passes to this notebook.
# Each name is bound to the Snakemake input value itself (presumably a file
# path -- TODO confirm against the workflow rule). The original assignments
# ended in a trailing comma, which wrapped every value in a 1-tuple.
try:
    node_df = snakemake.input.node_df

    mds_df_human_swine = snakemake.input.mds_df_human_swine
    mds_df_full = snakemake.input.mds_df_full

    outlier_df_human_swine = snakemake.input.outlier_df_human_swine
    outlier_df_full = snakemake.input.outlier_df_full

    # KDE density
    MDS_KDE_df_human_swine = snakemake.input.MDS_KDE_df_human_swine
    MDS_KDE_metadata_df_human_swine = snakemake.input.MDS_KDE_metadata_df_human_swine
    MDS_outlier_metadata_df_human_swine = snakemake.input.MDS_outlier_metadata_df_human_swine
except NameError:
    # No `snakemake` global: the notebook is being run interactively.
    print("not in Snakemake, imports unnecessary")
except AttributeError as error:
    # Running under Snakemake, but the rule does not declare one of the inputs.
    print(f"Snakemake rule is missing an expected input: {error}")

not in Snakemake, imports unnecessary


In [8]:
# Colour scale for the outlier labels: red for 'outlier', black for 'not_outlier'.
domain, range_ = ['outlier', 'not_outlier'], ['#FF0000', '#000000']

# Reading in all the data from the scripts

In [9]:
import os

# Show the kernel's working directory; the relative "results/" and "config/"
# paths read by the cells below are resolved against it.
os.getcwd()

'/mnt/c/Work/BedfordProjects/cartography/outlier_analysis'

In [10]:
# Load the per-node table and the strain metadata from hard-coded,
# tab-separated files under results/ (the Snakemake input is not used here).
node_df = pd.read_csv("results/table.tsv", sep="\t")
metadata_df = pd.read_csv("results/total_metadata.tsv", sep="\t")

In [11]:
# Display the metadata table (pandas truncates the middle rows in the output).
metadata_df

Unnamed: 0,strain,date,accession,country,region,outlier
0,A/Alabama/01/2017,2017-01-04,CY216617,USA,N,not_outlier
1,A/Alabama/02/2015,2015-01-26,KT836052,USA,N,not_outlier
2,A/Alabama/02/2016,2016-01-27,KX412662,USA,N,not_outlier
3,A/Alabama/02/2017,2017-01-05,CY216609,USA,N,not_outlier
4,A/Alabama/03/2017,2017-01-19,CY216625,USA,N,not_outlier
...,...,...,...,...,...,...
9573,A/swine/Virginia/A02478454/2019,2019-04-26,MK967619,USA,N,outlier
9574,A/swine/Virginia/A02478581/2019,2019-04-22,MN436839,USA,N,outlier
9575,A/swine/Virginia/A02478738/2019,2019-08-29,MN608561,USA,N,outlier
9576,A/swine/Wisconsin/A02140875/2018,2018-01-19,MG977436,USA,N,outlier


In [12]:
# Shorten the column names to the ones later cells select: `date` and `y`.
node_df = node_df.rename(columns={"num_date": "date", "y_value": "y"})

In [13]:
# Preview the first rows of the node table after the rename.
node_df.head()

Unnamed: 0,strain,date,mds1,mds2,clade_membership,mds_label,y
0,A/swine/Saskatchewan/SD0300/2018,2018.35,105.13,198.87,unassigned,-1.0,1
1,A/swine/Iowa/A02478972/2019,2019.98,146.43,-175.78,unassigned,-1.0,2
2,A/swine/BritishColumbia/SD0290/2018,2018.31,6.89,219.21,unassigned,-1.0,3
3,A/swine/Saskatchewan/SD0271/2018,2018.18,211.55,31.16,unassigned,-1.0,4
4,A/swine/Manitoba/D0525/2018,2018.05,-192.32,-89.75,unassigned,-1.0,5


In [14]:
# Attach the `outlier` label from the metadata to each node (inner join on strain).
node_df = pd.merge(node_df, metadata_df.loc[:, ["strain", "outlier"]], on="strain")

# Loading the MDS embeddings and outlier predictions

In [37]:
# Human/swine dataset: MDS embedding (first CSV column becomes the index)
# and the outlier predictions.
MDS_df_human_swine = pd.read_csv("results/embed_mds_human_swine.csv", index_col=0)
outlier_df_human_swine = pd.read_csv("results/outliers_human_swine.csv")

# Full dataset: same two tables.
MDS_df_full = pd.read_csv("results/embed_mds_full.csv", index_col=0)
outlier_df_full = pd.read_csv("results/outliers_full.csv")

In [38]:
# Add date, tree y-position and the outlier label to the human/swine embedding.
merged_mds_df_human_swine = pd.merge(
    MDS_df_human_swine,
    node_df[["strain", "date", "y", "outlier"]],
    on="strain",
)

In [41]:
# Add the predicted outlier status to both embeddings (inner join on strain).
merged_mds_df_human_swine = pd.merge(
    merged_mds_df_human_swine,
    outlier_df_human_swine[['strain', 'predicted_outlier_status']],
    on="strain",
)
merged_mds_df_full = pd.merge(
    MDS_df_full,
    outlier_df_full[['strain', 'predicted_outlier_status']],
    on="strain",
)

In [None]:
# Build the brush-linked charts coloured by the `outlier` label and show the
# first two side by side.
list_of_chart = linking_tree_with_plots_brush(
    merged_mds_df_human_swine,
    ['mds1', 'mds2'],
    ["MDS1", "MDS2"],
    'outlier:N',
    ['strain', 'outlier'],
    domain,
    range_,
)
list_of_chart[0] | list_of_chart[1]

In [48]:
# Recode the string labels to the -1 / 1 convention used by
# `predicted_outlier_status`, so the two columns can be compared directly.
mapping = {'not_outlier': 1, 'outlier': -1}

# With a nested dict, DataFrame.replace treats the OUTER keys as column names.
# Only the `outlier` column holds these labels, so it is the only key needed;
# the former 'not_outlier' key named a column that the merges above never add.
merged_mds_df_human_swine = merged_mds_df_human_swine.replace({'outlier': mapping})

In [49]:
# Colour scale for the recoded labels: red for '-1', black for '1'.
domain, range_ = ['-1', '1'], ['#FF0000', '#000000']

In [50]:
# Count how many rows carry each recoded `outlier` value.
merged_mds_df_human_swine['outlier'].value_counts()

1    1457
Name: outlier, dtype: int64

In [51]:
# Same MDS scatterplot twice: coloured by the predicted status (left) and by
# the `outlier` label (right).
chart_12_mds = scatterplot_with_tooltip_interactive(
    merged_mds_df_human_swine,
    'mds1', 'mds2',
    "mds1", "mds2",
    ['strain', 'outlier'],
    'predicted_outlier_status:N',
    domain,
    range_,
)
chart_mds = scatterplot_with_tooltip_interactive(
    merged_mds_df_human_swine,
    'mds1', 'mds2',
    "mds1", "mds2",
    ['strain', 'outlier'],
    'outlier:N',
    domain,
    range_,
)
chart_12_mds | chart_mds

In [23]:
from sklearn.metrics import accuracy_score

# Share of rows where `predicted_outlier_status` equals the `outlier` label
# (the label is passed as y_true).
accuracy_score(
    merged_mds_df_human_swine["outlier"].tolist(),
    merged_mds_df_human_swine["predicted_outlier_status"].tolist(),
)

0.9869130100076983

In [24]:
from sklearn.metrics import confusion_matrix, matthews_corrcoef

# Matthews correlation coefficient between the `outlier` label (y_true) and
# the predicted status (y_pred).
matthews_corrcoef(
    merged_mds_df_human_swine["outlier"].tolist(),
    merged_mds_df_human_swine["predicted_outlier_status"].tolist(),
)

0.2912099454229219

In [25]:
# Rows are the `outlier` label, columns the predicted status; scikit-learn
# orders the classes by sorted value, so -1 comes before 1.
confusion_matrix(
    merged_mds_df_human_swine["outlier"].tolist(),
    merged_mds_df_human_swine["predicted_outlier_status"].tolist(),
)

array([[   4,   33],
       [   1, 2560]])

In [56]:
# Read the outlier strain list, one whitespace-stripped entry per line.
with open("config/outliers_h3n2_full.txt") as outlier_file:
    outliers = list(map(str.strip, outlier_file))

In [57]:
# Flag the strains that appear in the outlier list read from the config file.
outlier_df_human_swine = outlier_df_human_swine.assign(
    isin_outliers=outlier_df_human_swine["strain"].isin(outliers)
)

In [58]:
# Show only the rows predicted to be outliers (status -1).
outlier_df_human_swine[outlier_df_human_swine["predicted_outlier_status"] == -1]

Unnamed: 0,strain,mds1,mds2,mds_label,date,accession,country,region,outlier,predicted_outlier_status,predicted_LOF_outlier_status,X_scores,isin_outliers
0,A/Shiraz/1006/2015,-106.99314,32.165168,-1,2015-10-23,MF993038,Iran,N,1,-1.0,-1,-9.783518,False
100,A/Tehran/713825/2015,-0.847709,3.748126,-1,2015-01-17,KY328781,Iran,N,1,-1.0,-1,-2.762879,False
258,A/Shanghai/10/2017,1.361378,-5.267726,-1,2017-01-10,MG984185,China,N,1,-1.0,-1,-4.177157,False
354,A/Firuzkuh/49748/2016,-1.974736,-2.69915,-1,2016-03-26,KY328772,Iran,N,1,-1.0,-1,-3.015752,False
862,A/SouthAustralia/34/2016,-1.666881,3.350855,-1,2016-06-24,CY254417,Australia,S,1,-1.0,-1,-2.813941,False
921,A/Arizona/01/2018,21.898007,-2.312128,5,2018-01-05,MH126433,USA,N,1,-1.0,-1,-2.80143,False
1094,A/Japan/FDX70336/2018,21.693699,-2.100452,5,2018-01-19,MH701628,Japan,N,1,-1.0,-1,-2.768069,False
1216,A/Italy/7836/2018,32.583681,-6.126684,-1,2018-11-01,MK576227,Italy,N,1,-1.0,-1,-2.928392,False
1720,A/NorthCarolina/02/2020,3.517545,-37.033431,-1,2020-01-15,MT245946,USA,N,1,-1.0,-1,-3.060447,False
1904,A/Hawaii/59/2019,-19.819721,48.528672,-1,2019-08-10,MN581560,USA,N,1,-1.0,-1,-2.763003,False


In [32]:
# Fresh copies of the two outlier tables, read from the same CSV files that
# were loaded earlier in this notebook.
human_swine_outlier = pd.read_csv("results/outliers_human_swine.csv")
full_outlier = pd.read_csv("results/outliers_full.csv")

In [33]:
# Count the rows per predicted status in the full dataset.
full_outlier["predicted_outlier_status"].value_counts()

 1.0    5834
-1.0      32
Name: predicted_outlier_status, dtype: int64

In [None]:
import altair as alt

# Lift Altair's row limit before charting the full table.
alt.data_transformers.disable_max_rows()

# One circle per strain along X_scores, coloured by predicted status.
chart1 = (
    alt.Chart(full_outlier)
    .mark_circle(size=60)
    .encode(
        x=alt.X('X_scores', title="X_scores"),
        color='predicted_outlier_status:N',
        tooltip=['strain'],
    )
)
chart1

In [None]:
# Density of the `X_scores` column for the human/swine dataset, with a rug of
# the individual values and a red line 4 standard deviations below the mean.
human_swine_scores = human_swine_outlier["X_scores"]
human_swine_cutoff = human_swine_scores.mean() - 4 * human_swine_scores.std()

fig, ax = plt.subplots(1, 1, figsize=(6, 2))
sns.kdeplot(human_swine_scores, ax=ax)
ax.axvline(x=human_swine_cutoff, color="red")
sns.rugplot(data=human_swine_outlier, x="X_scores", color="orange", ax=ax, height=.1)
ax.set(xlim=(-20, -.5), xlabel='Local Outlier Factor Scores', ylabel='Density')

# The output path only exists under Snakemake; outside it, show the figure
# without saving instead of failing with a NameError.
try:
    human_swine_figure_path = snakemake.output.distributionHumanSwine
except NameError:
    print("not in Snakemake, figure not saved")
else:
    fig.savefig(human_swine_figure_path)

In [None]:
# Density of the `X_scores` column for the full dataset, with a rug of the
# individual values and a red line 4 standard deviations below the mean.
full_scores = full_outlier["X_scores"]
full_cutoff = full_scores.mean() - 4 * full_scores.std()

fig, ax = plt.subplots(1, 1, figsize=(6, 2))
sns.kdeplot(full_scores, ax=ax)
ax.axvline(x=full_cutoff, color="red")
sns.rugplot(data=full_outlier, x="X_scores", color="orange", ax=ax, height=.1)
ax.set(xlim=(-20, -.5), xlabel='Local Outlier Factor Scores', ylabel='Density')

# The output path only exists under Snakemake; outside it, show the figure
# without saving instead of failing with a NameError.
try:
    full_figure_path = snakemake.output.distributionFull
except NameError:
    print("not in Snakemake, figure not saved")
else:
    fig.savefig(full_figure_path)

In [None]:
# KDE distance table and its metadata (first CSV column becomes the index).
MDS_KDE_df_human_swine = pd.read_csv(
    "results/KDEDensity_mds_human_swine.csv", index_col=0
)
MDS_KDE_metadata_df_human_swine = pd.read_csv(
    "results/KDEDensity_mds_metadata_human_swine.csv", index_col=0
)
# Outlier metadata; unlike the other inputs this one is read from ../docs.
MDS_outlier_metadata_df_human_swine = pd.read_csv(
    "../docs/LOF_accuracy_human_swine.csv"
)

In [None]:
# KDE of the scaled distances, split by `clade_status` (1 = same clade,
# 0 = different clade), with the two thresholds drawn as vertical lines.
fig, ax = plt.subplots(1, 1, figsize=(6, 3))

same_clade_distances = MDS_KDE_df_human_swine.query("clade_status == 1")["scaled_distance"]
different_clade_distances = MDS_KDE_df_human_swine.query("clade_status == 0")["scaled_distance"]
sns.kdeplot(same_clade_distances, label="Same clade", ax=ax)
sns.kdeplot(different_clade_distances, label="Different clade", ax=ax)

# Each metadata table supplies its threshold in the first row.
svc_threshold = MDS_KDE_metadata_df_human_swine["threshold"].iloc[0]
outlier_threshold = MDS_outlier_metadata_df_human_swine["threshold"].iloc[0]
ax.axvline(x=svc_threshold, label="SVC threshold", color="#000000", alpha=0.5)
ax.axvline(x=outlier_threshold, label="Outlier threshold", color="#800000", alpha=0.5)

ax.set_xlabel("Scaled Euclidean distance ")
ax.set_ylabel("KDE density")

# Draw the legend explicitly: the labels above are otherwise never shown for
# the threshold lines, so the curves and lines cannot be told apart.
ax.legend(frameon=False)

fig.suptitle('MDS KDE Plot', fontsize=16)
sns.despine()

# The output path only exists under Snakemake; outside it, show the figure
# without saving instead of failing with a NameError.
try:
    kde_figure_path = snakemake.output.KDE_densityHumanSwine
except NameError:
    print("not in Snakemake, figure not saved")
else:
    fig.savefig(kde_figure_path)