# Complete candidate search (due_202203, 1,250 sequences)
This notebook contains the evaluation of the complete candidate search concerning the aggregate due_202203 with 1,250 sequences. Breadth search and depth search results were taken into account.
The distance matrices were evaluated in terms of infection recall. Furthermore, the structure of the resulting minimum spanning trees (MSTs) was evaluated based on the community adjusted Rand index (ARI), lineage purity, and mean edge weight. The MSTs were generated to compare the results with the MST of the optimized algorithm regarding the distribution of outbreak-related attributes.

In [1]:
!pip install ../../gentrain/.

[0mProcessing /Users/benkraling/code/thesis/gentrain
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: gentrain
  Building wheel for gentrain (setup.py) ... [?25ldone
[?25h  Created wheel for gentrain: filename=gentrain-0.1.2-py3-none-any.whl size=26677 sha256=ee7d072c4821a266c742643492cab23f599eadf561f7b434cc688c1b5cf756ff
  Stored in directory: /private/var/folders/2h/923cq6912sqb0snfvqqfdnmm0000gn/T/pip-ephem-wheel-cache-8ak17129/wheels/cf/e4/57/91c03db2e8c043adeefe35dd0969d3049f61ae0218be0acc9f
Successfully built gentrain
[0mInstalling collected packages: gentrain
  Attempting uninstall: gentrain
    Found existing installation: gentrain 0.1.2
    Uninstalling gentrain-0.1.2:
      Successfully uninstalled gentrain-0.1.2
[0mSuccessfully installed gentrain-0.1.2


In [2]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import adjusted_rand_score
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import numpy as np
import pickle
from gentrain.evaluation import get_computation_rate_plot, candidate_evaluation_and_matrices, get_candidate_evaluation_and_export_mst
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from gentrain.encoding import get_nucleotide_sensitive_encodings, get_mutation_sensitive_encodings, generate_one_hot_encoding
from gentrain.nextclade import get_mutations_from_dataframe
from gentrain.candidate_sourcing import bitwise_xor_candidates
from gentrain.graph import build_mst, export_graph_gexf, mean_edge_weight, get_outbreak_community_labels, build_graph
from scipy.spatial.distance import pdist
import community as community_louvain
import umap
import faiss
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances_argmin
import os
import shutil

In [3]:
# Notebook configuration: both values are interpolated into the input paths
# (sequences, distance matrix) and the graph export path used by later cells.
aggregate: str = "due_202203"
# Sample size of the aggregate; the title states 1,250 sequences.
size: int = 1250

In [4]:
# Output directory for every MST graph exported by this notebook.
graph_path = f"graphs/{aggregate}/{size}"

# Start each run from an empty directory so that exports left over from a
# previous run cannot be mistaken for current results.
if os.path.isdir(graph_path):
    shutil.rmtree(graph_path)

# makedirs creates all missing parents ("graphs" and "graphs/<aggregate>") in
# one call; a chain of os.mkdir calls fails when "graphs" itself is missing.
os.makedirs(graph_path)

In [5]:
# Load the prepared sequences and metadata, indexed and ordered by "igs_id".
sequences_df = (
    pd.read_csv(
        f"../00_data_understanding_and_preparation/aggregates/{aggregate}/{size}/sequences_and_metadata.csv",
        delimiter=";",
    )
    .set_index("igs_id")
    .sort_index()
)
# Number of loaded sequences (rows).
sequences_count = sequences_df.shape[0]

In [6]:
# Derive the mutations table from the sequence frame via the gentrain.nextclade helper.
# NOTE(review): mutations_df is not referenced by any other cell shown in this
# notebook — confirm whether this (potentially slow) step is still needed.
mutations_df = get_mutations_from_dataframe(sequences_df)

In [7]:
# Load the precomputed pairwise distance matrix, drop repeated row ids
# (keeping the first occurrence after sorting), and align rows and columns
# with the order of sequences_df so positions match across both frames.
distance_matrix_df = (
    pd.read_csv(
        f"../01_algorithm_optimization/distance_matrices/{aggregate}/{size}/distance_matrix.csv",
        delimiter=";",
        index_col="Unnamed: 0",
    )
    .sort_index()
    .loc[lambda frame: ~frame.index.duplicated(keep="first")]
    .loc[sequences_df.index, sequences_df.index]
)
# Plain NumPy view of the aligned matrix for the graph and evaluation helpers.
distance_matrix = distance_matrix_df.to_numpy()

In [8]:
# Reference result: graph over the complete distance matrix, its MST, and the
# community labels derived from that MST. The later candidate-search cells
# pass gentrain_mst and gentrain_community_labels to the evaluation helper.
gentrain_graph = build_graph(distance_matrix)
gentrain_mst = build_mst(gentrain_graph)
gentrain_community_labels = get_outbreak_community_labels(gentrain_mst)
# Sampling dates as datetimes and as day offsets from the earliest sample.
# NOTE(review): neither variable is used by the cells shown here — confirm
# whether they are still required.
datetime_sampling_dates = pd.to_datetime(sequences_df["date_of_sampling"])
numeric_dates = (datetime_sampling_dates - datetime_sampling_dates.min()).dt.days
# Export the reference MST (with labels and metadata) as "brute_force_mst".
export_graph_gexf(gentrain_mst, gentrain_community_labels, sequences_df, f"{graph_path}/brute_force_mst")

mst generation time: 1.69s


In [9]:
# Number of distinct community labels found in the reference MST.
communities_count = len(set(gentrain_community_labels))
communities_count

70

In [10]:
# Boolean mask of the strict upper triangle, so every unordered pair is
# counted once and the diagonal (self-distances) is ignored.
mask = np.triu(np.ones(distance_matrix_df.shape, dtype=bool), k=1)
# Entries outside the mask become NaN and drop out of both counts below.
filtered = distance_matrix_df.where(mask)
# Pairs with a distance below 2 are counted as infections.
infections_count = filtered.lt(2).sum().sum()
# Total number of pairwise distances available in the upper triangle.
distances_count = filtered.notna().sum().sum()

## Encoding

In [11]:
# Mutation-sensitive encodings with only the N filter enabled:
# indels are kept and no frequency filtering is applied.
encodings_N_frequency_filter = get_mutation_sensitive_encodings(
    sequences_df,
    exclude_indels=False,
    use_frequency_filtering=False,
    filter_N=True,
)

execution time: 27.22s


In [12]:
# Mutation-sensitive encodings with both the N filter and frequency
# filtering enabled; indels are kept.
encodings_N_and_SNV_frequency_filter = get_mutation_sensitive_encodings(
    sequences_df,
    exclude_indels=False,
    use_frequency_filtering=True,
    filter_N=True,
)

execution time: 28.73s


## Accurate candidate search

### Depth search with N frequency filter

In [14]:
# Depth search on the N-filtered encodings, evaluated at four computation rates.
depth_search_N_frequency_filter = []
for computation_rate in [0.05, 0.1, 0.15, 0.2]:
    # Candidate budget: the given share of all pairwise distances, truncated.
    limit = int(computation_rate * distances_count)
    candidates, runtime = bitwise_xor_candidates(encodings_N_frequency_filter, limit, "depth")
    # Evaluate the candidates against the reference matrix/MST and export the MST.
    evaluation = get_candidate_evaluation_and_export_mst(
        "depth_N",
        candidates,
        graph_path,
        distance_matrix,
        gentrain_community_labels,
        gentrain_mst,
        list(sequences_df["Nextclade_pango"]),
        sequences_df,
        runtime,
    )
    depth_search_N_frequency_filter.append(evaluation)

execution time xor distance calculation: 1.76s
execution time depth search: 0.5s
execution time 39031: 2.36s
mst generation time: 0.05s
execution time xor distance calculation: 1.64s
execution time depth search: 0.47s
execution time 78062: 2.21s
mst generation time: 0.19s
execution time xor distance calculation: 1.72s
execution time depth search: 0.45s
execution time 117093: 2.25s
mst generation time: 0.35s
execution time xor distance calculation: 1.6s
execution time depth search: 0.48s
execution time 156125: 2.17s
mst generation time: 0.44s


In [15]:
# Tabulate the evaluations: one row per computation rate, one column per metric.
pd.DataFrame(depth_search_N_frequency_filter)

Unnamed: 0,computation_rate,infection_detection_rate,infection_recall,infection_precision,infection_f1,runtime,mean_edge_weight,mean_edge_weight_diff,max_edge_weight,subgraph_count,adjusted_rand_index,lineage_purity,lineage_purity_diff
0,0.05,0.734421,1.0,1.0,0.734421,2.36,1.336923,-0.670923,5.0,405,0.248136,0.957255,0.090855
1,0.099999,0.776944,1.0,1.0,0.776944,2.21,1.601502,-0.406345,8.0,251,0.333897,0.940661,0.074261
2,0.149999,0.807504,1.0,1.0,0.807504,2.25,1.772202,-0.235644,7.0,160,0.35464,0.923982,0.057582
3,0.2,0.819141,1.0,1.0,0.819141,2.17,1.874647,-0.1332,8.0,118,0.37533,0.927384,0.060984


### Breadth search with N frequency filter

In [16]:
# Breadth search on the N-filtered encodings, evaluated at four computation rates.
breadth_search_N_frequency_filter = []
for computation_rate in [0.05, 0.1, 0.15, 0.2]:
    # Candidate budget: the given share of all pairwise distances, truncated.
    limit = int(computation_rate * distances_count)
    candidates, runtime = bitwise_xor_candidates(encodings_N_frequency_filter, limit, "breadth")
    # The run label must name the breadth search. Reusing "depth_N" here would
    # give this run the same label as the depth-search run on the same encodings.
    # NOTE(review): the label is presumably used to name the exported MST, in
    # which case the old label overwrote the depth-search exports — confirm.
    evaluation = get_candidate_evaluation_and_export_mst(
        "breadth_N",
        candidates,
        graph_path,
        distance_matrix,
        gentrain_community_labels,
        gentrain_mst,
        list(sequences_df["Nextclade_pango"]),
        sequences_df,
        runtime,
    )
    breadth_search_N_frequency_filter.append(evaluation)

matrix generation time: 0.91s
execution time distance collection: 0.65s
execution time breadth search: 0.62s
execution time 39031: 2.28s
mst generation time: 0.05s
matrix generation time: 0.92s
execution time distance collection: 0.69s
execution time breadth search: 0.66s
execution time 78062: 2.37s
mst generation time: 0.19s
matrix generation time: 0.93s
execution time distance collection: 0.66s
execution time breadth search: 0.66s
execution time 117093: 2.35s
mst generation time: 0.25s
matrix generation time: 0.97s
execution time distance collection: 0.68s
execution time breadth search: 0.65s
execution time 156125: 2.4s
mst generation time: 0.34s


In [17]:
# Tabulate the evaluations: one row per computation rate, one column per metric.
pd.DataFrame(breadth_search_N_frequency_filter)

Unnamed: 0,computation_rate,infection_detection_rate,infection_recall,infection_precision,infection_f1,runtime,mean_edge_weight,mean_edge_weight_diff,max_edge_weight,subgraph_count,adjusted_rand_index,lineage_purity,lineage_purity_diff
0,0.050565,0.477107,1.0,1.0,0.477107,2.28,2.193995,0.186149,12.0,1,0.181549,0.9104,0.044
1,0.098298,0.654704,1.0,1.0,0.654704,2.37,2.160048,0.152202,12.0,1,0.177972,0.8896,0.0232
2,0.144801,0.751713,1.0,1.0,0.751713,2.35,2.138271,0.130424,12.0,1,0.198984,0.8968,0.0304
3,0.190072,0.802393,1.0,1.0,0.802393,2.4,2.124019,0.116173,11.0,1,0.218733,0.9056,0.0392


### Depth search with N and SNV frequency filter

In [18]:
# Depth search on the N- and SNV-frequency-filtered encodings at four computation rates.
depth_search_N_and_SNV_frequency_filter = []
for computation_rate in [0.05, 0.1, 0.15, 0.2]:
    # Candidate budget: the given share of all pairwise distances, truncated.
    limit = int(computation_rate * distances_count)
    candidates, runtime = bitwise_xor_candidates(encodings_N_and_SNV_frequency_filter, limit, "depth")
    # Evaluate the candidates against the reference matrix/MST and export the MST.
    evaluation = get_candidate_evaluation_and_export_mst(
        "depth_N_and_SNV",
        candidates,
        graph_path,
        distance_matrix,
        gentrain_community_labels,
        gentrain_mst,
        list(sequences_df["Nextclade_pango"]),
        sequences_df,
        runtime,
    )
    depth_search_N_and_SNV_frequency_filter.append(evaluation)

execution time xor distance calculation: 0.94s
execution time depth search: 0.54s
execution time 39031: 1.83s
mst generation time: 0.05s
execution time xor distance calculation: 1.02s
execution time depth search: 0.44s
execution time 78062: 1.54s
mst generation time: 0.19s
execution time xor distance calculation: 0.96s
execution time depth search: 0.45s
execution time 117093: 1.48s
mst generation time: 0.26s
execution time xor distance calculation: 0.99s
execution time depth search: 0.45s
execution time 156125: 1.52s
mst generation time: 0.43s


In [19]:
# Tabulate the evaluations: one row per computation rate, one column per metric.
pd.DataFrame(depth_search_N_and_SNV_frequency_filter)

Unnamed: 0,computation_rate,infection_detection_rate,infection_recall,infection_precision,infection_f1,runtime,mean_edge_weight,mean_edge_weight_diff,max_edge_weight,subgraph_count,adjusted_rand_index,lineage_purity,lineage_purity_diff
0,0.05,0.786949,1.0,1.0,0.786949,1.83,1.898376,-0.109471,24.0,265,0.170468,0.966793,0.100393
1,0.099999,0.853725,1.0,1.0,0.853725,1.54,1.860654,-0.147192,19.0,180,0.310475,0.939716,0.073316
2,0.149999,0.903643,1.0,1.0,0.903643,1.48,1.915739,-0.092107,15.0,100,0.365239,0.93045,0.06405
3,0.2,0.934965,1.0,1.0,0.934965,1.52,1.905155,-0.102692,15.0,86,0.444411,0.924433,0.058033


### Breadth search with N and SNV frequency filter

In [20]:
# Breadth search on the N- and SNV-frequency-filtered encodings at four computation rates.
breadth_search_N_and_SNV_frequency_filter = []
for computation_rate in [0.05, 0.1, 0.15, 0.2]:
    # Candidate budget: the given share of all pairwise distances, truncated.
    limit = int(computation_rate * distances_count)
    candidates, runtime = bitwise_xor_candidates(encodings_N_and_SNV_frequency_filter, limit, "breadth")
    # Evaluate the candidates against the reference matrix/MST and export the MST.
    evaluation = get_candidate_evaluation_and_export_mst(
        "breadth_N_and_SNV",
        candidates,
        graph_path,
        distance_matrix,
        gentrain_community_labels,
        gentrain_mst,
        list(sequences_df["Nextclade_pango"]),
        sequences_df,
        runtime,
    )
    breadth_search_N_and_SNV_frequency_filter.append(evaluation)

matrix generation time: 0.28s
execution time distance collection: 0.64s
execution time breadth search: 0.59s
execution time 39031: 1.64s
mst generation time: 0.14s
matrix generation time: 0.31s
execution time distance collection: 0.67s
execution time breadth search: 0.6s
execution time 78062: 1.68s
mst generation time: 0.19s
matrix generation time: 0.33s
execution time distance collection: 0.66s
execution time breadth search: 0.6s
execution time 117093: 1.69s
mst generation time: 0.26s
matrix generation time: 0.28s
execution time distance collection: 0.66s
execution time breadth search: 0.61s
execution time 156125: 1.64s
mst generation time: 0.35s


In [21]:
# Tabulate the evaluations: one row per computation rate, one column per metric.
pd.DataFrame(breadth_search_N_and_SNV_frequency_filter)

Unnamed: 0,computation_rate,infection_detection_rate,infection_recall,infection_precision,infection_f1,runtime,mean_edge_weight,mean_edge_weight_diff,max_edge_weight,subgraph_count,adjusted_rand_index,lineage_purity,lineage_purity_diff
0,0.050565,0.36422,1.0,1.0,0.36422,1.64,2.213931,0.206085,20.0,1,0.175434,0.8968,0.0304
1,0.098298,0.550082,1.0,1.0,0.550082,1.68,2.159728,0.151882,19.0,1,0.216792,0.8984,0.032
2,0.144801,0.66547,1.0,1.0,0.66547,1.69,2.136029,0.128183,17.0,1,0.265712,0.9064,0.04
3,0.190072,0.747254,1.0,1.0,0.747254,1.64,2.113211,0.105364,15.0,1,0.257087,0.8944,0.028


In [22]:
# Plot series for the depth search: per filter variant, the evaluations keyed
# by their computation rate plus the line style used to draw them.
depth_search_evaluation = {
    "N frequency filter": dict(
        values={result["computation_rate"]: result for result in depth_search_N_frequency_filter},
        stroke="dash",
        color="black",
    ),
    "N & SNV frequency filter": dict(
        values={result["computation_rate"]: result for result in depth_search_N_and_SNV_frequency_filter},
        stroke="dot",
        color="black",
    ),
}

In [23]:
# Plot series for the breadth search: per filter variant, the evaluations keyed
# by their computation rate plus the line style used to draw them.
breadth_search_evaluation = {
    "N frequency filter": dict(
        values={result["computation_rate"]: result for result in breadth_search_N_frequency_filter},
        stroke="dash",
        color="black",
    ),
    "N & SNV frequency filter": dict(
        values={result["computation_rate"]: result for result in breadth_search_N_and_SNV_frequency_filter},
        stroke="dot",
        color="black",
    ),
}

### Infection recall for different filters and computation rates using depth search

In [24]:
# Depth search: "infection_detection_rate" per computation rate for both filter
# variants, titled "Infection recall". The last argument is presumably the
# legend layout (other cells pass it as legend=...).
# NOTE(review): the result tables also have a separate "infection_recall"
# column — confirm that "infection_detection_rate" is the intended metric.
sub_fig = get_computation_rate_plot(
    "infection_detection_rate",
    depth_search_evaluation,
    "Infection recall",
    {
        "x": 0.85,
        "y": 0,
        "itemwidth": 60,
        "xanchor": "left",
        "yanchor": "bottom",
        "font": {"size": 30},
    },
)

sub_fig.show()

### Community ARI for different filters and computation rates using depth search

In [25]:
# Depth search: "adjusted_rand_index" per computation rate for both filter
# variants, titled "Community ARI". The last argument is presumably the legend
# layout (other cells pass it as legend=...).
sub_fig = get_computation_rate_plot(
    "adjusted_rand_index",
    depth_search_evaluation,
    "Community ARI",
    {
        "x": 0.85,
        "y": 0,
        "itemwidth": 60,
        "xanchor": "left",
        "yanchor": "bottom",
        "font": {"size": 30},
    },
)

sub_fig.show()

### Infection recall for different filters and computation rates using breadth search

In [26]:
# Breadth search: "infection_detection_rate" per computation rate for both
# filter variants, titled "Infection recall".
# NOTE(review): the result tables also have a separate "infection_recall"
# column — confirm that "infection_detection_rate" is the intended metric.
sub_fig = get_computation_rate_plot(
    "infection_detection_rate",
    breadth_search_evaluation,
    "Infection recall",
    legend={
        "x": 0.65,
        "y": 0.05,
        "xanchor": "left",
        "yanchor": "bottom",
        "font": {"size": 35},
    },
)
sub_fig.show()

### Community ARI for different filters and computation rates using breadth search

In [27]:
# Breadth search: "adjusted_rand_index" per computation rate for both filter
# variants, titled "Community ARI".
sub_fig = get_computation_rate_plot(
    "adjusted_rand_index",
    breadth_search_evaluation,
    "Community ARI",
    legend={
        "x": 0.55,
        "y": 0.02,
        "xanchor": "left",
        "yanchor": "bottom",
        "font": {"size": 35},
    },
)
sub_fig.show()

In [28]:
# Plot series comparing depth and breadth search, both on the N- and
# SNV-frequency-filtered encodings, keyed by computation rate.
depth_vs_breadth_search_evaluation = {
    "Depth search": dict(
        values={result["computation_rate"]: result for result in depth_search_N_and_SNV_frequency_filter},
        stroke="solid",
        color="blue",
    ),
    "Breadth search": dict(
        values={result["computation_rate"]: result for result in breadth_search_N_and_SNV_frequency_filter},
        stroke="solid",
        color="green",
    ),
}

### Infection recall for different computation rates using depth search and breadth search (N and SNV frequency filter)

In [29]:
# Depth vs. breadth search: "infection_detection_rate" per computation rate,
# titled "Infection recall".
# NOTE(review): the result tables also have a separate "infection_recall"
# column — confirm that "infection_detection_rate" is the intended metric.
sub_fig = get_computation_rate_plot(
    "infection_detection_rate",
    depth_vs_breadth_search_evaluation,
    "Infection recall",
    legend={
        "x": 0.7,
        "y": 0.1,
        "xanchor": "left",
        "yanchor": "bottom",
        "font": {"size": 35},
    },
)
sub_fig.show()

### Community ARI for different computation rates using depth search and breadth search (N and SNV frequency filter)

In [30]:
# Depth vs. breadth search: "adjusted_rand_index" per computation rate,
# titled "Community ARI".
sub_fig = get_computation_rate_plot(
    "adjusted_rand_index",
    depth_vs_breadth_search_evaluation,
    "Community ARI",
    legend={
        "x": 0.7,
        "y": 0.1,
        "xanchor": "left",
        "yanchor": "bottom",
        "font": {"size": 35},
    },
)
sub_fig.show()