In [1]:
import plotly.express as px
import pandas as pd
import networkx as nx
import community as community_louvain
from gentrain.nextclade import get_mutations_from_dataframe
from collections import Counter
import numpy as np

## Duesseldorf 2022

In [2]:
dataset_name = "duesseldorf_2022"

In [3]:
distance_matrix_df = pd.read_csv(f"../../data/{dataset_name}/distance_matrix.csv", delimiter=";", index_col="Unnamed: 0").sort_index()
distance_matrix = distance_matrix_df.to_numpy()

In [4]:
isolates_df = pd.read_csv(f"../../data/{dataset_name}/sequences_and_metadata.csv", delimiter=";").set_index("igs_id").sort_index()
mutations_df = get_mutations_from_dataframe(isolates_df).reset_index()

In [5]:
complete_graph = nx.Graph()
n = distance_matrix.shape[0]
complete_graph.add_nodes_from(range(n))
for i in range(n):
    for j in range(i + 1, n):
        distance = distance_matrix[i][j]
        complete_graph.add_edge(i, j, weight=distance)

In [6]:
mst = nx.minimum_spanning_tree(complete_graph, algorithm="prim")

In [7]:
partition = community_louvain.best_partition(mst)

In [8]:
communities = {}
for isolate_id, community_id in partition.items():
    if community_id not in communities:
        communities[community_id] = []
    communities[community_id].append(isolate_id)

In [9]:
cluster_mutations = []
tick_vals = []
tick_texts = []
for community_id, community_isolates in communities.items():
    mutation_strings = mutations_df.loc[community_isolates]["substitutions"].sum()
    mutation_counts = Counter(mutation_strings)
    cluster_mutations.append({k for k, v in mutation_counts.items() if v > 1})

## Analysing characteristic mutations

In [10]:
characteristic_mutations = []
for i, mutations_in_cluster in enumerate(cluster_mutations):
    before = cluster_mutations[:i]
    after = cluster_mutations[i+1:]
    union_other = set().union(*(before+after))
    characteristic_mutations.append(mutations_in_cluster - union_other)

In [11]:
characteristic_subsitution_counts = {}
substitution_counts = mutations_df["substitutions"].explode().value_counts()
for characteristic_mutations_in_cluster in characteristic_mutations:
    for substitution in characteristic_mutations_in_cluster:
        characteristic_subsitution_counts[substitution] = substitution_counts[substitution]

In [12]:
mean_occurency_count = sum(characteristic_subsitution_counts.values()) / len(characteristic_subsitution_counts)
f"Mean ocurrency count through characteristic mutations: {mean_occurency_count}"

'Mean ocurrency count through characteristic mutations: 4.822094691535151'

In [13]:
max_occurency_count = max(characteristic_subsitution_counts.values())
f"Max ocurrency count through characteristic mutations: {max_occurency_count}"

'Max ocurrency count through characteristic mutations: 36'

In [14]:
cleaned_max_occurency_count = round(np.percentile(list(characteristic_subsitution_counts.values()), 99))
f"Clean max ocurrency count through characteristic mutations: {cleaned_max_occurency_count}"

'Clean max ocurrency count through characteristic mutations: 24'

In [15]:
min_occurency_count = min(characteristic_subsitution_counts.values())
f"Min ocurrency count through characteristic mutations: {min_occurency_count}"

'Min ocurrency count through characteristic mutations: 2'

In [16]:
f"Characteristic mutations occur in max {round((max_occurency_count/len(isolates_df)) * 100, 2)}% of all sequences."

'Characteristic mutations occur in max 0.72% of all sequences.'

In [17]:
f"Cleaned characteristic mutations occur in max {round((cleaned_max_occurency_count/len(isolates_df)) * 100, 2)}% of all sequences."

'Cleaned characteristic mutations occur in max 0.48% of all sequences.'

## Duesseldorf 202203

In [18]:
dataset_name = "duesseldorf_202203"

In [19]:
distance_matrix_df = pd.read_csv(f"../../data/{dataset_name}/distance_matrix.csv", delimiter=";", index_col="Unnamed: 0").sort_index()
distance_matrix = distance_matrix_df.to_numpy()

In [20]:
isolates_df = pd.read_csv(f"../../data/{dataset_name}/sequences_and_metadata.csv", delimiter=";").set_index("igs_id").sort_index()
mutations_df = get_mutations_from_dataframe(isolates_df).reset_index()

In [21]:
complete_graph = nx.Graph()
n = distance_matrix.shape[0]
complete_graph.add_nodes_from(range(n))
for i in range(n):
    for j in range(i + 1, n):
        distance = distance_matrix[i][j]
        complete_graph.add_edge(i, j, weight=distance)

In [22]:
mst = nx.minimum_spanning_tree(complete_graph, algorithm="prim")

In [23]:
partition = community_louvain.best_partition(mst)

In [24]:
communities = {}
for isolate_id, community_id in partition.items():
    if community_id not in communities:
        communities[community_id] = []
    communities[community_id].append(isolate_id)

In [25]:
cluster_mutations = []
tick_vals = []
tick_texts = []
for community_id, community_isolates in communities.items():
    mutation_strings = mutations_df.loc[community_isolates]["substitutions"].sum()
    mutation_counts = Counter(mutation_strings)
    cluster_mutations.append({k for k, v in mutation_counts.items() if v > 1})

## Analysing characteristic mutations

In [26]:
characteristic_mutations = []
for i, mutations_in_cluster in enumerate(cluster_mutations):
    before = cluster_mutations[:i]
    after = cluster_mutations[i+1:]
    union_other = set().union(*(before+after))
    characteristic_mutations.append(mutations_in_cluster - union_other)

In [27]:
characteristic_subsitution_counts = {}
substitution_counts = mutations_df["substitutions"].explode().value_counts()
for characteristic_mutations_in_cluster in characteristic_mutations:
    for substitution in characteristic_mutations_in_cluster:
        characteristic_subsitution_counts[substitution] = substitution_counts[substitution]

In [28]:
mean_occurency_count = sum(characteristic_subsitution_counts.values()) / len(characteristic_subsitution_counts)
f"Mean ocurrency count through characteristic mutations: {mean_occurency_count}"

'Mean ocurrency count through characteristic mutations: 4.74163783160323'

In [29]:
max_occurency_count = max(characteristic_subsitution_counts.values())
f"Max ocurrency count through characteristic mutations: {max_occurency_count}"

'Max ocurrency count through characteristic mutations: 27'

In [30]:
cleaned_max_occurency_count = round(np.percentile(list(characteristic_subsitution_counts.values()), 99))
f"Clean max ocurrency count through characteristic mutations: {cleaned_max_occurency_count}"

'Clean max ocurrency count through characteristic mutations: 18'

In [31]:
min_occurency_count = min(characteristic_subsitution_counts.values())
f"Min ocurrency count through characteristic mutations: {min_occurency_count}"

'Min ocurrency count through characteristic mutations: 2'

In [32]:
f"Characteristic mutations occur in max {round((max_occurency_count/len(isolates_df)) * 100, 2)}% of all sequences."

'Characteristic mutations occur in max 0.64% of all sequences.'

In [33]:
f"Cleaned characteristic mutations occur in max {round((cleaned_max_occurency_count/len(isolates_df)) * 100, 2)}% of all sequences."

'Cleaned characteristic mutations occur in max 0.42% of all sequences.'

## Duesseldorf 202203w1

In [34]:
dataset_name = "duesseldorf_202203w1"

In [35]:
distance_matrix_df = pd.read_csv(f"../../data/{dataset_name}/distance_matrix.csv", delimiter=";", index_col="Unnamed: 0").sort_index()
distance_matrix = distance_matrix_df.to_numpy()

In [36]:
isolates_df = pd.read_csv(f"../../data/{dataset_name}/sequences_and_metadata.csv", delimiter=";").set_index("igs_id").sort_index()
mutations_df = get_mutations_from_dataframe(isolates_df).reset_index()

In [37]:
complete_graph = nx.Graph()
n = distance_matrix.shape[0]
complete_graph.add_nodes_from(range(n))
for i in range(n):
    for j in range(i + 1, n):
        distance = distance_matrix[i][j]
        complete_graph.add_edge(i, j, weight=distance)

In [38]:
mst = nx.minimum_spanning_tree(complete_graph, algorithm="prim")

In [39]:
partition = community_louvain.best_partition(mst)

In [40]:
communities = {}
for isolate_id, community_id in partition.items():
    if community_id not in communities:
        communities[community_id] = []
    communities[community_id].append(isolate_id)

In [41]:
cluster_mutations = []
tick_vals = []
tick_texts = []
for community_id, community_isolates in communities.items():
    mutation_strings = mutations_df.loc[community_isolates]["substitutions"].sum()
    mutation_counts = Counter(mutation_strings)
    cluster_mutations.append({k for k, v in mutation_counts.items() if v > 1})

## Analysing characteristic mutations

In [42]:
characteristic_mutations = []
for i, mutations_in_cluster in enumerate(cluster_mutations):
    before = cluster_mutations[:i]
    after = cluster_mutations[i+1:]
    union_other = set().union(*(before+after))
    characteristic_mutations.append(mutations_in_cluster - union_other)

In [43]:
characteristic_subsitution_counts = {}
substitution_counts = mutations_df["substitutions"].explode().value_counts()
for characteristic_mutations_in_cluster in characteristic_mutations:
    for substitution in characteristic_mutations_in_cluster:
        characteristic_subsitution_counts[substitution] = substitution_counts[substitution]

In [44]:
mean_occurency_count = sum(characteristic_subsitution_counts.values()) / len(characteristic_subsitution_counts)
f"Mean ocurrency count through characteristic mutations: {mean_occurency_count}"

'Mean ocurrency count through characteristic mutations: 3.789285714285714'

In [45]:
max_occurency_count = max(characteristic_subsitution_counts.values())
f"Max ocurrency count through characteristic mutations: {max_occurency_count}"

'Max ocurrency count through characteristic mutations: 17'

In [46]:
cleaned_max_occurency_count = round(np.percentile(list(characteristic_subsitution_counts.values()), 99))
f"Clean max ocurrency count through characteristic mutations: {cleaned_max_occurency_count}"

'Clean max ocurrency count through characteristic mutations: 13'

In [47]:
min_occurency_count = min(characteristic_subsitution_counts.values())
f"Min ocurrency count through characteristic mutations: {min_occurency_count}"

'Min ocurrency count through characteristic mutations: 2'

In [48]:
f"Characteristic mutations occur in max {round((max_occurency_count/len(isolates_df)) * 100, 2)}% of all sequences."

'Characteristic mutations occur in max 1.31% of all sequences.'

In [49]:
f"Cleaned characteristic mutations occur in max {round((cleaned_max_occurency_count/len(isolates_df)) * 100, 2)}% of all sequences."

'Cleaned characteristic mutations occur in max 1.0% of all sequences.'

## NRW 2022

In [50]:
dataset_name = "nrw_2022"

In [51]:
distance_matrix_df = pd.read_csv(f"../../data/{dataset_name}/distance_matrix.csv", delimiter=";", index_col="Unnamed: 0").sort_index()
distance_matrix = distance_matrix_df.to_numpy()

In [52]:
isolates_df = pd.read_csv(f"../../data/{dataset_name}/sequences_and_metadata.csv", delimiter=";").set_index("igs_id").sort_index()
mutations_df = get_mutations_from_dataframe(isolates_df).reset_index()

In [53]:
complete_graph = nx.Graph()
n = distance_matrix.shape[0]
complete_graph.add_nodes_from(range(n))
for i in range(n):
    for j in range(i + 1, n):
        distance = distance_matrix[i][j]
        complete_graph.add_edge(i, j, weight=distance)

In [54]:
mst = nx.minimum_spanning_tree(complete_graph, algorithm="prim")

In [55]:
partition = community_louvain.best_partition(mst)

In [56]:
communities = {}
for isolate_id, community_id in partition.items():
    if community_id not in communities:
        communities[community_id] = []
    communities[community_id].append(isolate_id)

In [57]:
cluster_mutations = []
tick_vals = []
tick_texts = []
for community_id, community_isolates in communities.items():
    mutation_strings = mutations_df.loc[community_isolates]["substitutions"].sum()
    mutation_counts = Counter(mutation_strings)
    cluster_mutations.append({k for k, v in mutation_counts.items() if v > 1})

## Analysing characteristic mutations

In [58]:
characteristic_mutations = []
for i, mutations_in_cluster in enumerate(cluster_mutations):
    before = cluster_mutations[:i]
    after = cluster_mutations[i+1:]
    union_other = set().union(*(before+after))
    characteristic_mutations.append(mutations_in_cluster - union_other)

In [59]:
characteristic_subsitution_counts = {}
substitution_counts = mutations_df["substitutions"].explode().value_counts()
for characteristic_mutations_in_cluster in characteristic_mutations:
    for substitution in characteristic_mutations_in_cluster:
        characteristic_subsitution_counts[substitution] = substitution_counts[substitution]

In [60]:
mean_occurency_count = sum(characteristic_subsitution_counts.values()) / len(characteristic_subsitution_counts)
f"Mean ocurrency count through characteristic mutations: {mean_occurency_count}"

'Mean ocurrency count through characteristic mutations: 5.022488755622189'

In [61]:
max_occurency_count = max(characteristic_subsitution_counts.values())
f"Max ocurrency count through characteristic mutations: {max_occurency_count}"

'Max ocurrency count through characteristic mutations: 55'

In [62]:
cleaned_max_occurency_count = round(np.percentile(list(characteristic_subsitution_counts.values()), 99))
f"Clean max ocurrency count through characteristic mutations: {cleaned_max_occurency_count}"

'Clean max ocurrency count through characteristic mutations: 30'

In [63]:
min_occurency_count = min(characteristic_subsitution_counts.values())
f"Min ocurrency count through characteristic mutations: {min_occurency_count}"

'Min ocurrency count through characteristic mutations: 2'

In [64]:
f"Characteristic mutations occur in max {round((max_occurency_count/len(isolates_df)) * 100, 2)}% of all sequences."

'Characteristic mutations occur in max 1.11% of all sequences.'

In [65]:
f"Cleaned characteristic mutations occur in max {round((cleaned_max_occurency_count/len(isolates_df)) * 100, 2)}% of all sequences."

'Cleaned characteristic mutations occur in max 0.61% of all sequences.'

## NRW 202203

In [66]:
dataset_name = "nrw_202203"

In [67]:
distance_matrix_df = pd.read_csv(f"../../data/{dataset_name}/distance_matrix.csv", delimiter=";", index_col="Unnamed: 0").sort_index()
distance_matrix = distance_matrix_df.to_numpy()

In [68]:
isolates_df = pd.read_csv(f"../../data/{dataset_name}/sequences_and_metadata.csv", delimiter=";").set_index("igs_id").sort_index()
mutations_df = get_mutations_from_dataframe(isolates_df).reset_index()

In [69]:
complete_graph = nx.Graph()
n = distance_matrix.shape[0]
complete_graph.add_nodes_from(range(n))
for i in range(n):
    for j in range(i + 1, n):
        distance = distance_matrix[i][j]
        complete_graph.add_edge(i, j, weight=distance)

In [70]:
mst = nx.minimum_spanning_tree(complete_graph, algorithm="prim")

In [71]:
partition = community_louvain.best_partition(mst)

In [72]:
communities = {}
for isolate_id, community_id in partition.items():
    if community_id not in communities:
        communities[community_id] = []
    communities[community_id].append(isolate_id)

In [73]:
cluster_mutations = []
tick_vals = []
tick_texts = []
for community_id, community_isolates in communities.items():
    mutation_strings = mutations_df.loc[community_isolates]["substitutions"].sum()
    mutation_counts = Counter(mutation_strings)
    cluster_mutations.append({k for k, v in mutation_counts.items() if v > 1})

## Analysing characteristic mutations

In [74]:
characteristic_mutations = []
for i, mutations_in_cluster in enumerate(cluster_mutations):
    before = cluster_mutations[:i]
    after = cluster_mutations[i+1:]
    union_other = set().union(*(before+after))
    characteristic_mutations.append(mutations_in_cluster - union_other)

In [75]:
characteristic_subsitution_counts = {}
substitution_counts = mutations_df["substitutions"].explode().value_counts()
for characteristic_mutations_in_cluster in characteristic_mutations:
    for substitution in characteristic_mutations_in_cluster:
        characteristic_subsitution_counts[substitution] = substitution_counts[substitution]

In [76]:
mean_occurency_count = sum(characteristic_subsitution_counts.values()) / len(characteristic_subsitution_counts)
f"Mean ocurrency count through characteristic mutations: {mean_occurency_count}"

'Mean ocurrency count through characteristic mutations: 4.332865168539326'

In [77]:
max_occurency_count = max(characteristic_subsitution_counts.values())
f"Max ocurrency count through characteristic mutations: {max_occurency_count}"

'Max ocurrency count through characteristic mutations: 22'

In [78]:
cleaned_max_occurency_count = round(np.percentile(list(characteristic_subsitution_counts.values()), 99))
f"Clean max ocurrency count through characteristic mutations: {cleaned_max_occurency_count}"

'Clean max ocurrency count through characteristic mutations: 14'

In [79]:
min_occurency_count = min(characteristic_subsitution_counts.values())
f"Min ocurrency count through characteristic mutations: {min_occurency_count}"

'Min ocurrency count through characteristic mutations: 2'

In [80]:
f"Characteristic mutations occur in max {round((max_occurency_count/len(isolates_df)) * 100, 2)}% of all sequences."

'Characteristic mutations occur in max 0.54% of all sequences.'

In [81]:
f"Cleaned characteristic mutations occur in max {round((cleaned_max_occurency_count/len(isolates_df)) * 100, 2)}% of all sequences."

'Cleaned characteristic mutations occur in max 0.35% of all sequences.'

## NRW 202203w1

In [82]:
dataset_name = "nrw_202203w1"

In [83]:
distance_matrix_df = pd.read_csv(f"../../data/{dataset_name}/distance_matrix.csv", delimiter=";", index_col="Unnamed: 0").sort_index()
distance_matrix = distance_matrix_df.to_numpy()

In [84]:
isolates_df = pd.read_csv(f"../../data/{dataset_name}/sequences_and_metadata.csv", delimiter=";").set_index("igs_id").sort_index()
mutations_df = get_mutations_from_dataframe(isolates_df).reset_index()

In [85]:
complete_graph = nx.Graph()
n = distance_matrix.shape[0]
complete_graph.add_nodes_from(range(n))
for i in range(n):
    for j in range(i + 1, n):
        distance = distance_matrix[i][j]
        complete_graph.add_edge(i, j, weight=distance)

In [86]:
mst = nx.minimum_spanning_tree(complete_graph, algorithm="prim")

In [87]:
partition = community_louvain.best_partition(mst)

In [88]:
communities = {}
for isolate_id, community_id in partition.items():
    if community_id not in communities:
        communities[community_id] = []
    communities[community_id].append(isolate_id)

In [89]:
cluster_mutations = []
tick_vals = []
tick_texts = []
for community_id, community_isolates in communities.items():
    mutation_strings = mutations_df.loc[community_isolates]["substitutions"].sum()
    mutation_counts = Counter(mutation_strings)
    cluster_mutations.append({k for k, v in mutation_counts.items() if v > 1})

## Analysing characteristic mutations

In [90]:
characteristic_mutations = []
for i, mutations_in_cluster in enumerate(cluster_mutations):
    before = cluster_mutations[:i]
    after = cluster_mutations[i+1:]
    union_other = set().union(*(before+after))
    characteristic_mutations.append(mutations_in_cluster - union_other)

In [91]:
characteristic_subsitution_counts = {}
substitution_counts = mutations_df["substitutions"].explode().value_counts()
for characteristic_mutations_in_cluster in characteristic_mutations:
    for substitution in characteristic_mutations_in_cluster:
        characteristic_subsitution_counts[substitution] = substitution_counts[substitution]

In [92]:
mean_occurency_count = sum(characteristic_subsitution_counts.values()) / len(characteristic_subsitution_counts)
f"Mean ocurrency count through characteristic mutations: {mean_occurency_count}"

'Mean ocurrency count through characteristic mutations: 4.0013927576601676'

In [93]:
max_occurency_count = max(characteristic_subsitution_counts.values())
f"Max ocurrency count through characteristic mutations: {max_occurency_count}"

'Max ocurrency count through characteristic mutations: 23'

In [94]:
cleaned_max_occurency_count = round(np.percentile(list(characteristic_subsitution_counts.values()), 99))
f"Clean max ocurrency count through characteristic mutations: {cleaned_max_occurency_count}"

'Clean max ocurrency count through characteristic mutations: 13'

In [95]:
min_occurency_count = min(characteristic_subsitution_counts.values())
f"Min ocurrency count through characteristic mutations: {min_occurency_count}"

'Min ocurrency count through characteristic mutations: 2'

In [96]:
f"Characteristic mutations occur in max {round((max_occurency_count/len(isolates_df)) * 100, 2)}% of all sequences."

'Characteristic mutations occur in max 0.57% of all sequences.'

In [97]:
f"Cleaned characteristic mutations occur in max {round((cleaned_max_occurency_count/len(isolates_df)) * 100, 2)}% of all sequences."

'Cleaned characteristic mutations occur in max 0.32% of all sequences.'

## Germany 2022

In [98]:
dataset_name = "germany_2022"

In [99]:
distance_matrix_df = pd.read_csv(f"../../data/{dataset_name}/distance_matrix.csv", delimiter=";", index_col="Unnamed: 0").sort_index()
distance_matrix = distance_matrix_df.to_numpy()

In [100]:
isolates_df = pd.read_csv(f"../../data/{dataset_name}/sequences_and_metadata.csv", delimiter=";").set_index("igs_id").sort_index()
mutations_df = get_mutations_from_dataframe(isolates_df).reset_index()

In [101]:
complete_graph = nx.Graph()
n = distance_matrix.shape[0]
complete_graph.add_nodes_from(range(n))
for i in range(n):
    for j in range(i + 1, n):
        distance = distance_matrix[i][j]
        complete_graph.add_edge(i, j, weight=distance)

In [None]:
mst = nx.minimum_spanning_tree(complete_graph, algorithm="prim")

In [None]:
partition = community_louvain.best_partition(mst)

In [None]:
communities = {}
for isolate_id, community_id in partition.items():
    if community_id not in communities:
        communities[community_id] = []
    communities[community_id].append(isolate_id)

In [None]:
cluster_mutations = []
tick_vals = []
tick_texts = []
for community_id, community_isolates in communities.items():
    mutation_strings = mutations_df.loc[community_isolates]["substitutions"].sum()
    mutation_counts = Counter(mutation_strings)
    cluster_mutations.append({k for k, v in mutation_counts.items() if v > 1})

## Analysing characteristic mutations

In [None]:
characteristic_mutations = []
for i, mutations_in_cluster in enumerate(cluster_mutations):
    before = cluster_mutations[:i]
    after = cluster_mutations[i+1:]
    union_other = set().union(*(before+after))
    characteristic_mutations.append(mutations_in_cluster - union_other)

In [None]:
characteristic_subsitution_counts = {}
substitution_counts = mutations_df["substitutions"].explode().value_counts()
for characteristic_mutations_in_cluster in characteristic_mutations:
    for substitution in characteristic_mutations_in_cluster:
        characteristic_subsitution_counts[substitution] = substitution_counts[substitution]

In [None]:
mean_occurency_count = sum(characteristic_subsitution_counts.values()) / len(characteristic_subsitution_counts)
f"Mean ocurrency count through characteristic mutations: {mean_occurency_count}"

In [None]:
max_occurency_count = max(characteristic_subsitution_counts.values())
f"Max ocurrency count through characteristic mutations: {max_occurency_count}"

In [None]:
cleaned_max_occurency_count = round(np.percentile(list(characteristic_subsitution_counts.values()), 99))
f"Clean max ocurrency count through characteristic mutations: {cleaned_max_occurency_count}"

In [None]:
min_occurency_count = min(characteristic_subsitution_counts.values())
f"Min ocurrency count through characteristic mutations: {min_occurency_count}"

In [None]:
f"Characteristic mutations occur in max {round((max_occurency_count/len(isolates_df)) * 100, 2)}% of all sequences."

In [None]:
f"Cleaned characteristic mutations occur in max {round((cleaned_max_occurency_count/len(isolates_df)) * 100, 2)}% of all sequences."

## Germany 202203

In [None]:
dataset_name = "germany_202203"

In [None]:
distance_matrix_df = pd.read_csv(f"../../data/{dataset_name}/distance_matrix.csv", delimiter=";", index_col="Unnamed: 0").sort_index()
distance_matrix = distance_matrix_df.to_numpy()

In [None]:
isolates_df = pd.read_csv(f"../../data/{dataset_name}/sequences_and_metadata.csv", delimiter=";").set_index("igs_id").sort_index()
mutations_df = get_mutations_from_dataframe(isolates_df).reset_index()

In [None]:
complete_graph = nx.Graph()
n = distance_matrix.shape[0]
complete_graph.add_nodes_from(range(n))
for i in range(n):
    for j in range(i + 1, n):
        distance = distance_matrix[i][j]
        complete_graph.add_edge(i, j, weight=distance)

In [None]:
mst = nx.minimum_spanning_tree(complete_graph, algorithm="prim")

In [None]:
partition = community_louvain.best_partition(mst)

In [None]:
communities = {}
for isolate_id, community_id in partition.items():
    if community_id not in communities:
        communities[community_id] = []
    communities[community_id].append(isolate_id)

In [None]:
cluster_mutations = []
tick_vals = []
tick_texts = []
for community_id, community_isolates in communities.items():
    mutation_strings = mutations_df.loc[community_isolates]["substitutions"].sum()
    mutation_counts = Counter(mutation_strings)
    cluster_mutations.append({k for k, v in mutation_counts.items() if v > 1})

## Analysing characteristic mutations

In [None]:
characteristic_mutations = []
for i, mutations_in_cluster in enumerate(cluster_mutations):
    before = cluster_mutations[:i]
    after = cluster_mutations[i+1:]
    union_other = set().union(*(before+after))
    characteristic_mutations.append(mutations_in_cluster - union_other)

In [None]:
characteristic_subsitution_counts = {}
substitution_counts = mutations_df["substitutions"].explode().value_counts()
for characteristic_mutations_in_cluster in characteristic_mutations:
    for substitution in characteristic_mutations_in_cluster:
        characteristic_subsitution_counts[substitution] = substitution_counts[substitution]

In [None]:
mean_occurency_count = sum(characteristic_subsitution_counts.values()) / len(characteristic_subsitution_counts)
f"Mean ocurrency count through characteristic mutations: {mean_occurency_count}"

In [None]:
max_occurency_count = max(characteristic_subsitution_counts.values())
f"Max ocurrency count through characteristic mutations: {max_occurency_count}"

In [None]:
cleaned_max_occurency_count = round(np.percentile(list(characteristic_subsitution_counts.values()), 99))
f"Clean max ocurrency count through characteristic mutations: {cleaned_max_occurency_count}"

In [None]:
min_occurency_count = min(characteristic_subsitution_counts.values())
f"Min ocurrency count through characteristic mutations: {min_occurency_count}"

In [None]:
f"Characteristic mutations occur in max {round((max_occurency_count/len(isolates_df)) * 100, 2)}% of all sequences."

In [None]:
f"Cleaned characteristic mutations occur in max {round((cleaned_max_occurency_count/len(isolates_df)) * 100, 2)}% of all sequences."

## Germany 202203w1

In [None]:
dataset_name = "germany_202203w1"

In [None]:
distance_matrix_df = pd.read_csv(f"../../data/{dataset_name}/distance_matrix.csv", delimiter=";", index_col="Unnamed: 0").sort_index()
distance_matrix = distance_matrix_df.to_numpy()

In [None]:
isolates_df = pd.read_csv(f"../../data/{dataset_name}/sequences_and_metadata.csv", delimiter=";").set_index("igs_id").sort_index()
mutations_df = get_mutations_from_dataframe(isolates_df).reset_index()

In [None]:
complete_graph = nx.Graph()
n = distance_matrix.shape[0]
complete_graph.add_nodes_from(range(n))
for i in range(n):
    for j in range(i + 1, n):
        distance = distance_matrix[i][j]
        complete_graph.add_edge(i, j, weight=distance)

In [None]:
mst = nx.minimum_spanning_tree(complete_graph, algorithm="prim")

In [None]:
partition = community_louvain.best_partition(mst)

In [None]:
communities = {}
for isolate_id, community_id in partition.items():
    if community_id not in communities:
        communities[community_id] = []
    communities[community_id].append(isolate_id)

In [None]:
cluster_mutations = []
tick_vals = []
tick_texts = []
for community_id, community_isolates in communities.items():
    mutation_strings = mutations_df.loc[community_isolates]["substitutions"].sum()
    mutation_counts = Counter(mutation_strings)
    cluster_mutations.append({k for k, v in mutation_counts.items() if v > 1})

## Analysing characteristic mutations

In [None]:
characteristic_mutations = []
for i, mutations_in_cluster in enumerate(cluster_mutations):
    before = cluster_mutations[:i]
    after = cluster_mutations[i+1:]
    union_other = set().union(*(before+after))
    characteristic_mutations.append(mutations_in_cluster - union_other)

In [None]:
characteristic_subsitution_counts = {}
substitution_counts = mutations_df["substitutions"].explode().value_counts()
for characteristic_mutations_in_cluster in characteristic_mutations:
    for substitution in characteristic_mutations_in_cluster:
        characteristic_subsitution_counts[substitution] = substitution_counts[substitution]

In [None]:
mean_occurency_count = sum(characteristic_subsitution_counts.values()) / len(characteristic_subsitution_counts)
f"Mean ocurrency count through characteristic mutations: {mean_occurency_count}"

In [None]:
max_occurency_count = max(characteristic_subsitution_counts.values())
f"Max ocurrency count through characteristic mutations: {max_occurency_count}"

In [None]:
cleaned_max_occurency_count = round(np.percentile(list(characteristic_subsitution_counts.values()), 99))
f"Clean max ocurrency count through characteristic mutations: {cleaned_max_occurency_count}"

In [None]:
min_occurency_count = min(characteristic_subsitution_counts.values())
f"Min ocurrency count through characteristic mutations: {min_occurency_count}"

In [None]:
f"Characteristic mutations occur in max {round((max_occurency_count/len(isolates_df)) * 100, 2)}% of all sequences."

In [None]:
f"Cleaned characteristic mutations occur in max {round((cleaned_max_occurency_count/len(isolates_df)) * 100, 2)}% of all sequences."