In [324]:
import pandas as pd
import altair as alt
from altair import datum
import numpy as np
import matplotlib as plt
from datetime import datetime as dt
from scipy.stats import ttest_ind
alt.data_transformers.disable_max_rows()


DataTransformerRegistry.enable('default')

## exploring size of transmission clusters

In [91]:
new_clus =  "../../mascot_glm/data/kc_clusters_combined_new.tsv"

In [240]:
clus_df = pd.read_csv(new_clus, sep="\t", parse_dates=  [3])

In [241]:
clus_df

Unnamed: 0,strain,cluster,location,date,is_unique,new_clusters
0,USA/WA-UW-21050529358/2021,1,North_King_County,2021-05-05,True,1
1,USA/WA-UW-21050476716/2021,2,North_King_County,2021-05-04,True,2
2,USA/WA-UW-2021041397284/2021,3,North_King_County,2021-04-13,True,3
3,USA/WA-UW-2021041489375/2021,3,North_King_County,2021-04-14,False,3
4,USA/WA-Altius-ALTCOV-NBDJ5QXGCUU6C2F6/2021,3,South_King_County,2021-04-27,False,3
...,...,...,...,...,...,...
11374,USA/WA-CDC-UW22030161746/2022,1423,North_King_County,2022-03-01,True,5962
11375,USA/WA-CDC-UW22030214877/2022,1424,North_King_County,2022-03-02,True,5963
11376,USA/WA-CDC-UW22030572755/2022,1424,North_King_County,2022-03-05,False,5963
11377,USA/WA-CDC-UW22022865732/2022,1425,North_King_County,2022-02-28,True,5964


In [250]:
cluster_counts_n = pd.DataFrame()
cluster_counts_s = pd.DataFrame()
cluster_counts_n['count_clus'] = clus_df[clus_df['location'] =="North_King_County"].new_clusters.value_counts()
cluster_counts_s['count_clus'] = clus_df[clus_df['location'] =="South_King_County"].new_clusters.value_counts()


In [251]:
cluster_counts_n = cluster_counts_n.reset_index()
cluster_counts_n = cluster_counts_n.rename(columns={"index": "clusid", "count_clus": "size", "location": "Region"})
cluster_counts_s = cluster_counts_s.reset_index()
cluster_counts_s = cluster_counts_s.rename(columns={"index": "clusid", "count_clus": "size", "location": "Region"})

cluster_counts_n["Region"] = "North King County"
cluster_counts_s["Region"] = "South King County"

In [252]:
concat_clus = pd.concat([ cluster_counts_n, cluster_counts_s], ignore_index=True)
#concat_clus
concat_clus = concat_clus[["size", "Region"]]
#test.to_csv("cluster_distribution.tsv",  sep="\t")

In [253]:
concat_clus

Unnamed: 0,size,Region
0,149,North King County
1,122,North King County
2,58,North King County
3,48,North King County
4,46,North King County
...,...,...
6608,1,South King County
6609,1,South King County
6610,1,South King County
6611,1,South King County


In [254]:
df_size = pd.DataFrame(concat_clus.groupby('Region', as_index = False).value_counts())#.reset_index()
df_size['perc'] = df_size.groupby('Region')['count'].apply(lambda x: x/x.sum())

df_size


Unnamed: 0,Region,size,count,perc
0,North King County,1,2999,0.788795
1,North King County,2,437,0.11494
2,North King County,3,172,0.045239
3,North King County,4,57,0.014992
4,North King County,5,43,0.01131
5,North King County,6,25,0.006575
6,North King County,7,15,0.003945
7,North King County,8,9,0.002367
8,North King County,10,9,0.002367
9,North King County,13,5,0.001315


Region
North King County    1.613361
South King County    1.865884
Name: contribution, dtype: float64

Ttest_indResult(statistic=-1.9757115085592991, pvalue=0.048228979458211675)

In [309]:
log_clusters =  alt.Chart(df_size).mark_line(opacity = 1, interpolate = "monotone").encode(
    x=alt.X('size:Q', axis=alt.Axis(title="Cluster Size", grid=False)),
    y=alt.Y('perc:Q',  axis=alt.Axis(title="Log Frequency Distribution",labelFontSize= 14, titleFontSize = 12, grid=False, format='%'), scale=alt.Scale(type = "log", zero = True)),
    color = alt.Color("Region:N", legend=alt.Legend(offset = -140,  labelFontSize = 14, titleFontSize = 12))).transform_filter(datum.size < 11).properties(
    width=300,
    height=200
)
log_clusters

In [310]:
linear_clusters =  alt.Chart(df_size).mark_line(opacity = 1, interpolate = "monotone").encode(
    x=alt.X('size:Q', axis=alt.Axis(title="Cluster Size", grid=False)),
    y=alt.Y('perc:Q',  axis=alt.Axis(title="Frequency Distribution",labelFontSize= 14, titleFontSize = 12, grid=False, format='%')),
    color = alt.Color("Region:N")).transform_filter(datum.size < 11).properties(
    width=300,
    height=200
)
linear_clusters

In [311]:
cluster_count = linear_clusters | log_clusters
cluster_count

In [None]:
#calculate mean cluster size
df_size["contribution"] = df_size["size"] * df_size['perc']
df_size.groupby('Region')["contribution"].sum()

In [None]:
#two sample t test for cluster size 
#define samples
group1 = concat_clus[concat_clus['Region']=='North King County']
group2 = concat_clus[concat_clus['Region']=='South King County']

#perform independent two sample t-test
ttest_ind(group1['size'], group2['size'])


# now look at number of clusters over time for each region

In [312]:
date_clus = clus_df.groupby('new_clusters').first().reset_index()

In [313]:
date_clus

Unnamed: 0,new_clusters,strain,cluster,location,date,is_unique
0,1,USA/WA-UW-21050529358/2021,1,North_King_County,2021-05-05,True
1,2,USA/WA-UW-21050476716/2021,2,North_King_County,2021-05-04,True
2,3,USA/WA-UW-2021041397284/2021,3,North_King_County,2021-04-13,True
3,4,USA/WA-UW-65398/2021,4,North_King_County,2021-02-27,True
4,5,USA/WA-UW-69277/2021,5,South_King_County,2021-03-26,True
...,...,...,...,...,...,...
5959,5960,USA/WA-Altius-ALTCOV-BEQRS2ML7ZX3EQWJ/2021,1421,South_King_County,2021-07-09,True
5960,5961,USA/WA-S2728/2020,1422,North_King_County,2020-07-08,True
5961,5962,USA/WA-CDC-UW22030161746/2022,1423,North_King_County,2022-03-01,True
5962,5963,USA/WA-CDC-UW22030214877/2022,1424,North_King_County,2022-03-02,True


In [314]:
def convert_format(number):
    #date = dt.strptime(number, '%Y-%m-%d')
    date = dt.strftime(number, '%Y-%m')
    return date

In [315]:
date_clus['year-month'] = date_clus['date'].map(convert_format)

In [316]:
date_clus

Unnamed: 0,new_clusters,strain,cluster,location,date,is_unique,year-month
0,1,USA/WA-UW-21050529358/2021,1,North_King_County,2021-05-05,True,2021-05
1,2,USA/WA-UW-21050476716/2021,2,North_King_County,2021-05-04,True,2021-05
2,3,USA/WA-UW-2021041397284/2021,3,North_King_County,2021-04-13,True,2021-04
3,4,USA/WA-UW-65398/2021,4,North_King_County,2021-02-27,True,2021-02
4,5,USA/WA-UW-69277/2021,5,South_King_County,2021-03-26,True,2021-03
...,...,...,...,...,...,...,...
5959,5960,USA/WA-Altius-ALTCOV-BEQRS2ML7ZX3EQWJ/2021,1421,South_King_County,2021-07-09,True,2021-07
5960,5961,USA/WA-S2728/2020,1422,North_King_County,2020-07-08,True,2020-07
5961,5962,USA/WA-CDC-UW22030161746/2022,1423,North_King_County,2022-03-01,True,2022-03
5962,5963,USA/WA-CDC-UW22030214877/2022,1424,North_King_County,2022-03-02,True,2022-03


In [317]:
date_clus.location[date_clus.location == "North_King_County"] = "North King County"
date_clus.location[date_clus.location == "South_King_County"] = "South King County"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  date_clus.location[date_clus.location == "North_King_County"] = "North King County"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  date_clus.location[date_clus.location == "South_King_County"] = "South King County"


In [318]:
clust_over_time =  alt.Chart(date_clus).mark_bar(opacity = 1, interpolate='monotone', size = 10).encode(
    alt.X('year-month:T', axis = alt.Axis(grid= False, format = "%B %Y", title = "")),
    alt.Y('count():Q', axis=alt.Axis(title="Number of Local Outbreak Clusters", grid=False, labelFontSize = 14), stack = None), 
    alt.Color("location", legend = None), 
    column= alt.Column("location:N", title = None, spacing = 70, header=alt.Header(labelFontSize = 14))).properties(
    width=300,
    height=200
)

In [319]:
clus_des = (clust_over_time & cluster_count).resolve_scale(color = "independent")
clus_des

In [320]:
clus_des.save("../figures/clust_over_time.html")