# Figure 6
This Jupyter notebook prepares the data used to create the phylogenetic tree with the gene heatmap for the scaffolded isolates.

In [150]:
import pickle
import pandas as pd
import seaborn as sns
import colorcet as cc

In [151]:
# Relative paths of the input files read by the cells below.
# Scaffolded annotation table (CSV); its Gene / Isolate / plasmid_name columns are used.
scaffold_filename = "../0_Data/0_Raw/scaffolds/scaffolding_panaroo_no_merge__contigs_genes_annotations_scaffolded.csv"
# Pickled mapping from replicon name to color, used to fill the "color" column.
plasmidcolor_filename = "../0_Data/1_Intermediate/plasmid_color_scheme.pkl" # generated in `Figure1a.ipynb`
# Tab-separated table with group, gene, percent_ident and alignment_length columns.
genetobb_filename = "../0_Data/0_Raw/scaffolds/scaffolding_panaroo_no_merge__panaroo_groupID_to_BBgene.tsv"
# Tab-separated table with (at least) gene and start columns; gene values carry a "gene-" prefix.
bbcoords_filename = "../0_Data/0_Raw/bb_rs_gene_coords_b31_v1.tsv"

In [152]:
# read the scaffolded annotation table, harmonise its column names,
# and keep only the three columns the tree data needs
column_renames = {
    "Gene": "gene",
    "Isolate": "assembly",
    "plasmid_name": "replicon_name",
}
scaffold = pd.read_csv(scaffold_filename).rename(columns=column_renames)
scaffold_tree_df = scaffold.loc[:, ["gene", "assembly", "replicon_name"]].copy()
scaffold_tree_df.head()

Unnamed: 0,gene,assembly,replicon_name
0,lacF,UNY191,cp26
1,guaB,UNY191,cp26
2,group_1602,UNY191,cp26
3,licB,UNY191,cp26
4,guaA~~~guaA_2,UNY191,cp26


In [153]:
# read the replicon -> color mapping and attach a color to every row;
# replicons that are absent from the mapping get an empty string
with open(plasmidcolor_filename, "rb") as color_handle:
    plasmid_color_scheme = pickle.load(color_handle)
scaffold_tree_df["color"] = scaffold_tree_df["replicon_name"].apply(
    lambda replicon: plasmid_color_scheme.get(replicon, "")
)

In [154]:
# list the replicons that received no color (empty string) from the color scheme
scaffold_tree_df.loc[scaffold_tree_df["color"] == "", "replicon_name"].unique()

array(['cp32-5-1', 'cp32-5+1', 'lp36-28-4'], dtype=object)

In [155]:
# build the 50-color glasbey_light palette once, print its hex codes,
# then leave it as the cell's last expression so it is also displayed
palette_hex = sns.color_palette(cc.glasbey_light, n_colors=50).as_hex()
print(palette_hex)
palette_hex

['#d60000', '#018700', '#b500ff', '#05acc6', '#97ff00', '#ffa52f', '#ff8ec8', '#79525e', '#00fdcf', '#afa5ff', '#93ac83', '#9a6900', '#366962', '#d3008c', '#fdf490', '#c86e66', '#9ee2ff', '#00c846', '#a877ac', '#b8ba01', '#f4bfb1', '#ff28fd', '#f2cdff', '#009e7c', '#ff6200', '#56642a', '#953f1f', '#90318e', '#ff3464', '#a0e491', '#8c9ab1', '#829026', '#ae083f', '#77c6ba', '#bc9157', '#e48eff', '#72b8ff', '#c6a5c1', '#ff9070', '#d3c37c', '#bceddb', '#6b8567', '#916e56', '#f9ff00', '#bac1df', '#ac567c', '#ffcd03', '#ff49b1', '#c15603', '#5d8c90']


In [156]:
# manually assign colors to the replicons that had none in the color scheme
manual_replicon_colors = {
    "cp32-5+1": "#ff49b1",
    # NOTE(review): "#0000FF" is not one of the glasbey_light hex codes printed above — confirm it is intended
    "cp32-5-1": "#0000FF",
    "lp36-28-4": "#5d8c90",
}
for replicon, hex_code in manual_replicon_colors.items():
    is_replicon = scaffold_tree_df["replicon_name"] == replicon
    scaffold_tree_df.loc[is_replicon, "color"] = hex_code

scaffold_tree_df

Unnamed: 0,gene,assembly,replicon_name,color
0,lacF,UNY191,cp26,#d60000
1,guaB,UNY191,cp26,#d60000
2,group_1602,UNY191,cp26,#d60000
3,licB,UNY191,cp26,#d60000
4,guaA~~~guaA_2,UNY191,cp26,#d60000
...,...,...,...,...
355952,group_189,UMA12,lp28-6,#a877ac
355953,group_1101,UMA12,cp32-11,#9a6900
355954,group_700,UMA12,lp38,#00fdcf
355955,group_90,UMA12,lp38,#00fdcf


In [157]:
# one row per distinct (assembly, replicon) pair
presence_columns = ["assembly", "replicon_name"]
plasmid_presence = scaffold_tree_df.loc[:, presence_columns].drop_duplicates()
plasmid_presence

Unnamed: 0,assembly,replicon_name
0,UNY191,cp26
10,UNY191,chromosome
11,UNY191,lp54
327,UNY191,lp17
803,UNY191,lp28-4
...,...,...
355695,UMA12,cp32-12
355712,UMA12,lp38
355729,UMA12,cp32-1+5
355763,UMA12,cp32-8


In [158]:
# count how many rows each gene has on each replicon (long format)
pair_counts = (
    scaffold_tree_df
    .groupby(['gene', 'replicon_name'])
    .size()
    .reset_index(name='count')
)

# for every gene, keep the row of the replicon with the highest count
top_row_labels = pair_counts.groupby('gene')['count'].idxmax()
best_replicon = pair_counts.loc[top_row_labels]
best_replicon_dict = dict(zip(best_replicon["gene"], best_replicon["replicon_name"]))

# wide gene x replicon table of counts, with 0 where a gene never occurs on a replicon
replicon_counts = (
    pair_counts
    .pivot_table(index='gene', columns='replicon_name', values='count', fill_value=0)
    .reset_index()
)
replicon_counts

replicon_name,gene,chromosome,cp26,cp32-1,cp32-1+5,cp32-10,cp32-11,cp32-12,cp32-13,cp32-2,...,lp28-7,lp28-8,lp28-9,lp32-3,lp36,lp36-28-4,lp38,lp5,lp54,lp56
0,GFM1~~~fusA_2,298.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,PdeB,297.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ackA,293.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,acpP,298.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ade,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,22.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2479,ywlC_1~~~ywlC,298.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2480,ywlC_2,78.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,10.0,15.0,0.0,0.0
2481,zupT,297.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2482,zwf,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [159]:
# order replicons by the number of assemblies they occur in (most frequent first)
ordered_plasmids = list(plasmid_presence["replicon_name"].value_counts().index)

# most common replicon for each gene (first mode when several are tied)
most_frequent_replicon_series = scaffold_tree_df.groupby('gene')['replicon_name'].agg(
    lambda names: names.mode().iloc[0]
)

# arrange the gene -> replicon mapping in `ordered_plasmids` order;
# replicons missing from that list are pushed to the end
plasmid_rank = {name: rank for rank, name in enumerate(ordered_plasmids)}
replicon_dict = dict(sorted(
    most_frequent_replicon_series.to_dict().items(),
    key=lambda pair: plasmid_rank.get(pair[1], float('inf'))
))
replicon_dict

{'celB~~~gmuC~~~licC~~~lacE': 'cp26',
 'glcB~~~ptsG_3': 'cp26',
 'group_1595': 'cp26',
 'group_1596': 'cp26',
 'group_1597': 'cp26',
 'group_1598': 'cp26',
 'group_1599': 'cp26',
 'group_1600': 'cp26',
 'group_1601': 'cp26',
 'group_1602': 'cp26',
 'group_1603': 'cp26',
 'group_1604': 'cp26',
 'group_1605': 'cp26',
 'group_1606': 'cp26',
 'group_1607': 'cp26',
 'group_1608': 'cp26',
 'group_1609': 'cp26',
 'group_1610': 'cp26',
 'group_1611': 'cp26',
 'group_1641': 'cp26',
 'guaA~~~guaA_2': 'cp26',
 'guaB': 'cp26',
 'lacF': 'cp26',
 'licB': 'cp26',
 'oppA_5~~~oppA_2~~~oppA_4': 'cp26',
 'ospC~~~ospC_1': 'cp26',
 'pbuG_2~~~adeP~~~adeQ~~~pbuG': 'cp26',
 'pbuG~~~adeP~~~adeQ_1~~~adeP_1': 'cp26',
 'resT~~~resT_1': 'cp26',
 'yccX': 'cp26',
 'GFM1~~~fusA_2': 'chromosome',
 'PdeB': 'chromosome',
 'ackA': 'chromosome',
 'acpP': 'chromosome',
 'adk': 'chromosome',
 'alaS': 'chromosome',
 'alr1~~~alr': 'chromosome',
 'apeA_1~~~apeA': 'chromosome',
 'apeB': 'chromosome',
 'apt': 'chromosome',
 'arc

In [160]:
# find coordinates for genes based on the B31 reference
# Rank the candidate BB genes of every group by percent identity, then by
# alignment length (both descending), and keep only the top-ranked gene per group.
bb = pd.read_csv(genetobb_filename, sep="\t")
bb = bb.sort_values(by=["group", "percent_ident", "alignment_length"], ascending=[True, False, False]).reset_index(drop=True)
bb = bb[["group", "gene"]].drop_duplicates(keep="first").reset_index(drop=True)
bb = bb[bb.groupby("group").cumcount() == 0].reset_index(drop=True)

# Load the reference coordinates and strip the "gene-" prefix so the gene
# identifiers can be joined against the group -> BB gene table.
coords = pd.read_csv(bbcoords_filename, sep="\t")
coords['gene'] = coords['gene'].str.replace('gene-', '', regex=False)
coords = coords.drop_duplicates().reset_index(drop=True)
bb_coords = bb.merge(coords, on="gene", how="left")

# The left merge leaves `start` as NaN for BB genes that are absent from the
# coordinate table. Leave those groups out of the lookup so that code testing
# `gene in bb_coords_dict` applies its own default instead of receiving NaN.
bb_coords_placed = bb_coords.dropna(subset=["start"])
bb_coords_dict = dict(zip(bb_coords_placed["group"], bb_coords_placed["start"]))
bb_coords_dict

{'PdeB': 384287,
 'ade': 10301,
 'ade_1': 22678,
 'ade_1~~~ade_4~~~ade_6': 10301,
 'ade_2': 10301,
 'ade_4~~~ade_3~~~ade_2~~~ade_5': 10301,
 'adk': 427987,
 'alaS': 224826,
 'alr1~~~alr': 160090,
 'apeA_1~~~apeA': 374277,
 'argF': 900799,
 'arnC': 585080,
 'asnS': 98817,
 'aspS': 465518,
 'atoC': 804417,
 'atpD': 89200,
 'atpE': 93433,
 'bepA_2~~~bepA_1~~~bepA~~~cpoB~~~lapB_1': 212061,
 'bglK': 879315,
 'btuD_1': 146377,
 'btuD_2~~~mglA_1': 323776,
 'carD': 365115,
 'cbpA': 694693,
 'cdaA': 8427,
 'celB~~~gmuC~~~licC~~~lacE': 2476,
 'cheA': 578277,
 'cheB_1~~~cheB_2': 425584,
 'cheB_3': 580486,
 'cheB_3~~~cph2~~~cheB_2~~~cph2_1': 429693,
 'cheR1~~~cheR': 39826,
 'cheR_2~~~cheR1': 424694,
 'cheW_1': 319106,
 'cheW_2': 577395,
 'cheY_1': 561563,
 'clpA~~~clpB': 377039,
 'clpY': 303774,
 'cmk_1': 124149,
 'coaE': 557407,
 'coaX': 537520,
 'cof~~~yigL_1': 63147,
 'comM': 81405,
 'corC': 55501,
 'csd_1~~~csd_2~~~csd': 79505,
 'csrA': 186949,
 'ctpB': 366964,
 'cysQ': 534301,
 'dbpA~~~dbpA_1

In [161]:
# format input data for scaffold tree
# position used for genes that have no entry in bb_coords_dict
unplaced_pos = 1000000

# x = number of rows carrying each gene
# NOTE(review): this counts rows, not distinct assemblies — confirm a gene cannot repeat within an assembly
scaffold_tree_df["x"] = scaffold_tree_df.assign(x=1).groupby("gene")["x"].transform("sum")
scaffold_tree_df["best_replicon"] = [best_replicon_dict[gene] for gene in scaffold_tree_df["gene"]]
scaffold_tree_df["pos"] = [bb_coords_dict.get(gene, unplaced_pos) for gene in scaffold_tree_df["gene"]]

# make both replicon columns ordered categoricals so sorting follows `ordered_plasmids`
for replicon_column in ("replicon_name", "best_replicon"):
    scaffold_tree_df[replicon_column] = pd.Categorical(
        scaffold_tree_df[replicon_column], categories=ordered_plasmids, ordered=True
    )

# sort by best replicon, then position, then descending row count, then gene name
sort_columns = ["best_replicon", "pos", "x", "gene"]
sort_ascending = [True, True, False, True]
scaffold_tree_df = scaffold_tree_df.sort_values(by=sort_columns, ascending=sort_ascending).reset_index(drop=True)

# prevalence = rows per gene divided by the number of distinct assemblies
assembly_total = len(scaffold_tree_df["assembly"].unique())
scaffold_tree_df["prevalence"] = scaffold_tree_df["x"] / assembly_total
scaffold_tree_df

Unnamed: 0,gene,assembly,replicon_name,color,x,best_replicon,pos,prevalence
0,yccX,UNY191,cp26,#d60000,297,cp26,46,0.993311
1,yccX,ESI315,cp26,#d60000,297,cp26,46,0.993311
2,yccX,UNY144,cp26,#d60000,297,cp26,46,0.993311
3,yccX,UWI248,cp26,#d60000,297,cp26,46,0.993311
4,yccX,UWI225,cp26,#d60000,297,cp26,46,0.993311
...,...,...,...,...,...,...,...,...
355952,group_901,ESI28,lp28-9,#e48eff,5,lp28-9,1000000,0.016722
355953,group_959,ESI26,lp28-9,#e48eff,4,lp28-9,1000000,0.013378
355954,group_959,ESI284,lp28-2,#f4bfb1,4,lp28-9,1000000,0.013378
355955,group_959,ESI309,lp28-9,#e48eff,4,lp28-9,1000000,0.013378


In [162]:
# write the formatted table to disk without the row index
scaffoldtree_filename = "../0_Data/2_Processed/ScaffoldTree.csv"
scaffold_tree_df.to_csv(scaffoldtree_filename, index=False)