# Figure 1
Jupyter notebook corresponding to the input data for the phylogenetic tree with annotated typing schemes and plasmid presence/absence.

In [1]:
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import colorcet as cc

In [2]:
plasmid_file = "../0_Data/0_Raw/best_hits_1000bp_v11.csv"

## Data Processing

In [3]:
plasmids = pd.read_csv(plasmid_file)
plasmids = plasmids.rename(columns={"assembly_id": "Isolate"})

# clean isolate names
plasmids['Isolate'] = plasmids['Isolate'].str.replace(r'GCF_\d+\.\d+_', '', regex=True)
plasmids['Isolate'] = plasmids['Isolate'].str.replace(r'_genomic', '', regex=True)
plasmids = plasmids[["Isolate", "plasmid_name"]].drop_duplicates().copy()

# convert plasmid annotations to a presence/absence matrix
plasmids = (plasmids.assign(value=1)
             .pivot(index='Isolate', columns='plasmid_name', values='value')
             .fillna(0)
             .reset_index()
             .rename_axis(None, axis=1)
)
plasmids.iloc[:, 1:] = plasmids.iloc[:, 1:].astype(int)

plasmids.head()

Unnamed: 0,Isolate,chromosome,cp26,cp32-1,cp32-1+5,cp32-10,cp32-11,cp32-12,cp32-13,cp32-2,...,lp28-6,lp28-7,lp28-8,lp28-9,lp32-3,lp36,lp38,lp5,lp54,lp56
0,ASM1913465v1,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
1,ASM215146v1,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
2,ASM215148v1,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
3,ASM215150v1,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
4,ASM2466215v1,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0


In [4]:
# check isolates in sample
plasmids["Isolate"].unique()

array(['ASM1913465v1', 'ASM215146v1', 'ASM215148v1', 'ASM215150v1',
       'ASM2466215v1', 'ASM2466217v1', 'ASM2466219v1', 'ASM336729v1',
       'ASM4079071v1', 'ASM4079073v1', 'ASM4079074v1', 'ASM4079075v1',
       'ASM4079076v1', 'ASM4079077v1', 'ASM4079078v1', 'ASM4079079v1',
       'ASM4079080v1', 'B331P', 'B418P', 'B500P', 'ESI26H', 'ESI361H',
       'ESI403H', 'ESI425H', 'PFhe_I_PB_Ill_cons', 'UCT109H', 'UCT110H',
       'UCT113H', 'UCT124H', 'UCT29H', 'UCT30H', 'UCT31H', 'UCT32H',
       'UCT35H', 'UCT50H', 'UCT92H', 'UCT96H', 'UNY1032P', 'UNY1038P',
       'UNY1083P', 'UNY1085P', 'UNY1090P', 'UNY1128P', 'UNY149P',
       'UNY169P', 'UNY172P', 'UNY193P', 'UNY203P', 'UNY208P', 'UNY990P',
       'URI101H', 'URI102H', 'URI103H', 'URI107H', 'URI111H', 'URI112H',
       'URI117H', 'URI118H', 'URI120H', 'URI33H', 'URI34H', 'URI36H',
       'URI39H', 'URI40H', 'URI41H', 'URI42H', 'URI44H', 'URI46H',
       'URI47H', 'URI48H', 'URI56H', 'URI86H', 'URI87H', 'URI88H',
       'URI89H', 'UR

In [5]:
plasmids = plasmids.drop(columns=["chromosome"])

# conversion to integers
for i in plasmids.columns.drop("Isolate"):
    plasmids[i] = plasmids[i].apply(lambda z: 0 if z==0 else 1)

# sorting the plasmids by overall prevalence in the dataset    
column_sums = plasmids[plasmids.columns.drop("Isolate")].sum()
sorted_columns = column_sums.sort_values(ascending=False)
plasmids_rearranged = plasmids[sorted_columns.index].copy(deep=True)
plasmids = pd.concat([plasmids["Isolate"], plasmids_rearranged], axis=1)

plasmid_list = plasmids.columns.drop("Isolate")

# data processing
for i in plasmid_list:
    plasmids[i] = plasmids[i].apply(lambda z: "Absent" if z==0 else i)

ordered_plasmids = list(plasmids.columns.drop("Isolate"))
ordered_plasmids.append("Unclassified")
ordered_plasmids.append("chromosome")

# producing a plasmid-specific color mapping
palette = sns.color_palette(cc.glasbey_light, n_colors=len(ordered_plasmids)).as_hex()
plasmid_color_scheme = dict(zip(ordered_plasmids, palette))
with open('../0_Data/1_Intermediate/plasmid_color_scheme.pkl', 'wb') as pickle_file:
    pickle.dump(plasmid_color_scheme, pickle_file)

# reshaping the data in preparation for tree-building
plasmids_long = pd.melt(plasmids, id_vars=['Isolate'], var_name='plasmid', value_name='presence')
plasmids_long["color"] = plasmids_long["plasmid"].apply(lambda z: plasmid_color_scheme[z])
plasmids_long = plasmids_long[plasmids_long["presence"]!="Absent"].copy().reset_index(drop=True)
plasmids_long.to_csv("../0_Data/2_Processed/plasmids_binary_long.csv",
                index=False)


## Plasmid Color Scheme

In [6]:
# dictionary assigning hex-codes to replicons
plasmid_color_scheme

{'cp26': '#d60000',
 'lp54': '#018700',
 'lp17': '#b500ff',
 'lp36': '#05acc6',
 'lp28-3': '#97ff00',
 'lp25': '#ffa52f',
 'cp32-7': '#ff8ec8',
 'lp28-4': '#79525e',
 'lp38': '#00fdcf',
 'cp32-9': '#afa5ff',
 'cp32-6': '#93ac83',
 'cp32-11': '#9a6900',
 'lp28-1': '#366962',
 'cp32-3': '#d3008c',
 'cp32-1+5': '#fdf490',
 'cp32-4': '#c86e66',
 'cp32-12': '#9ee2ff',
 'lp56': '#00c846',
 'lp28-6': '#a877ac',
 'cp32-8': '#b8ba01',
 'lp28-2': '#f4bfb1',
 'cp32-1': '#ff28fd',
 'cp32-2': '#f2cdff',
 'lp28-5': '#009e7c',
 'cp32-5': '#ff6200',
 'cp9': '#56642a',
 'cp32-10': '#953f1f',
 'lp21-cp9': '#90318e',
 'lp28-7': '#ff3464',
 'lp28-8': '#a0e491',
 'lp21': '#8c9ab1',
 'lp5': '#829026',
 'cp32-13': '#ae083f',
 'cp9-3': '#77c6ba',
 'cp32-9-4': '#bc9157',
 'lp28-9': '#e48eff',
 'cp32-3+10': '#72b8ff',
 'lp28-11': '#c6a5c1',
 'lp32-3': '#ff9070',
 'Unclassified': '#d3c37c',
 'chromosome': '#bceddb'}

## Plasmid Statistics

In [7]:
# count the number of plasmids per isolate
plasmids_long["Isolate"].value_counts()

Isolate
URI40H          23
UCT110H         22
URI120H         22
UCT124H         21
UCT30H          21
                ..
ESI425H         11
ASM215146v1     11
ESI361H         10
ASM215148v1     10
ASM1913465v1    10
Name: count, Length: 82, dtype: int64

In [8]:
# count the prevalence of each plasmid across the sample
plasmids_long["plasmid"].value_counts()/len(plasmids_long["Isolate"].unique())

plasmid
cp26         1.000000
lp54         1.000000
lp17         0.975610
lp36         0.914634
lp28-3       0.865854
lp25         0.853659
cp32-7       0.829268
lp28-4       0.804878
lp38         0.756098
cp32-9       0.719512
cp32-6       0.695122
cp32-11      0.670732
lp28-1       0.670732
cp32-3       0.634146
cp32-1+5     0.621951
cp32-4       0.475610
cp32-12      0.463415
lp56         0.451220
lp28-6       0.451220
cp32-8       0.439024
lp28-2       0.426829
cp32-1       0.329268
cp32-2       0.329268
lp28-5       0.317073
cp32-5       0.292683
cp9          0.268293
cp32-10      0.231707
lp21-cp9     0.182927
lp28-8       0.085366
lp28-7       0.085366
lp21         0.073171
lp5          0.073171
cp32-13      0.073171
cp9-3        0.048780
cp32-9-4     0.036585
lp28-9       0.024390
cp32-3+10    0.024390
lp28-11      0.012195
lp32-3       0.012195
Name: count, dtype: float64