# Figure 2
This Jupyter notebook prepares the data used to create the phylogenetic tree with the gene heatmap.

In [19]:
import pickle
import math
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import colorcet as cc
import seaborn as sns
from scipy.cluster.hierarchy import linkage, leaves_list, dendrogram, fcluster
from scipy.spatial.distance import pdist
from scipy.stats import chi2_contingency
from sklearn.cluster import AgglomerativeClustering
from collections import Counter

# Remove the cap on how many DataFrame columns pandas renders in cell outputs.
pd.options.display.max_columns = None

In [20]:
# --- Input file locations (all relative to this notebook) ---

# Files under ../../output/results/v9/no_merge_paralogs
pa_filename = "../../output/results/v9/no_merge_paralogs/gene_presence_absence.Rtab"
genetobb_filename = "../../output/results/v9/no_merge_paralogs/intermediates/panaroo_groupID_to_BBgene.tsv"
clusterdb_filename = "../../output/results/v9/no_merge_paralogs/intermediates/clustered_proteins_db_results.pkl"

# B31 reference gene coordinates
bbcoords_filename = "../../ref/B31/bb_rs_gene_coords_b31_v1.tsv"

# Files under ../0_Data
# NOTE(review): the original comment said this file is "generated in `lipoproteins.csv`",
# which names the file itself -- presumably a notebook was meant; confirm the real source.
lipoprotein_file = "../0_Data/2_Processed/lipoproteins.csv"
# Generated in `Figure1a.ipynb`.
plasmidcolors_filename = "../0_Data/1_Intermediate/plasmid_color_scheme.pkl"

In [21]:
# Pair each gene group with the BB gene name of its best match.
# Rows are ordered per group by descending percent identity, then descending
# alignment length, so the first row of every group is the preferred hit.
hits_ranked = pd.read_csv(genetobb_filename, sep="\t").sort_values(
    by=["group", "percent_ident", "alignment_length"],
    ascending=[True, False, False],
).reset_index(drop=True)

# Keep only the (group, gene) pairs, preserving the ranked order.
group_gene_pairs = (
    hits_ranked[["group", "gene"]]
    .drop_duplicates(keep="first")
    .reset_index(drop=True)
)

# cumcount() numbers the rows inside each group starting at 0, so position 0
# marks the top-ranked pair of that group.
is_top_hit = group_gene_pairs.groupby("group").cumcount() == 0
bb = group_gene_pairs[is_top_hit].reset_index(drop=True)
bb

Unnamed: 0,group,gene
0,aNKYR,BB_RS01975
1,ackA,BB_RS03145
2,acpP,BB_RS03560
3,acpS,BB_RS00050
4,acrA,BB_RS00695
...,...,...
1527,ysxB,BB_RS03950
1528,yuiD,BB_RS00345
1529,yvaK,BB_RS03275
1530,zupT,BB_RS01085


In [22]:
# Load the gene coordinate table, drop exact duplicate rows, and remove the
# literal "gene-" text from the gene identifiers so they can be joined to `bb`.
coords = (
    pd.read_csv(bbcoords_filename, sep="\t")
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(gene=lambda frame: frame["gene"].str.replace("gene-", "", regex=False))
)

# Left join: every row of `bb` is kept, whether or not coordinates were found.
bb_coords = pd.merge(bb, coords, how="left", on="gene")

# Lookup from gene group to its "start" value; if a group occurs more than
# once, the last occurrence is the one retained.
bb_coords_dict = {
    group: start
    for group, start in zip(bb_coords["group"], bb_coords["start"])
}
bb_coords_dict


{'aNKYR': 410787,
 'ackA': 649170,
 'acpP': 740446,
 'acpS': 10203,
 'acrA': 141979,
 'acyP': 46,
 'ade': 10301,
 'adeC': 22678,
 'adk': 427987,
 'alaS': 224826,
 'alr': 160090,
 'amiC': 703961,
 'ampS': 64038,
 'amrB': 358458,
 'apeA': 374277,
 'apeB': 657546,
 'apt': 818018,
 'arcA': 899499,
 'argF': 900799,
 'argS': 614014,
 'asnS': 98817,
 'aspS': 465518,
 'atpA': 91137,
 'atpB': 89811,
 'atpD': 89200,
 'atpI': 87377,
 'bamA': 836061,
 'bamB': 26399,
 'bamD': 329262,
 'batD': 176069,
 'bcsQ~~~parA': 20766,
 'bdrA': 22131,
 'bdrH': 16497,
 'bdrH~~~repU~~~bdrW': 20043,
 'bdrM': 22102,
 'bdrO': 22113,
 'bdrP': 17089,
 'bdrQ': 22184,
 'bdrR': 16642,
 'bdrW~~~repU': 20043,
 'bepA': 53534,
 'bepA~~~lmp1': 212061,
 'bglX': 768,
 'bmpA': 393044,
 'bmpB': 391932,
 'bmpC': 394105,
 'bmpD': 395481,
 'bosR': 685977,
 'bppB': 24572,
 'bppC': 25175,
 'bptA': 701395,
 'bpuR': 46011,
 'ccmA': 797119,
 'cdaA': 8427,
 'cdd': 645137,
 'cdnL': 365115,
 'cdsA~~~cdsC': 52135,
 'cdsC': 52135,
 'cdsE': 29

In [23]:
# Load the gene presence/absence table, index it by its "Gene" column, and
# transpose it so that each gene becomes a column.
presence_by_gene = pd.read_csv(pa_filename, sep="\t").set_index("Gene")

# NOTE(review): the "index" -> "Isolate" entry only takes effect if a column is
# literally named "index"; it looks like a leftover -- confirm before removing.
column_renames = {
    "index": "Isolate",
    "Transporter, dicarboxylate/amino acid:cation (Na+ or H+) symporter (Daacs) family": "Transporter, dicarboxylate/amino acid",
}
genes = presence_by_gene.T.rename(columns=column_renames)

# Column-wise sums of the transposed table, keyed by column (gene) name.
gene_freqs = genes.sum().to_dict()

print(genes.shape)
genes.head()

(82, 1987)


Gene,group_1286,group_1204,group_1203,group_1202,group_1201,group_1200,group_1199,nCS2,guaB,immA,acyP,celA,group_1196,group_1195,group_1194,group_1193,celC,celB,resT,guaA,group_1192,group_1191,group_1190,group_1189,group_1188,pspE,tmk,tadD,group_1186,group_1183,wcaA,group_1179,pepF,pdeB,folD,yliI,yqxC,potA,group_1163,group_1161,pepP,group_1158,corB~~~tlyC,hMG1,pASTA,group_1156,group_1155,tolC,group_1153,group_1152,yfmG,yitT,lepA,group_1151,batD,mutL,yibQ,queA,group_1147,rlmB,group_1143,group_1142,hisS,group_1141,group_1140,group_1139,group_1138,rplO,group_1137,group_1134,p22,lysM,murI,prfB,argF,rsmE2,group_1132,cyaB,rimP,ftsH,tgt,group_1131,suhB,nanE,aNKYR,rpmB,group_1130,group_1129,yajL,nusB,yqfA,rpsF,group_1128,group_1127,rodA,plsC,group_1126,group_1125,tlyC,group_1124,uvrA,mgsA,group_1123,gatC,nth,group_1122,group_1121,group_1120,group_1119,group_1118,group_1117,group_1116,group_1115,rrp1,group_1007,group_987,rpmJ,group_972,rpmH,group_970,secE,rpmG,rpmF,rpoZ,rpmC,rpsU,swrD,infA,acpP,rpmA,rpmE2,csrA,khpA,rpsQ,rpsT,rpsP,ptsH,group_948,fliQ,rpsO,groES,rpsS,ftsL,rpsR,spoVG,raiA,yajC,group_937,rplW,efbC,rpmD,rplX,rplU,group_935,rpsJ,group_934,group_932,ysxB,group_931,group_929,hbb,mazG,fliE,fliN,rplT,group_927,group_925,trxA,group_923,rplR,rnpA,bamD,rbfA,rplV,secG,group_922,rplN,bpuR,group_920,rplQ,group_919,rplL,rpsL,acpS,rpsM,dksA,group_912,group_910,rpsK,group_908,fliW,group_906,spo0J,group_905,flgB,group_904,rpsI,tsaE,rplP,group_900,group_899,hinT,group_898,iscU,rplK,yaiI,group_896,group_895,group_894,rplM,flgD,group_893,ssb,group_892,flgC,smpB,yuiD,cdd,ybeY,cutS,rpsG,luxS,hfq,mreD,cheX,cvpA,rplJ,cdnL,coaD,cheD,yaaR,flbF,rpsE,def,rimM,group_889,ndk,lspA,group_886,group_884,group_881,rplI,group_880,apt,bosR,group_879,hlpA,group_878,napA,fliL,rsmD,group_872,group_871,cmk2,cheW,group_869,rplF,group_867,group_866,rnhB,group_865,rplE,rCL,hslV,nusG,group_862,group_854,mpg,infC,group_853,grpE,pth,crr,rlmE,efp,nadD,group_843,group_842,clpP,group_839,hpt1,ruvA,clpP2,group_
835,group_834,group_833,dck,group_831,group_830,group_829,rdgB,atpD,group_815,group_814,group_800,dedA,comFC,group_796,rplC,udk,group_766,sEC59,rpsD,rnfE,rsmG,pgsA,rplD,adk,group_712,group_701,group_683,group_673,lolA,group_666,group_665,tsaB,trmH,group_657,gph,cmk,ung,phoU,yhhQ,mnmC,group_623,rplA,group_618,group_611,group_608,rpiA,group_597,pyrH,group_589,pgeF,group_588,group_576,group_563,ybhL,group_554,chrA,pcs,gatB,pgl,ydiL,group_542,group_541,amiC,secA,frr,group_537,rsmE1,trmD,group_524,jag,tACO1,nosY,rnc,amrB,group_500,truA,ftsQ,gpmA,fliS,gerM,group_493,nIF3,group_492,lptD,rsuA,group_479,group_478,map,group_463,group_462,group_461,phnP,tpiA,fliP,glpF,group_431,xth,group_430,group_429,group_428,group_427,cdaA,group_421,group_405,group_393,group_392,trkA,parB,motA,motB,pstB,rpsB,group_391,plzA,rlpA,ldcB,coaX,rpsC,fliR,group_382,potC,pdxK,group_373,flgG,group_372,rpoS,uppP,lolC~~~lolE,group_365,rplS,nupQ,nagB,group_358,fliZ,bepA~~~lmp1,mrcA,potB,group_357,tsf,tatD,sod,group_356,uppS,ykwD,group_353,group_352,dprA,ymdB,lptB,zupT,flgJ1,group_325,rplB,group_319,group_317,group_316,ylqF,nadK,pstS,group_310,mscS,mreC,rsmA,ftsY,murE,group_302,group_299,group_298,group_297,group_293,group_292,group_291,yejE,prmC,group_284,yicC,cutB,era,dppD,proX,yeaD2,group_271,rsfS,group_268,cof,group_266,group_265,fni,flgF,rsmH,group_264,mvk,group_260,secF,proW,murB,pstC,dedD,group_251,miaA,infB,pyk,zwf,group_250,gatA,group_249,fliH,group_247,pfkB,rsgA,group_241,ccmA,group_237,pQQ,hflK,pepD,nagC,mvaD,fmt,group_233,group_229,group_226,coaE,rpsH,group_224,group_223,ycf19,group_218,group_217,pdxI,group_216,ldh,eRG8,acrA,rnz,group_211,nrnA,group_209,flhG,gsiA,hflC,dppB,trxB,lepB,rluD,group_205,yvaK,obgE,lgt,group_203,dusA,holA,moxR,group_193,flgI,gap,flaB,surA,group_186,bmpA,flaA,bmpD,bmpB,nlpD,group_182,group_181,group_180,mltG,group_177,rpoA,group_175,tsaD,fmhB,ispA,ruvB,group_171,group_170,potD,mreB,dppC,bamB,nupN,mraY,trpS,pta,group_167,fliM,bmpC,group_165,dadA,mnmA,wcaG,prfA,group_16
2,fbaA,ddl,perM,murG,dnaJ,ftsW,recA,yfcC,tdk,bglX,ychF,manA,flhB,alr,proV,group_135,group_130,psgB,xkdP,lapB,pilF,ntrY,nupP,rfaB,cheB,dnaN,group_122,flhF,terY,ompA,coaBC,metK,pgk,group_119,tuf,paaJ,ftsZ,nagA,gltP,group_112,ackA,dacC,tyrS,fliG,prs,pksG,chlD,pcnB,arcA,ampS,sbcD,ftsA,comEC,glyA,group_106,group_105,group_104,rpoN,bamA,ileS,csd,apeB,flgL,serS,murA,lptF,clpX,group_100,der,eno,rseP,secY,atpB,fliI,tilS,trkG,flgE,group_94,panF,ffh,group_93,glyQS,pfkA,hslU,group_89,murD,ntrC,fieF,tig,group_86,norM,mgtE,dnaB,apeA,tPR,nhaC,asnS,group_84,murF,gnd,mnmE,cheR,murC,hEAT,group_83,rqcH,treA,degQ,ctpA,cysS,group_80,pncB,nusA,mglA,dnaA,proS,group_79,gltX,group_78,group_77,lldP,group_76,glpK,gldG,group_75,group_74,murJ,group_73,group_72,rny,yifB,nadE,ptsG,group_71,pstA,rho,group_70,glpA,lysS,group_69,lnt,pheS,hDGYP,pgi,pyrG,group_67,nupO,oppA,uup,yidC,groL,rpsA,group_65,pfp,dnaX,group_64,ptsG2,pheT,fliF,group_61,ptsI,atpA,thrS,group_57,group_56,argS,secD,aspS,group_54,group_53,dnaG,alaS,manB,mrdA,parE,uvrC,group_50,atpI,recD,group_47,htpG,group_46,mnmG,group_45,orfR,group_43,parC,flgK,ftsI,group_41,rpoD,group_39,gyrB,fapA,dnaK,fAA1,bepA,uvrD,ligA,priA,group_35,uvrB,group_34,fliD,spoT,truB,yncE,fusA,pdeA,group_33,recG,flhA,group_31,group_30,ptsN,group_29,group_28,recJ,group_26,group_25,pnp,mltE,metG,tar,clpA,group_24,mMPL,rnfC,mutS2,group_21,ftsK,group_20,bptA,lon,gyrA,group_19,smc,group_18,hrpA,leuS,topA,cheA,valS,greA,polA,mutS,pqqL,group_17,sbcC,cym1,nAGPA,group_12,group_11,rpsZ,rsmI,mfd,dnaE,rpmI,rpoB,recB,rpoC,cheY,group_0,fliK,group_1299,group_1298,group_1297,group_1296,group_1295,group_1294,group_1293,group_1292,group_1291,group_1289,group_1288,group_1287,group_1285,group_1283,group_1282,group_1281,group_1280,group_1279,group_1278,dbpA,group_1277,group_1276,group_1275,group_1274,group_1273,group_1272,group_1270,group_1269,group_1267,group_1266,group_1265,group_1264,group_1263,group_1262,group_1260,group_1259,group_1258,group_1257,group_1256,group_1255,group_1254,gr
oup_1253,group_1251,group_1250,group_1249,group_1248,group_1247,group_1246,group_1245,dbpB,group_1244,ospA,group_1243,group_1242,group_1241,ospC,group_1197,group_1150,group_1035,group_1033,group_305,group_1271,group_1268,thyX,group_1261,group_1252,ospB,osm28,group_678,group_558,group_161,group_1180,group_1049,group_1048,group_1047,group_1046,group_269,group_261,group_242,sIMPL,group_103,ade,cdsA~~~cdsC,group_1185,group_1184,group_1165,group_997,group_956,group_863,group_847,group_645,blyA,group_481,group_363,mtnN,group_235,group_192,group_140,group_137,group_132,group_120,group_51,group_49,group_725,group_330,group_276,group_236,group_99,group_655,group_632,group_575,group_573,group_532,group_459,group_294,group_252,group_765,group_759,group_726,group_590,group_561,group_420,group_347,group_336,group_66,group_1174,adeC,group_1008,group_990,group_961,group_499,group_339,group_108,group_721,group_690,group_642,group_602,group_598,group_497,group_464,group_355,group_326,group_304,pncA,group_267,group_178,group_1171,group_779,group_587,group_531,group_424,group_396,group_1054,group_1015,group_803,group_416,group_397,group_321,group_243,group_1050,group_1013,group_882,group_845,group_150,bcsQ~~~parA,group_682,bdrH~~~repU~~~bdrW,group_640,group_581,group_443,group_435,group_390,group_345,group_1011,group_552,group_498,group_376,group_340,group_245,group_124,group_13,group_1003,group_777,group_679,group_616,group_289,bppC,tnpB~~~insQ,group_113,group_1172,group_1133,group_578,group_547,bppA~~~yqaJ,group_370,group_346,cspZ,group_116,group_1043,group_820,group_797,group_741,group_724,group_668,group_667,group_664,group_651,group_509,group_508,group_465,rdgC,group_434,group_412,group_409,group_215,group_773,group_565,group_156,group_1178,group_703,erpH,group_595,group_506,group_417,group_388,group_306,group_262,group_808,group_750,group_736,group_719,group_606,group_556,group_550,group_520,group_423,group_96,group_1023,group_1014,group_739,group_722,group_536,group_511,group_3
15,group_258,group_1009,group_707,group_631,group_517,group_475,group_474,group_439,group_379,group_234,group_183,blyB,group_1052,group_995,group_870,group_586,group_512,group_381,group_14,group_1145,group_748,group_685,group_614,bdrH,group_288,group_202,group_158,group_968,group_823,group_784,mlp10~~~mlpH~~~mlpD,group_711,group_693,group_527,group_195,eppA,group_110,group_715,group_714,group_570,group_566,group_290,insQ,group_188,group_745,group_603,group_406,group_1169,group_735,group_612,group_549,group_496,group_404,group_322,group_277,group_115,group_1164,group_960,group_924,group_562,group_533,group_473,group_438,group_402,group_308,group_15,group_1022,group_1005,group_775,group_734,group_681,group_256,group_1284,group_993,group_544,group_538,group_449,group_360,group_359,group_342,group_210,group_196,bppA,group_1001,group_992,group_804,group_764,group_757,group_708,group_470,group_371,group_366,group_303,group_36,group_998,group_781,group_772,group_677,group_627,group_624,mlpC,group_507,group_472,group_455,group_452,group_349,group_328,group_1148,group_780,group_705,group_663,group_523,group_185,group_1181,group_742,group_716,group_660,group_654,group_620,group_617,group_609,group_580,group_546,group_540,group_519,group_504,group_413,group_399,group_387,group_384,group_362,yeeA,group_232,group_152,group_1021,group_1010,group_795,group_794,group_696,group_604,group_572,group_555,tsaC,group_141,group_1170,group_1038,group_787,cdsA,group_670,group_636,mlpF~~~mlpH~~~mlpA,group_567,group_432,group_401,group_296,group_1000,group_729,group_661,group_596,group_569,group_560,group_543,group_444,group_62,group_1039,group_774,group_767,group_713,group_686,group_634,group_583,mlpJ,group_422,group_403,group_776,group_659,group_626,group_607,group_594,group_445,group_411,group_332,group_287,group_199,group_1028,group_1026,group_816,group_801,group_733,group_680,group_672,group_671,group_658,erpG,group_446,group_441,group_425,group_394,group_331,group_151,group_818,group_78
6,group_768,group_731,erpY~~~erpL,group_689,group_669,group_638,erpH~~~erpA~~~erpC,group_553,group_535,group_530,group_426,group_350,group_318,group_238,group_1182,group_1173,group_1072,group_1070,group_921,group_909,group_901,group_873,group_852,group_791,group_754,erpM~~~erpC,group_601,group_579,group_494,group_471,group_386,group_279,group_253,group_244,cdsE,group_129,group_60,group_1159,group_821,group_746,bdrW~~~repU,group_674,group_653,mlpA,group_568,group_551,group_522,group_442,group_274,group_222,group_114,group_98,xtmB,group_16,group_1239,group_1237,group_1232,group_1231,group_1230,group_1227,group_1226,ospD,group_1225,group_1224,group_1223,group_1222,lolC,group_1219,group_1215,group_1214,group_1213,group_1212,group_1210,group_1209,group_1208,group_1207,group_1206,group_1205,group_1149,group_1062,group_819,group_644,group_613,mlpI,group_476,group_454,group_240,group_1228,group_1217,group_1059,group_1057,group_827,group_792,group_778,group_698,group_697,group_495,group_351,group_320,group_212,group_159,group_128,group_125,bppB,group_59,group_1167,group_999,group_860,group_846,bdrP,group_571,group_557,group_526,group_491,group_437,group_377,group_368,group_300,group_219,erpQ,group_187,group_117,group_92,group_3,group_1233,group_1176,group_807,erpA~~~ospE~~~erpC,group_723,group_635,group_521,group_418,group_312,group_295,group_169,group_163,group_155,group_131,group_1290,group_1220,group_1095,group_1093,group_1004,group_738,group_630,mlpF~~~mlpC,group_414,group_354,group_301,group_275,group_166,parA,group_1216,group_1032,group_949,group_855,bdrO,group_812,yqaJ~~~bppA,group_700,group_691,group_641,group_525,group_516,group_510,group_272,group_201,group_154,group_91,group_27,group_8,group_1087,group_1034,group_969,group_957,group_868,group_806,group_789,group_756,group_717,group_706,group_646,group_628,group_593,group_592,group_515,group_335,group_239,group_126,group_52,group_48,group_44,group_1106,group_1086,group_951,group_942,group_810,group_758,group_695,gr
oup_485,group_440,mlp9,group_309,mlp8,group_246,group_225,group_189,group_157,group_153,group_109,group_1236,group_883,group_840,group_838,group_788,group_702,group_534,group_513,group_489,bdrA,group_270,group_197,group_121,bdrR,orfB,group_1234,group_1162,mlpL,group_848,group_836,group_730,erpY,group_625,group_482,group_334,group_329,group_313,group_311,group_281,group_259,group_198,group_184,yqaJ,group_147,group_146,group_22,group_1105,group_1094,group_1078,group_996,group_991,group_989,group_813,group_783,group_749,group_720,group_676,group_662,group_621,group_619,lolD,group_528,group_415,group_364,group_344,group_176,group_143,group_118,group_111,group_1144,group_1017,group_980,group_973,group_953,group_826,group_811,group_692,group_639,group_637,group_545,group_486,group_477,group_395,bdrM,group_327,group_248,group_107,group_90,group_58,group_9,group_1175,group_1168,group_1085,group_1083,group_1076,group_1020,group_977,group_967,group_963,group_954,group_947,group_945,group_928,group_917,group_891,group_887,group_875,group_841,group_805,group_802,group_728,group_710,group_585,group_488,group_483,group_458,group_383,group_208,group_179,group_136,group_102,group_10,group_1081,group_1069,group_1012,group_958,group_916,group_824,group_785,group_744,group_740,group_699,group_605,group_548,group_529,group_490,group_480,group_407,group_400,group_337,group_32,group_1114,cdsC,group_979,group_971,group_959,group_940,group_907,group_890,group_885,group_753,group_752,mlpI~~~mlpG,group_577,vlsE1,group_450,bppC1,group_378,group_194,group_174,group_160,group_40,group_4,group_1157,group_1044,group_1029,group_1027,group_1006,erpD,group_790,group_732,group_727,group_675,group_505,group_466,group_419,group_398,group_367,group_97,group_1075,group_1073,group_1071,group_798,group_755,group_747,group_743,group_718,group_615,elpB2,group_433,group_343,group_338,group_280,group_1135,group_1056,group_1037,group_966,group_955,group_769,group_684,erpC,group_168,group_133,group_85,group_1109
,group_943,group_851,group_822,tnpB,group_687,group_600,group_591,group_453,group_307,group_278,group_263,group_134,group_1240,group_1238,group_1229,group_1211,group_1063,group_763,group_647,group_285,group_257,group_231,group_213,group_142,group_82,group_1160,group_1112,group_1067,group_1061,group_1060,group_1058,group_1053,group_825,group_643,group_629,group_228,group_190,group_144,group_63,group_6,group_1235,group_1107,group_1104,group_1099,group_1097,group_1031,group_1016,group_1002,group_915,group_888,group_761,group_760,group_652,group_649,group_622,group_610,mlpH,group_255,group_200,group_55,group_1111,group_1101,group_1055,group_1025,group_994,group_809,group_694,group_207,group_87,group_1221,group_1092,erpA,group_1090,group_1079,group_1077,group_1068,group_1051,group_933,group_918,group_502,group_436,mlpG,repU,group_172,group_145,group_2,group_1102,group_1091,group_1074,group_1036,group_539,group_487,group_374,yidD,group_221,erpM,group_81,group_7,group_1218,group_1154,group_1113,group_1066,group_1041,group_1040,group_817,group_771,group_770,group_688,group_460,group_333,group_324,group_123,group_68,group_1084,group_1024,group_1019,group_1018,group_982,group_936,group_656,group_648,group_633,group_484,group_139,group_101,group_1082,group_1080,group_1065,group_1045,group_975,group_861,group_844,group_793,group_737,group_709,group_704,group_564,group_375,group_341,group_314,group_286,group_214,revA,group_1103,group_1089,group_964,group_962,group_849,group_837,group_751,group_599,group_518,group_514,dcm,group_230,group_148,group_95,tnpA,group_1326,group_1325,group_1312,group_1088,erpK,group_876,group_859,group_858,group_451,group_191,group_1313,group_1311,group_1177,group_1136,group_1108,group_978,group_965,group_941,group_939,group_926,group_874,group_850,group_799,group_782,vlsE,tM2,group_584,group_501,group_457,group_456,group_361,group_283,group_220,group_88,group_37,group_5,group_1030,group_946,group_582,group_468,group_467,group_408,group_380,group_348,gr
oup_323,group_273,group_206,erpX,group_173,group_42,group_1323,group_1198,group_1146,group_1110,group_1100,group_1064,group_1042,group_981,group_974,group_952,group_938,group_930,group_903,group_897,group_877,group_762,group_650,group_574,group_559,group_469,group_164,group_1324,group_1322,group_1187,group_1166,group_1098,group_1096,group_914,group_913,group_864,group_857,group_856,group_832,group_828,group_503,group_448,group_447,group_410,group_389,group_385,group_369,group_282,group_254,group_149,group_138,group_127,group_1,yjhX,group_1321,group_1320,group_1319,group_1318,group_1317,group_1316,group_1315,group_1314,group_1310,aph3Ia,group_1309,group_1308,group_1307,group_1306,aac3Ia,group_1305,group_1304,group_1303,group_1302,group_1301,bdrQ,group_1300,group_988,group_986,group_985,group_984,group_983,group_976,group_950,group_944,group_911,rnmV,group_902,group_227,erpP,group_204,group_38,group_23
B331P,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,0,1,1,
1,1,1,0,1,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,0,1,0,1,0,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,0,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,0,0,1,1,1,1,1,1,0,1,0,1,1,1,1,0,1,0,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,1,1,1,0,0,0,1,1,0,0,1,0,0,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,0,1,0,1,1,0,0,1,0,1,1,1,0,0,0,0,1,1,1,0,0,0,0,1,1,0,1,1,1,1,0,1,1,1,1,0,1,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,0,1,1,0,1,0,0,1,1,1,1,0,1,1,1,0,0,0,0,0,1,0,1,1,1,1,1,1,1,0,1,1,0,1,1,1,0,0,1,0,0,1,1,0,1,1,0,1,1,1,0,1,1,1,1,1,0,0,1,0,0,1,1,1,1,0,1,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,0,0,1,1,0,1,1,1,0,0,0,0,0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,1,1,0,1,0,1,0,0,1,0,0,0,0,1,1,1,0,1,0,1,0,1,0,1,0,0,1,1,1,1,0,0,0,0,0,1,1,0,1,1,1,0,0,1,0,0,1,0,1,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,1,1,0,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,1,0,0,0,0,1,1,0,0,0,0,0,1,1,1,0,1,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,1,1,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,1,0,0,0,0,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
B418P,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,1,1,1,
1,1,1,1,0,1,1,1,1,1,1,0,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,1,0,1,1,1,0,0,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0,1,0,1,1,0,1,1,1,0,0,0,1,1,1,0,1,0,1,0,1,1,1,1,1,1,0,1,1,1,0,1,1,1,1,1,1,0,0,1,1,1,0,1,0,0,0,1,1,0,1,1,1,1,0,1,0,0,1,1,0,1,0,0,1,1,1,1,0,1,0,1,0,1,0,0,0,1,1,0,1,1,1,0,1,1,1,0,1,0,1,0,1,1,1,1,1,1,0,1,1,0,0,0,0,0,0,0,1,1,0,1,0,1,1,1,0,0,1,1,0,1,0,1,1,0,0,1,0,1,1,1,1,1,0,1,1,1,0,0,0,1,0,1,1,0,0,1,0,1,1,1,1,1,0,1,1,1,1,1,0,0,0,0,1,1,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,1,1,0,0,0,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,0,0,1,1,0,1,1,0,0,1,0,1,0,1,1,0,1,0,0,1,1,0,1,0,0,1,1,1,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,0,1,0,1,0,0,1,1,0,1,0,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,1,0,0,0,1,1,1,1,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
B500P,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,0,1,1,
1,1,1,0,1,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,0,1,0,1,0,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,0,1,0,1,1,1,1,0,1,0,1,1,1,0,1,1,0,1,0,1,0,0,0,1,0,1,1,1,1,0,0,1,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,1,1,1,0,1,0,1,1,0,0,1,0,1,1,1,0,0,0,0,1,1,1,0,0,0,0,1,1,0,1,1,1,1,0,1,1,1,1,0,1,0,1,1,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,1,0,0,1,1,0,0,1,1,1,0,1,1,0,1,1,1,0,0,0,0,0,1,0,1,1,1,1,1,0,1,0,1,1,0,1,1,1,0,0,1,0,1,1,1,0,1,1,0,1,1,1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1,0,1,1,1,0,1,0,1,0,0,1,0,0,0,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,1,1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,1,1,1,0,1,0,1,0,1,0,1,0,1,1,1,1,1,0,0,0,1,0,1,1,0,0,1,1,0,0,1,0,0,1,0,1,1,1,1,0,0,1,0,0,0,0,0,1,1,1,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,1,1,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,1,0,0,0,0,1,1,0,0,0,0,0,1,1,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,1,0,0,0,0,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
ESI26H,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,1,0,1,1,0,1,0,0,1,0,1,0,0,1,1,1,1,0,0,0,1,1,0,0,0,1,0,0,1,1,1,0,1,1,1
,1,1,1,1,0,1,0,0,0,0,1,0,1,0,0,1,0,1,0,1,0,0,1,0,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,0,0,1,0,1,1,0,1,1,1,1,0,0,1,1,1,1,0,1,1,0,1,0,1,0,1,1,0,1,0,0,1,1,1,1,0,1,0,1,0,1,0,1,1,0,1,0,1,1,1,1,0,0,0,1,0,1,0,1,0,1,1,0,0,0,0,0,1,1,1,1,0,1,0,1,1,0,1,0,0,1,1,0,0,0,0,1,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,1,1,0,0,1,1,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0,1,1,0,0,1,1,0,1,1,0,1,1,1,1,1,0,1,0,0,0,1,0,1,0,0,1,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,0,0,1,1,1,0,1,0,1,0,0,1,1,0,1,0,0,0,0,0,1,0,1,1,0,1,0,1,0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0,0,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,1,0,0,1,0,0,0,0,1,1,0,1,1,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,1,1,1,1,1,0,1,1,0,0,0,0,0,1,1,1,1,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
ESI361H,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,1,1,1,1,0,1,0,0,1,0,0,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,1,1,
0,0,0,0,1,1,0,1,1,1,1,0,1,0,1,1,0,1,1,0,0,0,1,1,1,1,1,1,0,1,0,0,1,1,1,0,1,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,0,1,1,0,0,1,1,0,1,0,0,1,0,1,0,0,1,1,0,0,1,1,0,1,0,0,0,1,0,0,0,0,1,0,1,1,1,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,1,0,1,1,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,1,0,0,1,0,1,0,1,0,0,0,1,1,1,1,1,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,1,0,0,1,1,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [24]:
# Load the raw per-gene tables with gene presence on plasmids / isolates.
# The loaded object is iterated below as (gene name, table) pairs.
gene_dict = pd.read_pickle(clusterdb_filename)

# Tag each per-gene table with its gene name (written into the loaded table
# itself) and collect the tables so they can be stacked in one go.
processed_data = []
for gene, df in gene_dict.items():
    df['gene'] = gene
    processed_data.append(df)

# Stack all per-gene tables, then tidy the replicon and assembly labels.
gene_tree_df = pd.concat(processed_data)
gene_tree_df = gene_tree_df.assign(
    # the literal label "NA" is reported as "Unclassified"
    replicon_name=gene_tree_df["replicon_name"].replace("NA", "Unclassified"),
    # drop the "GCF_<digits>.<digits>_" accession prefix and the "_genomic" tag
    assembly=(
        gene_tree_df["assembly"]
        .str.replace(r'GCF_\d+\.\d+_', '', regex=True)
        .str.replace(r'_genomic', '', regex=True)
    ),
)
print(gene_tree_df.shape)

(109623, 30)


In [25]:
# tally how many rows (gene hits) were recorded for each replicon
replicon_row_counts = gene_tree_df["replicon_name"].value_counts()
replicon_row_counts

replicon_name
chromosome      66891
lp54             5177
cp32-7           2155
cp26             2139
cp32-9           2127
cp32-6           2080
lp36             2036
lp38             1960
lp56             1775
cp32-1+5         1743
cp32-3           1702
cp32-11          1697
lp28-4           1573
cp32-4           1526
lp28-3           1517
lp25             1399
lp28-1           1375
lp17             1258
lp28-2           1209
cp32-12          1109
cp32-8           1075
lp28-6           1046
cp32-2            992
cp32-1            909
cp32-10           604
cp32-5            502
lp28-5            478
lp21-cp9          350
Unclassified      205
lp28-7            198
cp32-13           193
cp9               183
lp28-8            117
cp32-3+10          99
lp28-9             86
lp21               63
cp9-3              38
lp5                34
lp28-11             2
cp32-9-4            1
Name: count, dtype: int64

In [26]:
# one row per distinct (isolate, replicon) pair, i.e. which replicons each isolate carries
plasmid_presence = (
    gene_tree_df
    .loc[:, ["assembly", "replicon_name"]]
    .drop_duplicates(ignore_index=True)  # keeps the first occurrence and renumbers rows from 0
)
plasmid_presence

Unnamed: 0,assembly,replicon_name
0,ASM1913465v1,lp54
1,UCT32H,lp54
2,URI111H,lp54
3,UNY193P,lp54
4,UCT113H,lp54
...,...,...
1505,URI102H,cp32-2
1506,URI36H,lp28-5
1507,URI118H,cp32-8
1508,UCT30H,cp32-8


In [27]:
# count the isolates in which each replicon was found
# (plasmid_presence holds one row per isolate/replicon pair)
isolates_per_replicon = plasmid_presence["replicon_name"].value_counts()
isolates_per_replicon

replicon_name
lp54            82
chromosome      82
cp26            82
lp17            80
lp28-3          71
lp25            70
cp32-7          68
lp36            68
lp28-4          66
lp38            62
cp32-9          58
cp32-6          57
lp28-1          55
cp32-11         55
cp32-3          52
cp32-1+5        51
cp32-4          39
cp32-12         38
lp28-6          37
lp56            37
cp32-8          36
lp28-2          33
Unclassified    32
cp32-1          27
lp28-5          26
cp32-2          26
cp32-5          23
cp9             21
cp32-10         19
lp21-cp9        15
lp28-7           7
lp28-8           7
lp21             6
cp32-13          6
lp5              6
cp9-3            4
lp28-9           2
cp32-3+10        2
cp32-9-4         1
lp28-11          1
Name: count, dtype: int64

In [28]:
# order replicons by the number of isolates carrying them (most common first)
ordered_plasmids = plasmid_presence["replicon_name"].value_counts().index.tolist()

# Put the chromosome at the front by name. Swapping positions 0 and 1 only
# gives this result while the chromosome happens to be listed second among
# replicons with tied counts; moving it by name does not depend on tie order.
if "chromosome" in ordered_plasmids:
    ordered_plasmids.remove("chromosome")
    ordered_plasmids.insert(0, "chromosome")

# load plasmid color-mapping
# NOTE: unpickling can execute arbitrary code -- only load a pickle written by
# a trusted run of this project's own notebooks.
with open(plasmidcolors_filename, 'rb') as pickle_file:
    plasmid_color_scheme = pickle.load(pickle_file)


In [29]:
# keep the identifying columns, renumber the rows, and attach the color of the
# replicon that encodes each gene hit
gene_tree_df = (
    gene_tree_df
    .loc[:, ['assembly', 'gene', 'replicon_name']]
    .reset_index(drop=True)
    .assign(
        # direct indexing: a replicon missing from the color scheme raises KeyError
        color=lambda frame: frame["replicon_name"].map(
            lambda replicon: plasmid_color_scheme[replicon]
        )
    )
)
# number of distinct gene names
print(gene_tree_df["gene"].nunique(dropna=False))
gene_tree_df

1987


Unnamed: 0,assembly,gene,replicon_name,color
0,ASM1913465v1,group_1286,lp54,#018700
1,UCT32H,group_1286,lp54,#018700
2,URI111H,group_1286,lp54,#018700
3,UNY193P,group_1286,lp54,#018700
4,UCT113H,group_1286,lp54,#018700
...,...,...,...,...
109618,URI120H,group_227,cp32-11,#9a6900
109619,URI120H,erpP,cp32-11,#9a6900
109620,URI48H,group_204,cp32-1+5,#fdf490
109621,ASM4079076v1,group_38,lp28-8,#a0e491


In [30]:
# add lipoprotein status to genes
lipo = pd.read_csv(lipoprotein_file)
gene_tree_with_lipo = gene_tree_df.merge(lipo, on="gene", how="left")

# A left merge repeats a row once per matching annotation row, so a gene listed
# more than once in the lipoprotein table would silently duplicate gene hits and
# inflate any per-gene row counts taken downstream. Fail loudly instead, before
# gene_tree_df is rebound.
assert len(gene_tree_with_lipo) == len(gene_tree_df), (
    "lipoprotein table has duplicate 'gene' entries: the left merge duplicated rows"
)

gene_tree_df = gene_tree_with_lipo
gene_tree_df

Unnamed: 0,assembly,gene,replicon_name,color,Localization,lipo,surface_lipo
0,ASM1913465v1,group_1286,lp54,#018700,,Other,Other
1,UCT32H,group_1286,lp54,#018700,,Other,Other
2,URI111H,group_1286,lp54,#018700,,Other,Other
3,UNY193P,group_1286,lp54,#018700,,Other,Other
4,UCT113H,group_1286,lp54,#018700,,Other,Other
...,...,...,...,...,...,...,...
109618,URI120H,group_227,cp32-11,#9a6900,S,Lipoprotein,Surface
109619,URI120H,erpP,cp32-11,#9a6900,S,Lipoprotein,Surface
109620,URI48H,group_204,cp32-1+5,#fdf490,,Other,Other
109621,ASM4079076v1,group_38,lp28-8,#a0e491,,Other,Other


In [31]:
# Long table: number of rows recorded for every (gene, replicon) combination.
replicon_counts = (
    gene_tree_df
    .groupby(['gene', 'replicon_name'])
    .size()
    .reset_index(name='count')
)

# For each gene, keep the row with the highest count (idxmax returns the first
# maximum, so ties go to the replicon that appears first in the grouped table).
top_rows = replicon_counts.groupby('gene')['count'].idxmax()
best_replicon = replicon_counts.loc[top_rows]
best_replicon_dict = best_replicon.set_index("gene")["replicon_name"].to_dict()

# Wide table: one row per gene, one column per replicon, 0 where a gene was
# never seen on a replicon; 'gene' is turned back into a regular column.
replicon_counts = (
    replicon_counts
    .pivot_table(index='gene', columns='replicon_name', values='count', fill_value=0)
    .reset_index()
)
replicon_counts

replicon_name,gene,Unclassified,chromosome,cp26,cp32-1,cp32-1+5,cp32-10,cp32-11,cp32-12,cp32-13,cp32-2,cp32-3,cp32-3+10,cp32-4,cp32-5,cp32-6,cp32-7,cp32-8,cp32-9,cp32-9-4,cp9,cp9-3,lp17,lp21,lp21-cp9,lp25,lp28-1,lp28-11,lp28-2,lp28-3,lp28-4,lp28-5,lp28-6,lp28-7,lp28-8,lp28-9,lp36,lp38,lp5,lp54,lp56
0,aNKYR,0.0,82.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,aac3Ia,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ackA,0.0,82.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,acpP,0.0,82.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,acpS,0.0,82.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1982,ysxB,0.0,82.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1983,yuiD,0.0,82.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1984,yvaK,0.0,82.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1985,zupT,0.0,82.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# most frequently observed replicon for each gene (first mode value when several tie)
most_frequent_replicon_series = (
    gene_tree_df
    .groupby('gene')['replicon_name']
    .agg(lambda names: names.mode().iloc[0])
)

# position of every replicon in ordered_plasmids; the first occurrence wins,
# which is what list.index would return
plasmid_rank = {}
for position, plasmid in enumerate(ordered_plasmids):
    plasmid_rank.setdefault(plasmid, position)

# order the gene -> replicon mapping by replicon position; replicons absent
# from ordered_plasmids sort last, and the sort is stable within a replicon
replicon_dict = dict(sorted(
    most_frequent_replicon_series.to_dict().items(),
    key=lambda gene_and_replicon: plasmid_rank.get(gene_and_replicon[1], float('inf')),
))

with open("../0_Data/1_Intermediate/gene_with_best_replicon_hit.pkl", 'wb') as out_file:
    pickle.dump(replicon_dict, out_file)

In [33]:
# process the data for tree construction

# x = number of rows recorded for each gene, repeated on every row of that gene
hits_per_gene = gene_tree_df.assign(x=1).groupby("gene")["x"].transform("sum")
gene_tree_df = gene_tree_df.assign(
    x=hits_per_gene,
    # replicon on which the gene is most often found
    best_replicon=gene_tree_df["gene"].map(lambda name: best_replicon_dict[name]),
    # coordinate looked up in bb_coords_dict; genes without an entry get 1000000
    pos=gene_tree_df["gene"].map(lambda name: bb_coords_dict.get(name, 1000000)),
)

# attach the per-replicon count columns for every gene
gene_tree_df = gene_tree_df.merge(replicon_counts, on="gene", how="left")

# make both replicon columns ordered categoricals so they sort in ordered_plasmids order
for replicon_col in ("replicon_name", "best_replicon"):
    gene_tree_df[replicon_col] = pd.Categorical(
        gene_tree_df[replicon_col], categories=ordered_plasmids, ordered=True
    )

# sort by replicon order, then position, then most frequent genes first, then gene name
gene_tree_df = gene_tree_df.sort_values(
    by=["best_replicon", "pos", "x", "gene"],
    ascending=[True, True, False, True],
    ignore_index=True,
)

# save thresholded gene output: only genes with at least `thresh` recorded rows
thresh = 26
frequent_rows = gene_tree_df["x"] >= thresh
gene_tree_df_subset = gene_tree_df[frequent_rows].copy().drop(columns="x")
gene_tree_df_subset.to_csv("../0_Data/2_Processed/GeneTree_Ordered_Thresh.csv", index=False)

# save the full gene output
# NOTE(review): x counts rows, so prevalence is a fraction of isolates only if
# each (assembly, gene) pair appears once -- confirm against the input tables
n_assemblies = len(gene_tree_df["assembly"].unique())
gene_tree_df = (
    gene_tree_df
    .assign(prevalence=gene_tree_df["x"] / n_assemblies)
    .drop(columns="x")
)
gene_tree_df.to_csv("../0_Data/2_Processed/GeneTree_Ordered.csv", index=False)
gene_tree_df


Unnamed: 0,assembly,gene,replicon_name,color,Localization,lipo,surface_lipo,best_replicon,pos,Unclassified,chromosome,cp26,cp32-1,cp32-1+5,cp32-10,cp32-11,cp32-12,cp32-13,cp32-2,cp32-3,cp32-3+10,cp32-4,cp32-5,cp32-6,cp32-7,cp32-8,cp32-9,cp32-9-4,cp9,cp9-3,lp17,lp21,lp21-cp9,lp25,lp28-1,lp28-11,lp28-2,lp28-3,lp28-4,lp28-5,lp28-6,lp28-7,lp28-8,lp28-9,lp36,lp38,lp5,lp54,lp56,prevalence
0,ASM1913465v1,group_1150,chromosome,#bceddb,,Lipoprotein,Other,chromosome,105,0.0,79.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.963415
1,UCT32H,group_1150,chromosome,#bceddb,,Lipoprotein,Other,chromosome,105,0.0,79.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.963415
2,URI111H,group_1150,chromosome,#bceddb,,Lipoprotein,Other,chromosome,105,0.0,79.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.963415
3,UCT113H,group_1150,chromosome,#bceddb,,Lipoprotein,Other,chromosome,105,0.0,79.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.963415
4,UCT29H,group_1150,chromosome,#bceddb,,Lipoprotein,Other,chromosome,105,0.0,79.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.963415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109618,PFhe_I_PB_Ill_cons,group_864,cp32-3+10,#72b8ff,,Other,Other,cp32-3+10,11924,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.024390
109619,ESI26H,group_254,cp32-3+10,#72b8ff,,Other,Other,cp32-3+10,22184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.024390
109620,PFhe_I_PB_Ill_cons,group_254,cp32-3+10,#72b8ff,,Other,Other,cp32-3+10,22184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.024390
109621,ESI26H,group_369,cp32-3+10,#72b8ff,S,Lipoprotein,Surface,cp32-3+10,26257,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.024390
