### Lipoprotein Identification
We classify a gene as a lipoprotein if it appears in either the Dowdell lipoprotein set or the predictions from our lab's lipoprotein-prediction program (i.e., the union of the two sources). Additionally, genes whose names contain erp, mlp, osp, dbp, or vls are designated surface lipoproteins.

In [5]:
import pandas as pd

In [6]:
# Input file locations, written relative to the notebook's working directory.
# Shared directory prefixes are factored out so each path states only its tail.
_RAW_DIR = "../0_Data/0_Raw"
_NO_MERGE_DIR = _RAW_DIR + "/no_merge_paralogs"
_INTERMEDIATES_DIR = _NO_MERGE_DIR + "/intermediates"

# Files under no_merge_paralogs/
gene_presence_absence_file = _NO_MERGE_DIR + "/gene_presence_absence.Rtab"
splip_file = _NO_MERGE_DIR + "/panaroo_no_merge_paralogs_faa_SpLip.txt"
lipopredict_file = _NO_MERGE_DIR + "/lipopredict/pan_genome_reference_lipoproteins.tsv"

# Group-to-identifier mapping tables under no_merge_paralogs/intermediates/
gg_id_file = _INTERMEDIATES_DIR + "/panaroo_group_to_ID_mapping.txt"
bbnames_file = _INTERMEDIATES_DIR + "/panaroo_groupID_to_BBgene.tsv"

# Reference tables stored directly in the raw-data directory
locustag_file = _RAW_DIR + "/B31_refseq_id_to_old_name.tsv"
dowdell_file = _RAW_DIR + "/Dowdell_lipoproteins.csv"


In [7]:
# Read the tab-separated gene presence/absence table and keep only the gene
# identifier column, renamed from "Gene" to lowercase "gene".
genes = (
    pd.read_csv(gene_presence_absence_file, sep="\t")
    .rename(columns={"Gene": "gene"})
    .loc[:, ["gene"]]
)
genes

Unnamed: 0,gene
0,group_1286
1,group_1204
2,group_1203
3,group_1202
4,group_1201
...,...
1982,group_227
1983,erpP
1984,group_204
1985,group_38


In [8]:
# Import the BB-name tables and attach a RefSeq id and an old-style name
# (e.g. BB_0399) to each gene group.
locustag = pd.read_csv(locustag_file, sep="\t")
bb_names = pd.read_csv(bbnames_file, sep="\t")
# Remove the literal "gene-" marker to obtain the bare RefSeq id. regex=False
# makes this a plain substring replacement regardless of the pandas default.
bb_names["refseq_id"] = bb_names["gene"].str.replace("gene-", "", regex=False)

# Keep a single RefSeq id per group (the first one listed), then look up its
# old name in the locus-tag table.
bb_genes = (
    bb_names[["group", "refseq_id"]]
    .drop_duplicates(subset="group", keep="first")
    .reset_index(drop=True)
)
bb_genes = bb_genes.merge(locustag, on="refseq_id", how="left")
# Groups with no match in the locus-tag table fall back to their RefSeq id.
bb_genes["old_name"] = bb_genes["old_name"].fillna(bb_genes["refseq_id"])
bb_genes = bb_genes.rename(columns={"group": "gene"})
assert bb_genes.isna().sum().sum() == 0, "bb_genes contains missing values after the locus-tag merge"

# gene (group name) -> old name lookup
bb_genes_dict = dict(zip(bb_genes["gene"], bb_genes["old_name"]))

# Merge from the bare "gene" column so that re-running this cell does not
# stack suffixed refseq_id_x / old_name_x columns onto an already-merged frame.
genes = genes[["gene"]].merge(bb_genes, on="gene", how="outer")
genes

Unnamed: 0,gene,refseq_id,old_name
0,aNKYR,BB_RS01975,BB_0399
1,aac3Ia,,
2,ackA,BB_RS03145,BB_0622
3,acpP,BB_RS03560,BB_0704
4,acpS,BB_RS00050,BB_0010
...,...,...,...
1982,ysxB,BB_RS03950,BB_0779
1983,yuiD,BB_RS00345,BB_0070
1984,yvaK,BB_RS03275,BB_0646
1985,zupT,BB_RS01085,BB_0219


In [9]:
# import Lipopredict data
lipo = pd.read_csv(lipopredict_file, sep='\t').rename(columns={"name": "gene"})
lipo["lipo"] = 1
# Keep one row per gene: a repeated name in the prediction file would otherwise
# duplicate rows of `genes` in the left merge below.
lipo = lipo[["gene", "lipo"]].drop_duplicates().reset_index(drop=True)

# Drop any "lipo" column left by an earlier run of this cell so that re-running
# it does not produce lipo_x / lipo_y columns.
genes = genes.drop(columns=["lipo"], errors="ignore").merge(lipo, on="gene", how="left")

# Every row of `lipo` is flagged 1, so its gene column is the predicted set.
lipo_gg_names = lipo["gene"].unique().tolist()
print(f"Lipopredict count: {len(lipo_gg_names)}")

# import Dowdell set of lipoproteins
dowdell = pd.read_csv(dowdell_file, sep=",")
# Invert gene -> old_name so Dowdell loci can be translated to gene names.
# If several genes share an old name, only the last one survives the inversion.
old_name_to_gene = {old_name: gene for gene, old_name in bb_genes_dict.items()}
dowdell["gene"] = dowdell["Locus"].map(old_name_to_gene)
# Keep only Dowdell entries that map onto a gene in this dataset.
dowdell = dowdell[dowdell["gene"].notna()].copy().reset_index(drop=True)
dowdell_set = dict(zip(dowdell["Locus"], dowdell["Localization"]))
dowdell_lipo_names = dowdell["gene"].unique()
print(f"Dowdell lipoprotein count: {dowdell.shape[0]}")

# Localization from Dowdell, looked up by old name; genes without an entry
# (or with a missing value) get an empty string.
genes["Localization"] = genes["old_name"].map(dowdell_set).fillna("")

# manually designate certain gene families as surface lipoproteins:
# any gene whose name contains one of these substrings (case-insensitive)
# is set to "S", overriding a Dowdell localization if one was assigned.
surface_family = genes["gene"].str.contains("erp|mlp|osp|dbp|vls", case=False, na=False)
genes.loc[surface_family, "Localization"] = "S"

# create a list that contains the full set of lipoproteins:
# predicted genes, Dowdell genes, and every gene with an S / P-OM / P-IM localization
localized = genes.loc[genes["Localization"].isin(["S", "P-OM", "P-IM"]), "gene"]
lipo_gg_names = list(set(lipo_gg_names) | set(dowdell_lipo_names) | set(localized))
print(f"Lipoprotein union: {len(lipo_gg_names)}")

surface = list(genes.loc[genes["Localization"] == "S", "gene"].unique())
pom = list(genes.loc[genes["Localization"] == "P-OM", "gene"].unique())
pim = list(genes.loc[genes["Localization"] == "P-IM", "gene"].unique())
peri = pom + pim
print(f"Surface lipoprotein count: {len(surface)}")
print(f"P-OM lipoprotein count: {len(pom)}")
print(f"P-IM lipoprotein count: {len(pim)}")
print(f"Periplasmic lipoprotein count: {len(peri)}")

Lipopredict count: 225
Dowdell lipoprotein count: 111
Lipoprotein union: 247
Surface lipoprotein count: 127
P-OM lipoprotein count: 9
P-IM lipoprotein count: 31
Periplasmic lipoprotein count: 40


In [10]:
# export lipoprotein designations
lipos = genes[["gene", "Localization"]].copy()
# Vectorised membership tests replace a per-row scan of the Python lists.
lipos["lipo"] = lipos["gene"].isin(lipo_gg_names).map({True: "Lipoprotein", False: "Other"})
lipos["surface_lipo"] = lipos["gene"].isin(surface).map({True: "Surface", False: "Other"})
lipos.to_csv("../0_Data/2_Processed/lipoproteins.csv", index=False)
lipos

Unnamed: 0,gene,Localization,lipo,surface_lipo
0,aNKYR,,Other,Other
1,aac3Ia,,Other,Other
2,ackA,,Other,Other
3,acpP,,Other,Other
4,acpS,,Other,Other
...,...,...,...,...
1982,ysxB,,Other,Other
1983,yuiD,,Other,Other
1984,yvaK,,Other,Other
1985,zupT,,Other,Other


In [11]:
# Spot-check: list every gene whose name is shorter than five characters.
# Note that this covers all genes in `lipos`, not only those labelled as
# lipoproteins.
short_name_mask = lipos["gene"].str.len() < 5
lipos.loc[short_name_mask, "gene"].unique()

array(['ackA', 'acpP', 'acpS', 'acrA', 'acyP', 'ade', 'adeC', 'adk',
       'alaS', 'alr', 'amiC', 'ampS', 'amrB', 'apeA', 'apeB', 'apt',
       'arcA', 'argF', 'argS', 'asnS', 'aspS', 'atpA', 'atpB', 'atpD',
       'atpI', 'bamA', 'bamB', 'bamD', 'batD', 'bdrA', 'bdrH', 'bdrM',
       'bdrO', 'bdrP', 'bdrQ', 'bdrR', 'bepA', 'bglX', 'blyA', 'blyB',
       'bmpA', 'bmpB', 'bmpC', 'bmpD', 'bosR', 'bppA', 'bppB', 'bppC',
       'bptA', 'bpuR', 'ccmA', 'cdaA', 'cdd', 'cdnL', 'cdsA', 'cdsC',
       'cdsE', 'celA', 'celB', 'celC', 'cheA', 'cheB', 'cheD', 'cheR',
       'cheW', 'cheX', 'cheY', 'chlD', 'chrA', 'clpA', 'clpP', 'clpX',
       'cmk', 'cmk2', 'coaD', 'coaE', 'coaX', 'cof', 'crr', 'csd', 'cspZ',
       'csrA', 'ctpA', 'cutB', 'cutS', 'cvpA', 'cyaB', 'cym1', 'cysS',
       'dacC', 'dadA', 'dbpA', 'dbpB', 'dck', 'dcm', 'ddl', 'dedA',
       'dedD', 'def', 'degQ', 'der', 'dksA', 'dnaA', 'dnaB', 'dnaE',
       'dnaG', 'dnaJ', 'dnaK', 'dnaN', 'dnaX', 'dppB', 'dppC', 'dppD',
       'dprA