# VAE Data
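This notebook builds the input dataframe for the VAE modelling pipeline: it loads the gene presence/absence matrix, orders the genes from most to least frequent, attaches the isolate metadata (RST, OspC, MLST), and saves the merged table as a pickle.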

In [6]:
import pandas as pd
# Label for this dataset variant; it is interpolated into the output pickle
# filename (vae_{paralogs}.pkl) when the merged table is saved.
# NOTE(review): the presence/absence input is read from a "no_merge_paralogs"
# directory — confirm that "panaroo" is the intended label for it.
paralogs = "panaroo"

In [7]:
# Gene presence/absence table; it is read as tab-separated text and must contain
# a "Gene" column, which becomes the column labels after transposing.
gene_pa_file = "../0_Data/0_Raw/no_merge_paralogs/gene_presence_absence.Rtab"
# Isolate metadata CSV; the notebook uses its Strain_ID, Alias, Assembly_ID, RST,
# OspC and MLST columns.
metadata_file = "../0_Data/2_Processed/MetadataFull.csv"

In [8]:
# Read the tab-separated presence/absence table and transpose it so that the
# genes become columns and every remaining input column becomes a row.
gene_pa = pd.read_csv(gene_pa_file, sep="\t").set_index("Gene").T

# Order the gene columns by their column totals, largest first, and store the
# values as floats.
column_sums = gene_pa.sum()
sorted_columns = column_sums.sort_values(ascending=False)
gene_pa = gene_pa.loc[:, sorted_columns.index].astype(float)

print(gene_pa.shape)

# Move the row labels into an "Isolate" column, then strip the
# "GCF_<digits>.<digits>_" prefix and the "_genomic" fragment from each label.
gene_pa = gene_pa.reset_index().rename(columns={"index": "Isolate"})
gene_pa["Isolate"] = (
    gene_pa["Isolate"]
    .str.replace(r"GCF_\d+\.\d+_", "", regex=True)
    .str.replace(r"_genomic", "", regex=True)
)
gene_pa.head()

(82, 1987)


Gene,Isolate,group_1286,perM,fmhB,ispA,ruvB,group_171,group_170,potD,mreB,...,group_1314,group_1304,group_1310,aph3Ia,group_1309,group_1308,group_1306,aac3Ia,group_1305,group_23
0,B331P,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,B418P,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,B500P,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ESI26H,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ESI361H,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
md = pd.read_csv(metadata_file)

# Reshape the metadata so that the information for each isolate can be looked up
# through any of its names: the three name columns are stacked into a single
# "Isolate" column, with the remaining columns repeated for every name.
name_options = ['Strain_ID', 'Alias', 'Assembly_ID']
md = md.melt(
    id_vars=[x for x in md.columns if x not in name_options],
    value_vars=name_options,
    var_name='Name_Type',
    value_name='Isolate'
)

# Keep only the lookup key and the metadata fields used downstream, and drop the
# rows produced by name columns that were empty for an isolate.
md = md[["Isolate", "RST", "OspC", "MLST"]]
md = md[md["Isolate"].notna()].copy()

# An isolate whose name is repeated across the name columns (e.g. Strain_ID equal
# to Alias) yields identical rows after the melt. Left in place, these would
# duplicate that isolate's row in any merge keyed on "Isolate", so exact
# duplicates are removed here (the first occurrence is kept).
md = md.drop_duplicates()
md.head()

Unnamed: 0,Isolate,RST,OspC,MLST
0,N40,3.0,E,19.0
1,JD1,3.0,C,11.0
2,B-17/2013,3.0,L,24.0
3,UNY196,3.0,I,15.0
4,PAli,1.0,A,1.0


In [10]:
# Attach the metadata columns to the presence/absence table. A left join keeps
# every row of gene_pa, including isolates with no matching metadata entry.
df = pd.merge(gene_pa, md, how="left", on="Isolate")

# Save the merged table for the VAE pipeline.
# NOTE(review): this is an absolute, machine-specific path — consider deriving
# it from a configurable data directory.
df.to_pickle(
    f"/Users/rl275/Projects/bb_longread/VAE/0_data/vae_{paralogs}.pkl"
)