In [1]:
using DrWatson
@quickactivate "Doran_etal_2022"

using SPI
using Muon
using CSV, DataFrames
using Distances
using NeighborJoining
using NewickTree

In [2]:
# Raw-input and processed-output directories for the BB669 dataset.
# `mkpath` creates each directory if needed and returns its path.
ddir = mkpath(datadir("exp_raw", "BB669"))
rdir = mkpath(datadir("exp_pro", "BB669"))

"/Users/ben/projects/Doran_etal_2022/data/exp_pro/BB669"

In [3]:
# UniProt reference dataset, plus an SVD object rebuilt from the factors stored in it.
uniprot = readh5ad(datadir("exp_pro", "UP7047", "2020_02_UP7047.h5ad"));
UPusv = let
    U = uniprot.obsm["LSVs"][:, :]
    S = uniprot.uns["SVs"][:]
    # "RSVs" is stored un-transposed; `SVD` takes the transposed factor as third argument.
    Vt = uniprot.varm["RSVs"][:, :]'
    SVD(U, S, Vt)
end;
# Raw BioBank table; the literal string "NA" is read as `missing`.
bb = CSV.read(
    datadir("exp_raw", "BB669", "commensal_metabolomics_ogg_matrix_20220307.csv"),
    DataFrame;
    missingstring="NA",
);

In [4]:
# Row metadata table stored in the raw BB669 directory.
bbacc = CSV.read(
    joinpath(ddir, "BB669_rowmeta.csv"),
    DataFrame,
);

In [5]:
# Strain IDs: strip square brackets from `unique_id`, then turn every space into "_".
BBIDs = replace.(replace.(bb.unique_id, r"\[|\]" => ""), " " => "_");

# Positional column blocks of `bb` (`!` shares the column vectors, `:` copies them).
BBtaxa = bb[!, 2:7]
# Donor label = first two "."-separated pieces of the ID, e.g. "MSK.16".
BBtaxa[!, :donor] = [join(first(split(id, "."), 2), ".") for id in BBIDs];
BBMfc = bb[:, 8:45]
BB4mM = bb[:, 46:49]
BBMmM = bb[!, 46:55]
BBOGG = bb[!, 56:end]

# Missing fold-change values become 1.0 (so log2 gives 0); missing mM values become 0.0.
logBBMfc = coalesce.(BBMfc, 1.0) |> Matrix .|> log2
ZBBMmM = coalesce.(BBMmM, 0.0) |> Matrix |> SPI.zscore
ZBB4mM = coalesce.(BB4mM, 0.0) |> Matrix |> SPI.zscore
bbgenenames = names(BBOGG);

In [6]:
# Display the cleaned strain identifiers derived from `bb.unique_id`.
BBIDs

669-element Vector{String}:
 "MSK.16.19_Bacteroides_eggerthii"
 "MSK.10.5_Bacteroides_fragilis"
 "MSK.13.23_Bacteroides_fragilis"
 "MSK.16.61_Phocaeicola_vulgatus"
 "MSK.18.56_Bacteroides_caccae"
 "MSK.18.69_Bacteroides_caccae"
 "MSK.14.51_Bacteroides_thetaiotaomicron"
 "MSK.9.20_Phocaeicola_vulgatus"
 "MSK.16.76_Bacteroides_xylanisolvens"
 "MSK.15.13_Phocaeicola_vulgatus"
 ⋮
 "DFI.3.108_Odoribacter_splanchnicus"
 "DFI.3.53_Odoribacter_splanchnicus"
 "MSK.22.29_Bacteroides_ovatus"
 "DFI.1.156_Odoribacter_splanchnicus"
 "MSK.21.65_Prevotella_copri"
 "DFI.5.9_Bacteroides_faecis"
 "MSK.22.78_Bacteroides_fragilis"
 "DFI.1.157_Odoribacter_splanchnicus"
 "MSK.21.77_Prevotella_copri"

In [7]:
# Join the row metadata with the cleaned strain IDs (`first_strain_id` matched to `BBID`),
# replace missing entries with "<unclassified>", and convert every value to `String`.
# NOTE(review): the right-hand table holds only the key column, so the join adds no new
# columns; row order comes from the join result, not from `BBIDs` — confirm alignment.
BBobs = let meta = leftjoin(bbacc, DataFrame(:BBID => BBIDs); on=:first_strain_id => :BBID)
    filled = coalesce.(meta, "<unclassified>")
    String.(filled)
end;
# Disabled clean-up steps, kept for reference:
# BBobs[!, :ID] = BBobs.msk_id
# BBobs = BBobs[:, [13, collect(1:12)...]]
# BBobs = coalesce.(BBobs, "")
# # correct missing Family annotation
# BBobs.family[BBobs.genus .== "Phocaeicola"] .= "Bacteroidaceae";
# rename!(BBobs, :BBID => :first_strain_id, :msk_id => :strain_id)

In [8]:
# One AnnData per measurement block; all of them use `BBIDs` as observation names.
biobank_ogg = AnnData(; X=Matrix(BBOGG), obs_names=BBIDs, var_names=bbgenenames)

# Fold-change metabolites: X holds the log2 values, layer "raw" the coalesced originals.
biobank_metabolite_rel = AnnData(; X=logBBMfc, obs_names=BBIDs, var_names=names(BBMfc))
biobank_metabolite_rel.layers["raw"] = coalesce.(Matrix(BBMfc), 1.0)

# Millimolar metabolites: X holds the z-scored values, layer "raw" the coalesced originals.
biobank_metabolite_mM = AnnData(; X=ZBBMmM, obs_names=BBIDs, var_names=names(BBMmM))
biobank_metabolite_mM.layers["raw"] = coalesce.(Matrix(BBMmM), 0.0)
biobank_metabolite_mM.uns["4keymetabolites"] = names(BB4mM)

# Bundle the modalities into one MuData and attach the row metadata.
biobank = MuData(;
    mod=Dict(
        "oggs" => biobank_ogg,
        "metabolites_foldchange" => biobank_metabolite_rel,
        "metabolites_millimolar" => biobank_metabolite_mM,
    ),
)
biobank.obs = BBobs;

In [9]:
# Display the column names of the four-column millimolar subset `BB4mM`.
names(BB4mM)

4-element Vector{String}:
 "Butyrate_mM"
 "Propionate_mM"
 "Succinate_mM"
 "Acetate_mM"

In [10]:
# Match BioBank measurements to column order of UniProt
# `bbmtx` has one column per UniProt variable; columns with no BioBank counterpart stay 0.
bbmtx = zeros(size(BBOGG, 1), length(uniprot.var_names.vals));
# For each BioBank gene name: its column position in UniProt, or `nothing` if absent.
rawidxs = indexin(bbgenenames, uniprot.var_names.vals)
# Which BioBank columns have a UniProt counterpart (the others are dropped).
bbmask = .!isnothing.(rawidxs)
# Destination columns in UniProt order, as a concretely typed index vector
# (a plain `filter` would keep the `Union{Nothing, Int}` element type).
uporderedmask = Vector{Int}(filter(!isnothing, rawidxs));
# Convert to a Matrix first so the copy is an ordinary array-to-array broadcast.
bbmtx[:, uporderedmask] .= Matrix(BBOGG[:, bbmask]);

In [11]:
# Print the dimensions of the UniProt-ordered matrix as a sanity check.
@show size(bbmtx);

size(bbmtx) = (669, 10177)


In [12]:
# Add the UniProt-ordered matrix as its own modality, named by the UniProt variables.
biobank["UPorder_oggs"] = AnnData(;
    X=bbmtx, obs_names=BBIDs, var_names=uniprot.var_names.vals
);
# Per-variable table: `ispresent` is true when the column sum of `bbmtx` is positive.
biobank["UPorder_oggs"].var = DataFrame(
    :ID => uniprot.var_names.vals,
    :ispresent => [sum(col) > 0 for col in eachcol(bbmtx)],
);

In [13]:
# Store the projection of `bbmtx` through the UniProt SVD (via `projectinLSV`), along
# with the UniProt `V` and `S` factors, on the UniProt-ordered modality.
let upmod = biobank["UPorder_oggs"]
    upmod.obsm["inferredLSVs"] = projectinLSV(bbmtx, UPusv)
    upmod.varm["UP_RSVs"] = UPusv.V
    upmod.uns["UP_SVs"] = UPusv.S
end;


In [14]:
# Materialize the stored "inferredLSVs" entry as a plain in-memory array.
BBuhat = biobank["UPorder_oggs"].obsm["inferredLSVs"][:, :];

In [15]:
# Pairwise matrix from `calc_spi_mtx` (presumably SPI distances — confirm), divided by
# the second dimension of the UniProt-ordered modality. `@time` reports the run time.
@time BBDij = calc_spi_mtx(
    BBuhat, UPusv.S, getintervals(UPusv.S)
) ./ size(biobank["UPorder_oggs"], 2);

  7.575361 seconds (3.89 M allocations: 6.142 GiB, 28.89% gc time, 13.34% compilation time)


In [16]:
# Average-linkage hierarchical clustering of `BBDij` with optimal branch ordering.
bbhc = hclust(BBDij; linkage=:average, branchorder=:optimal);

In [17]:
# Build a tree from `BBDij` with `regNJ` (presumably regular neighbor joining from
# NeighborJoining.jl — confirm), then serialize it to Newick text with `BBobs.ID` as
# leaf labels.
# NOTE(review): labels are only right if `BBobs` rows are in the same order as the
# rows of `BBDij` — confirm.
bbnj = regNJ(BBDij)
newicktreestring_nj = newickstring(bbnj, BBobs.ID)

"((((((MSK.22.4:9.970886e-04,MSK.22.2:4.595173e-04):6.471248e-04,MSK.22.68:0.000000e+00):6.183301e-05,(MSK.22.57:0.000000e+00,MSK.22.64:3.689898e-04):1.865055e-04):8.783890e-05,(((((((((((MSK.22.80:3.300148e-03,MSK.22.73:5.227851e-03):6.349715e-03,MSK.22.108:0.000000e+00" ⋯ 24648 bytes ⋯ ",MSK.22.45:0.000000e+00):1.097491e-05,(((((MSK.22.113:0.000000e+00,MSK.22.101:9.583733e-03):1.051869e-03,MSK.22.11:0.000000e+00):3.100763e-04,MSK.22.34:3.064608e-03):1.432298e-04,MSK.22.100:0.000000e+00):5.948697e-05,MSK.22.106:1.304148e-03):1.097491e-05):0.000000e+00;"

In [18]:
# Parse the neighbor-joining Newick string back into a tree with `readnw`.
spitree = readnw(newicktreestring_nj);

In [19]:
# Number of rows whose phylum annotation contains "Proteobacteria".
count(contains("Proteobacteria"), BBobs.phylum)

1

In [20]:
# Re-root the tree on the Proteobacteria strain and write the result as Newick.
outgroupname = BBobs.ID[contains.(BBobs.phylum, "Proteobacteria")]
outgroupnode = let leaves = getleaves(spitree)
    # `only` throws unless exactly one strain matched, instead of relying on a
    # broadcast against a one-element vector that breaks for any other count.
    target = only(outgroupname)
    idx = findfirst(leaf -> name(leaf) == target, leaves)
    isnothing(idx) && error("outgroup leaf $(repr(target)) not found in spitree")
    leaves[idx]
end
spitree = set_outgroup(outgroupnode)
writenw(joinpath(rdir, "BB669_inferred_spitree_nj_outgrouped.nw"), spitree)

20742

In [21]:
# Newick text for the hierarchical-clustering result `bbhc`, with `BBobs.ID` as leaf
# labels and internal-node labels switched off.
newicktreestring = SPI.nwstr(bbhc, BBobs.ID; labelinternalnodes=false);

In [22]:
# Record the derived results on the MuData: the projection on the "oggs" modality, and
# the pairwise matrix plus both Newick strings on the UniProt-ordered modality.
biobank["oggs"].obsm["inferredLSVs"] = BBuhat
let upmod = biobank["UPorder_oggs"]
    upmod.obsp["SDij"] = BBDij
    upmod.uns["inferrednewicktree"] = newicktreestring
    upmod.uns["inferrednewicktree_nj"] = newicktreestring_nj
end;

In [23]:
# Write the processed outputs: the hierarchical-clustering Newick tree, the full MuData
# object, and the row metadata table.
rdir = mkpath(datadir("exp_pro", "BB669"))
open(
    io -> println(io, newicktreestring),
    joinpath(rdir, "BB669_inferred_spitree.nw"),
    "w",
)
writeh5mu(joinpath(rdir, "BB669.h5mu"), biobank)
CSV.write(joinpath(rdir, "BB669_rowmeta.csv"), BBobs)

"/Users/ben/projects/Doran_etal_2022/data/exp_pro/BB669/BB669_rowmeta.csv"

In [24]:
# Write the neighbor-joining Newick string (with a trailing newline) to the output folder.
open(
    io -> println(io, newicktreestring_nj),
    joinpath(rdir, "BB669_inferred_spitree_nj.nw"),
    "w",
)