In [1]:
using DrWatson
@quickactivate "Doran_etal_2022"

using SPI
using StatsBase
using CSV, DataFrames, Muon

In [2]:
# Output directory for this notebook's processed results; created if it does not exist.
rdir = mkpath(datadir("exp_pro", "BB728"))

"/Users/ben/projects/Doran_etal_2022/data/exp_pro/BB728"

In [3]:
# Combined strain metadata / metabolomics / OGG table; "NA" cells are read as `missing`.
biobankdf = CSV.read(
    datadir("exp_raw", "BB728", "BB728_commensal_metabolomics_ogg_matrix_20220713.csv"),
    DataFrame;
    header=1,
    missingstring="NA",
);
# Identifier table for the BB673 set (its `BBID` column is used to flag overlapping strains).
bb673obs = CSV.read(
    datadir("exp_raw", "BB673", "BBIDS.csv"),
    DataFrame;
    header=1,
);

In [4]:
# Column layout of `biobankdf`: four contiguous groups, each starting right after
# the previous one ends.
obscols = 1:8                               # strain-level metadata columns
relcols = (last(obscols) + 1):58            # metabolite columns later stored as "metabolites_foldchange"
dmmcols = (last(relcols) + 1):68            # metabolite columns later stored as "metabolites_millimolar"
oggcols = (last(dmmcols) + 1):11316;        # OGG columns

In [5]:
# Number of missing entries in the `relcols` block
# (per the original note, these are the only columns that contain missing values).
count(ismissing, Matrix(biobankdf[:, relcols]))

86

In [6]:
# Strain IDs (used as observation names) and the full list of column names.
BBIDs = biobankdf.ID;
bbnames = names(biobankdf);

# Strain-level metadata, extended with two derived columns:
#   donor   - second "."-separated field of the strain ID
#   inBB673 - whether the strain ID matches a BB673 BBID (the part before the first "_")
bbtaxa = biobankdf[:, obscols];
bbtaxa[:, "donor"] = getindex.(split.(BBIDs, "."), 2)
bbtaxa[:, "inBB673"] = .!isnothing.(
    indexin(BBIDs, first.(split.(bb673obs.BBID, "_")))
);

# OGG block as a plain matrix.
bbogg = Matrix(biobankdf[:, oggcols]);

# `relcols` block: missing entries are replaced by 1.0 (which becomes 0 after log2),
# then every entry is log2-transformed.
bbrelraw = coalesce.(Matrix(biobankdf[:, relcols]), 1.0);
bbrel = log2.(bbrelraw);

# `dmmcols` block: each column (metabolite) is z-scored independently.
bbdmmraw = Matrix(biobankdf[:, dmmcols]);
bbdmm = mapslices(StatsBase.zscore, bbdmmraw; dims=1);

In [7]:
# One AnnData per data block; all three share the strain IDs as observation names.
biobank_ogg = AnnData(X=bbogg, obs_names=BBIDs, var_names=bbnames[oggcols])
biobank_metabolite_fc = AnnData(X=bbrel, obs_names=BBIDs, var_names=bbnames[relcols])
biobank_metabolite_mM = AnnData(X=bbdmm, obs_names=BBIDs, var_names=bbnames[dmmcols])

# Per-OGG annotation: `ispresent` is true when the column sum is positive.
biobank_ogg.var = DataFrame(
    ID=bbnames[oggcols],
    ispresent=[sum(col) > 0 for col in eachcol(biobank_ogg.X)],
)

# Keep the untransformed metabolite matrices alongside the transformed `X`.
biobank_metabolite_fc.layers["raw"] = bbrelraw
biobank_metabolite_mM.layers["raw"] = bbdmmraw

# Bundle the modalities; missing metadata entries are stored as empty strings.
biobank = MuData(mod=Dict(
    "oggs" => biobank_ogg,
    "metabolites_foldchange" => biobank_metabolite_fc,
    "metabolites_millimolar" => biobank_metabolite_mM,
))
biobank.obs = coalesce.(bbtaxa, "")
biobank

MuData object 728 ✕ 11308
└ metabolites_foldchange
  AnnData object 728 ✕ 50
└ metabolites_millimolar
  AnnData object 728 ✕ 10
└ oggs
  AnnData object 728 ✕ 11248

In [8]:
# Load the processed UniProt AnnData and rebuild an SVD object from its stored
# factors: obsm "LSVs" as U, uns "SVs" as S, and the adjoint of varm "RSVs" as Vt.
uniprot = readh5ad(datadir("exp_pro", "UP7047", "2020_02_UP7047.h5ad"));
UPusv = SVD(
    uniprot.obsm["LSVs"][:, :],
    uniprot.uns["SVs"][:],
    adjoint(uniprot.varm["RSVs"][:, :]),
);

In [9]:
# Position of every biobank OGG within the UniProt variable order;
# `nothing` marks OGGs that do not occur in the UniProt reference.
rawidxs = indexin(biobank[:oggs].var_names.vals, uniprot.var_names.vals)
bbmask = .!isnothing.(rawidxs)
biobank[:oggs].var[:, "isinuniprot"] = bbmask;

# Biobank OGG matrix re-laid-out in UniProt column order. UniProt columns with no
# biobank counterpart stay zero; biobank OGGs absent from UniProt are dropped.
bbmtx = zeros(size(biobank, 1), size(uniprot, 2));
# Select destination columns with the same mask used on the right-hand side, and
# narrow the eltype from Union{Nothing,Int} to Int so the index vector is concrete.
bbmtx[:, convert(Vector{Int}, rawidxs[bbmask])] .= bbogg[:, bbmask];

In [19]:
# Register the UniProt-ordered OGG matrix as an additional modality.
biobank["UPorder_oggs"] = AnnData(
    X=bbmtx,
    obs_names=BBIDs,
    var_names=uniprot.var_names.vals,
);
# Per-feature annotation: `ispresent` is true when the column sum is positive.
biobank["UPorder_oggs"].var = DataFrame(
    ID=uniprot.var_names.vals,
    ispresent=[sum(col) > 0 for col in eachcol(bbmtx)],
);

In [20]:
# Attach the projection of the biobank matrix onto the UniProt SVD, together with
# the UniProt right singular vectors and singular values it was computed from.
let uporder = biobank[:UPorder_oggs]
    uporder.obsm["inferredLSVs"] = projectinLSV(bbmtx, UPusv)
    uporder.varm["UP_RSVs"] = UPusv.V
    uporder.uns["UP_SVs"] = UPusv.S
end;


In [21]:
# Pull the stored "inferredLSVs" entry (all rows and columns) into a standalone
# variable for the calc_spi_mtx calls that follow.
BBuhat = biobank[:UPorder_oggs].obsm["inferredLSVs"][:, :];

In [22]:
# Split the UniProt singular values into intervals with `getintervals`
# (presumably SPI's spectral partitioning with its default settings — confirm).
partitions = getintervals(UPusv.S);

In [23]:
# Three contiguous groups of partitions: the first 10, partitions 11-89, and the rest.
earlywindow, middlewindow, latewindow =
    partitions[1:10], partitions[11:89], partitions[90:end];

In [24]:
# SPI matrix over all partitions, divided by the number of variables (columns) in
# the UPorder_oggs modality. Reuses `partitions`, which already holds
# getintervals(UPusv.S), instead of recomputing the intervals.
biobank[:UPorder_oggs].obsp["SDij"] = calc_spi_mtx(BBuhat, UPusv.S, partitions) ./ size(biobank[:UPorder_oggs], 2);

In [25]:
# Same computation as "SDij", restricted to each window of partitions; every result
# is divided by the number of variables (columns) in the UPorder_oggs modality.
let uporder = biobank[:UPorder_oggs], nvars = size(biobank[:UPorder_oggs], 2)
    for (key, window) in (
        "SDij_earlywindow" => earlywindow,
        "SDij_middlewindow" => middlewindow,
        "SDij_latewindow" => latewindow,
    )
        uporder.obsp[key] = calc_spi_mtx(BBuhat, UPusv.S, window) ./ nvars
    end
end;

In [26]:
# Display the MuData summary to confirm that the UPorder_oggs modality was added.
biobank

MuData object 728 ✕ 11308
└ metabolites_foldchange
  AnnData object 728 ✕ 50
└ metabolites_millimolar
  AnnData object 728 ✕ 10
└ oggs
  AnnData object 728 ✕ 11248
└ UPorder_oggs
  AnnData object 728 ✕ 10177

In [27]:
# Write the assembled MuData object to "BB728.h5mu" inside `rdir`.
writeh5mu(joinpath(rdir, "BB728.h5mu"), biobank)