In [17]:
# imports

import os
import pandas as pd
import xarray as xr


# Step 3: Build your data tensor

Use this notebook to read in your normalized data and arrange it into tensor format using the [xarray](https://docs.xarray.dev/en/stable/) package. 

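NOTE: Your data should already be normalized and saved as a CSV file in [tidy format](https://tidyr.tidyverse.org/articles/tidy-data.html): one observation per row, with one column for each of the three tensor modes, one for the replicate ID, and one for the data value.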


In [24]:
# input data variables

# example responses (data file, then the five column names):
# data/example-data.csv, sample_name, phylum, KOfam, replicate, counts


def _prompt(message):
    """Ask the user for a value and return it without surrounding whitespace.

    Stray spaces picked up when typing or copy-pasting would otherwise make
    the path checks below (and the later column-name check) fail on values
    that look correct.
    """
    return input(message).strip()


# data file
datapath = _prompt('Enter the filepath of your input data file:')
# check data file exists
if not os.path.isfile(datapath):
    # FileNotFoundError is still an Exception, so existing handlers keep working
    raise FileNotFoundError(f'Unable to find the file "{datapath}"')

# output directory
outdir = _prompt('Enter the filepath of the output directory where you want files saved:')
# check output directory exists
if not os.path.isdir(outdir):
    raise FileNotFoundError(f'Unable to find the directory "{outdir}"')

# column names
mode0 = _prompt('Enter the column name that corresponds to the first mode of your tensor:')
mode1 = _prompt('Enter the column name that corresponds to the second mode of your tensor:')
mode2 = _prompt('Enter the column name that corresponds to the third mode of your tensor:')
rep = _prompt('Enter the column name that corresponds to replicate IDs:')
data = _prompt('Enter the column name that corresponds to your data:')


Enter the filepath of your input data file: data/example-data.csv
Enter the filepath of the output directory where you want files saved: data
Enter the column name that corresponds to the first mode of your tensor: sample_name
Enter the column name that corresponds to the second mode of your tensor: phylum
Enter the column name that corresponds to the third mode of your tensor: KOfam
Enter the column name that corresponds to replicate IDs: replicate
Enter the column name that corresponds to your data: counts


In [25]:
# read in data

# the five columns the tensor is built from, in the order they are kept
required_columns = [mode0, mode1, mode2, rep, data]

# load the csv into a dataframe
df = pd.read_csv(datapath)

# stop at the first requested column that is absent from the file's header
missing_column = next(
    (name for name in required_columns if name not in df.columns), None
)
if missing_column is not None:
    raise Exception(f'Column name "{missing_column}" not found in headers of file {datapath}')

# keep only the requested columns
df = df.loc[:, required_columns]

df

Unnamed: 0,sample_name,phylum,KOfam,replicate,counts
0,G3.UW.ALL.L25S1,Acidobacteriota,K00070,A,4.00000
1,G3.UW.ALL.L25S1,Acidobacteriota,K00078,A,5.83841
2,G3.UW.ALL.L25S1,Acidobacteriota,K00311,A,0.00000
3,G3.UW.ALL.L25S1,Acidobacteriota,K00406,A,3.00000
4,G3.UW.ALL.L25S1,Acidobacteriota,K00410,A,19.03930
...,...,...,...,...,...
3181481,G3.UW.ALL.L40S2,Xanthophyceae,K24743,C,1.40753
3181482,G3.UW.ALL.L40S2,Xanthophyceae,K25440,C,2.05893
3181483,G3.UW.ALL.L40S2,Xanthophyceae,K25458,C,0.00000
3181484,G3.UW.ALL.L40S2,Xanthophyceae,K25547,C,0.00000


In [28]:
# organize dataset into xarray.Dataset tensor

# columns whose values become the coordinates of the four tensor dimensions
index_columns = [mode0, mode1, mode2, rep]

# each coordinate combination must identify exactly one value; a repeated
# combination cannot be placed unambiguously in the tensor, so fail early
# with a message that points at the input data
duplicated_rows = df.duplicated(subset=index_columns)
if duplicated_rows.any():
    raise ValueError(
        f'Found {int(duplicated_rows.sum())} rows that repeat a combination of '
        f'{index_columns}; each combination must appear at most once'
    )

# create xr.Dataset with a single variable named "Data"
ds = xr.Dataset(
    {'Data': xr.DataArray.from_series(df.set_index(index_columns)[data])}
)

# fill missing values with zeros
ds = ds.fillna(0)

# save to output directory
ds.to_netcdf(os.path.join(outdir, 'data-tensor.nc'))

ds


### Visualize data tensor (optional)

Use Barnacle's visualization package to visualize your data tensor. 

NOTE: This visualization tool is not intended for tensors with more than a couple hundred values per mode. If your tensor is larger than that, select a smaller region of it to visualize.

In [29]:
# import visualization function from Barnacle

from barnacle.utils import visualize_3d_tensor


In [42]:
# choose a replicate to visualize

# labels of the replicate coordinate, in their native dtype
replicate_labels = ds[rep].data
print(f'Replicates: {replicate_labels}')
viz_rep = input('Enter which replicate you would like to visualize:').strip()

# input() always returns text, but the replicate labels may be numeric if the
# replicate column was parsed as numbers; map the typed text back to the
# label actually stored in the dataset before selecting
labels_by_text = {str(label): label for label in replicate_labels}
if viz_rep not in labels_by_text:
    raise KeyError(
        f'"{viz_rep}" is not one of the available replicates: {list(labels_by_text)}'
    )

viz_ds = ds.sel({rep: labels_by_text[viz_rep]})

viz_ds

Replicates: ['A' 'B' 'C']


Enter which replicate you would like to visualize: A


In [48]:
# collapse the first mode by summing over it for the selected replicate
mode0_totals = viz_ds.sum(dim=mode0)
mode0_totals

In [51]:
# count how many rows with a nonzero data value exist for each
# (mode0, mode1, mode2) combination
nonzero_rows = df.loc[df[data] != 0, [mode0, mode1, mode2]]
nonzero_rows.value_counts()

sample_name      phylum                          KOfam 
G3.UW.ALL.L40S2  candidate-division-WWE3         K20777    3
G3.UW.ALL.L37S1  MAST-3-4                        K04536    3
                                                 K04461    3
                                                 K04464    3
                                                 K04469    3
                                                          ..
G3.UW.ALL.L40S1  Candidatus-Neomarinimicrobiota  K22531    1
G3.UW.ALL.L32S3  Oomycota                        K21862    1
G3.UW.ALL.L31S2  Glaucocystophyceae              K07297    1
                                                 K07935    1
G3.UW.ALL.L35S2  Bolidophyceae                   K20972    1
Name: count, Length: 1226091, dtype: int64