# Polus-data

In [2]:
from polus.data import collections

To list all Collections present in the storage directory:

In [3]:
# `list` is read as an attribute (not called); it displays the collection names as strings.
collections.list

['BBBC004',
 'BBBC010',
 'BBBC033',
 'BBBC039',
 'Hansen2019Iowa',
 'MaricRatBrain2019',
 'Mark2021NF-kB',
 'Nadia2017ImportTest',
 'SchaubHotaling2020Features',
 'Schwendy',
 'TissueNet',
 'Ty2016Fillipin',
 'Ty2017Alpha1',
 'Ty2017ERDislocation',
 'Ty2018EOSKNkinome',
 'Ty2018Transomic',
 'sod']

To select a `Collection`:

In [4]:
# A collection is reached as an attribute of `collections`, using a name shown by `collections.list`.
data = collections.MaricRatBrain2019 

## List all Datasets in the Collection

In [7]:
# Dataset names as a list of strings; datasets are also reachable as attributes (e.g. `data.raw`).
data.datasets

['raw', 'standard', 'subset', 'fovs']

## Look at the structure (model) of the collection

In [8]:
# Displays the whole CollectionScheme in one repr, so the output is very long
# (each dataset entry nests its sub-datasets with path, description, wipp_type and tags).
data.model

CollectionScheme(raw=GenericDataScheme(intensity=Data(path=PosixPath('/home/ec2-user/polus-storage/images/MaricRatBrain2019/raw/intensity'), description='Original czi files from each of 5 different staining rounds', wipp_type=<WippType.genericData: 'genericData'>, tags=['intensity', 'czi']), metadata=Data(path=PosixPath('/home/ec2-user/polus-storage/images/MaricRatBrain2019/raw/metadata'), description='Additional information from Dragan', wipp_type=<WippType.genericData: 'genericData'>, tags=['metadata'])), standard=WippDataScheme(intensity=WippData(path=PosixPath('/home/ec2-user/polus-storage/images/MaricRatBrain2019/standard/intensity'), description='Data stitched by stage position, channels and replicates are saved as separate files', wipp_type=<WippType.collection: 'collection'>, tags=['intensity', 'fluorescence', 'phase_contrast'], patterns={'all': 'S1_R{r}_C1-C11_A1_c{ccc}.ome.tif', 'DAPI': 'S1_R{r}_C1-C11_A1_c000.ome.tif', 'phase-contrast': 'S1_R{r}_C1-C11_A1_c010.ome.tif'})), s

### Easy access to datasets and subdatasets

In [9]:
# Dataset-level access: the scheme for `raw`, which holds the `intensity` and `metadata` sub-datasets.
data.raw

GenericDataScheme(intensity=Data(path=PosixPath('/home/ec2-user/polus-storage/images/MaricRatBrain2019/raw/intensity'), description='Original czi files from each of 5 different staining rounds', wipp_type=<WippType.genericData: 'genericData'>, tags=['intensity', 'czi']), metadata=Data(path=PosixPath('/home/ec2-user/polus-storage/images/MaricRatBrain2019/raw/metadata'), description='Additional information from Dragan', wipp_type=<WippType.genericData: 'genericData'>, tags=['metadata']))

In [10]:
# Sub-dataset access: a Data record carrying path, description, wipp_type and tags.
data.raw.intensity

Data(path=PosixPath('/home/ec2-user/polus-storage/images/MaricRatBrain2019/raw/intensity'), description='Original czi files from each of 5 different staining rounds', wipp_type=<WippType.genericData: 'genericData'>, tags=['intensity', 'czi'])

In [11]:
# For the `subset` dataset the record is a WippData, which also carries a `patterns` mapping.
data.subset.intensity

WippData(path=PosixPath('/home/ec2-user/polus-storage/images/MaricRatBrain2019/subset/intensity'), description='A 5x5 grid of field of views for 11 channels and 2 replicates', wipp_type=<WippType.collection: 'collection'>, tags=['intensity', 'fluorescence', 'phase_contrast', 'stitching', 'bleedthrough', 'flatfield', 'nuclear_segmentation'], patterns={'all': 'S1_R{r}_C1-C11_A1_y0{yy}_x0{xx}_c0{cc}.ome.tif', 'DAPI': 'S1_R{r}_C1-C11_A1_y0{yy}_x0{xx}_c000.ome.tif', 'phase-contrast': 'S1_R{r}_C1-C11_A1_y0{yy}_x0{xx}_c010.ome.tif'})

## FilePatterns

In [12]:
# At this point (nothing downloaded yet) the pattern values are plain strings keyed by name.
data.subset.intensity.patterns

{'all': 'S1_R{r}_C1-C11_A1_y0{yy}_x0{xx}_c0{cc}.ome.tif',
 'DAPI': 'S1_R{r}_C1-C11_A1_y0{yy}_x0{xx}_c000.ome.tif',
 'phase-contrast': 'S1_R{r}_C1-C11_A1_y0{yy}_x0{xx}_c010.ome.tif'}

The `filepattern` package provides a `FilePattern` Python class. Instead of keeping our file patterns as plain strings, it would be nice to have them as `FilePattern` objects.

A `FilePattern` object has the following attributes:
* **Pattern**: string that represents the pattern followed by the files' names
* **<span style="color:red">Path</span>**: path to the directory where the files are stored. <span style="color:red">It's missing! No files = no path</span>

We have not downloaded any data yet. A `FilePattern` cannot point to a directory of files on our machine because we have no files. Let's download them.

## Download directly from Python (just like `dvc pull`)

In [13]:
# Downloads the "subset" dataset into local storage; the run recorded below fetched 550 files.
data.fetch("subset")

A       subset/
1 file added and 550 files fetched


When a `Collection` is initialized and its files already exist locally in the polus-storage directory, `FilePattern` objects are created for those files:

In [14]:
# Access the same collection again now that the subset files exist locally.
# NOTE(review): presumably this attribute access re-initializes the collection — confirm;
# the `patterns` shown for `data_new` differ from those shown earlier for `data`.
data_new = collections.MaricRatBrain2019

In [15]:
# Same lookup as on `data` earlier, but the values are now FilePattern objects instead of strings.
data_new.subset.intensity.patterns

{'all': <filepattern.classes.FilePattern at 0x7f109de56880>,
 'DAPI': <filepattern.classes.FilePattern at 0x7f10bd6a9250>,
 'phase-contrast': <filepattern.classes.FilePattern at 0x7f10bc29d610>}

In [16]:
# The pattern string held by the FilePattern object stored under the 'all' key.
data_new.subset.intensity.patterns['all'].pattern

'S1_R{r}_C1-C11_A1_y0{yy}_x0{xx}_c0{cc}.ome.tif'

In [17]:
# The directory this FilePattern points at; it is the same as `data_new.subset.intensity.path`.
data_new.subset.intensity.patterns['all'].path

PosixPath('/home/ec2-user/polus-storage/images/MaricRatBrain2019/subset/intensity')

## Paths

Full paths are returned, so there is no need to join a root path with relative paths.

In [18]:
# Absolute PosixPath of the `standard` dataset's intensity directory.
data_new.standard.intensity.path

PosixPath('/home/ec2-user/polus-storage/images/MaricRatBrain2019/standard/intensity')

In [19]:
# Absolute PosixPath of the `subset` dataset's intensity directory.
data_new.subset.intensity.path

PosixPath('/home/ec2-user/polus-storage/images/MaricRatBrain2019/subset/intensity')

## Tags as list

In [20]:
# Tags are exposed as a Python list of strings.
data_new.standard.intensity.tags

['intensity', 'fluorescence', 'phase_contrast']