# Calour Experiment class tutorial
Learn how Calour stores the data of an experiment.

## Setup

In [1]:
import calour as ca
# Set Calour's logging verbosity. On the standard Python logging scale 11 sits
# just above DEBUG (10), so the INFO messages emitted while loading are shown.
# NOTE(review): confirm 11 is the intended level for this tutorial.
ca.set_log_level(11)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook

  from ._conv import register_converters as _register_converters


## Load the data
We use the chronic fatigue syndrome data from:

Giloteaux, L., Goodrich, J.K., Walters, W.A., Levine, S.M., Ley, R.E. and Hanson, M.R., 2016.

Reduced diversity and altered composition of the gut microbiome in individuals with myalgic encephalomyelitis/chronic fatigue syndrome.

Microbiome, 4(1), p.30.

In [2]:
# Load the biom table together with its per-sample mapping file.
# NOTE(review): presumably normalize=10000 rescales every sample to 10000 reads
# and min_reads=1000 drops samples with fewer reads (the log below shows all 87
# samples kept) - confirm against the calour read_amplicon documentation.
cfs=ca.read_amplicon('data/chronic-fatigue-syndrome.biom',
                     'data/chronic-fatigue-syndrome.sample.txt',
                     normalize=10000,min_reads=1000)

2018-03-04 12:36:35 INFO loaded 87 samples, 2129 features
2018-03-04 12:36:35 INFO After filtering, 87 remaining


## The `Experiment` class
Calour stores the experiment as two `pandas.DataFrame` objects (`sample_metadata` and `feature_metadata`) and a (sparse or dense) data matrix.

The order in the dataframes and the data matrix is synchronized, so entry number X in the sample_metadata dataframe always corresponds to row X in the data matrix (and similarly entry Y in the feature_metadata dataframe always corresponds to column Y in the data matrix).

## The `__repr__` of the Experiment
Contains the class (in our case - ca.AmpliconExperiment - which is derived from ca.Experiment),

the original biom table filename,

and how many samples and features it has.

In [3]:
# Printing the experiment shows its class, source biom filename and the
# number of samples and features.
print(cfs)

AmpliconExperiment ("chronic-fatigue-syndrome.biom") with 87 samples, 2129 features


## The per-sample metadata (`Experiment.sample_metadata`)
This is a Pandas.DataFrame, with the index being the SampleID, and columns for the sample metadata fields (loaded from the mapping file).

Note that Calour also added the "_calour_original_abundance" field, which holds the total number of reads in each sample before normalization.

In [4]:
# Display the per-sample metadata DataFrame (indexed by #SampleID).
cfs.sample_metadata

Unnamed: 0_level_0,BioSample_s,Experiment_s,MBases_l,MBytes_l,Run_s,SRA_Sample_s,Sample_Name_s,Assay_Type_s,AssemblyName_s,BioProject_s,...,Description,Subject,Emotional_well_being,Role_physical,Bell,Physical_functioning,Pain,Age,BMI,_calour_original_abundance
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERR1331798,SAMEA3904128,ERX1403418,43,29,ERR1331798,ERS1091262,LR16,AMPLICON,<not provided>,PRJEB13092,...,,Patient,72.0,0.0,20.0,75.0,23.0,50,37.59,62629.0
ERR1331812,SAMEA3904142,ERX1403432,77,54,ERR1331812,ERS1091276,LR72,AMPLICON,<not provided>,PRJEB13092,...,,Patient,56.0,,30.0,60.0,68.0,64,22.85,96404.0
ERR1331836,SAMEA3904166,ERX1403456,83,56,ERR1331836,ERS1091300,LR42,AMPLICON,<not provided>,PRJEB13092,...,,Control,,,,,,35,30.66,105470.0
ERR1331831,SAMEA3904161,ERX1403451,38,26,ERR1331831,ERS1091295,IC10,AMPLICON,<not provided>,PRJEB13092,...,,Control,,,,,,45,22.24,50560.0
ERR1331815,SAMEA3904145,ERX1403435,49,33,ERR1331815,ERS1091279,LR75,AMPLICON,<not provided>,PRJEB13092,...,,Patient,,,,,,41,32.30,66414.0
ERR1331870,SAMEA3904200,ERX1403490,61,42,ERR1331870,ERS1091334,LR31,AMPLICON,<not provided>,PRJEB13092,...,,Patient,72.0,0.0,,0.0,20.0,50,21.96,97011.0
ERR1331791,SAMEA3904121,ERX1403411,52,35,ERR1331791,ERS1091255,LR08,AMPLICON,<not provided>,PRJEB13092,...,,Patient,50.0,0.0,10.0,45.0,10.0,45,25.23,77673.0
ERR1331854,SAMEA3904184,ERX1403474,46,31,ERR1331854,ERS1091318,LR51,AMPLICON,<not provided>,PRJEB13092,...,,Control,,,,,,46,28.34,59655.0
ERR1331853,SAMEA3904183,ERX1403473,73,48,ERR1331853,ERS1091317,IC21,AMPLICON,<not provided>,PRJEB13092,...,,Patient,56.0,0.0,20.0,40.0,33.0,34,25.70,100206.0
ERR1331838,SAMEA3904168,ERX1403458,24,16,ERR1331838,ERS1091302,LR40,AMPLICON,<not provided>,PRJEB13092,...,,Patient,8.0,0.0,50.0,50.0,23.0,27,26.47,34044.0


## The per-feature metadata (`Experiment.feature_metadata`)
This is a Pandas.DataFrame, with the index being the featureID (usually the sequence), and columns for the feature metadata (usually "taxonomy", and also additional fields added by calour following differential abundance testing)

In [5]:
# Display the per-feature metadata DataFrame; the index is the feature
# sequence and, at this point, the only column is "taxonomy".
cfs.feature_metadata

Unnamed: 0,taxonomy
TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGACGCTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGGTGTCTTGAGTACAGTAGAGGCAGGCGGAATTCGTGG,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_...
TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGGTTGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGCGACCTTGAGTGCAACAGAGGTAGGCGGAATTCGTGG,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_...
TACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTGCGGCAAGTCTGATGTGAAAGCCCGGGGCTCAACCCCGGGACTGCATTGGAAACTGTCGTACTTGAGTATCGGAGAGGTAAGTGGAATTCCTAG,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...
TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATGTTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGATATCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGG,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_...
TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGATTGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGAAACTGGCAGTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGG,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_...
TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGATGGATGTTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGATGTCTTGAGTGCAGTTGAGGCAGGCGGAATTCGTGG,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_...
TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGG,k__Bacteria;p__Proteobacteria;c__Gammaproteoba...
AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGACCGGCAAGTTGGAAGTGAAAACTATGGGCTCAACCCATAAATTGCTTTCAAAACTGCTGGCCTTGAGTAGTGCAGAGGTAGGTGGAATTCCCGG,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...
AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAAACCATGGGCTCAACCCATGAATTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGATGGAATTCCCGG,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clo...
TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGATTGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGCAGTCTTGAGTGCAGTAGAGGTGGGCGGAATTCGTGG,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o_...


## Reads table (`Experiment.data`)
This is a 2D numpy array or a scipy.sparse matrix containing the sample × feature reads.

Rows are samples, columns are features.

In [6]:
# Inspect the stored data matrix (samples x features); it is sparse right after loading.
cfs.data

<87x2129 sparse matrix of type '<class 'numpy.float64'>'
	with 21995 stored elements in Compressed Sparse Row format>

### Choosing sparse/dense representation of the data

When loading the data, it is by default loaded as a `scipy.sparse.csr_matrix` (which is more memory efficient for sparse data).

We can force Calour to load the data as a dense numpy 2D array using the `sparse=False` parameter in the `read_amplicon()` function.

We can also convert between sparse and dense using the `sparse` attribute of the experiment

#### Convert to dense

In [7]:
# Assigning False to the `sparse` attribute converts the experiment's stored
# data to a dense numpy array; displaying cfs.data confirms the new type.
cfs.sparse=False
cfs.data

array([[3.17744176e+03, 9.53232528e+02, 7.34643695e+02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.59231982e+03, 3.83801502e+00, 3.63055475e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.12373187e+03, 4.78524699e+02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [3.01104327e+03, 0.00000000e+00, 2.22422402e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.40518245e+02, 1.77466873e+03, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [5.02440604e+02, 0.00000000e+00, 0.00000000e+00, ...,
        3.39941226e+01, 5.10534442e+00, 1.36972655e+00]])

#### Convert to sparse

In [8]:
# Assigning True converts the stored data back to a sparse (CSR) matrix.
cfs.sparse=True
cfs.data

<87x2129 sparse matrix of type '<class 'numpy.float64'>'
	with 21995 stored elements in Compressed Sparse Row format>

### Getting the data
We can use the `Experiment.get_data()` function to obtain a copy of the data, either as sparse or dense.

#### sparse=None means keep the original format

In [9]:
# sparse=None: return the data in whatever format the experiment currently
# stores it (sparse at this point in the notebook).
dat = cfs.get_data(sparse=None)
dat

<87x2129 sparse matrix of type '<class 'numpy.float64'>'
	with 21995 stored elements in Compressed Sparse Row format>

#### sparse=True returns a sparse representation of the data (copies if needed)

In [10]:
# sparse=True: always return a sparse matrix, regardless of how the data is stored.
dat = cfs.get_data(sparse=True)
dat

<87x2129 sparse matrix of type '<class 'numpy.float64'>'
	with 21995 stored elements in Compressed Sparse Row format>

#### sparse=False returns a dense representation of the data (copies if needed)

In [11]:
# sparse=False: always return a dense numpy array; cfs.data itself stays sparse.
dat = cfs.get_data(sparse=False)
dat

array([[3.17744176e+03, 9.53232528e+02, 7.34643695e+02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.59231982e+03, 3.83801502e+00, 3.63055475e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.12373187e+03, 4.78524699e+02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [3.01104327e+03, 0.00000000e+00, 2.22422402e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.40518245e+02, 1.77466873e+03, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [5.02440604e+02, 0.00000000e+00, 0.00000000e+00, ...,
        3.39941226e+01, 5.10534442e+00, 1.36972655e+00]])

#### We can also force copying the data using `copy=True` (with `copy=False` the stored data itself may be returned)

In [12]:
# With copy=False and no format conversion, get_data() returns the very same
# object as cfs.data, so modifying `dat` would modify the experiment.
dat = cfs.get_data(sparse=None, copy=False)
dat is cfs.data

True

In [13]:
# With copy=True, get_data() returns a distinct object from cfs.data.
dat = cfs.get_data(sparse=None, copy=True)
dat is cfs.data

False

#### getting a single entry based on feature and sample values
We can index the experiment directly with `exp[sampleid, featureid]` (this calls the `__getitem__` method).

In [14]:
# Index with (sample ID, feature ID) to get the single data value for that pair.
cfs['ERR1331815','TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGACGCTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGGTGTCTTGAGTACAGTAGAGGCAGGCGGAATTCGTGG']

1407.0828439786792

## Saving an Experiment

### Save everything (biom table+sample/feature mapping files)
By default this saves an HDF5 biom table. We can save a text biom table instead using the `fmt="txt"` parameter.

In [15]:
# 'cfs' is used as the filename prefix: this writes cfs.biom, cfs_sample.txt
# and cfs_feature.txt into the current directory.
cfs.save('cfs')

In [16]:
# List the files written by cfs.save('cfs'). Note: `!ls` requires a Unix-like shell.
!ls cfs*

cfs.biom        cfs_feature.txt cfs_sample.txt
