# BIOFLEX example notebook

In [1]:
import bioflex

### Create a connection and list out available databases
Does not have access token ?
Please register at [BioTuring Data Science](https://datascience.bioturing.com)

In [2]:
conn = bioflex.connect('70d2acfda3a54ca6a439069*****')
databases = conn.databases()
databases

[DataBase(id="5010c7d573ae4ff2b9691422b99aa2cd",name="BioTuring database",species="human",version=1),
 DataBase(id="5010c7d573ae4ff2b9691422b99aa2cd",name="BioTuring database",species="human",version=2),
 DataBase(id="5010c7d573ae4ff2b9691422b99aa2cd",name="BioTuring database",species="human",version=3),
 DataBase(id="5010c7d573ae4ff2b9691422b99aa2cd",name="BioTuring database",species="mouse",version=1),
 DataBase(id="5010c7d573ae4ff2b9691422b99aa2cd",name="BioTuring database",species="primate",version=1)]

### Using a database from list

In [3]:
using_database = databases[2]

### Get cell types gene expression across database

In [23]:
result = using_database.get_celltypes_expression_summary(['CD3D', 'CD3E'])
result['CD3D'][:5]
result['CD3E'][:5]

[Summary(name="B cell",sum=569738449.0,mean=4098.277566375819,rate=0.028884262036286558,count=139019.0,total=3172495),
 Summary(name="CD4-positive, alpha-beta T cell",sum=10050349852.0,mean=4702.274442320307,rate=0.5542041025880377,count=2137338.0,total=2751088),
 Summary(name="CD4-positive, alpha-beta cytotoxic T cell",sum=362243512.0,mean=4758.973068131059,rate=0.5755353254294702,count=76118.0,total=91026),
 Summary(name="CD8-positive, alpha-beta T cell",sum=9239057247.0,mean=4722.38210576353,rate=0.5722922492023789,count=1956440.0,total=2452094),
 Summary(name="CD8-positive, alpha-beta cytotoxic T cell",sum=376955768.0,mean=4697.736447247077,rate=0.5496177977478836,count=80242.0,total=105282)]

### Create study instance
For study hash ID, search from [BioTuring studies](https://talk2data.bioturing.com/studies/)

In [5]:
study = using_database.get_study('GSE96583_batch2')
study

Study(hash_id="GSE96583_batch2",title="Multiplexed droplet single-cell RNA-sequencing using natural genetic variation (Batch 2)",reference="https://www.nature.com/articles/nbt.4042")

### Take a peek at study metadata

In [6]:
study.metalist

[Metadata(id=0,name="Author's cell type",type="Category"),
 Metadata(id=1,name="Condition",type="Category"),
 Metadata(id=2,name="Louvain clustering",type="Category"),
 Metadata(id=3,name="Multiplets",type="Category"),
 Metadata(id=4,name="Number of genes",type="Numeric"),
 Metadata(id=5,name="Number of mRNA transcripts",type="Numeric"),
 Metadata(id=6,name="Percentage of mitochondrial genes",type="Numeric"),
 Metadata(id=7,name="Quantification",type="Category"),
 Metadata(id=8,name="Sample ID",type="Category"),
 Metadata(id=9,name="Sampling site",type="Category"),
 Metadata(id=10,name="Sampling technique",type="Category"),
 Metadata(id=11,name="Sequencing platform",type="Category"),
 Metadata(id=12,name="Stimulation",type="Category"),
 Metadata(id=13,name="Storage technique",type="Category"),
 Metadata(id=14,name="Tissue",type="Category")]

### Fetch a study metadata

In [11]:
metadata = study.metalist[4]
metadata.fetch()
metadata.values

array([590., 795., 585., ..., 385., 531., 492.], dtype=float32)

### Query genes

In [12]:
study.query_genes(['CD3D', 'CD3E'], bioflex.UNIT_RAW)

<29065x2 sparse matrix of type '<class 'numpy.float32'>'
	with 15492 stored elements in Compressed Sparse Column format>

### Get study barcodes

In [24]:
np.array(study.barcodes())

array(['GSM2560249_AAACATACCAAGCT-1', 'GSM2560249_AAACATACCCCTAC-1',
       'GSM2560249_AAACATACCCGTAA-1', ..., 'GSM2560248_TTTGCATGGGAACG-1',
       'GSM2560248_TTTGCATGGTCCTC-1', 'GSM2560248_TTTGCATGTTCATC-1'],
      dtype='<U27')

### Get study features

In [25]:
np.array(study.features())

array(['5S_RRNA', '5_8S_RRNA', '7SK', ..., 'C17orf72', 'RP11-361K17.2',
       'CR759784.2'], dtype='<U26')

### Get study full matrix 

In [22]:
study.matrix(bioflex.UNIT_LOGNORM)

Downloading: 100%|██████████| 141M/141M [01:07<00:00, 2.08Mbytes/s] 


<29065x64642 sparse matrix of type '<class 'numpy.float32'>'
	with 17570739 stored elements in Compressed Sparse Column format>

### Export study

In [13]:
study.export_study(bioflex.EXPORT_H5AD)

{'download_link': 'https://talk2data.bioturing.com/api/export/a1003bad3dd146b28c7bda913a2fc3f0',
 'study_hash_id': 'GSE96583_batch2'}