# BIOFLEX example notebook

In [1]:
import os

import bioflex

### Create a connection and list the available databases
Don't have an access token?
Please register at [BioTuring Data Science](https://datascience.bioturing.com) to get one.

In [2]:
# Read the access token from the environment instead of hard-coding it, so the
# secret is never saved or shared with the notebook. Set it before starting
# Jupyter, e.g. `export BIOFLEX_TOKEN=<your token>`; a missing variable raises
# KeyError here rather than failing later inside the client.
token = os.environ['BIOFLEX_TOKEN']
conn = bioflex.connect(token)

# List the databases this token can access; the bare name on the last line
# displays the list as the cell output.
databases = conn.databases()
databases

[DataBase(id="5010c7d573ae4ff2b9691422b99aa2cd",name="BioTuring database",species="human",version=1),
 DataBase(id="5010c7d573ae4ff2b9691422b99aa2cd",name="BioTuring database",species="human",version=2),
 DataBase(id="5010c7d573ae4ff2b9691422b99aa2cd",name="BioTuring database",species="human",version=3),
 DataBase(id="5010c7d573ae4ff2b9691422b99aa2cd",name="BioTuring database",species="mouse",version=1),
 DataBase(id="5010c7d573ae4ff2b9691422b99aa2cd",name="BioTuring database",species="primate",version=1),
 DataBase(id="1de28e67227b4ed9bd54aa9b642736e3",name="Lung atlas",species="human",version=1),
 DataBase(id="31052bef5c3f4514b9dbd194a03bcafa",name="Renal atlas",species="human",version=1),
 DataBase(id="58651b0a42434cfba267f78ac42a6fec",name="NK cell atlas (full)",species="human",version=1)]

### Select a database from the list

In [3]:
# Index 2 is the third entry of the listing above: in the recorded output that
# is the human "BioTuring database", version 3.
# NOTE(review): the selection is positional — confirm the order returned by
# conn.databases() is stable before relying on this index.
using_database = databases[2]

### Get a per-cell-type gene expression summary across the database

In [4]:
# Request an expression summary for the listed gene symbols from the selected
# database. In the recorded output the result is a dict keyed by gene symbol,
# each value being a list of per-cell-type Summary records
# (name, sum, mean, rate, count, total).
using_database.get_celltypes_expression_summary(['CD3D', 'CD3E'])

{'CD3D': [Summary(name="B cell",sum=707108874.0,mean=4192.709686217774,rate=0.03504117106973723,count=168652.0,total=4812967),
  Summary(name="CD4-positive, alpha-beta T cell",sum=9489987442.0,mean=4657.561967741555,rate=0.5283278751435854,count=2037544.0,total=3856590),
  Summary(name="CD4-positive, alpha-beta cytotoxic T cell",sum=342799107.0,mean=4684.903951018846,rate=0.5532527824824582,count=73171.0,total=132256),
  Summary(name="CD8-positive, alpha-beta T cell",sum=8799563254.0,mean=4704.7405575715065,rate=0.5471126656122398,count=1870361.0,total=3418603),
  Summary(name="CD8-positive, alpha-beta cytotoxic T cell",sum=411976171.0,mean=4748.566944835058,rate=0.5942491575111647,count=86758.0,total=145996),
  Summary(name="Cajal-Retzius cell",sum=6910.0,mean=2303.3333333333335,rate=0.0003163222269084774,count=3.0,total=9484),
  Summary(name="GABAergic interneuron",sum=0.0,mean=0.0,rate=0.0,count=0.0,total=76618),
  Summary(name="GABAergic neuron",sum=16857.0,mean=2408.1428571428573,

### Create study instance
To find a study's hash ID, search the [BioTuring studies](https://talk2data.bioturing.com/studies/) page.

In [5]:
# Look up a study in the selected database by its hash ID and keep a handle to
# it; the bare `study` on the last line displays its repr as the cell output.
study = using_database.get_study('GSE96583_batch2')
study

Study(id="1557",hash_id="GSE96583_batch2",title="Multiplexed droplet single-cell RNA-sequencing using natural genetic variation (Batch 2)",reference="https://www.nature.com/articles/nbt.4042")

### Take a peek at study metadata

In [6]:
# Show the metadata fields attached to the study; each entry in the recorded
# output carries an id, a name and a type ("Numeric" or "Category").
study.metalist

[Metadata(id=0,name="Number of mRNA transcripts",type="Numeric"),
 Metadata(id=1,name="Number of genes",type="Numeric"),
 Metadata(id=2,name="Batch id",type="Category"),
 Metadata(id=3,name="Stimulation",type="Category"),
 Metadata(id=4,name="Author's cell type",type="Category"),
 Metadata(id=5,name="Multiplets",type="Category"),
 Metadata(id=6,name="Quantification",type="Category"),
 Metadata(id=7,name="Sequencing platform",type="Category"),
 Metadata(id=8,name="Storage technique",type="Category"),
 Metadata(id=9,name="Condition",type="Category"),
 Metadata(id=10,name="Sampling technique",type="Category"),
 Metadata(id=11,name="Sampling site",type="Category"),
 Metadata(id=12,name="Tissue",type="Category"),
 Metadata(id=13,name="Louvain clustering",type="Category")]

### Fetch the values of a study metadata field

In [7]:
# Index 4 is the "Author's cell type" field in the metalist output above.
metadata = study.metalist[4]
# fetch() is called before .values is read — presumably it loads the values
# for this field; confirm against the bioflex documentation.
metadata.fetch()
# Displayed in the recorded output as a NumPy array of cell-type labels.
metadata.values

array(['CD8 T cells', 'Dendritic cells', 'CD4 T cells', ...,
       'CD8 T cells', 'B cells', 'CD4 T cells'], dtype='<U17')

### Query genes

In [8]:
# Query the two gene symbols with bioflex.UNIT_RAW as the unit. The recorded
# output is a 29065x2 float32 sparse matrix in CSC format — presumably one
# column per requested gene; confirm against the bioflex documentation.
study.query_genes(['CD3D', 'CD3E'], bioflex.UNIT_RAW)

<29065x2 sparse matrix of type '<class 'numpy.float32'>'
	with 15492 stored elements in Compressed Sparse Column format>

### Get study barcodes

In [9]:
# Display the study's barcodes; the recorded output is a list of strings.
# NOTE(review): this prints the whole list — consider slicing the result
# (e.g. [:10]) to keep the notebook output short.
study.barcodes()

['GSM2560249_AAACATACCAAGCT-1',
 'GSM2560249_AAACATACCCCTAC-1',
 'GSM2560249_AAACATACCCGTAA-1',
 'GSM2560249_AAACATACCCTCGT-1',
 'GSM2560249_AAACATACGAGGTG-1',
 'GSM2560249_AAACATACGCGAAG-1',
 'GSM2560249_AAACATACGTCGTA-1',
 'GSM2560249_AAACATACGTTGCA-1',
 'GSM2560249_AAACATACTATGGC-1',
 'GSM2560249_AAACATACTCAGGT-1',
 'GSM2560249_AAACATACTGCTAG-1',
 'GSM2560249_AAACATACTGTTTC-1',
 'GSM2560249_AAACATTGAACAGA-1',
 'GSM2560249_AAACATTGAAGCCT-1',
 'GSM2560249_AAACATTGAAGGGC-1',
 'GSM2560249_AAACATTGACCTGA-1',
 'GSM2560249_AAACATTGACTTTC-1',
 'GSM2560249_AAACATTGAGACTC-1',
 'GSM2560249_AAACATTGATCACG-1',
 'GSM2560249_AAACATTGGTACCA-1',
 'GSM2560249_AAACATTGGTGTCA-1',
 'GSM2560249_AAACATTGTAGCCA-1',
 'GSM2560249_AAACATTGTCCAGA-1',
 'GSM2560249_AAACATTGTGCTAG-1',
 'GSM2560249_AAACATTGTGTCCC-1',
 'GSM2560249_AAACATTGTGTGGT-1',
 'GSM2560249_AAACATTGTTTGTC-1',
 'GSM2560249_AAACCGTGCTCTCG-1',
 'GSM2560249_AAACCGTGCTGGAT-1',
 'GSM2560249_AAACCGTGTATGCG-1',
 'GSM2560249_AAACGCACAAGGGC-1',
 'GSM256

### Get study features

In [10]:
# Display the study's feature names; the recorded output is a list of
# gene-symbol strings.
# NOTE(review): this prints the whole list — consider slicing the result
# (e.g. [:10]) to keep the notebook output short.
study.features()

['5S_RRNA',
 '5_8S_RRNA',
 '7SK',
 'A1BG',
 'A1BG-AS1',
 'A1CF',
 'A2M',
 'A2M-AS1',
 'A2ML1',
 'A2ML1-AS1',
 'A2ML1-AS2',
 'A2MP1',
 'A3GALT2',
 'A4GALT',
 'A4GNT',
 'AA06',
 'AAAS',
 'AACS',
 'AACSP1',
 'AADAC',
 'AADACL2',
 'AADACL2-AS1',
 'AADACL3',
 'AADACL4',
 'AADACP1',
 'AADAT',
 'AAED1',
 'AAGAB',
 'AAK1',
 'AAMDC',
 'AAMP',
 'AANAT',
 'AAR2',
 'AARD',
 'AARS',
 'AARS1',
 'AARS1P1',
 'AARS2',
 'AARSD1',
 'AARSP1',
 'AASDH',
 'AASDHPPT',
 'AASS',
 'AATBC',
 'AATF',
 'AATK',
 'AATK-AS1',
 'AB015752',
 'AB019438',
 'AB019440',
 'AB019441',
 'ABALON',
 'ABAT',
 'ABBA01000933',
 'ABBA01000935',
 'ABBA01006766',
 'ABBA01010928',
 'ABBA01017803',
 'ABBA01031658',
 'ABBA01031669',
 'ABBA01037345',
 'ABBA01037346',
 'ABBA01037348',
 'ABBA01037349',
 'ABBA01045074',
 'ABC12-47043100G14',
 'ABC12-47964100C23',
 'ABC12-49244600F4',
 'ABC13-48559800H2',
 'ABC14-1080714F14',
 'ABC14-864958H18',
 'ABC7-42418200C9',
 'ABC7-43046200P7',
 'ABC7-481722F1',
 'ABC9-43961700F1',
 'ABCA1',
 'ABCA10'

### Get the full study matrix

In [11]:
# Retrieve the study's full matrix with bioflex.UNIT_LOGNORM as the unit.
# In the recorded run this downloaded 141M and took a little over two minutes,
# returning a 29065x64642 float32 sparse matrix in CSC format.
study.matrix(bioflex.UNIT_LOGNORM)

Downloading: 100%|██████████| 141M/141M [02:09<00:00, 1.09Mbytes/s] 


<29065x64642 sparse matrix of type '<class 'numpy.float32'>'
	with 17570739 stored elements in Compressed Sparse Column format>