# 02 Make Parquet (Data Ingest)
This notebook demonstrates how to ingest data from different experimental platforms (e.g. Vizgen and Xenium) and create the Parquet file required by SPARROW.
## Vizgen (MERFISH)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import dask.dataframe as dd
import anndata
import os
# Change into the directory that contains the SPARROW package before importing it.
# NOTE(review): presumably SPARROW is not installed as a package and is only importable
# from this working directory — confirm. The path is machine-specific; edit it to match
# your own checkout.
os.chdir("/home/jupyter/tools") #change to the directory in which SPARROW is located
from SPARROW import preprocessing


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set the project directory you are working in. Within this directory, you will create
# directories to save trained model outputs. Relative paths used later in the notebook
# (e.g. 'metric.txt') resolve against it, because it becomes the working directory below.
directory = "/home/jupyter/sparrow_demo"

# exist_ok=True creates the directory when missing and is a no-op when it already
# exists, so the cell can be re-run safely.
os.makedirs(directory, exist_ok=True)

# Use `directory` rather than repeating the path literal, so that editing the
# assignment above is the only change needed to relocate the project.
os.chdir(directory)

In [4]:
# Vizgen inputs: the cell-by-gene table and the cell metadata table (both CSV files).
cell_by_gene_path = '/home/jupyter/tools/SPARROW/testdata/VizgenOutput/cellpose_cell_by_gene.csv'
cell_metadata_path = '/home/jupyter/tools/SPARROW/testdata/VizgenOutput/cellpose_cell_metadata.csv'

# Build the ingest object from the two files; it is inspected and filtered in the cells below.
obj = preprocessing.ingest.make_parquet(
    cell_by_gene=cell_by_gene_path,
    cell_meta=cell_metadata_path,
)


In [5]:
# Sanity check: preview the first rows of the ingested cell-by-gene table.
# In the recorded output the rows are indexed by `cell` and the columns are
# gene names followed by `Blank-*` columns.
obj.cell_by_gene.head()

Unnamed: 0_level_0,CD4,TNFRSF17,IL4R,TBX21,CLIC5,PILRA,OSM,LGALS2,PIK3IP1,IL2RB,...,Blank-10,Blank-11,Blank-12,Blank-13,Blank-14,Blank-15,Blank-16,Blank-17,Blank-18,Blank-19
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Sanity check: preview the first rows of the ingested cell metadata
# (columns in the recorded output: fov, volume, center_x/y, min_x/y, max_x/y, barcodeCount).
# NOTE(review): `fov` and `barcodeCount` are empty (NaN) for the rows shown in the
# recorded output — confirm that this is expected for this input file.
obj.cell_meta.head()

Unnamed: 0_level_0,fov,volume,center_x,center_y,min_x,min_y,max_x,max_y,barcodeCount
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,,2630.095485,10370.12823,7761.033472,10358.7428,7751.659322,10381.551212,7769.322908,
1,,499.611966,10352.894917,7777.049556,10346.769523,7773.337429,10359.262583,7779.775336,
2,,421.776481,10356.319033,7782.061134,10348.702874,7778.396326,10361.622268,7785.28469,
3,,117.930884,10610.45136,7781.362814,10607.736506,7779.762249,10613.090692,7783.215533,
4,,169.695391,10354.068607,7785.058095,10350.610216,7782.396532,10357.558281,7787.659548,


In [7]:
# Report how many cells were ingested, before any filtering is applied.
print(f'before filtering, the total cell number is {obj.prefilt_cell_num}')


before filtering, the total cell number is 952082


In [8]:
# Show the per-cell transcript totals computed before filtering.
# The "\n" puts the multi-line Series printout on its own lines; without it the
# first row (the `cell` index header) is glued onto the end of the message.
print(f'before filtering, the number of transcripts per cell is as following:\n{obj.prefilt_cell_sum}')


before filtering, the number of transcripts per cell is as following:cell
0           5.0
1           1.0
2           1.0
3           0.0
4           0.0
          ...  
952077    246.0
952078    308.0
952079    199.0
952080    243.0
952081     86.0
Length: 952082, dtype: float64


In [9]:
# Show the per-gene transcript totals computed before filtering.
# The "\n" puts the multi-line Series printout on its own lines; without it the
# first row is glued onto the end of the message and loses its column alignment.
print(f'before filtering, the transcript sum per gene is as following:\n{obj.prefilt_trx_sum}')


before filtering, the transcript sum per gene is as following:CD4         1211946.0
TNFRSF17     238423.0
IL4R        2894524.0
TBX21        143756.0
CLIC5         14873.0
              ...    
Blank-15      18221.0
Blank-16      11940.0
Blank-17      68831.0
Blank-18      18410.0
Blank-19      16415.0
Length: 462, dtype: float64


In [10]:
# Remove cells with extreme transcript levels (likely artefacts), then save the
# resulting dataframe to disk as a parquet file.
obj.filt(
    lower_threshold=5,
    upper_threshold=300,
    output_name='test_parquet',
    output_fmt='parquet',
    output_name_prefix='test',
)




In [11]:
# Basic metrics can be written to disk too. The relative file name means the file
# is created in the current working directory. The recorded preview of the file
# shows cell counts and per-gene transcript totals before and after filtering.
obj.write_metric('metric.txt')

In [12]:
# Preview the beginning of the metrics file (`head` prints the first 10 lines by default).
!head metric.txt

Number of cells (prefilter): 952082
Number of cells (postfilter): 625255
Median transcripts per cell (prefilter): 187

numbers of transcripts pre- and post- filter:
            pre-filt  post-filt
POU2AF1    4813237.0  1525968.0
IRF8       3775437.0  1360630.0
IGHD       3671429.0  1944683.0
PLCG2      3666018.0  1321505.0


## Xenium

In [13]:
# Xenium inputs: the cell feature matrix (.h5) and the cells table (.csv.gz).
cell_by_gene_path = '/home/jupyter/tools/SPARROW/testdata/XeniumOutput/xenium_demo_cell_feature_matrix.h5'
cell_meta_path = '/home/jupyter/tools/SPARROW/testdata/XeniumOutput/xenium_demo_cells.csv.gz'

# Note: `obj` is rebound here, replacing the Vizgen object created earlier.
# NOTE(review): input_format='Anndata' presumably selects the reader used for the
# .h5 matrix — confirm against the SPARROW documentation.
obj = preprocessing.ingest.make_parquet(
    cell_by_gene=cell_by_gene_path,
    cell_meta=cell_meta_path,
    input_format='Anndata',
)


In [14]:
# As before, sanity-check the ingest by previewing the first rows of the
# cell-by-gene table. In the recorded output the rows are indexed by string cell
# IDs (e.g. 'aaaaabpd-1') and the columns are gene names.
obj.cell_by_gene.head()

Unnamed: 0_level_0,ABLIM1,ACKR4,ADTRP,AICDA,ALDH2,ALOX15,ANGPT1,ANGPT2,APOBEC3C,APP,...,TRARG1,TSPAN3,TULP4,VCAN,VEGFA,VEGFB,XCR1,ZAP70,ZBTB16,ZEB2
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aaaaabpd-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaabcfid-1,3.0,0.0,8.0,0.0,29.0,0.0,0.0,0.0,0.0,22.0,...,0.0,31.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
aaaciajo-1,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,7.0,16.0,0.0,1.0,1.0,0.0,4.0
aaaedbcn-1,9.0,0.0,19.0,0.0,23.0,0.0,0.0,0.0,0.0,9.0,...,0.0,25.0,2.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
aaaiaipl-1,0.0,0.0,8.0,0.0,3.0,0.0,0.0,0.0,0.0,4.0,...,0.0,6.0,1.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0


In [16]:
# Remove cells with extreme transcript levels (likely artefacts), then save the
# resulting dataframe to disk as a parquet file.
obj.filt(
    lower_threshold=5,
    upper_threshold=300,
    output_name='test_parquet_Xenium',
    output_fmt='parquet',
    output_name_prefix='test_Xenium',
)

