# Pipeline for Ohta et al., 2021 scRNA-seq experiment

In [1]:
# Root directory of the project on the local machine. It is prepended to
# sys.path (so the project's `classes` package can be imported) and used as
# the base of the GEO dataset path further down in this notebook.
# NOTE(review): this is a machine-specific absolute path; edit it before
# running the notebook elsewhere.
wd = '/Users/calebreagor/Documents/hudspeth-lab'

In [2]:
# core dependencies (one import per line, original order preserved)
import sys
import pickle
import h5py
import rpy2
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# put the project root first on the module search path so that
# project-local packages resolve before anything else
sys.path.insert(0, wd)

In [3]:
# custom class for scRNA-seq datasets
from classes.singlecell import dataset

In [4]:
# adjust the plotting and display settings
%matplotlib inline
mpl.rcParams['figure.dpi']= 1000

from IPython.display import Markdown
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 5)

## Load, pre-process & save the single-cell experiment

In [5]:
# load GEO dataset from hdf5 file
dd = f'{wd}/geo-datasets/drerio/GSE152859'
ohta = dataset(name='ohta et al., 2021')

# Open the HDF5 file in a context manager so the handle is always released
# (previously it was never closed and its name was later rebound to the
# pickle file handle). Every array is copied into memory with [:] so that
# nothing handed to the dataset still refers to the file once it is closed.
with h5py.File(f'{dd}/GSE152859.h5', 'r') as h5_file:
    matrix = h5_file['matrix']
    ohta.raw_counts_from_sparse_matrix(
        # barcodes and feature ids are stored as bytes; decode them to str
        cell_names=[i.decode('ascii') for i in matrix['barcodes'][:]],
        gene_names=[i.decode('ascii') for i in matrix['features']['id'][:]],
        data=matrix['data'][:], dtype='i4', indices=matrix['indices'][:],
        indptr=matrix['indptr'][:],
        # the shape stored in the file is passed in reversed order
        shape=tuple(reversed(matrix['shape'][:])))

# pre-process, scale and impute expression
# * filter rare genes and cells with low counts
# * normalize library sizes, then log scale
# * impute expression using data diffusion
ohta.preprocess_raw_counts(library_size_cutoff=(400,2500))
# NOTE: with genes='all_genes' MAGIC warns that solver='exact' may take a
# long time on this many dimensions (see the cell output).
ohta.impute_from_normalized(genes='all_genes')

# NOTE(review): this output path is relative to the current working
# directory, unlike the wd-rooted dataset path above; confirm it is intended.
with open('class-datasets/ohta.pickle', 'wb') as f:
    pickle.dump(ohta, f)

  Running MAGIC with `solver='exact'` on 11774-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
