## Prepare trajectory outputs from STREAM

In [1]:
# Notebook setup: environment tweak, plotting config, imports, and input/output paths.
# The order of the statements below is deliberate: the environment variable is set
# before the heavier libraries are imported.

# Removes CPU usage limit by some jupyter versions
# (done by blanking the KMP_AFFINITY environment variable for this process)
import os
os.environ['KMP_AFFINITY'] = ''
# Configure matplotlib to enable large animations
# (raises the embed size cap to an effectively unlimited value)
import matplotlib
matplotlib.rcParams['animation.embed_limit'] = 2**128
import matplotlib.pyplot as plt
# Prepare trajectory files
import pandas as pd
import dictys

# Path to the STREAM outputs (dist.tsv.gz, edge.tsv.gz, branch.tsv.gz are read from here)
# NOTE(review): absolute, user-specific cluster paths; adjust when running elsewhere.
stream_outs = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/stream_outs"
# Path to the dictys data folder (trajectory .h5 files are written here; subsets.txt is read from here)
dictys_data_path = "/ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/data"

In [2]:
# Read the three tab-separated tables from the STREAM output folder.
# dist carries a header row and uses its first column as the index;
# edge and branch are headerless tables.
dist = pd.read_csv(os.path.join(stream_outs, 'dist.tsv.gz'), sep='\t', header=0, index_col=0)
edge = pd.read_csv(os.path.join(stream_outs, 'edge.tsv.gz'), sep='\t', header=None, index_col=None)
branch = pd.read_csv(os.path.join(stream_outs, 'branch.tsv.gz'), sep='\t', header=None, index_col=None)

# Preview the head of every table, in load order, to confirm the reads worked
for table_name, table in (("dist", dist), ("edge", edge), ("branch", branch)):
    print(f"{table_name} DataFrame:")
    display(table.head())

# Overall size of the branch table
print("branch DataFrame shape:", branch.shape)

dist DataFrame:


Unnamed: 0,S0,S1,S2,S3
AAACAGCCAAGCCACT-3,0.00393,0.010666,0.006064,0.000259
AAACAGCCAAGGTGCA-1,0.000728,0.007464,0.001405,0.004918
AAACAGCCAAGTTATC-1,0.000459,0.006277,0.002593,0.004648
AAACAGCCAATAGCCC-1,0.000553,0.007288,0.001581,0.004742
AAACAGCCAGTTAGCC-1,0.000495,0.00723,0.001639,0.004684


edge DataFrame:


Unnamed: 0,0,1
0,0,1
1,0,2
2,0,3


branch DataFrame:


Unnamed: 0,0,1
0,0,3
1,0,2
2,0,1
3,0,2
4,0,2


branch DataFrame shape: (28236, 2)


In [3]:
# Build the dictys trajectory objects and write them to disk.

# Output locations inside the dictys data folder (also read by the inspection cell)
traj_file_path = os.path.join(dictys_data_path, 'traj_node.h5')
point_file_path = os.path.join(dictys_data_path, 'traj_cell_rna.h5')

# Raw arrays behind the three tables loaded from the STREAM outputs
edge_values, branch_values, dist_values = edge.values, branch.values, dist.values

# Trajectory built from the edge table and the distance table
traj = dictys.traj.trajectory.fromdist(edge_values, dist_values)
traj.to_file(traj_file_path)

# Points built on that trajectory from the branch and distance tables.
# traj=False presumably keeps the trajectory itself out of this file; TODO confirm
point = dictys.traj.point.fromdist(traj, branch_values, dist_values)
point.to_file(point_file_path, traj=False)

In [4]:
# Inspect the output .h5 files: list the top-level keys of each file, then the
# dataset summary (name, shape, dtype) stored under every key.
import h5py

# Open both files read-only inside a context manager so the HDF5 handles are
# released as soon as the cell finishes. Leaving them open would keep the files
# held by the kernel for its whole lifetime, which can block later steps that
# need to read or rewrite them.
with h5py.File(traj_file_path, 'r') as traj_file, \
        h5py.File(point_file_path, 'r') as point_file:
    # print keys
    print("traj_file keys:", traj_file.keys())
    print("point_file keys:", point_file.keys())

    # print values
    print("traj_file values:")
    for key in traj_file.keys():
        print(key, ":", traj_file[key])

    print("point_file values:")
    for key in point_file.keys():
        print(key, ":", point_file[key])

traj_file keys: <KeysViewHDF5 ['edges', 'lens']>
point_file keys: <KeysViewHDF5 ['edges', 'locs']>
traj_file values:
edges : <HDF5 dataset "edges": shape (3, 2), type "<i8">
lens : <HDF5 dataset "lens": shape (3,), type "<f8">
point_file values:
edges : <HDF5 dataset "edges": shape (28236,), type "<i8">
locs : <HDF5 dataset "locs": shape (28236,), type "<f8">


## Prepare the subsets and cell barcodes after cell filtering (28,236 cells only)
* Submit as an sbatch job

In [5]:
################# Check the subsets output #################
#Cell subset list
!head $dictys_data_path/subsets.txt

Activated_B_Cells
Day_1_Cells
Day_3_Cells
Germinal_Center
Plasma_Blast
Undefined


## Prepare configs for network inference

In [10]:
%%bash
# Regenerate the dictys makefile templates from scratch and apply the project settings.
#
# Abort on the first failing command, unset variable, or failed pipe stage. Without
# this, a failed mkdir/cd would let the template generator run in whatever directory
# the cell happened to start in.
set -euo pipefail

# Single definition of the makefile directory, quoted everywhere it is used.
MAKEFILES_DIR=/ocean/projects/cis240075p/asachan/bio_informatics_analysis/B_Cells_human_analysis/analysis_repo/multiome_dictys/makefiles

# Generate configuration template
# WARNING: destructive; any existing makefiles (including manual edits) are wiped first.
rm -Rf "$MAKEFILES_DIR"
mkdir -p "$MAKEFILES_DIR"
cd "$MAKEFILES_DIR"
dictys_helper makefile_template.sh common.mk config.mk env_none.mk dynamic.mk

# Update configurations, such as:
# DEVICE: pytorch device, e.g. cpu, cuda:0. If you do not have a GPU, use 'cpu' and expect LONG computing time.
# GENOME_MACS2: effective genome size for macs2. See https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html
# JOINT: whether dataset is joint profiling of RNA and ATAC.
# Other configurations include quality control thresholds, number of threads in each job, number of hidden confounders, etc.
# They can be obtained in the full-multiome tutorial.
dictys_helper makefile_update.py "$MAKEFILES_DIR/config.mk" '{"DEVICE": "cuda:0", "GENOME_MACS2": "hs", "JOINT": "1"}'


In [1]:
# Run the dictys makefile/data consistency check against the data folder and the generated makefiles.
# NOTE(review): the recorded output of this cell is a FileNotFoundError: cells listed in
# expression.tsv.gz have no matching bam file in the bams folder. The check reports 36306 RNA cells,
# whereas the trajectory files were built from 28,236 cells. Presumably expression.tsv.gz and/or the
# bams folder still need to be restricted to the filtered barcodes; TODO confirm before running make.
!dictys_helper makefile_check.py --dir_data /ocean/projects/cis240075p/asachan/datasets/B_Cell/multiome_1st_donor_UPMC_aggr/dictys_outs/data --dir_makefiles /ocean/projects/cis240075p/asachan/bio_informatics_analysis/B_Cells_human_analysis/analysis_repo/multiome_dictys/makefiles

Joint profile: True
Found 36306 cells with RNA profile
Found 24026 genes with RNA profile
Found 15285 cells with ATAC profile
Traceback (most recent call last):
  File "/ocean/projects/cis240075p/asachan/.conda/envs/dictys/lib/python3.9/site-packages/dictys/scripts/helper/makefile_check.py", line 149, in <module>
    raise FileNotFoundError(s)
FileNotFoundError: Not all cells with RNA profile in expression.tsv.gz has a bam file in bams folder for the joint profiling dataset. First three cells missing: GAGCCACTCCTTGCGT-3, AGGTTAGAGCACCACA-2, GCAGGAAGTACCCACC-3


#### Check the preproc function on the data separately before running it through the make system