In [None]:
import sys
import os
import pandas as pd
import pickle
import time
import json
import yaml
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
import matplotlib as mpl
mpl.rcParams["pdf.fonttype"] = 42
mpl.rcParams["ps.fonttype"] = 42
import scanpy as sc
import assembly
import subprocess

# Scanpy logging level: errors (0), warnings (1), info (2), hints (3).
sc.settings.verbosity = 0

# Figure defaults applied to every scanpy plot produced by this notebook.
# figsize is intentionally left at scanpy's default (a (4, 4) override was tried and disabled).
_scanpy_figure_kwargs = dict(
    scanpy=True,
    dpi=100,
    dpi_save=300,
    facecolor='white',
    frameon=True,
    vector_friendly=True,
    fontsize=12,
    color_map=None,
    format='png',
    transparent=False,
    ipython_format='png2x',
)
sc.set_figure_params(**_scanpy_figure_kwargs)

# Run scanpy single-threaded and write figures relative to the working directory.
sc.settings.n_jobs = 1
sc.settings.figdir = "./"

# Put the local Higashi / Fast-Higashi checkouts on the import path
# (presumably they are not installed as packages in this environment — confirm).
sys.path.append('/home/GitHub/3Dgenome/Higashi-main')
sys.path.append('/home/GitHub/3Dgenome/Fast-Higashi-main')
# NOTE(review): wildcard imports hide which names the notebook relies on; the cells
# below use `FastHigashi`. Consider explicit imports once the full set of used names is known.
from fasthigashi.FastHigashi_Wrapper import *
from higashi.Higashi_wrapper import *

# --- Run parameters ---------------------------------------------------------
species = 'mm10'
work_dir = '/home/spaceA'
cpu_num = 60
umap_n_neighbors = 20
tolerance = 3e-4  # values tried: 2e-5, 3e-4
embed_type = 'embed_l2_norm'

# Switches forwarded to the Fast-Higashi calls in later cells.
filter_spot = False
restore_order = False
do_conv = True
do_rwr = False
batch_norm = False

# Chromosome names of the selected assembly.
chroms = list(assembly.build(species, 1)._chromsizes.keys())

# Output directory for this run; it also becomes the working directory,
# so relative paths used later resolve inside it.
out_dir = os.path.join(work_dir, 'higashi_v2', 'fasthigashi', 'intergrade_v2')
os.makedirs(out_dir, exist_ok=True)
os.chdir(out_dir)

# NOTE(review): yaml.FullLoader can construct more than plain data types;
# yaml.safe_load would be the safer choice if this file is ever not fully trusted.
with open("/home/spaceA/config.yaml", "r") as config_handle:
    config = yaml.load(config_handle, Loader=yaml.FullLoader)

# All sample ids declared in the project config, and the subset used in this run.
sampleid_list = config['spatial_infor'].keys()
kept_samples = [
    'E11.5L1', 'E11.5L2',
    'E12.5L5', 'E12.5L6',
    'E13.5C1', 'E13.5C4', 'E13.5C6',
    'E14.5F5', 'E14.5F6',
]
len(kept_samples)

9

In [2]:
# Display the samples selected for this run.
kept_samples

['E11.5L1',
 'E11.5L2',
 'E12.5L5',
 'E12.5L6',
 'E13.5C1',
 'E13.5C4',
 'E13.5C6',
 'E14.5F5',
 'E14.5F6']

## 2. Prepare the Fast-Higashi input

Make the file list and prepare the per-spot contact files.

In [3]:
# Directory that holds one entry per spot, named '<sample>@@<spot>.contact.tsv.gz',
# so that spots from all samples can be fed to Fast-Higashi as a single dataset.
out_intergrade_fasthigashi_spot_dir = os.path.join(work_dir, 'higashi_pre_v2/intergrade')
os.makedirs(out_intergrade_fasthigashi_spot_dir, exist_ok=True)

# filelist.txt lists the per-spot contact files, one path per line, in the order
# samples appear in kept_samples and spots appear in each sample's obs table.
out_spot_higashi_filelst_f = os.path.join(out_dir, 'filelist.txt')
if not os.path.exists(out_spot_higashi_filelst_f):
    # Write to a temporary file and rename at the end: an interrupted run must not
    # leave a truncated filelist.txt that the existence check above would accept.
    filelist_tmp_f = out_spot_higashi_filelst_f + '.tmp'
    with open(filelist_tmp_f, 'w') as f:
        for sample_name in tqdm(kept_samples):
            print(sample_name)
            spot_infor_f = os.path.join(work_dir, 'higashi_v2', 'fasthigashi', sample_name + '_fasthigashi_obs_new.csv')
            spot_infor_tmp = pd.read_csv(spot_infor_f, index_col=0)
            # Per-sample source directory; constant for every spot of this sample.
            in_spot_higashi_dir = os.path.join(work_dir, f'higashi_pre_v2/{sample_name}')
            for spot in spot_infor_tmp.spot_id.tolist():  # keep the spot order of the obs table in the file
                spot_f_path = os.path.join(in_spot_higashi_dir, f'{spot}.contact.tsv.gz')
                out_spot_lnk_path = os.path.join(out_intergrade_fasthigashi_spot_dir, f'{sample_name}@@{spot}.contact.tsv.gz')
                # Create the link only when nothing exists at the target yet, so links or
                # files prepared by an earlier run are left untouched and re-runs are safe.
                if not os.path.lexists(out_spot_lnk_path):
                    os.symlink(spot_f_path, out_spot_lnk_path)
                f.write(f'{out_spot_lnk_path}\n')
    os.replace(filelist_tmp_f, out_spot_higashi_filelst_f)


In [4]:
# Stack the per-sample obs tables into one frame, in kept_samples order.
# The filelist cell iterates the same samples and the same tables in the same order,
# so row i of this frame corresponds to line i of filelist.txt.
spot_infor_frames = []
for sample_name in kept_samples:
    spot_infor_f = os.path.join(work_dir, 'higashi_v2', 'fasthigashi', sample_name + '_fasthigashi_obs_new.csv')
    spot_infor_frames.append(pd.read_csv(spot_infor_f, index_col=0))

# Concatenate once: growing a frame inside the loop re-copies all previous rows on every
# iteration, and seeding with an empty DataFrame relies on deprecated pandas behaviour.
# Each table keeps its own row index, exactly as before.
spot_infor_all = pd.concat(spot_infor_frames) if spot_infor_frames else pd.DataFrame()
del spot_infor_frames

# The row index is not written to the combined CSV (index=False).
spot_infor_all.to_csv(os.path.join(work_dir, 'higashi_v2', 'fasthigashi', 'fasthigashi_obs_new_keptsamples_v2.csv'), index=False)
spot_infor_all.head()

Unnamed: 0,spot_id,sample,spot_raw_read_num,spot_raw_C_num,spot_raw_ct_num,spot_fcsize_read_num,spot_fcsize_C_num,spot_fcsize_ct_num,spot_fcsize_cis_ct_num,spot_fcsize_cis_ct_num_ratio,...,fasthigashi_kmeans_10,fasthigashi_kmeans_11,fasthigashi_kmeans_12,fasthigashi_kmeans_13,fasthigashi_kmeans_14,fasthigashi_kmeans,leiden_res_1.00,fasthigashi_leiden,fasthigashi_leiden_anno,fasthigashi_leiden_anno_man
0,oddBo81.evenBo61,E11.5L1,14962,5303,362025,11101,1442,362025,75446,0.2084,...,7,6,6,7,5,2,1,0,,
1,oddBo36.evenBo34,E11.5L1,31340,3912,12754783,18944,1053,2504051,363528,0.145176,...,3,3,7,8,12,0,3,2,Liver,
2,oddBo41.evenBo54,E11.5L1,13177,4897,694965,9278,998,694965,116855,0.168145,...,5,4,11,10,3,0,3,2,Liver,
3,oddBo24.evenBo53,E11.5L1,18320,2051,3261907,14842,664,2167785,275702,0.127181,...,8,2,8,5,13,5,0,1,Brain,
4,oddBo20.evenBo85,E11.5L1,18307,4150,2318148,13778,941,1446288,162796,0.112561,...,1,5,0,12,2,5,0,1,Brain,


In [5]:
# Per-spot labels stored as {column name: list of values}, in the row order of spot_infor_all.
label_columns = ['fasthigashi_leiden_anno_man', 'fasthigashi_leiden_anno', 'sample']
label_info_new = spot_infor_all[label_columns].to_dict(orient='list')

# Pickle the label dictionary into the run's output directory.
label_info_f = os.path.join(out_dir, 'label_info.pickle')
with open(label_info_f, 'wb') as label_handle:
    pickle.dump(label_info_new, label_handle)


In [None]:
## Build the Fast-Higashi JSON configuration from the project template
template_config = os.path.join(work_dir, "higashi_v2/fasthigashi", 'config_higashi_template_v1.JSON')
config_fasthigashi = os.path.join(out_dir, 'config_fasthigashi_v1.JSON')

# Read the template first; the file does not need to stay open while it is edited.
with open(template_config, 'r') as template_handle:
    config_template = json.load(template_handle)

# Sex chromosomes and the mitochondrial chromosome are left out of the model.
excluded_chroms = ('chrM', 'chrX', 'chrY')

# Run-specific values written over the template entries.
config_overrides = {
    'config_name': 'intergrade',
    'data_dir': out_dir,
    'temp_dir': os.path.join(out_dir, 'temp_fasthigashi'),
    'input_format': "higashi_v2",
    'header_included': False,
    'cpu_num': cpu_num,
    'gpu_num': 0,
    'contact_header': ["chrom1", "pos1", "chrom2", "pos2", "count"],
    'neighbor_num': 5,
    'optional_smooth': False,
    'plot_label': 'fasthigashi_leiden_anno_man',
    'chrom_list': [chrom for chrom in chroms if chrom not in excluded_chroms],
    'genome_reference_path': "/home/mm10.mainchr.sizes",
    'cytoband_path': "/home/spaceA/higashi_v2/fasthigashi/mm10_cytoBand.txt",
}
config_template.update(config_overrides)
# Nested entry: the template must already contain a 'UMAP_params' section.
config_template['UMAP_params']['n_neighbors'] = umap_n_neighbors

with open(config_fasthigashi, 'w') as config_out:
    json.dump(config_template, config_out, indent=4)


## 3. Run the Fast-Higashi model on its own or as an initialization for Higashi
### 3.1 Initialize Fast-Higashi model and turn sparse matrices into sparse tensors

In [7]:

# Arguments passed to FastHigashi (descriptions follow the notes kept with this notebook):
#   config_path       Path to the configuration JSON file created above.
#   path2input_cache  Directory where the cached tensor file is stored;
#                     None -> use the temp_dir from the JSON file.
#   path2result_dir   None -> same fallback as above. NOTE(review): the earlier note described
#                     this as the cached-tensor directory too, which looks like a copy-paste —
#                     confirm against the Fast-Higashi documentation.
#   off_diag          Maximum number of diagonals to consider; with 100, the 0-100th
#                     diagonals are considered.
#   filter            Whether to learn the meta-interactions only from cells that pass the
#                     quality-control standard and then infer embeddings for the rest of the cells.
#   do_conv           Whether to use linear convolution; for high-coverage data at coarser
#                     resolution, False is recommended.
#   do_rwr            Whether to use partial random walk with restart; for high-coverage
#                     data the differences are minor.
#   do_col            Whether to use sqrt_vc normalization; the program uses it
#                     automatically when needed.
#   no_col            Whether to force the program not to use sqrt_vc normalization.
# batch_norm (batch-corrected normalization) is not passed here; it is given to
# prep_dataset in a later cell.
fh_init_kwargs = dict(
    config_path=config_fasthigashi,
    path2input_cache=None,
    path2result_dir=None,
    off_diag=100,
    filter=filter_spot,
    do_conv=do_conv,
    do_rwr=do_rwr,
    do_col=False,
    no_col=False,
)
fh_model = FastHigashi(**fh_init_kwargs)


setting to gpu:0 available memory = 5111 MB


In [8]:
# Convert contact pairs into sparse matrices stored on disk.
# The step is skipped when the sparse matrix of the first chromosome is already present.
first_chrom_adj_f = os.path.join(fh_model.temp_dir, "raw", "%s_sparse_adj.npy" % fh_model.chrom_list[0])
sparse_mtx_missing = not os.path.exists(first_chrom_adj_f)
if sparse_mtx_missing:
    start = time.time()
    fh_model.fast_process_data()
    elapsed = time.time() - start
    print("contact pairs to sparse mtx takes: %.2f s" % elapsed)


generating start/end dict for chromosome
extracting from filelist.txt


  0%|          | 0/40564 [00:00<?, ?it/s]

fast process finishes
contact pairs to sparse mtx takes: 427.31 s


In [9]:
# Pack the on-disk sparse matrices into sparse tensors.
# batch_norm is False for this run because no batch_id is provided.
start = time.time()
fh_model.prep_dataset(batch_norm=batch_norm)
elapsed = time.time() - start
print("packing sparse mtx takes: %.2f s" % elapsed)


total number of cells that pass qc check 23291 bad 17273 total: 40564
cache file = /home/goubo/CRICK/CRICK/spaceA/higashi_v2/fasthigashi/intergrade_v2/temp_fasthigashi/cache_intra_500000_offdiag_100_.pkl
saving cached input to /home/goubo/CRICK/CRICK/spaceA/higashi_v2/fasthigashi/intergrade_v2/temp_fasthigashi/cache_intra_500000_offdiag_100_.pkl


sparse mtx into tensors:   0%|          | 0/19 [00:00<?, ?it/s]

breaking into batches:   0%|          | 0/19 [00:00<?, ?it/s]

sparsity 0.17845394468090295
do_conv True do_rwr False do_col False
recommend_bs_cell [3688, 4057, 4508, 4508, 5071, 5071, 5071, 2536, 2705, 2536, 2705, 2705, 2705, 2705, 3381, 3688, 3688, 4057, 6761] pinning memory
packing sparse mtx takes: 1233.48 s


In [10]:
# Train the Fast-Higashi model.
#   extra  Free-form identifier appended to the model name when the model is saved.
#   tol    Convergence tolerance. The original manuscript used 3e-4; smaller values were
#          later found to possibly perform better on some data.
fh_model.run_model(
    tol=tolerance,
    rank=256,
    n_iter_parafac=1,
    extra="",
)



dim1_0.6_rank_256_niterp_1_
n_iter_parafac 1
empty params initialized
time elapsed: 0.53


initializing params:   0%|          | 0/19 [00:00<?, ?it/s]

rwr iters: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
time elapsed: 93.53
finish init
Starting iteration 0

PARAFAC2 re=3.156 takes 50.0s
Starting iteration 1

PARAFAC2 re=0.830 9.31e-01 variation min8.8e-01 at chrom 18, max9.5e-01 at chrom 0 takes 44.3s
Starting iteration 2

PARAFAC2 re=0.796 8.16e-02 variation min4.0e-02 at chrom 0, max1.0e-01 at chrom 16 takes 44.7s
Starting iteration 3

PARAFAC2 re=0.780 3.94e-02 variation min2.6e-02 at chrom 0, max1.0e-01 at chrom 18 takes 44.8s
Starting iteration 4

PARAFAC2 re=0.777 7.56e-03 variation min4.0e-03 at chrom 0, max5.8e-02 at chrom 18 takes 44.9s
Starting iteration 5

PARAFAC2 re=0.776 2.28e-03 variation min7.2e-04 at chrom 0, max2.2e-02 at chrom 18 takes 44.9s
Starting iteration 6

PARAFAC2 re=0.776 1.05e-03 variation min8.7e-05 at chrom 1, max8.8e-03 at chrom 18 takes 44.9s
Starting iteration 7

PARAFAC2 re=0.775 6.19e-04 variation min-5.2e-05 at chrom 1, max4.9e-03 at chrom 18 takes 44.9s
Starting iteration 8

PARAFAC2 re=0.775 4.21e

In [11]:
# Reload the trained model from disk (optional right after training).
# The arguments must match the ones given to run_model so the same saved model is found.
fh_model.load_model(
    rank=256,
    n_iter_parafac=1,
    extra="",
)


model loaded


In [12]:
# Fetch the cell embeddings.
# `embed` is a dictionary holding the embeddings after different kinds of post-processing;
# 'embed_l2_norm' or 'embed_l2_norm_correct_coverage_fh' usually gives the best results
# (the latter applies a linear correction for sequencing-depth bias).
embed = fh_model.fetch_cell_embedding(
    restore_order=restore_order,
    final_dim=256,
)


fetching embedding


In [None]:
# Linear batch correction using the 'sample' label.
# NOTE(review): the key listing printed after this step contains 'embed_correct_sample' and
# 'embed_l2_norm_correct_sample', so this call presumably adds those entries to the `embed`
# dict fetched earlier — confirm, and make sure this cell is run after the embedding is fetched.
fh_model.correct_batch_linear("sample")

In [14]:
# List the post-processing variants available in the embedding dictionary.
print(embed.keys())

dict_keys(['embed_all', 'embed_raw', 'embed_l2_norm', 'restore_order', 'embed_correct_coverage_fh', 'embed_l2_norm_correct_coverage_fh', 'embed_correct_sample', 'embed_l2_norm_correct_sample'])


In [15]:
# Save the embedding dictionary next to the other outputs of this run
# (the path is relative to the current working directory).
embed_f = 'embed.pickle'
if not os.path.exists(embed_f):
    # Dump to a temporary file and rename at the end, so an interrupted write cannot
    # leave a truncated pickle that the existence check would accept on the next run.
    embed_tmp_f = embed_f + '.tmp'
    with open(embed_tmp_f, 'wb') as f:
        pickle.dump(embed, f)
    os.replace(embed_tmp_f, embed_f)
else:
    # Make the skip visible: the file on disk may come from an earlier run with other settings.
    print(f'{embed_f} already exists; keeping it (delete the file to save the current embedding)')
