In [None]:

import sys
import os
import pandas as pd
import pickle
import time
import json
import yaml
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
import matplotlib as mpl
mpl.rcParams["pdf.fonttype"] = 42
mpl.rcParams["ps.fonttype"] = 42
import scanpy as sc
sys.path.append('/home/goubo/spt')
import assembly

# Scanpy runtime settings for this notebook.
sc.settings.verbosity = 0  # errors (0), warnings (1), info (2), hints (3)
# figsize is intentionally not passed (it was commented out in the original call).
sc.set_figure_params(
    scanpy=True,
    dpi=100,
    dpi_save=300,
    facecolor='white',
    frameon=True,
    vector_friendly=True,
    fontsize=12,
    color_map=None,
    format='png',
    transparent=False,
    ipython_format='png2x',
)
sc.settings.n_jobs = 1
# Relative figure directory: resolved against the current working directory.
sc.settings.figdir = "./"

sys.path.append('/home/goubo/GitHub/3Dgenome/Higashi-main')
sys.path.append('/home/goubo/GitHub/3Dgenome/Fast-Higashi-main')
from fasthigashi.FastHigashi_Wrapper import *
from higashi.Higashi_wrapper import *

# --- Run configuration: module-level names read by `main` and the driver loop ---
# Genome assembly name handed to `assembly.build`.
species='mm10'
# Chromosome names: the keys of the assembly's chromosome-size table.
# NOTE(review): relies on the private `_chromsizes` attribute of the project-local
# `assembly` module — confirm there is no public accessor.
chroms=list(assembly.build(species, 1)._chromsizes.keys())
# Root directory: holds the JSON config template and one output sub-directory per sample.
work_dir='/home/spaceA/higashi/fasthigashi'
# Written to the `cpu_num` field of the generated Higashi config.
cpu_num=60
# Passed as `filter=` to the FastHigashi constructor.
filter_spot=False
# Written to `UMAP_params -> n_neighbors` in the generated Higashi config.
umap_n_neighbors=20
# Passed as `tol=` to `run_model`.
tolerance= 2e-5 
# Passed as `restore_order=` to `fetch_cell_embedding`.
restore_order=True
# Passed as `do_conv=` / `do_rwr=` to the FastHigashi constructor.
do_conv=True
do_rwr=False
# Not referenced by the visible cells; presumably the key used to pick one variant
# from the embedding dictionary returned by `fetch_cell_embedding` — TODO confirm.
embed_type='embed_l2_norm'

# Project-wide YAML config; its `spatial_infor` mapping supplies the sample ids to process.
# NOTE(review): `yaml.FullLoader` can construct more than plain data types; if this file
# only contains plain mappings/lists/scalars, `yaml.safe_load` would be the safer choice.
with open("/home/spaceA/config.yaml", "r") as f:
    config = yaml.load(f, Loader=yaml.FullLoader)


In [None]:
def _format_cluster_label(value):
    """Build the ``'cluster_<id>'`` label for one Leiden cluster id.

    Integral floats are rendered as integers (``3.0`` -> ``'cluster_3'``).
    pandas upcasts an integer column to float as soon as a left merge leaves a
    single row unmatched, which would otherwise rename every cluster of that
    sample to ``'cluster_<id>.0'``. NaN (an unmatched spot) still yields
    ``'cluster_nan'``; any other value is passed through ``str`` unchanged.
    """
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return 'cluster_' + str(value)


def _write_higashi_config(sample_name, out_dir):
    """Fill the shared JSON template for one sample, write it into ``out_dir`` and return its path.

    Reads the module-level ``work_dir``, ``cpu_num``, ``umap_n_neighbors`` and ``chroms``.
    """
    template_path = os.path.join(work_dir, 'config_higashi_template_v1.JSON')
    config_path = os.path.join(out_dir, 'config_higashi_v1.JSON')

    with open(template_path, 'r') as f:
        higashi_config = json.load(f)

    higashi_config.update({
        'config_name': sample_name,
        'data_dir': out_dir,
        'temp_dir': os.path.join(out_dir, 'temp_fasthigashi'),
        'input_format': "higashi",
        'header_included': False,
        'cpu_num': cpu_num,
        'contact_header': ["chrom1", "pos1", "chrom2", "pos2", "count"],
        # Must match the key stored in label_info.pickle (see _write_label_info).
        'plot_label': 'spot_cluster',
        'neighbor_num': 5,
        'optional_smooth': False,
        # Mitochondrial and sex chromosomes are excluded.
        'chrom_list': [x for x in chroms if x not in ['chrM', 'chrX', 'chrY']],
        # NOTE(review): both reference paths are mm10-specific and do not follow
        # the module-level `species` setting.
        'genome_reference_path': "/home/mm10.mainchr.sizes",
        'cytoband_path': "/home/spaceA/fasthigashi/mm10_cytoBand.txt",
    })
    # setdefault: tolerate a template that has no UMAP_params section yet.
    higashi_config.setdefault('UMAP_params', {})['n_neighbors'] = umap_n_neighbors

    with open(config_path, 'w') as f:
        json.dump(higashi_config, f, indent=4)
    return config_path


def _write_label_info(sample_name, out_dir):
    """Write ``label_info.pickle`` holding one externally computed Leiden label per spot.

    The labels come from a separate clustering run (not from Higashi). They are
    emitted in the row order of the per-sample spot table.

    Raises:
        ValueError: if the merge changes the number of rows, i.e. the labels
            would no longer line up one-to-one with the spots.
    """
    spot_infor_f = os.path.join('/home/spaceA/SpatialSPRITE_res/Filter_Spot_v4_tmp',
                                f'{sample_name}_spot_infor_final.csv')
    spot_cluster_infor_f = os.path.join('/home/spaceA/SpatialSPRITE_res/Leiden.0',
                                        sample_name + '_obs.csv')
    spot_infor = pd.read_csv(spot_infor_f)
    spot_cluster_infor = pd.read_csv(spot_cluster_infor_f, index_col=0)

    # Left merge keeps every spot (and its order) even when it has no cluster.
    spot_infor_merge = spot_infor.merge(spot_cluster_infor,
                                        left_on='spot_id', right_on='spotid', how='left')
    # Duplicate `spotid` rows in the cluster table would silently duplicate
    # spots and shift every following label; fail loudly instead.
    if len(spot_infor_merge) != len(spot_infor):
        raise ValueError(
            f'{sample_name}: merging cluster labels changed the row count from '
            f'{len(spot_infor)} to {len(spot_infor_merge)}; check {spot_cluster_infor_f} '
            f'for duplicated spotid values')

    n_unassigned = int(spot_infor_merge['leiden'].isna().sum())
    if n_unassigned:
        print(f'Warning: {n_unassigned} spot(s) of {sample_name} have no Leiden cluster '
              f'and are labelled cluster_nan')

    assigned_clusters = spot_infor_merge['leiden'].apply(_format_cluster_label).to_list()

    # Explicit path: do not depend on the current working directory.
    with open(os.path.join(out_dir, 'label_info.pickle'), 'wb') as f:
        pickle.dump({'spot_cluster': assigned_clusters}, f)


def _run_fast_higashi(config_path, out_dir):
    """Train Fast-Higashi for one sample and pickle the embedding dictionary to ``out_dir/embed.pickle``.

    Reads the module-level ``filter_spot``, ``do_conv``, ``do_rwr``, ``tolerance``
    and ``restore_order``.
    """
    # One value shared by training, model loading and the final embedding size.
    rank = 256

    # Constructor arguments (descriptions follow the upstream tutorial comments):
    #   config_path       path to the configuration JSON file created above
    #   path2input_cache  directory for the cached tensor files; None -> use the
    #                     temp_dir from the JSON file
    #   path2result_dir   directory for the results; None -> taken from the JSON
    #                     file as well
    #   off_diag          maximum number of diagonals to consider; at 100 the
    #                     0-100th diagonals are considered
    #   filter            whether to learn the meta-interactions only from cells
    #                     that pass quality control and then infer the embeddings
    #                     for the rest of the cells
    #   do_conv           whether to use linear convolution (at coarser resolution
    #                     for high-coverage data, False is recommended)
    #   do_rwr            whether to use partial random walk with restart (for
    #                     high-coverage data the differences are minor)
    #   do_col            whether to use sqrt_vc normalization; the program
    #                     enables it automatically when needed
    #   no_col            whether to force the program not to use sqrt_vc
    #                     normalization
    fh_model = FastHigashi(config_path=config_path,
                           path2input_cache=None,
                           path2result_dir=None,
                           off_diag=100,
                           filter=filter_spot,
                           do_conv=do_conv,
                           do_rwr=do_rwr,
                           do_col=False,
                           no_col=False)

    # From contact pairs to sparse matrices stored on disk. Skipped when the
    # file for the first chromosome already exists (only that one is checked).
    first_chrom_cache = os.path.join(fh_model.temp_dir, "raw",
                                     "%s_sparse_adj.npy" % fh_model.chrom_list[0])
    if not os.path.exists(first_chrom_cache):
        start = time.time()
        fh_model.fast_process_data()
        print("contact pairs to sparse mtx takes: %.2f s" % (time.time() - start))

    # Pack the sparse matrices into sparse tensors.
    start = time.time()
    fh_model.prep_dataset(batch_norm=False)  # no batch_id is provided, so batch-corrected normalization is off
    print("packing sparse mtx takes: %.2f s" % (time.time() - start))

    fh_model.run_model(extra="",  # free-form identifier appended to the saved model name
                       rank=rank,
                       n_iter_parafac=1,
                       tol=tolerance)  # the original manuscript used 3e-4; smaller values may perform better on some data

    # Reload the trained model; optional when the model has just been trained.
    fh_model.load_model(extra="", rank=rank, n_iter_parafac=1)

    # The returned object is a dictionary of embeddings after different kinds
    # of post-processing. 'embed_l2_norm' or 'embed_l2_norm_correct_coverage_fh'
    # usually yields the best results; the latter applies a linear correction
    # of sequencing-depth bias.
    embed = fh_model.fetch_cell_embedding(final_dim=rank,
                                          restore_order=restore_order)

    # Explicit path: do not depend on the current working directory.
    with open(os.path.join(out_dir, 'embed.pickle'), 'wb') as f:
        pickle.dump(embed, f)
    return embed


def main(sample_name):
    """Run the Fast-Higashi pipeline for one sample.

    Steps, all writing into ``<work_dir>/<sample_name>``:
      1. write the per-sample Higashi JSON config (``config_higashi_v1.JSON``);
      2. write ``label_info.pickle`` with externally computed Leiden labels;
      3. train Fast-Higashi and write the embeddings to ``embed.pickle``.

    Args:
        sample_name: sample id; used as the output sub-directory name and to
            locate the per-sample input CSV files.

    Side effects:
        Creates the output directory and makes it the process working directory.
    """
    print(f'\nNow process:{sample_name}')
    out_dir = os.path.join(work_dir, sample_name)
    os.makedirs(out_dir, exist_ok=True)
    # Kept for compatibility: anything resolving relative paths (for example a
    # relative figure directory) keeps landing in the sample directory. The
    # files written by this function use explicit paths and do not rely on it.
    os.chdir(out_dir)

    ## 1. Prepare the Higashi JSON config
    config_higashi = _write_higashi_config(sample_name, out_dir)

    ## 2. Prepare the Higashi label_info from Leiden clusters computed by another method
    _write_label_info(sample_name, out_dir)

    ## 3. Run the Fast-Higashi model (on its own, or as an initialization for Higashi)
    _run_fast_higashi(config_higashi, out_dir)

    print(f'{sample_name} done!')

In [None]:

# Run the pipeline once for every sample id declared under `spatial_infor`
# in the YAML config; tqdm reports per-sample progress.
sampleid_list = config['spatial_infor'].keys()
sample_bar = tqdm(sampleid_list)
for sample in sample_bar:
    main(sample)
