In [None]:
import os
import numpy as np
import pandas as pd
import multiprocessing

In [None]:
# Sequence length in base pairs for each chromosome, keyed by 'chr'-prefixed name.
# NOTE(review): the values look like the GRCh38/hg38 primary assembly - confirm.
chromosome_length = {
    'chr1': 248956422,
    'chr2': 242193529,
    'chr3': 198295559,
    'chr4': 190214555,
    'chr5': 181538259,
    'chr6': 170805979,
    'chr7': 159345973,
    'chr8': 145138636,
    'chr9': 138394717,
    'chr10': 133797422,
    'chr11': 135086622,
    'chr12': 133275309,
    'chr13': 114364328,
    'chr14': 107043718,
    'chr15': 101991189,
    'chr16': 90338345,
    'chr17': 83257441,
    'chr18': 80373285,
    'chr19': 58617616,
    'chr20': 64444167,
    'chr21': 46709983,
    'chr22': 50818468,
    'chrX': 156040895,
    'chrY': 57227415,
}

# Chromosomes processed by the merge steps, in output order: chr1..chr22, then chrX and chrY.
normal_chromosome = [f'chr{number}' for number in range(1, 23)] + ['chrX', 'chrY']

---


In [None]:
# Common root of every folder this notebook reads from or writes to.
_records_root = './record/records'

# index_dir holds the score arrays, prepare_dir the matching parquet tables (same
# sub-folder and file stem), and save_dir receives the thresholded tables.
index_dir = f'{_records_root}/predict_joint'
prepare_dir = f'{_records_root}/prepare_joint'
save_dir = f'{_records_root}/predict_result/single_results'

In [None]:
# Pair every score array under index_dir with the same-named parquet table under
# prepare_dir, attach the scores as a 'prob' column, and store the rows scoring >= 0.5
# under save_dir using the same sub-folder / file-stem layout.
child_dirs = os.listdir(index_dir)
for child in child_dirs:
    index_files = os.listdir(os.path.join(index_dir, child))
    for index_file in index_files:
        file_name = os.path.splitext(index_file)[0]
        index_path = os.path.join(index_dir, child, index_file)
        prepare_file = os.path.join(prepare_dir, child, f'{file_name}.parquet')

        # Raw scores are divided by 100 so the 0.5 cut-off below applies to the rescaled
        # values (presumably they are stored as percentages - TODO confirm).
        prob = np.load(index_path) / 100
        prepare_data = pd.read_parquet(prepare_file)

        # Scores are matched to rows purely by position, so the lengths must agree.
        if len(prepare_data) != len(prob):
            raise ValueError(
                f'Row count mismatch: {prepare_file} has {len(prepare_data)} rows '
                f'but {index_path} has {len(prob)} scores'
            )

        scored = prepare_data.assign(prob=prob)
        confident = scored[scored['prob'] >= 0.5]

        # Created per file (not per folder) on purpose: an input folder without files
        # must not leave an empty output folder behind for the merge step.
        save_path = os.path.join(save_dir, child)
        os.makedirs(save_path, exist_ok=True)
        confident.to_parquet(os.path.join(save_path, f'{file_name}.parquet'))

    print(f'Finish {child}')

---

In [None]:
def overlap_value_list(df, chr_name, chrom_len=None):
    """Sum interval values per base position along one chromosome.

    Columns are read by position: column 1 is the interval start, column 2 the
    interval end and column 3 the value to add. Each interval contributes its
    value to the 0-based array positions ``start - 1`` through ``end - 1``
    inclusive, i.e. ``start`` is treated as 1-based and ``end`` as inclusive.

    Args:
        df: DataFrame with at least four columns laid out as described above.
        chr_name: Key into ``chromosome_length`` used to size the result.
        chrom_len: Optional explicit length; when given, ``chromosome_length``
            is not consulted.

    Returns:
        A float64 array of length ``chrom_len`` whose element ``k`` is the sum
        of the values of all intervals covering position ``k``.

    Raises:
        KeyError: ``chrom_len`` is None and ``chr_name`` is not in
            ``chromosome_length``.
        IndexError: a start or end falls outside ``[0, chrom_len]`` after the
            start shift.
    """
    if chrom_len is None:
        chrom_len = chromosome_length[chr_name]

    starts = df.iloc[:, 1].values - 1
    ends = df.iloc[:, 2].values
    probs = df.iloc[:, 3].values

    # Validate before allocating the chromosome-sized buffer. A negative index would
    # otherwise wrap to the end of the buffer instead of failing, so an interval with
    # start 0 would be subtracted at its end without ever having been added.
    for label, positions in (('start - 1', starts), ('end', ends)):
        if ((positions < 0) | (positions > chrom_len)).any():
            raise IndexError(
                f'{chr_name}: {label} outside [0, {chrom_len}] in interval table'
            )

    # Difference array: +value where an interval begins, -value just past its end.
    # The extra trailing slot absorbs the "-value" of intervals ending at chrom_len.
    diff = np.zeros(chrom_len + 1, dtype=np.float64)

    # ufunc.at accumulates repeated indices, which plain fancy-index += would not.
    np.add.at(diff, starts, probs)
    np.add.at(diff, ends, -probs)

    # The running sum turns the difference array into per-position coverage.
    return np.cumsum(diff[:-1])

def merge_single_chrom_result(dir_name, dir_list, save_dir, min_prob=0.49):
    """Build and store a per-position summed-probability track for each chromosome.

    For every name in ``dir_list`` all parquet files in ``dir_name/<name>`` are
    concatenated, converted to a per-position track with ``overlap_value_list``
    and written to ``save_dir/<name>/<name>_site_result.parquet`` with the
    columns ``site`` (0-based array position) and ``prob``.

    Args:
        dir_name: Folder containing one sub-folder per chromosome.
        dir_list: Sub-folder names to process; each is also passed to
            ``overlap_value_list`` as the chromosome name.
        save_dir: Output root; sub-folders are created as needed.
        min_prob: Track values below this are set to 0 before saving.
            NOTE(review): when every input row has prob >= 0.5, as produced by
            the thresholding cell in this notebook, a covered position sums to
            at least 0.5, so the 0.49 default presumably only clears
            floating-point residue of the running sum - confirm.
    """
    for chrom in dir_list:
        print(f'Start {chrom}')
        chrom_dir = os.path.join(dir_name, chrom)
        frames = [
            pd.read_parquet(os.path.join(chrom_dir, name))
            for name in os.listdir(chrom_dir)
        ]
        if not frames:
            # pd.concat raises on an empty list; report and move on, as the later
            # stages of this notebook do for missing inputs.
            print(f'{chrom} has no result files, skipped')
            continue

        temp_chr_result = pd.concat(frames, axis=0)
        print(f'{chrom} have {len(temp_chr_result)}')

        overlap_list = overlap_value_list(temp_chr_result, chrom)
        overlap_list[overlap_list < min_prob] = 0
        overlap_result = pd.DataFrame({
            'site': np.arange(len(overlap_list)),
            'prob': overlap_list,
        })

        save_path = os.path.join(save_dir, chrom)
        os.makedirs(save_path, exist_ok=True)
        overlap_result.to_parquet(os.path.join(save_path, f'{chrom}_site_result.parquet'))
        print(f'Finish store {chrom}')
        print('----')

In [None]:
# Input: the thresholded per-chromosome tables; output: one per-position track per chromosome.
# NOTE: this rebinds `save_dir`, which the thresholding cell also uses as its output
# folder - re-run the path configuration cell before re-running that cell.
dir_name = './record/records/predict_result/single_results'
save_dir = './record/records/predict_result/single_merged_results'
dir_list = os.listdir(dir_name)

merge_single_chrom_result(dir_name=dir_name, dir_list=dir_list, save_dir=save_dir)

---

In [None]:
# Keep only the positions with a non-zero prob from each per-chromosome track and
# store them as <chrom>_log.parquet.
file_path = './record/records/predict_result/single_merged_results'
save_path = './record/records/predict_result/single_merged_results_log'
# to_parquet does not create missing parent folders.
os.makedirs(save_path, exist_ok=True)

# `chrom` rather than `chr`: a module-level loop variable named `chr` would shadow
# the builtin for the rest of the kernel session.
for chrom in normal_chromosome:
    file = os.path.join(file_path, chrom, f'{chrom}_site_result.parquet')
    if not os.path.exists(file):
        print(f'{chrom} is not exist')
        continue

    data = pd.read_parquet(file)
    # NOTE(review): this cell used to compute round(log10(prob + 1), 1) but never stored
    # it, so the "_log" files have always held the untransformed prob values. The dead
    # computation was dropped to keep the output unchanged; if the log scale was
    # intended, assign it to data['prob'] before filtering - confirm.
    data = data[data['prob'] != 0]
    data.to_parquet(os.path.join(save_path, f'{chrom}_log.parquet'))
    print(f'{chrom} is finished')

In [None]:
def merge_data(name_list,
               log_dir='./record/records/predict_result/single_merged_results_log',
               save_path='./record/records/predict_result/merged_results'):
    """Concatenate the per-chromosome ``<name>_log.parquet`` files into one table.

    Each existing file is read, tagged with a ``chr`` column holding its name and
    reduced to the columns ``chr``, ``site``, ``prob``. The tables are stacked in
    ``name_list`` order and written to ``save_path/all_predict_result_log.parquet``.
    Names without a file are reported and skipped.

    Args:
        name_list: Chromosome names to look for, in output order.
        log_dir: Folder containing the ``<name>_log.parquet`` inputs.
        save_path: Output folder; created if missing.

    Raises:
        ValueError: none of the requested files exists.
    """
    frames = []
    for name in name_list:
        file_name = f'{log_dir}/{name}_log.parquet'
        if os.path.exists(file_name):
            data = pd.read_parquet(file_name)
            # A scalar broadcasts to every row; no per-row Python list is needed.
            frames.append(data.assign(chr=name)[['chr', 'site', 'prob']])
            print(f'Finish {name}')
        else:
            print(f'{name} is not exist')

    if not frames:
        # pd.concat would raise the same exception type with a far less helpful message.
        raise ValueError(f'No *_log.parquet file found in {log_dir} for the requested names')

    # to_parquet does not create missing parent folders.
    os.makedirs(save_path, exist_ok=True)
    merged = pd.concat(frames, axis=0)
    merged.to_parquet(os.path.join(save_path, 'all_predict_result_log.parquet'))
    print('Finish all !!!')

In [None]:
# Stack the per-chromosome files for every name in normal_chromosome into one table.
merge_data(normal_chromosome)

---

In [None]:
# Reload the combined table written by merge_data (columns selected there: chr, site, prob).
all_result = pd.read_parquet('./record/records/predict_result/merged_results/all_predict_result_log.parquet')

In [None]:
# Express every site as the one-base interval [site, site + 1) and keep only the
# four columns to export, in chr / start / end / prob order.
all_result_ = all_result.assign(
    start=all_result['site'],
    end=all_result['site'] + 1,
)[['chr', 'start', 'end', 'prob']]

In [None]:
# Export as a tab-separated text file with a .bed extension; neither the column
# header nor the row index is written.
all_result_.to_csv('./record/records/predict_result/merged_results/all_predict_result_log_bed.bed',
                   header=False, index=False, sep='\t')