In [None]:
from genepy.epigenetics import chipseq as chip
from genepy.utils import helper as h
import pandas as pd
import os 
import dalmatian as dm # give link to dalmatian
from depmapomics.config_prod import *
import multiprocessing
from depmapomics import loading, tracker
%load_ext autoreload
%autoreload 2


In [None]:
# Copy the guide-region BED file into /tmp/; later cells read it as /tmp/filtering.bed.
! gsutil cp gs://fc-4a2230c7-3b51-4476-8c82-84c15d3962f8/filtering.bed /tmp/
# Degree of parallelism handed to h.parrun and multiprocessing.Pool further down.
CORES=16

In [None]:
# Fetch the sample table of the WGS workspace through dalmatian.
wm = dm.WorkspaceManager(WGSWORKSPACE)
samp = wm.get_samples()

# Keep only the samples whose `cnn_filtered_vcf` entry is filled in.
vcfs = samp['cnn_filtered_vcf']
vcfslist = vcfs.dropna().tolist()

# Scratch directory that receives the downloaded VCFs and the derived BED files.
h.createFoldersFor('/tmp/vcfs/')

# The guide file is headerless and tab-separated, so column names are supplied here.
bed_columns = ['chrom', 'start', 'end', 'foldchange']
guides_bed = pd.read_csv("/tmp/filtering.bed", sep='\t', header=None, names=bed_columns)

In [None]:
def _vcf_to_bed_command(vcf_uri):
    """Build the shell pipeline that turns one remote VCF into a local BED file.

    The pipeline copies the VCF into /tmp/vcfs/, indexes it, extracts the
    records overlapping /tmp/filtering.bed with `bcftools query`, writes them
    to /tmp/vcfs/loc_<name>.bed and finally removes the local VCF and its index.

    Args:
        vcf_uri: path of the VCF as accepted by `gsutil cp`.

    Returns:
        str: a single `&&`-chained shell command line.
    """
    vcf_name = vcf_uri.split('/')[-1]
    local_vcf = "/tmp/vcfs/" + vcf_name
    # <name> is everything before the first dot of the VCF file name.
    loc_bed = "/tmp/vcfs/loc_" + vcf_name.split('.')[0] + ".bed"
    return (
        "gsutil cp " + vcf_uri + " " + local_vcf + "&& "
        + " bcftools index " + local_vcf + " && "
        + " bcftools query "
        # `\\.` keeps a literal backslash-dot in the expression without relying
        # on Python passing an unknown escape sequence (`\.`) through unchanged.
        # NOTE(review): the three conditions are AND-ed, so a record is excluded
        # only when all of them hold -- confirm this is the intended filter.
        + "  --exclude \"FILTER!='PASS'&GT!='mis'&GT!~'\\.'\" "
        + "  --regions-file /tmp/filtering.bed "
        # `\\t` reaches bcftools as the two characters backslash-t; the `\n`
        # is a real newline embedded inside the single-quoted format string.
        + "  --format '%CHROM\\t%POS\\t%END\\t%ALT{0}\n' " + local_vcf + " >"
        + " " + loc_bed + " &&"
        + " rm " + local_vcf + "*"
    )


# One command per VCF, executed CORES at a time.
cmd = [_vcf_to_bed_command(sam) for sam in vcfslist]
h.parrun(cmd, cores=CORES)

In [None]:
def stuff(file):
    """Project one sample's variant BED file onto the guide regions.

    Args:
        file: name of a `loc_<sample>.bed` file located in /tmp/vcfs/
            (tab-separated, no header, four columns).

    Returns:
        tuple: `(sample, values)` where `sample` is the file name without the
        `loc_` prefix and without extension, and `values` is the result of
        `chip.putInBed(guides_bed, bed, mergetype='sum')`, or `None` when the
        file contains no rows.
    """
    bed = pd.read_csv('/tmp/vcfs/'+file, sep='\t', header=None, names=['chrom', 'start', 'end', 'foldchange'])
    # The fourth column read from the file is discarded: every row gets the
    # value 1 (presumably so that the 'sum' merge counts rows per guide).
    bed['foldchange'] = 1
    # Strip only the leading "loc_" tag. Splitting on every underscore would
    # truncate sample names that themselves contain an underscore.
    name = file.split('/')[-1].split('.')[0].split('_', 1)[1]
    if len(bed)==0:
        return (name, None)
    val = chip.putInBed(guides_bed, bed, mergetype='sum')
    return (name, val)

# Only the per-sample BED files are inputs here; anything else left in the
# directory would make `stuff` fail. Sorting makes the column order of the
# resulting matrix reproducible, since os.listdir order is arbitrary.
loc_files = sorted(
    fname for fname in os.listdir('/tmp/vcfs/')
    if fname.startswith('loc_') and fname.endswith('.bed')
)

# The context manager shuts the worker processes down once map() has returned,
# instead of leaving CORES idle processes attached to the kernel.
with multiprocessing.Pool(CORES) as pool:
    res = pool.map(stuff, loc_files)
print('done pooling')

# Per-sample values are attached to a sorted, re-indexed copy of the guides.
# NOTE(review): assumes chip.putInBed returns its values in this sorted row order -- confirm.
sorted_guides_bed = guides_bed.sort_values(by=["chrom", "start", "end"]).reset_index(drop=True)
for name, val in res:
    # Samples whose BED file was empty come back as None and get no column.
    if val is not None:
        sorted_guides_bed[name] = val

print('saving matrix')
sorted_guides_bed.to_csv('../temp/binary_mutguides_wgs.tsv.gz', sep='\t')

In [None]:
################# merge wgs and wes matrices ###################
# Both files are tab-separated; the row index is read back from the
# 'Unnamed: 0' column.
matrix_read_opts = dict(sep='\t', index_col='Unnamed: 0')
wgs_mat = pd.read_csv("../temp/binary_mutguides_wgs.tsv.gz", **matrix_read_opts)
wes_mat = pd.read_csv("../temp/binary_mutguides_wes.tsv.gz", **matrix_read_opts)

In [None]:
# Skip the leading guide columns so that only per-sample columns remain.
# The WGS matrix was saved with chrom/start/end/foldchange first; the WES
# matrix is presumably laid out the same way -- TODO confirm.
n_guide_cols = 4
wgs_mat_noguides = wgs_mat.iloc[:, n_guide_cols:]
wes_mat_noguides = wes_mat.iloc[:, n_guide_cols:]

In [None]:
# Display the WGS matrix without its guide columns as a visual check.
wgs_mat_noguides

In [None]:
import json


def _load_renaming(path):
    """Read one sample-renaming JSON file and return the decoded object.

    Args:
        path: location of the JSON file.

    Returns:
        Whatever `json.load` decodes from the file (used later as a mapping
        from matrix column names to their new names).
    """
    # JSON is read as UTF-8 explicitly rather than with the platform default.
    with open(path, encoding='utf-8') as json_file:
        return json.load(json_file)


wgs_renaming = _load_renaming('../temp/22Q2/wgs_sample_renaming.json')
wes_renaming = _load_renaming('../temp/22Q2/wes_sample_renaming.json')


In [None]:
def _columns_with_renaming(matrix, renaming):
    """Return the columns of `matrix` that are keys of `renaming`, in column order."""
    return [col for col in matrix.columns if col in renaming]


wgs_whitelist = _columns_with_renaming(wgs_mat_noguides, wgs_renaming)
wes_whitelist = _columns_with_renaming(wes_mat_noguides, wes_renaming)

len(wgs_whitelist)

In [None]:
# Keep the whitelisted sample columns and rename them in one pass each.
wgs_whitelist_mat = wgs_mat_noguides[wgs_whitelist].rename(columns=wgs_renaming)
wes_whitelist_mat = wes_mat_noguides[wes_whitelist].rename(columns=wes_renaming)
wgs_whitelist_mat

In [None]:
# WGS takes precedence: a WES column is appended only when WGS has no column
# of the same name.
wgs_columns = wgs_whitelist_mat.columns
wes_to_append = [col for col in wes_whitelist_mat.columns if col not in wgs_columns]
wes_only_mat = wes_whitelist_mat[wes_to_append]
mergedmat = wgs_whitelist_mat.join(wes_only_mat)

In [None]:
# List any merged column whose name does not start with 'ACH-'.
unexpected_columns = [col for col in mergedmat.columns.tolist() if not col.startswith('ACH-')]
unexpected_columns

In [None]:
# binarize the matrix: every non-zero entry becomes 1, zero stays 0
# NOTE(review): astype(bool) also turns NaN into True, so any missing value
# left by the join that built `mergedmat` would become 1 -- confirm none are expected.
mergedmat = mergedmat.astype(bool).astype(int)

In [None]:
# Put the four guide columns back in front of the merged sample matrix;
# rows are matched on the index.
guide_columns = sorted_guides_bed.iloc[:, :4]
sorted_mat = guide_columns.join(mergedmat)
sorted_mat

In [None]:
# Write `end` as an integer column, then save the matrix without the row index.
sorted_mat = sorted_mat.astype({'end': int})
sorted_mat.to_csv('../temp/merged_binary_germline.csv', index=False)

In [None]:
from taigapy import TaigaClient

# Description of the single file being added to the dataset.
germline_upload = {
    "path": '../temp/merged_binary_germline.csv',
    "format": "TableCSV",
    "encoding": "utf-8",
}

tc = TaigaClient()

# add_all_existing_files=True is passed so the dataset's current files are
# kept alongside the new one (per the flag's name -- confirm against taigapy).
tc.update_dataset(
    dataset_permaname=TAIGA_CN_ACHILLES,
    changes_description="add binary germline matrix",
    upload_files=[germline_upload],
    add_all_existing_files=True,
)

In [None]:
# Display the final matrix (guide columns followed by one column per sample).
sorted_mat

In [None]:
# Spot check a single sample: keep its column plus the guide coordinates,
# then retain only the guides with a non-zero entry.
arx = 'ACH-000550'
spot_df = sorted_mat.loc[:, ['chrom', 'start', 'end', arx]]
spot_hits = spot_df.loc[spot_df[arx].ne(0)]
spot_hits

In [None]:
# Narrow the spot check to chrX guides starting after position 797471.
on_chrx = spot_hits['chrom'] == 'chrX'
past_position = spot_hits['start'] > 797471
spot_hits[on_chrx & past_position]