In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('../scripts')
import numpy as np
import os, h5py
import pandas as pd
from gopher import variant_effect

In [3]:
# Read every per-regulator CAGI table, tag it with its set/regulator, and
# write one combined BED-like file with added 'end' and 'strand' columns.
cagi_data = '../../data/CAGI/raw/'
combined_filename = '../../data/CAGI/combined_cagi.bed'

all_dfs = []
# sorted() makes the combined row order reproducible (os.listdir order is
# arbitrary); the .tsv filter skips stray entries such as .ipynb_checkpoints.
for filename in sorted(os.listdir(cagi_data)):
    if not filename.endswith('.tsv'):
        continue
    # File names are expected to look like '<set>_<regulator>.tsv'
    prefix, regulator = filename.split('.tsv')[0].split('_')

    # The first 7 lines of each file are skipped (presumably a header preamble — confirm)
    one_reg = pd.read_csv(os.path.join(cagi_data, filename), skiprows=7, sep='\t', header=None)
    one_reg['regulator'] = regulator
    one_reg['set'] = prefix
    all_dfs.append(one_reg)

if not all_dfs:
    raise ValueError('No .tsv files found in ' + cagi_data)

# ignore_index avoids duplicate row labels across the concatenated files
combined_cagi = pd.concat(all_dfs, ignore_index=True)
combined_cagi.insert(4, 'strand', '+')
# NOTE(review): end = position + 1 treats column 1 as the interval start;
# confirm the coordinate convention of the raw CAGI positions.
combined_cagi.insert(2, 'end', combined_cagi.iloc[:, 1] + 1)
# Replace the chromosome column by label: a positional .iloc write of strings
# into a possibly numeric column is an in-place set that newer pandas rejects.
chrom_col = combined_cagi.columns[0]
combined_cagi[chrom_col] = 'chr' + combined_cagi[chrom_col].astype(str)
combined_cagi.to_csv(combined_filename, sep='\t', header=False, index=None)

In [3]:
# Destination BED file for the expanded intervals
output_filename = '../data/nonneg_cagi_3K.bed'
# NOTE(review): presumably widens each single-base interval in combined_filename
# to the model input window and writes the result to output_filename —
# confirm against variant_effect.expand_range.
variant_effect.expand_range(combined_filename, output_filename)

In [4]:
# Fetch the sequence for every interval in output_filename from the hg19 genome.
# The genome FASTA can be overridden with the HG19_FASTA environment variable so
# the notebook is not tied to one user's home directory; the default is unchanged.
fa_filename = '../data/cagi_3k.fa'
genome_fasta = os.environ.get('HG19_FASTA', '/home/shush/genomes/hg19.fa')
# Returns parallel lists: one coordinate string and one sequence per interval
coords_list, seqs_list = variant_effect.convert_bed_to_seq(output_filename, fa_filename, genomefile=genome_fasta)

In [5]:
# Build one-hot reference/alternate sequence pairs for every variant whose
# reference allele matches the extracted genome sequence at the variant position.
# Variants that do not match are recorded (by row number) in bad_lines.
window = 3072  # length of each extracted sequence
mid = window // 2
nonneg_df = pd.read_csv(output_filename, sep='\t', header=None)
N = len(seqs_list)

# Sequences are paired with metadata rows purely by position, so a length
# mismatch would silently misalign every variant with the wrong alleles.
assert len(coords_list) == N == len(nonneg_df), (
    'coords_list, seqs_list and ' + output_filename + ' must have the same number of entries')

# Index of the variant base inside the window, keyed by strand (1535 / 1536 for window 3072)
pos_dict = {'+': mid - 1, '-': mid}
# Pull the allele columns out once instead of doing a .iloc lookup per row
ref_alleles = nonneg_df.iloc[:, 3].to_numpy()
alt_alleles = nonneg_df.iloc[:, 4].to_numpy()

bad_lines = []
onehot_ref = []
onehot_alt = []
for i, (chr_s_e, seq) in enumerate(zip(coords_list, seqs_list)):
    # The strand is the text inside the trailing parentheses of the coordinate string
    strand = chr_s_e.split('(')[-1].split(')')[0]
    pos = pos_dict[strand]

    # NOTE(review): the comparison is case-sensitive; if the FASTA is soft-masked
    # (lowercase repeats) matching variants would land in bad_lines — confirm
    # whether convert_bed_to_seq upper-cases sequences.
    if seq[pos] != ref_alleles[i]:
        bad_lines.append(i)
        continue

    onehot = variant_effect.dna_one_hot(seq)
    mutated_onehot = onehot.copy()
    # Overwrite only the variant position with the alternate allele's encoding
    mutated_onehot[pos] = variant_effect.dna_one_hot(alt_alleles[i])[0]
    onehot_ref.append(onehot)
    onehot_alt.append(mutated_onehot)

onehot_alt = np.array(onehot_alt)
onehot_ref = np.array(onehot_ref)
# Report dropped variants instead of discarding them silently
print(f'{len(bad_lines)} of {N} variants skipped (reference allele mismatch)')

In [6]:
# Keep only the metadata rows whose variant passed the reference-allele check
# and save them next to the one-hot arrays.
keep_mask = ~nonneg_df.index.isin(bad_lines)
included_df = nonneg_df[keep_mask]
included_df.to_csv('../data/final_cagi_metadata.csv')

In [7]:
# Save the reference and alternate one-hot arrays into one HDF5 file.
# The context manager closes the file even if writing a dataset fails.
with h5py.File('../data/CAGI_onehot.h5', 'w') as onehot_ref_alt:
    onehot_ref_alt.create_dataset('ref', data=onehot_ref)
    onehot_ref_alt.create_dataset('alt', data=onehot_alt)


## Sanity check that only one nucleotide is different

In [8]:
# Re-open the saved file (left open: the next cell reads from it) and list the
# (position, channel) indices where the first ref/alt pair differs.
onehot_ref_alt = h5py.File('../data/CAGI_onehot.h5', 'r')
first_ref = onehot_ref_alt['ref'][0, :, :]
first_alt = onehot_ref_alt['alt'][0, :, :]
np.argwhere(first_ref != first_alt)

array([[1535,    0],
       [1535,    1]])

In [9]:
# Show the one-hot rows of the first ref/alt pair at index 1535,
# the position reported as differing by the previous cell.
onehot_ref_alt['ref'][0,1535,:], onehot_ref_alt['alt'][0,1535,:]

(array([0., 1., 0., 0.], dtype=float16),
 array([1., 0., 0., 0.], dtype=float16))

## Run functional variant calling with existing model

In [2]:
# Read in the created dataset. `[()]` copies each dataset fully into memory,
# so the file can be closed as soon as both arrays are loaded.
with h5py.File('../data/CAGI_onehot.h5', 'r') as onehot_ref_alt:
    ref = onehot_ref_alt['ref'][()]
    alt = onehot_ref_alt['alt'][()]

In [3]:
# Display the dimensions of the loaded reference array
ref.shape

(18442, 3072, 4)

In [4]:
# Quantitative model
# NOTE(review): this rebinds `variant_effect` to the module found on sys.path
# ('../scripts' is appended in the first cell), replacing the earlier
# `from gopher import variant_effect` — confirm both refer to the same code
# and move the import to the first cell.
import variant_effect
# NOTE(review): 2048 is presumably the batch size and '../data/cagi_robust'
# the output location — confirm against vcf_quantitative's signature.
variant_effect.vcf_quantitative('../tutorial_outputs/',ref,alt,2048,
                                '../data/cagi_robust',robust = True)

2022-03-17 18:36:53.738143: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-17 18:36:54.284081: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14257 MB memory:  -> device: 0, name: NVIDIA RTX A4000, pci bus id: 0000:c2:00.0, compute capability: 8.6
2022-03-17 18:36:55.637068: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8100
2022-03-17 18:36:57.175156: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


In [5]:
# Binary model
# NOTE(review): late import — consider moving it to the first cell with the other imports.
import tensorflow as tf
# Reset Keras global state before the next model is used
tf.keras.backend.clear_session()
# NOTE(review): the meaning of the positional -1 and 2048 arguments is not
# visible here — confirm against vcf_binary's signature.
variant_effect.vcf_binary('../tutorial_binary/files/best_model.h5',ref,alt,-1,2048,
                               '../data/cagi_binary_robust',robust=True)