In [1]:
import os, h5py
import numpy as np
import pandas as pd
import h5py
from collections import defaultdict
import csv
# Selects which JASPAR file is parsed (pfm_<organism>.txt) and which core motif
# list is used further down; only 'plants' and 'vertebrates' have a core list.
organism = 'plants'
# Seeding is disabled, so the random motif selection and every sampled sequence
# differ between runs; uncomment the next line to make runs repeatable.
#np.random.seed(22) # for reproducibility

## Process JASPAR pfm and convert to pwm

In [2]:
# Index -1 (the last entry, 'N') is used for positions that have no base set.
REVERSE_VOCAB = np.array(['A','C','G','T','N'])
def onehot_to_seq(onehot_array):
    """Decode a batch of one-hot matrices into nucleotide strings.

    Each element of ``onehot_array`` is indexed with the base axis second to
    last (A, C, G, T order). A position whose column sums to zero is decoded
    as 'N'.

    Returns a numpy array holding one string per input matrix.
    """
    def decode_one(matrix):
        # argmax reports 0 for an all-zero column; subtracting the boolean mask
        # shifts those positions to -1, which selects 'N' from REVERSE_VOCAB.
        is_blank = np.sum(matrix, axis=-2) == 0
        base_idx = np.argmax(matrix, axis=-2) - is_blank
        return ''.join(REVERSE_VOCAB[base_idx])

    return np.array([decode_one(matrix) for matrix in onehot_array])


def list_duplicates(seq):
    """Yield ``(item, positions)`` for every item occurring more than once in ``seq``.

    ``positions`` lists the indices of the item in ascending order; items are
    reported in order of first appearance.
    """
    positions = {}
    for idx, value in enumerate(seq):
        positions.setdefault(value, []).append(idx)
    return ((value, where) for value, where in positions.items()
                            if len(where) >= 2)

def get_jaspar_motifs(file_path):
    """Read a JASPAR-format count-matrix file and return column-normalised motifs.

    The file is read as consecutive 5-line records: a header line whose second
    whitespace-separated token is the motif name, followed by four rows of
    counts (one per base) written like ``A  [ 1 2 3 ]``. Trailing lines that
    do not fill a complete record are ignored.

    Parameters
    ----------
    file_path : str
        Path to the JASPAR text file.

    Returns
    -------
    tf_motifs : list of np.ndarray
        One (4, motif_length) array per record, every column divided by its sum.
    tf_names : list of str
        Motif names in file order.
    """
    def parse_counts(line):
        # Brackets may be glued to the neighbouring number ("[12" or "7]"), so
        # turn them into whitespace before splitting; token 0 is the base label.
        tokens = line.replace('[', ' ').replace(']', ' ').split()
        return np.asarray(tokens[1:], dtype=float)

    def get_motif(record):
        name = record[0].strip().split()[1]
        pfm = np.vstack([parse_counts(row) for row in record[1:5]])
        # Broadcasting divides each row by the per-position totals.
        pwm = pfm/np.sum(pfm, axis=0)
        return name, pwm

    # Read once inside a context manager so the handle is always released.
    with open(file_path) as f:
        lines = f.readlines()

    num_motifs = len(lines)//5

    tf_names = []
    tf_motifs = []
    for i in range(num_motifs):
        name, pwm = get_motif(lines[5*i:5*i+5])
        tf_names.append(name)
        tf_motifs.append(pwm)

    return tf_motifs, tf_names


In [3]:
# parse JASPAR motifs
savepath = '/home/amber/multitask_RNA/data/motif/'+organism+'/'
file_path = os.path.join(savepath, 'pfm_'+organism+'.txt')
motif_set, motif_names = get_jaspar_motifs(file_path)
total_index = np.arange(len(motif_set))

# remove duplicates: keep the first occurrence of each name and drop every
# later copy (a name can appear more than twice, so dropping only the last
# index would leave duplicates in the pool)
dup_index = [idx for _, locs in list_duplicates(motif_names) for idx in locs[1:]]
unique_index = set(total_index) - set(dup_index)

# get a subset of core motifs
if organism == 'plants':
    core_names = ['TRB1','TRB2']
elif organism == 'vertebrates':
    core_names = ['SP1', 'GABPA', 'CEBPB', 'MAX', 'Yy1']
else:
    # Fail here with a clear message instead of a NameError a few lines later.
    raise ValueError("No core motif list defined for organism %r" % organism)

strand_motifs = []
for name in core_names:
    # list.index returns the first occurrence, which the de-duplication keeps.
    index = motif_names.index(name)
    unique_index.remove(index)
    strand_motifs.append(motif_set[index])
#========================================================
# #or pure random selection
# core_names=[]
# strand_motifs=[]

# randomly select more motifs (without replacement, from the names not already
# chosen as core motifs)
num_background = 95
motif_index = np.random.choice(list(unique_index),num_background,replace=False)
core_names.extend(np.array(motif_names)[motif_index])
for index in motif_index:
    strand_motifs.append(motif_set[index])

# generate reverse complements: core_motifs alternates forward PWM, then its
# reverse complement, so motif i occupies entries 2*i and 2*i+1
core_motifs = []
for pwm in strand_motifs:
    core_motifs.append(pwm)
    # Reversing the position axis and the base axis (A<->T, C<->G in ACGT
    # order) together gives the reverse complement.
    core_motifs.append(pwm[::-1, ::-1])

## Simulation to create sequences with embedded motifs (Full random)

In [85]:

def generate_seq(motifs,seq_length,rep_num,center=False):
    """Sample sequences from a uniform background with one motif embedded in each.

    For every PWM in ``motifs`` a (rep_num, 4, seq_length) probability tensor
    is built (0.25 everywhere, the PWM at the insertion site) and one base per
    position is drawn from it by inverse-CDF sampling.

    Parameters
    ----------
    motifs : iterable of np.ndarray
        PWMs shaped (4, motif_length); columns are expected to sum to 1.
    seq_length : int
        Length of every generated sequence.
    rep_num : int
        Number of sequences generated per PWM.
    center : bool
        True places every motif in the middle of the sequence; False draws a
        uniformly random start position for each sequence.

    Returns
    -------
    sequences : np.ndarray
        Array of strings with one row per PWM and ``rep_num`` entries per row.
    insert_loci : list of np.ndarray
        For each PWM, a (rep_num, motif_length) integer array with the
        positions the motif occupies in each sequence.

    Raises
    ------
    ValueError
        If ``center`` is not a boolean or a motif is longer than ``seq_length``.
    """

    def per_motif_insertion(motif,seq_length,rep_num,center = False):
        motif_len = motif.shape[1]
        if motif_len > seq_length:
            raise ValueError('Motif is longer than the requested sequence length.')
        seq_pwm = np.ones((rep_num,4,seq_length))/4
        if center == False:
            # randint excludes the upper bound; +1 makes the last valid start
            # (motif flush with the sequence end) reachable.
            loci = np.random.randint(0,seq_length-motif_len+1,rep_num)
        elif center == True:
            loci = np.full(rep_num, (seq_length-motif_len)//2)
        else:
            raise ValueError('Enter boolean value for center field.')
        location = []
        for i, start in enumerate(loci):
            seq_pwm[i,:,start:start+motif_len] = motif
            location.append(np.arange(start,start+motif_len))

        Z = np.random.uniform(0,1,(rep_num,seq_length))
        cum_prob = seq_pwm.cumsum(axis=1)
        # The first base whose cumulative probability exceeds the draw equals
        # the number of bases whose cumulative probability does not. Capping at
        # 3 covers columns whose total rounds to just below 1.
        base_index = np.minimum((Z[:,np.newaxis,:] >= cum_prob).sum(axis=1), 3)
        one_hot_seq = np.zeros(seq_pwm.shape)
        rows, cols = np.indices(base_index.shape)
        one_hot_seq[rows,base_index,cols] = 1
        return one_hot_seq, np.array(location)

    sequence = []
    insert_loci = []
    for motif in motifs:
        seq, loci = per_motif_insertion(motif,seq_length,rep_num,center=center)
        sequence.append(onehot_to_seq(seq))
        insert_loci.append(loci)

    return np.array(sequence),insert_loci



In [86]:
# Centered insertion into 512-long sequences, 50 samples per PWM. Despite its
# name, onehot_seq holds decoded strings (generate_seq returns the
# onehot_to_seq output), with one row per entry of core_motifs.
onehot_seq,loci = generate_seq(core_motifs,512,50,center=True)

In [87]:
# Motif i owns rows 2*i and 2*i+1 of onehot_seq / loci; both are stored
# together under the motif name. The context manager closes the HDF5 handle
# even if a create_dataset call fails, so the cell can be re-run without the
# file still being held open by the kernel.
with h5py.File('/home/amber/multitask_RNA/data/motif/plants/random_center/synthetic_seq.h5','w') as file:
    for i in range(len(core_names)):
        file.create_dataset(core_names[i],data = np.hstack([onehot_seq[2*i],onehot_seq[2*i+1]]).tolist())
        file.create_dataset(core_names[i]+'_loci',data = np.vstack([loci[2*i],loci[2*i+1]]).tolist())

with open('/home/amber/multitask_RNA/data/motif/plants/random_center/selected_motif.csv', 'w') as fp:
    for item in core_names:
        # write each item on a new line
        fp.write("%s\n" % item)
 

## Di-nuc shuffle dataset

In [4]:
from gopher.dinuc_shuffle import dinuc_shuffle
from gopher.variant_effect import dna_one_hot
from datasets import load_dataset

In [5]:
# Number of background sequences drawn; each motif strand is inserted into
# every one of them.
rep_per_motif = 100
# Parquet file registered under the "test" split name.
extension = 'parquet'
data_files = {
    "test": '/home/amber/multitask_RNA/data/GPN_plant/dataset/test/Arabidopsis_thaliana.test.512.256.parquet',
}
raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=None)

Using custom data configuration default-27cf6341afd1f923
Found cached dataset parquet (/home/amber/.cache/huggingface/datasets/parquet/default-27cf6341afd1f923/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
# Pick the background sequences and one insertion start per background; the
# writer cell below reuses these same backgrounds and starts for every motif.
# motif_len is a list here; the writer cell later rebinds the name to an int.
motif_len = [i.shape[-1] for i in core_motifs]
# The longest motif bounds the start range so that any motif fits.
max_motif_len = max(motif_len)
# np.random.choice samples with replacement by default, so a background
# sequence can be picked more than once.
background_seq_index = np.random.choice(len(raw_datasets['test']),rep_per_motif)
background_seq = np.array(raw_datasets['test']['seq'])[background_seq_index]
# NOTE(review): dna_one_hot presumably returns (length, 4) per sequence — confirm in gopher.
background_onehot = np.array([dna_one_hot(seq) for seq in background_seq])
shuffle_onehot = np.array([dinuc_shuffle(onehot) for onehot in background_onehot])
# Swap the last two axes; the writer cell indexes the result as [sample, base, position].
shuffle_onehot = np.swapaxes(shuffle_onehot,1,2)
# Starts are drawn from [0, shape[1] - max_motif_len); shape[1] is taken from
# the un-swapped array (assumed to be the sequence length — TODO confirm).
insert_loci = np.random.choice(background_onehot.shape[1]-max_motif_len,rep_per_motif)


In [11]:
# Embed one sampled instance of each motif (entry 2*i of core_motifs, then
# entry 2*i+1) into every dinucleotide-shuffled background and store sequences
# and motif positions under the motif name. The context manager closes the
# HDF5 handle even if the loop fails part-way, so the cell can be re-run.
with h5py.File('/home/amber/multitask_RNA/data/motif/plants/dinuc_shuffle/synthetic_seq.h5','w') as file:
    for i in range(len(core_names)):
        motifs = [core_motifs[2*i],core_motifs[2*i+1]]
        motif_len = motifs[0].shape[-1]
        positions = np.arange(motif_len)
        location = []
        seq = []
        for motif_pwm in motifs:
            insert_seq = shuffle_onehot.copy()
            cum_prob = motif_pwm.cumsum(axis=0)
            Z = np.random.uniform(0,1,(rep_per_motif,motif_len))
            # Inverse-CDF sampling: the first base whose cumulative
            # probability exceeds the draw equals the count of bases whose
            # cumulative probability does not. Capping at 3 covers columns
            # whose total rounds to just below 1.
            base_index = np.minimum((Z[:,np.newaxis,:] >= cum_prob).sum(axis=1), 3)
            for j,loci in enumerate(insert_loci):
                motif = np.zeros(motif_pwm.shape)
                motif[base_index[j],positions] = 1
                insert_seq[j,:,loci:loci+motif_len] = motif
                location.append(np.arange(loci,loci+motif_len))
            seq.extend(onehot_to_seq(insert_seq))
        file.create_dataset(core_names[i],data = np.array(seq).tolist())
        file.create_dataset(core_names[i]+'_loci',data = location)

In [12]:
# Save the motif names, one per line, in the order held by core_names.
with open('/home/amber/multitask_RNA/data/motif/plants/dinuc_shuffle/selected_motif.csv', 'w') as fp:
    fp.writelines("%s\n" % item for item in core_names)

## Raw sequence insert

In [13]:
from gopher.dinuc_shuffle import dinuc_shuffle
from gopher.variant_effect import dna_one_hot
from datasets import load_dataset

In [14]:
# Same settings and parquet file as the di-nucleotide shuffle section; repeated
# here so this section can be run after re-assigning them.
rep_per_motif = 100
extension = 'parquet'
data_files = dict(
    test='/home/amber/multitask_RNA/data/GPN_plant/dataset/test/Arabidopsis_thaliana.test.512.256.parquet',
)
raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=None)

Using custom data configuration default-27cf6341afd1f923
Found cached dataset parquet (/home/amber/.cache/huggingface/datasets/parquet/default-27cf6341afd1f923/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

In [19]:
# Pick the unshuffled background sequences and one insertion start per
# background; the writer cell below reuses them for every motif.
# motif_len is a list here; the writer cell later rebinds the name to an int.
motif_len = [i.shape[-1] for i in core_motifs]
# The longest motif bounds the start range so that any motif fits.
max_motif_len = max(motif_len)
# np.random.choice samples with replacement by default, so a background
# sequence can be picked more than once.
background_seq_index = np.random.choice(len(raw_datasets['test']),rep_per_motif)
background_seq = np.array(raw_datasets['test']['seq'])[background_seq_index]
# NOTE(review): dna_one_hot presumably returns (length, 4) per sequence — confirm in gopher.
background_onehot = np.array([dna_one_hot(seq) for seq in background_seq])
# Swap the last two axes; the writer cell indexes the result as [sample, base, position].
background_onehot = np.swapaxes(background_onehot,1,2)
# Starts are drawn from [0, last-axis length - max_motif_len), one per background.
insert_loci = np.random.choice(background_onehot.shape[-1]-max_motif_len,rep_per_motif)


In [22]:
# Embed one sampled instance of each motif (entry 2*i of core_motifs, then
# entry 2*i+1) into every unshuffled background sequence and store sequences
# and motif positions under the motif name. The context manager closes the
# HDF5 handle even if the loop fails part-way, so the cell can be re-run.
with h5py.File('/home/amber/multitask_RNA/data/motif/plants/raw_sequence/synthetic_seq.h5','w') as file:
    for i in range(len(core_names)):
        motifs = [core_motifs[2*i],core_motifs[2*i+1]]
        motif_len = motifs[0].shape[-1]
        positions = np.arange(motif_len)
        location = []
        seq = []
        for motif_pwm in motifs:
            insert_seq = background_onehot.copy()
            cum_prob = motif_pwm.cumsum(axis=0)
            Z = np.random.uniform(0,1,(rep_per_motif,motif_len))
            # Inverse-CDF sampling: the first base whose cumulative
            # probability exceeds the draw equals the count of bases whose
            # cumulative probability does not. Capping at 3 covers columns
            # whose total rounds to just below 1.
            base_index = np.minimum((Z[:,np.newaxis,:] >= cum_prob).sum(axis=1), 3)
            for j,loci in enumerate(insert_loci):
                motif = np.zeros(motif_pwm.shape)
                motif[base_index[j],positions] = 1
                insert_seq[j,:,loci:loci+motif_len] = motif
                location.append(np.arange(loci,loci+motif_len))
            seq.extend(onehot_to_seq(insert_seq))
        file.create_dataset(core_names[i],data = np.array(seq).tolist())
        file.create_dataset(core_names[i]+'_loci',data = location)

In [23]:
# Save the motif names, one per line, in the order held by core_names.
with open('/home/amber/multitask_RNA/data/motif/plants/raw_sequence/selected_motif.csv', 'w') as fp:
    fp.write("".join("%s\n" % item for item in core_names))