In [1]:
# General imports and shared configuration (paths, chromosomes, replicates)
import numpy as np
import pysam
from tqdm.notebook import tqdm
import h5py
import pandas as pd
import matplotlib.pyplot as plt
# Seeded generator so the random draws made with `rng` are reproducible
rng = np.random.default_rng(seed=0)

# Root of the shared project tree and the cell type under analysis
basedir = '/data/leslie/shared/ASA/'
ctype = 'cd8'

# aligndir: where the peak atlas is read from
# datadir:  where the HOMER FASTA inputs and motif results live
aligndir = basedir + 'pseudodiploid/cutrun/' + ctype + '/tbet/'
datadir = basedir + 'mouseASA/' + ctype + '/cast/data/cutrun/tbet/'
chroms = list(range(1, 20))          # chromosomes 1 through 19

reps = ['r1', 'r2']
seqlen = 300                         # region around summit for sequence
save = True

## Get CUT&RUN peaks FASTA for HOMER enrichment

In [78]:
# Load the tab-separated, header-less peak atlas
peaks = pd.read_csv(f'{aligndir}peaks/peakatlas.bed', sep='\t', header=None)

# Keep only peaks whose column-11 score exceeds -log10(idr_thresh)
idr_thresh = 0.05
passes_idr = (peaks[11] > -np.log10(idr_thresh)).to_numpy()
idx = np.flatnonzero(passes_idr)

# Retain the first three columns for the surviving rows and renumber them from 0
peaks = peaks.iloc[idx, :3].reset_index(drop=True)
peaks.head()

Unnamed: 0,0,1,2
0,1,7088481,7089187
1,1,7397472,7398405
2,1,9700302,9700986
3,1,9943782,9944348
4,1,10037923,10038440


In [39]:
# Get the b6 sequences corresponding to CUT&RUN peaks
from Bio import SeqIO
from bisect import bisect

# Open an on-disk index over the b6 genome FASTA so each chromosome can be
# fetched by name without loading the whole file.
gen_b6 = SeqIO.index(f'{basedir}pseudodiploid/gen/b6.fa', 'fasta')

# x accumulates one sequence string per peak, grouped in the order of `chroms`.
x = []
try:
    for c in tqdm(chroms):
        # get_raw returns the record's raw bytes; drop the first line (the FASTA
        # header) and join the rest into one contiguous string. splitlines()
        # also strips '\r', so CRLF-terminated files do not leak carriage
        # returns into the sequences.
        seq_b6 = ''.join(gen_b6.get_raw(str(c)).decode().splitlines()[1:])
        chrompeaks = peaks.iloc[np.where(peaks[0]==c)[0],:]     # slice out the relevant chromosome peaks

        # get relevant b6 genomic seqs; columns 1 and 2 are used directly as
        # the Python slice bounds [l, r) into the chromosome string
        x += [seq_b6[l:r] for l,r in zip(chrompeaks[1],chrompeaks[2])]
finally:
    # Release the index's file handle even if a chromosome lookup or decode fails
    gen_b6.close()

  0%|          | 0/19 [00:00<?, ?it/s]

In [45]:
# Save the positive sequences
# Write one FASTA record per sequence in x; the header is the 1-based position in x
with open(datadir+'homer/positives.fa', 'w') as fasta_out:
    for record_id, sequence in enumerate(x, start=1):
        fasta_out.write(f'>{record_id}\n')
        fasta_out.write(sequence+'\n')

## Get negative background peaks

In [48]:
# For this, simply load `x_train_b6_unegs` from any of our datasets
from utils import unhot
# Read the full 'x_train_b6_unegs' dataset into memory
with h5py.File(f'{basedir}mouseASA/{ctype}/cast/data/data_vi_150bp_aug.h5','r') as h5:
    neg_encoded = h5['x_train_b6_unegs'][()]

# Draw as many negatives as there are positives, without replacement
keep = rng.choice(len(neg_encoded), len(x), replace=False)

# Decode the selected entries with the project's unhot helper
# (presumably one-hot -> sequence strings; confirm against utils.unhot)
x_neg = unhot(neg_encoded[keep])

In [58]:
# Save the negative sequences
# Write one FASTA record per sequence in x_neg; the header is the 1-based position in x_neg
with open(datadir+'homer/negatives.fa', 'w') as fasta_out:
    for record_id, sequence in enumerate(x_neg, start=1):
        fasta_out.write(f'>{record_id}\n')
        fasta_out.write(sequence+'\n')

## HOMER de novo motif analysis

In [69]:
# Collect motif names from the HOMER motif file: for every line starting with
# '>', take the text before the first tab and strip the leading '>'.
motifs = []
with open(datadir+'homer/homermotifs.txt', 'r') as motif_file:
    motifs.extend(
        row.partition('\t')[0][1:]
        for row in motif_file
        if row.startswith('>')
    )

In [74]:
# Show the motif names one per line (nothing is printed when the list is empty)
if motifs:
    print(*motifs, sep='\n')

CGCGGCGCCG
SCGCSCSGCS
CCGGACGCGG
VGTCGCGCGN
VGCCMCGCSS
CGCGCAGGCG
NCCGCGGCGN
CGGCAGCCGC
CGTCACGTCG
YCCCCGCCWS
CGCGCCGGGT
CTCGCGATCG
ACGGACGTCG
ACGGAGCCTA
AGGGCGGGAA
CGCGGTTCGA
TGGGAGKTGT
GCDGGGCCTC
MAGGCTTCCT
GGTTAGGGTT
CAGACGGCAC
TTGGCTGAAG
CCTAACCCTA
GATTGGCTGT
RWAGGTGTGA
