### Get padded CDS and transcript sequences for the transcript set chosen with ORFquant

In [1]:
import pandas as pd
import numpy as np
import math

def convertFastaToDict(fastaFile):
    '''
    Read one or more FASTA files into a dict of {sequenceName: sequence}.

    fastaFile: path to a FASTA file, or a list/tuple of such paths. Paths
        ending in '.gz' are decompressed on the fly.

    Returns a dict keyed by the first whitespace-delimited token of each '>'
    header line. Each value is the sequence of that record with the line
    breaks removed. If a name occurs more than once, the last record wins.
    Input that contains no record gives an empty dict.

    Blank lines and lines starting with '#' are ignored.

    Raises ValueError if sequence data appears before the first header, or
    if a header line carries no name.
    '''
    import gzip

    if isinstance(fastaFile, (list, tuple)):
        files = fastaFile
    else:
        files = [fastaFile]

    seqDict = {}
    currentName = None   # name of the record being read; None until the first '>' line
    currentChunks = []   # sequence lines of that record, joined once when the record ends
    for currentFile in files:
        # gzip.open() defaults to binary mode; 'rt' makes it yield str lines, like open()
        opener = gzip.open if str(currentFile).endswith('.gz') else open
        with opener(currentFile, 'rt') as f:
            for line in f:
                stripped = line.strip()
                if not stripped or line.startswith('#'):  # ignore empty lines and commented out lines
                    continue
                if line.startswith('>'):  # > marks the start of a new sequence
                    if currentName is not None:
                        seqDict[currentName] = ''.join(currentChunks)
                    # gencode headers have extra fields after some whitespace that do not
                    # match the GTF files, so only the first token is kept as the name
                    nameTokens = stripped[1:].split()
                    if not nameTokens:
                        raise ValueError('FASTA header without a name in %s: %r' % (currentFile, line))
                    currentName = nameTokens[0]
                    currentChunks = []
                elif currentName is None:
                    raise ValueError('sequence data before the first ">" header in %s: %r' % (currentFile, line))
                else:
                    currentChunks.append(stripped)
    # the last record is not followed by a header, so it is stored here
    if currentName is not None:
        seqDict[currentName] = ''.join(currentChunks)
    return seqDict

def get_CDS_seq(transcript, pad_5p, pad_3p, seqs=None):
    '''
    Return the CDS of one transcript, extended by pad_5p bases upstream and
    pad_3p bases downstream.

    transcript: full '|'-delimited FASTA name of the transcript. It is the key
        into seqs, and its 'CDS:<start>-<end>' field gives the CDS position
        within the transcript (used here as 1-based, inclusive coordinates).
    pad_5p, pad_3p: number of flanking bases to add on each side. The padding
        is cut off at the ends of the transcript.
    seqs: optional dict of {transcript name: sequence}. Defaults to the
        notebook-level transcript_seqs.

    Returns the upper-cased padded CDS, or None if the name has no 'CDS:'
    field. Raises KeyError if the transcript is not a key of seqs.
    '''
    if seqs is None:
        seqs = transcript_seqs
    transcript_seq = seqs[transcript]
    for info_part in transcript.split('|'):
        if not info_part.startswith('CDS:'):
            continue
        CDS_positions = [int(pos) for pos in info_part.split(':')[1].split('-')]
        # 1-based inclusive coordinates -> 0-based half-open slice. The 5' bound is
        # clamped at 0 because a negative slice start would count from the 3' end.
        slice_start = max((CDS_positions[0] - 1) - pad_5p, 0)
        slice_end = CDS_positions[1] + pad_3p
        return transcript_seq[slice_start:slice_end].upper()
    return None

In [2]:
#download the GENCODE human release 35 pc_transcripts FASTA (gzipped) into ../annotations
! curl -L ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_35/gencode.v35.pc_transcripts.fa.gz -o ../annotations/gencode.v35.pc_transcripts.fa.gz
#decompress it and append the reporter transcript FASTA, giving one uncompressed FASTA with both
! gunzip -c ../annotations/gencode.v35.pc_transcripts.fa.gz | cat - ../annotations/reporter/UGAC_reporter_transcript.fa > ../annotations/gencode.v35.pBZ105.pc_transcripts.fa

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 41.5M  100 41.5M    0     0  4819k      0  0:00:08  0:00:08 --:--:-- 6069k


In [3]:
#load all transcript sequences into a dict keyed by the first whitespace-delimited token of each FASTA header
TRANSCRIPT_FASTA = '../annotations/gencode.v35.pBZ105.pc_transcripts.fa'
transcript_seqs = convertFastaToDict(TRANSCRIPT_FASTA)

In [4]:
def get_tx_from_file(file_name):
    '''
    Read a text file with one transcript ID per line and return the IDs as a set.

    Whitespace around each line is stripped. Blank lines are skipped, so the
    returned set never contains the empty string.
    '''
    # the with-block closes the file even if reading fails part-way
    with open(file_name) as f:
        stripped_lines = (line.strip() for line in f)
        return {tx for tx in stripped_lines if tx}

In [5]:
def write_padded_CDS_seqs(transcript_file, outfile_name, pad_5p, pad_3p):
    '''
    Write the padded CDS of every selected transcript to a FASTA file.

    transcript_file: text file of transcript IDs, read with get_tx_from_file.
    outfile_name: FASTA file to write; an existing file is overwritten.
    pad_5p, pad_3p: flanking bases, passed on to get_CDS_seq.

    A transcript in transcript_seqs is selected when the first '|'-delimited
    field of its name is in the ID set. Each record keeps the full name as its
    header and has the sequence on a single line.

    Transcripts for which get_CDS_seq returns None are skipped and reported
    in a warning; formatting None with %s would otherwise write the text
    'None' as their sequence.
    '''
    import warnings

    transcript_subset = get_tx_from_file(transcript_file)
    without_cds = []
    # the with-block closes the output file even if a lookup or write fails part-way
    with open(outfile_name, 'w') as outfile:
        for transcript in transcript_seqs:
            if transcript.split('|')[0] not in transcript_subset:
                continue
            cds_seq = get_CDS_seq(transcript, pad_5p, pad_3p)
            if cds_seq is None:
                without_cds.append(transcript)
                continue
            outfile.write('>%s\n%s\n' % (transcript, cds_seq))
    if without_cds:
        warnings.warn('skipped %d transcript(s) without a CDS annotation, e.g. %s'
                      % (len(without_cds), without_cds[0]))

def write_subset_tx_seqs(transcript_file, outfile_name):
    '''
    Write the full sequence of every selected transcript to a FASTA file.

    transcript_file: text file of transcript IDs, read with get_tx_from_file.
    outfile_name: FASTA file to write; an existing file is overwritten.

    A transcript in transcript_seqs is selected when the first '|'-delimited
    field of its name is in the ID set. Each record keeps the full name as its
    header and has the sequence, unchanged, on a single line.
    '''
    transcript_subset = get_tx_from_file(transcript_file)
    # the with-block closes the output file even if a write fails part-way
    with open(outfile_name, 'w') as outfile:
        for transcript, seq in transcript_seqs.items():
            if transcript.split('|')[0] in transcript_subset:
                outfile.write('>%s\n%s\n' % (transcript, seq))

In [6]:
#text file listing the transcripts to keep, one ID per line (it is read with get_tx_from_file)
#presumably written by the ORFquant step in ../1_orfquant - TODO confirm
tx_file = '../1_orfquant/orfquant_tx_stops_collapsed.txt'

In [7]:
import os

#flanking bases kept on each side of the CDS; the same two numbers go into the output file name
PAD_5P = 15
PAD_3P = 15
padded_cds_fasta = os.path.join('../annotations/', 'orfquant_CDS_%d_%d.fa' % (PAD_5P, PAD_3P))
write_padded_CDS_seqs(tx_file, padded_cds_fasta, PAD_5P, PAD_3P)
#full-length sequences of the same transcript set
write_subset_tx_seqs(tx_file, '../annotations/orfquant_tx.fa')

### generate salmon indexes

In [8]:
#generate the decoys list passed to salmon index (-d): the name of every sequence in the genome FASTA, one per line
#these are the human chromosome names; presumably also the pBZ105 reporter, going by the file name - TODO confirm
#first command: keep the '>' header lines and cut each one at the first space
! grep "^>" < ../annotations/genome/GRCh38.primary_assembly.genome.pBZ105.fa | cut -d " " -f 1 > decoys.txt
#second command: remove the '>' characters in place (sed keeps the unedited file as decoys.txt.bak)
! sed -i.bak -e 's/>//g' decoys.txt

In [9]:
#now generate "gentrome" files, which concatenate the genome and transcriptome
!mkdir gentromes
import os
import subprocess
for tx_file in ['orfquant_CDS_15_15.fa', 'orfquant_tx.fa']:
    tx_path = os.path.join('../annotations/', tx_file)
    prefix = tx_file[:-3]
    gentrome_path = os.path.join('gentromes/', prefix+'.gentrome.fa.gz')
    cmd="cat {tx_path} ../annotations/genome/GRCh38.primary_assembly.genome.pBZ105.fa | gzip > {gentrome_path}".format(**locals())
    print(cmd)
    #subprocess.Popen(cmd, shell=True).wait()
    ! {cmd}

mkdir: cannot create directory ‘gentromes’: File exists
cat ../annotations/orfquant_CDS_15_15.fa ../annotations/genome/GRCh38.primary_assembly.genome.pBZ105.fa | gzip > gentromes/orfquant_CDS_15_15.gentrome.fa.gz
cat ../annotations/orfquant_tx.fa ../annotations/genome/GRCh38.primary_assembly.genome.pBZ105.fa | gzip > gentromes/orfquant_tx.gentrome.fa.gz


In [10]:
#now generate a salmon index from each gentrome
!mkdir indices
import os
import subprocess
tx_files = ['orfquant_CDS_15_15.fa', 'orfquant_tx.fa']
for tx_file in sorted(tx_files):
    for k in [13, 31]:
        prefix = tx_file[:-3]
        gentrome_path = os.path.join('gentromes/', prefix+'.gentrome.fa.gz')
        index_path = os.path.join('indices/', prefix)
        #use K=31 for RNAseq (~75bp reads, and 13 for profiling)
        cmd="salmon index -t {gentrome_path} -d decoys.txt t -p 40 -i {index_path}_k{k} -k {k} --gencode ".format(**locals())
        #pasted into terminal
        print(cmd)
        ! {cmd}
    #subprocess.Popen(cmd, shell=True).wait()

mkdir: cannot create directory ‘indices’: File exists
salmon index -t gentromes/orfquant_CDS_15_15.gentrome.fa.gz -d decoys.txt t -p 40 -i indices/orfquant_CDS_15_15_k13 -k 13 --gencode 
Version Info: This is the most recent version of salmon.
[2020-11-24 21:39:01.728] [jLog] [info] building index
out : indices/orfquant_CDS_15_15_k13
[00m[2020-11-24 21:39:01.728] [puff::index::jointLog] [info] Running fixFasta
[00m
[Step 1 of 4] : counting k-mers

[00m[00m[2020-11-24 21:40:18.466] [puff::index::jointLog] [info] Replaced 151,122,963 non-ATCG nucleotides
[00m[00m[2020-11-24 21:40:18.466] [puff::index::jointLog] [info] Clipped poly-A tails from 3 transcripts
[00mwrote 24440 cleaned references
[00m[2020-11-24 21:40:22.963] [puff::index::jointLog] [info] Filter size not provided; estimating from number of distinct k-mers
[00m[00m[2020-11-24 21:40:57.170] [puff::index::jointLog] [info] ntHll estimated 35642314 distinct k-mers, setting filter size to 2^30
[00mThreads = 40
Vertex le

[00m[00m[2020-11-24 22:36:45.895] [puff::index::jointLog] [info] finished populating pos vector
[00m[00m[2020-11-24 22:36:45.895] [puff::index::jointLog] [info] writing index components
[00m[00m[2020-11-24 22:36:46.031] [puff::index::jointLog] [info] finished writing dense pufferfish index
[00m[2020-11-24 22:36:47.565] [jLog] [info] done building index
salmon index -t gentromes/orfquant_CDS_15_15.gentrome.fa.gz -d decoys.txt t -p 40 -i indices/orfquant_CDS_15_15_k31 -k 31 --gencode 
Version Info: This is the most recent version of salmon.
[2020-11-24 22:36:47.798] [jLog] [info] building index
out : indices/orfquant_CDS_15_15_k31
[00m[2020-11-24 22:36:47.798] [puff::index::jointLog] [info] Running fixFasta
[00m
[Step 1 of 4] : counting k-mers
[00m
[00m[00m[2020-11-24 22:38:04.067] [puff::index::jointLog] [info] Replaced 151,122,963 non-ATCG nucleotides
[00m[00m[2020-11-24 22:38:04.067] [puff::index::jointLog] [info] Clipped poly-A tails from 3 transcripts
[00mwrote 24439 

[00m[00m[2020-11-24 22:53:30.067] [puff::index::jointLog] [info] finished populating pos vector
[00m[00m[2020-11-24 22:53:30.067] [puff::index::jointLog] [info] writing index components
[00m[00m[2020-11-24 22:53:44.879] [puff::index::jointLog] [info] finished writing dense pufferfish index
[00m[2020-11-24 22:53:45.957] [jLog] [info] done building index
salmon index -t gentromes/orfquant_tx.gentrome.fa.gz -d decoys.txt t -p 40 -i indices/orfquant_tx_k13 -k 13 --gencode 
Version Info: This is the most recent version of salmon.
[2020-11-24 22:53:46.151] [jLog] [info] building index
out : indices/orfquant_tx_k13
[00m[2020-11-24 22:53:46.151] [puff::index::jointLog] [info] Running fixFasta
[00m
[Step 1 of 4] : counting k-mers

[00m[00m[2020-11-24 22:55:02.697] [puff::index::jointLog] [info] Replaced 151,122,963 non-ATCG nucleotides
[00m[00m[2020-11-24 22:55:02.697] [puff::index::jointLog] [info] Clipped poly-A tails from 221 transcripts
[00mwrote 24512 cleaned references
[00m

[00m[00m[2020-11-24 23:51:52.982] [puff::index::jointLog] [info] finished populating pos vector
[00m[00m[2020-11-24 23:51:52.982] [puff::index::jointLog] [info] writing index components
[00m[00m[2020-11-24 23:51:53.105] [puff::index::jointLog] [info] finished writing dense pufferfish index
[00m[2020-11-24 23:51:54.794] [jLog] [info] done building index
salmon index -t gentromes/orfquant_tx.gentrome.fa.gz -d decoys.txt t -p 40 -i indices/orfquant_tx_k31 -k 31 --gencode 
Version Info: This is the most recent version of salmon.
[2020-11-24 23:51:55.053] [jLog] [info] building index
out : indices/orfquant_tx_k31
[00m[2020-11-24 23:51:55.054] [puff::index::jointLog] [info] Running fixFasta
[00m
[Step 1 of 4] : counting k-mers

[00m[00m[2020-11-24 23:53:10.030] [puff::index::jointLog] [info] Replaced 151,122,963 non-ATCG nucleotides
[00m[00m[2020-11-24 23:53:10.030] [puff::index::jointLog] [info] Clipped poly-A tails from 221 transcripts
[00mwrote 24512 cleaned references
[00m

[00m[00m[2020-11-25 00:09:28.969] [puff::index::jointLog] [info] finished populating pos vector
[00m[00m[2020-11-25 00:09:28.969] [puff::index::jointLog] [info] writing index components
[00m[00m[2020-11-25 00:09:42.691] [puff::index::jointLog] [info] finished writing dense pufferfish index
[00m[2020-11-25 00:09:44.012] [jLog] [info] done building index
