__Looking at second uORF__

In Homo sapiens, the uORF is located 138 nucleotides upstream of the main AUG, and the 5'-UTR is 334 nucleotides long in total (POLG-201 from Ensembl). The uORF itself is 72 nucleotides long. I think I will keep only sequences whose 5'-UTR is longer than 150 nucleotides and try to align the entire 5'-UTRs of those sequences.

I decided to add Sarcophilus harrisii to the ignore list: the fact that it has the CUG does not mean that its uORFs are conserved, and it also disrupts the alignment.

This is additionally interesting because 4 of the 5 species on the ignore list are marsupials, and the fifth is Camelus_ferus. For some reason, camel genomes always seem to be very gappy when aligned with other species. Perhaps the quality of the camel sequences deposited at NCBI is poor?

The consensus Kozak sequence for the upstream AUG is quite decent.





In [45]:
from Bio.Seq import Seq
from Bio import Entrez
from Bio import SeqIO
from Bio.Alphabet import IUPAC
import matplotlib
import matplotlib.pyplot as plt
% matplotlib inline
import numpy as np
from Bio.Align.Applications import MuscleCommandline
import os
from Bio import SeqIO
import csv
from itertools import islice
from Bio.Emboss.Applications import TranalignCommandline

In [46]:
# Working directory with a trailing separator; output paths below are built from it.
cwd = f"{os.getcwd()}{os.sep}"

# BLAST result files (GenBank records and text hit table), relative to the working directory.
BLAST_genbank = 'Representative_Species/tblastn_refseqrna_mammalia_queryhomosapiens_1-12-20.gb'
BLAST_textoutput = 'Representative_Species/tblastn_refseqrna_mammalia_queryhomosapiens_1-12-20.txt'

# Files written/read by the pipeline and the MUSCLE executable, all located in cwd.
full_5UTR_fasta = f"{cwd}full_5UTR.fasta"
muscle_executable = f"{cwd}muscle3.8.31_i86win32.exe"
uORF_kozak_file = f"{cwd}uORF_kozak_sequence_aligned.fasta"
aligned_fiveUTR_full = f"{cwd}aligned_full_5UTR.fasta"

# Organisms left out of the 5'-UTR FASTA that is sent to the aligner.
mammal_ignore_list_extended = [
    'Camelus_ferus',
    'Vombatus_ursinus',
    'Phascolarctos_cinereus',
    'Monodelphis_domestica',
    'Sarcophilus_harrisii',
]

In [47]:
def bestHitPerOrganism(hitTable):
    """Pick the highest-scoring transcript for every organism.

    Args:
        hitTable: ``{organism: {accession: info}}``. Each ``info`` dict holds
            ``'seq'``, ``'nam'``, ``'start'`` and ``'end'``; accessions that
            were matched to a BLAST row also hold ``'bit score'`` (anything
            ``float()`` accepts).

    Returns:
        ``{organism: {'accession', 'bit_score', 'sequence', 'nam', 'start',
        'end'}}`` for the accession with the highest bit score. On a tie the
        accession that comes first in the inner dict wins.

    Accessions without a ``'bit score'`` entry cannot be ranked and are
    skipped. An organism with no rankable accession is left out of the result
    instead of raising ``KeyError``.
    """
    singleHitDict = {}
    for organism, hits in hitTable.items():
        scored = [
            (float(info['bit score']), accession)
            for accession, info in hits.items()
            if 'bit score' in info
        ]
        if not scored:
            continue
        # max() keeps the first maximal element, so earlier accessions win ties.
        bestScore, final_accession = max(scored, key=lambda pair: pair[0])
        best = hits[final_accession]
        singleHitDict[organism] = {
            'accession': final_accession,
            'bit_score': bestScore,
            'sequence': best['seq'],
            'nam': best['nam'],
            'start': best['start'],
            'end': best['end'],
        }
    return singleHitDict

In [48]:
# Build a per-organism table of transcripts from a BLAST result.
#
# Inputs are a GenBank file of the BLAST hits and the tab-delimited text table of
# the same BLAST result. optional_cutoff lets the caller stop reading the text
# table at a given accession: if the hits were sorted (e.g. by query cover)
# before download, passing the 'subject acc.ver' of the first unwanted hit means
# only the rows above it are merged. Alternatively, one can simply download only
# the sequences that are above the desired query cover.
def processHitTable(genbank_file,text_table, optional_cutoff = ''):
    """Parse GenBank records and merge BLAST hit-table fields into them.

    Args:
        genbank_file: passed to ``SeqIO.parse(..., 'gb')``.
        text_table: path opened and read as a tab-delimited file whose first
            column header is ``'# tblastn'`` (presumably the BLAST text hit
            table with its ``#`` comment header -- TODO confirm).
        optional_cutoff: if not ``''``, reading of ``text_table`` stops at the
            first row whose ``'subject acc.ver'`` equals this value; that row
            and all later rows are not merged. GenBank records themselves are
            kept regardless.

    Returns:
        ``{organism: {accession: info}}`` where ``info`` has ``'nam'``,
        ``'seq'``, ``'start'``, ``'end'`` and, for accessions found in the
        text table, one entry per field named in the ``# Fields:`` line.

    NOTE(review): reads the module-level name
    ``custom_POLG201_Gallus_gallus_sequence``, which must be defined before
    this function is called with a Gallus_gallus record.
    """
    Sequence_dict = {}
    for file in SeqIO.parse(genbank_file, 'gb'):
        for feature in file.features:
                if feature.type == 'gene':
                    # NOTE(review): symbol is assigned here but never read afterwards.
                    if 'gene' in feature.qualifiers.keys():
                        symbol = feature.qualifiers['gene']
                    if 'locus_tag' in feature.qualifiers.keys():
                        symbol = feature.qualifiers['locus_tag']
                if feature.type == 'source':
                    # Organism name with spaces replaced by underscores is the outer key.
                    organism = feature.qualifiers['organism'][0].replace(" ", "_")
                    
                    #automatically should use POLG-201 transcript
                    if organism == 'Gallus_gallus':
                        # Replaces anything stored for Gallus_gallus so far with a
                        # single hand-curated entry.
                        Sequence_dict['Gallus_gallus'] = {}
                        Sequence_dict['Gallus_gallus']['POLG_201'] = {}
                        Sequence_dict['Gallus_gallus']['POLG_201']['nam'] = 'Ensembl_transcript_POLG-201'
                        Sequence_dict['Gallus_gallus']['POLG_201']['seq'] = (custom_POLG201_Gallus_gallus_sequence)
                        Sequence_dict['Gallus_gallus']['POLG_201']['start'] = 404
                        Sequence_dict['Gallus_gallus']['POLG_201']['end'] = 3983
                        # Very large score so this entry wins any selection by
                        # highest 'bit score' (as bestHitPerOrganism does).
                        Sequence_dict['Gallus_gallus']['POLG_201']['bit score'] = 1000000
                        
                if feature.type == 'CDS':
                    # Expand the location into its individual integer positions and
                    # keep the first and last one.
                    CDS = [int(a) for a in feature.location]
                    start = CDS[0]
                    end = CDS[-1]
                    accession = file.name
                    full_name = file.description
                    # NOTE(review): organism is whatever the most recent 'source'
                    # feature set; this assumes 'source' precedes 'CDS' in every record.
                    if organism not in Sequence_dict.keys():
                        Sequence_dict[organism] = dict()
                    Sequence_dict[organism][accession] = dict()
                    Sequence_dict[organism][accession]['nam'] = full_name
                    Sequence_dict[organism][accession]['seq'] = file.seq
                    Sequence_dict[organism][accession]['start'] = start
                    # Stored as last position + 1.
                    Sequence_dict[organism][accession]['end'] = end + 1
    
    # NOTE(review): final_hit_dict and hit_number below are never read.
    final_hit_dict = {}
    with open(text_table) as f:
        # DictReader takes the first line as the header, so the first column is
        # keyed by '# tblastn' and all further columns land in row[None].
        reader = csv.DictReader(f, delimiter = "\t")
        # The fifth row returned by the reader holds the '# Fields: ' line; split
        # it into the list of column names.
        for initial_row in islice(reader, 4, 5):
            header_list = str((initial_row['# tblastn'])).split('# Fields: ')[1].split(', ')
        hit_number = 0
        # Skip one more row, then treat every remaining row as a hit.
        for row in islice(reader, 1, None):   
            hit_dict = {}
            hit_number +=1
            query_id = []
            query_id.append(str(row['# tblastn']))
            result_list = row[None]
            combined_results = query_id + result_list
            # Pair the column names with the row's values by position.
            i = 0
            for item in header_list:
                hit_dict[item] = combined_results[i]
                i+=1  
            if optional_cutoff != '':
                if hit_dict['subject acc.ver'] == optional_cutoff:
                    break
            # Accession without its version suffix, compared with the GenBank
            # record names used as inner keys above.
            key = (hit_dict['subject acc.ver'].split('.'))[0]
            
            
            organism = ''
            accession = ''
            # Find the record with this accession and add the hit fields to it;
            # a later row for the same accession overwrites the earlier values.
            for item in Sequence_dict:
                for item2 in Sequence_dict[item]:
                    if item2 == key:
                        organism = item
                        accession = item2
                        Sequence_dict[organism][accession].update(hit_dict)
            
            
            
            
            #final_hit_dict[key] = hit_dict     
    return Sequence_dict

In [49]:
def writeUTRfile(five_150_UTR_dict, filename, ignore_list):
    """Write 5'-UTR sequences to a FASTA file.

    Args:
        five_150_UTR_dict: ``{name: {'fiveUTR': sequence, ...}}``; the sequence
            is converted with ``str()`` before writing.
        filename: path of the file to create or overwrite.
        ignore_list: names to leave out. Each skipped name is printed so the
            caller can see what was dropped.

    One two-line record (``>name`` then the sequence) is written per entry, in
    dictionary order. The file is closed even if writing fails part-way.
    """
    with open(filename, 'w') as handle:
        for name, record in five_150_UTR_dict.items():
            if name in ignore_list:
                print(name)
                continue
            handle.write('>' + name + '\n' + str(record['fiveUTR']) + '\n')

In [50]:
def getuORFRNA(UTR_dict,ignore_list,singleTranscriptTable):
    """Return the transcribed, gap-free sequence downstream of the uORF reference.

    For every entry of ``UTR_dict`` not named in ``ignore_list``: take the
    aligned sequence from index ``uORF_AUG_reference`` (module-level value)
    onwards, drop the ``'-'`` gap characters, discard the first 10 remaining
    residues and transcribe the rest with ``Seq(...).transcribe()``.

    ``singleTranscriptTable`` is accepted but not used.
    """
    uORF_RNA_dict = {}
    for name, aligned in UTR_dict.items():
        if name in ignore_list:
            continue
        downstream = aligned[uORF_AUG_reference:]
        ungapped = ''.join(nt for nt in downstream if nt != '-')
        uORF_RNA_dict[name] = Seq(ungapped[10:]).transcribe()
    return uORF_RNA_dict

In [51]:
def extractfiveUTR(singleTranscriptTable, UTR_size = 100):
    """Slice out the 5'-UTR of every transcript whose CDS starts late enough.

    Args:
        singleTranscriptTable: ``{name: {'accession', 'sequence', 'start', ...}}``.
        UTR_size: entries are kept only if ``'start'`` is strictly greater
            than this value.

    Returns:
        ``{name: {'accession': ..., 'fiveUTR': sequence[0:start]}}`` for the
        entries that pass the filter, in input order.
    """
    return {
        name: {
            'accession': hit['accession'],
            'fiveUTR': hit['sequence'][0:hit['start']],
        }
        for name, hit in singleTranscriptTable.items()
        if hit['start'] > UTR_size
    }

In [52]:
def writeKozakMotif(kozak_motif_dict, kozak_file):
    """Write motifs to ``kozak_file`` as two-line FASTA records.

    Args:
        kozak_motif_dict: ``{name: motif string}``.
        kozak_file: path of the file to create or overwrite.

    Records are written in dictionary order as ``>name`` followed by the
    motif. The file is closed even if writing fails part-way.
    """
    with open(kozak_file, 'w') as handle:
        for name, motif in kozak_motif_dict.items():
            handle.write('>' + name + '\n' + motif + '\n')

In [53]:
def readAlignment(in_file):
    """Load a FASTA file into ``{record id: sequence as str}``.

    Records are read with ``SeqIO.parse(in_file, 'fasta')``; if an id occurs
    more than once, the last record with that id is kept.
    """
    return {
        record.id: str(record.seq)
        for record in SeqIO.parse(in_file, 'fasta')
    }

In [54]:
def runMuscle(in_file, out_file, muscle_executable):
    """Run the MUSCLE executable on ``in_file`` and write the result to ``out_file``.

    The command line is built with Biopython's ``MuscleCommandline`` wrapper and
    executed immediately; whatever the call returns is discarded.
    """
    options = {'input': in_file, 'out': out_file}
    MuscleCommandline(muscle_executable, **options)()

In [92]:
def kozakMotif(fiveUTRalignment_dict,reference_location):
    """Extract the motif around a start codon from each aligned sequence.

    Args:
        fiveUTRalignment_dict: ``{name: aligned sequence string}`` with ``'-'``
            as the gap character.
        reference_location: 1-based alignment column of the first base of the
            start codon.

    Returns:
        ``{name: motif}`` where the motif is, in order: up to six non-gap
        residues preceding the codon, the three alignment columns at the
        codon (taken verbatim, so gaps inside them are kept), and the first
        non-gap residue after the codon (empty if there is none).

    Notes:
        * ``Monodelphis_domestica`` is read one column further right. The
          shift applies to that sequence only; it does not carry over to the
          sequences processed after it.
        * Gaps after the codon are skipped without looping forever.
        * The upstream walk stops at the start of the sequence rather than
          wrapping around to its end, so the motif may be shorter than ten
          residues.
    """
    base_index = reference_location - 1  # 1-based column -> 0-based index
    kozak_dict = {}
    for item, fiveUTR in fiveUTRalignment_dict.items():
        start = base_index
        if item == 'Monodelphis_domestica':
            start += 1

        codon = fiveUTR[start:start + 3]

        # First non-gap residue following the codon.
        downstream = ''
        for nt in fiveUTR[start + 3:]:
            if nt != '-':
                downstream = nt
                break

        # Walk left from the codon collecting six non-gap residues.
        upstream = []
        position = start - 1
        while position >= 0 and len(upstream) < 6:
            nt = fiveUTR[position]
            if nt != '-':
                upstream.append(nt)
            position -= 1

        kozak_dict[item] = ''.join(reversed(upstream)) + codon + downstream
    return kozak_dict
    

In [83]:
# Parse the GenBank records and merge in the BLAST hit-table fields, then keep
# the single transcript with the highest 'bit score' for each organism.
hitTable = processHitTable(BLAST_genbank,BLAST_textoutput)
singleTranscriptTable = bestHitPerOrganism(hitTable)

In [84]:
# Keep only transcripts whose CDS start is greater than 150 and slice out the
# sequence before the CDS start as the 5'-UTR.
five_150_UTR_dict = extractfiveUTR(singleTranscriptTable, 150)

In [85]:
# Write the 5'-UTRs to FASTA; organisms on the ignore list are skipped and their
# names are printed below.
writeUTRfile(five_150_UTR_dict, full_5UTR_fasta, mammal_ignore_list_extended)

Phascolarctos_cinereus
Vombatus_ursinus
Camelus_ferus


In [86]:
# Align the 5'-UTR FASTA with the MUSCLE executable; the alignment is written to
# aligned_fiveUTR_full.
runMuscle(full_5UTR_fasta, aligned_fiveUTR_full, muscle_executable)

In [87]:
# Load the alignment as {record id: aligned sequence string}.
full5UTR_alignment_dict = readAlignment(aligned_fiveUTR_full)

In [88]:
# Re-write the alignment with Homo_sapiens as the first record, followed by all
# other sequences in their existing order. The with-block closes the file even
# if a write fails.
with open('reordered_5UTR_Full_alignment.fasta','w') as file:
    if 'Homo_sapiens' in full5UTR_alignment_dict:
        file.write('>'+'Homo_sapiens'+'\n'+full5UTR_alignment_dict['Homo_sapiens']+'\n')
    for item in full5UTR_alignment_dict:
        if item != 'Homo_sapiens':
            file.write('>'+item+'\n'+full5UTR_alignment_dict[item]+'\n')

In this alignment, the uORF AUG begins at position 1372.

In [100]:
# Alignment position of the uORF AUG. This number is specific to the current
# alignment: it has to be found by manual inspection and updated whenever the
# alignment changes. kozakMotif subtracts 1 from it before indexing, whereas
# getuORFRNA slices with it as-is.
uORF_AUG_reference = 1372

In [101]:
# For every aligned sequence, collect the residues around the uORF AUG reference
# position (six non-gap residues before it, the three columns at it, and the
# residue that follows).
uORF_AUG_kozak_dict = kozakMotif(full5UTR_alignment_dict,uORF_AUG_reference)

In [102]:
# Save the motifs as two-line FASTA records (>name, then the motif).
writeKozakMotif(uORF_AUG_kozak_dict, uORF_kozak_file)