In [19]:
import re
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import itertools

os.mkdir("uorfs")

# Create function to find ATGs

In [63]:

### function

def getuORFS(bedfile, fastafile, outfile):
    ### convert bed file to dataframe
    bed = pd.read_csv(bedfile, sep='\t', header=None)

    ### take input bedfile and make a new fasta with specified sequences
    cmd = "bedtools getfasta -s -fi "+fastafile+" -bed "+bedfile+" -fo uorfs/" + outfile + "_codoncount.fasta"
    os.system(cmd)
    newfasta = open("uorfs/" + outfile + "_codoncount.fasta")

    ### to return ATG starts:
    codons = ["ATG"]
    
    ### create list with the number of uORFs for each sequence line in the fasta file 
    uorf_number = []
    length = []
    positions = []
    ### create list that will contain the uORF number matched to the systemmatic gene name
    todf = []

    for idx,line in enumerate(newfasta):
        for codon in codons:
            if line[0]!=">":
                position = [m.start() for m in re.finditer(codon,line)]
                positions.append(position)
                uorf_number.append(len(position))
                length.append(len(line))
                todf.append(bed.iloc[int((idx/2)), 3] + '\t' + str(uorf_number[int(idx/2)]) + '\t' + str(positions[int(idx/2)]))


    series = pd.Series(todf)
    df = pd.DataFrame(series)
    df['parent'], df['uorf_number'], df['positions'] = df[0].str.split('\t', 2).str
    df['positions'] = df['positions'].str.strip("[").str.strip("]")
    series2 = pd.Series(length)
    df['length'] = series2
    df.drop(0, axis=1, inplace=True)
    df['rate'] = df['uorf_number'].astype('float') / (df['length'].astype('float')/3)
    print('ATG per codon: ' + str(df['rate'].mean()))
    df.to_csv('uorfs/' + outfile + '.txt', sep='\t', index=None)
    os.remove("uorfs/" + outfile + "_codoncount.fasta")


# Apply function to specified regions

In [64]:

### get the uORF count between orf and luti TSSs, the distance, and the rate at each locus

bedfile = 'bedfiles/181130TSS_between_prox_and_luti.bed'
fastafile = 'genome_files/SK1_PacBio_spikes.genome.fa'
outfile = 'ATGs_between_prox_and_luti'

getuORFS(bedfile, fastafile, outfile)



ATG per codon: 0.04879722205421276


In [65]:

### get the uORFs in the 500 bp upstream of all genes without LUTIs

bedfile = 'bedfiles/nonlutiup500.bed'
fastafile = 'genome_files/SK1_PacBio_spikes.genome.fa'
outfile = 'uorfs_nonluti_up500'

getuORFS(bedfile, fastafile, outfile)



ATG per codon: 0.05057374455284471


# To find the positions of the uORFs

In [36]:

# bring in the ATG positions relative to the LUTI TSS

pos = pd.read_csv('uorfs/ATGs_between_prox_and_luti.txt', sep='\t')

# convert the string of locations to a list
pos['positions'] = pos['positions'].astype(str).map(lambda x: x.replace('[', '').replace(']', '').replace(',', ''))

pos['positions'] = pos['positions'].str.split(' ')

# give each entry in each list its own row
lists = pos.apply(lambda x: pd.Series(x['positions']),axis=1).stack().reset_index(level=1, drop=True)

lists.name = 'position'

pos = pos.drop('positions', axis=1).join(lists)



# bring in the LUTI TSS chromosome coordinates to get the chromosome positions of each ATG

tss = pd.read_csv('bedfiles/181130TSS_between_prox_and_luti.bed', sep='\t', header=None)

pos = pos.merge(tss, how='left', left_on='parent', right_on=3)

pos = pos[~((pos['position'] == 'nan') | (pos['position'] == '0'))]

pos['count'] = pos.groupby('parent').cumcount()+1

pos['parent'] = pos['parent'] + '_' + pos['count'].astype(str)



In [37]:

# expand the bases to include the region around the ATG

def expand(numcodon, df):
    plus_mask = (pos[5] == '+')
    minus_mask = (pos[5] == '-')
    
    df['start'] = pos[1]
    df['stop'] = pos[2]
    df.loc[plus_mask, 'start'] += df.loc[plus_mask, 'position'].astype(int)
    df.loc[plus_mask, 'stop'] = df.loc[plus_mask, 'start']
    df.loc[plus_mask, 'stop'] += (3 * numcodon)

    df.loc[minus_mask, 'stop'] -= df.loc[minus_mask, 'position'].astype(int)
    df.loc[minus_mask, 'start'] = df.loc[minus_mask, 'stop']
    df.loc[minus_mask, 'start'] -= (3 * numcodon) 

    bed = pd.DataFrame()
    bed[0] = pos[0]
    bed[1] = pos[1] - (3 * numcodon)
    bed[2] = pos[2] + (3 * numcodon)
    bed[3] = pos['parent']
    bed[4] = pos[4]
    bed[5] = pos[5]
    bed[6] = pos['start']
    bed[7] = pos['stop']

    bed.to_csv('uorfs/atgpositions' + '_' + str(numcodon) + 'codons.bed', sep='\t', header=None, index=None)
    

# run for the 6 codons including and downstream of the ATG

expand(6, pos)



# Analysis

In [60]:

# ran fp-count from the Ingolia lab's Riboseq code (https://github.com/ingolia-lab/RiboSeq) 
# using the 6 codon version of the bed file produced above

fpcounts = pd.read_csv('uorfs/3h_fp_vs_genome_unique_qexpr_6codons.txt', 
                       sep='\t', header=None)

fpcounts['parent'] = fpcounts[0].str.split('_', expand=True)[0]
fpcounts['uORF number'] = fpcounts[0].str.split('_', expand=True)[1]
fpcounts['uORF number'] = fpcounts['uORF number'].astype(int)
fpcounts['count'] = fpcounts[2]
fpcounts['enriched'] = fpcounts['count'] > 3

fpcounts.set_index('parent', inplace=True)


fpg = (fpcounts.groupby(level='parent', group_keys=False)
       .filter(lambda x: len(x) > 3))


fpg = fpg.groupby(level='parent', group_keys=False)



In [61]:

# to get the percent translated

first = fpg.nth(0)
second = fpg.nth(1)
slast = fpg.nth(-2)
last = fpg.nth(-1)

dfs = [first, second, slast, last]
uorfpos = ['First', 'Second', 'Penultimate', 'Last']

def pcttrue(df):
    pcttrue = df['enriched'].value_counts(normalize=True) * 100
    return pcttrue
    
lst = []

for idx, i in enumerate(dfs):
    lst.append((uorfpos[idx],pcttrue(i)[1]))

# add the first two in combination and the last two in combination
fs = first.merge(second, how='inner', on='parent', suffixes=['_first', '_second'])
fs = fs[(fs['enriched_first'] == True) & (fs['enriched_second'] == True)] 
lst.insert(2, ('First Two', len(fs)/len(first)*100))       
                
pl = last.merge(slast, how='inner', on='parent', suffixes=['_last', '_slast'])
pl = pl[(pl['enriched_last'] == True) & (pl['enriched_slast'] == True)] 
lst.insert(5, ('Last Two', len(pl)/len(first)*100))       


df = pd.DataFrame(lst, columns=['uorfpos', 'pct'])

df.to_csv('uorfs/percenttranslated.txt', sep='\t', index=None)


In [62]:
df

Unnamed: 0,uorfpos,pct
0,First,78.125
1,Second,78.125
2,First Two,68.75
3,Penultimate,9.375
4,Last,20.3125
5,Last Two,4.6875
