# Get 3' UTRs

## Check that all the files are present

In [None]:
import os
import pandas as pd

# Load the per-organism table. Each row is read below for its 'Organism',
# 'GFF File' and 'Assembly File' columns.
info = pd.read_excel('GenomeInfo.xlsx')

# Report every organism whose GFF or assembly file is missing from
# GenomesAndGFFs/. Organisms with both files present produce no output.
for number, organism in info.iterrows():
    # Euplotes rows are skipped by this check.
    if organism['Organism'].startswith('Euplotes'):
        continue

    # The files are looked up under their uncompressed names, so a trailing
    # '.gz' is dropped. Each name is tested on its own: the GFF being listed
    # as '.gz' says nothing about how the assembly is listed.
    gffName = organism['GFF File']
    if gffName.endswith('.gz'):
        gffName = gffName[0:-3]
    fasName = organism['Assembly File']
    if fasName.endswith('.gz'):
        fasName = fasName[0:-3]

    bothPresent = (os.path.exists('GenomesAndGFFs/' + gffName) and
                   os.path.exists('GenomesAndGFFs/' + fasName))
    if not bothPresent:
        print(organism['Organism'],': Not OK')


## Process each organism and save the 3' UTR

In [None]:
from Bio import SeqIO
import re
import functionsUTR as utr
# Passed to utr.get_UTRs and appended to the '.utr<N>' output file names.
# Presumably the number of bases kept for each 3' UTR -- confirm against
# functionsUTR.
utrSize = 20

In [None]:
# Run the UTR extraction for every organism in the table except Euplotes,
# collecting the `details` value returned for each one in allInfo.
allInfo = []
genomeDir = 'GenomesAndGFFs/'
for _, row in info.iterrows():
    if row['Organism'].startswith('Euplotes'):
        continue
    org = row['Organism']

    # Both inputs are read under their uncompressed names, so a trailing
    # '.gz' is removed from each path independently.
    gffFile = genomeDir + row['GFF File']
    gffFile = gffFile[:-3] if gffFile.endswith('.gz') else gffFile
    fasFile = genomeDir + row['Assembly File']
    fasFile = fasFile[:-3] if fasFile.endswith('.gz') else fasFile

    marker, gffType = row['GFF Marker'], row['Feature']
    utrFile = 'utrs/' + row['Base Name'] + '.utr' + str(utrSize)
    stops = row['GeneticCode']

    # Parse the inputs, then hand everything to the helper module.
    fasta_dic = utr.parse_fasta(fasFile)
    gff = utr.process_gff(gffFile, marker, gffType)
    utrs, details = utr.get_UTRs(org, gff, fasta_dic, stops, utrSize, utrFile)

    print(details[:8])
    allInfo.append(details)

## Euplotes

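For each CDS, take its last 20 bases and search for them on both strands of the corresponding chromosome. Count how many times the 20-mer appears: if it appears exactly once, keep the match and take the bases that follow it as the 3' UTR. If it is not found, the search is retried with a shorter 9-base suffix; if it is repeated, the search is retried with a longer 30-base suffix. A CDS whose suffix still cannot be placed uniquely is discarded.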

In [None]:
# Load the Euplotes CDS sequences and the chromosomes they are located on.
cdss = utr.parse_fasta("GenomesAndGFFs/Euplotes_Octocarinatus_CDS.fasta")
chrom = utr.parse_fasta("GenomesAndGFFs/Euplotes_Octocarinatus.fasta")

# Replace each chromosome by (forward strand, reverse complement) so both
# strands can be searched. Only existing keys are reassigned, which is safe
# while iterating over the dict.
for name, seq in chrom.items():
    chrom[name] = (seq, utr.reverse_complement(seq))

# Not needed by the literal matching below; left in place so the names this
# cell defines are unchanged.
import re


def _strand_hits(pattern, forward, reverse):
    """Return (forward, reverse) counts of non-overlapping literal matches.

    The sequence is matched as plain text rather than as a regular
    expression, so a character that is special in regex syntax cannot
    change the result.
    """
    return forward.count(pattern), reverse.count(pattern)


def _utr_entry(name, strand, pattern, utr_len):
    """Build one ['Gene', 'Stop', 'UTR'] row from the first hit on `strand`.

    'Stop' is the 3 bases that end the match and 'UTR' is the `utr_len`
    bases immediately after it. The caller guarantees the pattern occurs.
    """
    end = strand.find(pattern) + len(pattern)
    return [name, strand[end-3:end], strand[end:end+utr_len]]


# Outcome counters, printed at the end of the cell.
countAll = 0        # CDSs examined
countNotFound = 0   # no hit, even with the shortened suffix
countRev = 0        # unique hit on the reverse strand, first attempt
countFor = 0        # unique hit on the forward strand, first attempt
countRev2 = 0       # unique hit on the reverse strand, second attempt
countFor2 = 0       # unique hit on the forward strand, second attempt
countFoundMul = 0   # extended suffix still occurs more than once on a strand
countFoundBoth = 0  # remaining unresolved cases after the extended suffix

utrs = []
matchLen = 20
utrLen = 20

for name, seq in cdss.items():
    # The chromosome name is the part of the CDS name before the first '.'.
    chromo = name.split('.')[0]
    forward, reverse = chrom[chromo]
    countAll += 1

    # First attempt: the last matchLen bases of the CDS.
    pattern = seq[-matchLen:]
    hitsF, hitsR = _strand_hits(pattern, forward, reverse)

    if (hitsF, hitsR) == (1, 0):
        countFor += 1
        utrs.append(_utr_entry(name, forward, pattern, utrLen))
        continue
    if (hitsF, hitsR) == (0, 1):
        countRev += 1
        utrs.append(_utr_entry(name, reverse, pattern, utrLen))
        continue

    # Second attempt: a shorter suffix when nothing was found, a longer one
    # when the first suffix was ambiguous.
    notFound = hitsF == 0 and hitsR == 0
    if notFound:
        pattern = seq[-matchLen+11:]
    else:
        pattern = seq[-matchLen-10:]
    hitsF, hitsR = _strand_hits(pattern, forward, reverse)

    if (hitsF, hitsR) == (1, 0):
        countFor2 += 1
        utrs.append(_utr_entry(name, forward, pattern, utrLen))
    elif (hitsF, hitsR) == (0, 1):
        countRev2 += 1
        utrs.append(_utr_entry(name, reverse, pattern, utrLen))
    elif notFound:
        countNotFound += 1
    elif hitsF > 1 or hitsR > 1:
        countFoundMul +=1
    else:
        countFoundBoth +=1

utrFrame = pd.DataFrame(utrs, columns = ['Gene','Stop','UTR'])
utrFrame.to_csv('utrs/Euplotes_octocarinatus.utr'+str(utrLen),sep='\t')
print(countAll, countNotFound, countRev, countFor, countFor2, countRev2, countFoundMul, countFoundBoth)

## Tetrahymena thermophila

We have to change the gene names in the Tetrahymena thermophila UTR file to perform further analysis.

In [None]:
# Select the Tetrahymena thermophila row once and build both paths from it.
tetraRow = info[info['Organism'] == 'Tetrahymena thermophila'].iloc[0]

# The GFF is opened as plain text further down, so use the uncompressed
# name, the same way the per-organism processing loop does.
gffFile = 'GenomesAndGFFs/' + tetraRow['GFF File']
if gffFile.endswith('.gz'):
    gffFile = gffFile[0:-3]
utrFile = 'utrs/' + tetraRow['Base Name'] + '.utr' + str(utrSize)
print(utrFile)
print(gffFile)

In [None]:
# Load the tab-separated UTR table and display its first rows.
utrs = pd.read_table(utrFile, sep='\t')
utrs.head(5)

In [None]:
import re

# Lookup tables filled from the attribute column of the GFF file.
mrna = {}   # mRNA ID -> its Parent value
cds = {}    # CDS ID  -> its Parent value
genes = {}  # gene ID -> its Name value

# Example attributes:
#   mRNA/CDS: ID=mRNA001.1;Parent=gene001.1;Name=1.m000121
#   gene:     ID=gene001.1;Name=TTHERM_001431529;Note="hypothetical protein"
parentPattern = re.compile(r"ID=([^;]*);.*Parent=([^;]*)")
namePattern = re.compile(r"ID=([^;]*);.*Name=([^;]*)")

# Feature type (third column) -> (attribute pattern, table to fill).
featureTables = {
    'mRNA': (parentPattern, mrna),
    'CDS': (parentPattern, cds),
    'gene': (namePattern, genes),
}

# The context manager closes the file even if parsing fails part-way.
with open(gffFile) as filein:
    for line in filein:
        if line.startswith('#'):
            continue
        # NOTE: splitting on any whitespace means line[8] holds the attribute
        # column only up to its first space; the ID/Parent/Name fields used
        # here come before any quoted Note text in the examples above.
        line = line.strip().split()
        # Skip blank or truncated lines and feature types that are not needed.
        if len(line) < 9 or line[2] not in featureTables:
            continue

        attrPattern, table = featureTables[line[2]]
        temp = attrPattern.search(line[8])
        if not temp:
            print('could not find')
            continue

        name, value = temp.group(1,2)
        if name not in table:
            table[name] = value
        elif table[name] != value:
            # Same ID seen again with a different Parent/Name.
            print('Problem')




                
                
# Lookup tables used here:
#   genes: gene ID -> gene Name (e.g. TTHERM_...)
#   mrna:  mRNA ID -> Parent gene ID
# The 'Gene' column of the UTR file is looked up in mrna, and the resulting
# gene ID in genes, so each row ends up labelled with the gene Name.
utrs = pd.read_csv(utrFile,sep='\t')

newutrs = []
# The loop variable is deliberately not called `info`: that name holds the
# organism table loaded from GenomeInfo.xlsx and must stay usable if earlier
# cells are run again.
for number, row in utrs.iterrows():
    gene = row['Gene']
    stop = row['Stop']
    UTR = row['UTR']

    name = genes[mrna[gene]]
    newutrs.append([name, stop, UTR])
newutr = pd.DataFrame(newutrs, columns = ['Gene','Stop','UTR'])
# Overwrites the original UTR file with the renamed table.
newutr.to_csv(utrFile, sep="\t")
newutr