# Create CDS FASTAS for each gene to allow alignment

Using the mt-specific VCFs, which are based on mapping to references that have blanked regions, we can call FASTAs to be used for translation alignment.


### This code is adapted from 
    /home/nessrobe/scripts/bin/Roanna_MT_gene_searcher.reMapped.singleMTs.py

In [1]:
import sys, re, pickle, subprocess
from annotation import GFF_line
from collections import OrderedDict

## Explore the genes

## Parse common name translation

In [2]:
from annotation import Transcript
from Bio import SeqIO

# Hash the transcripts found in the mating-type region GFF, keyed through
# index_label='ness_ID'.
transcripts = Transcript.hash_gff('../find_shared_genes/mtRegions.GFF', index_label='ness_ID', quiet=True)

# Load every record of the reference FASTA into a dictionary. The handle is
# opened in a `with` block so it is closed as soon as the records are read
# (SeqIO.to_dict consumes the parser eagerly).
with open('/scratch/research/references/chlamydomonas/5.3_chlamy_w_organelles_mt_minus/chlamy.5.3.w_organelles_mtMinus.fasta') as ref_handle:
    ref_dict = SeqIO.to_dict(SeqIO.parse(ref_handle, 'fasta'))

In [53]:
# Build the lookup of shared mating-type genes:
#   target_genes = {common_name: {ness_ID: {'transcript': Transcript,
#                                           'domain': 'R' | 'T' | 'C',
#                                           'CDS': [[start, end], ...],
#                                           'strand': ..., 'chromosome': ...}}}
# R and T domain genes get two entries (chromosome_6 ID and mtMinus ID);
# C domain genes get a single chromosome_6 entry.
target_genes = {}

# The first line of the table is skipped (presumably a header row).
with open('../find_shared_genes/NameTranslation.txt') as name_table:
    rows = name_table.readlines()[1:]

for l in rows:
    fields = l.strip().split('\t')
    if len(fields) == 4:
        domain, common_name, ch6NessID, mtMinusNessID = fields
    elif len(fields) == 3:
        # strip() removes a trailing empty column, so a missing mtMinus ID
        # shows up as a three-field row.
        domain, common_name, ch6NessID = fields
        mtMinusNessID = None
    else:
        # Report the malformed row and skip it; otherwise the values of the
        # previous row would silently be processed a second time.
        print(l.strip(), fields)
        continue

    # The table spells a missing ID as the literal string 'None'.
    if ch6NessID == 'None':
        ch6NessID = None
    if mtMinusNessID == 'None':
        mtMinusNessID = None

    # Tuple membership rather than `in 'R T'`, which is a substring test.
    if domain in ('R', 'T'):
        if mtMinusNessID is None or ch6NessID is None:
            continue
        target_genes[common_name] = {
            ch6NessID: {'transcript': transcripts[ch6NessID], 'domain': domain},
            mtMinusNessID: {'transcript': transcripts[mtMinusNessID], 'domain': domain},
        }
    elif domain == 'C':
        if ch6NessID is None:
            continue
        target_genes[common_name] = {
            ch6NessID: {'transcript': transcripts[ch6NessID], 'domain': domain},
        }
    else:
        # Unknown domain label: nothing was stored for this row.
        continue

    # Add CDS coordinates, strand and chromosome for each allele entry.
    for ness_ID, entry in target_genes[common_name].items():
        t = entry['transcript']
        entry['CDS'] = [[exon.start, exon.end] for exon in t.sorted_feats('CDS')]
        entry['strand'] = t.strand
        entry['chromosome'] = t.seqid
                                         


C	161193 ['C', '161193']
C	522917 ['C', '522917']


In [54]:
# Spot-check the parsed record for a single gene (PDK1).
target_genes['PDK1']


{'26893429': {'CDS': [[431053, 431189],
   [431523, 431868],
   [432170, 432295],
   [432503, 432572],
   [432794, 432918],
   [433206, 433444],
   [433811, 433897],
   [434078, 434186],
   [434438, 434551],
   [434852, 435022]],
  'chromosome': 'chromosome_6',
  'domain': 'R',
  'strand': '-',
  'transcript': <annotation.Transcript.Transcript at 0x7f39358e55c0>},
 'ADF43177.1': {'CDS': [[175950, 176086],
   [176415, 176772],
   [177074, 177199],
   [177407, 177476],
   [177699, 177823],
   [178104, 178342],
   [178709, 178795],
   [178974, 179082],
   [179334, 179447],
   [179766, 179936]],
  'chromosome': 'mtMinus',
  'domain': 'R',
  'strand': '-',
  'transcript': <annotation.Transcript.Transcript at 0x7f3935ec1198>}}

## Target Genotype Lists     
### Plus
- CC2936
- CC2937
- CC3060
- CC3064
- CC3065
- CC3068
- CC3071
- CC3076
- CC3086

### Minus:
- CC2935
- CC2938
- CC3059
- CC3061
- CC3062
- CC3063
- CC3073
- CC3075
- CC3079
- CC3084


In [6]:
# Sample names of the Quebec isolates, grouped by mating type.
que_mtPlus = (
    "CC2936 CC2937 CC3060 "
    "CC3064 CC3065 CC3068 "
    "CC3071 CC3076 CC3086"
).split()
que_mtMinus = (
    "CC2935 CC2938 CC3059 CC3061 CC3062 "
    "CC3063 CC3073 CC3075 CC3079 CC3084"
).split()

# Reference genome handed to vcf2fasta.py through -r.
reference_fasta = "/scratch/research/references/chlamydomonas/5.3_chlamy_w_organelles_mt_minus/chlamy.5.3.w_organelles_mtMinus.fasta"

# VCF with all Quebec samples.
quebec_vcf = "../../../data/references/all_quebec.HC.vcf.gz"

# Mating-type-specific VCFs.
quebec_mtPLUS_vcf = "../../../data/references/all_quebec.mtPlus.HC.vcf.gz"
quebec_mtMinus_vcf = "../../../data/references/all_quebec.mtMinus.HC.vcf.gz"

For T and R domain genes we expect two sets of FASTAs.
For C domain genes we expect one.

We need the coords of the CDSs and the strand



In [75]:
# Call concatenated CDS FASTAs with vcf2fasta.py for every allele entry in
# target_genes. Output files are named
# <common_name>.<ness_id>.quebec_mtPlus.fasta / ...quebec_mtMinus.fasta and are
# written to the current working directory.
for common_name in target_genes:
    for ness_id in target_genes[common_name]:
        locus = target_genes[common_name][ness_id]
        cds_string = " ".join(["%s:%i-%i" % (locus['chromosome'], i[0], i[1]) for i in locus['CDS']])

        # Each job is (VCF, sample list, output label).
        if locus['chromosome'] == 'mtMinus':  # i.e. an mtMinus T or R gene
            print("mt- T or R gene")
            jobs = [(quebec_mtMinus_vcf, que_mtMinus, "quebec_mtMinus")]
        elif locus['domain'] in ("T", "R") and locus['chromosome'] == 'chromosome_6':  # mt+ genes
            print("mt+ T or R gene")
            jobs = [(quebec_mtPLUS_vcf, que_mtPlus, "quebec_mtPlus")]
        elif locus['domain'] == "C" and locus['chromosome'] == 'chromosome_6':
            # C domain genes share coordinates between mt+ and mt-, so both
            # sample sets are called from the overall VCF.
            print("mt+ and mt- C domain gene")
            jobs = [(quebec_vcf, que_mtPlus, "quebec_mtPlus"),
                    (quebec_vcf, que_mtMinus, "quebec_mtMinus")]
        else:
            # No rule matches this locus: nothing to run.
            jobs = []

        # Minus-strand genes are reverse complemented.
        if locus['strand'] == '-':
            orientation = "--concatenate --reverse_complement"
        else:
            orientation = "--concatenate"

        cmds = ["vcf2fasta.py -r %s %s -v %s -s %s -i %s >%s.%s.%s.fasta"
                % (reference_fasta, orientation, vcf, " ".join(samples), cds_string, common_name, ness_id, label)
                for vcf, samples, label in jobs]

        # Wait for every command and report its stderr. The check lives inside
        # the loop so that no command goes unchecked and no child from a
        # previous locus is inspected again.
        for cmd in cmds:
            child = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
            stderr_text = child.communicate()[-1]
            if len(stderr_text) > 1:
                print(cmd, "\n", stderr_text, "\n\n")

mt+ T or R gene
mt- T or R gene
mt+ and mt- C domain gene
mt+ and mt- C domain gene
mt+ and mt- C domain gene
mt+ T or R gene
mt- T or R gene
mt+ and mt- C domain gene
mt+ T or R gene
mt- T or R gene
mt+ and mt- C domain gene
mt+ T or R gene
mt- T or R gene
mt+ and mt- C domain gene
mt+ and mt- C domain gene
mt+ T or R gene
mt- T or R gene
mt- T or R gene
mt+ T or R gene
mt+ T or R gene
mt- T or R gene
mt+ T or R gene
mt- T or R gene
mt+ and mt- C domain gene
mt- T or R gene
mt+ T or R gene
mt- T or R gene
mt+ T or R gene
mt+ T or R gene
mt- T or R gene
mt- T or R gene
mt+ T or R gene
mt+ and mt- C domain gene
mt- T or R gene
mt+ T or R gene
mt- T or R gene
mt+ T or R gene
mt+ T or R gene
mt- T or R gene
mt+ T or R gene
mt- T or R gene
mt+ T or R gene
mt- T or R gene
mt+ and mt- C domain gene
mt+ T or R gene
mt- T or R gene
mt+ T or R gene
mt- T or R gene
mt+ T or R gene
mt- T or R gene
mt+ T or R gene
mt- T or R gene
mt- T or R gene
mt+ T or R gene
mt- T or R gene
mt+ T or R gene
mt+ 

In [None]:
# NOTE(review): this cell was not executed (In [None]) and relies on
# `found_ness_IDs`, `name_translator` and `genes`, none of which are defined in
# the cells visible here. It looks like an earlier version of the FASTA-calling
# loop above; confirm before running.
j=0
for n in found_ness_IDs:
    # Space-separated "seqid:start-end" intervals for the -i argument.
    cds_string = " ".join(["%s:%i-%i" %(found_ness_IDs[n]['seqid'],i[0], i[1]) for i in found_ness_IDs[n]['CDSs']])
    if n in name_translator:
        common_name =  name_translator[n]
    else:common_name = "unknown_common_name"
    # common samples
    if found_ness_IDs[n]['seqid'] == 'mtMinus':
        """eg.
        vcf2fasta.py -r %s -v quebec_wt.vcf.gz -s CC3060 CC3064 CC3065 CC3068 CC3069 CC3071 CC3072 CC3076 CC3078 --concatenate \
         -i chromosome_6:481283-481427 chromosome_6:481485-481762 chromosome_6:481831-482049 chromosome_6:482108-482200 chromosome_6:482261-482369 chromosome_6:482428-482511 chromosome_6:482573-482779 chromosome_6:482838-482982 chromosome_6:483042-483169 chromosome_6:483695-484080 --reverse_complement
        """
        # Minus-strand genes are reverse complemented.
        if found_ness_IDs[n]['strand'] == '-':
            cmd="vcf2fasta.py -r %s --concatenate --reverse_complement -v %s -s %s -i %s >%s.%s.quebec_mtMinus.fasta;wait\n\n" %(reference_fasta, quebec_mtMinus_vcf, " ".join(que_mtMinus), cds_string, common_name, n)
        else:
            cmd="vcf2fasta.py -r %s --concatenate -v %s -s %s -i %s >%s.%s.quebec_mtMinus.fasta;wait\n\n" %(reference_fasta, quebec_mtMinus_vcf, " ".join(que_mtMinus), cds_string, common_name,n)
    elif found_ness_IDs[n]['seqid'] == 'chromosome_6':
        if found_ness_IDs[n]['strand'] == '-':
            cmd="vcf2fasta.py -r %s --concatenate --reverse_complement -v %s -s %s -i %s >%s.%s.quebec_mtPlus.fasta;wait\n\n" %(reference_fasta, quebec_mtPLUS_vcf, " ".join(que_mtPlus), cds_string, common_name,n)
        else:
            cmd="vcf2fasta.py -r %s --concatenate -v %s -s %s -i %s >%s.%s.quebec_mtPlus.fasta;wait\n\n" %(reference_fasta, quebec_mtPLUS_vcf, " ".join(que_mtPlus), cds_string, common_name,n)
        # This block of code runs on C-region mtMinus genes because they are on CH6 not mtMinus reference.
        # NOTE(review): the assignments below overwrite the mtPlus `cmd` built just
        # above, so for these genes only the mtMinus FASTA is written.
        if common_name in genes and genes[common_name]['region'] =='C': #the centromere region is not included in the minus allele reference fasta, so we are using those
            if found_ness_IDs[n]['strand'] == '-':
                cmd="vcf2fasta.py -r %s --concatenate --reverse_complement -v %s -s %s -i %s >%s.%s.quebec_mtMinus.fasta;wait\n\n" %(reference_fasta, quebec_vcf, " ".join(que_mtMinus), cds_string, common_name, n)
            else:
                cmd ="vcf2fasta.py -r %s --concatenate -v %s -s %s -i %s >%s.%s.quebec_mtMinus.fasta;wait\n\n" %(reference_fasta, quebec_vcf, " ".join(que_mtMinus), cds_string, common_name,n)
    # NOTE(review): when seqid is neither 'mtMinus' nor 'chromosome_6', `cmd` is
    # not assigned in this iteration, so the command of the previous iteration
    # (if any) is run again.
    j+=1
    print(j)#,cmd)
    child = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
    # Element [-1] of communicate() is the captured stderr; it is printed when
    # longer than one byte.
    # NOTE(review): communicate() is called a second time for the print.
    if len(child.communicate()[-1]) >1:
        print(cmd, "\n", child.communicate()[-1], "\n\n")

# MT limited Genes
The following code adapts the above code to get the mt Limited genes

In [4]:
# Rebuild target_genes for the mt-limited genes:
#   target_genes = {common_name: {ness_ID: {'transcript': Transcript,
#                                           'mtAllele': ...,
#                                           'CDS': [[start, end], ...],
#                                           'strand': ..., 'chromosome': ...}}}
# Each row of the table holds three whitespace-separated fields:
# mtAllele, common_name, ness_ID.
target_genes = {}
with open('../mtLimitedGenes/mtLimited.NameTranslation.txt') as name_table:
    for l in name_table:
        fields = l.strip().split()
        if len(fields) != 3:
            # Report the malformed row and skip it; otherwise the values of
            # the previous row would silently be processed a second time.
            print(l.strip(), l.strip().split('\t'))
            continue
        mtAllele, common_name, ness_ID = fields
        t = transcripts[ness_ID]
        target_genes[common_name] = {
            ness_ID: {
                'transcript': t,
                'mtAllele': mtAllele,
                # CDS coordinates, strand and chromosome come from the transcript.
                'CDS': [[exon.start, exon.end] for exon in t.sorted_feats('CDS')],
                'strand': t.strand,
                'chromosome': t.seqid,
            }
        }
                                         

In [None]:
# Call concatenated CDS FASTAs for the mt-limited genes. Output goes to
# mtLimited/<common_name>.<ness_id>.quebec_mtMinus.fasta or
# mtLimited/<common_name>.<ness_id>.quebec_mtPlus.fasta.
for common_name in target_genes:
    for ness_id in target_genes[common_name]:
        locus = target_genes[common_name][ness_id]
        cds_string = " ".join(["%s:%i-%i" % (locus['chromosome'], i[0], i[1]) for i in locus['CDS']])

        # Each job is (VCF, sample list, output label).
        if locus['chromosome'] == 'mtMinus' and locus['mtAllele'] == 'mtMinus':  # i.e. mtMinus
            print("mt- limited gene")
            jobs = [(quebec_mtMinus_vcf, que_mtMinus, "quebec_mtMinus")]
        elif locus['chromosome'] == 'chromosome_6' and locus['mtAllele'] == 'mtPlus':  # i.e. mtPlus
            print("mt+ gene")
            jobs = [(quebec_mtPLUS_vcf, que_mtPlus, "quebec_mtPlus")]
        else:
            # Chromosome and allele label disagree: report it and run nothing.
            print("Something weird",common_name, ness_id, locus['chromosome'],locus['mtAllele'] )
            jobs = []

        # Minus-strand genes are reverse complemented.
        if locus['strand'] == '-':
            orientation = "--concatenate --reverse_complement"
        else:
            orientation = "--concatenate"

        cmds = ["vcf2fasta.py -r %s %s -v %s -s %s -i %s >mtLimited/%s.%s.%s.fasta"
                % (reference_fasta, orientation, vcf, " ".join(samples), cds_string, common_name, ness_id, label)
                for vcf, samples, label in jobs]

        # Wait for every command and report its stderr inside the loop, so a
        # locus without commands never re-checks the child of a previous one.
        for cmd in cmds:
            print(cmd)
            child = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
            stderr_text = child.communicate()[-1]
            if len(stderr_text) > 1:
                print(cmd, "\n", stderr_text, "\n\n")

# Random Genes
The following code adapts the above code to get other ch6 genes

The mtPlus allele runs in line with mtMinus sequence here:

    chromosome_6:298299-826737
    
But the C terminus ends with MAT3 

    chromosome_6	phytozome8_0	mRNA	937146	943474	.	+	.	ID=PAC:26893469;Name=Cre06.g255450.t1.3;pacid=26893469;longest=1;geneName=MAT3;Parent=Cre06.g255450;ness_ID=26893469

So we want transcripts before 298299 or after 943474

In [12]:
!grep MAT3 ../find_shared_genes/mtRegions.GFF


chromosome_6	phytozome8_0	mRNA	937146	943474	.	+	.	ID=PAC:26893469;Name=Cre06.g255450.t1.3;pacid=26893469;longest=1;geneName=MAT3;Parent=Cre06.g255450;ness_ID=26893469


In [3]:
# Rebind `transcripts` to the hash built from final.strict.GFF3 (keyed through
# index_label='ness_ID'); this replaces the mtRegions.GFF-based hash that was
# assigned to the same name earlier in the notebook.
transcripts = Transcript.hash_gff('/scratch/research/references/chlamydomonas/5.3_chlamy_w_organelles_mt_minus/annotation/concatenated_GFF/final.strict.GFF3', index_label='ness_ID', quiet=True)

In [18]:
def _lies_outside_mt_span(record):
    """Return True for chromosome_6 transcripts ending before 298299 or starting after 943474."""
    if record.seqid != "chromosome_6":
        return False
    return record.end < 298299 or record.start > 943474


# Keep only the chromosome_6 transcripts on either side of the mating-type span.
ch6_transcripts = {
    key: transcripts[key]
    for key in transcripts
    if _lies_outside_mt_span(transcripts[key])
}
print(len(ch6_transcripts))

1613


In [4]:
import random

# Draw up to 1000 transcripts in random order from sequences whose name
# contains "chromosome", leaving out chromosome_6 itself.
all_ness_ids = list(transcripts.keys())
random.shuffle(all_ness_ids)

random_transcripts = {}
for candidate_id in all_ness_ids:
    candidate = transcripts[candidate_id]
    if candidate.seqid != "chromosome_6" and "chromosome" in candidate.seqid:
        random_transcripts[candidate_id] = candidate
    # Stop as soon as the sample is full.
    if len(random_transcripts) >= 1000:
        break

print(len(random_transcripts))

1000


In [None]:
# Call one concatenated CDS FASTA per flanking chromosome_6 transcript, for
# all Quebec samples (mtMinus followed by mtPlus), from the overall VCF.
# Output goes to chromosome_6/<ness_id>.all_quebec.fasta.
strand_flags = {'-': "--concatenate --reverse_complement", '+': "--concatenate"}
all_quebec_samples = " ".join(que_mtMinus + que_mtPlus)

for ness_id in ch6_transcripts:
    t = ch6_transcripts[ness_id]
    flags = strand_flags.get(t.strand)
    if flags is None:
        # Unknown strand: report it and run nothing. Skipping here also avoids
        # re-checking the child process of the previous transcript.
        print("Something weird", ness_id, t.seqid)
        continue

    cds_string = " ".join(["%s:%i-%i" % (t.seqid, exon.start, exon.end) for exon in t.sorted_feats('CDS')])
    cmd = "vcf2fasta.py -r %s %s -v %s -s %s -i %s >chromosome_6/%s.all_quebec.fasta" % (
        reference_fasta, flags, quebec_vcf, all_quebec_samples, cds_string, ness_id)

    child = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
    # Read stderr once and report it when the command wrote anything to it.
    stderr_text = child.communicate()[-1]
    if len(stderr_text) > 1:
        print(cmd, "\n", stderr_text, "\n\n")

In [None]:
# Call one concatenated CDS FASTA per randomly chosen transcript, for all
# Quebec samples (mtMinus followed by mtPlus), from the overall VCF.
# Output goes to random_transcripts2/<ness_id>.all_quebec.fasta.
strand_flags = {'-': "--concatenate --reverse_complement", '+': "--concatenate"}
all_quebec_samples = " ".join(que_mtMinus + que_mtPlus)

for processed, ness_id in enumerate(random_transcripts, 1):
    t = random_transcripts[ness_id]
    flags = strand_flags.get(t.strand)
    if flags is None:
        # Unknown strand: report it and run nothing for this transcript, so
        # the child process of the previous transcript is not checked again.
        print("Something weird", ness_id, t.seqid)
    else:
        cds_string = " ".join(["%s:%i-%i" % (t.seqid, exon.start, exon.end) for exon in t.sorted_feats('CDS')])
        cmd = "vcf2fasta.py -r %s %s -v %s -s %s -i %s >random_transcripts2/%s.all_quebec.fasta" % (
            reference_fasta, flags, quebec_vcf, all_quebec_samples, cds_string, ness_id)
        child = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
        # Read stderr once and report it when the command wrote anything to it.
        stderr_text = child.communicate()[-1]
        if len(stderr_text) > 1:
            print(cmd, "\n", stderr_text, "\n\n")
    # Progress indicator every ten transcripts.
    if processed % 10 == 0:
        print(processed)

10
20
30
40
50
60
70
