# Find Shared Genes


Multiple Steps were taken to find shared gene pairs:   

  1. reciprocal best BLAST
  2. Use deHoff et al. Table S4 to equate common names with v4 gene IDs (Cre06.gXYZ), use those gene IDs to find the v5.3 gene IDs, and then associate them with the ness_ID
  3. for the last few remaining we can infer the gene based on the position relative to other known genes.
  
  
### The result of all this has been manually collated into a table `NameTranslation.txt`

Extract all CDS in ch6 and mtminus mt Loci

Run reciprocal BLAST to identify orthologs

Get transcripts - output CDS FASTAs for BLAST

ch6 mt region runs from NIC7 (chromosome_6:336959-344444) to MAT3  (chromosome_6:937146-943474)

In [None]:
%%bash
# Extract the annotation records for the mating-type region from the bgzipped,
# tabix-indexed GFF: the chromosome_6 window (NIC7..MAT3 coordinates from the
# markdown above, padded by a few bp on each side) plus the whole mtMinus contig.
# The two commented-out commands below appear to be how mini_gff.gff.gz and its
# index were built -- kept for provenance.
# bgzip -c  ../VCF2FASTA/mini_gff.gff >mini_gff.gff.gz
# tabix -p gff mini_gff.gff.gz
tabix mini_gff.gff.gz chromosome_6:336949-943475 mtMinus >mtRegions.GFF

In [None]:
# Spot-check the extraction: show the last 50 chromosome_6 records written to mtRegions.GFF.
!grep "chromosome_6" mtRegions.GFF |tail -n 50


In [1]:
from annotation import Transcript
from Bio import SeqIO
from annotation import GFF_line

In [None]:
# Index the transcripts found in the extracted GFF. Later cells look entries up by
# ness_ID and read `.seqid` and `.cds(ref_dict)` from each one.
# NOTE(review): hash_gff lives in the project-local `annotation` module; presumably it
# returns a dict keyed by the attribute named in `index_label` -- confirm.
transcripts = Transcript.hash_gff('mtRegions.GFF', index_label='ness_ID', quiet=True)

In [None]:
# Load every sequence of the reference (nuclear + organelles + mtMinus) into memory,
# keyed by record id. SeqIO.to_dict consumes the parser completely, so the handle can
# be closed as soon as the dictionary is built (the original left it open).
# NOTE(review): absolute cluster path -- consider moving it to a config cell.
with open('/scratch/research/references/chlamydomonas/5.3_chlamy_w_organelles_mt_minus/chlamy.5.3.w_organelles_mtMinus.fasta') as ref_fasta:
    ref_dict = SeqIO.to_dict(SeqIO.parse(ref_fasta, 'fasta'))

In [None]:
# Write one FASTA record per transcript (header = ness_ID, sequence = concatenated CDS).
# chromosome_6 transcripts go to the mt+ file, mtMinus transcripts to the mt- file;
# anything on another sequence is reported and skipped.
# The `with` block guarantees both files are flushed and closed even if building a
# CDS raises part-way through.
with open('mtMinus_CDS.fasta', 'w') as oMinus, open('mtPlus_CDS.fasta', 'w') as oPlus:
    for ness_id in transcripts:
        t = transcripts[ness_id]
        # t.cds() yields the CDS pieces; join them into a single sequence string
        cds = "".join(t.cds(ref_dict))
        if t.seqid == 'chromosome_6':
            oPlus.write(">%s\n%s\n" % (ness_id, cds))
        elif t.seqid == 'mtMinus':
            oMinus.write(">%s\n%s\n" % (ness_id, cds))
        else:
            print('WTF', t.seqid, ness_id)
        
        
    

In [None]:
%%bash

# BLAST every mt- CDS (query) against the mt+ CDS set (subject), keeping one
# alignment / one HSP per query and requiring e-value <= 1e-10.
# Output is tab-separated; the 14 columns named in -outfmt are unpacked in exactly
# this order by the parsing cells further down.
blastn \
-num_alignments 1 \
-max_hsps 1 \
-evalue 1e-10 \
-query mtMinus_CDS.fasta \
-subject mtPlus_CDS.fasta \
-out Minus_vs_Plus.txt \
-outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qcovs qcovhsp"

In [None]:
%%bash

# Reverse direction of the reciprocal search: every mt+ CDS (query) against the
# mt- CDS set (subject), with the same limits (one alignment, one HSP,
# e-value <= 1e-10) and the same 14-column tab-separated output.
blastn \
-num_alignments 1 \
-max_hsps 1 \
-evalue 1e-10 \
-query mtPlus_CDS.fasta \
-subject mtMinus_CDS.fasta \
-out Plus_vs_Minus.txt \
-outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qcovs qcovhsp"

In [3]:
def line_parser(line, sep=None):
    """Split a delimited text line and convert numeric-looking fields to numbers.

    Parameters
    ----------
    line : str
        One line of text; surrounding whitespace (including the newline) is stripped
        before splitting.
    sep : str or None
        Field separator handed to ``str.split``; ``None`` splits on runs of whitespace.

    Returns
    -------
    list
        One entry per field: ``int`` if the field parses as an integer, otherwise
        ``float`` if it parses as a float, otherwise the original string.

    Notes
    -----
    The previous implementation decided between int and float by looking for "." or
    a lowercase "e" in the text, so values that ``float()`` accepts without either
    character (e.g. ``1E-5``, ``inf``, ``nan``) reached ``int()`` and raised
    ``ValueError``. Trying ``int`` first and falling back to ``float`` gives the same
    result for every input that used to work and handles those cases too.
    """
    parsed = []
    for field in line.strip().split(sep):
        try:
            parsed.append(int(field))
            continue
        except ValueError:
            pass
        try:
            parsed.append(float(field))
        except ValueError:
            # not numeric at all (e.g. a sequence ID such as 'ADF43181.1')
            parsed.append(field)
    return parsed

In [4]:
# Plus -> minus best hits: p2m[plus ness_ID] = [minus ID, % identity, % query coverage].
# Any hit that fails the quality filter (identity > 80 and query coverage > 25) is
# printed for manual inspection but is still recorded. Only the first hit seen for a
# query is kept; further hits for the same query are printed instead.
p2m = {}
with open('Plus_vs_Minus.txt') as blast_hits:  # closed automatically; original leaked the handle
    for l in blast_hits:
        (qseqid, sseqid, pident, length, mismatch,
         gapopen, qstart, qend, sstart, send,
         evalue, bitscore, qcovs, qcovhsp) = line_parser(l, sep="\t")
        # line_parser converts the all-digit plus IDs to int; restore the string form
        qseqid = str(qseqid)
        if not (pident > 80 and qcovs > 25):
            print(l)
        if qseqid not in p2m:
            p2m[qseqid] = [sseqid, pident, qcovs]
        else:
            print(l)

26893063	ADF43181.1	98.738	951	12	0	3367	4317	1	951	0.0	1690	22	22

26893603	ADF43181.1	98.633	951	13	0	3319	4269	1	951	0.0	1685	22	22



In [5]:
# Minus -> plus best hits: m2p[minus ID] = [plus ness_ID, % identity, % query coverage].
# Any hit that fails the quality filter (identity > 80 and query coverage > 25) is
# printed for manual inspection but is still recorded. Only the first hit seen for a
# query is kept; further hits for the same query are printed instead.
m2p = {}
with open('Minus_vs_Plus.txt') as blast_hits:  # closed automatically; original leaked the handle
    for l in blast_hits:
        (qseqid, sseqid, pident, length, mismatch,
         gapopen, qstart, qend, sstart, send,
         evalue, bitscore, qcovs, qcovhsp) = line_parser(l, sep="\t")
        # here the *subject* is the all-digit plus ID that line_parser turned into an int
        sseqid = str(sseqid)
        if not (pident > 80 and qcovs > 25):
            print(l)
        # Fix: the duplicate check must look in m2p. The original tested p2m, whose keys
        # are plus IDs, so the test was always true and a later hit for the same minus
        # query silently replaced the first one.
        if qseqid not in m2p:
            m2p[qseqid] = [sseqid, pident, qcovs]
        else:
            print(l)

# Dump both directions for eyeballing.
for m,p in m2p.items():
    print(m,p)

print("___________________________")
for p,m in p2m.items():
    print(p,m)

ADF43176.1 ['26893261', 96.949, 28]
ADF43179.1 ['26894262', 99.732, 100]
ADF43159.1 ['26894744', 98.663, 92]
ADF43197.1 ['26893801', 99.866, 100]
ADF43162.1 ['26894689', 99.288, 100]
ADF43166.1 ['26893360', 99.199, 100]
ADF43188.1 ['26893651', 99.479, 43]
ADF43180.1 ['26893070', 99.422, 97]
ADF43187.1 ['26893524', 98.494, 100]
ADF43172.1 ['26894098', 98.278, 100]
ADF43196.1 ['26893033', 99.33, 94]
ADF43169.1 ['26893622', 98.189, 99]
ADF43189.1 ['26894226', 98.063, 98]
ADF43177.1 ['26893429', 98.242, 100]
ADF43178.1 ['26893660', 99.407, 100]
ADF43164.1 ['26893557', 98.81, 87]
ADF43199.1 ['26893348', 92.982, 46]
ADF43168.1 ['26894643', 98.166, 84]
ADF43170.1 ['26893973', 99.592, 100]
ADF43181.1 ['26893063', 98.738, 100]
ADF43194.1 ['26893073', 98.915, 100]
ADF43167.1 ['26894285', 99.02, 49]
ADF43173.1 ['26893370', 99.338, 93]
ADF43182.1 ['26893603', 98.175, 72]
ADF43161.1 ['26893872', 96.101, 82]
ADF43184.1 ['26893059', 99.077, 40]
ADF43174.1 ['26894095', 97.739, 100]
ADF43183.1 ['268936

In [6]:
# Reciprocal best hits: keep a minus gene only when the plus gene it hits best
# points straight back at it. Each entry of `pairs` is [plus ness_ID, minus ID].
pairs = []
for minus_id, plus_hit in m2p.items():
    plus_id = plus_hit[0]
    if plus_id not in p2m or p2m[plus_id][0] != minus_id:
        continue
    print("pair:", plus_hit, minus_id)
    pairs.append([plus_id, minus_id])
print(len(pairs))

pair: ['26893261', 96.949, 28] ADF43176.1
pair: ['26894262', 99.732, 100] ADF43179.1
pair: ['26894744', 98.663, 92] ADF43159.1
pair: ['26893801', 99.866, 100] ADF43197.1
pair: ['26894689', 99.288, 100] ADF43162.1
pair: ['26893360', 99.199, 100] ADF43166.1
pair: ['26893651', 99.479, 43] ADF43188.1
pair: ['26893070', 99.422, 97] ADF43180.1
pair: ['26893524', 98.494, 100] ADF43187.1
pair: ['26894098', 98.278, 100] ADF43172.1
pair: ['26893033', 99.33, 94] ADF43196.1
pair: ['26893622', 98.189, 99] ADF43169.1
pair: ['26894226', 98.063, 98] ADF43189.1
pair: ['26893429', 98.242, 100] ADF43177.1
pair: ['26893660', 99.407, 100] ADF43178.1
pair: ['26893557', 98.81, 87] ADF43164.1
pair: ['26893348', 92.982, 46] ADF43199.1
pair: ['26894643', 98.166, 84] ADF43168.1
pair: ['26893973', 99.592, 100] ADF43170.1
pair: ['26893063', 98.738, 100] ADF43181.1
pair: ['26893073', 98.915, 100] ADF43194.1
pair: ['26894285', 99.02, 49] ADF43167.1
pair: ['26893370', 99.338, 93] ADF43173.1
pair: ['26893872', 96.101,

In [7]:
# Plain two-column listing of the reciprocal pairs: plus ID, then minus ID.
for entry in pairs:
    plus_id = entry[0]
    minus_id = entry[1]
    print(plus_id, minus_id)

26893261 ADF43176.1
26894262 ADF43179.1
26894744 ADF43159.1
26893801 ADF43197.1
26894689 ADF43162.1
26893360 ADF43166.1
26893651 ADF43188.1
26893070 ADF43180.1
26893524 ADF43187.1
26894098 ADF43172.1
26893033 ADF43196.1
26893622 ADF43169.1
26894226 ADF43189.1
26893429 ADF43177.1
26893660 ADF43178.1
26893557 ADF43164.1
26893348 ADF43199.1
26894643 ADF43168.1
26893973 ADF43170.1
26893063 ADF43181.1
26893073 ADF43194.1
26894285 ADF43167.1
26893370 ADF43173.1
26893872 ADF43161.1
26893059 ADF43184.1
26894095 ADF43174.1
26893178 ADF43171.1
26894494 ADF43175.1
26893232 ADF43186.1
26894333 ADF43165.1
26893647 ADF43163.1
26893219 ADF43198.1
26894294 ADF43195.1
26893181 ADF43160.1
26894082 ADF43192.1
26894462 ADF43191.1


In [8]:
# these are the ones with low qcovs in one direction:
# show the hit record from both directions side by side for each of them
low_cov_plus_ids = "26893261 26893348 26893059 26894285 26893651".split()
for plus_id in low_cov_plus_ids:
    minus_id = p2m[plus_id][0]
    print(p2m[plus_id], m2p[minus_id])

['ADF43176.1', 96.949, 37] ['26893261', 96.949, 28]
['ADF43199.1', 92.982, 98] ['26893348', 92.982, 46]
['ADF43184.1', 99.077, 89] ['26893059', 99.077, 40]
['ADF43167.1', 99.02, 100] ['26894285', 99.02, 49]
['ADF43188.1', 99.479, 37] ['26893651', 99.479, 43]


# After all this there are two reciprocal best BLAST hits that are low "coverage" in both directions.

The pair 'ADF43188.1' and '26893651' has 37% query coverage p2m and 43% m2p

    ['ADF43188.1', 99.479, 37] ['26893651', 99.479, 43]

This corresponds to the gene "MADS2m" which according to deHoff is on both CHs in the rearranged region on an inversion
So let's keep it and carry on

In [9]:
# Pull the GFF records for ADF43188.1 to see which gene it is annotated as.
! zgrep "ADF43188.1" mini_gff.gff.gz

mtMinus	feature	gene	251374	254600	.	+	.	gene=MADS2m;ness_ID=ADF43188.1;ID=ADF43188.1;Name=ADF43188.1
mtMinus	feature	CDS	251374	251520	.	+	0	ness_ID=ADF43188.1;ID=ADF43188.1.CDS.1;Parent=ADF43188.1;Name=ADF43188.1
mtMinus	feature	CDS	251775	251864	.	+	0	ness_ID=ADF43188.1;ID=ADF43188.1.CDS.2;Parent=ADF43188.1;Name=ADF43188.1
mtMinus	feature	CDS	252011	252151	.	+	0	ness_ID=ADF43188.1;ID=ADF43188.1.CDS.3;Parent=ADF43188.1;Name=ADF43188.1
mtMinus	feature	CDS	252330	252735	.	+	0	ness_ID=ADF43188.1;ID=ADF43188.1.CDS.4;Parent=ADF43188.1;Name=ADF43188.1
mtMinus	feature	CDS	253014	253039	.	+	0	ness_ID=ADF43188.1;ID=ADF43188.1.CDS.5;Parent=ADF43188.1;Name=ADF43188.1
mtMinus	feature	CDS	253101	253458	.	+	0	ness_ID=ADF43188.1;ID=ADF43188.1.CDS.6;Parent=ADF43188.1;Name=ADF43188.1
mtMinus	feature	CDS	253779	254089	.	+	0	ness_ID=ADF43188.1;ID=ADF43188.1.CDS.7;Parent=ADF43188.1;Name=ADF43188.1
mtMinus	feature	CDS	254286	254600	.	+	0	ness_ID=ADF43188.1;ID=ADF43188.1.CDS.8;Parent=ADF43188.1;Name=ADF43

 - The other pair is 
    ['ADF43176.1', 96.949, 37] ['26893261', 96.949, 28]
Which corresponds to "LPS1m" which is also in the R domain and itself inverted  along with 522875. 

In [10]:
# Pull the GFF records for ADF43176.1 to see which gene it is annotated as.
! zgrep "ADF43176.1" mini_gff.gff.gz


mtMinus	feature	gene	139144	171097	.	+	.	gene=LPS1m;ness_ID=ADF43176.1;ID=ADF43176.1;Name=ADF43176.1
mtMinus	feature	CDS	139144	139302	.	+	0	ness_ID=ADF43176.1;ID=ADF43176.1.CDS.1;Parent=ADF43176.1;Name=ADF43176.1
mtMinus	feature	CDS	139385	139492	.	+	0	ness_ID=ADF43176.1;ID=ADF43176.1.CDS.2;Parent=ADF43176.1;Name=ADF43176.1
mtMinus	feature	CDS	141363	141576	.	+	0	ness_ID=ADF43176.1;ID=ADF43176.1.CDS.3;Parent=ADF43176.1;Name=ADF43176.1
mtMinus	feature	CDS	145283	145554	.	+	0	ness_ID=ADF43176.1;ID=ADF43176.1.CDS.4;Parent=ADF43176.1;Name=ADF43176.1
mtMinus	feature	CDS	145692	145783	.	+	0	ness_ID=ADF43176.1;ID=ADF43176.1.CDS.5;Parent=ADF43176.1;Name=ADF43176.1
mtMinus	feature	CDS	145881	145935	.	+	0	ness_ID=ADF43176.1;ID=ADF43176.1.CDS.6;Parent=ADF43176.1;Name=ADF43176.1
mtMinus	feature	CDS	153198	153275	.	+	0	ness_ID=ADF43176.1;ID=ADF43176.1.CDS.7;Parent=ADF43176.1;Name=ADF43176.1
mtMinus	feature	CDS	155812	156084	.	+	0	ness_ID=ADF43176.1;ID=ADF43176.1.CDS.8;Parent=ADF43176.1;Name=ADF431

In [11]:
# Look up gene model Cre06.g252801 in the combined GFF to see its ness_ID.
!zgrep "Cre06.g252801" mini_gff.gff.gz

chromosome_6	phytozome8_0	gene	491828	501445	.	-	.	ID=Cre06.g252801;Name=Cre06.g252801
chromosome_6	phytozome8_0	mRNA	491828	501445	.	-	.	ID=PAC:26893261;Name=Cre06.g252801.t1.3;pacid=26893261;longest=1;Parent=Cre06.g252801;ness_ID=26893261


In [12]:
# Find every GFF record that mentions PDK1 (the pattern matches records on either haplotype).
!zgrep "PDK1" mini_gff.gff.gz

chromosome_6	phytozome8_0	mRNA	430275	435265	.	-	.	ID=PAC:26893429;Name=Cre06.g252300.t1.2;pacid=26893429;longest=1;geneName=PDK1;Parent=Cre06.g252300;ness_ID=26893429
mtMinus	feature	gene	175950	179936	.	-	.	gene=PDK1m;ness_ID=ADF43177.1;ID=ADF43177.1;Name=ADF43177.1


# deHoff


In [13]:
# Gene names shared between the two haplotypes. The lists are positionally paired:
# element k of mtPlus_shared is the counterpart of element k of mtMinus_shared.
mtPlus_shared = "NIC7 SPS1 SPP3 EIF5Bb PTC1 182394 97782 RFC4 182392 ALB3 PSF2 182390 182389 294687 HDH1 TOC34 PDK1 CGL70 NMDA1 DRG1 DLA3 522872 LEU1S SPL2 LPS1 522875 PKY1 MADS2 UBCH1 GCSH PR46a PR46b MT0618 155027 OTU2a HRGP1 UTP1 MT0796 MT0828 MT0829 FUM1 FBX9 522914 522915 294742 522917 522918 522919 161193 196073 522922 SAD1 294752 344092 196063 CGLD28 THI10".split()
mtMinus_shared= "NIC7 SPS1 SPP1C EIF5Bb PTC1 182394 97782 RFC4 182392 ALB3 PSF2 182390 182389 294687 HDH1 TOC34 PDK1 CGL70 NMDA1 DRG1 DLA3 522872 LEU1S SPL2 LPS1 522875 PKY1 MADS2 UBCH1 GCSH PR46a PR46b MT0618 155027 OTU2 HRGP1 UTP1 MT0796 MT0828 MT0829 FUM1 FBX9 522914 522915 294742 522917 522918 522919 161193 196073 522922 SAD1 294752 344092 196063 CGLD28 THI10".split()
shared_pairs = list(zip(mtPlus_shared, mtMinus_shared))

# Every distinct name from either list, in first-seen order.
nr_pair_names = []
for i,j in zip(mtPlus_shared, mtMinus_shared):
    if i not in nr_pair_names: nr_pair_names.append(i)
    if j not in nr_pair_names: nr_pair_names.append(j)

# Read and parse the GFF once. The original reopened the file for every common name
# and never closed any of those handles.
with open('mtRegions.GFF') as gff_handle:
    gff_records = [GFF_line.GFF_line(l) for l in gff_handle]

synonyms  = {}
#this is to avoid looking at attributes that are not names like DBXREF or codon_start 
possible_name_attributes = "Name Parent ID geneName gene ness_ID protein_id pacid product".split()
for common_name in nr_pair_names:
    # For every common name, collect its synonyms: walk the GFF records in file order
    # and, whenever a record shares a name with what is known so far, absorb all of
    # that record's name-like attribute values.
    synonyms[common_name] = {'synonyms':[common_name], 'gffs':[]}
    print(common_name)
    known = synonyms[common_name]['synonyms']
    for g in gff_records:
        # values of the name-like attributes carried by this record
        name_values = [g.attributes[attr] for attr in g.attributes
                       if attr in possible_name_attributes]
        # A record matches when any of its name values contains, or is contained in,
        # any synonym collected so far. This is the same rule the original nested
        # loops applied, without shadowing `attr` or growing the list mid-iteration.
        # NOTE(review): substring matching is deliberately loose and can over-match
        # (short names, numeric IDs) -- confirm the collected synonyms by eye.
        found = any(syn in value or value in syn
                    for value in name_values for syn in known)
        if found:
            for value in name_values:
                if value not in known:
                    known.append(value)
                        


NIC7
SPS1
SPP3
SPP1C
EIF5Bb
PTC1
182394
97782
RFC4
182392
ALB3
PSF2
182390
182389
294687
HDH1
TOC34
PDK1
CGL70
NMDA1
DRG1
DLA3
522872
LEU1S
SPL2
LPS1
522875
PKY1
MADS2
UBCH1
GCSH
PR46a
PR46b
MT0618
155027
OTU2a
OTU2
HRGP1
UTP1
MT0796
MT0828
MT0829
FUM1
FBX9
522914
522915
294742
522917
522918
522919
161193
196073
522922
SAD1
294752
344092
196063
CGLD28
THI10


# combine deHoff with reciprocal best BLAST


In [14]:
# Sanity check: inspect the aliases collected for one gene (THI10).
synonyms['THI10']['synonyms']

['THI10',
 'PAC:26893692',
 'Cre06.g255350.t1.2',
 '26893692',
 'Cre06.g255350',
 'PAC:26893692.exon.1',
 'PAC:26893692.five_prime_UTR.1',
 'PAC:26893692.CDS.1',
 'PAC:26893692.CDS.2',
 'PAC:26893692.exon.2',
 'PAC:26893692.CDS.3',
 'PAC:26893692.exon.3',
 'PAC:26893692.CDS.4',
 'PAC:26893692.exon.4',
 'PAC:26893692.CDS.5',
 'PAC:26893692.exon.5',
 'PAC:26893692.CDS.6',
 'PAC:26893692.exon.6',
 'PAC:26893692.exon.7',
 'PAC:26893692.CDS.7',
 'PAC:26893692.three_prime_UTR.1']

In [15]:
# Attach a deHoff common name to each reciprocal-best-hit pair.
# A pair belongs to a common name when either of its IDs contains, or is contained
# in, one of that name's synonyms. The pair is recorded on the synonyms entry
# ('ness_ids') and the name is appended to the pair inside `pairs`; if several
# names match, the last one visited wins.
for idx, pair in enumerate(pairs):
    # pairs were built as [plus ness_ID, minus ID]
    plus_id, minus_id = pair[0], pair[1]
    for ness_id in pair:
        for common_name in synonyms:
            aliases = synonyms[common_name]['synonyms']
            if any(alias in ness_id or ness_id in alias for alias in aliases):
                synonyms[common_name]['ness_ids'] = pair
                pairs[idx] = [plus_id, minus_id, common_name]

# Report every common name, followed by its pair of IDs when one was found.
found_count = 0
for common_name in synonyms:
    entry = synonyms[common_name]
    if 'ness_ids' in entry:
        print(common_name, entry['ness_ids'][0], entry['ness_ids'][1])
        found_count += 1
    else:
        print(common_name)


MT0829
THI10
SPL2 26894294 ADF43195.1
PDK1 26893429 ADF43177.1
OTU2a
522922
182389 26893178 ADF43171.1
EIF5Bb 26894689 ADF43162.1
182392 26894285 ADF43167.1
HRGP1
522919
MADS2 26893651 ADF43188.1
LEU1S 26893073 ADF43194.1
MT0828 26893348 ADF43199.1
FBX9
UBCH1 26893524 ADF43187.1
97782 26894333 ADF43165.1
UTP1 26893801 ADF43197.1
MT0618
522917
522914
OTU2 26893063 ADF43181.1
SPP3 26893872 ADF43161.1
NMDA1 26894262 ADF43179.1
522915
522872 26894082 ADF43192.1
GCSH 26893232 ADF43186.1
522918
SAD1
ALB3 26894643 ADF43168.1
LPS1 26893261 ADF43176.1
294742
344092
PTC1 26893647 ADF43163.1
DRG1 26893070 ADF43180.1
182390 26893973 ADF43170.1
155027
161193
PSF2 26893622 ADF43169.1
TOC34 26894095 ADF43174.1
CGLD28
CGL70 26893660 ADF43178.1
SPP1C 26893872 ADF43161.1
MT0796
PKY1 26894226 ADF43189.1
RFC4 26893360 ADF43166.1
HDH1 26893370 ADF43173.1
196073
PR46b 26893059 ADF43184.1
522875 26894494 ADF43175.1
182394 26893557 ADF43164.1
196063
294687 26894098 ADF43172.1
DLA3 26894462 ADF43191.1
FUM1
PR4

In [16]:
# Tab-separated dump of the pairs: plus ID, minus ID and, where found, the common name.
for row in pairs:
    joined = "\t".join(row)
    print(joined)

26893261	ADF43176.1	LPS1
26894262	ADF43179.1	NMDA1
26894744	ADF43159.1	NIC7
26893801	ADF43197.1	UTP1
26894689	ADF43162.1	EIF5Bb
26893360	ADF43166.1	RFC4
26893651	ADF43188.1	MADS2
26893070	ADF43180.1	DRG1
26893524	ADF43187.1	UBCH1
26894098	ADF43172.1	294687
26893033	ADF43196.1
26893622	ADF43169.1	PSF2
26894226	ADF43189.1	PKY1
26893429	ADF43177.1	PDK1
26893660	ADF43178.1	CGL70
26893557	ADF43164.1	182394
26893348	ADF43199.1	MT0828
26894643	ADF43168.1	ALB3
26893973	ADF43170.1	182390
26893063	ADF43181.1	OTU2
26893073	ADF43194.1	LEU1S
26894285	ADF43167.1	182392
26893370	ADF43173.1	HDH1
26893872	ADF43161.1	SPP1C
26893059	ADF43184.1	PR46b
26894095	ADF43174.1	TOC34
26893178	ADF43171.1	182389
26894494	ADF43175.1	522875
26893232	ADF43186.1	GCSH
26894333	ADF43165.1	97782
26893647	ADF43163.1	PTC1
26893219	ADF43198.1
26894294	ADF43195.1	SPL2
26893181	ADF43160.1	SPS1
26894082	ADF43192.1	522872
26894462	ADF43191.1	DLA3


# the C Domain

The reciprocal best BLAST pairs didn't include the C domain because it isn't part of the mtMinus reference

According to deHoff the C domain starts with mt0828 and ends with MAT3


In [None]:
# Inspect everything collected for MT0828, the gene the markdown above names as the start of the C domain.
synonyms['MT0828']

In [None]:
# Locate the chromosome_6 records carrying ness_ID 26893348 (the plus ID paired with MT0828 above).
!tabix mini_gff.gff.gz chromosome_6:200000-1000000|grep "ness_ID=26893348"

# MT0828 starts at 824863

In [None]:
# List the mRNA records on chromosome_6 from position 824863 (the MT0828 start noted above) up to 1 Mb.
!tabix mini_gff.gff.gz chromosome_6:824863-1000000|grep "mRNA"