# Step 1. Generate gene fasta file

## Step 1.1 Load genome file using biopython from ARC: trs2seq dictionary

In [1]:
# use Biopython to read FASTA file
from Bio import SeqIO
chr2seq = {}
allrecords = []
for seq_record in SeqIO.parse("Ath.fa", "fasta"):  #??
    chr2seq[seq_record.id] = seq_record.seq
    allrecords.append(seq_record)
    print(seq_record)

ID: 1
Name: 1
Description: 1 dna:chromosome chromosome:TAIR10:1:1:30427671:1 REF
Number of features: 0
Seq('CCCTAAACCCTAAACCCTAAACCCTAAACCTCTGAATCCTTAATCCCTAAATCC...GGG', SingleLetterAlphabet())
ID: 2
Name: 2
Description: 2 dna:chromosome chromosome:TAIR10:2:1:19698289:1 REF
Number of features: 0
Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...GGG', SingleLetterAlphabet())
ID: 3
Name: 3
Description: 3 dna:chromosome chromosome:TAIR10:3:1:23459830:1 REF
Number of features: 0
Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...CCC', SingleLetterAlphabet())
ID: 4
Name: 4
Description: 4 dna:chromosome chromosome:TAIR10:4:1:18585056:1 REF
Number of features: 0
Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...AGG', SingleLetterAlphabet())
ID: 5
Name: 5
Description: 5 dna:chromosome chromosome:TAIR10:5:1:26975502:1 REF
Number of features: 0
Seq('TATACCATGTACCCTCAACCTTAAAACCCTAAAACCTATACTATAAATCTTTAA...ATC', SingleLetterAlphabet())
ID: Mt
Name: Mt
Descripti

In [2]:
#Check chr2seq output
from pprint import pprint
pprint(chr2seq)

{'1': Seq('CCCTAAACCCTAAACCCTAAACCCTAAACCTCTGAATCCTTAATCCCTAAATCC...GGG', SingleLetterAlphabet()),
 '2': Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...GGG', SingleLetterAlphabet()),
 '3': Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...CCC', SingleLetterAlphabet()),
 '4': Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...AGG', SingleLetterAlphabet()),
 '5': Seq('TATACCATGTACCCTCAACCTTAAAACCCTAAAACCTATACTATAAATCTTTAA...ATC', SingleLetterAlphabet()),
 'Mt': Seq('GGATCCGTTCGAAACAGGTTAGCCTACTATAATATAAGGATTGGATTCTAATAA...ATT', SingleLetterAlphabet()),
 'Pt': Seq('ATGGGCGAACGACGGGAATTGAACCCGCGATGGTGAATTCACAATCCACTGCCT...ATC', SingleLetterAlphabet())}


In [3]:
#Test to see if chromosome 1 can be called from dictionary
chr1seq = chr2seq['1']
print(repr(chr1seq))

Seq('CCCTAAACCCTAAACCCTAAACCCTAAACCTCTGAATCCTTAATCCCTAAATCC...GGG', SingleLetterAlphabet())


In [4]:
#1.1 Explanation
'''
Fasta file was imported using Biopython module.  The fasta file was converted into a 
dictionary that has the chromosome numbers as the keys, and the corresponding chromosome sequences as the values
'''

'\nFasta file was imported using Biopython module.  The fasta file was converted into a \ndictionary that has the chromosome numbers as the keys, and the corresponding chromosome sequences as the values\n'

## Step 1.2 Load GTF file to a dictionary

In [7]:
###Code for myGTF------------------------------------------------------------------------------------

#!/usr/bin/env python

"""
Modified from
https://gist.github.com/slowkow/8101481
"""
import gzip
import re

GTF_HEADER  = ['seqname', 'source', 'feature', 'start', 'end', 'score',
               'strand', 'frame']
R_SEMICOLON = re.compile(r'\s*;\s*')
R_COMMA     = re.compile(r'\s*,\s*')
R_KEYVALUE  = re.compile(r'(\s+|\s*=\s*)')


def lines(filename):
    """Open an optionally gzipped GTF file and generate a dict for each line.
    """
    #fn_open = gzip.open if filename.endswith('.gz') else open

    with fn_open(filename) as fh:
        for line in fh:
            if line.startswith('#'):
                continue
            else:
                yield parse(line)


def parse(line):
    """Parse a single GTF line and return a dict.
    """
    result = {}

    fields = line.rstrip().split('\t')

    for i, col in enumerate(GTF_HEADER):
        result[col] = _get_value(fields[i])

    # INFO field consists of "key1=value;key2=value;...".
    infos = [x for x in re.split(R_SEMICOLON, fields[8]) if x.strip()]

    for i, info in enumerate(infos, 1):
        # It should be key="value".
        try:
            key, _, value = re.split(R_KEYVALUE, info, 1)
        # But sometimes it is just "value".
        except ValueError:
            key = 'INFO{}'.format(i)
            value = info
        # Ignore the field if there is no value.
        if value:
            result[key] = _get_value(value)

    return result


def _get_value(value):
    if not value:
        return None

    # Strip double and single quotes.
    value = value.strip('"\'')

    # Return a list if the value has a comma.
    if ',' in value:
        value = re.split(R_COMMA, value)
    # These values are equivalent to None.
    elif value in ['', '.', 'NA']:
        return None

    return value


In [18]:
#This creates a dictionary of transcripts with their start and stop locations (ts2exon), and another dictionary of transcripts with their chromosome locations (ts2chr).

import myGTF
ts2exon = {}   
ts2chr = {}
for k in myGTF.lines('large.gtf'):
    tsid = k['transcript_id']; chrname = k['seqname']
    tsstart = int(k['start'])-1; tsend = int(k['end'])

    if not tsid in ts2exon: 
        ts2exon[tsid]=[]    
        ts2exon[tsid].append([tsstart,tsend])
    else:
        ts2exon[tsid].append([tsstart,tsend])

    if not tsid in ts2chr:
        ts2chr[tsid]=chrname

In [19]:
#The following code is used to sort the ts2exon dictionary.

ts2exon_s = {}
from operator import itemgetter
for ets in ts2exon:
    ts2exon_s[ets]=sorted(ts2exon[ets], key = itemgetter(1))

In [20]:
# check the result of ts2exon sorted
from pprint import pprint
pprint(ts2exon_s)

{'AT1G01010.1': [[3630, 3913],
                 [3995, 4276],
                 [4485, 4605],
                 [4705, 5095],
                 [5173, 5326],
                 [5438, 5899]],
 'AT1G01020.1': [[6787, 7069],
                 [7156, 7232],
                 [7383, 7450],
                 [7563, 7649],
                 [7761, 7835],
                 [7941, 7987],
                 [8235, 8325],
                 [8416, 8464],
                 [8570, 9130]],
 'AT1G01020.2': [[6787, 7069],
                 [7156, 7450],
                 [7563, 7649],
                 [7761, 7835],
                 [7941, 7987],
                 [8235, 8325],
                 [8416, 8464],
                 [8570, 8737]],
 'AT1G01020.3': [[6787, 7069],
                 [7156, 7232],
                 [7383, 7450],
                 [7563, 7649],
                 [7761, 7835],
                 [7941, 7987],
                 [8235, 8464],
                 [8570, 9130]],
 'AT1G01020.4': [[6787, 7069],
    

                 [351728, 351819],
                 [351911, 352001],
                 [352125, 352536]],
 'AT1G02020.1': [[352621, 353162], [353237, 355124]],
 'AT1G02020.2': [[352636, 355115]],
 'AT1G02020.3': [[352611, 353162], [353237, 353512], [353621, 355021]],
 'AT1G02030.1': [[355123, 357258]],
 'AT1G02040.1': [[357905, 359078]],
 'AT1G02050.1': [[359116, 360149], [360239, 360746]],
 'AT1G02060.1': [[360917, 363142]],
 'AT1G02080.1': [[373334, 373405],
                 [373605, 373786],
                 [373891, 374189],
                 [374271, 374500],
                 [374582, 374922],
                 [375121, 375238],
                 [375324, 375453],
                 [375527, 375719],
                 [375807, 376002],
                 [376113, 376251],
                 [376351, 376474],
                 [376561, 376815],
                 [376924, 377028],
                 [377210, 377305],
                 [377390, 377615],
                 [377880, 378036],
          

 'AT1G03050.1': [[707484, 708189],
                 [708303, 708509],
                 [708600, 709467],
                 [709597, 710041]],
 'AT1G03055.1': [[710017, 710136],
                 [710240, 710333],
                 [710694, 710801],
                 [710954, 711016],
                 [711091, 711184],
                 [711262, 711408],
                 [711504, 711857]],
 'AT1G03055.2': [[710062, 710136],
                 [710240, 710333],
                 [710694, 711016],
                 [711091, 711184],
                 [711262, 711408],
                 [711504, 711857]],
 'AT1G03060.1': [[712472, 714206],
                 [714316, 714463],
                 [714557, 714864],
                 [715006, 715315],
                 [715659, 718662],
                 [718934, 720585],
                 [720680, 721795],
                 [721904, 722007],
                 [722115, 722815],
                 [722965, 724119],
                 [724274, 724389],
                 

                 [1093824, 1093995],
                 [1094311, 1094464],
                 [1094550, 1094742],
                 [1094900, 1095029],
                 [1095106, 1095177],
                 [1095346, 1095443],
                 [1095517, 1095721],
                 [1095803, 1095967],
                 [1096034, 1096510]],
 'AT1G04163.1': [[744723, 746147]],
 'AT1G04170.1': [[1096881, 1097120],
                 [1097398, 1097531],
                 [1097618, 1097680],
                 [1097799, 1097912],
                 [1097998, 1098101],
                 [1098378, 1098606],
                 [1098717, 1098936],
                 [1099036, 1099297],
                 [1099399, 1099901]],
 'AT1G04170.2': [[1096881, 1097286],
                 [1097398, 1097531],
                 [1097618, 1097680],
                 [1097799, 1097912],
                 [1097998, 1098101],
                 [1098378, 1098606],
                 [1098717, 1098936],
                 [1099036, 1099297],


                 [1452508, 1452872]],
 'AT1G05065.1': [[1454605, 1455322]],
 'AT1G05070.1': [[1457006, 1457518], [1457617, 1457641], [1458347, 1458802]],
 'AT1G05077.1': [[4915882, 4916216]],
 'AT1G05083.1': [[4920782, 4921480]],
 'AT1G05087.1': [[1463107, 1463528], [1463648, 1464173]],
 'AT1G05087.2': [[1463113, 1464173]],
 'AT1G05090.1': [[1464238, 1465324], [1465447, 1465875]],
 'AT1G05093.1': [[4953016, 4953522]],
 'AT1G05113.1': [[5036520, 5036720]],
 'AT1G05120.1': [[1471322, 1471696],
                 [1471779, 1471860],
                 [1471947, 1472036],
                 [1472129, 1472282],
                 [1472372, 1472477],
                 [1472602, 1472833],
                 [1473008, 1473131],
                 [1473237, 1473307],
                 [1473393, 1473479],
                 [1473670, 1473817],
                 [1473920, 1473984],
                 [1474094, 1474168],
                 [1474252, 1474375],
                 [1474595, 1475539],
                 [1475

                 [1860586, 1860698]],
 'AT1G06140.1': [[1864773, 1866999]],
 'AT1G06143.1': [[1867036, 1868905]],
 'AT1G06150.1': [[1868988, 1869311],
                 [1869598, 1869724],
                 [1869862, 1870003],
                 [1870156, 1870258],
                 [1870366, 1871575],
                 [1871666, 1871786],
                 [1871913, 1871973],
                 [1872225, 1872267],
                 [1872349, 1872431],
                 [1872744, 1872888],
                 [1873099, 1873730]],
 'AT1G06150.2': [[1868988, 1869311],
                 [1869598, 1869724],
                 [1869862, 1870009],
                 [1870156, 1870258],
                 [1870366, 1871575],
                 [1871666, 1871786],
                 [1871913, 1871973],
                 [1872225, 1872267],
                 [1872349, 1872431],
                 [1872744, 1872888],
                 [1873099, 1873730]],
 'AT1G06170.1': [[1884990, 1885268],
                 [1885340, 188574

                 [2372783, 2374439]],
 'AT1G07700.1': [[2379594, 2379798],
                 [2380103, 2380231],
                 [2380353, 2380782],
                 [2380870, 2380924],
                 [2381013, 2381390]],
 'AT1G07700.2': [[2379594, 2379729],
                 [2380103, 2380231],
                 [2380353, 2380782],
                 [2380870, 2380924],
                 [2381013, 2381390]],
 'AT1G07700.3': [[2379646, 2380782], [2380870, 2380924], [2381013, 2381362]],
 'AT1G07700.4': [[2379622, 2380231],
                 [2380353, 2380782],
                 [2380870, 2380924],
                 [2381013, 2381379]],
 'AT1G07702.1': [[2382250, 2382331]],
 'AT1G07705.1': [[2381614, 2381754],
                 [2381994, 2382107],
                 [2382222, 2382331],
                 [2382403, 2382582],
                 [2382681, 2382730],
                 [2382839, 2383049],
                 [2383346, 2383553],
                 [2383641, 2383743],
                 [2384125, 23

                 [2802823, 2802881],
                 [2803016, 2803094],
                 [2803240, 2803329],
                 [2803536, 2803618],
                 [2803875, 2804064],
                 [2804251, 2804537]],
 'AT1G08750.3': [[2801042, 2801160],
                 [2801267, 2801395],
                 [2801829, 2801871],
                 [2801981, 2802126],
                 [2802387, 2802617],
                 [2802823, 2802881],
                 [2803016, 2803094],
                 [2803240, 2803329],
                 [2803536, 2803618],
                 [2803875, 2804064],
                 [2804251, 2804711]],
 'AT1G08760.1': [[2805043, 2806464],
                 [2806757, 2807057],
                 [2807310, 2807402],
                 [2807541, 2808623]],
 'AT1G08770.1': [[2808759, 2809775]],
 'AT1G08780.1': [[2809777, 2810185], [2810909, 2811026], [2811114, 2811313]],
 'AT1G08800.1': [[2812817, 2814758], [2814917, 2817158], [2817717, 2818071]],
 'AT1G08800.2': [[2812817,

                 [3209963, 3210054],
                 [3210142, 3210249],
                 [3210330, 3210486],
                 [3210568, 3210739],
                 [3210847, 3211109],
                 [3211183, 3211300],
                 [3211384, 3211450],
                 [3211534, 3211659],
                 [3211833, 3211979],
                 [3212117, 3212272]],
 'AT1G09880.3': [[3208788, 3209085],
                 [3209167, 3209300],
                 [3209376, 3209650],
                 [3209730, 3209872],
                 [3209963, 3210054],
                 [3210142, 3210249],
                 [3210330, 3210486],
                 [3210568, 3210739],
                 [3210847, 3211109],
                 [3211183, 3211300],
                 [3211384, 3211450],
                 [3211534, 3211659],
                 [3211833, 3211979],
                 [3212117, 3212202],
                 [3213414, 3213468]],
 'AT1G09880.4': [[3208827, 3209085],
                 [3209167, 3209300],

                 [3693621, 3693648]],
 'AT1G11070.2': [[3690011, 3691133],
                 [3691224, 3691390],
                 [3691461, 3692434],
                 [3692529, 3692653],
                 [3692730, 3692826],
                 [3693195, 3693510],
                 [3693621, 3694200]],
 'AT1G11070.3': [[3690011, 3690624],
                 [3690705, 3691133],
                 [3691224, 3691390],
                 [3691461, 3692434],
                 [3692529, 3692653],
                 [3692730, 3692826],
                 [3693195, 3693510],
                 [3693621, 3694200]],
 'AT1G11070.4': [[3690011, 3690624],
                 [3690705, 3690858],
                 [3690968, 3691133],
                 [3691224, 3691390],
                 [3691461, 3692434],
                 [3692529, 3692653],
                 [3692730, 3692826],
                 [3693195, 3693510],
                 [3693621, 3694200]],
 'AT1G11070.5': [[3690011, 3690624],
                 [3690705, 3691133

                 [4074029, 4074104],
                 [4074347, 4074423],
                 [4074541, 4074641],
                 [4074724, 4074784],
                 [4074878, 4074949],
                 [4075027, 4075090],
                 [4075194, 4075259],
                 [4075342, 4075439],
                 [4075533, 4075659],
                 [4075793, 4075869],
                 [4075963, 4076207]],
 'AT1G12060.1': [[4076225, 4077230]],
 'AT1G12064.1': [[4077497, 4078095]],
 'AT1G12070.1': [[4078546, 4078984],
                 [4079146, 4079271],
                 [4079363, 4079588],
                 [4079765, 4079832],
                 [4079923, 4080236]],
 'AT1G12080.1': [[4084161, 4084303], [4084486, 4084567], [4084669, 4085036]],
 'AT1G12080.2': [[4084161, 4084303], [4084486, 4085185]],
 'AT1G12110.1': [[4105222, 4105455],
                 [4105640, 4105818],
                 [4105912, 4105952],
                 [4106126, 4106701],
                 [4108425, 4109601]],
 'AT1G12

 'AT1G14260.2': [[4872855, 4873023],
                 [4873160, 4873439],
                 [4873540, 4873612],
                 [4873693, 4873761],
                 [4873853, 4874004],
                 [4874076, 4874163],
                 [4874250, 4874895]],
 'AT1G14260.3': [[4873116, 4873439],
                 [4873540, 4873612],
                 [4873693, 4873761],
                 [4873853, 4874004],
                 [4874076, 4874163],
                 [4874250, 4874907]],
 'AT1G14270.1': [[4874874, 4875140],
                 [4875230, 4875337],
                 [4875423, 4875515],
                 [4875590, 4875683],
                 [4875757, 4875807],
                 [4875894, 4876028],
                 [4876111, 4876194],
                 [4876360, 4876468],
                 [4876669, 4876767],
                 [4876940, 4877256]],
 'AT1G14270.2': [[4874874, 4875140],
                 [4875230, 4875337],
                 [4875423, 4875515],
                 [4875590, 4875683]

                 [5401837, 5401897],
                 [5401978, 5402519]],
 'AT1G15690.2': [[5398989, 5399376],
                 [5399574, 5400149],
                 [5400236, 5400616],
                 [5400716, 5400803],
                 [5400884, 5401215],
                 [5401345, 5401608],
                 [5402154, 5402733]],
 'AT1G15700.1': [[5402568, 5403818]],
 'AT1G15710.1': [[5404129, 5405910]],
 'AT1G15720.1': [[5406024, 5407486]],
 'AT1G15730.1': [[5407400, 5407672],
                 [5407780, 5407885],
                 [5408030, 5408116],
                 [5408202, 5408318],
                 [5408500, 5408592],
                 [5408687, 5408738],
                 [5408902, 5409099],
                 [5409199, 5409380],
                 [5409458, 5409524],
                 [5409622, 5410043]],
 'AT1G15740.1': [[5410907, 5411053],
                 [5411492, 5411724],
                 [5411800, 5411929],
                 [5412012, 5412220],
                 [5412341, 54124

                 [157049, 157182],
                 [157278, 157429],
                 [157508, 157699],
                 [157805, 158182]],
 'AT2G01330.1': [[158182, 158674],
                 [158760, 159174],
                 [159274, 159697],
                 [159770, 159957],
                 [160041, 160203],
                 [160363, 160870]],
 'AT2G01340.1': [[163953, 164187], [164292, 164709], [165011, 165290]],
 'AT2G01350.1': [[165140, 165428],
                 [165511, 165579],
                 [165651, 165828],
                 [165914, 166067],
                 [166199, 166283],
                 [166490, 166637],
                 [166722, 166856],
                 [166934, 167071],
                 [167159, 167259]],
 'AT2G01350.2': [[165140, 165428],
                 [165511, 165579],
                 [165651, 165828],
                 [165914, 166067],
                 [166199, 166283],
                 [166490, 166637],
                 [166722, 167071],
               

                 [797898, 798035],
                 [798147, 798283],
                 [798382, 798720],
                 [799183, 799442]],
 'AT2G02810.1': [[801526, 801746],
                 [801936, 802034],
                 [802131, 802262],
                 [802359, 802390],
                 [802493, 802597],
                 [802672, 802794],
                 [802880, 803521]],
 'AT2G02820.1': [[803655, 805262],
                 [805339, 805482],
                 [805616, 805762],
                 [805850, 806029],
                 [806144, 806330],
                 [806428, 806484],
                 [806560, 806633],
                 [806725, 806782],
                 [806890, 806937],
                 [807031, 807895]],
 'AT2G02820.2': [[803655, 804826],
                 [804913, 805262],
                 [805339, 805482],
                 [805616, 805762],
                 [805850, 806029],
                 [806144, 806330],
                 [806428, 806484],
                 

 'AT2G04880.1': [[1717887, 1718068],
                 [1718480, 1719380],
                 [1719483, 1719639],
                 [1719735, 1720526]],
 'AT2G04880.2': [[1717832, 1718068],
                 [1718480, 1719127],
                 [1719199, 1719380],
                 [1719483, 1719639],
                 [1719735, 1720526]],
 'AT2G04890.1': [[1720360, 1721899], [1722280, 1722604]],
 'AT2G04900.1': [[1722480, 1722590],
                 [1722669, 1722740],
                 [1722824, 1722905],
                 [1722989, 1723033],
                 [1723506, 1723976]],
 'AT2G04900.2': [[1722499, 1722590],
                 [1722669, 1722740],
                 [1722824, 1722905],
                 [1722989, 1723332]],
 'AT2G04910.1': [[1723979, 1724354]],
 'AT2G04910.2': [[1723938, 1723976], [1724077, 1724354]],
 'AT2G04940.1': [[1735962, 1736701],
                 [1736830, 1736937],
                 [1737038, 1737116],
                 [1737316, 1737437],
                 [1737689, 1

                 [5756741, 5757353]],
 'AT2G13800.4': [[5753669, 5753777],
                 [5753933, 5754077],
                 [5754752, 5754821],
                 [5754902, 5754974],
                 [5755052, 5755108],
                 [5755614, 5755722],
                 [5755805, 5756147],
                 [5756256, 5756651],
                 [5756741, 5757239]],
 'AT2G13810.1': [[5768416, 5768559],
                 [5768673, 5768699],
                 [5768798, 5768866],
                 [5768957, 5769074],
                 [5770389, 5770452],
                 [5770657, 5770762],
                 [5770840, 5770891],
                 [5770980, 5771301],
                 [5771377, 5771584],
                 [5771796, 5772266]],
 'AT2G13820.1': [[5774088, 5774449], [5775839, 5775875], [5775960, 5776302]],
 'AT2G13820.2': [[5775119, 5775875], [5775960, 5776289]],
 'AT2G13820.3': [[5774114, 5774449], [5775839, 5775875], [5775960, 5776342]],
 'AT2G13840.1': [[5788444, 5789028],
      

 'AT2G17150.1': [[7466686, 7467887],
                 [7468213, 7469222],
                 [7469301, 7469349],
                 [7469699, 7470694],
                 [7471174, 7471586]],
 'AT2G17150.2': [[7468073, 7469213],
                 [7469301, 7469349],
                 [7469699, 7470694],
                 [7471174, 7471307]],
 'AT2G17150.3': [[7466686, 7469222],
                 [7469301, 7469349],
                 [7469699, 7470694],
                 [7470873, 7471586]],
 'AT2G17150.4': [[7466686, 7467887],
                 [7468213, 7469222],
                 [7469301, 7470694],
                 [7471174, 7471586]],
 'AT2G17150.5': [[7466686, 7469222],
                 [7469301, 7469349],
                 [7469699, 7470694],
                 [7471174, 7471586]],
 'AT2G17150.6': [[7466754, 7467887],
                 [7468213, 7469222],
                 [7469301, 7469349],
                 [7469699, 7470694],
                 [7470873, 7471262]],
 'AT2G17190.1': [[7477844, 74781

                 [8384029, 8384104],
                 [8384201, 8384302],
                 [8384422, 8384629],
                 [8384756, 8384966],
                 [8385442, 8387012]],
 'AT2G19385.1': [[8386932, 8387235],
                 [8387329, 8387404],
                 [8387516, 8387647],
                 [8387854, 8388218],
                 [8388311, 8388419],
                 [8388518, 8388790]],
 'AT2G19390.1': [[8389609, 8390761],
                 [8390851, 8391074],
                 [8391180, 8391238],
                 [8391335, 8391406],
                 [8391605, 8391680],
                 [8391777, 8392224],
                 [8392705, 8392794],
                 [8392876, 8392936],
                 [8393030, 8393128],
                 [8393233, 8393992],
                 [8394155, 8394338],
                 [8394933, 8395229],
                 [8395750, 8396003],
                 [8396079, 8396596],
                 [8397153, 8397250],
                 [8397636, 8397939]]

                 [8902976, 8903098],
                 [8903194, 8903698]],
 'AT2G20650.1': [[8903600, 8903815],
                 [8903906, 8904172],
                 [8904254, 8904583],
                 [8904683, 8904851],
                 [8905036, 8905130],
                 [8905215, 8905275],
                 [8905367, 8905525],
                 [8905635, 8905713],
                 [8905789, 8905856],
                 [8905983, 8906044],
                 [8906139, 8906255],
                 [8906361, 8906489],
                 [8906574, 8906647],
                 [8906888, 8907339]],
 'AT2G20650.2': [[8903342, 8903815],
                 [8903902, 8904172],
                 [8904254, 8904583],
                 [8904683, 8904851],
                 [8905036, 8905130],
                 [8905215, 8905275],
                 [8905367, 8905525],
                 [8905635, 8905713],
                 [8905789, 8905856],
                 [8905983, 8906044],
                 [8906139, 8906255],

 'AT2G21540.5': [[9220555, 9220845],
                 [9220945, 9221017],
                 [9221191, 9221541],
                 [9221742, 9221854],
                 [9222038, 9222128],
                 [9222212, 9222312],
                 [9222397, 9222447],
                 [9222534, 9222636],
                 [9222737, 9222815],
                 [9222916, 9223183],
                 [9223262, 9223609],
                 [9223691, 9223761],
                 [9224336, 9224496]],
 'AT2G21540.6': [[9220555, 9220845],
                 [9220945, 9221017],
                 [9221191, 9221541],
                 [9221742, 9221854],
                 [9221942, 9221960],
                 [9222038, 9222128],
                 [9222212, 9222312],
                 [9222397, 9222447],
                 [9222534, 9222636],
                 [9222737, 9222815],
                 [9222916, 9223183],
                 [9223262, 9223609],
                 [9223691, 9223761],
                 [9223871, 9223946]],

                 [9972058, 9972203],
                 [9972291, 9972411],
                 [9972503, 9972638],
                 [9972731, 9972793],
                 [9972889, 9972965],
                 [9973188, 9973290],
                 [9973424, 9973541],
                 [9973631, 9973690],
                 [9973795, 9973970],
                 [9974053, 9974188],
                 [9974273, 9974372],
                 [9974455, 9974564],
                 [9974640, 9974753],
                 [9974831, 9975207]],
 'AT2G23420.2': [[9971603, 9971967],
                 [9972058, 9972203],
                 [9972291, 9972411],
                 [9972503, 9972638],
                 [9972731, 9972793],
                 [9972889, 9972965],
                 [9973188, 9973290],
                 [9973424, 9973541],
                 [9973631, 9973690],
                 [9973795, 9973970],
                 [9974053, 9974188],
                 [9974273, 9974372],
                 [9974455, 9974564],


                 [10487180, 10487427],
                 [10487505, 10487762]],
 'AT2G24650.3': [[10483267, 10484219],
                 [10484320, 10484464],
                 [10484564, 10484936],
                 [10485026, 10485533],
                 [10485631, 10486081],
                 [10486164, 10486608],
                 [10486832, 10486960],
                 [10487040, 10487115],
                 [10487180, 10487427],
                 [10487505, 10487773]],
 'AT2G24650.4': [[10483277, 10483762],
                 [10483844, 10484219],
                 [10484320, 10484464],
                 [10484564, 10484936],
                 [10485026, 10485533],
                 [10485631, 10486081],
                 [10486164, 10486608],
                 [10486832, 10486960],
                 [10487040, 10487427],
                 [10487505, 10487762]],
 'AT2G24650.5': [[10483277, 10483762],
                 [10483844, 10484219],
                 [10484320, 10484464],
                 [1048

                 [11055861, 11056062]],
 'AT2G25930.1': [[11058943, 11059677],
                 [11060039, 11060836],
                 [11062016, 11062068],
                 [11062158, 11063324]],
 'AT2G25940.1': [[11063300, 11063678],
                 [11063903, 11064107],
                 [11064202, 11064412],
                 [11064526, 11064775],
                 [11064849, 11064935],
                 [11065021, 11065178],
                 [11065257, 11065422],
                 [11065837, 11066184]],
 'AT2G25950.1': [[11068579, 11068818],
                 [11069106, 11069187],
                 [11069356, 11069406],
                 [11069592, 11069670],
                 [11069780, 11069838],
                 [11069924, 11069974],
                 [11070151, 11070260],
                 [11070340, 11070402],
                 [11070481, 11070661]],
 'AT2G25950.2': [[11068560, 11068818],
                 [11069106, 11069187],
                 [11069356, 11069406],
                 [110

 'AT2G27020.1': [[11528287, 11528586],
                 [11528669, 11528760],
                 [11528864, 11528930],
                 [11529317, 11529364],
                 [11529453, 11529538],
                 [11529626, 11529713],
                 [11529970, 11530044],
                 [11530190, 11530274],
                 [11530583, 11530623],
                 [11530754, 11531024]],
 'AT2G27020.2': [[11528229, 11528586],
                 [11528669, 11528760],
                 [11528864, 11528930],
                 [11529317, 11529364],
                 [11529453, 11529538],
                 [11529626, 11529713],
                 [11529970, 11530044],
                 [11530190, 11530274],
                 [11530583, 11530623],
                 [11530754, 11531178]],
 'AT2G27030.1': [[11531966, 11532144], [11532686, 11533284]],
 'AT2G27030.3': [[11531966, 11532144],
                 [11532686, 11533056],
                 [11534076, 11534358]],
 'AT2G27035.1': [[11535517, 11535844],

 'AT2G28380.1': [[12133731, 12134936],
                 [12135052, 12135272],
                 [12135669, 12136241]],
 'AT2G28390.1': [[12139577, 12139931],
                 [12140034, 12140099],
                 [12140189, 12140281],
                 [12140362, 12140638],
                 [12140740, 12140805],
                 [12140885, 12140967],
                 [12141045, 12141137],
                 [12141221, 12141354],
                 [12141720, 12141842],
                 [12141938, 12142040],
                 [12142223, 12142263],
                 [12142353, 12142425],
                 [12142547, 12142627],
                 [12142868, 12143665]],
 'AT2G28410.1': [[12156021, 12156984]],
 'AT2G28420.1': [[12157612, 12158807]],
 'AT2G28426.1': [[12159275, 12159407]],
 'AT2G28430.1': [[12159547, 12159852],
                 [12160162, 12160282],
                 [12160366, 12160466]],
 'AT2G28440.1': [[12160976, 12162032]],
 'AT2G28450.1': [[12162050, 12162426],
                 [

                  [5587376, 5587461]],
 'MSTRG.1365.5': [[5588353, 5588621]],
 'MSTRG.1367.6': [[5593356, 5593528], [5593685, 5593961], [5594096, 5594170]],
 'MSTRG.1369.2': [[5618438, 5618796]],
 'MSTRG.1371.7': [[5620036, 5620558]],
 'MSTRG.1378.2': [[5650742, 5651231]],
 'MSTRG.139.3': [[586022, 586321]],
 'MSTRG.143.2': [[602525, 602754], [602979, 603125], [603600, 604352]],
 'MSTRG.145.1': [[607798, 609569]],
 'MSTRG.147.2': [[614245, 615189]],
 'MSTRG.151.5': [[626762, 626828],
                 [626916, 627004],
                 [627099, 627165],
                 [627307, 627382],
                 [627481, 627614],
                 [627700, 627747],
                 [627834, 627892],
                 [627969, 628019],
                 [628111, 628168],
                 [628245, 628469],
                 [629287, 629321],
                 [629417, 629494],
                 [629579, 629819]],
 'MSTRG.157.1': [[641428, 642257]],
 'MSTRG.162.7': [[666873, 667642]],
 'MSTRG.162.8': [[

 'MSTRG.6532.1': [[11291137, 11291459],
                  [11291530, 11291676],
                  [11291760, 11291903],
                  [11292650, 11292874],
                  [11292951, 11293447]],
 'MSTRG.6533.3': [[11299543, 11302269]],
 'MSTRG.654.2': [[2570471, 2570744]],
 'MSTRG.6547.2': [[11392966, 11393394], [11393465, 11394255]],
 'MSTRG.6549.2': [[11401828, 11402164]],
 'MSTRG.655.2': [[2574684, 2574973]],
 'MSTRG.6550.4': [[11404394, 11404811]],
 'MSTRG.6551.3': [[11419153, 11419563]],
 'MSTRG.6551.4': [[11422436, 11422781]],
 'MSTRG.6562.2': [[11474877, 11475840]],
 'MSTRG.6562.3': [[11476208, 11476510]],
 'MSTRG.6565.2': [[11491854, 11491929],
                  [11492014, 11492100],
                  [11492344, 11492444],
                  [11492568, 11492668],
                  [11492749, 11492842],
                  [11493002, 11493203],
                  [11493286, 11493343],
                  [11493420, 11493525],
                  [11493616, 11493685],
             

In [4]:
#Comments about code from 1.2
'''The large.gtf file was imported using myGTF and converted into a 
dictionary with the name and positions of the transcripts in the DNA sequence. It was then
sorted with the new file name of ts2pos_s.
'''

'The large.gtf file was imported using myGTF and converted into a \ndictionary with the name and positions of the transcripts in the DNA sequence. It was then\nsorted with the new file name of ts2pos_s.\n'

## Step 1.3 Create a Transcript 2 Chrome Dictionary

In [10]:
#Check result of ts2chr, which was created to have a dictionary of transcripts with their chromosome location.
pprint(ts2chr)

{'AT1G01010.1': '1',
 'AT1G01020.1': '1',
 'AT1G01020.2': '1',
 'AT1G01020.3': '1',
 'AT1G01020.4': '1',
 'AT1G01020.5': '1',
 'AT1G01020.6': '1',
 'AT1G01030.1': '1',
 'AT1G01030.2': '1',
 'AT1G01040.1': '1',
 'AT1G01040.2': '1',
 'AT1G01050.1': '1',
 'AT1G01050.2': '1',
 'AT1G01060.1': '1',
 'AT1G01060.2': '1',
 'AT1G01060.3': '1',
 'AT1G01060.4': '1',
 'AT1G01060.5': '1',
 'AT1G01060.6': '1',
 'AT1G01060.7': '1',
 'AT1G01060.8': '1',
 'AT1G01080.1': '1',
 'AT1G01080.2': '1',
 'AT1G01080.3': '1',
 'AT1G01090.1': '1',
 'AT1G01100.1': '1',
 'AT1G01100.2': '1',
 'AT1G01100.3': '1',
 'AT1G01100.4': '1',
 'AT1G01110.1': '1',
 'AT1G01110.2': '1',
 'AT1G01120.1': '1',
 'AT1G01130.1': '1',
 'AT1G01140.1': '1',
 'AT1G01140.2': '1',
 'AT1G01140.3': '1',
 'AT1G01150.1': '1',
 'AT1G01150.2': '1',
 'AT1G01150.3': '1',
 'AT1G01160.1': '1',
 'AT1G01160.2': '1',
 'AT1G01170.1': '1',
 'AT1G01170.2': '1',
 'AT1G01210.1': '1',
 'AT1G01210.2': '1',
 'AT1G01210.3': '1',
 'AT1G01220.1': '1',
 'AT1G01220.2

 'AT1G03935.1': '1',
 'AT1G03940.1': '1',
 'AT1G03950.1': '1',
 'AT1G03960.1': '1',
 'AT1G03960.2': '1',
 'AT1G03960.3': '1',
 'AT1G03960.4': '1',
 'AT1G03960.5': '1',
 'AT1G03960.6': '1',
 'AT1G03960.7': '1',
 'AT1G03970.1': '1',
 'AT1G03980.1': '1',
 'AT1G03980.2': '1',
 'AT1G03980.3': '1',
 'AT1G03980.4': '1',
 'AT1G03990.1': '1',
 'AT1G03993.1': '1',
 'AT1G03997.1': '1',
 'AT1G04000.1': '1',
 'AT1G04007.1': '1',
 'AT1G04010.1': '1',
 'AT1G04013.1': '1',
 'AT1G04017.1': '1',
 'AT1G04020.1': '1',
 'AT1G04020.2': '1',
 'AT1G04023.1': '1',
 'AT1G04030.1': '1',
 'AT1G04030.2': '1',
 'AT1G04040.1': '1',
 'AT1G04043.1': '1',
 'AT1G04047.1': '1',
 'AT1G04050.1': '1',
 'AT1G04050.2': '1',
 'AT1G04050.3': '1',
 'AT1G04053.1': '1',
 'AT1G04057.1': '1',
 'AT1G04067.1': '1',
 'AT1G04070.1': '1',
 'AT1G04073.1': '1',
 'AT1G04077.1': '1',
 'AT1G04080.1': '1',
 'AT1G04080.2': '1',
 'AT1G04080.3': '1',
 'AT1G04080.4': '1',
 'AT1G04080.5': '1',
 'AT1G04083.1': '1',
 'AT1G04090.1': '1',
 'AT1G04097.1

 'AT1G05894.1': '1',
 'AT1G05900.1': '1',
 'AT1G05900.2': '1',
 'AT1G05910.1': '1',
 'AT1G05940.1': '1',
 'AT1G05940.2': '1',
 'AT1G05940.3': '1',
 'AT1G05940.4': '1',
 'AT1G05950.1': '1',
 'AT1G05950.2': '1',
 'AT1G05950.3': '1',
 'AT1G05950.4': '1',
 'AT1G05950.5': '1',
 'AT1G05960.1': '1',
 'AT1G05960.2': '1',
 'AT1G05960.3': '1',
 'AT1G05960.4': '1',
 'AT1G05970.1': '1',
 'AT1G05970.2': '1',
 'AT1G06000.1': '1',
 'AT1G06002.1': '1',
 'AT1G06010.1': '1',
 'AT1G06040.1': '1',
 'AT1G06040.2': '1',
 'AT1G06045.1': '1',
 'AT1G06050.1': '1',
 'AT1G06060.1': '1',
 'AT1G06060.2': '1',
 'AT1G06070.1': '1',
 'AT1G06080.1': '1',
 'AT1G06080.2': '1',
 'AT1G06110.1': '1',
 'AT1G06110.2': '1',
 'AT1G06110.3': '1',
 'AT1G06130.1': '1',
 'AT1G06130.2': '1',
 'AT1G06140.1': '1',
 'AT1G06143.1': '1',
 'AT1G06150.1': '1',
 'AT1G06150.2': '1',
 'AT1G06170.1': '1',
 'AT1G06170.2': '1',
 'AT1G06180.1': '1',
 'AT1G06190.1': '1',
 'AT1G06190.2': '1',
 'AT1G06190.3': '1',
 'AT1G06190.4': '1',
 'AT1G06190.5

 'AT1G08260.1': '1',
 'AT1G08260.2': '1',
 'AT1G08270.1': '1',
 'AT1G08270.2': '1',
 'AT1G08280.1': '1',
 'AT1G08300.1': '1',
 'AT1G08300.2': '1',
 'AT1G08300.3': '1',
 'AT1G08310.1': '1',
 'AT1G08310.2': '1',
 'AT1G08310.3': '1',
 'AT1G08315.1': '1',
 'AT1G08340.1': '1',
 'AT1G08350.1': '1',
 'AT1G08350.2': '1',
 'AT1G08350.3': '1',
 'AT1G08350.4': '1',
 'AT1G08360.1': '1',
 'AT1G08370.1': '1',
 'AT1G08380.1': '1',
 'AT1G08390.1': '1',
 'AT1G08390.2': '1',
 'AT1G08400.1': '1',
 'AT1G08410.1': '1',
 'AT1G08420.1': '1',
 'AT1G08420.2': '1',
 'AT1G08450.1': '1',
 'AT1G08450.2': '1',
 'AT1G08450.3': '1',
 'AT1G08460.1': '1',
 'AT1G08460.2': '1',
 'AT1G08465.1': '1',
 'AT1G08470.1': '1',
 'AT1G08480.1': '1',
 'AT1G08490.1': '1',
 'AT1G08500.1': '1',
 'AT1G08510.1': '1',
 'AT1G08520.1': '1',
 'AT1G08530.1': '1',
 'AT1G08530.2': '1',
 'AT1G08530.3': '1',
 'AT1G08540.1': '1',
 'AT1G08550.1': '1',
 'AT1G08550.2': '1',
 'AT1G08550.3': '1',
 'AT1G08560.1': '1',
 'AT1G08570.1': '1',
 'AT1G08570.2

 'AT1G11300.1': '1',
 'AT1G11300.2': '1',
 'AT1G11303.1': '1',
 'AT1G11310.1': '1',
 'AT1G11310.2': '1',
 'AT1G11310.3': '1',
 'AT1G11320.1': '1',
 'AT1G11330.1': '1',
 'AT1G11330.2': '1',
 'AT1G11330.3': '1',
 'AT1G11350.1': '1',
 'AT1G11360.1': '1',
 'AT1G11360.2': '1',
 'AT1G11360.3': '1',
 'AT1G11360.4': '1',
 'AT1G11380.1': '1',
 'AT1G11390.1': '1',
 'AT1G11400.1': '1',
 'AT1G11400.2': '1',
 'AT1G11400.3': '1',
 'AT1G11400.4': '1',
 'AT1G11400.5': '1',
 'AT1G11400.6': '1',
 'AT1G11410.1': '1',
 'AT1G11410.2': '1',
 'AT1G11410.3': '1',
 'AT1G11410.4': '1',
 'AT1G11420.1': '1',
 'AT1G11430.1': '1',
 'AT1G11440.1': '1',
 'AT1G11450.1': '1',
 'AT1G11450.2': '1',
 'AT1G11450.3': '1',
 'AT1G11450.4': '1',
 'AT1G11450.5': '1',
 'AT1G11475.1': '1',
 'AT1G11480.1': '1',
 'AT1G11480.2': '1',
 'AT1G11530.1': '1',
 'AT1G11540.1': '1',
 'AT1G11540.2': '1',
 'AT1G11545.1': '1',
 'AT1G11580.1': '1',
 'AT1G11580.2': '1',
 'AT1G11591.1': '1',
 'AT1G11592.1': '1',
 'AT1G11592.2': '1',
 'AT1G11593.1

 'AT1G16490.1': '1',
 'AT1G16490.2': '1',
 'AT1G16500.1': '1',
 'AT1G16515.1': '1',
 'AT1G16515.2': '1',
 'AT1G16520.1': '1',
 'AT1G16530.1': '1',
 'AT1G16540.1': '1',
 'AT1G16540.2': '1',
 'AT1G16540.3': '1',
 'AT1G16560.1': '1',
 'AT1G16560.2': '1',
 'AT1G16560.3': '1',
 'AT1G16560.4': '1',
 'AT1G16560.5': '1',
 'AT1G16560.6': '1',
 'AT1G16560.7': '1',
 'AT1G16560.8': '1',
 'AT1G16560.9': '1',
 'AT1G16570.1': '1',
 'AT1G16570.2': '1',
 'AT1G16590.1': '1',
 'AT2G01008.1': '2',
 'AT2G01010.1': '2',
 'AT2G01020.1': '2',
 'AT2G01021.1': '2',
 'AT2G01023.1': '2',
 'AT2G01060.1': '2',
 'AT2G01060.2': '2',
 'AT2G01070.1': '2',
 'AT2G01070.2': '2',
 'AT2G01080.1': '2',
 'AT2G01090.1': '2',
 'AT2G01090.2': '2',
 'AT2G01090.3': '2',
 'AT2G01090.4': '2',
 'AT2G01100.1': '2',
 'AT2G01100.2': '2',
 'AT2G01100.3': '2',
 'AT2G01110.1': '2',
 'AT2G01120.1': '2',
 'AT2G01120.2': '2',
 'AT2G01130.1': '2',
 'AT2G01130.2': '2',
 'AT2G01140.1': '2',
 'AT2G01150.1': '2',
 'AT2G01150.2': '2',
 'AT2G01150.3

 'AT2G04560.1': '2',
 'AT2G04620.1': '2',
 'AT2G04620.2': '2',
 'AT2G04621.1': '2',
 'AT2G04630.1': '2',
 'AT2G04650.1': '2',
 'AT2G04650.2': '2',
 'AT2G04660.1': '2',
 'AT2G04690.1': '2',
 'AT2G04690.2': '2',
 'AT2G04690.3': '2',
 'AT2G04690.4': '2',
 'AT2G04690.5': '2',
 'AT2G04690.6': '2',
 'AT2G04700.1': '2',
 'AT2G04700.2': '2',
 'AT2G04700.3': '2',
 'AT2G04740.1': '2',
 'AT2G04740.2': '2',
 'AT2G04740.3': '2',
 'AT2G04740.4': '2',
 'AT2G04750.1': '2',
 'AT2G04755.1': '2',
 'AT2G04780.1': '2',
 'AT2G04780.2': '2',
 'AT2G04790.1': '2',
 'AT2G04790.10': '2',
 'AT2G04790.11': '2',
 'AT2G04790.12': '2',
 'AT2G04790.13': '2',
 'AT2G04790.14': '2',
 'AT2G04790.15': '2',
 'AT2G04790.16': '2',
 'AT2G04790.2': '2',
 'AT2G04790.3': '2',
 'AT2G04790.4': '2',
 'AT2G04790.5': '2',
 'AT2G04790.6': '2',
 'AT2G04790.7': '2',
 'AT2G04790.8': '2',
 'AT2G04790.9': '2',
 'AT2G04805.1': '2',
 'AT2G04840.1': '2',
 'AT2G04842.1': '2',
 'AT2G04842.2': '2',
 'AT2G04845.1': '2',
 'AT2G04845.2': '2',
 'AT2G

 'AT2G16280.1': '2',
 'AT2G16360.1': '2',
 'AT2G16365.1': '2',
 'AT2G16365.2': '2',
 'AT2G16365.3': '2',
 'AT2G16365.4': '2',
 'AT2G16365.5': '2',
 'AT2G16365.6': '2',
 'AT2G16370.1': '2',
 'AT2G16370.2': '2',
 'AT2G16370.3': '2',
 'AT2G16390.1': '2',
 'AT2G16390.2': '2',
 'AT2G16400.1': '2',
 'AT2G16405.1': '2',
 'AT2G16430.1': '2',
 'AT2G16430.2': '2',
 'AT2G16440.1': '2',
 'AT2G16460.1': '2',
 'AT2G16460.2': '2',
 'AT2G16485.1': '2',
 'AT2G16485.2': '2',
 'AT2G16500.1': '2',
 'AT2G16510.1': '2',
 'AT2G16530.1': '2',
 'AT2G16530.2': '2',
 'AT2G16530.3': '2',
 'AT2G16530.4': '2',
 'AT2G16530.5': '2',
 'AT2G16570.1': '2',
 'AT2G16575.1': '2',
 'AT2G16580.1': '2',
 'AT2G16586.1': '2',
 'AT2G16595.1': '2',
 'AT2G16600.1': '2',
 'AT2G16600.2': '2',
 'AT2G16630.1': '2',
 'AT2G16640.1': '2',
 'AT2G16640.2': '2',
 'AT2G16640.3': '2',
 'AT2G16650.1': '2',
 'AT2G16650.2': '2',
 'AT2G16660.1': '2',
 'AT2G16660.2': '2',
 'AT2G16700.1': '2',
 'AT2G16700.2': '2',
 'AT2G16700.3': '2',
 'AT2G16700.4

 'AT2G19640.1': '2',
 'AT2G19640.2': '2',
 'AT2G19660.1': '2',
 'AT2G19660.2': '2',
 'AT2G19670.1': '2',
 'AT2G19680.1': '2',
 'AT2G19680.2': '2',
 'AT2G19690.1': '2',
 'AT2G19690.2': '2',
 'AT2G19690.3': '2',
 'AT2G19710.1': '2',
 'AT2G19720.1': '2',
 'AT2G19730.1': '2',
 'AT2G19730.2': '2',
 'AT2G19730.3': '2',
 'AT2G19740.1': '2',
 'AT2G19750.1': '2',
 'AT2G19760.1': '2',
 'AT2G19780.1': '2',
 'AT2G19790.1': '2',
 'AT2G19796.1': '2',
 'AT2G19800.1': '2',
 'AT2G19802.1': '2',
 'AT2G19810.1': '2',
 'AT2G19830.1': '2',
 'AT2G19830.2': '2',
 'AT2G19860.1': '2',
 'AT2G19860.2': '2',
 'AT2G19870.1': '2',
 'AT2G19880.1': '2',
 'AT2G19880.2': '2',
 'AT2G19910.1': '2',
 'AT2G19910.2': '2',
 'AT2G19920.1': '2',
 'AT2G19920.2': '2',
 'AT2G19920.3': '2',
 'AT2G19920.4': '2',
 'AT2G19930.1': '2',
 'AT2G19930.2': '2',
 'AT2G19930.3': '2',
 'AT2G19940.1': '2',
 'AT2G19940.2': '2',
 'AT2G19940.3': '2',
 'AT2G19950.1': '2',
 'AT2G19950.2': '2',
 'AT2G19970.1': '2',
 'AT2G20000.1': '2',
 'AT2G20000.2

 'AT2G23820.2': '2',
 'AT2G23840.1': '2',
 'AT2G23890.1': '2',
 'AT2G23890.2': '2',
 'AT2G23890.3': '2',
 'AT2G23900.1': '2',
 'AT2G23910.1': '2',
 'AT2G23910.2': '2',
 'AT2G23910.3': '2',
 'AT2G23910.4': '2',
 'AT2G23930.1': '2',
 'AT2G23930.2': '2',
 'AT2G23940.1': '2',
 'AT2G23940.2': '2',
 'AT2G23940.3': '2',
 'AT2G23945.1': '2',
 'AT2G23950.1': '2',
 'AT2G23950.2': '2',
 'AT2G23980.1': '2',
 'AT2G23980.10': '2',
 'AT2G23980.2': '2',
 'AT2G23980.3': '2',
 'AT2G23980.4': '2',
 'AT2G23980.5': '2',
 'AT2G23980.6': '2',
 'AT2G23980.7': '2',
 'AT2G23980.8': '2',
 'AT2G23980.9': '2',
 'AT2G23985.1': '2',
 'AT2G23985.10': '2',
 'AT2G23985.11': '2',
 'AT2G23985.12': '2',
 'AT2G23985.13': '2',
 'AT2G23985.14': '2',
 'AT2G23985.15': '2',
 'AT2G23985.16': '2',
 'AT2G23985.17': '2',
 'AT2G23985.18': '2',
 'AT2G23985.19': '2',
 'AT2G23985.2': '2',
 'AT2G23985.20': '2',
 'AT2G23985.21': '2',
 'AT2G23985.22': '2',
 'AT2G23985.23': '2',
 'AT2G23985.24': '2',
 'AT2G23985.25': '2',
 'AT2G23985.26': 

 'AT2G29628.1': '2',
 'AT2G29630.1': '2',
 'AT2G29630.2': '2',
 'AT2G29630.3': '2',
 'AT2G29630.4': '2',
 'AT2G29640.1': '2',
 'AT2G29650.1': '2',
 'AT2G29650.2': '2',
 'AT2G29650.3': '2',
 'AT2G29650.4': '2',
 'AT2G29660.1': '2',
 'AT2G29670.1': '2',
 'AT2G29670.2': '2',
 'AT2G29679.1': '2',
 'AT2G29680.1': '2',
 'AT2G29680.2': '2',
 'AT2G29690.1': '2',
 'AT2G29690.2': '2',
 'AT2G29700.1': '2',
 'AT2G29760.1': '2',
 'AT2G29770.1': '2',
 'MSTRG.100.4': '1',
 'MSTRG.100.5': '1',
 'MSTRG.1000.4': '1',
 'MSTRG.1000.5': '1',
 'MSTRG.101.4': '1',
 'MSTRG.1010.2': '1',
 'MSTRG.1019.3': '1',
 'MSTRG.1021.2': '1',
 'MSTRG.1022.4': '1',
 'MSTRG.1029.1': '1',
 'MSTRG.1030.3': '1',
 'MSTRG.1031.2': '1',
 'MSTRG.1034.1': '1',
 'MSTRG.1037.3': '1',
 'MSTRG.1039.2': '1',
 'MSTRG.1040.3': '1',
 'MSTRG.1045.4': '1',
 'MSTRG.105.2': '1',
 'MSTRG.1056.3': '1',
 'MSTRG.1059.3': '1',
 'MSTRG.106.1': '1',
 'MSTRG.106.6': '1',
 'MSTRG.1062.1': '1',
 'MSTRG.1063.2': '1',
 'MSTRG.1063.3': '1',
 'MSTRG.1067.1'

 'MSTRG.5876.2': '2',
 'MSTRG.5876.3': '2',
 'MSTRG.5877.3': '2',
 'MSTRG.5882.2': '2',
 'MSTRG.5882.3': '2',
 'MSTRG.5887.4': '2',
 'MSTRG.5897.2': '2',
 'MSTRG.59.3': '1',
 'MSTRG.5905.2': '2',
 'MSTRG.5911.2': '2',
 'MSTRG.5917.3': '2',
 'MSTRG.5938.3': '2',
 'MSTRG.5945.4': '2',
 'MSTRG.5953.4': '2',
 'MSTRG.5960.3': '2',
 'MSTRG.5964.1': '2',
 'MSTRG.5965.1': '2',
 'MSTRG.5968.1': '2',
 'MSTRG.5976.2': '2',
 'MSTRG.5980.2': '2',
 'MSTRG.5982.2': '2',
 'MSTRG.5984.2': '2',
 'MSTRG.599.6': '1',
 'MSTRG.5998.2': '2',
 'MSTRG.60.2': '1',
 'MSTRG.6000.1': '2',
 'MSTRG.6000.4': '2',
 'MSTRG.6009.2': '2',
 'MSTRG.6009.5': '2',
 'MSTRG.6011.2': '2',
 'MSTRG.6023.3': '2',
 'MSTRG.6024.3': '2',
 'MSTRG.6027.3': '2',
 'MSTRG.6029.3': '2',
 'MSTRG.603.2': '1',
 'MSTRG.6030.3': '2',
 'MSTRG.6033.2': '2',
 'MSTRG.6036.3': '2',
 'MSTRG.6037.2': '2',
 'MSTRG.6047.2': '2',
 'MSTRG.6051.3': '2',
 'MSTRG.6052.2': '2',
 'MSTRG.6058.3': '2',
 'MSTRG.6062.2': '2',
 'MSTRG.6065.2': '2',
 'MSTRG.6066.5':

In [21]:
#Comments about code from 1.3
'''
Transcript names listed with what chromosome that they are located on in a dictionary.
'''

'\nTranscript names listed with what chromosome that they are located on in a dictionary.\n'

## Step 1.4 Use a “for” loop to work on each gene

In [11]:
ts2seq= {} #initialize empty dictionary

#for loop to create transcript to sequence dictionary (transcript=key, sequence=value)
for eachts in ts2exon_s:
    fullseq=""
    chrname=ts2chr[eachts]
    chrseq=chr2seq[chrname]
    for es,ee in ts2exon_s[eachts]:
        fullseq=fullseq+chrseq[es:ee] #stitch together exons
    ts2seq[eachts]=fullseq
print(ts2seq)

{'AT1G01010.1': Seq('AAATTATTAGATATACCAAACCAGAGAAAACAAATACATAATCGGAGAAATACA...AAG', SingleLetterAlphabet()), 'MSTRG.2.1': Seq('TTAGTAAGGTCTAATTCAATTTTTGGTGGCGATAATATTTGGCTTAGTCATAAA...TTC', SingleLetterAlphabet()), 'AT1G01020.1': Seq('TTAGTAAGGTCTAATTCAATTTTTGGTGGCGATAATATTTGGCTTAGTCATAAA...ATT', SingleLetterAlphabet()), 'AT1G01020.5': Seq('TTAGTAAGGTCTAATTCAATTTTTGGTGGCGATAATATTTGGCTTAGTCATAAA...ATT', SingleLetterAlphabet()), 'AT1G01020.3': Seq('TTAGTAAGGTCTAATTCAATTTTTGGTGGCGATAATATTTGGCTTAGTCATAAA...ATT', SingleLetterAlphabet()), 'AT1G01020.6': Seq('TTAGTAAGGTCTAATTCAATTTTTGGTGGCGATAATATTTGGCTTAGTCATAAA...TCT', SingleLetterAlphabet()), 'AT1G01020.4': Seq('TTAGTAAGGTCTAATTCAATTTTTGGTGGCGATAATATTTGGCTTAGTCATAAA...ATT', SingleLetterAlphabet()), 'AT1G01020.2': Seq('TTAGTAAGGTCTAATTCAATTTTTGGTGGCGATAATATTTGGCTTAGTCATAAA...TCT', SingleLetterAlphabet()), 'AT1G01030.1': Seq('TTATATACAAAATTGAAAAGATGCGAGTTTCAACATGGTGACAAAAGCCTAATG...TAT', SingleLetterAlphabet()), 'AT1G01030.2': Seq('TTATATACA

### 1.4 Explanation
'''
A transcript to sequence dictionary was created using the ts2chr dictionary, the ts2exon_s sorted dictionary,
and the chr2seq dictionary to pull out the transcript names, which ones were on which chromosome, and where the exons for 
each started and ended on the chromosome. The exons were then stiched together so the sequence of each transcript was matched
with the transcript name.
'''

## Step 1.5 Save results to FASTA file

In [12]:
# Convert values of the dictionary to Biopython Seq objects
allseqrecord=[]
for seq in ts2seq: #loop to convert all lines
   
    #import biopython tools
    from Bio import SeqIO
    from Bio.Alphabet import IUPAC
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq 
    
    #convert dictionary to biopython seq objects
    des = ""
    trans2seque={}
    seqrecords = SeqRecord(Seq(str(ts2seq[seq]),IUPAC.unambiguous_dna), id=seq, description=des)
    trans2seque[seq]=seqrecords
    allseqrecord.append(seqrecords)

#write dictionary out to fasta file called Step1.out.fasta
SeqIO.write(allseqrecord, "Step1.out.fasta", "fasta")

6340

### 1.5 Explanation
'''
The ts2exon dictionary was converted to a bioseq object so that it could then be written out as a fasta file with each transcript name and its sequence.\
'''

# Step 2. Six frame translation

## Step 2.1 Load fasta file with Biopython

In [13]:
# Import and parse the fasta file from Step 1
from Bio import SeqIO
#ts2seq = {}
rc = {}
allrecords = []
for seq_record in SeqIO.parse("Step1.out.fasta", "fasta"):
    # ts2seq = {} the disctionary has been constructed in Step 1, containing all transcripts (key: transcript name; value: sequence)
    # ts2seq[seq_record.id] = seq_record.seq
    # Construc a dictionary containing all the reverse complementary sequences (key: transcript name; value: reverse complementary sequence)
    rc[seq_record.id] = seq_record.seq.reverse_complement()
    allrecords.append(seq_record)

## Step 2.2 Six frame translation

In [14]:
# Define the function of translation
def proteinseq(sequence):
    ncodon = len(sequence) // 3
    pos = list(range(ncodon))
    pseq = ''
    gencode = {
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', 
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', 
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', 
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',                  
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', 
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R', 
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A', 
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', 
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L', 
        'TAC':'Y', 'TAT':'Y', 'TAA':'X', 'TAG':'X', 
        'TGC':'C', 'TGT':'C', 'TGA':'X', 'TGG':'W'} 
    for i in pos:
        codon = sequence[3 * i:3 * i + 3]
        if codon in gencode:
            aa = gencode[codon]
        else:
            aa = '?'
        pseq = pseq + aa
    return pseq

In [15]:
allseqrecord=[]
for seq in ts2seq:
    # Convert Biopython Seq objects to string variables
    myseq = str(ts2seq[seq])
    myseqrv = str(rc[seq])
    # Get six frames
    rf1 = myseq[0:]
    rf2 = myseq[1:]
    rf3 = myseq[2:]
    rf4 = myseqrv[0:]
    rf5 = myseqrv[1:]
    rf6 = myseqrv[2:]
    
    # Translate for each frame
    PEP1 = proteinseq(rf1)
    PEP2 = proteinseq(rf2)
    PEP3 = proteinseq(rf3)
    PEP4 = proteinseq(rf4)
    PEP5 = proteinseq(rf5)
    PEP6 = proteinseq(rf6)
    
    # Look for ORFs in six frames
    import re
    ORF = re.compile('M[^X]*X')
    
    # Find the longest ORF in frame 1
    ORF1 = {}
    o1 = ORF.findall(PEP1)
    if o1:
        ORF1[seq] = max(o1, key=len)
    else:
        ORF1[seq] = 'No'
    
    # Find the longest ORF in frame 2
    ORF2 = {}
    o2 = ORF.findall(PEP2)
    if o2:
        ORF2[seq] = max(o2, key=len)
    else:
        ORF2[seq] = 'No'
    
    # Find the longest ORF in frame 3
    ORF3 = {}
    o3 = ORF.findall(PEP3)
    if o3:
        ORF3[seq] = max(o3, key=len)
    else:
        ORF3[seq] = 'No'
    
    # Find the longest ORF in frame 4
    ORF4 = {}
    o4 = ORF.findall(PEP4)
    if o4:
        ORF4[seq] = max(o4, key=len)
    else:
        ORF4[seq] = 'No'
    
    # Find the longest ORF in frame 5
    ORF5 = {}
    o5 = ORF.findall(PEP5)
    if o5:
        ORF5[seq] = max(o5, key=len)
    else:
        ORF5[seq] = 'No'
    
    # Find the longest ORF in frame 6
    ORF6 = {}
    o6 = ORF.findall(PEP6)
    if o6:
        ORF6[seq] = max(o6, key=len)
    else:
        ORF6[seq] = 'No'
    
    # Combine all the longest ORFs from six frames into one dictionary (key: transcript name; value: six longest ORFs from six frames each). 
    sixORF = {}
    for key in (ORF1.keys() | ORF2.keys()):
        if key in ORF1: sixORF.setdefault(key, []).append(ORF1[key])
        if key in ORF2: sixORF.setdefault(key, []).append(ORF2[key])
        if key in ORF3: sixORF.setdefault(key, []).append(ORF3[key])
        if key in ORF4: sixORF.setdefault(key, []).append(ORF4[key])
        if key in ORF5: sixORF.setdefault(key, []).append(ORF5[key])
        if key in ORF6: sixORF.setdefault(key, []).append(ORF6[key])
    
    # Find the longest ORF among six longest ORFs and write into the dictionary (key: transcript name; value: the longest ORF)
    longestORF = {}
    sixseq = sixORF[seq]
    longestORF[seq] = max(sixseq, key=len)
    length = len(longestORF[seq])
    des = "length = " + str(length)

    # Convert values of the dictionary constructing in the last step to Biopython Seq objects
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet import IUPAC
    
    gene2pipetide = {}
    seqrecords = SeqRecord(Seq(longestORF[seq], IUPAC.protein), id=seq, description=des)
    gene2pipetide[seq] = seqrecords
    allseqrecord.append(seqrecords)

# Write the fasta file
SeqIO.write(allseqrecord, "Step2.out.fasta", "fasta")

6340

# Step 3. Write lncRNA output

In [16]:
import Bio
import csv
#import Biopython package

In [17]:
from Bio import SeqIO  #import SeqIO
chr2seq = {}
allrecords = []


with open('Step3.out.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)    
    for seq_record in SeqIO.parse("Step2.out.fasta", "fasta"): 
        chr2seq[seq_record.id] = seq_record.seq 
        allrecords.append(seq_record)
        if 100 > len(seq_record):
            writer.writerow([seq_record.id,len(seq_record), 'Yes'])
            print(seq_record.id,len(seq_record), 'Yes')
        elif 100 < len(seq_record):
            writer.writerow([seq_record.id,len(seq_record), 'No'])
            print(seq_record.id,len(seq_record), 'No')
#IncRNA can be filtered by finding sequences whose length are less than 100, and showned as 'Yes'

AT1G01010.1 430 No
MSTRG.2.1 95 Yes
AT1G01020.1 246 No
AT1G01020.5 199 No
AT1G01020.3 237 No
AT1G01020.6 105 No
AT1G01020.4 237 No
AT1G01020.2 192 No
AT1G01030.1 359 No
AT1G01030.2 336 No
AT1G01040.1 1910 No
AT1G01040.2 1911 No
AT1G03993.1 53 Yes
at1g01046 23 Yes
AT1G01050.1 213 No
AT1G01050.2 213 No
AT1G03997.1 40 Yes
AT1G01060.6 644 No
AT1G01060.3 646 No
AT1G01060.1 646 No
AT1G01060.7 646 No
AT1G01060.4 645 No
AT1G01060.2 646 No
AT1G01060.8 646 No
AT1G01060.5 644 No
AT1G01080.1 294 No
AT1G01080.3 287 No
AT1G01080.2 295 No
AT1G01090.1 429 No
AT1G01100.2 113 No
AT1G01100.3 97 Yes
AT1G01100.1 113 No
AT1G01100.4 113 No
MSTRG.12.5 68 Yes
AT1G01110.2 528 No
AT1G01110.1 365 No
AT1G01120.1 529 No
AT1G01130.1 125 No
AT1G01140.1 448 No
AT1G01140.2 450 No
AT1G01140.3 452 No
AT1G01150.1 346 No
AT1G01150.2 310 No
AT1G01150.3 293 No
AT1G01160.1 196 No
AT1G01160.2 230 No
AT1G04007.1 76 Yes
AT1G01170.1 84 Yes
AT1G01170.2 84 Yes
AT1G04013.1 29 Yes
AT1G01210.2 107 No
AT1G01210.3 107 No
AT1G01210.1 107

AT1G03920.3 570 No
MSTRG.251.4 357 No
AT1G03930.1 472 No
AT1G04217.1 2 Yes
AT1G03935.1 27 Yes
AT1G04223.1 4 Yes
AT1G03940.1 470 No
AT1G04227.1 15 Yes
AT1G03950.1 211 No
AT1G03960.2 390 No
AT1G03960.1 530 No
AT1G03960.3 390 No
AT1G03960.4 390 No
AT1G03960.5 501 No
AT1G03960.6 468 No
AT1G03960.7 329 No
AT1G03970.1 271 No
AT1G04233.1 20 Yes
AT1G03980.2 475 No
AT1G03980.1 368 No
AT1G03980.3 453 No
AT1G03980.4 378 No
AT1G03990.1 759 No
AT1G04000.1 153 No
AT1G04010.1 634 No
AT1G04020.1 714 No
AT1G04020.2 714 No
MSTRG.266.3 54 Yes
AT1G04030.1 435 No
MSTRG.267.2 435 No
AT1G04030.2 435 No
AT1G04040.1 272 No
AT1G04237.1 64 Yes
AT1G04050.2 735 No
AT1G04050.3 631 No
AT1G04050.1 735 No
AT1G04070.1 95 Yes
AT1G04080.3 824 No
AT1G04080.1 769 No
AT1G04080.4 769 No
AT1G04080.5 550 No
AT1G04080.2 583 No
AT1G04090.1 573 No
AT1G04100.1 262 No
AT1G04105.1 104 No
AT1G04110.1 776 No
AT1G04120.1 1515 No
AT1G04120.2 1510 No
AT1G04120.3 1480 No
MSTRG.277.4 821 No
MSTRG.277.5 66 Yes
AT1G04130.1 361 No
AT1G04130.2

AT1G06230.3 767 No
AT1G06230.4 767 No
AT1G06230.2 767 No
AT1G06230.1 767 No
AT1G06240.1 384 No
AT1G06250.2 310 No
AT1G06250.1 424 No
AT1G06265.1 115 No
AT1G06265.2 89 Yes
AT1G06265.3 20 Yes
AT1G06260.1 344 No
AT1G06270.1 344 No
AT1G06290.1 676 No
MSTRG.476.2 604 No
AT1G06310.1 676 No
AT1G06320.1 196 No
AT1G06330.1 151 No
AT1G06360.2 316 No
AT1G06360.1 300 No
MSTRG.480.3 142 No
MSTRG.481.1 45 Yes
MSTRG.482.1 76 Yes
AT1G06380.1 255 No
AT1G06390.1 408 No
AT1G06390.2 408 No
MSTRG.484.3 30 Yes
AT1G06400.1 217 No
AT1G06410.1 852 No
AT1G06410.2 852 No
AT1G06410.3 852 No
MSTRG.486.4 672 No
AT1G06420.1 222 No
AT1G06420.2 199 No
AT1G06430.2 686 No
AT1G06430.3 686 No
AT1G06430.1 686 No
MSTRG.488.4 72 Yes
AT1G06440.1 391 No
AT1G06450.1 361 No
AT1G04463.1 68 Yes
AT1G06460.1 286 No
AT1G04467.1 35 Yes
AT1G06470.2 415 No
AT1G06470.1 415 No
AT1G06470.3 415 No
AT1G06470.4 415 No
AT1G06475.1 94 Yes
AT1G06490.2 1934 No
AT1G06490.1 1959 No
AT1G06500.4 137 No
AT1G06500.1 137 No
AT1G06500.2 137 No
AT1G06500.

AT1G08590.1 1030 No
AT1G08592.1 105 No
AT1G08592.2 45 Yes
AT1G08600.3 1480 No
AT1G08600.4 1480 No
AT1G08600.2 1480 No
AT1G08600.1 1459 No
AT1G08610.1 560 No
MSTRG.690.2 560 No
AT1G04657.1 51 Yes
AT1G08620.2 1210 No
MSTRG.692.2 77 Yes
AT1G08620.3 1224 No
AT1G08620.4 1210 No
AT1G08620.1 1210 No
AT1G08630.3 359 No
AT1G08630.2 359 No
AT1G08630.1 359 No
AT1G08630.4 359 No
AT1G08630.5 345 No
AT1G08630.7 340 No
AT1G08630.6 359 No
MSTRG.693.8 41 Yes
AT1G08640.1 295 No
MSTRG.694.2 39 Yes
AT1G08645.1 143 No
AT1G08650.1 285 No
AT1G08650.2 277 No
AT1G08660.2 348 No
AT1G08660.1 475 No
AT1G08660.3 297 No
AT1G08680.4 652 No
AT1G08680.1 650 No
AT1G08680.2 649 No
AT1G08680.5 581 No
AT1G08680.3 608 No
AT1G08695.1 89 Yes
AT1G08700.1 454 No
AT1G08710.1 275 No
AT1G08710.2 296 No
AT1G08720.1 934 No
AT1G08720.2 907 No
AT1G08750.1 389 No
AT1G08750.3 389 No
AT1G08750.2 389 No
AT1G08760.1 749 No
AT1G08770.1 210 No
AT1G08780.1 130 No
AT1G08800.1 1114 No
AT1G08800.2 1114 No
AT1G08800.3 1114 No
AT1G08800.4 1114 No

AT1G11280.3 809 No
AT1G11280.4 819 No
MSTRG.935.6 69 Yes
AT1G11290.1 810 No
AT1G11300.1 821 No
AT1G11300.2 806 No
AT1G11303.1 821 No
AT1G11310.1 574 No
AT1G11310.3 512 No
AT1G11310.2 419 No
MSTRG.939.4 49 Yes
AT1G11320.1 425 No
AT1G11330.2 843 No
AT1G11330.3 749 No
AT1G11330.1 841 No
MSTRG.941.4 59 Yes
AT1G11350.1 831 No
AT1G11360.1 243 No
AT1G11360.4 243 No
AT1G11360.3 243 No
AT1G11360.2 243 No
AT1G11380.1 255 No
AT1G11390.1 625 No
AT1G11400.2 205 No
AT1G11400.4 205 No
AT1G11400.5 205 No
AT1G11400.6 205 No
AT1G11400.1 205 No
AT1G11400.3 205 No
AT1G11410.2 731 No
AT1G11410.3 822 No
AT1G11410.4 853 No
AT1G11410.1 846 No
AT1G04847.1 47 Yes
AT1G11420.1 605 No
AT1G11430.1 233 No
AT1G11440.1 364 No
AT1G11450.3 353 No
AT1G11450.4 320 No
AT1G11450.1 261 No
AT1G11450.5 309 No
AT1G11450.2 302 No
AT1G11475.1 72 Yes
AT1G11480.1 579 No
AT1G11480.2 566 No
AT1G11530.1 119 No
AT1G04853.1 33 Yes
AT1G11540.1 368 No
AT1G11540.2 451 No
AT1G11545.1 306 No
AT1G11580.2 579 No
AT1G11580.1 558 No
AT1G11591.1 

AT1G14690.1 604 No
AT1G14690.2 604 No
AT1G14710.1 602 No
AT1G14710.2 602 No
MSTRG.1222.3 61 Yes
MSTRG.1222.4 33 Yes
AT1G14720.1 333 No
MSTRG.1223.2 152 No
AT1G14740.1 734 No
AT1G14750.3 620 No
AT1G14750.1 579 No
AT1G14750.2 411 No
AT1G14750.4 410 No
AT1G14755.1 79 Yes
AT1G14760.2 139 No
AT1G14760.1 143 No
AT1G14760.3 97 Yes
AT1G14770.1 430 No
MSTRG.1228.2 274 No
AT1G14770.2 430 No
AT1G14780.1 628 No
AT1G14790.1 1108 No
AT1G14800.1 385 No
AT1G14810.1 376 No
AT1G14810.2 307 No
AT1G05123.1 28 Yes
AT1G14820.3 253 No
AT1G14820.2 240 No
AT1G14820.1 240 No
AT1G14830.1 615 No
MSTRG.1235.2 85 Yes
AT1G14840.2 590 No
AT1G14840.1 605 No
MSTRG.1235.5 62 Yes
AT1G14850.1 1465 No
AT1G14890.1 220 No
AT1G14900.1 205 No
MSTRG.1238.2 177 No
AT1G14910.1 693 No
AT1G14910.2 688 No
MSTRG.1239.3 98 Yes
AT1G14920.1 534 No
AT1G14970.1 563 No
AT1G14970.2 443 No
AT1G14970.3 417 No
MSTRG.1241.4 30 Yes
AT1G14980.1 99 Yes
AT1G14980.2 83 Yes
AT1G14990.1 123 No
AT1G14990.2 133 No
MSTRG.1242.5 47 Yes
AT1G15000.1 445 No


AT2G02730.3 277 No
AT2G02740.1 269 No
AT2G02750.1 614 No
at2g02741 26 Yes
AT2G04295.1 203 No
AT2G02760.1 153 No
AT2G02760.2 153 No
AT2G02760.3 153 No
AT2G02760.4 153 No
AT2G02765.2 218 No
AT2G02765.1 239 No
AT2G02765.4 180 No
AT2G02765.3 239 No
MSTRG.5397.5 55 Yes
AT2G02780.1 743 No
AT2G02780.2 629 No
AT2G02780.3 680 No
MSTRG.5398.4 79 Yes
AT2G02790.2 598 No
AT2G02790.3 598 No
AT2G02790.1 637 No
AT2G02800.2 427 No
AT2G02800.1 427 No
AT2G02795.1 141 No
AT2G02810.1 333 No
MSTRG.5402.2 141 No
AT2G02820.1 456 No
AT2G02820.2 485 No
AT2G02820.3 334 No
MSTRG.5403.4 101 No
AT2G02850.1 130 No
AT2G02860.1 595 No
AT2G02860.2 465 No
AT2G02870.3 468 No
AT2G02870.1 468 No
AT2G02870.2 468 No
AT2G02880.1 315 No
AT2G02910.2 461 No
AT2G02910.1 461 No
AT2G04375.1 32 Yes
AT2G04405.1 27 Yes
AT2G02930.1 213 No
AT2G04435.1 28 Yes
AT2G04455.1 14 Yes
AT2G02955.1 667 No
AT2G02960.6 272 No
AT2G02960.1 272 No
AT2G02960.3 272 No
AT2G02960.4 272 No
AT2G02960.5 276 No
AT2G02960.2 272 No
AT2G02970.1 556 No
MSTRG.5414

AT2G14750.1 277 No
AT2G14775.1 270 No
AT2G14800.1 55 Yes
AT2G14810.1 67 Yes
AT2G14820.1 635 No
AT2G14820.2 635 No
AT2G14825.1 147 No
MSTRG.5718.2 29 Yes
AT2G14835.2 344 No
AT2G14835.1 344 No
AT2G14850.1 292 No
AT2G14860.1 253 No
AT2G14878.1 24 Yes
AT2G14880.1 142 No
AT2G14880.2 106 No
AT2G14880.3 106 No
AT2G14890.1 192 No
AT2G14890.2 177 No
AT2G14900.1 109 No
AT2G14910.1 387 No
AT2G14910.2 367 No
MSTRG.5727.1 77 Yes
AT2G14935.1 75 Yes
AT2G15000.1 94 Yes
AT2G15000.4 102 No
AT2G15000.6 102 No
AT2G15000.3 98 Yes
AT2G15000.7 102 No
AT2G15000.2 98 Yes
AT2G15000.5 102 No
AT2G15010.1 136 No
AT2G15090.1 482 No
AT2G15128.1 17 Yes
AT2G15230.1 394 No
AT2G15240.1 253 No
AT2G15270.1 195 No
AT2G15270.2 195 No
AT2G15280.1 202 No
MSTRG.5735.2 200 No
AT2G15280.2 163 No
AT2G15290.1 297 No
AT2G15292.1 75 Yes
AT2G15300.1 745 No
MSTRG.5738.2 572 No
AT2G15325.1 122 No
AT2G15400.1 320 No
AT2G15400.2 236 No
MSTRG.5740.3 250 No
MSTRG.5741.1 55 Yes
AT2G15430.1 320 No
AT2G15430.2 245 No
AT2G15430.3 244 No
MSTRG.

AT2G20110.1 572 No
AT2G20110.2 579 No
AT2G20110.3 545 No
AT2G20120.1 269 No
AT2G20120.2 269 No
MSTRG.6036.3 49 Yes
AT2G20130.1 257 No
MSTRG.6037.2 85 Yes
AT2G20140.1 444 No
AT2G20160.1 151 No
AT2G20170.2 402 No
AT2G20170.3 385 No
AT2G20170.1 402 No
AT2G20180.1 408 No
AT2G20180.2 479 No
AT2G20180.3 479 No
AT2G20180.5 408 No
AT2G20180.4 479 No
AT2G20180.7 408 No
AT2G20180.6 479 No
AT2G20180.8 343 No
AT2G20190.1 1440 No
AT2G20208.1 85 Yes
AT2G20210.1 605 No
AT2G20210.2 489 No
AT2G20230.1 271 No
AT2G07265.1 41 Yes
AT2G20240.1 714 No
MSTRG.6047.2 521 No
AT2G07275.1 67 Yes
AT2G20250.1 575 No
AT2G20250.2 593 No
AT2G20250.3 443 No
AT2G20260.1 146 No
AT2G20270.1 180 No
AT2G20270.2 207 No
MSTRG.6051.3 180 No
AT2G20280.1 372 No
MSTRG.6052.2 36 Yes
AT2G20290.2 1483 No
AT2G20290.3 1468 No
AT2G20290.4 1477 No
AT2G20290.1 1494 No
AT2G20300.1 745 No
AT2G20310.1 431 No
AT2G20320.1 1030 No
AT2G20320.2 1030 No
AT2G20330.1 649 No
AT2G20340.1 491 No
AT2G20340.2 464 No
MSTRG.6058.3 44 Yes
AT2G20360.1 403 No

AT2G22990.4 417 No
AT2G22990.2 320 No
AT2G07855.1 16 Yes
AT2G23000.2 438 No
AT2G23000.1 438 No
AT2G23040.4 99 Yes
AT2G23040.2 99 Yes
AT2G23040.3 124 No
MSTRG.6267.4 73 Yes
AT2G23040.6 73 Yes
AT2G23040.7 73 Yes
AT2G23040.5 93 Yes
AT2G23040.1 127 No
AT2G23050.2 366 No
AT2G23050.1 482 No
MSTRG.6268.3 47 Yes
AT2G23060.2 280 No
AT2G23060.1 414 No
AT2G23070.1 433 No
AT2G23080.1 334 No
AT2G23080.2 308 No
AT2G23090.1 79 Yes
AT2G07875.1 46 Yes
AT2G23093.1 450 No
AT2G23093.2 383 No
MSTRG.6274.3 95 Yes
AT2G23120.1 84 Yes
AT2G23118.1 48 Yes
AT2G23130.1 186 No
AT2G23130.2 163 No
AT2G23140.1 830 No
AT2G23140.2 827 No
AT2G23140.3 638 No
AT2G23140.4 827 No
AT2G23140.5 638 No
AT2G23150.1 510 No
AT2G23170.1 596 No
AT2G23180.1 517 No
AT2G23200.1 835 No
AT2G23210.1 288 No
AT2G23260.1 457 No
AT2G23290.1 310 No
AT2G23300.1 774 No
AT2G23310.1 213 No
AT2G23310.2 212 No
AT2G23320.1 318 No
AT2G23320.2 263 No
AT2G23321.1 47 Yes
AT2G23340.1 177 No
AT2G23348.2 91 Yes
AT2G23348.3 91 Yes
AT2G23348.1 91 Yes
AT2G23348

AT2G25900.1 316 No
AT2G25910.1 342 No
AT2G25910.2 343 No
AT2G25920.1 281 No
AT2G25930.1 696 No
AT2G08135.1 25 Yes
AT2G25940.1 479 No
AT2G25950.2 192 No
AT2G25950.1 205 No
AT2G25964.1 51 Yes
AT2G25970.1 633 No
AT2G25980.1 450 No
AT2G25980.2 345 No
AT2G26000.2 480 No
AT2G26000.1 462 No
AT2G26000.3 480 No
MSTRG.6492.1 74 Yes
AT2G26030.1 443 No
AT2G26030.4 448 No
AT2G26030.2 370 No
AT2G26030.3 443 No
AT2G26030.5 288 No
AT2G26040.1 191 No
AT2G26060.1 353 No
AT2G26060.2 338 No
AT2G26070.1 251 No
AT2G26070.2 289 No
AT2G26080.1 1045 No
MSTRG.6496.2 45 Yes
AT2G26100.1 372 No
AT2G26100.2 359 No
AT2G26110.1 310 No
AT2G26140.1 718 No
AT2G26170.1 523 No
AT2G26170.2 440 No
AT2G26180.1 417 No
AT2G26190.2 421 No
AT2G26190.1 533 No
AT2G26190.3 533 No
AT2G26200.1 566 No
AT2G26200.2 356 No
MSTRG.6503.3 41 Yes
AT2G26210.1 191 No
AT2G26210.2 168 No
AT2G26210.7 191 No
AT2G26210.4 191 No
AT2G26210.5 191 No
AT2G26210.3 168 No
AT2G26210.6 191 No
MSTRG.6504.8 101 No
at2g26211 17 Yes
MSTRG.6506.1 64 Yes
AT2G2623

# Author contribution:
Step 1.1-1.3 Alex Sandercock \
Step 1.4-1.5 Becca Selby \
Step 2 Shu Yang \
Step 3 Zirui Zhou