## spliceR (python version) code

#### Import packages

In [18]:
import numpy as np
import pandas as pd
import json
from Bio.Seq import Seq
import math
import urllib.request

#### Helper functions

In [28]:
### reads information from json files from www.ebi.ac.uk HLA database
### example webpage: https://www.ebi.ac.uk/ipd/imgt/hla/alleles/allele/?accession=HLA00050

def parse_json(file_path=None, file_url=None):
    """ Load one allele record (json) from the www.ebi.ac.uk HLA database.

    The record is read from `file_url` when it is given, otherwise from the
    local file at `file_path`.

    returns dict of:
    - haplotype_name (str): the record's 'name'
    - genomic_features (list of dicts, one dict for each utr/exon/intron, keys are 'length', 'start', 'type', 'number' (intron/exons only))
    - coding_features (similar to genomic features but only has exons - needed for pct cDNA calculation)
    - sequences (the record's whole 'sequence' entry, e.g. protein, coding, and genomic sequences)
    - genomic_sequence (str)

    note: the CWD 'prevalence' tuple is intentionally NOT returned at the moment
    (its lookup is disabled below).

    raises ValueError if neither file_path nor file_url is given
    raises KeyError if the record lacks 'name', 'feature' or 'sequence' entries
    """
    if file_url is not None:
        with urllib.request.urlopen(file_url) as response:
            HLA_file_contents = json.load(response)
    elif file_path is not None:
        # json is UTF-8 by specification, so do not depend on the locale default encoding
        with open(file_path, encoding="utf-8") as HLA_file:
            HLA_file_contents = json.load(HLA_file)
    else:
        raise ValueError("parse_json needs either file_path or file_url")

    # keys used from the record: 'feature' (exon coords), 'name', 'sequence'
    features = HLA_file_contents['feature']
    sequences = HLA_file_contents['sequence']

    d = {}
    d['haplotype_name'] = HLA_file_contents['name']
    # prevalence lookup disabled:
    # d['prevalence'] = (HLA_file_contents['cwd']['version_2'], HLA_file_contents['cwd']['version_3']['Total'])
    d['genomic_features'] = features['genomic']
    d['coding_features'] = features['coding']
    d['sequences'] = sequences
    d['genomic_sequence'] = sequences['genomic']

    return d

In [20]:
### reads in motif and position weights from tsv files 
def parse_motif_position_weights(motif_weights_filepath, position_weights_filepath):
    """ Load the motif-weight and position-weight tables (tab separated, first column used as index).

    returns 2 element tuple (motif_weights_dict, position_weights_dict)
        motif_weights_dict: maps each row label of the motif file (the motif string) to a
            dict of that row's columns
        position_weights_dict: maps enzyme class ("ABE" / "CBE") to a dict that maps each
            row label of the position file (the edited NT position) to a dict of that row's columns;
            rows are assigned to a class by their 'enzyme_class' column

    example of using weight dictionaries: 
        pw = position_weights_dict["CBE"][target_pos]['position_weight']
        mw = motif_weights_dict[motif]['motif_weight']
    """
    # motif table -> {motif: {column: value}}
    motif_table = pd.read_csv(motif_weights_filepath, sep="\t", index_col=0)
    motif_lookup = motif_table.to_dict('index')

    # position table -> {enzyme class: {position: {column: value}}}
    position_table = pd.read_csv(position_weights_filepath, sep="\t", index_col=0)
    position_lookup = {}
    for enzyme in ("ABE", "CBE"):
        rows_for_enzyme = position_table[position_table['enzyme_class'] == enzyme]
        position_lookup[enzyme] = rows_for_enzyme.to_dict('index')

    return motif_lookup, position_lookup
    

In [21]:
### functions for manipulating DNA seq strings and other basic helper functions

# complement lookup for both upper and lower case bases; anything else (e.g. "N") maps to itself
_COMPLEMENT_TABLE = str.maketrans("ACGTacgt", "TGCAtgca")


def get_rev_complement(seq):
    """ Return the reverse complement of a DNA string.

    A<->T and G<->C are swapped (case is preserved) and the result is reversed.
    Characters that are not A/C/G/T (for example the ambiguity code "N") are kept
    unchanged instead of being dropped, so the output always has the same length
    as the input and coordinates computed on it stay valid.
    """
    return seq.translate(_COMPLEMENT_TABLE)[::-1]

def matchDNAPattern(pattern, subject):
    """ Slide `pattern` (shorter DNA string) along `subject` (longer DNA string) and collect every match.

    returns a list of 2 element lists [start, window]:
        start is the 0-based start coord of the matching region in `subject`,
        window is the subsequence of `subject` that matched
    note: "N" in the pattern matches any character of the subject; every other
    pattern character must be equal to the subject character

    raises Exception if the pattern is longer than the subject
    """
    width = len(pattern)
    if width > len(subject):
        raise Exception("pattern cannot be longer than subject")

    def _fits(window):
        # every pattern position is either the wildcard or an exact match
        return all(p == "N" or p == base for p, base in zip(pattern, window))

    hits = []
    for offset in range(len(subject) - width + 1):
        window = subject[offset:offset + width]
        if _fits(window):
            hits.append([offset, window])
    return hits

# test
# print(matchDNAPattern("NNGCA", "ACTGCAGGGCA"))
# print(matchDNAPattern("NNGCAT", "ACTGCAGGGCA"))
# print(matchDNAPattern("NNGCAAAAAAAAAAAAAAT", "ACTGCAGGGCA"))

def probability(l):
    """ Logistic function: map a log-odds value `l` to a probability in [0, 1].

    Computes exp(l) / (1 + exp(l)) in a form that never exponentiates a large
    positive number, so very large `l` returns 1.0 instead of raising
    OverflowError (and very negative `l` returns 0.0).
    """
    if l >= 0:
        # algebraically identical to exp(l)/(1+exp(l)); exp(-l) <= 1 cannot overflow
        return 1 / (1 + math.exp(-l))
    exp_l = math.exp(l)
    return exp_l / (1 + exp_l)

#### Main function generating dataframe with scored guides

In [22]:
def spliceR_main(
    genomic_features,
    coding_features,
    genomic_sequence,
    coding_sequence,
    motif_weights_dict,
    position_weights_dict,
    output_as_df = True,
    haplotype_name = None,
    guide_length = 20, 
    pam = "NGG",
    flank_length = 20,
    CBE = True, 
    ABE = True,
    SA = True, 
    SD = True,
    logistic_adjust = 1
):
    """ 
    Find and score base-editor guides around the splice sites of every exon.

    For each exon the window of `flank_length + len(pam)` bases on either side of the
    exon boundary is searched for `guide_length` bases followed by `pam`. A guide is kept
    when the targeted base falls at position 1..guide_length of the guide, and is scored as
    probability(motif_weight + position_weight - logistic_adjust).

    Which sites are searched:
        - exon number 1: splice donor (end of exon) only
        - exon with the highest number: splice acceptor (start of exon) only
        - every other exon: acceptor first, then donor
      within a site, CBE guides are listed before ABE guides.

    inputs: 
        - genomic_features: a list of dictionaries, one dict for each utr/exon/intron, keys are 'length', 'start', 'type', 'number' (for intron/exons only)
        partial example: [{'length': 300, 'start': 1, 'type': '5utr'},
        {'length': 73,
        'number': '1',
        'partial': False,
        'start': 301,
        'type': 'exon'},
        {'length': 130,
        'number': '1',
        'partial': False,
        'start': 374,
        'type': 'intron'}, ...
        - coding features: similar list of dictionaries as 'genomic_features' but only includes exons 
        - genomic sequence: string of genomic DNA sequence 
        - coding sequence: string of cDNA sequence 
        - motif_weights_dict: motif (7 NT str) -> {'motif_weight': float}
        - position_weights_dict: enzyme class ("ABE"/"CBE") -> position in guide -> {'position_weight': float}
        - output_as_df: return a pandas dataframe (True) or a list of lists (False)
        - haplotype_name: label copied into the first column of every row
        - guide_length / pam / flank_length: protospacer length, PAM pattern ("N" = any base),
          and number of bases searched on either side of the boundary (in addition to the PAM length)
        - CBE / ABE: which enzyme classes to report
        - SA / SD: whether to report splice acceptor / splice donor sites
        - logistic_adjust: constant subtracted from the summed weights before the logistic transform
    
    returns: dataframe (or list of lists) with columns: 
    ['haplotype name', 'exon number', 'splice site type', 'protospacer', 'guide seq with PAM', 'PAM', 'enzyme class', 'CBE/ABE position', 'guide score', 'cDNA distuption pct']

    raises: KeyError when a motif or a position has no entry in the weight dictionaries;
    whatever matchDNAPattern raises when the search window is shorter than guide + PAM.
    """
    cols = ['haplotype name', 'exon number', 'splice site type', 'protospacer', 'guide seq with PAM', 'PAM', 'enzyme class', 'CBE/ABE position', 'guide score', 'cDNA distuption pct']

    pam_length = len(pam)
    half_window = flank_length + pam_length  # bases searched on each side of the exon boundary
    guide_template_w_pam = "N" * guide_length + pam
    motif_length = 7

    # One spec per (site, enzyme):
    #   (enzyme class, motif start relative to the boundary, shift applied to the target
    #    position, whether the window and motif are reverse complemented)
    # Unlike the other 3 cases, the ABE acceptor window is NOT reverse complemented.
    acceptor_specs = []
    donor_specs = []
    if CBE:
        acceptor_specs.append(("CBE", -4, 1, True))
        donor_specs.append(("CBE", -3, 0, True))
    if ABE:
        acceptor_specs.append(("ABE", -5, -1, False))
        donor_specs.append(("ABE", -2, -1, True))

    def _score_site(exon_number, boundary_idx, site_type, spec, pct_of_cds):
        """ Return one output row per valid guide for a single splice site / enzyme class. """
        enzyme, motif_offset, pos_shift, reverse_strand = spec
        window = genomic_sequence[boundary_idx - half_window:boundary_idx + half_window]
        motif_start = boundary_idx + motif_offset
        motif = genomic_sequence[motif_start:motif_start + motif_length]
        if reverse_strand:
            window = get_rev_complement(window)
            motif = get_rev_complement(motif)

        rows = []
        for strt, guide_w_pam in matchDNAPattern(guide_template_w_pam, window):  # strt is 0 indexed
            # position of the target base within the guide (1-based)
            target_pos = half_window - strt + pos_shift
            if not 1 <= target_pos <= guide_length:
                continue
            pw = position_weights_dict[enzyme][target_pos]['position_weight']
            mw = motif_weights_dict[motif]['motif_weight']
            score = probability(mw + pw - logistic_adjust)
            # the protospacer is the guide without its trailing PAM (whatever the PAM length)
            protospacer = guide_w_pam[:guide_length]
            rows.append([haplotype_name, exon_number, site_type, protospacer, guide_w_pam, pam, enzyme, target_pos, score, pct_of_cds])
        return rows

    #### MAIN FUNCTION 
    exons_list = [feature for feature in genomic_features if feature['type'] == 'exon']
    num_exons = max([0] + [int(feature['number']) for feature in exons_list])

    ## list to collect all outputs 
    output_list_of_lists = [] # each list corresponds to a possible guide 
    ## loop over all exons 
    for exon in exons_list:
        ## calculating what percentage of the way through coding sequence the end of this exon is (want earlier disruption)
        # NOTE(review): 'start' + 'length' is one past the exon's last base if the coding
        # 'start' is 1-based like the genomic 'start' used below -- confirm this is intended.
        end_of_exon_in_cds = next(
            (
                cd_feat['start'] + cd_feat['length']
                for cd_feat in coding_features
                if cd_feat['number'] == exon['number']
            ),
            0,
        )
        pct_of_cds = end_of_exon_in_cds / len(coding_sequence)

        ## getting start and end coordinates of this exon (relative to GENOMIC sequence)
        exon_start_idx = exon['start'] - 1 #-1 to account for 0 indexing 
        exon_end_idx = exon_start_idx + exon['length']

        exon_no = int(exon['number'])
        is_first = exon_no == 1
        is_last = exon_no == num_exons

        # splice acceptor sites (before exon): every exon except the first
        if SA and not is_first:
            for spec in acceptor_specs:
                output_list_of_lists.extend(_score_site(exon['number'], exon_start_idx, "acceptor", spec, pct_of_cds))

        # splice donor sites (after exon): every exon except the last
        # (an exon numbered 1 always gets its donor site, even if it is also the last exon)
        if SD and (is_first or not is_last):
            for spec in donor_specs:
                output_list_of_lists.extend(_score_site(exon['number'], exon_end_idx, "donor", spec, pct_of_cds))

    if output_as_df:
        return pd.DataFrame(output_list_of_lists, columns=cols)

    return output_list_of_lists
        

## For HLA project

In [25]:
# tab separated table; the 'name' and 'accession' columns are used below
n_to_a = pd.read_csv("HLA_A_name_to_accession.txt", sep="\t")
# dict mapping HLA haplotype name to accession
name_to_accession = {
    name: accession
    for name, accession in zip(n_to_a['name'], n_to_a['accession'])
}

In [29]:
#### running spliceR on all HLA A common haplotypes 

# column layout of the combined results (same columns as the frames returned by spliceR_main)
cols = ['haplotype name', 'exon number', 'splice site type', 'protospacer', 'guide seq with PAM', 'PAM', 'enzyme class', 'CBE/ABE position', 'guide score', 'cDNA distuption pct']

motif_weights_filepath = "../motif_weights.tsv"
position_weights_filepath = "../position_weights.tsv"
motif_weights_dict, position_weights_dict = parse_motif_position_weights(motif_weights_filepath, position_weights_filepath)

# One scored-guide dataframe per haplotype, concatenated once after the loop.
# This avoids concatenating onto an empty placeholder frame (pandas deprecates
# dtype inference with empty entries) and avoids re-copying all earlier results
# on every iteration.
per_haplotype_frames = []
for haplotype, accession in name_to_accession.items():
    print(haplotype)
    hap_url = f"https://www.ebi.ac.uk/cgi-bin/ipd/api/allele/{accession}"
    hap_info = parse_json(file_url=hap_url)
    genomic_features = hap_info['genomic_features']
    coding_features = hap_info['coding_features']
    genomic_sequence = hap_info['genomic_sequence']
    coding_sequence = hap_info['sequences']['coding']

    hap_out = spliceR_main(
        genomic_features,
        coding_features,
        genomic_sequence,
        coding_sequence,
        motif_weights_dict,
        position_weights_dict,
        output_as_df = True,
        haplotype_name = haplotype,
        guide_length = 20, 
        pam = "NGG",
        flank_length = 20,
        CBE = True, 
        ABE = True,
        SA = True, 
        SD = True,
        logistic_adjust = 1
    )
    # haplotypes without any guide contribute no rows, so they are left out of the concat
    if not hap_out.empty:
        per_haplotype_frames.append(hap_out)

if per_haplotype_frames:
    HLA_A_output_df = pd.concat(per_haplotype_frames, axis=0, ignore_index=True)
else:
    # nothing was scored: keep an empty frame with the expected columns
    HLA_A_output_df = pd.DataFrame(columns=cols)


HLA_A_output_df



A*30:04:01:01


  HLA_A_output_df = pd.concat([HLA_A_output_df, hap_out], axis=0, ignore_index=True)


A*24:03:01:01
A*24:02:01:01
A*11:01:01:01
A*34:02:01:01
A*32:01:01:01
A*01:01:01:01
A*03:02:01:01
A*68:01:01:01
A*68:01:02:01
A*23:01:01:01
A*11:02:01:01
A*74:01:01:01
A*02:05:01:01
A*26:01:01:01
A*30:01:01:01
A*34:01:01:01
A*26:03:01:01
A*66:01:01:01
A*03:01:01:01
A*02:11:01:01
A*02:02:01:01
A*02:03:01:01
A*33:01:01:01
A*30:02:01:01
A*24:07:01:01
A*29:02:01:01
A*25:01:01:01
A*68:02:01:01
A*33:03:01:01
A*36:01:01:01
A*29:01:01:01
A*02:01:01:01
A*02:01:04
A*31:01:02:01
A*68:03:01:01
A*02:06:01:01
A*02:07:01:01


Unnamed: 0,haplotype name,exon number,splice site type,protospacer,guide seq with PAM,PAM,enzyme class,CBE/ABE position,guide score,cDNA distuption pct
0,A*30:04:01:01,1,donor,GACCCCGCACTCACCCGCCC,GACCCCGCACTCACCCGCCCAGG,NGG,CBE,14,0.113088,0.067395
1,A*30:04:01:01,1,donor,CGCACTCACCCGCCCAGGTC,CGCACTCACCCGCCCAGGTCTGG,NGG,CBE,9,0.334910,0.067395
2,A*30:04:01:01,1,donor,GCACTCACCCGCCCAGGTCT,GCACTCACCCGCCCAGGTCTGGG,NGG,CBE,8,0.466535,0.067395
3,A*30:04:01:01,1,donor,CACCCGCCCAGGTCTGGGTC,CACCCGCCCAGGTCTGGGTCAGG,NGG,CBE,3,0.336702,0.067395
4,A*30:04:01:01,1,donor,ACCCGCCCAGGTCTGGGTCA,ACCCGCCCAGGTCTGGGTCAGGG,NGG,CBE,2,0.257234,0.067395
...,...,...,...,...,...,...,...,...,...,...
1910,A*02:07:01:01,7,acceptor,CCCTGGGCACTGTCACTGCC,CCCTGGGCACTGTCACTGCCTGG,NGG,CBE,20,0.044522,0.996357
1911,A*02:07:01:01,7,acceptor,CCTGGGCACTGTCACTGCCT,CCTGGGCACTGTCACTGCCTGGG,NGG,CBE,19,0.059258,0.996357
1912,A*02:07:01:01,7,acceptor,CTGGGCACTGTCACTGCCTG,CTGGGCACTGTCACTGCCTGGGG,NGG,CBE,18,0.085840,0.996357
1913,A*02:07:01:01,7,acceptor,CCCAGGCAGTGACAGTGCCC,CCCAGGCAGTGACAGTGCCCAGG,NGG,ABE,4,0.238371,0.996357


In [30]:
# write the combined guide table for all haplotypes to csv, leaving out the row index
HLA_A_output_df.to_csv('HLA_A_all_haps_splice_site_BE_test1.csv', index=False)

## tests/extras

In [15]:
# for HLA project: score a single allele read from a locally saved json record
motif_weights_filepath = "../motif_weights.tsv"
position_weights_filepath = "../position_weights.tsv"
motif_weights_dict, position_weights_dict = parse_motif_position_weights(
    motif_weights_filepath,
    position_weights_filepath,
)

A_24_02_info = parse_json("HLA00050.json")
genomic_features = A_24_02_info['genomic_features']
coding_features = A_24_02_info['coding_features']
genomic_sequence = A_24_02_info['genomic_sequence']
coding_sequence = A_24_02_info['sequences']['coding']

# every option is spelled out explicitly so the run does not depend on the defaults
test1 = spliceR_main(
    genomic_features=genomic_features,
    coding_features=coding_features,
    genomic_sequence=genomic_sequence,
    coding_sequence=coding_sequence,
    motif_weights_dict=motif_weights_dict,
    position_weights_dict=position_weights_dict,
    output_as_df=True,
    haplotype_name="A*24:02:01:01",
    guide_length=20,
    pam="NGG",
    flank_length=20,
    CBE=True,
    ABE=True,
    SA=True,
    SD=True,
    logistic_adjust=1,
)
test1

Unnamed: 0,haplotype name,exon number,splice site type,protospacer,guide seq with PAM,PAM,enzyme class,CBE/ABE position,guide score,cDNA distuption pct
0,A*24:02:01:01,1,donor,GACCCCGCACTCACCTGCCC,GACCCCGCACTCACCTGCCCAGG,NGG,CBE,14,0.11579,0.067395
1,A*24:02:01:01,1,donor,CGCACTCACCTGCCCAGGTC,CGCACTCACCTGCCCAGGTCTGG,NGG,CBE,9,0.340876,0.067395
2,A*24:02:01:01,1,donor,GCACTCACCTGCCCAGGTCT,GCACTCACCTGCCCAGGTCTGGG,NGG,CBE,8,0.473178,0.067395
3,A*24:02:01:01,1,donor,CACCTGCCCAGGTCTGGGTC,CACCTGCCCAGGTCTGGGTCAGG,NGG,CBE,3,0.342684,0.067395
4,A*24:02:01:01,1,donor,ACCTGCCCAGGTCTGGGTCA,ACCTGCCCAGGTCTGGGTCAGGG,NGG,CBE,2,0.262362,0.067395
5,A*24:02:01:01,1,donor,GACCCCGCACTCACCTGCCC,GACCCCGCACTCACCTGCCCAGG,NGG,ABE,13,0.020324,0.067395
6,A*24:02:01:01,1,donor,CGCACTCACCTGCCCAGGTC,CGCACTCACCTGCCCAGGTCTGG,NGG,ABE,8,0.269631,0.067395
7,A*24:02:01:01,1,donor,GCACTCACCTGCCCAGGTCT,GCACTCACCTGCCCAGGTCTGGG,NGG,ABE,7,0.417329,0.067395
8,A*24:02:01:01,1,donor,CACCTGCCCAGGTCTGGGTC,CACCTGCCCAGGTCTGGGTCAGG,NGG,ABE,2,0.061594,0.067395
9,A*24:02:01:01,1,donor,ACCTGCCCAGGTCTGGGTCA,ACCTGCCCAGGTCTGGGTCAGGG,NGG,ABE,1,0.027279,0.067395


In [17]:
# fetch one allele record straight from the IPD API by accession and display the parsed dict
# NOTE(review): the variable name suggests A*24:02, but the output recorded for this cell
# reports haplotype_name 'A*01:02:01:01' for accession HLA00002 -- confirm which allele was intended.
A_24_02_info_frm_url = parse_json(file_url="https://www.ebi.ac.uk/cgi-bin/ipd/api/allele/HLA00002")
A_24_02_info_frm_url

{'haplotype_name': 'A*01:02:01:01',
 'prevalence': ('C', 'C'),
 'genomic_features': [{'length': 300, 'start': 1, 'type': '5utr'},
  {'length': 73,
   'number': '1',
   'partial': False,
   'start': 301,
   'type': 'exon'},
  {'length': 130,
   'number': '1',
   'partial': False,
   'start': 374,
   'type': 'intron'},
  {'length': 270,
   'number': '2',
   'partial': False,
   'start': 504,
   'type': 'exon'},
  {'length': 241,
   'number': '2',
   'partial': False,
   'start': 774,
   'type': 'intron'},
  {'length': 276,
   'number': '3',
   'partial': False,
   'start': 1015,
   'type': 'exon'},
  {'length': 579,
   'number': '3',
   'partial': False,
   'start': 1291,
   'type': 'intron'},
  {'length': 276,
   'number': '4',
   'partial': False,
   'start': 1870,
   'type': 'exon'},
  {'length': 102,
   'number': '4',
   'partial': False,
   'start': 2146,
   'type': 'intron'},
  {'length': 117,
   'number': '5',
   'partial': False,
   'start': 2248,
   'type': 'exon'},
  {'length':