# Generating SLiM exome recombination map from GTF annotations

In [1]:
from pybedtools import BedTool
import pandas as pd

In [2]:
# per-bp recombination rate, in crossovers per bp per generation
RECOMB_RATE = 1e-8

### Download the GTF annotations

In [3]:
# Ensembl release 75 (GRCh37) annotation, read straight from the FTP server.
# No header row is read from the file (header=None): the first 5 lines are
# skipped and the nine tab-separated fields are named explicitly.
gtf = pd.read_table(
    'ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz',
    sep='\t',
    header=None,
    names=['chrom', 'source', 'feature', 'start', 'end',
           'score', 'strand', 'frame', 'attribute'],
    skiprows=5,
    skipinitialspace=True,
    compression='gzip',
    low_memory=False
)

In [4]:
# quick look at the first rows of the parsed annotation table
gtf.head()

Unnamed: 0,chrom,source,feature,start,end,score,strand,frame,attribute
0,1,pseudogene,gene,11869,14412,.,+,.,"gene_id ""ENSG00000223972""; gene_name ""DDX11L1""..."
1,1,processed_transcript,transcript,11869,14409,.,+,.,"gene_id ""ENSG00000223972""; transcript_id ""ENST..."
2,1,processed_transcript,exon,11869,12227,.,+,.,"gene_id ""ENSG00000223972""; transcript_id ""ENST..."
3,1,processed_transcript,exon,12613,12721,.,+,.,"gene_id ""ENSG00000223972""; transcript_id ""ENST..."
4,1,processed_transcript,exon,13221,14409,.,+,.,"gene_id ""ENSG00000223972""; transcript_id ""ENST..."


### Take only autosomal exonic regions from the GTF file

In [5]:
# chromosome names '1' .. '22', kept as strings
AUTOSOMES = [str(number) for number in range(1, 23)]

In [6]:
# keep only rows that are exon features, whose source is "protein_coding",
# and that lie on one of the chromosomes listed in AUTOSOMES
exons = gtf.loc[(gtf.feature == "exon") &
                (gtf.source == "protein_coding") &
                gtf.chrom.isin(AUTOSOMES)]

### Merge the overlapping exons

In [7]:
# round-trip through bedtools: sort the exon records, merge the overlapping
# ones and read the result back as a DataFrame
exons = (
    BedTool.from_dataframe(exons)
    .sort()
    .merge()
    .to_dataframe()
)

In [8]:
# renumber the rows from 0, discarding the previous index
exons = exons.reset_index(drop=True)

### Convert chromosome IDs to integers and sort exons by chromosome and start position

In [9]:
# cast the chromosome IDs to int, then order the exons by chromosome and
# by start position within each chromosome
exons = (
    exons
    .assign(chrom=exons.chrom.astype(int))
    .sort_values(by=['chrom', 'start'])
)

## Generating the recombination map

Recombination rate within exons is $10^{-8}$ crossovers per bp per generation:

In [10]:
# every exon row gets the same per-bp recombination rate
exons = exons.assign(recomb_rate=RECOMB_RATE)

Recombination rate between exons is implemented by inserting a 1 bp "gap" between each pair of adjacent exons and setting the recombination rate at these positions to $L \times 1\cdot10^{-8}$ crossovers per generation ($L$ is the distance between adjacent exons).

The recombination rate of the "gap" between the last exon on one chromosome and the first exon on another chromosome will be 0.5.

In [11]:
def create_recomb_map(exons, recomb_rate=None):
    """Create a recombination map from a DataFrame of exon coordinates.

    After every exon a 1 bp "gap" interval is inserted that stands in for the
    whole stretch of sequence up to the next exon. Its recombination rate is
    the per-bp rate multiplied by the distance between the two exons (capped
    at 0.5). The gap after the last exon of a chromosome, including the very
    last row of the result, gets a rate of 0.5.

    Parameters
    ----------
    exons : pandas.DataFrame
        Exon intervals with columns 'chrom', 'start', 'end' and 'recomb_rate'.
        Exon rows keep whatever 'recomb_rate' value they arrive with. Rows may
        be in any order; they are sorted by chromosome and start here.
    recomb_rate : float, optional
        Per-bp recombination rate used to derive the rates of the gaps.
        Defaults to the module-level RECOMB_RATE.

    Returns
    -------
    pandas.DataFrame
        Exon and gap rows interleaved in genomic order, with columns 'chrom',
        'start', 'end', 'interval_end' and 'recomb_rate'. 'interval_end' is
        the running total of the interval lengths (end - start), i.e. the
        position at which each interval ends once all intervals are placed
        back to back. An empty input gives an empty frame with these columns.
    """
    if recomb_rate is None:
        recomb_rate = RECOMB_RATE

    out_columns = ['chrom', 'start', 'end', 'interval_end', 'recomb_rate']

    # nothing to do for an empty input (the construction below would fail
    # because the trailing 0.5 entry would not match the zero-length columns)
    if len(exons) == 0:
        return pd.DataFrame(columns=out_columns)

    # gap lengths are computed from consecutive rows, so the rows must be in
    # genomic order no matter how the caller ordered them
    exons = exons.sort_values(by=['chrom', 'start'])

    chroms = exons.chrom.values
    starts = exons.start.values
    ends = exons.end.values

    # rate of each gap = per-bp rate * distance to the next exon; a
    # recombination fraction cannot exceed 0.5, so very long gaps are capped
    gap_rates = (recomb_rate * (starts[1:] - ends[:-1])).clip(max=0.5)

    # where the next exon is on a different chromosome the distance is
    # meaningless: the two exons are unlinked
    gap_rates[chroms[1:] != chroms[:-1]] = 0.5

    # create a new DataFrame with coordinates of 1 bp inter-exonic gaps
    # (the gap after the very last exon also gets 0.5)
    gaps = pd.DataFrame({'chrom'       : chroms,
                         'start'       : ends,
                         'end'         : ends + 1,
                         'recomb_rate' : gap_rates.tolist() + [0.5]},
                        columns=['chrom', 'start', 'end', 'recomb_rate'])

    # merge the dataframes of exon and gap coordinates
    exons_and_gaps = (pd.concat([exons, gaps])
                      .sort_values(by=['chrom', 'start'])
                      .reset_index(drop=True))

    # calculate the end position of each simulated exon/gap region
    # (this is how a recombination map is specified for SLiM)
    exons_and_gaps['interval_end'] = (exons_and_gaps.end - exons_and_gaps.start).cumsum()

    return exons_and_gaps[out_columns]

SLiM simulates the whole genome as one segment. Therefore, the exons of all chromosomes have to be concatenated and all coordinates have to be shifted appropriately (i.e. the first coordinate on chromosome 2 has to be incremented by the last coordinate of chromosome 1).

In [12]:
# per-chromosome recombination maps, keyed by chromosome name
recomb_map = {}
# last position used so far on the concatenated genome
previous_end = 0

for chrom in AUTOSOMES:
    chrom_exons = exons[exons.chrom == int(chrom)]

    # a chromosome without any exons contributes nothing to the map
    # (and would otherwise make the steps below fail on empty data)
    if chrom_exons.empty:
        continue

    # generate SLiM format recombination map for this chromosome
    chrom_map = create_recomb_map(chrom_exons)

    # increment the positions of the recombination map based on the
    # last position on the previous chromosome; assign() builds a new frame
    # instead of mutating a column of a frame sliced out of another one
    recomb_map[chrom] = chrom_map.assign(
        interval_end=chrom_map.interval_end + previous_end)

    # remember the last position used so far; it becomes the offset
    # for the next chromosome
    previous_end = recomb_map[chrom].interval_end.max()

In [13]:
# stack the per-chromosome maps into a single table and order the rows
# by their position on the concatenated genome
recomb_map = (
    pd.concat(recomb_map, ignore_index=True)
    .sort_values(by='interval_end')
)

Remove the very last interval of the recombination map: it is the 1 bp "gap" after the last exon of the last chromosome, which has a recombination rate of 0.5 but is not needed because no other chromosome follows it.

In [14]:
# number of rows (exon and gap intervals) in the combined map
len(recomb_map)

428386

In [15]:
# The trailing row is the 1 bp gap that follows the last exon of the last
# chromosome (rate 0.5). Drop it only while it is still present, so that
# re-running this cell does not keep trimming real intervals off the map.
if len(recomb_map) > 0 and recomb_map.recomb_rate.iloc[-1] == 0.5:
    recomb_map = recomb_map.iloc[:-1]

### The final recombination map to use in SLiM

In [16]:
# first rows of the map: exon and 1 bp gap intervals alternate
recomb_map.head()

Unnamed: 0,chrom,start,end,interval_end,recomb_rate
0,1,69090,70008,918,1e-08
1,1,70008,70009,919,0.00064892
2,1,134900,135802,1821,1e-08
3,1,135802,135803,1822,1.818e-05
4,1,137620,139379,3581,1e-08


In [17]:
# last rows of the map, after the trailing gap has been removed
recomb_map.tail()

Unnamed: 0,chrom,start,end,interval_end,recomb_rate
288596,22,51220615,51220779,72548207,1e-08
288597,22,51220779,51220780,72548208,4.17e-06
288598,22,51221196,51221714,72548726,1e-08
288599,22,51221714,51221715,72548727,2.14e-06
288600,22,51221928,51222091,72548890,1e-08


## Output the result in a format suitable for import into SLiM

In [18]:
# make sure the output directory exists (-p: no error if it already does)
!mkdir -p ../input

In [19]:
# write only the interval end positions and their rates, as a
# tab-separated table without header or index column
recomb_map.loc[:, ['interval_end', 'recomb_rate']].to_csv(
    '../input/exons.txt',
    sep='\t',
    header=False,
    index=False
)