# Generating SLiM genomic structures and recombination map

In [1]:
from pybedtools import BedTool
import pandas as pd

In [2]:
# create the output directory that later cells write the SLiM coordinate
# files into (`-p`: no error if it already exists)
!mkdir -p ../data/slim_coords

<br><br>
## Annotate the BED files with region coordinates & inter-region sites with recombination rate

### First add the gap regions between inter-region sites and regions themselves

In [3]:
%%bash
# Run add_spacers_to_bed.py on every *_regions_gap_sites.bed file; the result
# is written next to the input as <input-without-extension>_spacers.bed.
#
# Abort on the first failing command so that a broken file makes the whole
# cell fail instead of being silently skipped.
set -euo pipefail

for f in ../data/bed/regions/*_regions_gap_sites.bed; do
    # quote the paths so that names with spaces or glob characters survive
    python3 ../add_spacers_to_bed.py "$f" "${f%.*}_spacers.bed"
done

### Then add the recombination rates for each region/site/spacer

In [4]:
%%bash
# For every *_spacers.bed file:
#   1. bedmap echoes each record together with the number of overlapping
#      genetic-map elements, their scores and the overlap sizes,
#   2. get_window_average.py post-processes that output (presumably averaging
#      the mapped rates per record - see the script for the meaning of "4"),
#   3. awk drops the records whose 7th column is "0.0" or "0",
# and the result is written to <input-without-extension>_recomb_rates.bed.
#
# `pipefail` makes a failure of bedmap or python fail the cell; without it
# only the exit status of awk (the last pipeline stage) would be looked at.
set -euo pipefail

for f in ../data/bed/regions/*_regions_gap_sites_spacers.bed; do
    bedmap --ec --delim '\t' --echo --count --echo-map-score --echo-overlap-size "$f" ../data/genetic_map/genetic_map_GRCh37_all.txt.bed \
        | python ../get_window_average.py 4 \
        | awk '($7 != "0.0") && ($7 != "0")' \
        > "${f%.*}_recomb_rates.bed"
done

In [6]:
# list the recombination-rate annotated BED files that now exist
!ls ../data/bed/regions/*recomb_rates.bed

../data/bed/regions/ctcf_binding_site_regions_gap_sites_spacers_recomb_rates.bed
../data/bed/regions/enhancer_regions_gap_sites_spacers_recomb_rates.bed
../data/bed/regions/open_chromatin_regions_gap_sites_spacers_recomb_rates.bed
../data/bed/regions/priPhastCons_regions_gap_sites_spacers_recomb_rates.bed
../data/bed/regions/promoter_flank_regions_gap_sites_spacers_recomb_rates.bed
../data/bed/regions/promoter_regions_gap_sites_spacers_recomb_rates.bed
../data/bed/regions/protein_coding_regions_gap_sites_spacers_recomb_rates.bed
../data/bed/regions/tf_binding_site_regions_gap_sites_spacers_recomb_rates.bed
../data/bed/regions/tss_5k_regions_gap_sites_spacers_recomb_rates.bed
../data/bed/regions/utr3_regions_gap_sites_spacers_recomb_rates.bed
../data/bed/regions/utr5_regions_gap_sites_spacers_recomb_rates.bed


In [5]:
# names of the annotation classes; each one is the prefix of a
# "<name>_regions_gap_sites_spacers_recomb_rates.bed" file
regions = [
    "protein_coding",
    "utr5",
    "utr3",
    "tss_5k",
    "ctcf_binding_site",
    "enhancer",
    "open_chromatin",
    "promoter",
    "promoter_flank",
    "tf_binding_site",
    "priPhastCons",
]

In [6]:
# display the available annotation class names
regions

['protein_coding',
 'utr5',
 'utr3',
 'tss_5k',
 'ctcf_binding_site',
 'enhancer',
 'open_chromatin',
 'promoter',
 'promoter_flank',
 'tf_binding_site',
 'priPhastCons']

In [10]:
# annotation class processed by the rest of the notebook; index 1 of `regions`
# is "utr5" - change the index to generate the files for another class
region = regions[1]

<br><br><br>
## Conversion of physical coordinates into SLiM-based coordinate system

### Read the coordinates annotated with the genetic map

In [7]:
def read_recomb_map(bed_file):
    """Load a recombination-rate annotated BED file as a recombination map.

    Parameters
    ----------
    bed_file : str or path-like
        Headerless, tab-separated file with eight columns. Columns 1-4 are read
        as chrom, start, end and type, column 7 as the recombination rate
        (in cM/Mb); the remaining columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns ``chrom``, ``start``, ``end``, ``type``, ``recomb_rate`` where

        * ``type`` is "region" for non-spacer rows wider than 1 bp, "site" for
          non-spacer rows of exactly 1 bp, and "spacer" for spacer rows,
        * ``recomb_rate`` is the input rate multiplied by 1e-8,
        * every spacer is shortened to 1 bp (``end == start + 1``).
    """
    # The file has no header. The unused columns get distinct placeholder names
    # because pandas refuses duplicated entries in `names`.
    all_columns = ["chrom", "start", "end", "type", "unused_1", "length", "recomb_rate", "unused_2"]
    kept_columns = ["chrom", "start", "end", "type", "recomb_rate"]
    recomb_map = pd.read_table(bed_file, names=all_columns, usecols=kept_columns)[kept_columns]

    # add labels to each region/site/spacer to make debugging easier
    # (regions of width 1 were filtered out already when the BED files were generated)
    not_spacer = recomb_map["type"] != "spacer"
    width = recomb_map["end"] - recomb_map["start"]
    recomb_map.loc[not_spacer & (width > 1), "type"] = "region"
    recomb_map.loc[not_spacer & (width == 1), "type"] = "site"

    # convert the recombination rate (in cM/Mb) to format required by SLiM
    recomb_map["recomb_rate"] = recomb_map["recomb_rate"] * 1e-8

    # reduce the spacers to 1 bp (which is what we will simulate in SLiM)
    is_spacer = recomb_map["type"] == "spacer"
    recomb_map.loc[is_spacer, "end"] = recomb_map.loc[is_spacer, "start"] + 1

    return recomb_map

### Add "1 bp" chromosome breaks with recombination rate of 0.5

In [8]:
def add_chrom_breaks(recomb_map):
    """Append a 1 bp "chrom_break" record after each autosome's records.

    For every chromosome chr1..chr22 that has rows in `recomb_map`, a row
    ``[last_end, last_end + 1)`` of type "chrom_break" with recombination rate
    0.5 is added, where `last_end` is the `end` of that chromosome's last row.
    The per-chromosome tables are sorted by start and stacked in the order
    chr1, chr2, ..., chr22. Chromosomes without any rows are skipped.

    Parameters
    ----------
    recomb_map : pandas.DataFrame
        Table with columns chrom, start, end, type, recomb_rate.

    Returns
    -------
    pandas.DataFrame
        The stacked table with a fresh 0..n-1 index. The final row (the break
        after the last chromosome) is removed because nothing follows it.
    """
    columns = ["chrom", "start", "end", "type", "recomb_rate"]
    chrom_maps = []

    for chrom in ("chr" + str(number) for number in range(1, 23)):
        chrom_map = recomb_map.loc[recomb_map["chrom"] == chrom]

        # nothing to separate (and no last coordinate to anchor the break to)
        # when the chromosome has no records at all
        if chrom_map.empty:
            continue

        last_end = chrom_map["end"].iloc[-1]
        chrom_break = pd.DataFrame({"chrom": [chrom],
                                    "start": [last_end],
                                    "end":   [last_end + 1],
                                    "type":  ["chrom_break"],
                                    "recomb_rate": [0.5]},
                                   columns=columns)

        chrom_maps.append(pd.concat([chrom_map, chrom_break]).sort_values(by=["chrom", "start"]).reset_index(drop=True))

    # none of chr1..chr22 is present: return an empty table with the same columns
    if not chrom_maps:
        return recomb_map.iloc[0:0].reset_index(drop=True)

    # remove the very last base of the recombination map (it has a 0.5 recombination
    # rate anyway and there's no other chromosome after it)
    return pd.concat(chrom_maps)[:-1].reset_index(drop=True)

### Concatenate regions/sites on all chromosomes as if they were directly adjacent on a single chromosome

SLiM simulates everything as a single noodle - we need to concatenate everything together and change the coordinates appropriately.

In [9]:
def concatenate_regions(regions):
    """Lay all records end to end on one 0-based segment.

    Parameters
    ----------
    regions : pandas.DataFrame
        Table with (at least) the columns ``start`` and ``end``. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A copy of `regions` with three extra columns:

        * ``width``      - ``end - start``,
        * ``slim_start`` - sum of the widths of all preceding rows (0 for the
          first row),
        * ``slim_end``   - ``slim_start + width - 1``, i.e. the last position
          occupied by the row.

        An empty input yields an empty table with the same extra columns.
    """
    concat_regions = regions.copy()

    width = regions["end"] - regions["start"]
    # running total of widths == number of positions used up to and including each row
    occupied = width.cumsum()

    concat_regions["width"] = width
    concat_regions["slim_start"] = occupied - width
    concat_regions["slim_end"] = occupied - 1

    return concat_regions

In [12]:
# recombination-rate annotated coordinates of the selected annotation class
recomb_rates_bed = "../data/bed/regions/{}_regions_gap_sites_spacers_recomb_rates.bed".format(region)
recomb_map_no_breaks = read_recomb_map(recomb_rates_bed)

In [13]:
# insert the 0.5-rate break records that separate the chromosomes
recomb_map = add_chrom_breaks(recomb_map=recomb_map_no_breaks)

Convert the coordinates of all regions/sites/gaps into SLiM's 0-based single segment coordinate system:

In [14]:
# add the `width`, `slim_start` and `slim_end` columns to every record
concat_map = concatenate_regions(regions=recomb_map)

In [15]:
# first records of the concatenated map (`slim_start` begins at 0)
concat_map.head(n=5)

Unnamed: 0,chrom,start,end,type,recomb_rate,width,slim_start,slim_end
0,chr1,139309,139379,region,3.354927e-08,70,0,69
1,chr1,139379,139380,spacer,3.124258e-08,1,70,70
2,chr1,367639,367658,region,2.887498e-08,19,71,89
3,chr1,367658,367659,spacer,2.838626e-08,1,90,90
4,chr1,622034,622053,region,2.655176e-08,19,91,109


In [16]:
# last records of the concatenated map (the final `slim_end` is the last simulated position)
concat_map.tail(n=5)

Unnamed: 0,chrom,start,end,type,recomb_rate,width,slim_start,slim_end
210709,chr22,51220722,51220779,region,6.48707e-09,57,10471876,10471932
210710,chr22,51220779,51220780,spacer,6.48707e-09,1,10471933,10471933
210711,chr22,51221196,51221714,region,6.48707e-09,518,10471934,10472451
210712,chr22,51221714,51221715,spacer,6.48707e-09,1,10472452,10472452
210713,chr22,51221928,51222091,region,6.48707e-09,163,10472453,10472615


Save the recombination map in a SLiM-friendly format (end-positions of regions and spacers, without the positions of informative sites since they don't have recombination rates themselves - the "spacers" around them do):

In [17]:
# everything except the informative sites goes into the recombination map file
recomb_map_path = "../data/slim_coords/{}_recomb_map.bed".format(region)
recomb_map_records = concat_map.query("type != 'site'")
recomb_map_records.to_csv(recomb_map_path, sep="\t", index=False)

<br><br><br>
## Save SLiM coordinates of all sites from the archaic admixture array

**Both sites in gaps and sites within the simulated regions of interest.**

Recombination map includes only positions of sites that fall outside of exonic regions. However, in order to simulate the sites from the archaic admixture array, we have to know the positions of sites _within_ exons too.

SLiM simulates all exons and individual sites as one continuous segment of concatenated regions, with the coordinates of exons and sites shifted appropriately. To obtain the coordinates of array sites within exons, we need to find out, for each site, which exon it falls in, and then calculate its position relative to the start of that exon.

Take the subset of array sites that lie inside exons:

In [18]:
# physical coordinates of the array sites that belong to the selected annotation class
region_sites_bed = "../data/bed/regions/{}_sites.bed".format(region)
region_sites = pd.read_table(region_sites_bed, names=["chrom", "start", "end"])

Get a DataFrame of the coordinates of exons that contain a site from the admixture array (will contain multiple copies of one exon if more than one site falls within that exon).

In [19]:
# Pair every array site with the region that contains it using ONE
# intersection (-wa -wb): each output line holds the site followed by the
# overlapping region. Splitting that table guarantees that row i of
# `region_sites` and row i of `regions_with_sites` describe the same
# site/region pair, whatever the chromosome order of the two inputs is.
# (Two separate intersections are only aligned if both inputs happen to be
# sorted identically and `concat_map` is in numeric chromosome order.)
site_columns = ["chrom", "start", "end"]
region_columns = ["chrom", "start", "end", "type", "recomb_rate", "width", "slim_start", "slim_end"]

site_region_pairs = BedTool.from_dataframe(region_sites[site_columns]) \
                           .intersect(BedTool.from_dataframe(concat_map.query("type == 'region'")[region_columns]),
                                      wa=True, wb=True) \
                           .to_dataframe(names=["site_" + column for column in site_columns] +
                                               ["region_" + column for column in region_columns])

# coordinates of the regions that contain a site from the admixture array
# (a region is repeated once for every site that falls within it)
regions_with_sites = site_region_pairs[["region_" + column for column in region_columns]].copy()
regions_with_sites.columns = region_columns

# some regions had a recombination rate equal to 0 and were filtered out
# we have to filter out sites that fall within these regions (since we
# couldn't simulate them anyway) - such sites overlap no remaining region
# and therefore do not appear in the intersection at all
region_sites = site_region_pairs[["site_" + column for column in site_columns]].copy()
region_sites.columns = site_columns

Calculate the position of each site relative to the start of "its" exon and convert this position into a SLiM single-segment coordinate (i.e. relative to position 0 of the simulated segment):

In [20]:
# The arithmetic below combines the two tables row by row, so row i of
# `region_sites` must be the site that lies in row i of `regions_with_sites`.
# Fail loudly instead of writing NaN / wrong coordinates if that is not true.
assert len(region_sites) == len(regions_with_sites), \
    "sites and the regions containing them are not paired one-to-one"
assert ((region_sites.chrom.values == regions_with_sites.chrom.values)
        & (regions_with_sites.start.values <= region_sites.start.values)
        & (region_sites.start.values < regions_with_sites.end.values)).all(), \
    "a site is paired with a region that does not contain it"

# offset of the site within its region, shifted by the region's SLiM start
region_sites["slim_start"] = region_sites.start - regions_with_sites.start + regions_with_sites.slim_start
# `slim_end` is set equal to `slim_start` for every site
region_sites["slim_end"] = region_sites.slim_start

In [21]:
# sites outside of the regions already carry SLiM coordinates in `concat_map`;
# keep only their physical and SLiM coordinates
site_records = concat_map.query("type == 'site'")
gap_sites = site_records.drop(columns=["type", "recomb_rate", "width"])

In [22]:
# all array sites (outside and inside regions) ordered along the simulated segment
all_sites = pd.concat([gap_sites, region_sites])
sites = all_sites.sort_values("slim_start").reset_index(drop=True)

In [25]:
# write the site coordinates as a tab-separated table with a header row
sites_path = "../data/slim_coords/{}_sites.bed".format(region)
sites.to_csv(sites_path, sep="\t", index=False)

## Save SLiM coordinates of exonic regions only

These coordinates are required to specify the genomic elements with the `initializeGenomicElement` function in SLiM.

In [27]:
# only the records labelled "region" are written to the regions file
regions_path = "../data/slim_coords/{}_regions.bed".format(region)
region_records = concat_map.query("type == 'region'")
region_records.to_csv(regions_path, sep='\t', index=False)

<br><br><br><br><br>
# Testing of the recombination map building functions

Sample 5 exons/sites for each chromosome to make a testing data set.

In [47]:
# Take the first 5 records of every chromosome (groups are visited in sorted
# key order, i.e. chr1, chr10, chr11, ...). The groups are stacked explicitly
# rather than through `groupby(...).apply(...)`: applying a function to frames
# that include the grouping column is deprecated in recent pandas, and without
# that column `chrom` would be lost after `reset_index(drop=True)`.
test_data = pd.concat([chrom_records.head() for _, chrom_records in recomb_map.groupby('chrom')]) \
              .reset_index(drop=True)

In [48]:
# peek at the beginning of the testing data set
test_data.head(n=5)

Unnamed: 0,chrom,start,end,type,recomb_rate
0,chr1,139309,139379,region,3.354927e-08
1,chr1,139379,139380,spacer,3.124258e-08
2,chr1,367639,367658,region,2.887498e-08
3,chr1,367658,367659,spacer,2.838626e-08
4,chr1,622034,622053,region,2.655176e-08


#### Test `add_chrom_breaks`

The function should add a 1bp spacer with a 0.5 recombination rate between chromosomes:

In [49]:
# show only the break records inserted into the testing data
test_with_breaks = add_chrom_breaks(test_data)
test_with_breaks.query("type == 'chrom_break'")

Unnamed: 0,chrom,start,end,type,recomb_rate
5,chr1,622053,622054,chrom_break,0.5
11,chr2,243562,243563,chrom_break,0.5
17,chr3,238746,238747,chrom_break,0.5
23,chr4,331772,331773,chrom_break,0.5
29,chr5,217333,217334,chrom_break,0.5
35,chr6,292560,292561,chrom_break,0.5
41,chr7,229557,229558,chrom_break,0.5
47,chr8,150563,150564,chrom_break,0.5
53,chr9,179072,179073,chrom_break,0.5
59,chr10,95241,95242,chrom_break,0.5


#### Test `concatenate_regions`

The function should concatenate all regions into a single continuous segment. This segment should start at 0 and its last position should be equal to the total length of all regions - 1.

In [51]:
# concatenate the test records of the first two chromosomes only
first_two_chroms = test_data.query("chrom == 'chr1' | chrom == 'chr2'")
concatenate_regions(first_two_chroms)

Unnamed: 0,chrom,start,end,type,recomb_rate,width,slim_start,slim_end
0,chr1,139309,139379,region,3.354927e-08,70,0,69
1,chr1,139379,139380,spacer,3.124258e-08,1,70,70
2,chr1,367639,367658,region,2.887498e-08,19,71,89
3,chr1,367658,367659,spacer,2.838626e-08,1,90,90
4,chr1,622034,622053,region,2.655176e-08,19,91,109
55,chr2,140690,140691,site,6.3525e-10,1,110,110
56,chr2,140691,140692,spacer,3.147195e-09,1,111,111
57,chr2,242800,242871,region,2.1597e-10,71,112,182
58,chr2,242871,242872,spacer,2.1597e-10,1,183,183
59,chr2,243502,243562,region,2.1597e-10,60,184,243


What is the correct end of the single continuous segment?

In [52]:
# expected last 0-based position: total width of all records minus one
first_two_chroms = test_data.query("chrom == 'chr1' | chrom == 'chr2'")
(first_two_chroms.end - first_two_chroms.start).sum() - 1

243

This value matches the `slim_end` value in the last row.