# Generating SLiM genomic structures and recombination map

In [1]:
from pybedtools import BedTool
import pandas as pd

In [2]:
# create the output directory for the SLiM coordinate files written further below
# (-p: no error if it already exists)
!mkdir -p ../data/slim_coords

<br><br>
## Annotate the BED files with region coordinates & inter-region sites with recombination rate

### First add the gap regions between inter-region sites and regions themselves

In [3]:
%%bash
# stop at the first failing command so a broken run is not mistaken for success
set -euo pipefail

# run add_spacers_to_bed.py on every "*_unif_regions_gap_sites.bed" file; the second
# argument is the same path with the extension replaced by "_spacers.bed"
# (presumably input path followed by output path - confirm against the script)
for f in ../data/bed/regions/*_unif_regions_gap_sites.bed; do
    python3 ../code/add_spacers_to_bed.py "$f" "${f%.*}_spacers.bed"
done

### Add the recombination rates for each region/site/spacer

In [4]:
%%bash
# stop at the first failing command; pipefail makes a failure of any pipeline stage
# (not just the final awk) count as a failure of the whole pipeline
set -euo pipefail

# for every "*_spacers.bed" file: map the genetic map onto each interval with bedmap,
# pass the result through get_window_average.py, and keep only the rows whose 7th
# column is neither "0.0" nor "0" (the 7th column is later read as recomb_rate)
for f in ../data/bed/regions/*_unif_regions_gap_sites_spacers.bed; do
    bedmap --ec --delim '\t' --echo --count --echo-map-score --echo-overlap-size "$f" ../data/genetic_map/genetic_map_GRCh37_all.txt.bed \
        | python ../code/get_window_average.py 4 \
        | awk '($7 != "0.0") && ($7 != "0")' \
        > "${f%.*}_recomb_rates.bed"
done

<br><br><br>
## Conversion of physical coordinates into SLiM-based coordinate system

### Read the coordinates annotated with the genetic map

In [5]:
def read_recomb_map(bed_file, uniform=False):
    """Read a recombination-annotated BED file and prepare it for SLiM.

    Parameters
    ----------
    bed_file : str
        Path to a headerless, tab-separated file with eight columns, read as
        chrom, start, end, type, width, length, recomb_rate and one unused column.
    uniform : bool, optional
        If True, a rate of 1 is used in place of the `recomb_rate` column when
        converting the rates of regions and spacers.

    Returns
    -------
    pandas.DataFrame
        Columns chrom, start, end, type, recomb_rate. Rows not labelled "spacer"
        are relabelled "region" (wider than 1 bp) or "site" (exactly 1 bp).
        Region rates are multiplied by 1e-8, spacer rates by 1e-8 and by their
        `length`, and spacers are shrunk to 1 bp. Site rates are left as read.
    """
    all_columns = ["chrom", "start", "end", "type", "width", "length", "recomb_rate", "xxx"]
    kept_columns = ["chrom", "start", "end", "length", "type", "recomb_rate"]
    annotated = pd.read_table(bed_file, names=all_columns)[kept_columns]

    # label every non-spacer row by its width to make debugging easier
    # (regions of width 1 were filtered out already when the BED files were generated)
    span = annotated.end - annotated.start
    not_spacer = annotated.type != "spacer"
    annotated.loc[not_spacer & (span > 1), "type"] = "region"
    annotated.loc[not_spacer & (span == 1), "type"] = "site"

    is_region = annotated.type == "region"
    is_spacer = annotated.type == "spacer"

    # convert the rate (in cM/Mb) to the format required by SLiM; a spacer stands
    # for its whole original interval, so its rate is scaled by that length
    per_bp_rate = (1 if uniform else annotated.recomb_rate) * 1e-8
    annotated.loc[is_region, "recomb_rate"] = per_bp_rate
    annotated.loc[is_spacer, "recomb_rate"] = per_bp_rate * annotated.length

    # reduce the spacers to 1 bp (which is what we will simulate in SLiM)
    annotated.loc[is_spacer, "end"] = annotated.loc[is_spacer, "start"] + 1

    return annotated.drop(columns=["length"])

In [6]:
# preview the protein-coding map with uniform=True (region/spacer rates derived from a rate of 1)
read_recomb_map("../data/bed/regions/protein_coding_unif_regions_gap_sites_spacers_recomb_rates.bed", uniform=True).head()

Unnamed: 0,chrom,start,end,type,recomb_rate
0,chr1,0,1,site,2.981822
1,chr1,1,2,spacer,5e-05
2,chr1,5000,5001,site,2.981822
3,chr1,5001,5002,spacer,5e-05
4,chr1,10000,10001,site,2.981822


In [7]:
# preview the same map with the default uniform=False (rates taken from the recomb_rate column)
read_recomb_map("../data/bed/regions/protein_coding_unif_regions_gap_sites_spacers_recomb_rates.bed").head()

Unnamed: 0,chrom,start,end,type,recomb_rate
0,chr1,0,1,site,2.981822
1,chr1,1,2,spacer,0.000149
2,chr1,5000,5001,site,2.981822
3,chr1,5001,5002,spacer,0.000149
4,chr1,10000,10001,site,2.981822


### Add "1 bp" chromosome breaks with recombination rate of 0.5

In [8]:
def add_chrom_breaks(recomb_map):
    """Append a 1 bp "chrom_break" row with recombination rate 0.5 after each chromosome.

    Chromosomes chr1..chr22 are processed in numeric order; rows on any other
    chromosome are not carried over. Chromosomes without any rows are skipped.

    Parameters
    ----------
    recomb_map : pandas.DataFrame
        Frame with at least the columns chrom, start, end, type, recomb_rate.

    Returns
    -------
    pandas.DataFrame
        Rows of chr1..chr22 sorted by start within each chromosome, with a
        "chrom_break" row after every chromosome except the last one present.
        The index is reset to 0..n-1. An empty frame is returned if none of
        chr1..chr22 has any rows.
    """
    chrom_maps = []

    for chrom in range(1, 23):
        chrom_name = "chr" + str(chrom)
        chrom_map = recomb_map.loc[recomb_map.chrom == chrom_name]

        # nothing to separate if this chromosome has no rows at all
        if chrom_map.empty:
            continue

        # place the break right after the largest end coordinate, so the result
        # does not depend on the row order of the input
        break_start = chrom_map.end.max()
        chrom_break = pd.DataFrame({"chrom": [chrom_name],
                                    "start": [break_start],
                                    "end":   [break_start + 1],
                                    "type":  ["chrom_break"],
                                    "recomb_rate": [0.5]},
                                   columns=["chrom", "start", "end", "type", "recomb_rate"])

        chrom_maps.append(pd.concat([chrom_map, chrom_break]).sort_values(by=['chrom', 'start']).reset_index(drop=True))

    if not chrom_maps:
        return recomb_map.iloc[0:0].reset_index(drop=True)

    # remove the very last base of the recombination map (it has a 0.5 recombination
    # rate anyway and there's no other chromosome after it)
    return pd.concat(chrom_maps)[:-1].reset_index(drop=True)

### Concatenate regions/sites on all chromosomes as if they were directly adjacent on a single chromosome

SLiM simulates everything as a single noodle - we need to concatenate everything together and change the coordinates appropriately.

In [9]:
def concatenate_regions(regions):
    """Lay out all rows back to back on a single 0-based segment.

    Parameters
    ----------
    regions : pandas.DataFrame
        Frame with integer `start` and `end` columns; rows are concatenated in
        their current order. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with three added columns: `width` (end - start),
        `slim_start` (sum of the widths of all preceding rows) and `slim_end`
        (slim_start + width - 1). An empty input yields an empty frame with
        the same added columns.
    """
    concat_regions = regions.copy()

    width = regions.end - regions.start
    # running total of widths = position just past the end of each row
    cumulative_width = width.cumsum()

    concat_regions["width"] = width
    concat_regions["slim_start"] = cumulative_width - width
    concat_regions["slim_end"] = cumulative_width - 1

    return concat_regions

In [10]:
# names of the region sets to convert; each one is substituted into the
# "{}_unif_..." file name templates used by the conversion loop
regions = [
    "merged",
    "exon",
    "protein_coding",
    "utr5",
    "utr3",
    "tss_5k",
    "ctcf_binding_site",
    "enhancer",
    "open_chromatin",
    "promoter",
    "promoter_flank",
    "tf_binding_site",
    "priPhastCons",
]

In [11]:
for region in regions:
    # load the recombination map of regions and "gap sites"
    recomb_map_no_breaks = read_recomb_map("../data/bed/regions/{}_unif_regions_gap_sites_spacers_recomb_rates.bed".format(region))
    recomb_map = add_chrom_breaks(recomb_map_no_breaks)

    # convert all coordinates into SLiM's 0-based coordinate system
    # (all chromosomes concatenated into a single continuous segment)
    concat_map = concatenate_regions(recomb_map)

    # save the recombination map (just regions and spacers, without
    # the informative sites since they don't have recombination rates
    # themselves - the "spacers" around them do)
    concat_map.query("type != 'site'") \
              .to_csv("../data/slim_coords/{}_unif_recomb_map.bed".format(region), sep="\t", index=False)

    # load the subset of array sites that lie inside a region of interest
    region_sites = pd.read_table("../data/bed/regions/{}_unif_sites.bed".format(region), names=["chrom", "start", "end"])

    # regions that made it into the SLiM map (some regions had a recombination
    # rate equal to 0 and were filtered out earlier)
    slim_regions = concat_map.query("type == 'region'")

    # pair every site with the region that contains it in a single intersect
    # (-wa -wb writes the site followed by the overlapping region on one line).
    # Sites that fall within filtered-out regions have no overlap and drop out
    # here, since they can't be simulated anyway. Keeping site and region on the
    # same row means the conversion below does not rely on two separately
    # produced tables happening to be in the same row order.
    pair_columns = ["chrom", "start", "end"] + ["region_" + column for column in slim_regions.columns]
    sites_in_regions = BedTool.from_dataframe(region_sites) \
                              .intersect(BedTool.from_dataframe(slim_regions), wa=True, wb=True) \
                              .to_dataframe(names=pair_columns)

    # calculate the position of each site relative to the start of "its" region
    # and convert this position into SLiM's single-segment coordinate system
    # (i.e. relative to position 0 of the simulated segment)
    region_sites = sites_in_regions[["chrom", "start", "end"]].copy()
    region_sites["slim_start"] = sites_in_regions.start - sites_in_regions.region_start + sites_in_regions.region_slim_start
    region_sites["slim_end"] = region_sites.slim_start
    region_sites["within"] = "region"

    # concatenate the converted gap sites and region sites and save their coordinates
    gap_sites = concat_map.query("type == 'site'").drop(["type", "recomb_rate", "width"], axis=1)
    gap_sites["within"] = "gap"

    pd.concat([gap_sites, region_sites]) \
      .sort_values("slim_start") \
      .reset_index(drop=True) \
      .to_csv("../data/slim_coords/{}_unif_all_sites.bed".format(region), sep="\t", index=False)

    # save SLiM coordinates of the regions themselves (will be used for calling
    # initializeGenomicElement function in SLiM)
    slim_regions.to_csv("../data/slim_coords/{}_unif_regions.bed".format(region), sep='\t', index=False)

<br><br><br><br><br>
# Testing of the recombination map building functions

Take the first 5 rows (regions/sites/spacers) of each chromosome to make a small testing data set.

In [12]:
# build the test set from `recomb_map` as left over from the last iteration of the
# conversion loop: the leading rows of every chromosome, re-indexed from 0
test_data = (
    recomb_map
    .groupby('chrom')
    .apply(lambda chrom_rows: chrom_rows.head())
    .reset_index(drop=True)
)

In [13]:
# preview the first rows of the test data
test_data.head()

Unnamed: 0,chrom,start,end,type,recomb_rate
0,chr1,0,1,site,2.981822
1,chr1,1,2,spacer,0.000149
2,chr1,5000,5001,site,2.981822
3,chr1,5001,5002,spacer,0.000149
4,chr1,10000,10001,site,2.981822


#### Test `add_chrom_breaks`

The function should add a 1bp spacer with a 0.5 recombination rate between chromosomes:

In [14]:
# show only the rows that add_chrom_breaks inserted
add_chrom_breaks(test_data).query("type == 'chrom_break'")

Unnamed: 0,chrom,start,end,type,recomb_rate
5,chr1,10001,10002,chrom_break,0.5
11,chr2,10001,10002,chrom_break,0.5
17,chr3,10001,10002,chrom_break,0.5
23,chr4,10001,10002,chrom_break,0.5
29,chr5,10001,10002,chrom_break,0.5
35,chr6,10001,10002,chrom_break,0.5
41,chr7,10001,10002,chrom_break,0.5
47,chr8,10001,10002,chrom_break,0.5
53,chr9,10001,10002,chrom_break,0.5
59,chr10,10001,10002,chrom_break,0.5


#### Test `concatenate_regions`

The function should concatenate all regions into a single continuous segment. This segment should start at 0 and its last position should be equal to the total length of all regions - 1.

In [15]:
# concatenate only the chr1 and chr2 rows of the test data
concatenate_regions(test_data.query("chrom == 'chr1' | chrom == 'chr2'"))

Unnamed: 0,chrom,start,end,type,recomb_rate,width,slim_start,slim_end
0,chr1,0,1,site,2.981822,1,0,0
1,chr1,1,2,spacer,0.000149,1,1,1
2,chr1,5000,5001,site,2.981822,1,2,2
3,chr1,5001,5002,spacer,0.000149,1,3,3
4,chr1,10000,10001,site,2.981822,1,4,4
55,chr2,0,1,site,0.339408,1,5,5
56,chr2,1,2,spacer,1.7e-05,1,6,6
57,chr2,5000,5001,site,0.339408,1,7,7
58,chr2,5001,5002,spacer,1.7e-05,1,8,8
59,chr2,10000,10001,site,0.339408,1,9,9


What is the correct end of the single continuous segment?

In [16]:
# total width (end - start) of the chr1 and chr2 test rows, minus 1 because
# the concatenated segment starts at position 0
sum((test_data.query("chrom == 'chr1' | chrom == 'chr2'")).end -
    (test_data.query("chrom == 'chr1' | chrom == 'chr2'")).start) - 1

9

This value matches the `slim_end` value in the last row.