# Generating SLiM genomic structures and recombination map

In [2]:
from collections import defaultdict

import pandas as pd
from pybedtools import BedTool

In [2]:
!mkdir -p ../data/slim_coords

<br><br>
## Annotate the BED files with region coordinates & inter-region sites with recombination rate

### First add the gap regions between inter-region sites and regions themselves

In [3]:
%%bash
# Run add_spacers_to_bed.py on every *_regions_gap_sites.bed file and write the
# result next to the input, with the extension replaced by "_spacers.bed".
# Abort on the first failing command instead of silently moving on to the next file.
set -euo pipefail

for f in ../data/bed/regions/*_regions_gap_sites.bed; do
    # quote the paths so that file names containing whitespace survive word splitting
    python3 ../add_spacers_to_bed.py "$f" "${f%.*}_spacers.bed"
done

### Add the recombination rates for each region/site/spacer

In [4]:
%%bash
# For every *_regions_gap_sites_spacers.bed file:
#   1. bedmap echoes each record together with the number of overlapping
#      genetic-map elements, their scores and the overlap sizes,
#   2. get_window_average.py (called with the argument 4) post-processes that stream,
#   3. awk drops the rows whose 7th field is the string "0.0" or "0",
# and the result is written next to the input as *_recomb_rates.bed.
#
# pipefail makes a failure of bedmap or of the Python step abort the cell
# instead of leaving a truncated output file behind.
# NOTE(review): this cell calls `python` while the spacer cell calls `python3` -
# confirm both resolve to the same interpreter.
set -euo pipefail

genetic_map=../data/genetic_map/genetic_map_GRCh37_all.txt.bed

for f in ../data/bed/regions/*_regions_gap_sites_spacers.bed; do
    bedmap --ec --delim '\t' --echo --count --echo-map-score --echo-overlap-size "$f" "$genetic_map" \
        | python ../get_window_average.py 4 \
        | awk '($7 != "0.0") && ($7 != "0")' \
        > "${f%.*}_recomb_rates.bed"
done

<br><br><br>
## Conversion of physical coordinates into SLiM-based coordinate system

### Read the coordinates annotated with the genetic map

In [5]:
def read_recomb_map(bed_file):
    """Read a BED file of regions/sites/spacers annotated with recombination rates.

    Parameters
    ----------
    bed_file : str or path-like
        Header-less, tab-separated file with eight columns. Columns 1-4 are
        read as chrom/start/end/type and column 7 as the recombination rate
        (in cM/Mb); the remaining columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns ``chrom, start, end, type, recomb_rate`` where

        * every non-spacer record is relabelled "region" (wider than 1 bp)
          or "site" (exactly 1 bp wide),
        * ``recomb_rate`` is multiplied by 1e-8 (the format required by SLiM),
        * every "spacer" record is shrunk to 1 bp (``end = start + 1``).
    """
    # every column needs its own name - recent pandas versions refuse
    # duplicated entries in `names`
    all_columns = ["chrom", "start", "end", "type",
                   "unused_1", "length", "recomb_rate", "unused_2"]
    kept_columns = ["chrom", "start", "end", "type", "recomb_rate"]

    # explicit copy: the frame is modified in place below
    recomb_map = pd.read_table(bed_file, names=all_columns)[kept_columns].copy()

    # add labels to each region/site/spacer to make debugging easier
    # (regions of width 1 were filtered out already when the BED files were generated)
    widths = recomb_map["end"] - recomb_map["start"]
    not_spacer = recomb_map["type"] != "spacer"
    recomb_map.loc[(widths > 1) & not_spacer, "type"] = "region"
    recomb_map.loc[(widths == 1) & not_spacer, "type"] = "site"

    # convert the recombination rate (in cM/Mb) to format required by SLiM
    recomb_map["recomb_rate"] = recomb_map["recomb_rate"] * 1e-8

    # reduce the spacers to 1 bp (which is what we will simulate in SLiM)
    is_spacer = recomb_map["type"] == "spacer"
    recomb_map.loc[is_spacer, "end"] = recomb_map.loc[is_spacer, "start"] + 1

    return recomb_map

### Add "1 bp" chromosome breaks with recombination rate of 0.5

In [6]:
def add_chrom_breaks(recomb_map):
    """Insert a 1 bp "chrom_break" record with recombination rate 0.5 after each chromosome.

    Only records on chr1..chr22 are kept, in that chromosome order; within a
    chromosome the records are sorted by start position.

    Parameters
    ----------
    recomb_map : pandas.DataFrame
        Needs at least the columns ``chrom, start, end, type, recomb_rate``.

    Returns
    -------
    pandas.DataFrame
        The per-chromosome records, each chromosome followed by its
        "chrom_break" row, with a fresh 0..n-1 index. The break after the
        last chromosome present is dropped (nothing follows it). An empty
        frame is returned when no chr1..chr22 record is present.
    """
    columns = ["chrom", "start", "end", "type", "recomb_rate"]
    chrom_maps = []

    for chrom in ("chr" + str(number) for number in range(1, 23)):
        chrom_map = recomb_map.loc[recomb_map["chrom"] == chrom]

        # a chromosome without any record (e.g. in a subsetted map) gets no break
        if chrom_map.empty:
            continue

        # place the break right after the largest end coordinate so that it
        # always sorts last, regardless of the row order of the input
        chrom_end = chrom_map["end"].max()
        chrom_break = pd.DataFrame({"chrom": [chrom],
                                    "start": [chrom_end],
                                    "end":   [chrom_end + 1],
                                    "type":  ["chrom_break"],
                                    "recomb_rate": [0.5]},
                                   columns=columns)

        chrom_maps.append(pd.concat([chrom_map, chrom_break])
                            .sort_values(by=["chrom", "start"])
                            .reset_index(drop=True))

    if not chrom_maps:
        return recomb_map.iloc[0:0].reset_index(drop=True)

    # remove the very last base of the recombination map (it has a 0.5 recombination
    # rate anyway and there's no other chromosome after it)
    return pd.concat(chrom_maps).iloc[:-1].reset_index(drop=True)

### Concatenate regions/sites on all chromosomes as if they were directly adjacent on a single chromosome

SLiM simulates everything as a single noodle - we need to concatenate everything together and change the coordinates appropriately.

In [7]:
def concatenate_regions(regions):
    """Lay all records end-to-end on a single 0-based segment.

    Parameters
    ----------
    regions : pandas.DataFrame
        Needs the columns ``start`` and ``end``; the row order defines the
        order on the concatenated segment. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``regions`` with three extra columns:

        * ``width``      - ``end - start``
        * ``slim_start`` - first position of the record on the concatenated
          segment (0 for the first row)
        * ``slim_end``   - last position of the record (inclusive), so the
          last row ends at ``sum(width) - 1``

        An empty input yields an empty frame with the same extra columns.
    """
    concat_regions = regions.copy()

    width = regions["end"] - regions["start"]
    running_total = width.cumsum()

    concat_regions["width"] = width
    # total width up to and including this row, minus the row itself, gives
    # the summed width of all preceding rows (also well-defined for 0 rows)
    concat_regions["slim_start"] = running_total - width
    concat_regions["slim_end"] = running_total - 1

    return concat_regions

In [13]:
# Names of the genomic region classes to convert; each name is substituted
# into the "../data/bed/regions/{}_..." input paths and the
# "../data/slim_coords/{}_..." output paths.
regions = [
    "protein_coding",
    "utr5",
    "utr3",
    "tss_5k",
    "ctcf_binding_site",
    "enhancer",
    "open_chromatin",
    "promoter",
    "promoter_flank",
    "tf_binding_site",
    "priPhastCons",
]

In [14]:
# For every region class: build the SLiM recombination map, translate the
# informative sites into SLiM coordinates and write three tab-separated files
# ({region}_recomb_map.bed, {region}_sites.bed, {region}_regions.bed) into
# ../data/slim_coords/.
# Note that `recomb_map` stays bound to the map of the last region class after
# the loop finishes; the testing cell further down reads it.
for region in regions:
    # load the recombination map of regions and "gap sites"
    recomb_map_no_breaks = read_recomb_map("../data/bed/regions/{}_regions_gap_sites_spacers_recomb_rates.bed".format(region))
    # add a 1 bp break with recombination rate 0.5 after each chromosome
    recomb_map = add_chrom_breaks(recomb_map_no_breaks)
    
    # convert all coordinates into SLiM's 0-based coordinate system
    # (all chromosomes concatenated into a single continuous segment)
    concat_map = concatenate_regions(recomb_map)
    
    # save the recombination map (just regions and spacers, without
    # the informative sites since they don't have recombination rates
    # themselves - the "spacers" around them do)
    concat_map.query("type != 'site'") \
              .to_csv("../data/slim_coords/{}_recomb_map.bed".format(region), sep="\t", index=False)
    
    # load the subset of array sites that lie inside a region of interest
    region_sites = pd.read_table("../data/bed/regions/{}_sites.bed".format(region), names=["chrom", "start", "end"])
    
    # get a DataFrame of the coordinates of regions that contain sites from
    # the admixture array (will contain multiple copies of one region if more
    # than one site falls within that region); the rename maps the column
    # labels produced by to_dataframe() back to the names used in concat_map
    regions_with_sites = BedTool.from_dataframe(concat_map.query("type == 'region'")) \
                                .intersect(BedTool.from_dataframe(region_sites), wa=True) \
                                .to_dataframe() \
                                .rename(columns={"name": "type",
                                                 "score": 'recomb_rate',
                                                 "strand": 'width',
                                                 "thickStart": "slim_start",
                                                 "thickEnd": "slim_end"})
    
    # some regions had a recombination rate equal to 0 and were filtered out
    # and sites that fall within these regions have to be filtered out (since
    # they can't be simulated anyway)
    region_sites = BedTool.from_dataframe(region_sites) \
                          .intersect(BedTool.from_dataframe(concat_map.query("type == 'region'")), wa=True) \
                          .to_dataframe()
    
    # calculate the position of each site relative to the start of "its" region
    # and convert this position into SLiM's single-segment coordinate system
    # (i.e. relative to position 0 of the simulated segment)
    # NOTE(review): the arithmetic below pairs row i of `region_sites` with row i
    # of `regions_with_sites` (index alignment). This assumes both intersect
    # calls report the site/region overlaps in the same order and in equal
    # number - TODO confirm (e.g. no overlapping regions, sorted inputs).
    region_sites["slim_start"] = region_sites.start - regions_with_sites.start + regions_with_sites.slim_start
    region_sites["slim_end"] = region_sites.slim_start

    # concatenate the converted gap sites and region sites and save their coordinates
    gap_sites = concat_map.query("type == 'site'").drop(["type", "recomb_rate", "width"], axis=1)
    pd.concat([gap_sites, region_sites]) \
      .sort_values("slim_start") \
      .reset_index(drop=True) \
      .to_csv("../data/slim_coords/{}_sites.bed".format(region), sep="\t", index=False)
    
    # save SLiM coordinates of the regions themselves (will be used for calling
    # initializeGenomicElement function in SLiM)
    concat_map.query("type == 'region'") \
              .to_csv("../data/slim_coords/{}_regions.bed".format(region), sep='\t', index=False)

<br><br><br><br><br>
# Testing of the recombination map building functions

Take the first 5 records (regions/sites/spacers) of each chromosome to make a small testing data set.

In [15]:
# Small test set: the first five records of every chromosome, with the
# chromosomes in lexicographic order (chr1, chr10, chr11, ..., chr2, ...).
# A stable sort followed by GroupBy.head() gives the same rows as applying
# DataFrame.head to each group, but does not depend on groupby.apply passing
# the grouping column to the applied function (deprecated in recent pandas),
# so the `chrom` column is always kept.
test_data = recomb_map.sort_values("chrom", kind="mergesort") \
                      .groupby("chrom") \
                      .head(5) \
                      .reset_index(drop=True)

In [16]:
test_data.head()

Unnamed: 0,chrom,start,end,type,recomb_rate
0,chr1,63422,63598,region,2.082414e-08
1,chr1,63598,63599,spacer,2.082414e-08
2,chr1,69093,69446,region,2.082414e-08
3,chr1,69446,69447,spacer,2.082414e-08
4,chr1,69511,69745,region,2.082414e-08


#### Test `add_chrom_breaks`

The function should add a 1bp spacer with a 0.5 recombination rate between chromosomes:

In [17]:
add_chrom_breaks(test_data).query("type == 'chrom_break'")

Unnamed: 0,chrom,start,end,type,recomb_rate
5,chr1,69745,69746,chrom_break,0.5
11,chr2,175363,175364,chrom_break,0.5
17,chr3,85124,85125,chrom_break,0.5
23,chr4,367479,367480,chrom_break,0.5
29,chr5,192177,192178,chrom_break,0.5
35,chr6,200612,200613,chrom_break,0.5
41,chr7,92357,92358,chrom_break,0.5
47,chr8,116987,116988,chrom_break,0.5
53,chr9,17792,17793,chrom_break,0.5
59,chr10,181860,181861,chrom_break,0.5


#### Test `concatenate_regions`

The function should concatenate all regions into a single continuous segment. This segment should start at 0 and its last position should be equal to the total length of all regions - 1.

In [18]:
concatenate_regions(test_data.query("chrom == 'chr1' | chrom == 'chr2'"))

Unnamed: 0,chrom,start,end,type,recomb_rate,width,slim_start,slim_end
0,chr1,63422,63598,region,2.082414e-08,176,0,175
1,chr1,63598,63599,spacer,2.082414e-08,1,176,176
2,chr1,69093,69446,region,2.082414e-08,353,177,529
3,chr1,69446,69447,spacer,2.082414e-08,1,530,530
4,chr1,69511,69745,region,2.082414e-08,234,531,764
55,chr2,45489,45578,region,2.50805e-09,89,765,853
56,chr2,45578,45579,spacer,4.067903e-09,1,854,854
57,chr2,140690,140691,site,6.3525e-10,1,855,855
58,chr2,140691,140692,spacer,2.034159e-09,1,856,856
59,chr2,175333,175363,region,3.06585e-09,30,857,886


What is the correct end of the single continuous segment?

In [19]:
# Expected last coordinate of the concatenated segment: the summed widths of
# all chr1/chr2 test records, minus one because the coordinates are 0-based.
chr1_chr2 = test_data.query("chrom == 'chr1' | chrom == 'chr2'")
sum(chr1_chr2.end - chr1_chr2.start) - 1

886

This value matches the `slim_end` value in the last row.

<br><br><br><br><br>
# Building an artificial fake genome structure

In [181]:
!mkdir -p ../data/slim_coords/toy_genome

### Several "independent haplotypes"

* 1kb in size
* 0.5 recombination rate between them
* a single informative site in the middle of each

https://biology.stackexchange.com/a/19011/27272

In [182]:
# Toy genome made of N haplotype blocks, each `size` bp long, laid end-to-end
# in 0-based coordinates and separated by breaks with recombination rate 0.5.
N = 1000
size = 1000

# first coordinate of every block
block_starts = [i * size for i in range(N)]

# one informative site in the middle of each block; integer division keeps the
# coordinates integral instead of going through floats and a cast
sites = pd.DataFrame({"slim_start": [start + size // 2 for start in block_starts]}, dtype=int)

# the blocks themselves: inclusive [slim_start, slim_end] intervals
haps = pd.DataFrame({"slim_start": block_starts,
                     "slim_end": [start + size - 1 for start in block_starts]})[["slim_start", "slim_end"]]
haps["recomb_rate"] = 1e-8
haps["type"] = "region"

# a break runs from the last base of one block to the first base of the next
breaks = pd.DataFrame({"slim_start": haps.slim_end[:-1].values,
                       "slim_end": haps.slim_start[1:].values,
                       "recomb_rate": 0.5,
                       "type": "break"})[["slim_start", "slim_end", "recomb_rate", "type"]]

# interleave blocks and breaks by position
recomb_map = pd.concat([haps, breaks]).sort_values("slim_start").reset_index(drop=True)

In [183]:
recomb_map.tail(3)

Unnamed: 0,slim_start,slim_end,recomb_rate,type
1996,998000,998999,1e-08,region
1997,998999,999000,0.5,break
1998,999000,999999,1e-08,region


In [184]:
sites.tail(3)

Unnamed: 0,slim_start
997,997500
998,998500
999,999500


Save the recombination map:

In [185]:
recomb_map.to_csv("../data/slim_coords/toy_genome/independent_haps_recomb_map.bed", sep="\t", index=False)

Save the informative sites:

In [186]:
sites.to_csv("../data/slim_coords/toy_genome/independent_haps_sites.bed", sep="\t", index=False)

Save the regions:

In [187]:
# keep only the haplotype blocks (no breaks) and write them as a tab-separated file
independent_regions = recomb_map.query("type == 'region'")
independent_regions.to_csv("../data/slim_coords/toy_genome/independent_haps_regions.bed",
                           sep='\t',
                           index=False)

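### Several "linked haplotypes"

* 1kb in size
* separated by gaps with a recombination rate of `gap_len * 1e-8` instead of 0.5 (as if the haplotypes were spread evenly along a single 100 Mb chromosome)
* a single informative site in the middle of each

https://biology.stackexchange.com/a/19011/27272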

In [194]:
# length of the imaginary chromosome along which the N haplotype blocks are spread
chrom_len = 100_000_000

# share of the chromosome available to each block, and what remains of that
# share once the block itself (size bp) is subtracted: the gap between blocks
span_per_hap = chrom_len / N
gap_len = int(span_per_hap - size)

In [195]:
linked_recomb_map = recomb_map.copy()

In [196]:
# Replace the 0.5 rate of every break by gap_len * 1e-8, i.e. the rate
# accumulated over a gap of gap_len bp at 1e-8 per bp.
# The mask is taken from the frame that is being modified, so the assignment
# does not depend on `recomb_map` still having the same rows and index.
is_break = linked_recomb_map["type"] == "break"
linked_recomb_map.loc[is_break, "recomb_rate"] = gap_len * 1e-8
linked_recomb_map.head()

Unnamed: 0,slim_start,slim_end,recomb_rate,type
0,0,999,1e-08,region
1,999,1000,0.00099,break
2,1000,1999,1e-08,region
3,1999,2000,0.00099,break
4,2000,2999,1e-08,region


In [197]:
linked_recomb_map.to_csv("../data/slim_coords/toy_genome/linked_haps_recomb_map.bed", sep="\t", index=False)

Save the informative sites:

In [198]:
sites.to_csv("../data/slim_coords/toy_genome/linked_haps_sites.bed", sep="\t", index=False)

Save the regions:

In [199]:
# keep only the haplotype blocks (no breaks) and write them as a tab-separated file
linked_regions = linked_recomb_map.query("type == 'region'")
linked_regions.to_csv("../data/slim_coords/toy_genome/linked_haps_regions.bed",
                      sep='\t',
                      index=False)