-
Notifications
You must be signed in to change notification settings - Fork 0
/
ex3.R
145 lines (91 loc) · 3.95 KB
/
ex3.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
library(slendr)
init_env()
# Exercise #1 ------------------------------------------------------------------
# chimpanzee outgroup
chimp <- population("CHIMP", time = 7e6, N = 5000)
# two populations of anatomically modern humans: Africans and Europeans
afr <- population("AFR", parent = chimp, time = 6e6, N = 15000)
eur <- population("EUR", parent = afr, time = 70e3, N = 3000)
# Neanderthal population splitting at 600 ky ago from modern humans
# (becomes extinct by 40 ky ago)
nea <- population("NEA", parent = afr, time = 600e3, N = 1000, remove = 40e3)
# Neanderthal introgression event (3% admixture between 55-50 kya)
gf <- gene_flow(from = nea, to = eur, rate = 0.03, start = 55000, end = 50000)
# compile the entire model into a single object
model <- compile_model(
populations = list(chimp, nea, afr, eur),
gene_flow = gf,
generation_time = 30
)
plot_model(model, proportions = TRUE)
plot_model(model, proportions = TRUE, log = TRUE)
# if the command below takes way too much time to run, feel free to go down from
# 100 Mb sequence_length to even 10Mb (it doesn't matter much)
ts <-
msprime(model, sequence_length = 100e6, recombination_rate = 1e-8) %>%
ts_mutate(mutation_rate = 1e-8)
# ts_save(ts, "ex3.trees")
# ts <- ts_load("ex3.trees", model)
# Exercise #4 ------------------------------------------------------------------
library(dplyr)
library(ggplot2)
# first get a (named!) list of individuals in each population
samples <- ts_names(ts, split = "pop")
samples
lapply(samples, head, 5) # get just 5 samples from each population
# compute nucleotide diversity (pi) in each population
pi <- ts_diversity(ts, sample_sets = samples)
arrange(pi, diversity)
# heterozygosity in a single sample
ts_diversity(ts, "NEA_1")
# compute pairwise population divergence
div <- ts_divergence(ts, sample_sets = samples)
arrange(div, divergence)
# outgroup f3
ts_f3(ts, B = "AFR_1", C = "EUR_1", A = "CHIMP_1")
samples[["AFR"]] %>% head(5)
samples[["EUR"]] %>% head(5)
ts_f3(ts, B = samples["AFR"], C = samples["EUR"], A = "CHIMP_1")
ts_f3(ts, B = samples["AFR"], C = samples["NEA"], A = "CHIMP_1")
ts_f3(ts, B = samples["EUR"], C = samples["NEA"], A = "CHIMP_1")
# outgroup f3 as a linear combination of f2 statistics
# branch-based version
ts_f3(ts, B = "AFR_1", C = "AFR_2", A = "CHIMP_1", mode = "branch")
my_f3 <- (
ts_f2(ts, A = "AFR_1", B = "CHIMP_1", mode = "branch")$f2 +
ts_f2(ts, A = "AFR_2", B = "CHIMP_1", mode = "branch")$f2 -
ts_f2(ts, A = "AFR_1", B = "AFR_2", mode = "branch")$f2
) / 2
my_f3
# admixture detection -----------------------------------------------------
# D(AFR, EUR; NEA, CHIMP) ~ (BABA - ABBA) / (BABA + ABBA)
ts_f4(ts, W = "AFR_1", X = "AFR_2", Y = "NEA_1", Z = "CHIMP_1")
ts_f4(ts, W = "AFR_1", X = "EUR_1", Y = "NEA_1", Z = "CHIMP_1")
set.seed(42)
afr_samples <- samples$AFR %>% sample(25)
eur_samples <- samples$EUR %>% sample(25)
f4_afr <- lapply(afr_samples, function(x) ts_f4(ts, W = "AFR_1", X = x, Y = "NEA_1", Z = "CHIMP_1")) %>% bind_rows()
f4_eur <- lapply(eur_samples, function(x) ts_f4(ts, W = "AFR_1", X = x, Y = "NEA_1", Z = "CHIMP_1")) %>% bind_rows()
f4_afr$pop <- "AFR"
f4_eur$pop <- "EUR"
rbind(f4_afr, f4_eur) %>%
ggplot(aes(pop, f4, color = pop)) +
geom_boxplot() +
geom_jitter() +
geom_hline(yintercept = 0, linetype = 2) +
ggtitle("f4(AFR, EUR; NEA, CHIMP)")
# unique quartets
# # install a combinatorics R package
# install.packages("combinat")
quartet <- c("AFR_1", "EUR_1", "NEA_1", "CHIMP_1")
quartets <- combinat::permn(quartet)
# how many permutations there are in total?
# 4! = 4 * 3 * 2 * 1 = 24
length(quartets)
# loop across all quartets, computing the corresponding f4 statistic
all_f4s <- lapply(quartets, function(q) ts_f4(ts, q[1], q[2], q[3], q[4], mode = "branch"))
# bind the list of f4 results into a single data frame
all_f4s <- bind_rows(all_f4s) %>% arrange(abs(f4))
print(all_f4s, n = Inf)
distinct(all_f4s, f4, .keep_all = TRUE)
distinct(all_f4s, abs(f4), .keep_all = TRUE)