-
Notifications
You must be signed in to change notification settings - Fork 2
/
sim.py
193 lines (141 loc) · 5.55 KB
/
sim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# -*- coding: utf-8 -*-
"""
Extremely naive simulation functions to generate genotype data for
illustration of other features in the ``anhima`` package.
"""
from __future__ import division, print_function, absolute_import
# python standard library dependencies
import random
from anhima.compat import range
# third party dependencies
import numpy as np
import scipy
def simulate_biallelic_genotypes(n_variants, n_samples, af_dist,
p_missing=.1,
ploidy=2):
"""Simulate genotypes at biallelic variants for a population in
Hardy-Weinberg equilibrium
Parameters
----------
n_variants : int
The number of variants.
n_samples : int
The number of samples.
af_dist : frozen continuous random variable
The distribution of allele frequencies.
p_missing : float, optional
The fraction of missing genotype calls.
ploidy : int, optional
The sample ploidy.
Returns
-------
genotypes : ndarray, int8
An array of shape (n_variants, n_samples, ploidy) where each
element of the array is an integer corresponding to an allele index
(-1 = missing, 0 = reference allele, 1 = alternate allele).
"""
# initialise output array
genotypes = np.empty((n_variants, n_samples, ploidy), dtype='i1')
# generate allele frequencies under the given distribution
af = af_dist.rvs(n_variants)
# freeze binomial distribution to model missingness
miss_dist = scipy.stats.binom(p=p_missing, n=n_samples)
# iterate over variants
for i, p in zip(range(n_variants), af):
# randomly generate alleles under the given allele frequency
# ensure p is valid probability
p = min(p, 1)
alleles = scipy.stats.bernoulli.rvs(p, size=n_samples*ploidy)
# reshape alleles as genotypes under the given ploidy
genotypes[i] = alleles.reshape(n_samples, ploidy)
# simulate some missingness
n_missing = miss_dist.rvs()
missing_indices = random.sample(range(n_samples),
n_missing)
genotypes[i, missing_indices] = (-1,) * ploidy
return genotypes
def simulate_genotypes_with_ld(n_variants, n_samples, correlation=0.2):
"""A very simple function to simulate a set of genotypes, where
variants are in some degree of linkage disequilibrium with their
neighbours.
Parameters
----------
n_variants : int
The number of variants to simulate data for.
n_samples : int
The number of individuals to simulate data for.
correlation : float, optional
The fraction of samples to copy genotypes between neighbouring
variants.
Returns
-------
gn : ndarray, int8
A 2-dimensional array of shape (n_variants, n_samples) where each
element is a genotype call coded as a single integer counting the
number of non-reference alleles.
"""
# initialise an array of random genotypes
gn = np.random.randint(size=(n_variants, n_samples), low=0, high=3)
gn = gn.astype('i1')
# determine the number of samples to copy genotypes for
n_copy = int(correlation * n_samples)
# introduce linkage disequilibrium by copying genotypes from one sample to
# the next
for i in range(1, n_variants):
# randomly pick the samples to copy from
sample_indices = random.sample(range(n_samples), n_copy)
# view genotypes from the previous variant for the selected samples
c = gn[i-1, sample_indices]
# randomly choose whether to invert the correlation
inv = random.randint(0, 1)
if inv:
c = 2-c
# copy across genotypes
gn[i, sample_indices] = c
return gn
def simulate_relatedness(genotypes, relatedness=.5, n_iter=1000, copy=True):
"""
Simulate relatedness by randomly copying genotypes between individuals.
Parameters
----------
genotypes : array_like
An array of shape (n_variants, n_samples, ploidy) where each
element of the array is an integer corresponding to an allele index
(-1 = missing, 0 = reference allele, 1 = first alternate allele,
2 = second alternate allele, etc.).
relatedness : float, optional
Fraction of variants to copy genotypes for.
n_iter : int, optional
Number of times to randomly copy genotypes between individuals.
copy : bool, optional
If False, modify `genotypes` in place.
Returns
-------
genotypes : ndarray, shape (n_variants, n_samples, ploidy)
The input genotype array but with relatedness simulated.
"""
# check genotypes array
genotypes = np.asarray(genotypes)
assert genotypes.ndim >= 2
n_variants = genotypes.shape[0]
n_samples = genotypes.shape[1]
# copy input array
if copy:
genotypes = genotypes.copy()
else:
# modify in place
pass
# determine the number of variants to copy genotypes for
n_copy = int(relatedness * n_variants)
# iteratively introduce relatedness
for i in range(n_iter):
# randomly choose donor and recipient
donor_index = random.randint(0, n_samples-1)
donor = genotypes[:, donor_index]
recip_index = random.randint(0, n_samples-1)
recip = genotypes[:, recip_index]
# randomly pick a set of variants to copy
variant_indices = random.sample(range(n_variants), n_copy)
# copy across genotypes
recip[variant_indices] = donor[variant_indices]
return genotypes