In [None]:
import time
import random
import subprocess

import tskit
import numpy as np
import matplotlib.pyplot as plt
import tsinfer

The first line of the simulation file tells us how the simulation was run:

In [None]:
# Run the external `head` command on the simulation output file.
subprocess.run(["head", "../test1M.out"])

In [None]:
# Run the external `tail` command on the simulation output file.
subprocess.run(["tail", "../test1M.out"])

I can use numpy to read data:

In [None]:
# Parse the file as a numeric table, skipping its first 6 lines
# (presumably the simulation header shown by `head` — confirm).
dat = np.loadtxt("../test1M.out", skiprows=6)
# Display the Python type of the loaded object.
type(dat)

Get information on the array content:

In [None]:
# Element type of the loaded array.
dat.dtype

In [None]:
# Dimensions of the loaded array (rows, columns).
dat.shape

Extract the genotype matrix:

In [None]:
# Genotypes are every column from index 2 onwards, stored as 16-bit integers.
gts = dat[:, 2:].astype(np.int16)
gts

In [None]:
# Dimensions of the genotype matrix; each row is later added as one site.
gts.shape

In [None]:
# Heat-map of the first 200 rows of the genotype matrix.
first_rows = gts[:200]
plt.imshow(first_rows, aspect="auto", cmap="viridis")
plt.show()

In [None]:
# First column of the table; it is scaled to absolute positions further down.
relPos = dat[:, 0]

In [None]:
# Distribution of the relative positions, in 100 bins.
plt.hist(relPos, bins=100)
plt.show()

In [None]:
# Scale by 1_000_000, the same value later passed as `sequence_length`.
absPos = relPos * 1_000_000

I need big integers to represent absolute positions

In [None]:
# Float -> int32 conversion; the fractional part is dropped, so distinct
# relative positions can collapse onto the same integer.
absPos = absPos.astype(np.int32)

In [None]:
# Distribution of the integer absolute positions, in 100 bins.
plt.hist(absPos, bins=100)
plt.show()

There could be positions that are identical:

In [None]:
# Total number of positions (duplicates included).
len(absPos.tolist())

In [None]:
# Number of distinct positions; smaller than the total if duplicates exist.
len(np.unique(absPos))

I need to create a new array: I want to remove duplicates by shifting each repeated position forward by one, without changing the original array:

In [None]:
# Work on a copy so that absPos keeps the raw (possibly duplicated) values.
absPosShift = absPos.copy()
# Walk left to right: any position that does not exceed its (already
# adjusted) predecessor is moved to predecessor + 1, which makes the
# sequence strictly increasing.
for idx in range(1, len(absPosShift)):
    previous = absPosShift[idx - 1]
    if absPosShift[idx] > previous:
        continue
    absPosShift[idx] = previous + 1

In [None]:
# Number of distinct shifted positions; should now equal the total count.
len(np.unique(absPosShift))

## Inference with TSINFER
### set sampledata

In [None]:
# Create the tsinfer sample-data container, stored at the given path.
# sequence_length matches the factor used to build the absolute positions.
samp = tsinfer.SampleData(
    sequence_length=1_000_000,
    path="mySamples.samples")

add population information

In [None]:
# Register the three populations. The individuals added afterwards refer to
# them by the integer IDs 0, 1 and 2 — presumably assigned in this insertion
# order (confirm against the tsinfer documentation).
samp.add_population(metadata={"name": "Mouflon"})
samp.add_population(metadata={"name": "Iranian"})
samp.add_population(metadata={"name": "Border"})

In [None]:
# Build one population ID per individual: 5 x 0, then 50 x 1, then 50 x 2
# (105 entries in total): [0, 0, 0, 0, 0, 1, 1, 1, ...]
popID = np.repeat([0, 1, 2], [5, 50, 50]).tolist()
# Iterate over the list itself so the number of individuals always follows
# the counts above instead of a separately hard-coded 105.
for population_id in popID:
    # Every individual is diploid, i.e. contributes two sample genomes.
    samp.add_individual(ploidy=2, population=population_id)

In [None]:
# Show the populations registered so far.
list(samp.populations())

add variant data:

In [None]:
# One site per row: pair each de-duplicated position with its genotype row.
for position, genotypes in zip(absPosShift, gts):
    samp.add_site(position, genotypes)

Then finalize

In [None]:
# Finalise the sample data now that all individuals and sites are added.
samp.finalise()

### TS inference

In [None]:
# Time the inference with perf_counter(): unlike time.time() it is monotonic
# and meant for measuring elapsed intervals.
t0 = time.perf_counter()
ts = tsinfer.infer(samp)
t1 = time.perf_counter()
# Elapsed time in seconds.
t1 - t0

In [None]:
# Replace the inferred tree sequence with its simplified version and show it.
ts = ts.simplify()
ts

In [None]:
# Draw the local tree at genomic position 110 as a 1000x200 SVG.
ts.at(110).draw_svg(size=(1000, 200))

In [None]:
# One boolean per site, True with probability 0.1. No seed is set in the
# visible code, so the selection differs from run to run.
swapIndex = [random.random() < 0.1 for _ in absPosShift]

make a new sampledata object:

In [None]:
# Second sample-data container with the same sequence length, stored in its
# own file so the first one is left untouched.
samp2 = tsinfer.SampleData(
    sequence_length=1_000_000,
    path="mySamples2.samples"
)

In [None]:
# Same sites and genotypes as before, but the ancestral allele is set to 1
# (instead of 0) wherever the matching swapIndex flag is True.
for position, genotypes, swapped in zip(absPosShift, gts, swapIndex):
    samp2.add_site(position, genotypes, ancestral_allele=int(swapped))

samp2.finalise()

In [None]:
# Infer a tree sequence from the data with partly swapped ancestral alleles.
ts2 = tsinfer.infer(samp2)

In [None]:
# Simplify the second tree sequence as was done for the first, then show it.
ts2 = ts2.simplify()
ts2

In [None]:
# Show the first tree sequence again for comparison with ts2.
ts

The diversity values are the same in both tree sequences, since they don't depend on the trees:

In [None]:
# Diversity for three sample sets (sample IDs 0-9, 10-109, 110-209), scaled
# by 100. NOTE(review): these ranges presumably map to the 5/50/50 diploid
# individuals per population added earlier — confirm.
ts.diversity([range(10), range(10, 110), range(110, 210)]) * 100

In [None]:
# Same diversity computation on the tree sequence with swapped ancestral
# alleles, using identical sample sets.
ts2.diversity([range(10), range(10, 110), range(110, 210)]) * 100

also Fst

In [None]:
# Fst between the first ten samples (0-9) and the last ten (200-209).
ts.Fst([range(10), range(200, 210)])

In [None]:
# Same Fst comparison on the second tree sequence.
ts2.Fst([range(10), range(200, 210)])

nearest neighbours change a little:

In [None]:
# Genealogical nearest neighbours of focal sample 0 with respect to the
# three sample sets 0-9, 10-109 and 110-209.
ts.genealogical_nearest_neighbours([0], [range(10), range(10, 110), range(110, 210)])

In [None]:
# Same nearest-neighbour query on the second tree sequence.
ts2.genealogical_nearest_neighbours([0], [range(10), range(10, 110), range(110, 210)])

In [None]:
# Side-by-side bars of the polarised allele frequency spectrum of the first
# ten samples, one series per tree sequence.
plt.figure(figsize=(10, 5))
bins = np.arange(11)  # 10 samples -> 11 frequency classes (0..10)
for offset, tree_seq, series_name in ((-.15, ts, "ts"), (.15, ts2, "ts2")):
    afs = tree_seq.simplify(range(10)).allele_frequency_spectrum(polarised=True)
    # Shift each series by half a bar width so the two do not overlap.
    plt.bar(bins + offset, afs, width=0.3, label=series_name)
plt.legend()
plt.show()