# Try to analyze simulated data


In [None]:
import tskit
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the tree sequence file used throughout the rest of the notebook.
ts = tskit.load("../test1M.out.inf.ts")
# Bare name as the last expression: shown as the cell output.
ts

*Genomes and breeds* in this dataset:
* 0-9 mouflon
* 10-109 iranian
* 110-209 Border

The first tree covers the very edge of the chromosome, so it is not meaningful:

In [None]:
# Draw the first tree of the tree sequence as an SVG (default size).
ts.first().draw_svg()

Get the tree at a given position and change the image size:

In [None]:
# Draw the tree covering position 110 in a 500x500 SVG.
ts.at(110).draw_svg(size=(500, 500))

Simplify the trees by removing unary nodes (nodes with a single child, i.e. individual ancestors that do not differ before and after):

In [None]:
# Simplify the whole tree sequence first, then draw the tree at position 110.
ts.simplify().at(110).draw_svg(size=(500, 500))

`simplify` can also down-sample a tree sequence. Select a few genomes from each group:

In [None]:
# Keep the first five genomes of each group (mouflon 0-4, iranian 10-14,
# Border 110-114), then draw the tree at position 110.
ts.simplify(
    [*range(5), *range(10, 15), *range(110, 115)]
).at(110).draw_svg(size=(500, 500))

Try to label these nodes:

In [None]:
# Map node IDs 0-14 to a group letter: 0-4 -> "M", 5-9 -> "I", 10-14 -> "B".
dict(enumerate("M" * 5 + "I" * 5 + "B" * 5))

In [None]:
# Down-sample to five genomes per group, then draw the tree at position 110
# with one-letter group labels (M, I, B) on nodes 0-14, a y axis, and the
# sites omitted from the drawing.
ts.simplify(
    [*range(5), *range(10, 15), *range(110, 115)]
).at(110).draw_svg(
    size=(500, 500),
    omit_sites=True,
    node_labels=dict(enumerate("M" * 5 + "I" * 5 + "B" * 5)),
    y_axis=True,
)

The B samples cluster together, while M is not a very good outgroup, since one of the M samples is closer to I.

## Inspect GT matrix
The genotype matrix is not stored, but it can be derived from the tree sequence:

In [None]:
# Derive the genotype matrix from the tree sequence and display it.
ts.genotype_matrix()

In [None]:
# Show the whole genotype matrix as an image; aspect="auto" lets the image
# fill the axes instead of forcing square pixels.
plt.imshow(ts.genotype_matrix(), aspect="auto")
# Rows are labelled as sites, columns as haplotypes.
plt.ylabel("Sites")
plt.xlabel("Haplotypes (genomes)")

Subset the matrix (first 200 sites only):

In [None]:
# Same image restricted to the first 200 rows (sites), all columns.
plt.imshow(ts.genotype_matrix()[:200, :], aspect="auto")
plt.ylabel("Sites")
plt.xlabel("Haplotypes (genomes)")

## Genetic statistics
What about summary statistics? Nucleotide diversity for all individuals:

In [None]:
# Diversity with default arguments (no explicit sample sets or windows).
ts.diversity()

Nucleotide diversity per group (mouflon, Iranian, Border):

In [None]:
# One diversity value per sample set: genomes 0-9, 10-109 and 110-209.
# NOTE(review): the result is multiplied by 100 — presumably to read it as a
# percentage; confirm the intent.
ts.diversity([range(10), range(10, 110), range(110, 210)]) * 100

Compare pairwise Fst. This requires defining the groups and then the pairs of groups to compare:

In [None]:
# Each entry of `indexes` is a pair of positions in the sample-set list
# (0 = genomes 0-9, 1 = genomes 10-109, 2 = genomes 110-209), so one Fst
# value is requested for each of the pairs 0-1, 0-2 and 1-2.
ts.Fst([range(10), range(10, 110), range(110, 210)], indexes=[[0,1], [0,2], [1,2]])

In [None]:
# Pairwise Fst in 100 equal-width windows along the genome.
# tskit requires the window breakpoints to start at 0 and end exactly at the
# sequence length, so take the right edge from the tree sequence itself
# instead of hard-coding 1 Mb (which fails for any other sequence length).
fst = ts.Fst(
    [range(10), range(10, 110), range(110, 210)],
    indexes=[[0, 1], [0, 2], [1, 2]],
    windows=np.linspace(0, ts.sequence_length, 101),
)
# Display the result: one row per window, one column per pair in `indexes`.
fst

In [None]:
# One line per population pair; column order follows the `indexes` pairs used
# to compute `fst`. With no explicit x values matplotlib plots against the row
# index 0..N-1, so the plot no longer depends on a hard-coded window count.
plt.plot(fst[:, 0], label="M-I")
plt.plot(fst[:, 1], label="M-B")
plt.plot(fst[:, 2], label="I-B")
plt.xlabel("Window index")
plt.ylabel("Fst")
plt.legend()
plt.show()

## Genealogical nearest neighbours
Which populations contain the nearest neighbours of the genome with a given *index*?

In [None]:
# Focal genome 0, evaluated against the three reference sets
# (genomes 0-9, 10-109 and 110-209).
ts.genealogical_nearest_neighbours([0], [range(10), range(10, 110), range(110, 210)])

In [None]:
# Same call with ten focal genomes (110-119) against the same three sets.
ts.genealogical_nearest_neighbours(range(110, 120), [range(10), range(10, 110), range(110, 210)])

In [None]:
# Boolean mask over sites for the first two genomes (columns 0 and 1):
# True where their genotype values do NOT sum to 1, i.e. the site is not a
# 0/1 heterozygous pair.
np.sum(ts.genotype_matrix()[:, :2], axis=1) != 1

In [None]:
# Positions of the sites where genomes 0 and 1 carry genotype values summing
# to exactly 1 (one copy of allele 0 and one of allele 1).
hetPos = ts.sites_position[ts.genotype_matrix()[:, :2].sum(axis=1) == 1]

In [None]:
def getRohLengths(g0, g1, tr=20000):
    """Return the gaps between consecutive heterozygous sites longer than ``tr``.

    Reads the notebook-level ``ts`` tree sequence. A site counts as
    heterozygous when the genotype values of columns ``g0`` and ``g1`` of the
    genotype matrix sum to exactly 1.

    Args:
        g0: Column index of the first genome in the genotype matrix.
        g1: Column index of the second genome in the genotype matrix.
        tr: Minimum gap length; only gaps strictly greater than this are kept.

    Returns:
        Array of distances between successive heterozygous site positions
        that exceed ``tr``.
    """
    pair_genotypes = ts.genotype_matrix()[:, [g0, g1]]
    is_het = pair_genotypes.sum(axis=1) == 1
    het_positions = ts.sites_position[is_het]
    gaps = np.diff(het_positions)
    return gaps[gaps > tr]

In [None]:
# Overlay the ROH-length histograms of two genome pairs: genomes 0-1 (from the
# mouflon range) and genomes 110-111 (from the Border range), keeping only
# gaps longer than 1000.
plt.hist(getRohLengths(0, 1, tr=1000), histtype="step", label="Mouflon", bins=100)
plt.hist(getRohLengths(110, 111, tr=1000), histtype="step", label="B0", bins=100)
plt.legend()
plt.xlabel("ROH length")
plt.ylabel("Count")
# The plot shows both series, so the title must not mention the mouflon only.
plt.title("Distribution of ROHs in individual 0 of mouflon and Border")
plt.show()