# Analyze the simulated data

## Simulation on 2K SNPs

In [None]:
import numpy as np
import pandas as pd

import tskit

from tskitetude import get_data_dir

Analyze the `data/sheepTSsimMilano/ts300I2k.vcf.gz` generated using msprime. Get a list of all sample names from VCF file:

```bash
export TSKIT_DIR=${PWD}
cd data/sheepTSsimMilano/
bcftools query -l ts300I2k.vcf.gz > ts300I2k.sample_names.txt
```

Now add population information to the sample names, and then write them into a file:

In [None]:
# Build the two-column (FID, IID) table for the 2K-SNP simulation and write it
# as a headerless, tab-separated file next to the sample-name list.
milano_dir = get_data_dir() / "sheepTSsimMilano"

with open(milano_dir / "ts300I2k.sample_names.txt") as f:
    sample_names = f.read().splitlines()

pop_names = ["MM", "II", "A", "B", "C", "D", "E", "F", "G"]

# FID column: the first 5 samples get the first label ("MM"); every following
# label is repeated for 300 consecutive samples (5 + 8 * 300 rows in total).
fids = [pop_names[0]] * 5
for pop_name in pop_names[1:]:
    fids.extend([pop_name] * 300)

# pandas raises if the number of FIDs differs from the number of sample names.
sample_names = pd.DataFrame({"FID": fids, "IID": sample_names})

sample_names.to_csv(
    milano_dir / "ts300I2k.sample_names_fid.csv",
    sep="\t",
    index=False,
    header=False,
)

Infer a tree sequence from `data/sheepTSsimMilano/ts300I2k.vcf.gz` by calling `create_tstree` with the following parameters:

```bash
create_tstree --vcf ts300I2k.vcf.gz --focal ts300I2k.sample_names_fid.csv --ancestral_as_reference \
    --output_samples ts300I2k.inferred.samples --output_trees ts300I2k.inferred.trees --num_threads 16 \
    --mutation_rate 5.87e-9 --ne 34500
```

The values `5.87e-9` and `34500` are the mutation rate (`--mutation_rate`) and the effective population size (`--ne`), respectively. The `--ancestral_as_reference` flag treats the ancestral allele as the reference allele. The `--num_threads 16` flag sets the number of threads to use. The `--output_samples` flag sets the output file for the inferred samples, and the `--output_trees` flag sets the output file for the inferred trees.

In [None]:
# Same value that is passed to create_tstree via --mutation_rate; later cells
# multiply branch-mode diversity by it.
mutation_rate = 5.87e-9

print(
    "Mutation rate: ",
    mutation_rate,
)

In [None]:
# Load the tree sequence written by create_tstree for the 2K-SNP simulation.
ts300I2k_path = get_data_dir() / "sheepTSsimMilano/ts300I2k.inferred.trees"
ts300I2k = tskit.load(ts300I2k_path)

# Bare last expression: show the notebook summary of the loaded tree sequence.
ts300I2k

In [None]:
# Diversity of the 2K-SNP tree sequence with default arguments
# (no sample sets, windows or mode given).
ts300I2k.diversity()

In [None]:
# Branch-mode diversity scaled by the mutation rate; presumably this puts it on
# the same scale as the default-mode value in the previous cell - confirm.
ts300I2k.diversity(mode="branch") * mutation_rate

## Simulation on 25K SNPs

Get sample names (should be equal to previous run):

```bash
bcftools query -l ts300I25k.vcf.gz > ts300I25k.sample_names.txt
```

Now add population information to the sample names, and then write them into a file:

In [None]:
# Build the two-column (FID, IID) table for the 25K-SNP simulation and write it
# as a headerless, tab-separated file next to the sample-name list.
milano_dir = get_data_dir() / "sheepTSsimMilano"

with open(milano_dir / "ts300I25k.sample_names.txt") as f:
    sample_names = f.read().splitlines()

pop_names = ["MM", "II", "A", "B", "C", "D", "E", "F", "G"]

# FID column: the first 5 samples get the first label ("MM"); every following
# label is repeated for 300 consecutive samples (5 + 8 * 300 rows in total).
fids = [pop_names[0]] * 5
for pop_name in pop_names[1:]:
    fids.extend([pop_name] * 300)

# pandas raises if the number of FIDs differs from the number of sample names.
sample_names = pd.DataFrame({"FID": fids, "IID": sample_names})

sample_names.to_csv(
    milano_dir / "ts300I25k.sample_names_fid.csv",
    sep="\t",
    index=False,
    header=False,
)

Generate tree:

```bash
create_tstree --vcf ts300I25k.vcf.gz --focal ts300I25k.sample_names_fid.csv --ancestral_as_reference \
    --output_samples ts300I25k.inferred.samples --output_trees ts300I25k.inferred.trees --num_threads 16 \
    --mutation_rate 5.87e-9 --ne 34500
```

In [None]:
# Load the tree sequence written by create_tstree for the 25K-SNP simulation.
ts300I25k_path = get_data_dir() / "sheepTSsimMilano/ts300I25k.inferred.trees"
ts300I25k = tskit.load(ts300I25k_path)

# Bare last expression: show the notebook summary of the loaded tree sequence.
ts300I25k

In [None]:
# Diversity of the 25K-SNP tree sequence with default arguments
# (no sample sets, windows or mode given).
ts300I25k.diversity()

In [None]:
# Branch-mode diversity scaled by the mutation rate; presumably this puts it on
# the same scale as the default-mode value in the previous cell - confirm.
ts300I25k.diversity(mode="branch") * mutation_rate

## Simulation on entire dataset

Get the sample names (as we did before):

```bash
bcftools query -l tsm100M300I.vcf.gz > tsm100M300I.sample_names.txt
```

Now add population information to the sample names, and then write them into a file:

In [None]:
# Build the two-column (FID, IID) table for the full-dataset simulation and
# write it as a headerless, tab-separated file next to the sample-name list.
milano_dir = get_data_dir() / "sheepTSsimMilano"

with open(milano_dir / "tsm100M300I.sample_names.txt") as f:
    sample_names = f.read().splitlines()

pop_names = ["MM", "II", "A", "B", "C", "D", "E", "F", "G"]

# FID column: the first 5 samples get the first label ("MM"); every following
# label is repeated for 300 consecutive samples (5 + 8 * 300 rows in total).
fids = [pop_names[0]] * 5
for pop_name in pop_names[1:]:
    fids.extend([pop_name] * 300)

# pandas raises if the number of FIDs differs from the number of sample names.
sample_names = pd.DataFrame({"FID": fids, "IID": sample_names})

sample_names.to_csv(
    milano_dir / "tsm100M300I.sample_names_fid.csv",
    sep="\t",
    index=False,
    header=False,
)

This step takes a long time, so submit it as a job to the cluster:

```bash
cd ${TSKIT_DIR}
sbatch scripts/simulation-tsm100M300I.sh
```

In [None]:
# Load the tree sequence inferred for the full dataset.
tsm100M300I_path = get_data_dir() / "sheepTSsimMilano/tsm100M300I.inferred.trees"
tsm100M300I = tskit.load(tsm100M300I_path)

# Bare last expression: show the notebook summary of the loaded tree sequence.
tsm100M300I

In [None]:
# Diversity of the full-dataset tree sequence with default arguments
# (no sample sets, windows or mode given).
tsm100M300I.diversity()

In [None]:
# Branch-mode diversity scaled by the mutation rate; presumably this puts it on
# the same scale as the default-mode value in the previous cell - confirm.
tsm100M300I.diversity(mode="branch") * mutation_rate

## Calculate FST
Define the individuals list:

In [None]:
# ID ranges used as sample sets: IDs 0..9 first, then 8 consecutive blocks of
# 600 IDs (10..609, 610..1209, ...).
# NOTE(review): this matches the FID table (5 + 8 * 300 individuals) only if
# every individual contributes two sample nodes and the node IDs follow the VCF
# sample order - TODO confirm against the inferred tree sequences.
first_block_size = 10
block_size = 600
n_blocks = 8

indList = [np.arange(first_block_size)]
for block in range(n_blocks):
    start = first_block_size + block_size * block
    indList.append(np.arange(start, start + block_size))

In [None]:
# Branch-mode Fst between the first two ID sets of indList (IDs 0..9 and
# 10..609; presumably populations MM and II - confirm), one value per tree
# sequence in the order 2K, 25K, full dataset.
first_two_sets = indList[:2]
[ts.Fst(first_two_sets, mode="branch") for ts in (ts300I2k, ts300I25k, tsm100M300I)]


And then with `site` mode:

In [None]:
# Site-mode Fst between the first two ID sets of indList (IDs 0..9 and
# 10..609; presumably populations MM and II - confirm), one value per tree
# sequence in the order 2K, 25K, full dataset.
first_two_sets = indList[:2]
[ts.Fst(first_two_sets, mode="site") for ts in (ts300I2k, ts300I25k, tsm100M300I)]


In [None]:
# Summary table: one row per simulation with diversity (default mode),
# branch-mode diversity scaled by the mutation rate, and Fst between the first
# two ID sets of indList in branch and site mode.
simulations = {
    "ts300I2k": ts300I2k,
    "ts300I25k": ts300I25k,
    "tsm100M300I": tsm100M300I,
}
fst_sets = [indList[0], indList[1]]

tmp = {
    "simulation": list(simulations),
    "diversity": [ts.diversity() for ts in simulations.values()],
    "diversity_branch": [
        ts.diversity(mode="branch") * mutation_rate for ts in simulations.values()
    ],
    "FST_branch": [ts.Fst(fst_sets, mode="branch") for ts in simulations.values()],
    "FST_site": [ts.Fst(fst_sets, mode="site") for ts in simulations.values()],
}
pd.DataFrame(tmp)

In [None]:
# Pick the tree at index 1 of the 2K-SNP tree sequence and render it as an SVG
# with a y axis, 1000 x 200 in size.
tree_index = 1
tree = ts300I2k.at_index(tree_index)
tree.draw_svg(size=(1000, 200), y_axis=True)

In [None]:
# Show the root of the selected tree.
tree.root

Print the age of the tree:

In [None]:
# Time of the root node, used here as the age of the tree.
tree.time(tree.root)

Iterate over the nodes of the tree and print their times (skipping nodes whose time is 0):

In [None]:
# Print the time of every node in the tree, skipping nodes whose time is not
# greater than 0.
for node_id in tree.nodes():
    node_time = tree.time(node_id)
    if node_time > 0:
        print(f"Node {node_id}: time {node_time}")