# Using ezancestry as a Python library

In [1]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

# load config variables
from ezancestry.config import aisnps_directory as _aisnps_directory
from ezancestry.config import aisnps_set as _aisnps_set
from ezancestry.config import models_directory as _models_directory
from ezancestry.config import population_level as _population_level
from ezancestry.evaluate import export_performance

# load functions
from ezancestry.fetch import get_thousand_genomes_aisnps
from ezancestry.model import DEFAULT_PIPELINE, predict_ancestry, train
from ezancestry.process import process_user_input  #, vcf2df

### pull aisnps from 1kG

This will query the 1000 Genomes VCF for the Kidd et al. AISNPs and save the results as a .csv when it does not exist on your local disk.  
The function also returns the results as a pandas DataFrame.

Note that this will take a few minutes to run as it is querying the 1000 Genomes VCF.
It will download `.tbi` files for the VCFs.

**optional**
The Kidd and Seldin files are included when you install `ezancestry` and are located in the `ezancestry` directory. You do not need to run the `get_thousand_genomes_aisnps` function if you have already installed or cloned the `ezancestry` repository.

If you want to load the 1000 Genomes aisnps without downloading anything, you can use the following code:

```python
pd.read_csv(f"{_aisnps_directory}/kidd.1kG.csv")
```

In [2]:
# kidd1kg = get_thousand_genomes_aisnps(aisnps_directory=None, aisnps_sets="kidd")
# seldin1kg = get_thousand_genomes_aisnps(aisnps_directory=None, aisnps_sets="seldin")

# If you have a custom AISNPs file, named custom.aisnps.txt, you can load it using the following function to query the 1000 Genomes VCF
# custom1kg = get_thousand_genomes_aisnps(aisnps_directory=None, aisnps_sets="custom")

In [3]:
# Load the AISNP genotypes for the configured set from a local directory.
# When the .csv already exists in `aisnps_directory`, it is loaded from disk
# rather than re-querying the 1000 Genomes VCF.
kidd1kg = get_thousand_genomes_aisnps(
    aisnps_directory="data/aisnps/",
    aisnps_sets=_aisnps_set,
)

2024-02-12 22:03:25.842 | INFO     | ezancestry.fetch:get_thousand_genomes_aisnps:34 - Loaded: kidd.1kG.csv


In [13]:
# Fit the default pipeline directly on the 1kG table: the features are every
# column except "population", "superpopulation" and "gender"; the target is
# the "superpopulation" column.
DEFAULT_PIPELINE.fit(kidd1kg.drop(columns=["population", "superpopulation", "gender"]), kidd1kg["superpopulation"])

: 

In [8]:
# Train a superpopulation-level model for the "kidd" AISNP set.
# Features: every column except "population", "superpopulation" and "gender";
# target: the "superpopulation" column.
# sklearn_pipeline and models_directory are passed as None (presumably the
# library then falls back to its defaults -- confirm in ezancestry.model.train).
model = train(
    kidd1kg.drop(columns=["population", "superpopulation", "gender"]),
    kidd1kg["superpopulation"],
    aisnps_set="kidd",
    population_level="superpopulation",
    sklearn_pipeline=None,
    models_directory=None,
    overwrite_model=False,
)

: 

In [None]:
# Build df_kidd from the Kidd AISNP VCF located in the configured aisnps directory.
# NOTE(review): `vcf2df` is commented out of the `ezancestry.process` import in
# the first cell and `dfsamples` is not defined in any visible cell, so this
# cell looks stale and would raise NameError as written -- TODO confirm.
vcf_fname = Path(_aisnps_directory).joinpath("kidd.aisnp.1kG.vcf")
df_kidd = vcf2df(vcf_fname, dfsamples)

In [None]:
# Build df_seldin from the Seldin AISNP VCF located in the configured aisnps directory.
# NOTE(review): `vcf2df` is commented out of the `ezancestry.process` import in
# the first cell and `dfsamples` is not defined in any visible cell, so this
# cell looks stale and would raise NameError as written -- TODO confirm.
vcf_fname = Path(_aisnps_directory).joinpath("Seldin.aisnp.1kG.vcf")
df_seldin = vcf2df(vcf_fname, dfsamples)

### could start here

In [None]:
# Hold out 20% of the Kidd samples, stratified by superpopulation, with a
# fixed random seed so the split is reproducible.
# NOTE(review): the feature frame passed here is df_kidd itself, which still
# contains the "superpopulation" column -- confirm that is intended.
train_kidd, test_kidd, y_train_kidd, y_test_kidd = train_test_split(
    df_kidd,
    df_kidd["superpopulation"],
    stratify=df_kidd["superpopulation"],
    test_size=0.2,
    random_state=42,
)

### one-hot encode snps

In [None]:
# Build a deliberately imperfect "user" table to exercise the encoder.

# 1. Missing SNPs: keep only the first 43 columns of df_kidd.
df_user = df_kidd.loc[:, df_kidd.columns[:43]].copy()

# 2. Extra SNPs: add a column that is not part of the original table.
df_user["extra_snp"] = "TT"

# 3. Unseen genotypes: plant a value the original encoder has never seen.
df_user.loc["HG00096", "rs3737576"] = "blah"

In [None]:
# One-hot encode the synthetic user table for the "kidd" set without
# overwriting the encoder (overwrite_encoder=False).
# NOTE(review): `encode_genotypes` is not imported in any visible cell --
# TODO confirm where it should come from.
ohe_user = encode_genotypes(df_user, aisnps_set="kidd", overwrite_encoder=False)

In [None]:
# make sure "blah" genotype didn't get encoded:
# show the rs3737576 one-hot columns for HG00096, the sample that was given
# the bogus genotype above
ohe_user.loc["HG00096", ["rs3737576_CC", "rs3737576_CT", "rs3737576_TT"]]

In [None]:
# change to True to write new encoders
# (passed as `overwrite_encoder` to the encode_genotypes calls in the following cell)
OVERWRITE_ENCODER = False

In [None]:
# Encode each AISNP set with its own encoder; OVERWRITE_ENCODER is forwarded
# as `overwrite_encoder` to both calls.
df_kidd_encoded = encode_genotypes(
    df_kidd,
    aisnps_set="kidd",
    overwrite_encoder=OVERWRITE_ENCODER,
)
df_seldin_encoded = encode_genotypes(
    df_seldin,
    aisnps_set="Seldin",
    overwrite_encoder=OVERWRITE_ENCODER,
)

### dimensionality reduction & training

In [None]:
# Forwarded as `overwrite_model` to dimensionality_reduction() and train()
# in the cells below.
OVERWRITE_MODEL = False

In [None]:
# Write all the superpopulation dimensionality-reduction models for the kidd
# and Seldin AISNP sets, and train a model (k=9) on each reduced table.
for aisnps_set, df, df_labels in zip(
    ["kidd", "Seldin"],
    [df_kidd_encoded, df_seldin_encoded],
    [df_kidd["superpopulation"], df_seldin["superpopulation"]],
):
    # Only "nca" is given labels, matching the population-level cell below.
    # The labels list must be exactly as long as the algorithm list: zip()
    # stops at its shortest input, so a surplus leading None would silently
    # pair "nca" with None and df_labels would never be used.
    for algorithm, labels in zip(
        ["pca", "umap", "nca"],
        [None, None, df_labels],
    ):
        # NOTE(review): population_level is spelled "super population" here but
        # "superpopulation" in the train() call; left unchanged -- confirm
        # which spelling dimensionality_reduction() expects.
        df_reduced = dimensionality_reduction(
            df,
            algorithm=algorithm,
            aisnps_set=aisnps_set,
            overwrite_model=OVERWRITE_MODEL,
            labels=labels,
            population_level="super population",
        )
        knn_model = train(
            df_reduced,
            df_labels,
            algorithm=algorithm,
            aisnps_set=aisnps_set,
            k=9,
            population_level="superpopulation",
            overwrite_model=OVERWRITE_MODEL,
        )

In [None]:
# Write the population-level dimensionality-reduction models for the kidd and
# Seldin AISNP sets. Only the "nca" algorithm is used here, and it is given
# the population labels.
for aisnps_set, df, df_labels in (
    ("kidd", df_kidd_encoded, df_kidd["population"]),
    ("Seldin", df_seldin_encoded, df_seldin["population"]),
):
    algorithm, labels = "nca", df_labels
    df_reduced = dimensionality_reduction(
        df,
        algorithm=algorithm,
        aisnps_set=aisnps_set,
        overwrite_model=OVERWRITE_MODEL,
        labels=labels,
        population_level="population",
    )
    knn_model = train(
        df_reduced,
        labels,
        algorithm=algorithm,
        aisnps_set=aisnps_set,
        k=9,
        population_level="population",
        overwrite_model=OVERWRITE_MODEL,
    )

# Predict

In [None]:
from ezancestry.commands import predict

In [None]:
from snps import SNPs

In [None]:
# Path to the raw genome file used by the prediction cells below.
# This is a machine-specific absolute path -- point it at your own genome file.
mygenomefile = "/Users/kevin/mygenome/genome2.txt"

## load from DataFrame

In [None]:
# Let the `snps` package parse the genome file, then take its `.snps`
# attribute as the table that is passed to predict() below.
mygenome = SNPs(mygenomefile)
mygenomedf = mygenome.snps

In [None]:
# Preview the first two rows of the parsed genome table.
mygenomedf.head(2)

In [None]:
# Predict on the genome DataFrame loaded above, using the "kidd" AISNP set.
# Every optional setting is passed as None (presumably falling back to the
# library's configured defaults -- confirm in ezancestry.commands.predict);
# write_predictions=False is passed, so nothing is requested to be written.
predictions = predict(
    mygenomedf,
    aisnps_set="kidd",
    k=None,
    n_components=None,
    algorithm=None,
    write_predictions=False,
    models_directory=None,
    output_directory=None,
    aisnps_directory=None,
    thousand_genomes_directory=None,
    samples_directory=None,
)

In [None]:
# Display the predictions computed from the DataFrame input.
predictions

## or load directly from a file

In [None]:
# Predict straight from the genome file path instead of a pre-loaded
# DataFrame, using the "kidd" AISNP set. Every optional setting is passed as
# None (presumably falling back to the library's configured defaults --
# confirm in ezancestry.commands.predict); write_predictions=False is passed.
predictions = predict(
    mygenomefile,
    aisnps_set="kidd",
    k=None,
    n_components=None,
    algorithm=None,
    write_predictions=False,
    models_directory=None,
    output_directory=None,
    aisnps_directory=None,
    thousand_genomes_directory=None,
    samples_directory=None,
)

In [None]:
# Display the predictions computed from the file-path input.
predictions