# Using ezancestry as a Python library

In [1]:
from pathlib import Path

from sklearn.model_selection import train_test_split

In [2]:
# load config variables
from ezancestry.config import aisnps_directory as _aisnps_directory
from ezancestry.config import aisnps_set as _aisnps_set
from ezancestry.config import algorithm as _algorithm
from ezancestry.config import k as _k
from ezancestry.config import models_directory as _models_directory
from ezancestry.config import n_components as _n_components
from ezancestry.config import population_level as _population_level
from ezancestry.config import samples_directory as _samples_directory
from ezancestry.config import thousand_genomes_directory as _thousand_genomes_directory

# load functions
from ezancestry.aisnps import extract_aisnps
from ezancestry.dimred import dimensionality_reduction
from ezancestry.evaluate import export_performance
from ezancestry.fetch import download_thousand_genomes
from ezancestry.model import predict_ancestry, train
from ezancestry.process import (encode_genotypes, get_1kg_labels,
                                process_user_input, vcf2df)

### pull AISNPs (ancestry-informative SNPs) from 1000 Genomes (1kG)

In [3]:
# Kidd AISNP set: point at its SNP list inside the configured AISNPs
# directory and extract those SNPs from the 1000 Genomes data
# (the log reports writing Kidd.AISNP.1kG.vcf).
aisnps_file = Path(_aisnps_directory) / "Kidd.AISNP.txt"
extract_aisnps(
    _thousand_genomes_directory,
    aisnps_file,
    aisnps_set="Kidd",
)

2021-09-12 21:31:57.465 | INFO     | ezancestry.aisnps:extract_aisnps:58 - Looking for 1000 genomes data in: /Users/kevin/.ezancestry/data/thousand_genomes
2021-09-12 21:31:57.465 | INFO     | ezancestry.aisnps:extract_aisnps:58 - Looking for 1000 genomes data in: /Users/kevin/.ezancestry/data/thousand_genomes
2021-09-12 21:31:58.053 | INFO     | ezancestry.aisnps:extract_aisnps:84 - Successfully wrote Kidd.AISNP.1kG.vcf
2021-09-12 21:31:58.053 | INFO     | ezancestry.aisnps:extract_aisnps:84 - Successfully wrote Kidd.AISNP.1kG.vcf


In [4]:
# Seldin AISNP set: point at its SNP list inside the configured AISNPs
# directory and extract those SNPs from the 1000 Genomes data
# (the log reports writing Seldin.AISNP.1kG.vcf).
aisnps_file = Path(_aisnps_directory) / "Seldin.AISNP.txt"
extract_aisnps(
    _thousand_genomes_directory,
    aisnps_file,
    aisnps_set="Seldin",
)

2021-09-12 21:31:58.067 | INFO     | ezancestry.aisnps:extract_aisnps:58 - Looking for 1000 genomes data in: /Users/kevin/.ezancestry/data/thousand_genomes
2021-09-12 21:31:58.067 | INFO     | ezancestry.aisnps:extract_aisnps:58 - Looking for 1000 genomes data in: /Users/kevin/.ezancestry/data/thousand_genomes
2021-09-12 21:31:59.288 | INFO     | ezancestry.aisnps:extract_aisnps:84 - Successfully wrote Seldin.AISNP.1kG.vcf
2021-09-12 21:31:59.288 | INFO     | ezancestry.aisnps:extract_aisnps:84 - Successfully wrote Seldin.AISNP.1kG.vcf


In [5]:
# Pull the 1000 Genomes Project sample labels from the configured samples
# directory. The preview in the next cell shows one row per sample ID with
# population, superpopulation and gender columns.
dfsamples = get_1kg_labels(_samples_directory)

In [6]:
# Preview the first three rows of the sample labels.
dfsamples.head(3)

Unnamed: 0_level_0,population,superpopulation,gender
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HG00096,GBR,EUR,male
HG00097,GBR,EUR,female
HG00099,GBR,EUR,female


Unnamed: 0_level_0,population,superpopulation,gender
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HG00096,GBR,EUR,male
HG00097,GBR,EUR,female
HG00099,GBR,EUR,female


In [7]:
# Convert the Kidd AISNP VCF (same file name that extract_aisnps reported
# writing) into `df_kidd`, passing the 1kG sample labels to vcf2df.
vcf_fname = Path(_aisnps_directory) / "Kidd.AISNP.1kG.vcf"
df_kidd = vcf2df(
    vcf_fname,
    dfsamples,
)

In [8]:
# Convert the Seldin AISNP VCF (same file name that extract_aisnps reported
# writing) into `df_seldin`, passing the 1kG sample labels to vcf2df.
vcf_fname = Path(_aisnps_directory) / "Seldin.AISNP.1kG.vcf"
df_seldin = vcf2df(
    vcf_fname,
    dfsamples,
)

  df[variant.ID] = [
  df[variant.ID] = [


### could start here

In [9]:
# Hold out 20% of the Kidd samples, stratifying on superpopulation;
# random_state pins the shuffle so the split is reproducible.
# NOTE(review): df_kidd is passed whole as the feature table, so it still
# contains the "superpopulation" column that is also used as the target --
# confirm label columns are dropped before anything is fit on these splits.
kidd_superpopulation = df_kidd["superpopulation"]
train_kidd, test_kidd, y_train_kidd, y_test_kidd = train_test_split(
    df_kidd,
    kidd_superpopulation,
    stratify=kidd_superpopulation,
    test_size=0.2,
    random_state=42,
)

### one-hot encode SNPs

In [10]:
# Build a synthetic "user" genotype table from df_kidd that is imperfect in
# three ways, so the encoder's handling of each case can be checked.

# 1. Missing SNPs: keep only the first 43 columns of df_kidd.
user_columns = df_kidd.columns[:43]
df_user = df_kidd.loc[:, user_columns].copy()

# 2. Extra SNPs: add a column named "extra_snp".
df_user["extra_snp"] = "TT"

# 3. Genotypes that weren't in the original encoder: overwrite one call
#    with an arbitrary string.
df_user.loc["HG00096", "rs3737576"] = "blah"

In [11]:
# One-hot encode the synthetic user genotypes for the Kidd set without
# overwriting the encoder (the log shows a saved encoder being loaded).
ohe_user = encode_genotypes(
    df_user,
    aisnps_set="Kidd",
    overwrite_encoder=False,
)

2021-09-12 21:32:01.251 | INFO     | ezancestry.process:encode_genotypes:137 - Successfully loaded an encoder from /Users/kevin/.ezancestry/data/models/one_hot_encoder.KIDD.bin
2021-09-12 21:32:01.251 | INFO     | ezancestry.process:encode_genotypes:137 - Successfully loaded an encoder from /Users/kevin/.ezancestry/data/models/one_hot_encoder.KIDD.bin


In [12]:
# Make sure the "blah" genotype didn't get encoded: show the rs3737576
# one-hot columns for HG00096 (all 0.0 in the recorded output).
rs3737576_columns = ["rs3737576_CC", "rs3737576_CT", "rs3737576_TT"]
ohe_user.loc["HG00096", rs3737576_columns]

rs3737576_CC    0.0
rs3737576_CT    0.0
rs3737576_TT    0.0
Name: HG00096, dtype: float64

rs3737576_CC    0.0
rs3737576_CT    0.0
rs3737576_TT    0.0
Name: HG00096, dtype: float64

In [13]:
# Change to True to write new encoders. This flag is passed as
# `overwrite_encoder` to encode_genotypes() in the next cell; with False the
# recorded log shows the saved encoders being loaded.
OVERWRITE_ENCODER = False

In [14]:
# One-hot encode the reference genotypes for each SNP set, getting an
# encoder per set; OVERWRITE_ENCODER is forwarded as overwrite_encoder.
df_kidd_encoded = encode_genotypes(
    df_kidd,
    aisnps_set="Kidd",
    overwrite_encoder=OVERWRITE_ENCODER,
)
df_seldin_encoded = encode_genotypes(
    df_seldin,
    aisnps_set="Seldin",
    overwrite_encoder=OVERWRITE_ENCODER,
)

2021-09-12 21:32:01.379 | INFO     | ezancestry.process:encode_genotypes:137 - Successfully loaded an encoder from /Users/kevin/.ezancestry/data/models/one_hot_encoder.KIDD.bin
2021-09-12 21:32:01.379 | INFO     | ezancestry.process:encode_genotypes:137 - Successfully loaded an encoder from /Users/kevin/.ezancestry/data/models/one_hot_encoder.KIDD.bin
2021-09-12 21:32:01.502 | INFO     | ezancestry.process:encode_genotypes:137 - Successfully loaded an encoder from /Users/kevin/.ezancestry/data/models/one_hot_encoder.SELDIN.bin
2021-09-12 21:32:01.502 | INFO     | ezancestry.process:encode_genotypes:137 - Successfully loaded an encoder from /Users/kevin/.ezancestry/data/models/one_hot_encoder.SELDIN.bin


### dimensionality reduction & training

In [15]:
# Passed as `overwrite_model` to dimensionality_reduction() and train() in
# the cells below; with False the recorded logs show existing dimensionality
# reduction models being loaded.
OVERWRITE_MODEL = False

In [16]:
# Write all the SUPER POPULATION dimred models for Kidd and Seldin, then
# train a kNN model (k=9) on each reduced representation.
# The three sequences zipped below are parallel: AISNP set name, its one-hot
# encoded genotypes, and its superpopulation labels.
for aisnps_set, df, df_labels in zip(
    ["Kidd", "Seldin"],
    [df_kidd_encoded, df_seldin_encoded],
    [df_kidd["superpopulation"], df_seldin["superpopulation"]],
):
    # Only NCA is given labels. The label list must have exactly one entry per
    # algorithm: zip() stops at the shortest input, so a longer label list
    # would silently pair NCA with None instead of df_labels.
    for algorithm, labels in zip(["PCA", "UMAP", "NCA"], [None, None, df_labels]):
        # NOTE(review): population_level is spelled "super population" here
        # but "superpopulation" in train() below -- confirm both spellings
        # resolve to the same models.
        df_reduced = dimensionality_reduction(
            df,
            algorithm=algorithm,
            aisnps_set=aisnps_set,
            overwrite_model=OVERWRITE_MODEL,
            labels=labels,
            population_level="super population",
        )
        knn_model = train(
            df_reduced,
            df_labels,
            algorithm=algorithm,
            aisnps_set=aisnps_set,
            k=9,
            population_level="superpopulation",
            overwrite_model=OVERWRITE_MODEL,
        )

2021-09-12 21:32:01.642 | INFO     | ezancestry.dimred:dimensionality_reduction:126 - Successfully loaded a dimensionality reduction model
2021-09-12 21:32:01.642 | INFO     | ezancestry.dimred:dimensionality_reduction:126 - Successfully loaded a dimensionality reduction model
2021-09-12 21:32:20.240 | INFO     | ezancestry.dimred:dimensionality_reduction:126 - Successfully loaded a dimensionality reduction model
2021-09-12 21:32:20.240 | INFO     | ezancestry.dimred:dimensionality_reduction:126 - Successfully loaded a dimensionality reduction model
2021-09-12 21:32:20.264 | INFO     | ezancestry.dimred:dimensionality_reduction:126 - Successfully loaded a dimensionality reduction model
2021-09-12 21:32:20.264 | INFO     | ezancestry.dimred:dimensionality_reduction:126 - Successfully loaded a dimensionality reduction model
2021-09-12 21:32:20.288 | INFO     | ezancestry.dimred:dimensionality_reduction:126 - Successfully loaded a dimensionality reduction model
2021-09-12 21:32:20.288 | I

In [17]:
# write all the POPULATION dimred models for Kidd and Seldin
for aisnps_set, df, df_labels in zip(
    ["Kidd", "Seldin"], 
    [df_kidd_encoded, df_seldin_encoded], 
    [df_kidd["population"], df_seldin["population"]]
):
    for algorithm, labels in zip(["NCA"], [df_labels]):
        df_reduced = dimensionality_reduction(df, algorithm=algorithm, aisnps_set=aisnps_set, overwrite_model=OVERWRITE_MODEL, labels=labels, population_level="population")
        knn_model = train(df_reduced, labels, algorithm=algorithm, aisnps_set=aisnps_set, k=9, population_level="population", overwrite_model=OVERWRITE_MODEL)

2021-09-12 21:32:34.944 | INFO     | ezancestry.dimred:dimensionality_reduction:126 - Successfully loaded a dimensionality reduction model
2021-09-12 21:32:34.944 | INFO     | ezancestry.dimred:dimensionality_reduction:126 - Successfully loaded a dimensionality reduction model
2021-09-12 21:32:34.959 | INFO     | ezancestry.dimred:dimensionality_reduction:126 - Successfully loaded a dimensionality reduction model
2021-09-12 21:32:34.959 | INFO     | ezancestry.dimred:dimensionality_reduction:126 - Successfully loaded a dimensionality reduction model
