# Using ezancestry as a Python library

In [1]:
from pathlib import Path

from sklearn.model_selection import train_test_split

In [2]:
# load config variables
from ezancestry.config import aisnps_directory as _aisnps_directory
from ezancestry.config import aisnps_set as _aisnps_set
from ezancestry.config import algorithm as _algorithm
from ezancestry.config import k as _k
from ezancestry.config import models_directory as _models_directory
from ezancestry.config import n_components as _n_components
from ezancestry.config import population_level as _population_level
from ezancestry.config import samples_directory as _samples_directory
from ezancestry.config import thousand_genomes_directory as _thousand_genomes_directory

# load functions
from ezancestry.aisnps import extract_aisnps
from ezancestry.dimred import dimensionality_reduction
from ezancestry.evaluate import export_performance
from ezancestry.fetch import download_thousand_genomes
from ezancestry.model import predict_ancestry, train
from ezancestry.process import (encode_genotypes, get_1kg_labels,
                                process_user_input, vcf2df)

### Pull ancestry-informative SNPs (AISNPs) from 1000 Genomes (1kG)

In [3]:
# Kidd panel: extract the SNPs listed in kidd.aisnp.txt from the 1000 Genomes data.
aisnps_file = Path(_aisnps_directory) / "kidd.aisnp.txt"
extract_aisnps(
    _thousand_genomes_directory,
    aisnps_file,
    aisnps_set="kidd",
)

2021-09-20 06:24:56.869 | INFO     | ezancestry.aisnps:extract_aisnps:58 - Looking for 1000 genomes data in: /Users/kevin/.ezancestry/data/thousand_genomes
2021-09-20 06:24:57.402 | INFO     | ezancestry.aisnps:extract_aisnps:84 - Successfully wrote kidd.aisnp.1kG.vcf


In [4]:
# Seldin panel: extract the SNPs listed in Seldin.aisnp.txt from the 1000 Genomes data.
aisnps_file = Path(_aisnps_directory) / "Seldin.aisnp.txt"
extract_aisnps(
    _thousand_genomes_directory,
    aisnps_file,
    aisnps_set="Seldin",
)

2021-09-20 06:24:57.414 | INFO     | ezancestry.aisnps:extract_aisnps:58 - Looking for 1000 genomes data in: /Users/kevin/.ezancestry/data/thousand_genomes
2021-09-20 06:24:58.501 | INFO     | ezancestry.aisnps:extract_aisnps:84 - Successfully wrote Seldin.aisnp.1kG.vcf


In [5]:
# Load the 1000 Genomes Project sample labels from the configured samples directory.
# The resulting frame is reused below when converting each panel's VCF to a DataFrame.
dfsamples = get_1kg_labels(_samples_directory)

In [6]:
# Preview the first three labelled samples.
dfsamples.head(3)

Unnamed: 0_level_0,population,superpopulation,gender
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HG00096,GBR,EUR,male
HG00097,GBR,EUR,female
HG00099,GBR,EUR,female


In [7]:
# Convert the extracted Kidd VCF into a DataFrame, combined with the sample labels.
vcf_fname = Path(_aisnps_directory) / "kidd.aisnp.1kG.vcf"
df_kidd = vcf2df(vcf_fname, dfsamples)

In [8]:
# Convert the extracted Seldin VCF into a DataFrame, combined with the sample labels.
vcf_fname = Path(_aisnps_directory) / "Seldin.aisnp.1kG.vcf"
df_seldin = vcf2df(vcf_fname, dfsamples)

  df[variant.ID] = [


### could start here

In [9]:
# Hold out 20% of the Kidd samples, stratified on superpopulation so both splits
# keep the same class balance; the fixed seed makes the split reproducible.
(train_kidd, test_kidd, y_train_kidd, y_test_kidd) = train_test_split(
    df_kidd,
    df_kidd["superpopulation"],
    stratify=df_kidd["superpopulation"],
    test_size=0.2,
    random_state=42,
)

### one-hot encode snps

In [10]:
# Build a deliberately imperfect "user" sample to exercise the encoder's edge cases.

# Missing SNPs: keep only the first 43 columns of the Kidd frame.
df_user = df_kidd.loc[:, df_kidd.columns[:43]].copy()

# Extra SNPs: add a column the encoder was never fitted on.
df_user["extra_snp"] = "TT"

# Unknown genotype: a value that was not present when the encoder was fitted.
df_user.loc["HG00096", "rs3737576"] = "blah"

In [11]:
# One-hot encode the synthetic user sample without overwriting the saved Kidd encoder.
ohe_user = encode_genotypes(
    df_user,
    aisnps_set="kidd",
    overwrite_encoder=False,
)

2021-09-20 06:25:00.167 | INFO     | ezancestry.process:encode_genotypes:138 - Successfully loaded an encoder from /Users/kevin/.ezancestry/data/models/one_hot_encoder.kidd.bin


In [12]:
# Sanity check: the unseen "blah" genotype should leave every rs3737576
# one-hot column at zero for this sample.
ohe_user.loc[
    "HG00096",
    ["rs3737576_CC", "rs3737576_CT", "rs3737576_TT"],
]

rs3737576_CC    0.0
rs3737576_CT    0.0
rs3737576_TT    0.0
Name: HG00096, dtype: float64

In [13]:
# Set to True to write new encoders; this flag is passed as `overwrite_encoder`
# to the encode_genotypes calls for both the kidd and Seldin panels.
OVERWRITE_ENCODER = False

In [14]:
# One-hot encode each AISNP panel with its own encoder.
df_kidd_encoded = encode_genotypes(
    df_kidd,
    aisnps_set="kidd",
    overwrite_encoder=OVERWRITE_ENCODER,
)
df_seldin_encoded = encode_genotypes(
    df_seldin,
    aisnps_set="Seldin",
    overwrite_encoder=OVERWRITE_ENCODER,
)

2021-09-20 06:25:00.284 | INFO     | ezancestry.process:encode_genotypes:138 - Successfully loaded an encoder from /Users/kevin/.ezancestry/data/models/one_hot_encoder.kidd.bin
2021-09-20 06:25:00.393 | INFO     | ezancestry.process:encode_genotypes:138 - Successfully loaded an encoder from /Users/kevin/.ezancestry/data/models/one_hot_encoder.seldin.bin


### dimensionality reduction & training

In [15]:
# Passed as `overwrite_model` to dimensionality_reduction() and train() in the
# cells below. NOTE(review): presumably True forces models to be refit and
# rewritten instead of loaded from disk - confirm against ezancestry.
OVERWRITE_MODEL = False

In [16]:
# Fit (or load) the superpopulation-level dimensionality-reduction and kNN models
# for both AISNP panels (kidd and Seldin).
for aisnps_set, df, df_labels in zip(
    ["kidd", "Seldin"],
    [df_kidd_encoded, df_seldin_encoded],
    [df_kidd["superpopulation"], df_seldin["superpopulation"]],
):
    # Explicit (algorithm, labels) pairs keep each reducer aligned with its labels.
    # Only "nca" is given the class labels; "pca" and "umap" are run without them.
    # Two parallel lists of different lengths would be silently truncated by zip,
    # leaving "nca" with labels=None.
    for algorithm, labels in [("pca", None), ("umap", None), ("nca", df_labels)]:
        df_reduced = dimensionality_reduction(
            df,
            algorithm=algorithm,
            aisnps_set=aisnps_set,
            overwrite_model=OVERWRITE_MODEL,
            labels=labels,
            # Spelled exactly as in the train() call below so both steps refer to
            # the same population level.
            population_level="superpopulation",
        )
        knn_model = train(
            df_reduced,
            df_labels,
            algorithm=algorithm,
            aisnps_set=aisnps_set,
            k=9,
            population_level="superpopulation",
            overwrite_model=OVERWRITE_MODEL,
        )

2021-09-20 06:25:00.502 | INFO     | ezancestry.dimred:dimensionality_reduction:126 - Successfully loaded a dimensionality reduction model
2021-09-20 06:25:17.779 | INFO     | ezancestry.dimred:dimensionality_reduction:126 - Successfully loaded a dimensionality reduction model
2021-09-20 06:25:17.796 | INFO     | ezancestry.dimred:dimensionality_reduction:126 - Successfully loaded a dimensionality reduction model
2021-09-20 06:25:17.818 | INFO     | ezancestry.dimred:dimensionality_reduction:126 - Successfully loaded a dimensionality reduction model
2021-09-20 06:25:31.511 | INFO     | ezancestry.dimred:dimensionality_reduction:126 - Successfully loaded a dimensionality reduction model
2021-09-20 06:25:31.531 | INFO     | ezancestry.dimred:dimensionality_reduction:126 - Successfully loaded a dimensionality reduction model


In [17]:
# Fit (or load) the population-level models for both AISNP panels.
# Only the NCA reducer is used at this level, with the population labels.
for aisnps_set, df, df_labels in (
    ("kidd", df_kidd_encoded, df_kidd["population"]),
    ("Seldin", df_seldin_encoded, df_seldin["population"]),
):
    for algorithm, labels in [("nca", df_labels)]:
        df_reduced = dimensionality_reduction(
            df,
            algorithm=algorithm,
            aisnps_set=aisnps_set,
            overwrite_model=OVERWRITE_MODEL,
            labels=labels,
            population_level="population",
        )
        knn_model = train(
            df_reduced,
            labels,
            algorithm=algorithm,
            aisnps_set=aisnps_set,
            k=9,
            population_level="population",
            overwrite_model=OVERWRITE_MODEL,
        )

2021-09-20 06:25:31.567 | INFO     | ezancestry.dimred:dimensionality_reduction:126 - Successfully loaded a dimensionality reduction model
2021-09-20 06:25:31.585 | INFO     | ezancestry.dimred:dimensionality_reduction:126 - Successfully loaded a dimensionality reduction model


# Predict

In [18]:
from ezancestry.commands import predict

In [19]:
from snps import SNPs

In [20]:
# Personal genome file to predict on. It is built from the current user's home
# directory so the notebook is not tied to one machine's absolute path; edit the
# trailing components to point at your own file. Kept as a str because the
# value is handed to both SNPs(...) and predict(...).
mygenomefile = str(Path.home() / "mygenome" / "genome2.txt")

## load from DataFrame

In [21]:
# Let the snps Python package parse the raw genome file.
mygenome = SNPs(mygenomefile)
# .snps is the parsed genotype table; it is what gets passed to predict() below.
mygenomedf = mygenome.snps

In [22]:
# Preview the first two parsed genotype rows.
mygenomedf.head(2)

Unnamed: 0_level_0,chrom,pos,genotype
rsid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rs12564807,1,734462,AA
rs3131972,1,752721,GG


In [23]:
# Predict ancestry from the in-memory genotype DataFrame.
# NOTE(review): the None arguments presumably fall back to the values in
# ezancestry.config - confirm against ezancestry.commands.predict.
predictions = predict(
    mygenomedf,
    aisnps_set="kidd",
    k=None,
    n_components=None,
    algorithm=None,
    write_predictions=False,
    models_directory=None,
    output_directory=None,
    aisnps_directory=None,
    thousand_genomes_directory=None,
    samples_directory=None,
)

2021-09-20 06:25:34.289 | INFO     | ezancestry.process:_input_to_dataframe:276 - Sample has a valid genotype for 44 out of a possible 55 (80.0%)
2021-09-20 06:25:34.342 | INFO     | ezancestry.process:encode_genotypes:138 - Successfully loaded an encoder from /Users/kevin/.ezancestry/data/models/one_hot_encoder.kidd.bin
2021-09-20 06:25:34.351 | INFO     | ezancestry.dimred:dimensionality_reduction:126 - Successfully loaded a dimensionality reduction model
2021-09-20 06:25:34.356 | INFO     | ezancestry.model:predict_ancestry:94 - Successfully loaded trained knn model: /Users/kevin/.ezancestry/data/models/knn.pca.kidd.population.bin
2021-09-20 06:25:34.403 | INFO     | ezancestry.dimred:dimensionality_reduction:126 - Successfully loaded a dimensionality reduction model
2021-09-20 06:25:34.409 | INFO     | ezancestry.model:predict_ancestry:94 - Successfully loaded trained knn model: /Users/kevin/.ezancestry/data/models/knn.pca.kidd.superpopulation.bin


In [24]:
# Display the prediction table produced from the DataFrame input.
predictions

Unnamed: 0,component1,component2,component3,predicted_population_population,ACB,ASW,BEB,CDX,CEU,CHB,...,PUR,STU,TSI,YRI,predicted_population_superpopulation,AFR,AMR,EAS,EUR,SAS
sample,-0.820187,-2.58022,-0.768882,IBS,0.0,0.0,0.0,0.0,0.089856,0.0,...,0.0,0.0,0.165606,0.0,EUR,0.0,0.0,0.0,1.0,0.0


## or load directly from a file

In [25]:
# Predict ancestry straight from the genome file path instead of a DataFrame.
# NOTE(review): the None arguments presumably fall back to the values in
# ezancestry.config - confirm against ezancestry.commands.predict.
predictions = predict(
    mygenomefile,
    aisnps_set="kidd",
    k=None,
    n_components=None,
    algorithm=None,
    write_predictions=False,
    models_directory=None,
    output_directory=None,
    aisnps_directory=None,
    thousand_genomes_directory=None,
    samples_directory=None,
)

2021-09-20 06:25:36.877 | INFO     | ezancestry.process:_input_to_dataframe:276 - Sample has a valid genotype for 44 out of a possible 55 (80.0%)
2021-09-20 06:25:36.955 | INFO     | ezancestry.process:encode_genotypes:138 - Successfully loaded an encoder from /Users/kevin/.ezancestry/data/models/one_hot_encoder.kidd.bin
2021-09-20 06:25:36.963 | INFO     | ezancestry.dimred:dimensionality_reduction:126 - Successfully loaded a dimensionality reduction model
2021-09-20 06:25:36.971 | INFO     | ezancestry.model:predict_ancestry:94 - Successfully loaded trained knn model: /Users/kevin/.ezancestry/data/models/knn.pca.kidd.population.bin
2021-09-20 06:25:37.001 | INFO     | ezancestry.dimred:dimensionality_reduction:126 - Successfully loaded a dimensionality reduction model
2021-09-20 06:25:37.005 | INFO     | ezancestry.model:predict_ancestry:94 - Successfully loaded trained knn model: /Users/kevin/.ezancestry/data/models/knn.pca.kidd.superpopulation.bin


In [26]:
# Display the prediction table produced from the file-path input.
predictions

Unnamed: 0,component1,component2,component3,predicted_population_population,ACB,ASW,BEB,CDX,CEU,CHB,...,PUR,STU,TSI,YRI,predicted_population_superpopulation,AFR,AMR,EAS,EUR,SAS
sample,-0.820187,-2.58022,-0.768882,IBS,0.0,0.0,0.0,0.0,0.089856,0.0,...,0.0,0.0,0.165606,0.0,EUR,0.0,0.0,0.0,1.0,0.0
