# Using ezancestry as a Python library

In [1]:
from pathlib import Path

from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from snps import SNPs

from ezancestry.commands import predict

# load config variables
from ezancestry.config import aisnps_set as _aisnps_set

# load functions
from ezancestry.fetch import get_thousand_genomes_aisnps
from ezancestry.model import DEFAULT_PIPELINE
from ezancestry.process import process_user_input

### pull aisnps from 1kG

This will query the 1000 Genomes VCF for the Kidd et al. AISNPs and save the results as a .csv when it does not exist on your local disk.  
The function also returns the results as a pandas DataFrame.

Note that this will take a few minutes to run as it is querying the 1000 Genomes VCF.
It will download `.tbi` files for the VCFs.

**optional**
The Kidd and Seldin files are included when you install `ezancestry` and are located in the `ezancestry` directory. You do not need to run the `get_thousand_genomes_aisnps` function if you have already installed `ezancestry` or cloned its repository.

If you want to load the 1000 Genomes AISNPs without downloading anything, you can read the bundled file directly:

```python
import pandas as pd
from ezancestry.config import aisnps_directory as _aisnps_directory

pd.read_csv(f"{_aisnps_directory}/kidd.1kG.csv")
```

In [2]:
# Optional: uncomment to query the 1000 Genomes VCF for a given AISNP set (this takes a few minutes).
# kidd1kg = get_thousand_genomes_aisnps(aisnps_directory=None, aisnps_sets="kidd")
# seldin1kg = get_thousand_genomes_aisnps(aisnps_directory=None, aisnps_sets="seldin")

# If you have a custom AISNPs file named custom.aisnps.txt, the same function can query the 1000 Genomes VCF for it:
# custom1kg = get_thousand_genomes_aisnps(aisnps_directory=None, aisnps_sets="custom")

In [3]:
# When the .csv is already present in aisnps_directory it is loaded from disk,
# so the 1000 Genomes VCF does not have to be queried again.
# NOTE(review): the name kidd1kg assumes the configured aisnps_set is "kidd" — confirm.
kidd1kg = get_thousand_genomes_aisnps(
    aisnps_directory="data/aisnps/",
    aisnps_sets=_aisnps_set,
)

2024-02-21 21:32:20.259 | INFO     | ezancestry.fetch:get_thousand_genomes_aisnps:34 - Loaded: kidd.1kG.csv


In [4]:
# Display the default pipeline imported from ezancestry.model.
DEFAULT_PIPELINE

In [5]:
# Start from a clone so that DEFAULT_PIPELINE itself is left untouched.
better_pipeline = clone(DEFAULT_PIPELINE)

# Swap the final step of the pipeline for a random forest classifier.
forest = RandomForestClassifier(
    n_estimators=1000,
    max_depth=100,
    max_features=1.0,
    random_state=42,
)
better_pipeline.steps[-1] = ("classifier", forest)

# Columns that are excluded from the model input; "superpopulation" is the target.
non_feature_columns = ["sample", "population", "superpopulation", "gender"]
train_features = kidd1kg.drop(columns=non_feature_columns)
train_labels = kidd1kg["superpopulation"]

better_pipeline.fit(train_features, train_labels)

In [6]:
# Predict on the 1000 Genomes frame itself, after removing the columns
# that are not model inputs.
reference_features = kidd1kg.drop(
    columns=["sample", "population", "superpopulation", "gender"]
)
better_pipeline.predict(reference_features)

array(['EUR', 'EUR', 'EUR', ..., 'SAS', 'SAS', 'SAS'], dtype=object)

## Predict ancestry from a genome file

In [7]:
# Path to the raw genotype file to analyse. It is built from the current user's
# home directory so the notebook does not embed a user-specific absolute path;
# adjust the trailing components to point at your own genome file.
# Kept as a plain string so it is passed to the later cells in the same form as before.
mygenomefile = str(Path.home() / "mygenome" / "genome2.txt")

In [8]:
# Predict ancestry for the genome file using the "kidd" AISNP set.
# write_predictions=False: presumably the result is only returned, not written to disk — confirm.
# The *_directory arguments are None; presumably ezancestry falls back to its configured defaults — confirm.
predictions = predict(
    mygenomefile,
    aisnps_set="kidd",
    write_predictions=False,
    models_directory=None,
    output_directory=None,
    aisnps_directory=None,
)

2024-02-21 21:32:33.181 | INFO     | ezancestry.process:_input_to_dataframe:156 - genome2.txt has a valid genotype for 44 out of a possible 55 (80.0%)
2024-02-21 21:32:33.199 | INFO     | ezancestry.model:predict_ancestry:121 - Using user-provided model
2024-02-21 21:32:34.583 | INFO     | ezancestry.process:_input_to_dataframe:156 - genome2.txt has a valid genotype for 44 out of a possible 55 (80.0%)
2024-02-21 21:32:34.601 | INFO     | ezancestry.model:predict_ancestry:121 - Using user-provided model


In [9]:
# Show the predicted superpopulation next to the per-superpopulation score columns
# (presumably class probabilities — confirm against the output of predict).
predictions[["predicted_ancestry_superpopulation", "EUR", "AFR", "AMR", "EAS", "SAS"]]

Unnamed: 0,predicted_ancestry_superpopulation,EUR,AFR,AMR,EAS,SAS
genome2.txt,EUR,0.964286,0.0,0.035714,0.0,0.0


In [10]:
# Reduce the genome file to just the SNPs listed in the chosen AISNPs set.
mygenomedf = process_user_input(
    mygenomefile,
    aisnps_directory="data/aisnps/",
    aisnps_set="kidd",
)

2024-02-21 21:32:35.997 | INFO     | ezancestry.process:_input_to_dataframe:156 - genome2.txt has a valid genotype for 44 out of a possible 55 (80.0%)


In [11]:
# Select the columns the pipeline recorded at fit time, then get the
# per-class probabilities for the user's genome.
fitted_columns = better_pipeline.feature_names_in_
better_pipeline.predict_proba(mygenomedf[fitted_columns])

array([[0.   , 0.129, 0.   , 0.871, 0.   ]])

In [12]:
# Class labels of the fitted pipeline; predict_proba columns follow this order.
better_pipeline.classes_

array(['AFR', 'AMR', 'EAS', 'EUR', 'SAS'], dtype=object)