This notebook performs cross-validation to identify the best parameters for a KNN model to predict the genetic ancestry of the 1000 Genomes individuals.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import joblib

# load config variables
from ezancestry.config import aisnps_set as _aisnps_set
from ezancestry.config import models_directory as _models_directory
from ezancestry.config import population_level as _population_level

# load functions
from ezancestry.fetch import get_thousand_genomes_aisnps

In [2]:
!pip install optuna


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
import optuna
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.metrics import accuracy_score, balanced_accuracy_score, top_k_accuracy_score, make_scorer
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# If the file exists in the aisnps_directory, you can load it rather than querying the 1000 Genomes VCF
kidd1kg = get_thousand_genomes_aisnps(aisnps_directory="data/aisnps/", aisnps_sets=_aisnps_set).set_index("sample")
seldin1kg = get_thousand_genomes_aisnps(aisnps_directory="data/aisnps/", aisnps_sets=_aisnps_set).set_index("sample")

2024-02-20 04:36:48.163 | INFO     | ezancestry.fetch:get_thousand_genomes_aisnps:34 - Loaded: kidd.1kG.csv
2024-02-20 04:36:48.177 | INFO     | ezancestry.fetch:get_thousand_genomes_aisnps:34 - Loaded: kidd.1kG.csv


# Kidd AISNPs to predict superpopulation

In [7]:
todrop = ["superpopulation", "population", "gender"]
label = "superpopulation"

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    kidd1kg.drop(columns=todrop),
    kidd1kg[label],
    test_size=0.2,
    random_state=42,
)

In [9]:
def objective(trial):
    weights = trial.suggest_categorical("weights", ["uniform", "distance"])
    n_components = trial.suggest_int("n_components", 3, 50)
    n_neighbors = trial.suggest_int("n_neighbors", 11, 100)

    model = make_pipeline(
        OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int8),
        PCA(n_components=n_components),
        KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, n_jobs=4),
    )

    return cross_val_score(model, X_train, y_train, n_jobs=4, scoring="accuracy", cv=StratifiedKFold(5)).mean()

In [10]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, n_jobs=2)

[I 2024-02-20 04:37:05,505] A new study created in memory with name: no-name-a3f0050b-f579-4d19-82f0-760ab090ee26
[I 2024-02-20 04:37:07,153] Trial 0 finished with value: 0.8986645885286784 and parameters: {'weights': 'uniform', 'n_components': 49, 'n_neighbors': 81}. Best is trial 0 with value: 0.8986645885286784.
[I 2024-02-20 04:37:07,270] Trial 1 finished with value: 0.9216309226932669 and parameters: {'weights': 'uniform', 'n_components': 32, 'n_neighbors': 64}. Best is trial 1 with value: 0.9216309226932669.
[I 2024-02-20 04:37:07,448] Trial 2 finished with value: 0.9076483790523691 and parameters: {'weights': 'distance', 'n_components': 38, 'n_neighbors': 96}. Best is trial 1 with value: 0.9216309226932669.
[I 2024-02-20 04:37:07,628] Trial 3 finished with value: 0.958076059850374 and parameters: {'weights': 'uniform', 'n_components': 10, 'n_neighbors': 13}. Best is trial 3 with value: 0.958076059850374.
[I 2024-02-20 04:37:07,807] Trial 4 finished with value: 0.9660598503740647

In [11]:
model = make_pipeline(
    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    PCA(n_components=study.best_params["n_components"]),
    KNeighborsClassifier(n_neighbors=study.best_params["n_neighbors"], weights=study.best_params["weights"], n_jobs=4),
)

model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_test_pred).round(3)}")
print(f"Balanced accuracy: {balanced_accuracy_score(y_test, y_test_pred).round(3)}")

Accuracy: 0.97
Balanced accuracy: 0.962


In [12]:
# Save the model
model_path = Path(_models_directory) / f"kidd_{label}.pkl"
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)

['/home/vscode/.ezancestry/data/models/kidd_superpopulation.pkl']

# Kidd AISNPs to predict population

In [13]:
label = "population"

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    kidd1kg.drop(columns=todrop),
    kidd1kg[label],
    test_size=0.2,
    random_state=42,
)

In [15]:
topk_scorer = make_scorer(top_k_accuracy_score, needs_proba=True, k=2)

In [16]:
def objective(trial):
    weights = trial.suggest_categorical("weights", ["uniform", "distance"])
    n_components = trial.suggest_int("n_components", 3, 50)
    n_neighbors = trial.suggest_int("n_neighbors", 11, 100)

    model = make_pipeline(
        OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int8),
        PCA(n_components=n_components),
        KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, n_jobs=4),
    )

    return cross_val_score(model, X_train, y_train, n_jobs=4, scoring=topk_scorer, cv=StratifiedKFold(5)).mean()

In [17]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, n_jobs=2)

[I 2024-02-20 04:37:39,644] A new study created in memory with name: no-name-56f1bb87-aeed-4eec-8ab3-2c61f0b0008d
[I 2024-02-20 04:37:39,912] Trial 1 finished with value: 0.4982593516209476 and parameters: {'weights': 'distance', 'n_components': 37, 'n_neighbors': 57}. Best is trial 1 with value: 0.4982593516209476.
[I 2024-02-20 04:37:40,042] Trial 0 finished with value: 0.5092431421446384 and parameters: {'weights': 'distance', 'n_components': 32, 'n_neighbors': 33}. Best is trial 0 with value: 0.5092431421446384.
[I 2024-02-20 04:37:40,185] Trial 2 finished with value: 0.47679551122194513 and parameters: {'weights': 'distance', 'n_components': 43, 'n_neighbors': 30}. Best is trial 0 with value: 0.5092431421446384.
[I 2024-02-20 04:37:40,363] Trial 3 finished with value: 0.5147319201995013 and parameters: {'weights': 'distance', 'n_components': 38, 'n_neighbors': 47}. Best is trial 3 with value: 0.5147319201995013.
[I 2024-02-20 04:37:40,535] Trial 4 finished with value: 0.5227132169

In [18]:
model = make_pipeline(
    OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int8),
    PCA(n_components=study.best_params["n_components"]),
    KNeighborsClassifier(n_neighbors=study.best_params["n_neighbors"], weights=study.best_params["weights"], n_jobs=4),
)

model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_test_pred).round(3)}")
print(f"Balanced accuracy: {balanced_accuracy_score(y_test, y_test_pred).round(3)}")
print(f"Top-2 accuracy: {top_k_accuracy_score(y_test, model.predict_proba(X_test), k=2).round(3)}")
print(f"Top-3 accuracy: {top_k_accuracy_score(y_test, model.predict_proba(X_test), k=3).round(3)}")

Accuracy: 0.337
Balanced accuracy: 0.344
Top-2 accuracy: 0.557
Top-3 accuracy: 0.731


In [19]:
# Save the model
model_path = Path(_models_directory) / f"kidd_{label}.pkl"
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)

['/home/vscode/.ezancestry/data/models/kidd_population.pkl']

# Seldin AISNPs to predict superpopulation

In [20]:
todrop = ["superpopulation", "population", "gender"]
label = "superpopulation"

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    seldin1kg.drop(columns=todrop),
    seldin1kg[label],
    test_size=0.2,
    random_state=42,
)

In [22]:
def objective(trial):
    weights = trial.suggest_categorical("weights", ["uniform", "distance"])
    n_components = trial.suggest_int("n_components", 3, 50)
    n_neighbors = trial.suggest_int("n_neighbors", 11, 100)
    
    model = make_pipeline(
        OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int8),
        PCA(n_components=n_components),
        KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, n_jobs=4),
    )

    return cross_val_score(model, X_train, y_train, n_jobs=4, scoring="accuracy", cv=StratifiedKFold(5)).mean()

In [23]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, n_jobs=2)

[I 2024-02-20 04:38:10,843] A new study created in memory with name: no-name-475bf2a9-0057-45bf-aae8-b7e00f207d3a
[I 2024-02-20 04:38:11,165] Trial 0 finished with value: 0.9306172069825436 and parameters: {'weights': 'uniform', 'n_components': 23, 'n_neighbors': 74}. Best is trial 0 with value: 0.9306172069825436.
[I 2024-02-20 04:38:11,326] Trial 1 finished with value: 0.9281197007481297 and parameters: {'weights': 'uniform', 'n_components': 26, 'n_neighbors': 75}. Best is trial 0 with value: 0.9306172069825436.
[I 2024-02-20 04:38:11,475] Trial 2 finished with value: 0.9645598503740649 and parameters: {'weights': 'uniform', 'n_components': 8, 'n_neighbors': 21}. Best is trial 2 with value: 0.9645598503740649.
[I 2024-02-20 04:38:11,612] Trial 3 finished with value: 0.9535798004987532 and parameters: {'weights': 'distance', 'n_components': 11, 'n_neighbors': 54}. Best is trial 2 with value: 0.9645598503740649.
[I 2024-02-20 04:38:11,774] Trial 4 finished with value: 0.922629675810473

In [24]:
model = make_pipeline(
    OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int8),
    PCA(n_components=study.best_params["n_components"]),
    KNeighborsClassifier(n_neighbors=study.best_params["n_neighbors"], weights=study.best_params["weights"], n_jobs=4),
)

model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_test_pred).round(3)}")
print(f"Balanced accuracy: {balanced_accuracy_score(y_test, y_test_pred).round(3)}")

Accuracy: 0.964
Balanced accuracy: 0.954


In [25]:
# Save the model
model_path = Path(_models_directory) / f"seldin_{label}.pkl"
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)

['/home/vscode/.ezancestry/data/models/seldin_superpopulation.pkl']

# Seldin AISNPs to predict population

In [26]:
label = "population"

In [27]:
X_train, X_test, y_train, y_test = train_test_split(
    seldin1kg.drop(columns=todrop),
    seldin1kg[label],
    test_size=0.2,
    random_state=42,
)

In [28]:
def objective(trial):
    weights = trial.suggest_categorical("weights", ["uniform", "distance"])
    n_components = trial.suggest_int("n_components", 3, 50)
    n_neighbors = trial.suggest_int("n_neighbors", 11, 100)

    model = make_pipeline(
        OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int8),
        PCA(n_components=n_components),
        KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, n_jobs=4),
    )

    return cross_val_score(model, X_train, y_train, n_jobs=4, scoring=topk_scorer, cv=StratifiedKFold(5)).mean()

In [29]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, n_jobs=2)

[I 2024-02-20 04:38:40,790] A new study created in memory with name: no-name-00c591d4-a947-436a-9358-44107747dcae
[I 2024-02-20 04:38:41,071] Trial 0 finished with value: 0.4688004987531172 and parameters: {'weights': 'uniform', 'n_components': 50, 'n_neighbors': 20}. Best is trial 0 with value: 0.4688004987531172.
[I 2024-02-20 04:38:41,257] Trial 1 finished with value: 0.4922693266832917 and parameters: {'weights': 'uniform', 'n_components': 45, 'n_neighbors': 100}. Best is trial 1 with value: 0.4922693266832917.
[I 2024-02-20 04:38:41,386] Trial 2 finished with value: 0.48678304239401493 and parameters: {'weights': 'distance', 'n_components': 33, 'n_neighbors': 27}. Best is trial 1 with value: 0.4922693266832917.
[I 2024-02-20 04:38:41,531] Trial 3 finished with value: 0.4977593516209476 and parameters: {'weights': 'distance', 'n_components': 44, 'n_neighbors': 92}. Best is trial 3 with value: 0.4977593516209476.
[I 2024-02-20 04:38:41,705] Trial 4 finished with value: 0.51522443890

In [30]:
model = make_pipeline(
    OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int8),
    PCA(n_components=study.best_params["n_components"]),
    KNeighborsClassifier(n_neighbors=study.best_params["n_neighbors"], weights=study.best_params["weights"], n_jobs=4),
)

model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_test_pred).round(3)}")
print(f"Balanced accuracy: {balanced_accuracy_score(y_test, y_test_pred).round(3)}")
print(f"Top-2 accuracy: {top_k_accuracy_score(y_test, model.predict_proba(X_test), k=2).round(3)}")
print(f"Top-3 accuracy: {top_k_accuracy_score(y_test, model.predict_proba(X_test), k=3).round(3)}")

Accuracy: 0.313
Balanced accuracy: 0.322
Top-2 accuracy: 0.545
Top-3 accuracy: 0.735


In [31]:
# Save the model
model_path = Path(_models_directory) / f"seldin_{label}.pkl"
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)

['/home/vscode/.ezancestry/data/models/seldin_population.pkl']