This notebook performs cross-validation to identify the best parameters for a KNN model to predict the genetic ancestry of the 1000 Genomes individuals.

In [48]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import joblib

# load config variables
from ezancestry.config import aisnps_directory as _aisnps_directory
from ezancestry.config import aisnps_set as _aisnps_set
from ezancestry.config import models_directory as _models_directory
from ezancestry.config import population_level as _population_level
from ezancestry.evaluate import export_performance

# load functions
from ezancestry.fetch import get_thousand_genomes_aisnps
from ezancestry.model import DEFAULT_PIPELINE, predict_ancestry, train

In [3]:
!pip install optuna


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [56]:
import optuna
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score, balanced_accuracy_score, top_k_accuracy_score, make_scorer
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

In [17]:
candidate_pipeline = make_pipeline(
    OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int8),
    KNNImputer(n_neighbors=7),
    PCA(n_components=10),
    KNeighborsClassifier(n_neighbors=11, weights="distance", n_jobs=4),
)

In [18]:
# If the file exists in the aisnps_directory, you can load it rather than querying the 1000 Genomes VCF
kidd1kg = get_thousand_genomes_aisnps(aisnps_directory="data/aisnps/", aisnps_sets=_aisnps_set)
seldin1kg = get_thousand_genomes_aisnps(aisnps_directory="data/aisnps/", aisnps_sets=_aisnps_set)

2024-02-14 03:17:56.269 | INFO     | ezancestry.fetch:get_thousand_genomes_aisnps:34 - Loaded: kidd.1kG.csv
2024-02-14 03:17:56.283 | INFO     | ezancestry.fetch:get_thousand_genomes_aisnps:34 - Loaded: kidd.1kG.csv


# Kidd AISNPs to predict superpopulation

In [26]:
todrop = ["superpopulation", "population", "gender"]
label = "superpopulation"

In [40]:
X_train, X_test, y_train, y_test = train_test_split(
    kidd1kg.drop(columns=todrop),
    kidd1kg[label],
    test_size=0.2,
    random_state=42,
)

In [33]:
def objective(trial):
    weights = trial.suggest_categorical("weights", ["uniform", "distance"])
    n_components = trial.suggest_int("n_components", 3, 50)
    n_neighbors = trial.suggest_int("n_neighbors", 11, 100)

    model = make_pipeline(
        OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int8),
        KNNImputer(n_neighbors=7),
        PCA(n_components=n_components),
        KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, n_jobs=4),
    )

    return cross_val_score(model, X_train, y_train, n_jobs=4, scoring="accuracy", cv=StratifiedKFold(5)).mean()

In [34]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, n_jobs=2)

[I 2024-02-14 03:30:24,511] A new study created in memory with name: no-name-dbe49198-8d1a-444e-9d13-ac92e7971897
[I 2024-02-14 03:30:26,781] Trial 0 finished with value: 0.9286209476309226 and parameters: {'weights': 'distance', 'n_components': 28, 'n_neighbors': 51}. Best is trial 0 with value: 0.9286209476309226.
[I 2024-02-14 03:30:27,320] Trial 1 finished with value: 0.960568578553616 and parameters: {'weights': 'distance', 'n_components': 7, 'n_neighbors': 65}. Best is trial 1 with value: 0.960568578553616.
[I 2024-02-14 03:30:28,074] Trial 2 finished with value: 0.9056546134663341 and parameters: {'weights': 'distance', 'n_components': 50, 'n_neighbors': 52}. Best is trial 1 with value: 0.960568578553616.
[I 2024-02-14 03:30:28,635] Trial 3 finished with value: 0.9645598503740649 and parameters: {'weights': 'distance', 'n_components': 4, 'n_neighbors': 74}. Best is trial 3 with value: 0.9645598503740649.
[I 2024-02-14 03:30:29,272] Trial 4 finished with value: 0.9181346633416458

In [47]:
model = make_pipeline(
    OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int8),
    KNNImputer(n_neighbors=7),
    PCA(n_components=study.best_params["n_components"]),
    KNeighborsClassifier(n_neighbors=study.best_params["n_neighbors"], weights=study.best_params["weights"], n_jobs=4),
)

model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_test_pred).round(3)}")
print(f"Balanced accuracy: {balanced_accuracy_score(y_test, y_test_pred).round(3)}")

Accuracy: 0.966
Balanced accuracy: 0.957


In [49]:
# Save the model
model_path = Path(_models_directory) / f"kidd_{_population_level}.joblib"
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)

['/home/vscode/.ezancestry/data/models/kidd_superpopulation.joblib']

# Kidd AISNPs to predict population

In [50]:
label = "population"

In [51]:
X_train, X_test, y_train, y_test = train_test_split(
    kidd1kg.drop(columns=todrop),
    kidd1kg[label],
    test_size=0.2,
    random_state=42,
)

In [57]:
topk_scorer = make_scorer(top_k_accuracy_score, needs_proba=True, k=2)

In [58]:
def objective(trial):
    weights = trial.suggest_categorical("weights", ["uniform", "distance"])
    n_components = trial.suggest_int("n_components", 3, 50)
    n_neighbors = trial.suggest_int("n_neighbors", 11, 100)

    model = make_pipeline(
        OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int8),
        KNNImputer(n_neighbors=7),
        PCA(n_components=n_components),
        KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, n_jobs=4),
    )

    return cross_val_score(model, X_train, y_train, n_jobs=4, scoring=topk_scorer, cv=StratifiedKFold(5)).mean()

In [59]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, n_jobs=2)

[I 2024-02-14 03:40:44,170] A new study created in memory with name: no-name-044059a7-dcc2-44a8-a189-a6da0b40bd1f
[I 2024-02-14 03:40:44,942] Trial 0 finished with value: 0.5122369077306733 and parameters: {'weights': 'distance', 'n_components': 27, 'n_neighbors': 46}. Best is trial 0 with value: 0.5122369077306733.
[I 2024-02-14 03:40:45,263] Trial 1 finished with value: 0.5102306733167083 and parameters: {'weights': 'distance', 'n_components': 10, 'n_neighbors': 72}. Best is trial 0 with value: 0.5122369077306733.
[I 2024-02-14 03:40:45,861] Trial 2 finished with value: 0.4957643391521197 and parameters: {'weights': 'uniform', 'n_components': 46, 'n_neighbors': 88}. Best is trial 0 with value: 0.5122369077306733.
[I 2024-02-14 03:40:46,351] Trial 3 finished with value: 0.49276309226932663 and parameters: {'weights': 'uniform', 'n_components': 40, 'n_neighbors': 45}. Best is trial 0 with value: 0.5122369077306733.
[I 2024-02-14 03:40:46,840] Trial 4 finished with value: 0.496259351620

In [60]:
model = make_pipeline(
    OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int8),
    KNNImputer(n_neighbors=7),
    PCA(n_components=study.best_params["n_components"]),
    KNeighborsClassifier(n_neighbors=study.best_params["n_neighbors"], weights=study.best_params["weights"], n_jobs=4),
)

model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_test_pred).round(3)}")
print(f"Balanced accuracy: {balanced_accuracy_score(y_test, y_test_pred).round(3)}")
print(f"Top-2 accuracy: {top_k_accuracy_score(y_test, model.predict_proba(X_test), k=2).round(3)}")
print(f"Top-3 accuracy: {top_k_accuracy_score(y_test, model.predict_proba(X_test), k=3).round(3)}")

Accuracy: 0.327
Balanced accuracy: 0.33
Top-2 accuracy: 0.551
Top-3 accuracy: 0.727


In [61]:
# Save the model
model_path = Path(_models_directory) / f"kidd_{_population_level}.joblib"
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)

['/home/vscode/.ezancestry/data/models/kidd_superpopulation.joblib']

# Seldin AISNPs to predict superpopulation

In [63]:
todrop = ["superpopulation", "population", "gender"]
label = "superpopulation"

In [64]:
X_train, X_test, y_train, y_test = train_test_split(
    seldin1kg.drop(columns=todrop),
    seldin1kg[label],
    test_size=0.2,
    random_state=42,
)

In [65]:
def objective(trial):
    weights = trial.suggest_categorical("weights", ["uniform", "distance"])
    n_components = trial.suggest_int("n_components", 3, 50)
    n_neighbors = trial.suggest_int("n_neighbors", 11, 100)

    model = make_pipeline(
        OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int8),
        KNNImputer(n_neighbors=7),
        PCA(n_components=n_components),
        KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, n_jobs=4),
    )

    return cross_val_score(model, X_train, y_train, n_jobs=4, scoring="accuracy", cv=StratifiedKFold(5)).mean()

In [66]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, n_jobs=2)

[I 2024-02-14 03:43:05,460] A new study created in memory with name: no-name-629a589d-309a-44b1-b94b-c300a27b5e73
[I 2024-02-14 03:43:06,407] Trial 1 finished with value: 0.9026596009975062 and parameters: {'weights': 'distance', 'n_components': 46, 'n_neighbors': 95}. Best is trial 1 with value: 0.9026596009975062.
[I 2024-02-14 03:43:06,723] Trial 0 finished with value: 0.9445960099750623 and parameters: {'weights': 'distance', 'n_components': 13, 'n_neighbors': 85}. Best is trial 0 with value: 0.9445960099750623.
[I 2024-02-14 03:43:07,203] Trial 2 finished with value: 0.9216334164588528 and parameters: {'weights': 'uniform', 'n_components': 38, 'n_neighbors': 28}. Best is trial 0 with value: 0.9445960099750623.
[I 2024-02-14 03:43:07,578] Trial 3 finished with value: 0.9465910224438904 and parameters: {'weights': 'distance', 'n_components': 13, 'n_neighbors': 78}. Best is trial 3 with value: 0.9465910224438904.
[I 2024-02-14 03:43:08,144] Trial 4 finished with value: 0.921628428927

In [67]:
model = make_pipeline(
    OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int8),
    KNNImputer(n_neighbors=7),
    PCA(n_components=study.best_params["n_components"]),
    KNeighborsClassifier(n_neighbors=study.best_params["n_neighbors"], weights=study.best_params["weights"], n_jobs=4),
)

model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_test_pred).round(3)}")
print(f"Balanced accuracy: {balanced_accuracy_score(y_test, y_test_pred).round(3)}")

Accuracy: 0.964
Balanced accuracy: 0.954


In [68]:
# Save the model
model_path = Path(_models_directory) / f"seldin_{_population_level}.joblib"
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)

['/home/vscode/.ezancestry/data/models/seldin_superpopulation.joblib']

# Seldin AISNPs to predict population

In [70]:
label = "population"

In [71]:
X_train, X_test, y_train, y_test = train_test_split(
    seldin1kg.drop(columns=todrop),
    seldin1kg[label],
    test_size=0.2,
    random_state=42,
)

In [72]:
def objective(trial):
    weights = trial.suggest_categorical("weights", ["uniform", "distance"])
    n_components = trial.suggest_int("n_components", 3, 50)
    n_neighbors = trial.suggest_int("n_neighbors", 11, 100)

    model = make_pipeline(
        OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int8),
        KNNImputer(n_neighbors=7),
        PCA(n_components=n_components),
        KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, n_jobs=4),
    )

    return cross_val_score(model, X_train, y_train, n_jobs=4, scoring=topk_scorer, cv=StratifiedKFold(5)).mean()

In [73]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, n_jobs=2)

[I 2024-02-14 03:46:26,674] A new study created in memory with name: no-name-4a4a01ac-047e-4e42-b8f3-423100cc6c8a
[I 2024-02-14 03:46:27,409] Trial 0 finished with value: 0.519221945137157 and parameters: {'weights': 'uniform', 'n_components': 16, 'n_neighbors': 86}. Best is trial 0 with value: 0.519221945137157.
[I 2024-02-14 03:46:28,077] Trial 1 finished with value: 0.4932693266832917 and parameters: {'weights': 'uniform', 'n_components': 47, 'n_neighbors': 65}. Best is trial 0 with value: 0.519221945137157.
[I 2024-02-14 03:46:28,582] Trial 2 finished with value: 0.48678428927680795 and parameters: {'weights': 'uniform', 'n_components': 48, 'n_neighbors': 89}. Best is trial 0 with value: 0.519221945137157.
[I 2024-02-14 03:46:29,016] Trial 3 finished with value: 0.49026683291770573 and parameters: {'weights': 'uniform', 'n_components': 32, 'n_neighbors': 97}. Best is trial 0 with value: 0.519221945137157.
[I 2024-02-14 03:46:29,455] Trial 4 finished with value: 0.49475935162094764 

In [74]:
model = make_pipeline(
    OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int8),
    KNNImputer(n_neighbors=7),
    PCA(n_components=study.best_params["n_components"]),
    KNeighborsClassifier(n_neighbors=study.best_params["n_neighbors"], weights=study.best_params["weights"], n_jobs=4),
)

model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_test_pred).round(3)}")
print(f"Balanced accuracy: {balanced_accuracy_score(y_test, y_test_pred).round(3)}")
print(f"Top-2 accuracy: {top_k_accuracy_score(y_test, model.predict_proba(X_test), k=2).round(3)}")
print(f"Top-3 accuracy: {top_k_accuracy_score(y_test, model.predict_proba(X_test), k=3).round(3)}")

Accuracy: 0.321
Balanced accuracy: 0.33
Top-2 accuracy: 0.545
Top-3 accuracy: 0.729


In [75]:
# Save the model
model_path = Path(_models_directory) / f"seldin_{_population_level}.joblib"
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)

['/home/vscode/.ezancestry/data/models/seldin_superpopulation.joblib']