This notebook performs cross-validation to identify the best parameters for a KNN model to predict the genetic ancestry of the 1000 Genomes individuals.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import joblib

# load config variables
from ezancestry.config import aisnps_set as _aisnps_set
from ezancestry.config import models_directory as _models_directory
from ezancestry.config import population_level as _population_level

# load functions
from ezancestry.fetch import get_thousand_genomes_aisnps

In [2]:
!pip install optuna


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
import optuna
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, balanced_accuracy_score, top_k_accuracy_score, make_scorer
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# If the file exists in the aisnps_directory, you can load it rather than querying the 1000 Genomes VCF
kidd1kg = get_thousand_genomes_aisnps(aisnps_directory="data/aisnps/", aisnps_sets="kidd").set_index("sample")
seldin1kg = get_thousand_genomes_aisnps(aisnps_directory="data/aisnps/", aisnps_sets="seldin").set_index("sample")

2024-02-21 03:04:26.051 | INFO     | ezancestry.fetch:get_thousand_genomes_aisnps:34 - Loaded: kidd.1kG.csv
2024-02-21 03:04:26.085 | INFO     | ezancestry.fetch:get_thousand_genomes_aisnps:34 - Loaded: seldin.1kG.csv


# Kidd AISNPs to predict superpopulation

In [5]:
todrop = ["superpopulation", "population", "gender"]
label = "superpopulation"

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    kidd1kg.drop(columns=todrop),
    kidd1kg[label],
    test_size=0.2,
    random_state=42,
)

In [7]:
def objective(trial):
    weights = trial.suggest_categorical("weights", ["uniform", "distance"])
    n_components = trial.suggest_int("n_components", 3, 50)
    n_neighbors = trial.suggest_int("n_neighbors", 11, 100)

    model = make_pipeline(
        OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int8),
        PCA(n_components=n_components),
        KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, n_jobs=4),
    )

    return cross_val_score(model, X_train, y_train, n_jobs=4, scoring="accuracy", cv=StratifiedKFold(5)).mean()

In [8]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, n_jobs=2)

[I 2024-02-21 03:04:26,208] A new study created in memory with name: no-name-923b4518-eb9a-46b5-9210-0191a42fdc05
[I 2024-02-21 03:04:28,046] Trial 1 finished with value: 0.9590723192019951 and parameters: {'weights': 'uniform', 'n_components': 8, 'n_neighbors': 44}. Best is trial 1 with value: 0.9590723192019951.
[I 2024-02-21 03:04:28,240] Trial 0 finished with value: 0.908149625935162 and parameters: {'weights': 'uniform', 'n_components': 50, 'n_neighbors': 45}. Best is trial 1 with value: 0.9590723192019951.
[I 2024-02-21 03:04:28,390] Trial 2 finished with value: 0.9151408977556109 and parameters: {'weights': 'uniform', 'n_components': 38, 'n_neighbors': 68}. Best is trial 1 with value: 0.9590723192019951.
[I 2024-02-21 03:04:28,580] Trial 3 finished with value: 0.9645598503740649 and parameters: {'weights': 'distance', 'n_components': 4, 'n_neighbors': 94}. Best is trial 3 with value: 0.9645598503740649.
[I 2024-02-21 03:04:28,676] Trial 4 finished with value: 0.9366059850374064 

In [9]:
model = make_pipeline(
    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    PCA(n_components=study.best_params["n_components"]),
    KNeighborsClassifier(n_neighbors=study.best_params["n_neighbors"], weights=study.best_params["weights"], n_jobs=4),
)

model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_test_pred).round(3)}")
print(f"Balanced accuracy: {balanced_accuracy_score(y_test, y_test_pred).round(3)}")

Accuracy: 0.964
Balanced accuracy: 0.954


In [10]:
# Save the model
model_path = Path(_models_directory) / f"kidd_{label}.pkl"
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)

['/home/vscode/.ezancestry/data/models/kidd_superpopulation.pkl']

# Kidd AISNPs to predict population

In [11]:
label = "population"

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    kidd1kg.drop(columns=todrop),
    kidd1kg[label],
    test_size=0.2,
    random_state=42,
)

In [13]:
topk_scorer = make_scorer(top_k_accuracy_score, needs_proba=True, k=2)

In [14]:
def objective(trial):
    weights = trial.suggest_categorical("weights", ["uniform", "distance"])
    n_components = trial.suggest_int("n_components", 3, 50)
    n_neighbors = trial.suggest_int("n_neighbors", 11, 100)

    model = make_pipeline(
        OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int8),
        PCA(n_components=n_components),
        KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, n_jobs=4),
    )

    return cross_val_score(model, X_train, y_train, n_jobs=4, scoring=topk_scorer, cv=StratifiedKFold(5)).mean()

In [15]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, n_jobs=2)

[I 2024-02-21 03:04:44,326] A new study created in memory with name: no-name-6902485b-1095-4caf-8b16-6e18396ef7f5
[I 2024-02-21 03:04:44,759] Trial 0 finished with value: 0.5307157107231921 and parameters: {'weights': 'distance', 'n_components': 10, 'n_neighbors': 70}. Best is trial 0 with value: 0.5307157107231921.
[I 2024-02-21 03:04:44,943] Trial 1 finished with value: 0.5102456359102245 and parameters: {'weights': 'uniform', 'n_components': 26, 'n_neighbors': 58}. Best is trial 0 with value: 0.5307157107231921.
[I 2024-02-21 03:04:45,157] Trial 2 finished with value: 0.4937643391521197 and parameters: {'weights': 'distance', 'n_components': 18, 'n_neighbors': 13}. Best is trial 0 with value: 0.5307157107231921.
[I 2024-02-21 03:04:45,328] Trial 3 finished with value: 0.48927182044887785 and parameters: {'weights': 'uniform', 'n_components': 29, 'n_neighbors': 22}. Best is trial 0 with value: 0.5307157107231921.
[I 2024-02-21 03:04:45,494] Trial 4 finished with value: 0.501773067331

In [16]:
model = make_pipeline(
    OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int8),
    PCA(n_components=study.best_params["n_components"]),
    KNeighborsClassifier(n_neighbors=study.best_params["n_neighbors"], weights=study.best_params["weights"], n_jobs=4),
)

model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_test_pred).round(3)}")
print(f"Balanced accuracy: {balanced_accuracy_score(y_test, y_test_pred).round(3)}")
print(f"Top-2 accuracy: {top_k_accuracy_score(y_test, model.predict_proba(X_test), k=2).round(3)}")
print(f"Top-3 accuracy: {top_k_accuracy_score(y_test, model.predict_proba(X_test), k=3).round(3)}")

Accuracy: 0.339
Balanced accuracy: 0.347
Top-2 accuracy: 0.551
Top-3 accuracy: 0.727


In [17]:
# Save the model
model_path = Path(_models_directory) / f"kidd_{label}.pkl"
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)

['/home/vscode/.ezancestry/data/models/kidd_population.pkl']

# Seldin AISNPs to predict superpopulation

In [18]:
todrop = ["superpopulation", "population", "gender"]
label = "superpopulation"

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    seldin1kg.drop(columns=todrop),
    seldin1kg[label],
    test_size=0.2,
    random_state=42,
)

In [20]:
def objective(trial):
    weights = trial.suggest_categorical("weights", ["uniform", "distance"])
    n_components = trial.suggest_int("n_components", 3, 50)
    n_neighbors = trial.suggest_int("n_neighbors", 11, 100)
    
    model = make_pipeline(
        OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int8),
        PCA(n_components=n_components),
        KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, n_jobs=4),
    )

    return cross_val_score(model, X_train, y_train, n_jobs=4, scoring="accuracy", cv=StratifiedKFold(5)).mean()

In [21]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, n_jobs=2)

[I 2024-02-21 03:05:00,794] A new study created in memory with name: no-name-b670e5e7-4a67-41af-b505-fb09a3a6b22a
[I 2024-02-21 03:05:01,610] Trial 1 finished with value: 0.9116496259351621 and parameters: {'weights': 'distance', 'n_components': 33, 'n_neighbors': 97}. Best is trial 1 with value: 0.9116496259351621.
[I 2024-02-21 03:05:01,681] Trial 0 finished with value: 0.9251284289276807 and parameters: {'weights': 'distance', 'n_components': 28, 'n_neighbors': 64}. Best is trial 0 with value: 0.9251284289276807.
[I 2024-02-21 03:05:02,135] Trial 2 finished with value: 0.9650635910224439 and parameters: {'weights': 'uniform', 'n_components': 5, 'n_neighbors': 80}. Best is trial 2 with value: 0.9650635910224439.
[I 2024-02-21 03:05:02,420] Trial 3 finished with value: 0.9540785536159602 and parameters: {'weights': 'distance', 'n_components': 14, 'n_neighbors': 52}. Best is trial 2 with value: 0.9650635910224439.
[I 2024-02-21 03:05:02,695] Trial 4 finished with value: 0.9366072319201

In [22]:
model = make_pipeline(
    OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int8),
    PCA(n_components=study.best_params["n_components"]),
    KNeighborsClassifier(n_neighbors=study.best_params["n_neighbors"], weights=study.best_params["weights"], n_jobs=4),
)

model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_test_pred).round(3)}")
print(f"Balanced accuracy: {balanced_accuracy_score(y_test, y_test_pred).round(3)}")

Accuracy: 0.984
Balanced accuracy: 0.981


In [23]:
# Save the model
model_path = Path(_models_directory) / f"seldin_{label}.pkl"
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)

['/home/vscode/.ezancestry/data/models/seldin_superpopulation.pkl']

# Seldin AISNPs to predict population

In [24]:
label = "population"

In [25]:
X_train, X_test, y_train, y_test = train_test_split(
    seldin1kg.drop(columns=todrop),
    seldin1kg[label],
    test_size=0.2,
    random_state=42,
)

In [26]:
def objective(trial):
    weights = trial.suggest_categorical("weights", ["uniform", "distance"])
    n_components = trial.suggest_int("n_components", 3, 50)
    n_neighbors = trial.suggest_int("n_neighbors", 11, 100)

    model = make_pipeline(
        OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int8),
        PCA(n_components=n_components),
        KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, n_jobs=4),
    )

    return cross_val_score(model, X_train, y_train, n_jobs=4, scoring=topk_scorer, cv=StratifiedKFold(5)).mean()

In [27]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, n_jobs=2)

[I 2024-02-21 03:05:31,351] A new study created in memory with name: no-name-0ac79344-57d6-49a7-ac08-2b614e1c4fb7
[I 2024-02-21 03:05:32,088] Trial 0 finished with value: 0.49925062344139653 and parameters: {'weights': 'distance', 'n_components': 33, 'n_neighbors': 83}. Best is trial 0 with value: 0.49925062344139653.
[I 2024-02-21 03:05:32,329] Trial 1 finished with value: 0.5496845386533666 and parameters: {'weights': 'uniform', 'n_components': 4, 'n_neighbors': 33}. Best is trial 1 with value: 0.5496845386533666.
[I 2024-02-21 03:05:32,645] Trial 2 finished with value: 0.4713017456359102 and parameters: {'weights': 'distance', 'n_components': 42, 'n_neighbors': 85}. Best is trial 1 with value: 0.5496845386533666.
[I 2024-02-21 03:05:33,051] Trial 3 finished with value: 0.4793017456359102 and parameters: {'weights': 'uniform', 'n_components': 48, 'n_neighbors': 64}. Best is trial 1 with value: 0.5496845386533666.
[I 2024-02-21 03:05:33,294] Trial 4 finished with value: 0.521210723192

In [28]:
model = make_pipeline(
    OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.int8),
    PCA(n_components=study.best_params["n_components"]),
    KNeighborsClassifier(n_neighbors=study.best_params["n_neighbors"], weights=study.best_params["weights"], n_jobs=4),
)

model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_test_pred).round(3)}")
print(f"Balanced accuracy: {balanced_accuracy_score(y_test, y_test_pred).round(3)}")
print(f"Top-2 accuracy: {top_k_accuracy_score(y_test, model.predict_proba(X_test), k=2).round(3)}")
print(f"Top-3 accuracy: {top_k_accuracy_score(y_test, model.predict_proba(X_test), k=3).round(3)}")

Accuracy: 0.345
Balanced accuracy: 0.353
Top-2 accuracy: 0.595
Top-3 accuracy: 0.752


In [29]:
# Save the model
model_path = Path(_models_directory) / f"seldin_{label}.pkl"
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)

['/home/vscode/.ezancestry/data/models/seldin_population.pkl']