In [1]:
import pandas as pd
from pathlib import Path

from ast import literal_eval
import re

import myvariant
import time

import numpy as np

In [2]:
DATA_DIRECTORY = Path("/Users/kevin/projects/ezancestry/data/aisnps")


In [3]:
kg = pd.read_csv(DATA_DIRECTORY.joinpath("thousand_genomes.kidd.dataframe.csv"))
# read the header line to get column names
with open(DATA_DIRECTORY.joinpath("kidd.aisnp.1kg.vcf")) as f:
    for line in f:
        if line.startswith("#CHROM"):
            colnames = line.strip().split("\t")
            break
kgvcf = pd.read_csv(DATA_DIRECTORY.joinpath("kidd.aisnp.1kg.vcf"), sep="\t", comment="#", header=None, names=colnames)


In [4]:
# In the original DataFrame, 55 positions (records) and 2513 samples (columns)
kgvcf.shape


(55, 2513)

In [5]:
# process the columns
kgvcf.drop(columns=["QUAL", "FILTER", "INFO", "FORMAT"], inplace=True)
kgvcf.set_index(["#CHROM", "POS", "REF", "ALT"], inplace=True)


In [6]:
# unique values for snps
pd.unique(kgvcf.drop(columns=["ID"]).values.ravel("K"))


array(['0|0', '0|1', '1|1', '1|0'], dtype=object)

In [7]:
# set the index as rsid actuall
kgvcf.rename(columns={"ID": "rsid"}, inplace=True)
kgvcf.set_index(["rsid"], inplace=True)


In [8]:
dragen = pd.read_csv(DATA_DIRECTORY.joinpath("dragen.kidd.dataframe.csv"))


In [9]:
# dragen_index = dragen.set_index(["chrom", "pos", "ref", "alt"]).index
dragen.set_index(["chrom", "pos", "ref", "alt"], inplace=True)


In [10]:
def parse_genotypes(longstr):
    longstr = longstr.replace("gts", "'gts'")
    longstr = longstr.replace("=", ":")
    longstr = re.sub(r"id:([a-zA-Z0-9_.-]*)", r"'id':'\1'", longstr)
    return literal_eval(longstr)


In [11]:
dragen["genotypes"] = dragen["samples"].apply(parse_genotypes)
dragen.drop(columns=["samples"], inplace=True)


In [12]:
# pd.concat(dragen["genotypes"].apply(pd.DataFrame).to_list(), keys=dragen.index).reset_index()
dragen_gts = pd.concat(dragen["genotypes"].apply(pd.DataFrame).to_list(), keys=dragen.index)
dragen_gts = dragen_gts.droplevel(4)


In [13]:
# the index of dragen_gts has unique chrom, pos, ref, alt
dragen_gts.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,id,gts
chrom,pos,ref,alt,Unnamed: 4_level_1,Unnamed: 5_level_1
chr20,63528151,T,C,HG03300,"[1, 1]"
chr20,63528151,T,C,HG03799,"[0, 1]"
chr20,63528151,T,C,HG03190,"[0, 1]"
chr20,63528151,T,C,HG03352,"[1, 1]"
chr20,63528151,T,C,NA20281,"[0, 1]"


In [14]:
def ref_alt_to_gts(row):
    # ref = row["ref"]
    # alt = row["alt"]
    _, _, ref, alt = row.name
    gtsring = ""
    for gt in row["gts"]:
        if gt:
            gtsring += ref
        else:
            gtsring += alt
    return gtsring


In [15]:
# convert the 0,1 to ref alt
dragen_gts["new_gts"] = dragen_gts.apply(ref_alt_to_gts, axis=1)

# pivot
# dragen_gts = dragen_gts.pivot(columns="id", values="new_gts")
dragen_gts.reset_index(inplace=True)
dragen_gts = dragen_gts.pivot(index=['chrom', 'pos', 'ref', 'alt'], columns='id', values='new_gts')
# dragen_gts = dragen_gts.pivot(columns="id", values="gts")


In [16]:
def list_to_string(gt):
    try:
        return "|".join([str(_) for _ in gt])
    except:
        return np.nan


In [17]:
dragen_gts = dragen_gts.applymap(list_to_string)


In [18]:
# snp positions are the records, samples are the columns
dragen_gts.shape


(55, 3202)

In [19]:
mv = myvariant.MyVariantInfo()


In [20]:
def annotate(row):
    chrom = row["chrom"]
    pos = row["pos"]
    ref = row["ref"]
    alt = row["alt"]

    variant = mv.getvariant(f"{chrom}:g.{pos}{ref}>{alt}", assembly="hg38", fields=["dbsnp"])
    time.sleep(0.25)
    return variant["dbsnp"]["rsid"]


In [21]:
dragen_gts_ = dragen_gts.reset_index()
dragen_gts_.head()


id,chrom,pos,ref,alt,HG00096,HG00097,HG00099,HG00100,HG00101,HG00102,...,NA21128,NA21129,NA21130,NA21133,NA21135,NA21137,NA21141,NA21142,NA21143,NA21144
0,chr1,101244007,T,C,,,,,,,...,,,,C|T,,,,,,
1,chr1,151150013,C,T,T|C,C|C,T|C,C|C,T|C,C|C,...,C|C,T|C,C|C,C|C,C|C,C|C,C|C,C|C,C|C,T|C
2,chr1,159204893,T,C,,,,,,,...,,,,,,,,,,
3,chr10,93161308,A,G,G|A,,,G|A,,,...,G|A,,G|A,G|A,G|A,,,G|A,G|A,G|A
4,chr11,61829740,C,T,,,,T|C,,,...,,,,,,,,,,


In [22]:
# now we have the rsid to compare
dragen_gts_["rsid"] = dragen_gts_.apply(annotate, axis=1)


In [23]:
# use the rsid as index
dragen_gts = dragen_gts_.set_index(["rsid"])


In [24]:
def apply_refref(row):
     refref = row["ref"] + "|" + row["ref"]
     return row.fillna(refref)

In [25]:
dragen_gts = dragen_gts.apply(apply_refref, axis=1)

In [26]:
from ezancestry.process import get_1kg_labels

In [27]:
# only train and evaluate on the samples in the 1kg data
dfsamples = get_1kg_labels()

original_samples = set(dfsamples.index)

# make sure the 1kg vcf (DataFrame) only has samples listed in the sample index
kgsamples = set(kgvcf.columns.to_list())
kgsamples = kgsamples.intersection(original_samples)

# inner join the 1kg and dragen samples
dragensamples = set(dragen_gts.columns.to_list())
dragensamples = dragensamples.intersection(kgsamples)

In [28]:
len(dragensamples) == len(kgsamples)

True

In [29]:
dragendf = dragen_gts[dragensamples].T.copy()

In [30]:
dragendf.apply(lambda row: sorted(row.replace("|", "")))

rsid,rs3737576,rs7554936,rs2814778,rs4918664,rs174570,rs1079597,rs2238151,rs671,rs7997709,rs1572018,...,rs16891982,rs7722456,rs192655,rs3823159,rs917115,rs1462906,rs6990312,rs2196051,rs1871534,rs3814134
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HG03625,C|T,C|C,C|T,A|A,C|C,C|C,C|T,A|G,C|C,C|T,...,C|C,C|C,A|G,A|A,C|T,C|T,G|G,A|A,C|C,A|A
HG03436,C|T,C|C,C|T,A|A,C|C,C|C,C|T,A|G,C|C,C|T,...,C|C,C|C,A|G,A|A,C|T,C|T,G|G,A|A,C|C,A|A
HG03867,C|T,C|C,C|T,A|A,C|C,C|C,C|T,A|G,C|C,C|T,...,C|C,C|C,A|G,A|A,C|T,C|T,G|G,A|A,C|C,A|A
HG00553,C|T,C|C,C|T,A|A,C|C,C|C,C|T,A|G,C|C,C|T,...,C|C,C|C,A|G,A|A,C|T,C|T,G|G,A|A,C|C,A|A
HG03166,C|T,C|C,C|T,A|A,C|C,C|C,C|T,A|G,C|C,C|T,...,C|C,C|C,A|G,A|A,C|T,C|T,G|G,A|A,C|C,A|A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HG03451,T|T,T|C,T|T,G|A,T|C,T|C,T|T,G|G,T|C,T|T,...,G|C,T|C,G|G,G|A,T|T,T|T,T|G,G|A,G|C,G|A
HG02111,T|T,T|C,T|T,G|A,T|C,T|C,T|T,G|G,T|C,T|T,...,G|C,T|C,G|G,G|A,T|T,T|T,T|G,G|A,G|C,G|A
HG00150,T|T,T|C,T|T,G|A,T|C,T|C,T|T,G|G,T|C,T|T,...,G|C,T|C,G|G,G|A,T|T,T|T,T|G,G|A,G|C,G|A
NA21098,T|T,T|C,T|T,G|A,T|C,T|C,T|T,G|G,T|C,T|T,...,G|C,T|C,G|G,G|A,T|T,T|T,T|G,G|A,G|C,G|A


In [31]:
# we're actually going to use this DataFrame instead of the vcf because it has the alleles
kgdf = pd.read_csv(DATA_DIRECTORY.joinpath("thousand_genomes.kidd.dataframe.csv"))
kgdf.rename(columns={"Unnamed: 0": "id"}, inplace=True)
kgdf.set_index("id", inplace=True)
kgdf = kgdf.loc[kgsamples].copy()

# to match dragendf
kgdf.columns.name = "rsid"

In [32]:
# replace | with empty
dragendf = dragendf.apply(lambda row: row.str.replace("|", "", regex=False))

In [33]:
kgdf = kgdf.reindex(dfsamples.index)
dragendf = dragendf.reindex(dfsamples.index)

In [34]:
(dragendf.index == kgdf.index).all()

True

# Nested CV

In [58]:
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline


In [36]:
dragendf["rs3737576"].value_counts()

TT    2212
CT     292
Name: rs3737576, dtype: int64

In [37]:
kgdf["rs3737576"].value_counts()

TT    2147
CT     293
CC      64
Name: rs3737576, dtype: int64

In [68]:
dragendf_categories = dragendf.apply(pd.unique).values.tolist()

In [69]:
kgdf_categories = kgdf.apply(pd.unique).drop(["population", "superpopulation", "gender"]).tolist()

In [70]:
kgdf_categories = [a.tolist() for a in kgdf_categories]
dragendf_categories = [a.tolist() for a in dragendf_categories]

In [71]:
categorical_transformer = OneHotEncoder(sparse=False, drop="first")
categorical_columns = dragendf.columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_columns),
    ]
)

In [79]:
# define the pipeline first

pipe = make_pipeline(
    preprocessor,
    PCA(),
    KNeighborsClassifier(),
)

In [80]:
# define the param grid
param_grid = {
    "pca__n_components": [2, 3, 4, 5, 6, 7, 8, 9, 10],
    "kneighborsclassifier__n_neighbors": [3, 5, 7, 9, 11, 15, 21, 25, 51],
    "kneighborsclassifier__weights": ["uniform", "distance"],
    "kneighborsclassifier__algorithm": ["ball_tree", "kd_tree", "brute"],
    "kneighborsclassifier__leaf_size": [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
}

In [87]:
X_dragen = dragendf.copy()
X_kg = kgdf.drop(columns=["population", "superpopulation", "gender"]).copy()
y = dfsamples["superpopulation"].copy()

1kg

In [88]:
NUM_TRIALS = 1

nested_scores = np.zeros(NUM_TRIALS)


for i in range(NUM_TRIALS):

    # Choose cross-validation techniques for the inner and outer loops,
    # independently of the dataset.
    # E.g "GroupKFold", "LeaveOneOut", "LeaveOneGroupOut", etc.
    inner_cv = KFold(n_splits=5, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=i)

    # Nested CV with parameter optimization
    clf = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=inner_cv)
    nested_score = cross_val_score(clf, X=X_kg, y=y, cv=outer_cv)
    nested_scores[i] = nested_score.mean()


Dragen

In [None]:
NUM_TRIALS = 10

nested_scores = np.zeros(NUM_TRIALS)


for i in range(NUM_TRIALS):

    # Choose cross-validation techniques for the inner and outer loops,
    # independently of the dataset.
    # E.g "GroupKFold", "LeaveOneOut", "LeaveOneGroupOut", etc.
    inner_cv = KFold(n_splits=5, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=i)

    # Nested CV with parameter optimization
    clf = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=inner_cv)
    nested_score = cross_val_score(clf, X=X_dragen, y=y, cv=outer_cv)
    nested_scores[i] = nested_score.mean()

In [29]:
# let's compare one sample between Dragen and 1kG
hg00096 = pd.merge(
    dragen_gts["HG00096"], kgvcf["HG00096"], left_index=True, right_index=True, suffixes=("_dragen", "_1kg")
)


In [30]:
hg00096


Unnamed: 0_level_0,HG00096_dragen,HG00096_1kg
rsid,Unnamed: 1_level_1,Unnamed: 2_level_1
rs3737576,0|0,0|0
rs7554936,0|1,0|1
rs2814778,0|0,0|0
rs4918664,0|1,0|1
rs174570,0|0,0|0
rs1079597,0|0,0|0
rs2238151,0|0,0|0
rs671,0|0,0|0
rs7997709,1|1,1|1
rs1572018,1|1,1|1


# Does DRAGEN store 0|0 as nulls?????

What was different between DRAGEN and 1kG for this sample?

In [31]:
hg00096.dropna().loc[hg00096["HG00096_dragen"] != hg00096["HG00096_1kg"]]


Unnamed: 0_level_0,HG00096_dragen,HG00096_1kg
rsid,Unnamed: 1_level_1,Unnamed: 2_level_1
rs260690,0|1,1|0
rs4833103,0|1,1|0
rs192655,0|1,1|0
rs1871534,1|1,0|0


In [29]:
# idk if DRAGEN is phased or not?
# looks like rs1871534 is legit different between the two technologies


# What's the 5fold CV performance between the a model trained on 1kG and DRAGEN?

In [2]:
from scipy import stats

0.16666666666666666

In [15]:
import numpy as np

In [17]:
np.arange(1, 5)

array([1, 2, 3, 4])

In [28]:
d4 = stats.rv_discrete(name="d4", values=([1, 2, 3, 4], [1/4]*4))
d6 = stats.rv_discrete(name="d6", values=([1, 2, 3, 4, 5, 6], [1/6]*6))
d8 = stats.rv_discrete(name="d8", values=([1, 2, 3, 4, 5, 6, 7, 8], [1/8]*8))
d12 = stats.rv_discrete(name="d12", values=([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [1/12]*12))
d20 = stats.rv_discrete(name="d20", values=([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], [1/20]*20))

In [25]:
expected_value, variance = d8.stats()

In [19]:
d6.rvs(size=10)

array([5, 2, 2, 2, 2, 3, 5, 2, 5, 6])