In [2]:
import pandas as pd
from pathlib import Path

from ast import literal_eval
import re

import myvariant
import time

import numpy as np


In [3]:
# Folder holding the AISNP input files that the cells below read.
# NOTE(review): this is an absolute path into one user's home directory; anyone
# running the notebook on another machine has to edit it.
DATA_DIRECTORY = Path("/Users/kevin/projects/ezancestry/data/aisnps")


In [4]:
kg = pd.read_csv(DATA_DIRECTORY.joinpath("thousand_genomes.kidd.dataframe.csv"))

# The column names sit on the "#CHROM" line, which `comment="#"` in the
# read_csv call below discards, so they are extracted by hand first.
kg_vcf_path = DATA_DIRECTORY.joinpath("kidd.aisnp.1kg.vcf")
colnames = None
with open(kg_vcf_path) as f:
    for line in f:
        if line.startswith("#CHROM"):
            colnames = line.strip().split("\t")
            break
# Fail here with a clear message instead of a NameError further down.
if colnames is None:
    raise ValueError(f"No '#CHROM' header line found in {kg_vcf_path}")
kgvcf = pd.read_csv(kg_vcf_path, sep="\t", comment="#", header=None, names=colnames)


In [5]:
# The raw VCF frame has 55 variant records and 2513 columns. Assuming the standard
# nine fixed VCF fields (#CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT) are
# the only non-sample columns, that leaves 2504 per-sample genotype columns.
kgvcf.shape


(55, 2513)

In [6]:
# Discard the VCF fields that are not needed for the comparison, then key every
# record by (chromosome, position, reference allele, alternate allele).
kgvcf = (
    kgvcf
    .drop(columns=["QUAL", "FILTER", "INFO", "FORMAT"])
    .set_index(["#CHROM", "POS", "REF", "ALT"])
)


In [7]:
# List every distinct genotype string in the sample columns; the ID column is
# left out because it holds identifiers rather than calls.
sample_calls = kgvcf.drop(columns=["ID"])
pd.unique(sample_calls.values.ravel("K"))


array(['0|0', '0|1', '1|1', '1|0'], dtype=object)

In [8]:
# Re-key the 1kG frame by rsID: the VCF "ID" column is renamed to "rsid" and
# becomes the index, replacing the coordinate-based index.
kgvcf = kgvcf.rename(columns={"ID": "rsid"}).set_index(["rsid"])


In [9]:
# Load the DRAGEN table from the same data folder.
dragen = pd.read_csv(DATA_DIRECTORY / "dragen.kidd.dataframe.csv")


In [10]:
# Key the DRAGEN records by (chrom, pos, ref, alt).
dragen = dragen.set_index(["chrom", "pos", "ref", "alt"])


In [11]:
def parse_genotypes(longstr):
    """Convert the text of one `samples` cell into Python objects.

    The text is rewritten until it is a valid Python literal: the bare word
    ``gts`` is quoted, every ``=`` becomes ``:``, and each ``id:<token>`` pair
    gets both its key and its value quoted. The result is then evaluated with
    ``ast.literal_eval``.

    Presumably the input looks like ``[{id=HG03300, gts=[1, 1]}, ...]``
    (TODO confirm against the CSV).

    Args:
        longstr: the raw text to parse.

    Returns:
        Whatever Python literal the rewritten text evaluates to.
    """
    with_colons = longstr.replace("gts", "'gts'").replace("=", ":")
    as_literal = re.sub(r"id:([a-zA-Z0-9_.-]*)", r"'id':'\1'", with_colons)
    return literal_eval(as_literal)


In [12]:
# Store the parsed form of the raw `samples` text under `genotypes`, then drop
# the raw text column.
dragen = (
    dragen
    .assign(genotypes=dragen["samples"].apply(parse_genotypes))
    .drop(columns=["samples"])
)


In [13]:
# Turn each variant's parsed records into its own frame, stack the frames under
# that variant's (chrom, pos, ref, alt) key, and drop the innermost index level
# (each frame's own row index).
per_variant_frames = [pd.DataFrame(records) for records in dragen["genotypes"]]
dragen_gts = pd.concat(per_variant_frames, keys=dragen.index).droplevel(4)


In [14]:
# The index levels are (chrom, pos, ref, alt); each variant's key repeats once per
# sample record, as the displayed rows show.
dragen_gts.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,id,gts
chrom,pos,ref,alt,Unnamed: 4_level_1,Unnamed: 5_level_1
chr20,63528151,T,C,HG03300,"[1, 1]"
chr20,63528151,T,C,HG03799,"[0, 1]"
chr20,63528151,T,C,HG03190,"[0, 1]"
chr20,63528151,T,C,HG03352,"[1, 1]"
chr20,63528151,T,C,NA20281,"[0, 1]"


In [15]:
# dragen_gts.loc["chr1", 101244007, "T", "C"]


In [16]:
def ref_alt_to_gts(row):
    """Spell out a row's allele indexes as REF/ALT allele strings.

    Args:
        row: a row whose ``name`` is a ``(chrom, pos, ref, alt)`` tuple and whose
            ``"gts"`` entry is an iterable of allele indexes.

    Returns:
        str: one allele per index, concatenated in order. A falsy index (0)
        gives ``ref`` and a truthy index gives ``alt``.
    """
    _, _, ref, alt = row.name
    # Allele index 0 is the reference allele and 1 is the alternate allele; the
    # previous version had the two branches swapped.
    return "".join(alt if gt else ref for gt in row["gts"])


In [17]:
# Add a `new_gts` column holding each call spelled out as REF/ALT alleles.
dragen_gts = dragen_gts.assign(new_gts=dragen_gts.apply(ref_alt_to_gts, axis=1))

# Reshape to one row per variant and one column per sample id. The pivot keeps the
# numeric `gts` lists; pass values="new_gts" instead to keep the allele strings.
dragen_gts = dragen_gts.pivot(columns="id", values="gts")


In [18]:
def list_to_string(gt):
    """Join allele indexes into a ``"0|1"``-style string.

    Args:
        gt: an iterable of allele indexes, or a non-iterable placeholder
            (presumably the NaN found in empty cells).

    Returns:
        The elements of ``gt`` converted with ``str`` and joined by ``"|"``,
        or ``np.nan`` when ``gt`` cannot be iterated.
    """
    try:
        return "|".join(str(allele) for allele in gt)
    except TypeError:
        # Only "not iterable" is treated as a missing call; a bare `except`
        # would also swallow KeyboardInterrupt and unrelated errors.
        return np.nan


In [19]:
# Convert every cell to a "0|1"-style string; cells that cannot be joined become NaN.
# NOTE(review): newer pandas releases deprecate DataFrame.applymap in favour of
# DataFrame.map — confirm against the pandas version this notebook targets.
dragen_gts = dragen_gts.applymap(list_to_string)


In [20]:
# One row per variant position, one column per sample.
dragen_gts.shape


(55, 3202)

In [21]:
# Client object that `annotate` uses for its variant lookups.
mv = myvariant.MyVariantInfo()


In [22]:
def annotate(row):
    """Look up the dbSNP rsID of one variant through the module-level `mv` client.

    Args:
        row: mapping-like row providing ``"chrom"``, ``"pos"``, ``"ref"`` and
            ``"alt"``.

    Returns:
        The ``rsid`` value from the ``dbsnp`` section of the response, or
        ``np.nan`` when no record comes back or the record has no dbSNP rsID.
    """
    hgvs_id = f"{row['chrom']}:g.{row['pos']}{row['ref']}>{row['alt']}"
    variant = mv.getvariant(hgvs_id, assembly="hg38", fields=["dbsnp"])
    # Pause after every request (presumably to stay gentle on the remote service).
    time.sleep(0.25)
    try:
        return variant["dbsnp"]["rsid"]
    except (TypeError, KeyError):
        # No usable record: return a missing value so one unknown variant does
        # not abort the whole row-wise apply.
        return np.nan


In [23]:
# Move the (chrom, pos, ref, alt) index levels back into ordinary columns so that
# `annotate` can read them from each row.
dragen_gts_ = dragen_gts.reset_index()
dragen_gts_.head()


id,chrom,pos,ref,alt,HG00096,HG00097,HG00099,HG00100,HG00101,HG00102,...,NA21128,NA21129,NA21130,NA21133,NA21135,NA21137,NA21141,NA21142,NA21143,NA21144
0,chr1,101244007,T,C,,,,,,,...,,,,0|1,,,,,,
1,chr1,151150013,C,T,0|1,1|1,0|1,1|1,0|1,1|1,...,1|1,0|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,0|1
2,chr1,159204893,T,C,,,,,,,...,,,,,,,,,,
3,chr10,93161308,A,G,0|1,,,0|1,,,...,0|1,,0|1,0|1,0|1,,,0|1,0|1,0|1
4,chr11,61829740,C,T,,,,0|1,,,...,,,,,,,,,,


In [24]:
# Attach an rsID to every variant so the DRAGEN rows can be matched to the 1kG rows.
# `annotate` makes one lookup per row and sleeps 0.25 s after each one.
dragen_gts_["rsid"] = dragen_gts_.apply(annotate, axis=1)


In [25]:
# Key the DRAGEN table by rsID, the same key `kgvcf` is indexed by.
dragen_gts = dragen_gts_.set_index(["rsid"])


In [26]:
# Line up one sample's calls from both sources by rsID. The join is an inner join,
# so only rsIDs present in both tables are kept.
hg00096 = pd.merge(
    left=dragen_gts["HG00096"],
    right=kgvcf["HG00096"],
    how="inner",
    left_index=True,
    right_index=True,
    suffixes=("_dragen", "_1kg"),
)


In [27]:
# Side-by-side DRAGEN and 1kG calls for HG00096, indexed by rsID.
hg00096

Unnamed: 0_level_0,HG00096_dragen,HG00096_1kg
rsid,Unnamed: 1_level_1,Unnamed: 2_level_1
rs3737576,,0|0
rs7554936,0|1,0|1
rs2814778,,0|0
rs4918664,0|1,0|1
rs174570,,0|0
rs1079597,,0|0
rs2238151,,0|0
rs671,,0|0
rs7997709,1|1,1|1
rs1572018,1|1,1|1


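# Does DRAGEN store 0|0 (homozygous reference) as nulls? In the rows shown above, every site where the DRAGEN call is null is 0|0 in 1kG.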

Where do the DRAGEN and 1kG calls disagree for this sample, among the sites that have a call in both?

In [28]:
# Compare only the sites that have a call in both sources. The mask is built from
# the same filtered frame it is applied to, so no implicit index alignment is needed.
# The string comparison is phase-sensitive: "0|1" and "1|0" count as different.
hg00096_called = hg00096.dropna()
hg00096_called.loc[hg00096_called["HG00096_dragen"] != hg00096_called["HG00096_1kg"]]


Unnamed: 0_level_0,HG00096_dragen,HG00096_1kg
rsid,Unnamed: 1_level_1,Unnamed: 2_level_1
rs260690,0|1,1|0
rs4833103,0|1,1|0
rs192655,0|1,1|0
rs1871534,1|1,0|0


In [29]:
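# It is unclear whether the DRAGEN calls are phased; if they are not, the 0|1 vs 1|0
# rows above are the same genotype and not real discrepancies.
# rs1871534 (1|1 in DRAGEN vs 0|0 in 1kG) looks like a genuine difference between the two technologies.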
