### Import Libraries

In [1]:
import pandas as pd
from tqdm.notebook import tqdm
import pickle
import io
import os

In [2]:
# Helper Functions
def read_vcf(path):
    with open(path, 'r') as f:
        lines = [l for l in f if not l.startswith('##')]
    return pd.read_csv(
        io.StringIO(''.join(lines)),
        dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
               'QUAL': str, 'FILTER': str, 'INFO': str},
        sep='\t'
    ).rename(columns={'#CHROM': 'CHROM'})

In [3]:
df = read_vcf("datasets/clinvar.vcf")

In [4]:
df

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO
0,1,925952,1019397,G,A,.,.,ALLELEID=1003021;CLNDISDB=MedGen:CN517202;CLND...
1,1,925956,1543320,C,T,.,.,ALLELEID=1632777;CLNDISDB=MedGen:CN517202;CLND...
2,1,925969,1648427,C,T,.,.,ALLELEID=1600580;CLNDISDB=MedGen:CN517202;CLND...
3,1,925976,1362713,T,C,.,.,ALLELEID=1396033;CLNDISDB=MedGen:CN517202;CLND...
4,1,925986,1568423,C,T,.,.,ALLELEID=1570515;CLNDISDB=MedGen:CN517202;CLND...
...,...,...,...,...,...,...,...,...
1469558,MT,16179,1525977,CAA,C,.,.,ALLELEID=1517353;CLNDISDB=Human_Phenotype_Onto...
1469559,MT,16230,1525975,A,G,.,.,ALLELEID=1517319;CLNDISDB=Human_Phenotype_Onto...
1469560,MT,16274,1525974,G,A,.,.,ALLELEID=1517318;CLNDISDB=Human_Phenotype_Onto...
1469561,MT,16319,1526264,G,A,.,.,ALLELEID=1517668;CLNDISDB=Human_Phenotype_Onto...


In [5]:
clinvar_chromosomes = sorted(set(df.CHROM))
clinvar_chromosomes

['1',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '2',
 '20',
 '21',
 '22',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 'MT',
 'NW_009646201.1',
 'X',
 'Y']

In [6]:
disease_variant_reference = {chromosome : set() for chromosome in clinvar_chromosomes}

In [11]:
disease_variant_reference['14']

{69992457,
 105381899,
 102498317,
 69992464,
 64225300,
 105381910,
 64225306,
 105381917,
 102498339,
 50593833,
 20447274,
 20447275,
 64225321,
 64225323,
 64225325,
 64225326,
 105381935,
 105381937,
 105381939,
 64225338,
 64225339,
 58458172,
 50593856,
 50593857,
 64225344,
 50593859,
 64225345,
 64225349,
 50593863,
 64225353,
 50593866,
 50593867,
 50593870,
 105381966,
 20447313,
 64225364,
 64225369,
 50593886,
 50593887,
 50593888,
 50593891,
 64225380,
 50593893,
 50593895,
 64225385,
 64225387,
 24117358,
 64225392,
 64225394,
 64225398,
 64225400,
 105382013,
 105382017,
 105382024,
 105382025,
 105382026,
 105382028,
 105382029,
 105382037,
 64225432,
 64225433,
 64225434,
 64225435,
 64225436,
 64225437,
 64225438,
 91881626,
 105382046,
 105382049,
 64225443,
 72351907,
 105382060,
 20447406,
 64225456,
 21364922,
 105382074,
 105382080,
 64225476,
 64225477,
 69992653,
 102498510,
 64225489,
 64225490,
 64225495,
 64225504,
 50200808,
 64225517,
 64225520,
 64225523

In [7]:
for i in tqdm(range(len(df))):
    chromosome = df["CHROM"][i]
    position = df["POS"][i]
    disease_variant_reference[chromosome].add(position)

  0%|          | 0/1469563 [00:00<?, ?it/s]

In [8]:
#save
with open('datasets/disease_variant_reference.json', 'wb') as fp:
    pickle.dump(disease_variant_reference, fp)