In [1]:
import os
import sys
# Relative prefix used to build every data path in this notebook.
home_dir = "../"

# Put the parent directory on the import path (once) so project modules
# living one level above this notebook can be imported.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import pandas as pd
import xmltodict

In [2]:
def get_filehandle(filepath, colnames_as_tab_sep_str):
    """Create (or overwrite) ``filepath`` and return it opened for writing.

    The header string is written verbatim as the first content of the file,
    so it should already be tab-separated and end with a newline.

    Args:
        filepath: Destination path. Missing parent directories are created.
        colnames_as_tab_sep_str: Header text written before any data rows.

    Returns:
        The open text-mode file object, positioned right after the header.
        The caller is responsible for closing it.

    Raises:
        OSError: If the directory or the file cannot be created.
    """
    parent_dir = os.path.dirname(filepath)
    if parent_dir:  # empty for a bare filename; os.makedirs("") would raise
        os.makedirs(parent_dir, exist_ok=True)

    # Mode "w" truncates an existing file, so no explicit delete is needed.
    fhandle = open(filepath, mode="w")
    try:
        fhandle.write(f"{colnames_as_tab_sep_str}") # write column names
    except Exception:
        # Do not leak the handle if the header cannot be written.
        fhandle.close()
        raise
    return fhandle

    

In [3]:
# Header rows (tab-separated, newline-terminated) for the three output tables.
snps_cols = "SNP_ID\tSNP_CLASS\tCLINICAL_SIGNIFICANCE\tACC\tCHR\tCHRPOS\tSPDI\tFXN_CLASS\tVALIDATED\tTAX_ID\tCREATEDATE\tUPDATEDATE\tDOCSUM\tNP_VARIANTS\tREF_ALLELE\tALT_ALLELES\n"
genes_cols = "SNP_ID\tGENE_NAME\tGENE_ID\n"
global_mafs_cols = "SNP_ID\tSTUDY\tFREQ\tALT_ALLELE\tALT_FREQ\tALT_POPU\n"

# Where each table is written, relative to home_dir.
snps_filepath = home_dir+"data/dbsnp/parsed/snps.tsv"
genes_filepath = home_dir+"data/dbsnp/parsed/genes.tsv"
global_mafs_filepath = home_dir+"data/dbsnp/parsed/global_mafs.tsv"

# Open the three outputs; get_filehandle writes the header into each file.
# The handles stay open so the parsing cells below can append rows to them.
snps_fhandle = get_filehandle(snps_filepath, snps_cols)
genes_fhandle = get_filehandle(genes_filepath, genes_cols)
global_mafs_fhandle = get_filehandle(global_mafs_filepath, global_mafs_cols)

In [4]:
def parse_document_summary(doc_sum):
    """Parse one ``<DocumentSummary>`` XML fragment and append it to the TSV outputs.

    Writes to the module-level handles opened earlier in the notebook:

    * ``snps_fhandle``: one row per call (16 fields, matching ``snps_cols``).
    * ``genes_fhandle``: one row per gene entry (3 fields).
    * ``global_mafs_fhandle``: one row per MAF entry (6 fields).

    Rows are written without a trailing tab, so every data row has exactly as
    many fields as its header. A trailing delimiter adds an empty extra field,
    which makes readers such as ``pandas.read_csv`` treat the first column as
    the index and shift the column names.

    The snps row is assembled completely before it is written, so a failure
    while parsing DOCSUM no longer leaves a partial line in the file.

    Args:
        doc_sum: XML text whose root element is ``DocumentSummary``.

    Raises:
        KeyError: If a required field of the summary is missing.
        IndexError / ValueError: If DOCSUM or a MAF ``FREQ`` value does not
            have the expected delimiter layout.
    """
    snp_dict = xmltodict.parse(doc_sum)["DocumentSummary"]

    snp_id = "rs"+snp_dict["SNP_ID"] # primary and foreign key of the tables

    # Fields copied verbatim into the snps table, in header order.
    # An empty XML element comes back as None and is written as "None",
    # exactly as the original f-string formatting did.
    copied_fields = (
        "SNP_CLASS", "CLINICAL_SIGNIFICANCE", "ACC", "CHR", "CHRPOS", "SPDI",
        "FXN_CLASS", "VALIDATED", "TAX_ID", "CREATEDATE", "UPDATEDATE", "DOCSUM",
    )
    snp_row = [snp_id] + [f'{snp_dict[field]}' for field in copied_fields]

    # DOCSUM is split on "|"; the first two items are used below.
    docsum_items = snp_dict["DOCSUM"].split("|")

    # parsing NP-variants: drop the 5-character prefix of the first item
    # (presumably "HGVS=" -- confirm against the data) and keep the
    # comma-separated entries that start with "NP_".
    hgvs_variants = docsum_items[0][5:].split(",")
    np_variants = ",".join(v for v in hgvs_variants if v.startswith("NP_"))

    # parsing ref and alt alleles: drop a 5-character prefix and the last
    # character of the second item (presumably "SEQ=[" and "]" -- confirm),
    # then split on "/". The first allele is the reference, the rest alternates.
    alleles = docsum_items[1][5:-1].split("/")
    ref_allele = alleles[0]
    alt_alleles = ",".join(alleles[1:])

    snp_row += [np_variants, ref_allele, alt_alleles]
    snps_fhandle.write("\t".join(snp_row) + "\n")

    # parsing genes and putting into another .tsv file
    # A missing or empty <GENES> element simply yields no gene rows.
    genes = (snp_dict.get("GENES") or {}).get("GENE_E") or []
    if not isinstance(genes, list):
        genes = [genes]  # xmltodict returns a single child as a dict, not a list
    for gene_dict in genes:
        gene_row = [snp_id, f'{gene_dict["NAME"]}', f'{gene_dict["GENE_ID"]}']
        genes_fhandle.write("\t".join(gene_row) + "\n")

    # parsing global-mafs and putting into another .tsv file
    # A missing or empty <GLOBAL_MAFS> element simply yields no MAF rows.
    mafs = (snp_dict.get("GLOBAL_MAFS") or {}).get("MAF") or []
    if not isinstance(mafs, list):
        mafs = [mafs]
    for maf_dict in mafs:
        # FREQ is split as "<allele>=<freq>/<population>".
        freq_items = maf_dict["FREQ"].split("=")
        alt_allele = freq_items[0]
        alt_freq, alt_popu = freq_items[1].split("/")

        maf_row = [
            snp_id,
            f'{maf_dict["STUDY"]}',
            f'{maf_dict["FREQ"]}',
            alt_allele,  # col: ALT_ALLELE
            alt_freq,    # col: ALT_FREQ
            alt_popu,    # col: ALT_POPU
        ]
        global_mafs_fhandle.write("\t".join(maf_row) + "\n")

    

In [5]:
# Parse the search-result dump line by line. The loop expects each
# <DocumentSummary> record on its own line; the first record may share a
# line with the opening <ExchangeSet ...> tag.
filepath = home_dir+"data/dbsnp/search_results/6866032_snps.txt"
# filepath = home_dir+"data/dbsnp/snps_search_result_small.xml"
with open(filepath) as f:
    n_snps = 0
    # Iterate over the file object rather than f.readlines(): the lines are
    # the same, but they are read lazily instead of loading the whole
    # multi-million-line file into memory first.
    for i, line in enumerate(f):
        if line.startswith("<?xml") or line.startswith("</ExchangeSet>"):
            continue
        elif line.startswith("<ExchangeSet"):
            # The opening tag may be followed by the first record on the same line.
            start = line.find("<DocumentSummary")
            if start != -1:
                doc_sum = line[start:]
                parse_document_summary(doc_sum)
                n_snps += 1

        elif line.startswith("<DocumentSummary"):
            parse_document_summary(line)
            n_snps += 1
        else:
            # Unexpected content: report how far we got and stop.
            print(n_snps)
            print(line)
            break
        if i%100000==0: print(i) # just to see the progress

n_snps

100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000
4000000
4100000
4200000
4300000
4400000
4500000
4600000
4700000
4800000
4900000
5000000
5100000
5200000
5300000
5400000
5500000
5600000
5700000
5800000
5900000
6000000
6100000
6200000
6300000
6400000
6500000
6600000
6700000
6800000


6866032

In [6]:
# Close the three output tables so everything buffered reaches disk.
# The sizes noted are those recorded for the full run.
for handle in (
    snps_fhandle,         # 7.6GB
    genes_fhandle,        # 213MB
    global_mafs_fhandle,  # 1.1G
):
    handle.close()

In [29]:
# filepath = home_dir+"data/dbsnp/snps_search_result.xml"
# filepath = home_dir+"data/dbsnp/snps_search_result_small.xml"
# with open(filepath) as xml_file:
#     data_dict = xmltodict.parse(xml_file.read())
#     snps = data_dict["ExchangeSet"]["DocumentSummary"]

# print("#-downloaded snps:", len(snps))
# snps[0]