In [10]:
#Import packages
import gzip
import pandas as pd
import numpy as np
from collections import defaultdict
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

In [2]:
#Global vars
#path = "/Users/anat/Documents/Princeton/Research/ExAC/" #Mac path
path = "/home/anat/Research/ExAC/" #Linux path
myFile = "ExAC.r0.3.nonTCGA.sites.vep.vcf.gz"

#Positions in the VCF record
CHROM = 0
POS = 1
ID = 2
REF = 3
ALT = 4
QUAL = 5
FILTER = 6
INFO = 7

#fields for extraction (it's important those names will match *exactly* the dictionary keys in "data_dict")
headlines = ["chrom", "pos", "id", "ref", "alt", "qual", "filter", "AC", "AC_Adj", "AF", "AN", "AN_Adj", "DP", "gene", "conseq", "prot_pos",
            "amino_acids", "codons", "SWISSPROT", "SIFT", "domains", "clin_sig"]

#CSQ positions
#http://useast.ensembl.org/info/docs/tools/vep/vep_formats.html?redirect=no
GENE = 0        #Ensembl stable ID of affected gene
CONSEQ = 3      #consequence type of this variant
PROT_POS = 6    #relative position of amino acid in protein
AMINO_ACIDS = 7 #the change. only given if the variant affects the protein-coding sequence
CODONS = 8      #the alternative codons with the variant base in upper case
SWISSPROT = 20  #UniProtKB/Swiss-Prot identifier of protein product
SIFT = 23       #the SIFT prediction and/or score, with both given as prediction(score)
DOMAINS = 27    #the source and identifer of any overlapping protein domains
GMAF = 30       #Non-reference allele and frequency of existing variant in 1000 Genomes
CLIN_SIG = 37   #Clinical significance of variant from dbSNP: http://varianttools.sourceforge.net/Annotation/DbSNP
                #Variant Clinical Significance, 0 - unknown, 1 -
                #untested, 2 - non-pathogenic, 3 - probable-non-pathogenic, 4 - probable-pathogenic, 
                #5 - pathogenic, 6 - drug-response, 7 - histocompatibility, 255 - other

In [3]:
#A function to find the nth position on a substring x in the string s
def find_nth(s, x, n=0, overlap=False):
    l = 1 if overlap else len(x)
    i = -l
    for c in xrange(n + 1):
        i = s.find(x, i + l)
        if i < 0:
            break
    return i

In [4]:
vcf_file = gzip.open(path+myFile,'r')

In [5]:
#Process meta-data
metadata_dict = {}
data_flag = False
for line in vcf_file:
    if line[0:2] == "##":
        #assign keys according to the format
        key = line[2:line.index('=')]
        if key == "ALT":
            val = dict.fromkeys(["ID","Description"])
        elif key == "FILTER":
            val = dict.fromkeys(["ID","Description"])
        elif key == "FORMAT":
            val = dict.fromkeys(["ID","Number","Type","Description"])
        elif key == "INFO":
            val = dict.fromkeys(["ID","Number", "Type", "Description"])
        elif key == "contig":
            val = dict.fromkeys(["ID","length"])
        elif key == "reference":
            val = dict.fromkeys(["file"])
        #Not processing other metadata types
        else:
            continue
        #fill in the data
        for f in val.keys():
            f_key = line.find(f)
            f_beg = line.find("=", f_key)
            f_end = line.find(",", f_beg)
            if (f_end < 0):
                f_end = line.find(">")
            val[f] = line[f_beg + 1:f_end]
        #Adding to the metadata dictionary
        if not metadata_dict.has_key(key):
            metadata_dict[key] = [val]
        else:
            metadata_dict[key].append(val)
            
    #Processing the data starting the next line
    elif line[0:6] == "#CHROM":
        data_flag = True
        break

In [6]:
#Arrange the INFO metadata to a data-frame
info_df = pd.DataFrame(metadata_dict["INFO"])
info_df = info_df.sort_values("ID")

In [7]:
#A function that saves the data dictionary to DataFrame and then to .csv
def data_to_df_csv(data_dict, headlines, chrom_num):
    #Creating a data_frame from all the parsed values of the chromosome
    df = pd.DataFrame([data_dict[h] for h in headlines])
    df = df.transpose()
    df.columns = headlines
    
    #Saving the df to a file
    df.to_csv(path+"/parsed/parsed_chrom"+chrom_num+".csv", sep='\t')

In [8]:
#Process the data records of the vcf and save each chromosome to a seperate file
chromosome_iter = '1'
data_dict = defaultdict(list)
for line in vcf_file:
    line_parts = line.split("\t")
    
    #If the next line belongs to a different chromosome - saving to file
    if line_parts[CHROM] != chromosome_iter:
        data_to_df_csv(data_dict, headlines, chromosome_iter)
        #Initializing the data dictionary
        data_dict = defaultdict(list)
        print "finished chromosome"+chromosome_iter
        chromosome_iter = line_parts[CHROM]
    
    #Extracting Chromosome number
    data_dict["chrom"].append(line_parts[CHROM])
    
    #Extracting position
    data_dict["pos"].append(int(line_parts[POS]))
    
    #Extracting id
    data_dict["id"].append(line_parts[ID])
    
    #Extracting ref
    data_dict["ref"].append(line_parts[REF])
    
    #Extracting alt
    data_dict["alt"].append(line_parts[ALT])
    
    #Extracting quality
    data_dict["qual"].append(line_parts[QUAL])
    
    #Extracting filter
    data_dict["filter"].append(line_parts[FILTER])
    
    #Extracting fields from the info
    info = line_parts[INFO]
    
    #AC = Allele Count
    AC_beg = info.find("AC=")
    AC_end = info.find(";", AC_beg)
    data_dict["AC"].append(info[AC_beg+3:AC_end])
    
    #AC_adjusted = Adjusted Allele Count
    AC_adj_beg = info.find("AC_Adj=")
    AC_adj_end = info.find(";", AC_adj_beg)
    data_dict["AC_Adj"].append(info[AC_adj_beg+7:AC_adj_end])
    
    #AF = Allele Frequency 
    AF_beg = info.find("AF=")
    AF_end = info.find(";", AF_beg)
    data_dict["AF"].append(info[AF_beg+3:AF_end])
    
    #AN = Allele Number
    AN_beg = info.find("AN=")
    AN_end = info.find(";", AN_beg)
    data_dict["AN"].append(info[AN_beg+3:AN_end])
    
    #AN_adj = Adjusted Allele Number
    AN_adj_beg = info.find("AN_Adj=")
    AN_adj_end = info.find(";", AN_adj_beg)
    data_dict["AN_Adj"].append(info[AN_adj_beg+7:AN_adj_end])
    
    #DP = "Approximate read depth
    DP_beg = info.find("DP=")
    DP_end = info.find(";", DP_beg)
    data_dict["DP"].append(info[DP_beg+3:DP_end])
    
    
    #CSQ = Consequence type as predicted by VEP
    CSQ_beg = info.find("CSQ=")
    CSQ = info[CSQ_beg+3:]
    #Gene
    gene_beg = find_nth(CSQ, "|", GENE)
    gene_end = CSQ.find("|", gene_beg+1)
    data_dict["gene"].append(CSQ[gene_beg+1:gene_end])
    #Consequence
    conseq_beg = find_nth(CSQ, "|", CONSEQ)
    conseq_end = CSQ.find("|", conseq_beg+1)
    data_dict["conseq"].append(CSQ[conseq_beg+1:conseq_end])
    #Protein_pos
    prot_pos_beg = find_nth(CSQ, "|", PROT_POS)
    prot_pos_end = CSQ.find("|", prot_pos_beg+1)
    data_dict["prot_pos"].append(CSQ[prot_pos_beg+1:prot_pos_end])
    #Amino Acids
    aa_beg = find_nth(CSQ, "|", AMINO_ACIDS)
    aa_end = CSQ.find("|", aa_beg+1)
    data_dict["amino_acids"].append(CSQ[aa_beg+1:aa_end])
    #Codons
    codons_beg = find_nth(CSQ, "|", CODONS)
    codons_end = CSQ.find("|", codons_beg+1)
    data_dict["codons"].append(CSQ[codons_beg+1:codons_end])
    #Swissprot
    swiss_beg = find_nth(CSQ, "|", SWISSPROT)
    swiss_end = CSQ.find("|", swiss_beg+1)
    data_dict["SWISSPROT"].append(CSQ[swiss_beg+1:swiss_end])
    #SIFT
    sift_beg = find_nth(CSQ, "|", SIFT)
    sift_end = CSQ.find("|", sift_beg+1)
    data_dict["SIFT"].append(CSQ[sift_beg+1:sift_end])
    #Domains
    domains_beg = find_nth(CSQ, "|", DOMAINS)
    domains_end = CSQ.find("|", domains_beg+1)
    data_dict["domains"].append(CSQ[domains_beg+1:domains_end])
    #clin_sig
    clin_sig_beg = find_nth(CSQ, "|", CLIN_SIG)
    clin_sig_end = CSQ.find("|", clin_sig_beg+1)
    data_dict["clin_sig"].append(CSQ[clin_sig_beg+1:clin_sig_end])

#Saving the data of the last chromosome
data_to_df_csv(data_dict, headlines, chromosome_iter)
print "finished chromosome"+chromosome_iter

finished chromosome1
finished chromosome2
finished chromosome3
finished chromosome4
finished chromosome5
finished chromosome6
finished chromosome7
finished chromosome8
finished chromosome9
finished chromosome10
finished chromosome11
finished chromosome12
finished chromosome13
finished chromosome14
finished chromosome15
finished chromosome16
finished chromosome17
finished chromosome18
finished chromosome19
finished chromosome20
finished chromosome21
finished chromosome22
finished chromosomeX


In [9]:
vcf_file.close()

In [182]:
info_df[info_df['ID'] == "CSQ"]

Unnamed: 0,Description,ID,Number,Type
70,"""Consequence type as predicted by VEP. Format: Allele|Gene|Feature|Feature_type|Consequence|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|ALLELE_NUM|DISTANCE|STRAND|SYMBOL|SYMBOL_SOURCE|HGNC_ID|BIOTYPE|CANONICAL|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|SIFT|PolyPhen|EXON|INTRON|DOMAINS|HGVSc|HGVSp|GMAF|AFR_MAF|AMR_MAF|ASN_MAF|EUR_MAF|AA_MAF|EA_MAF|CLIN_SIG|SOMATIC|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|LoF_info|LoF_flags|LoF_filter|LoF""",CSQ,.,String


In [173]:
info_df = info_df.sort_values("ID")
info_df

Unnamed: 0,Description,ID,Number,Type
0,"""Allele count in genotypes",AC,A,Integer
1,"""African/African American Allele Counts""",AC_AFR,A,Integer
2,"""American Allele Counts""",AC_AMR,A,Integer
3,"""Adjusted Allele Counts""",AC_Adj,A,Integer
4,"""East Asian Allele Counts""",AC_EAS,A,Integer
5,"""Finnish Allele Counts""",AC_FIN,A,Integer
6,"""Adjusted Hemizygous Counts""",AC_Hemi,A,Integer
7,"""Adjusted Heterozygous Counts""",AC_Het,A,Integer
8,"""Adjusted Homozygous Counts""",AC_Hom,A,Integer
9,"""Non-Finnish European Allele Counts""",AC_NFE,A,Integer


In [10]:
pd.options.display.max_colwidth =600