In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Names of the pipe-delimited sub-fields of a VEP annotation, in order.
# They are used as DataFrame column labels once the INFO column is split.
vep_field_spec = """\
Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON\
|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids\
|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|SYMBOL_SOURCE|HGNC_ID|CANONICAL\
|CCDS|HGVS_OFFSET\
"""
# Normalise each name: trim whitespace, then upper-case the first character
# and lower-case the rest (so "HGVSc" becomes "Hgvsc", "HGNC_ID" -> "Hgnc_id").
vep_cols = list(map(str.capitalize, map(str.strip, vep_field_spec.split("|"))))

In [3]:
# Load the VEP-annotated VCF. The first 6 lines are skipped (presumably the
# "##" meta-information lines -- verify against the file) so that the next
# line is used as the header row; only the first 10 columns are read.
vep_df = pd.read_table('./cftr.grch37.vep.vcf', header=0, skiprows=6, usecols=range(10))

# Build one row per annotation:
#   1. pull the value of the ANN key out of INFO (by key, so any other
#      INFO entries cannot leak into the last annotation sub-field),
#   2. split the comma-separated annotations into a list,
#   3. explode the list -- every piece keeps the index label of the VCF row
#      it came from, which is what the join below relies on,
#   4. drop rows that had no ANN value at all.
# This replaces `.apply(pd.Series, 1).stack()`: the positional `1` was being
# passed as `Series.apply`'s deprecated `convert_dtype` argument, and building
# one Series per row is far slower than the vectorised string methods.
info_df = (
    vep_df["INFO"]
    .str.extract(r"(?:^|;)ANN=([^;]*)", expand=False)
    .str.split(",")
    .explode()
    .dropna()
)

# One column per pipe-delimited sub-field. Assigning the labels raises if the
# number of sub-fields does not match the number of expected names.
info_df = info_df.str.split("|", expand=True)
info_df.columns = vep_cols

# Index-on-index join: a VCF row is repeated once per annotation it carries.
vep_df = vep_df.join(info_df)
vep_df = vep_df[['#CHROM', 'POS', 'REF', 'ALT', 'Consequence', 'Impact',
                 'Symbol', 'Gene', 'Feature_type', 'Feature', 'Biotype',
                 'Hgvsc', 'Hgvsp', 'Hgnc_id']]
vep_df.head()

Unnamed: 0,#CHROM,POS,REF,ALT,Consequence,Impact,Symbol,Gene,Feature_type,Feature,Biotype,Hgvsc,Hgvsp,Hgnc_id
0,7,117105737,C,A,upstream_gene_variant,MODIFIER,CFTR,ENSG00000001626,Transcript,ENST00000546407,processed_transcript,,,1884
1,7,117105737,C,G,upstream_gene_variant,MODIFIER,CFTR,ENSG00000001626,Transcript,ENST00000546407,processed_transcript,,,1884
2,7,117105737,C,T,upstream_gene_variant,MODIFIER,CFTR,ENSG00000001626,Transcript,ENST00000546407,processed_transcript,,,1884
3,7,117105737,C,CA,upstream_gene_variant,MODIFIER,CFTR,ENSG00000001626,Transcript,ENST00000546407,processed_transcript,,,1884
4,7,117105737,C,CG,upstream_gene_variant,MODIFIER,CFTR,ENSG00000001626,Transcript,ENST00000546407,processed_transcript,,,1884


In [4]:
# Names of the pipe-delimited sub-fields of a SnpEff annotation, in order.
# They are used as DataFrame column labels once the INFO column is split.
snpeff_field_spec = """\
Allele|Annotation|Putative_impact|Gene_name|Gene_ID|Feature_type|Feature_ID|Transcript_Biotype|Rank|HGVSc|HGVSp\
|cDNA_position|CDS_position|Protein_position|Distance|Errors"""
# Normalise each name: trim whitespace, then upper-case the first character
# and lower-case the rest (so "Gene_ID" becomes "Gene_id", "HGVSc" -> "Hgvsc").
snpeff_cols = list(map(str.capitalize, map(str.strip, snpeff_field_spec.split("|"))))

In [5]:
# Load the SnpEff-annotated VCF. The first 9 lines are skipped (presumably the
# "##" meta-information lines -- verify against the file) so that the next
# line is used as the header row; only the first 10 columns are read.
snpeff_df = pd.read_table('./cftr.grch37.snpeff.vcf', header=0, skiprows=9, usecols=range(10))

# Build one row per annotation:
#   1. pull the value of the ANN key out of INFO. SnpEff writes other INFO
#      entries that are not needed here; extracting by key (instead of taking
#      the first ";"-separated entry) does not depend on ANN coming first,
#   2. split the comma-separated annotations into a list,
#   3. explode the list -- every piece keeps the index label of the VCF row
#      it came from, which is what the join below relies on,
#   4. drop rows that had no ANN value at all.
# This replaces the `.apply(pd.Series, 1)` calls: the positional `1` was being
# passed as `Series.apply`'s deprecated `convert_dtype` argument, and building
# one Series per row is far slower than the vectorised string methods.
info_df = (
    snpeff_df["INFO"]
    .str.extract(r"(?:^|;)ANN=([^;]*)", expand=False)
    .str.split(",")
    .explode()
    .dropna()
)

# One column per pipe-delimited sub-field. Assigning the labels raises if the
# number of sub-fields does not match the number of expected names.
info_df = info_df.str.split("|", expand=True)
info_df.columns = snpeff_cols

# Index-on-index join: a VCF row is repeated once per annotation it carries.
snpeff_df = snpeff_df.join(info_df)
snpeff_df = snpeff_df[['#CHROM', 'POS', 'REF', 'ALT', 'Annotation', 'Putative_impact',
                       'Gene_id', 'Feature_type', 'Feature_id', 'Transcript_biotype',
                       'Hgvsc', 'Hgvsp']]
snpeff_df.head()

Unnamed: 0,#CHROM,POS,REF,ALT,Annotation,Putative_impact,Gene_id,Feature_type,Feature_id,Transcript_biotype,Hgvsc,Hgvsp
0,7,117105737,C,A,upstream_gene_variant,MODIFIER,ENSG00000001626,transcript,ENST00000546407,processed_transcript,n.-101C>A,
0,7,117105737,C,A,intergenic_region,MODIFIER,ENSG00000214684-ENSG00000001626,intergenic_region,ENSG00000214684-ENSG00000001626,,n.117105737C>A,
1,7,117105737,C,G,upstream_gene_variant,MODIFIER,ENSG00000001626,transcript,ENST00000546407,processed_transcript,n.-101C>G,
1,7,117105737,C,G,intergenic_region,MODIFIER,ENSG00000214684-ENSG00000001626,intergenic_region,ENSG00000214684-ENSG00000001626,,n.117105737C>G,
2,7,117105737,C,T,upstream_gene_variant,MODIFIER,ENSG00000001626,transcript,ENST00000546407,processed_transcript,n.-101C>T,
