Extra credit task: TNBC-specific variants were also identified and written to the file vcf_annot_TNBC.tsv. Among the commonly mutated genes, a large number of additional variants can be seen that are associated with this more aggressive form of breast cancer.

This code identifies the variants that are common to the three breast cancer subtypes (HER2, nTNBC and TNBC).
The VCF and BED file contents are read first; the VCF variants are then annotated with gene symbols, and finally the common variants are found.

In [15]:
import pandas as pd
import numpy as np

In [16]:
#Function to read vcf files
def vcf_read(filename):
    """Read a VCF file into a DataFrame with one row per data line.

    Lines starting with ``##`` are treated as meta-information and skipped.
    The first line starting with a single ``#`` supplies the column names;
    the leading ``#`` is kept (e.g. ``#CHROM``) because downstream code
    selects the column by that exact name.

    Args:
        filename: Path of the tab-separated VCF file.

    Returns:
        A DataFrame holding every line after the header line, with the
        header fields as column names.

    Raises:
        ValueError: If the file has no ``#`` header line, or if the number
            of header fields differs from the number of data columns.
    """
    with open(filename) as vcf:
        vcf_cols = None
        # readline() leaves the handle positioned right after the header,
        # so pandas can read the records from the same handle.
        for line in iter(vcf.readline, ""):
            if line.startswith("#") and not line.startswith("##"):
                # Strip the line ending so the last column name is clean.
                vcf_cols = line.rstrip("\r\n").split("\t")
                break
        if vcf_cols is None:
            raise ValueError(
                "no '#'-prefixed column header line found in " + str(filename)
            )
        # No comment='#' here: pandas would cut a record short at any '#'
        # occurring inside a field (for instance in INFO).
        vcf_contents = pd.read_csv(vcf, sep='\t', header=None)
    vcf_contents.columns = vcf_cols
    return vcf_contents

In [17]:
#Function to load a tab-separated bed file into a DataFrame
def bed_read(filename):
    """Load the tab-separated file at ``filename`` and return it as a DataFrame.

    The first line of the file is used as the column header (pandas default).
    """
    return pd.read_csv(filename, sep="\t")

In [18]:
#Function to annotate vcf files with gene symbols
def annotate_vcf(vcf_contents,bed_contents):
    """Annotate each variant position with the gene whose interval contains it.

    A position matches a BED row when ``START <= POS <= STOP`` (both ends
    inclusive). When several rows match, the first one in BED row order is
    used so that the result is the same on every run. Positions with no
    match are labelled ``"N/A"``.

    NOTE(review): matching uses the position only; the chromosome column is
    not compared. Confirm that the VCF and BED inputs cover the same region.

    Args:
        vcf_contents: DataFrame with at least the columns ``#CHROM``,
            ``POS``, ``REF`` and ``ALT``. A ``GENE_SYMBOL`` column is added
            to this frame in place.
        bed_contents: DataFrame with the columns ``START``, ``STOP`` and
            ``GENESYMBOL``.

    Returns:
        A DataFrame with the columns ``#CHROM``, ``POS``, ``REF``, ``ALT``
        and ``GENE_SYMBOL``.
    """
    # Pull the BED columns out once instead of re-slicing them per variant.
    starts = bed_contents['START'].to_numpy()
    stops = bed_contents['STOP'].to_numpy()
    symbols = bed_contents['GENESYMBOL'].to_numpy()

    all_genes = []
    for pos in vcf_contents['POS']:
        hits = symbols[(pos >= starts) & (pos <= stops)]
        # First hit in BED order: deterministic, unlike indexing into a set.
        all_genes.append(hits[0] if len(hits) else "N/A")

    vcf_contents["GENE_SYMBOL"] = all_genes
    #keeps only the columns needed downstream
    vcf_annot = vcf_contents[["#CHROM","POS","REF","ALT","GENE_SYMBOL"]]
    return vcf_annot

In [19]:
#Function to find common variants
def common_var(vcf_annot_HER2,vcf_annot_nTNBC,vcf_annot_TNBC,
               common_path="common_variants.tsv",
               tnbc_path="vcf_annot_TNBC.tsv"):
    """Find the gene-annotated variants shared by all three subtypes.

    Rows whose ``GENE_SYMBOL`` is ``"N/A"`` are dropped from each input
    first. The remaining frames are inner-merged on every column they have
    in common, so a variant is kept only if an identical row is present in
    all three inputs.

    Args:
        vcf_annot_HER2: Annotated variants for the HER2 subtype.
        vcf_annot_nTNBC: Annotated variants for the nTNBC subtype.
        vcf_annot_TNBC: Annotated variants for the TNBC subtype.
        common_path: Destination of the tab-separated common-variant file.
        tnbc_path: Destination of the tab-separated file holding the
            gene-annotated TNBC variants.

    Returns:
        The DataFrame of common variants that was written to ``common_path``.
    """
    her2 = vcf_annot_HER2[vcf_annot_HER2["GENE_SYMBOL"] != 'N/A']
    ntnbc = vcf_annot_nTNBC[vcf_annot_nTNBC["GENE_SYMBOL"] != 'N/A']
    tnbc = vcf_annot_TNBC[vcf_annot_TNBC["GENE_SYMBOL"] != 'N/A']

    # pd.merge with no `on` joins on all shared column names (inner join).
    shared = pd.merge(her2, pd.merge(ntnbc, tnbc))

    shared.to_csv(common_path,sep='\t',index=False)     #File containing common variants
    tnbc.to_csv(tnbc_path,sep='\t',index=False)         #File containing TNBC variants
    return shared
    

In [20]:
#loading the tab-separated gene interval file used to annotate the variants
bed_contents = bed_read("hg19_genes_chr1p_bed.txt")

In [21]:
#reading each subtype's vcf file into its own DataFrame, named after the file
vcf_contents_HER2 = vcf_read("HER2.vcf")
vcf_contents_nTNBC = vcf_read("nTNBC.vcf")
vcf_contents_TNBC = vcf_read("TNBC.vcf")

In [22]:
#annotating each vcf's contents with gene symbols from bed_contents, creating the annotated DataFrames
vcf_annot_HER2 = annotate_vcf(vcf_contents_HER2,bed_contents)
vcf_annot_nTNBC = annotate_vcf(vcf_contents_nTNBC,bed_contents)
vcf_annot_TNBC = annotate_vcf(vcf_contents_TNBC,bed_contents)

In [23]:
#finding the variants shared by all three subtypes; writes common_variants.tsv and vcf_annot_TNBC.tsv
common_var(vcf_annot_HER2,vcf_annot_nTNBC,vcf_annot_TNBC)