In [None]:
from Bio import SeqIO
from collections import Counter
import pandas as pd
import os, glob
import numpy as np
import csv
import re

# Input for specific gene

In [None]:
# Configuration cell: asks for a gene symbol and derives every input/output
# path used by the later cells from it.
# Surrounding whitespace is dropped so it cannot leak into file names or
# break the symbol lookup below.
gene_name = input('Gene name: ').strip()
# lower-case form shared by all file names
gene_lc = gene_name.lower()

# files
fasta_in = f'data_folders/FASTA_files_UCSC/{gene_lc}_fasta'
syn_out = f'data_folders/Syn_table_out/{gene_lc}_syn.csv'

# reference transcript: first 'RefSeq accession' listed for the upper-cased symbol
dfrefseqtr = pd.read_table(r'data_folders/biomart_refseq.txt')
refseq_hits = dfrefseqtr.loc[dfrefseqtr['Approved symbol'] == gene_name.upper(), 'RefSeq accession']
if refseq_hits.empty:
    # fail with a readable message instead of a bare IndexError from list(...)[0]
    raise ValueError(
        f"Gene symbol {gene_name.upper()!r} has no entry in data_folders/biomart_refseq.txt"
    )
refseqtr = refseq_hits.iloc[0]

# reference sequence
refsq = f'data_folders/ref_seq/{gene_lc}_full_seq_ref'
list_refsq = f'data_folders/ref_seq/{gene_lc}_seq_ref.csv'

# flossies
floss_in = f'data_folders/Flossies_annotations/flossies_{gene_lc}.csv'
floss_out = f'data_folders/Flossies_annotations/flossies_{gene_lc}_out.csv'

# cBioPortal
cbp_in = f'data_folders/cbioportal_annotations/cbioportal_{gene_lc}.tsv'
cbp_out = f'data_folders/cbioportal_annotations/cbioportal_{gene_lc}_out.csv'

# gnomAD
gAD_in = f'data_folders/gnomAD_annotations/gnomAD_{gene_lc}.csv'
gAD_out = f'data_folders/gnomAD_annotations/gnomAD_{gene_lc}_out.csv'

# VEP: one directory per gene with '<gene>_in' and '<gene>_out' sub-directories
vep_dir = f'data_folders/VEP_data/{gene_lc}'
vep_path_in = f'{vep_dir}/{gene_lc}_in'
vep_path_out = f'{vep_dir}/{gene_lc}_out'
vep_for_in = f'{vep_path_in}/vep_id_{gene_lc}.txt'
vep_data = f'{vep_path_out}/{gene_lc}_vep_data.txt'
vep_out = f'{vep_dir}/{gene_lc}_out.csv'

# joint file
joint_out = f'joint_files_out/{gene_lc}_joint_out.csv'

# Reference sequence

In [None]:
# Build the per-position reference table for the reference transcript.
# Every base is first stored in a dictionary (key = '_'-joined description of
# the position, value = transcript accession) and then written in sorted key order.
resultD_ref = {}
with open(list_refsq, 'w') as oref:
    # column header of the output table
    oref.write('CHROM,POS,REF,Ref_Genome,Strand,Ref_Transcript,Trans_Version_Syn\n')
    # FASTA sequence by UCSC Genome Browser
    with open(refsq) as fref:
        for record in SeqIO.parse(fref, 'fasta'):
            header = record.description
            range_part = header.split('range')[1]
            # first coordinate of the range field = position assigned to the first base
            start = int(range_part.split(':')[1].split('-')[0])
            # reference genome: text in front of the first '_'
            refg = header.split('_')[0]
            # chromosome: text between 'chr' and ':' inside the range field
            chrom = range_part.split('chr')[1].split(':')[0]
            # strand sign: second character after the word 'strand'
            strand = header.split('strand')[1][1]
            # transcript accession without its version suffix
            trans = header.split('RefSeq_')[1].split('.')[0]
            # transcript version: text after the first '.' up to the next blank
            trans_vers = header.split('.')[1].split(' ')[0]
            # '-' strand records are reverse-complemented before positions are
            # counted upwards from the range start
            sequence = record.seq.reverse_complement() if strand == '-' else record.seq
            # only the reference transcript contributes rows
            if trans != refseqtr:
                continue
            for offset, raw_base in enumerate(str(sequence)):
                # all nucleotides in upper case
                base = str(raw_base.upper())
                entr = '_'.join([refg, chrom, str(start + offset), strand, base, str(trans_vers)])
                # a key that is already present keeps its stored transcript
                resultD_ref.setdefault(entr, trans)

    # keys are '_'-joined, so splitting on '_' recovers the individual fields
    for key, transcript in sorted(resultD_ref.items()):
        fields = key.split('_')
        row = [fields[1], fields[2], fields[4], fields[0], fields[3], transcript, fields[5]]
        oref.write(','.join(row) + '\n')

# Synthetic variants

In [None]:
# Enumerate every single-nucleotide substitution along the reference transcript
# and write one CSV row per (position, alternative base).
# Entries are collected in a dictionary keyed by a '_'-joined description of the
# variant and written out in sorted key order.
resultDict = {}
with open(syn_out, 'w') as o:
    # column header of the output table
    o.write('Identifier,CHROM,POS,REF,ALT,Ref_Genome,Strand,Ref_Transcript,Trans_Version_Syn\n')
    # FASTA sequence by UCSC Genome Browser
    with open(fasta_in) as f:
        for record in SeqIO.parse(f, 'fasta'):
            header = record.description
            range_part = header.split('range')[1]
            # first coordinate of the range field = position assigned to the first base
            start = int(range_part.split(':')[1].split('-')[0])
            # reference genome: text in front of the first '_'
            refg = header.split('_')[0]
            # chromosome: text between 'chr' and ':' inside the range field
            chrom = range_part.split('chr')[1].split(':')[0]
            # strand sign: second character after the word 'strand'
            strand = header.split('strand')[1][1]
            # transcript accession without its version suffix
            trans = header.split('RefSeq_')[1].split('.')[0]
            # transcript version: text after the first '.' up to the next '_'
            trans_vers = header.split('.')[1].split('_')[0]
            # '-' strand records are reverse-complemented before positions are
            # counted upwards from the range start
            sequence = record.seq.reverse_complement() if strand == '-' else record.seq
            # only the reference transcript contributes rows
            if trans != refseqtr:
                continue
            for offset, raw_base in enumerate(str(sequence)):
                # all nucleotides in upper case
                base = str(raw_base.upper())
                # alternative bases = 'ACGT' with the reference base removed
                for alt_base in 'ACGT'.replace(str(base), ''):
                    entr = '_'.join([refg, chrom, str(start + offset), strand, base, alt_base,
                                     str(trans_vers)])
                    # a key that is already present keeps its stored transcript
                    resultDict.setdefault(entr, trans)

    # keys are '_'-joined, so splitting on '_' recovers the individual fields
    for key, transcript in sorted(resultDict.items()):
        fields = key.split('_')
        # identifier used to attach annotations to this variant later on
        identi = fields[1] + ':g.' + fields[2] + fields[4] + '>' + fields[5]
        row = [identi, fields[1], fields[2], fields[4], fields[5], fields[0], fields[3],
               transcript, fields[6]]
        o.write(','.join(row) + '\n')

# Integrating FLOSSIES annotations

In [None]:
# Convert the FLOSSIES export into VCF-style variants and write `floss_out`.
# Inputs: `floss_in` (FLOSSIES CSV) and `list_refsq` (per-position reference table).
dff = pd.read_csv(floss_in)
df_ref = pd.read_csv(list_refsq)

dff = dff.rename(columns={'Annotation': 'Consequence_Flossies', 'Splice Change': 'Splice_Change_Flossies',
                          'European (n=7325)': 'European_(n=7325)_Flossies',
                          'African (n=2559)': 'African_(n=2559)_Flossies',
                          'Overall Frequency': 'Overall_Frequency_Flossies', 'Chrom': 'CHROM',
                          'Position': 'POS', 'Reference': 'REF', 'Alternate': 'ALT'})
# integer positions keep a trailing '.0' out of the identifier text
dff['POS'] = dff['POS'].astype(int)

# reference base per genomic position; the first row wins if a position repeats
ref_base_at = (df_ref.assign(POS=df_ref['POS'].astype(int))
               .drop_duplicates(subset='POS')
               .set_index('POS')['REF'])

# --- deletions: rows without an alternate allele ---
# VCF style: move to the base before the deleted stretch (POS - 1), use that base
# as ALT and prepend it to the deleted bases for REF.
is_del = dff['ALT'].isna()
# single .loc assignment; the chained form dff['POS'].loc[...] = ... writes to a
# temporary under pandas copy-on-write and leaves dff unchanged
dff.loc[is_del, 'POS'] = dff.loc[is_del, 'POS'] - 1
df_del = dff.loc[is_del].copy()
# a position missing from the reference table raises KeyError naming that position
del_anchor = pd.Series(ref_base_at.loc[df_del['POS'].tolist()].to_numpy(), index=df_del.index)
df_del['ALT'] = del_anchor
df_del['REF'] = del_anchor + df_del['REF']

# --- insertions: rows without a reference allele ---
# position stays; REF becomes the reference base at POS, ALT that base plus the
# inserted bases
df_ins = dff.loc[dff['REF'].isna()].copy()
ins_anchor = pd.Series(ref_base_at.loc[df_ins['POS'].tolist()].to_numpy(), index=df_ins.index)
df_ins['REF'] = ins_anchor
df_ins['ALT'] = ins_anchor + df_ins['ALT']

# rows that already carry both alleles are used as they are
df_snv = dff.loc[dff['ALT'].notna() & dff['REF'].notna()]

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
df_floss = pd.concat([df_snv, df_del, df_ins]).sort_index()

# create column for identifier and transcript version
# NOTE(review): transcript version is hard-coded to 3 for every gene - confirm
df_floss['Trans_Version_Flossies'] = 3
df_floss['Identifier'] = (df_floss['CHROM'].astype(str) + ':g.' + df_floss['POS'].astype(str)
                          + df_floss['REF'] + '>' + df_floss['ALT'])

# select only wanted columns
df_floss_n = df_floss[['Identifier', 'CHROM', 'POS', 'REF', 'ALT', 'Trans_Version_Flossies',
                       'Consequence_Flossies', 'Splice_Change_Flossies',
                       'European_(n=7325)_Flossies', 'African_(n=2559)_Flossies',
                       'Overall_Frequency_Flossies']].sort_values(by='Identifier')
df_floss_n.to_csv(floss_out, index = False)

# Integrating cBioPortal annotations

In [None]:
# Convert the cBioPortal export into VCF-style variants and write `cbp_out`.
# Inputs: `cbp_in` (cBioPortal TSV) and `list_refsq` (per-position reference table).
BP = pd.read_table(cbp_in)
df_ref = pd.read_csv(list_refsq)

BP = BP.rename(columns={'Chromosome': 'CHROM', 'Start Pos': 'POS', 'Cancer Type': 'Cancer_Type_cBP',
                        'Protein Change': 'Protein_Change_cBP', 'Ref': 'REF', 'Var': 'ALT',
                        'Annotation': 'Annotation_cBP', 'Functional Impact': 'Functional_Impact_cBP',
                        'MS': 'MS_cBP'})

# some rows come without a CHROM number (or with 0) --> drop them;
# .copy() so the column assignments below act on an independent frame
BP = BP.loc[BP['CHROM'].notna() & (BP['CHROM'] != 0)].copy()
BP['MS_cBP'] = BP['MS_cBP'].replace('.', np.nan)
BP['CHROM'] = BP['CHROM'].astype(int)
# integer positions keep a trailing '.0' out of the identifier text, which would
# appear if the column was read as float
BP['POS'] = BP['POS'].astype(int)

# reference base per genomic position; the first row wins if a position repeats
ref_base_at = (df_ref.assign(POS=df_ref['POS'].astype(int))
               .drop_duplicates(subset='POS')
               .set_index('POS')['REF'])

# --- deletions: ALT is '-' ---
# VCF style: move to the base before the deleted stretch (POS - 1), use that base
# as ALT and prepend it to the deleted bases for REF.
is_del = BP['ALT'] == '-'
# single .loc assignment; the chained form BP['POS'].loc[...] = ... writes to a
# temporary under pandas copy-on-write and leaves BP unchanged
BP.loc[is_del, 'POS'] = BP.loc[is_del, 'POS'] - 1
df_del = BP.loc[is_del].copy()
# a position missing from the reference table raises KeyError naming that position
del_anchor = pd.Series(ref_base_at.loc[df_del['POS'].tolist()].to_numpy(), index=df_del.index)
df_del['ALT'] = del_anchor
df_del['REF'] = del_anchor + df_del['REF']

# --- insertions: REF is '-' ---
# position stays; REF becomes the reference base at POS, ALT that base plus the
# inserted bases
df_ins = BP.loc[BP['REF'] == '-'].copy()
ins_anchor = pd.Series(ref_base_at.loc[df_ins['POS'].tolist()].to_numpy(), index=df_ins.index)
df_ins['REF'] = ins_anchor
df_ins['ALT'] = ins_anchor + df_ins['ALT']

# rows that carry both alleles are used as they are
df_snv = BP.loc[(BP['ALT'] != '-') & (BP['REF'] != '-')]

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
df_BP = pd.concat([df_snv, df_del, df_ins]).sort_index()

df_BP['Identifier'] = (df_BP['CHROM'].astype(str) + ':g.' + df_BP['POS'].astype(str)
                       + df_BP['REF'] + '>' + df_BP['ALT'])

df_BP[['Identifier', 'CHROM', 'POS', 'REF', 'ALT', 'Cancer_Type_cBP', 'Protein_Change_cBP',
       'Annotation_cBP', 'Functional_Impact_cBP', 'MS_cBP']].to_csv(cbp_out, index = False)

# Integrating gnomAD annotations

In [None]:
# Columns written to the gnomAD output table, in output order.
gnomad_out_cols = ['Identifier', 'CHROM', 'POS', 'REF', 'ALT', 'ClinVar_gnomAD', 'AC_gnomAD',
                   'AN_gnomAD', 'AF_gnomAD', 'HOM_gnomAD']

# read the gnomAD export and switch to the column names shared by all tables
AD = pd.read_csv(gAD_in).rename(columns={
    'Chromosome': 'CHROM',
    'Position': 'POS',
    'Reference': 'REF',
    'Alternate': 'ALT',
    'ClinVar Clinical Significance': 'ClinVar_gnomAD',
    'Allele Count': 'AC_gnomAD',
    'Allele Number': 'AN_gnomAD',
    'Allele Frequency': 'AF_gnomAD',
    'Homozygote Count': 'HOM_gnomAD',
    'Hemizygote Count': 'HEMI_gnomAD',
})

# identifier of the form '<CHROM>:g.<POS><REF>><ALT>'
chrom_txt = AD['CHROM'].astype(str)
pos_txt = AD['POS'].astype(str)
ref_txt = AD['REF'].astype(str)
alt_txt = AD['ALT'].astype(str)
AD['Identifier'] = chrom_txt + ':g.' + pos_txt + ref_txt + '>' + alt_txt

AD[gnomad_out_cols].to_csv(gAD_out, index=False)

# Integrating variants of NCT MASTER

In [None]:
# NCT MASTER variants exist only for the BRCA genes; other genes skip this cell.
if gene_name.lower().startswith('brca'):
    NM = pd.read_excel(r'data_folders/NCT_MASTER_variants/NCT_MASTER_BRCA_Variants_hg191.xlsx')
    NM = NM.rename(columns={'#CHROM': 'CHROM', 'NCT PID': 'NCT_PID', 'NCT SAMPLE_NAME': 'NCT_SAMPLE_NAME',
                            'NCT Sheet': 'NCT_Sheet', 'KGE RefSeq (MSKCC)': 'KGE_RefSeq',
                            'NCT TumorVariantFrequency(TVF)': 'NCT_TumorVariantFrequency',
                            'NCT ZYGOSITY': 'NCT_ZYGOSITY'})

    NM['CHROM'] = NM['CHROM'].astype(int)
    # for BRCA1: CHROM == 17, for BRCA2: CHROM == 13; any other name keeps 0,
    # which matches no row
    chromos = {'brca1': 17, 'brca2': 13}.get(gene_name.lower(), 0)

    NM_br = NM.loc[NM['CHROM'] == chromos]

    # read the reference table here instead of relying on `df_ref` left over
    # from an earlier cell
    df_ref = pd.read_csv(list_refsq)
    # reference base per genomic position; the first row wins if a position repeats
    ref_base_at = (df_ref.assign(POS=df_ref['POS'].astype(int))
                   .drop_duplicates(subset='POS')
                   .set_index('POS')['REF'])

    # --- insertions: rows without a reference allele ---
    # position stays; REF becomes the reference base at POS, ALT that base plus
    # the inserted bases
    df_ins = NM_br.loc[NM_br['REF'].isnull()].copy()
    # a position missing from the reference table raises KeyError naming that position
    ins_anchor = pd.Series(ref_base_at.loc[df_ins['POS'].astype(int).tolist()].to_numpy(),
                           index=df_ins.index)
    df_ins['REF'] = ins_anchor
    df_ins['ALT'] = ins_anchor + df_ins['ALT']

    # every row that already has a reference allele is used as it is
    df_others = NM_br.loc[NM_br['REF'].notnull()]

    # DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
    df_NM_br = pd.concat([df_others, df_ins]).sort_index()

    df_NM_br['Identifier'] = (df_NM_br['CHROM'].astype(str) + ':g.' + df_NM_br['POS'].astype(str)
                              + df_NM_br['REF'].astype(str) + '>' + df_NM_br['ALT'].astype(str))

    df_NM_br[['Identifier', 'CHROM', 'POS', 'REF', 'ALT', 'NCT_SAMPLE_NAME', 'NCT_Sheet', 'KGE_RefSeq',
              'NCT_TumorVariantFrequency', 'NCT_ZYGOSITY']].to_csv(
        (r'data_folders/NCT_MASTER_variants/0504_nct_master_' + gene_name.lower() + '.csv'), index = False)

# Integrating Findlay data (BRCA1)

In [None]:
# The Findlay data set is only used for BRCA1; other genes skip this cell.
if gene_name.lower() == 'brca1':
    findlay_out_cols = ['Identifier', 'CHROM', 'POS', 'REF', 'ALT', 'Trans_Version_Findlay',
                        'Consequence_Findlay', 'Function_Findlay']
    findlay_names = {
        'consequence': 'Consequence_Findlay',
        'func.class': 'Function_Findlay',
        'chromosome': 'CHROM',
        'position (hg19)': 'POS',
        'reference': 'REF',
        'alt': 'ALT',
    }
    # the first two sheet rows are skipped before the header row
    findl = pd.read_excel(r'data_folders/findlay/findlay_data.xlsx', skiprows=2)
    findl = findl.rename(columns=findlay_names)
    # identifier of the form '<CHROM>:g.<POS><REF>><ALT>'
    findl['Identifier'] = (findl['CHROM'].astype(str) + ':g.' + findl['POS'].astype(str)
                           + findl['REF'].astype(str) + '>' + findl['ALT'].astype(str))
    # constant transcript version column
    findl['Trans_Version_Findlay'] = 3
    findl[findlay_out_cols].to_csv(r'data_folders/findlay/findlay_data_out.csv', index=False)

# Integrating Richardson et al data (BRCA2)

In [None]:
# The Richardson et al. data set is only used for BRCA2; other genes skip this cell.
if gene_name.lower() == 'brca2':
    # NOTE(review): 261 is listed twice and 262 is never skipped; the row with
    # label 252 dropped on the next line is presumably that sheet row - confirm
    # against the workbook before simplifying.
    rich = pd.read_excel(r'data_folders/richardson_data/rich_et_al_brca2.xlsx', skiprows = [0, 1, 255, 256, 257, 258, 259, 260, 261, 261, 263])
    rich.drop([252], inplace = True)
    rich.rename(columns = {'Unnamed: 1':'as_var_short'}, inplace = True)

    # Split the short variant text into the run(s) of digits (protein position)
    # and the text after the first '.', with each run of digits replaced by '/'
    # (amino-acid exchange).
    # Raw strings: '\d' in a plain literal is an invalid escape sequence.
    short_names = list(rich['as_var_short'])
    rich['Protein_position'] = [','.join(re.findall(r'\d+', name)) for name in short_names]
    rich['Amino_acids'] = [re.sub(r'\d+', '/', name.split('.')[1]) for name in short_names]
    rich['CHROM'] = 13
    rich.rename(columns = {'Variant':'Variant_Rich', 'HDR Score':'HDR_Rich', '95% CI lower bound':'95%_CI_lower_bound_Rich', '95%CI upper bound':'95%_CI_upper_bound_Rich', 'Previously Published HDR Study1-3':'Prev_HDR_Rich'}, inplace = True)
    rich[['CHROM', 'Protein_position', 'Amino_acids', 'Variant_Rich', 'HDR_Rich', '95%_CI_lower_bound_Rich', '95%_CI_upper_bound_Rich', 'Prev_HDR_Rich']].to_csv(r'data_folders/richardson_data/0517_richardson.csv', index = False)

# Integrating VEP data

In [None]:
# Collect every variant from the per-source tables and write the identifier
# list that is uploaded to VEP (`vep_for_in`).
id_cols = ['Identifier', 'CHROM', 'POS', 'REF', 'ALT']

vep_syn = pd.read_csv(syn_out)
vep_floss = pd.read_csv(floss_out)
vep_cbp = pd.read_csv(cbp_out)
vep_ga = pd.read_csv(gAD_out)
vep_sources = [vep_syn, vep_floss, vep_cbp, vep_ga]
# the NCT MASTER table is only written for BRCA genes, so only read it for those
if gene_name.lower().startswith('brca'):
    vep_nct = pd.read_csv((r'data_folders/NCT_MASTER_variants/0504_nct_master_' + gene_name.lower() + '.csv'))
    vep_sources.append(vep_nct)

# one row per distinct variant across all sources
full_vep = pd.concat([source[id_cols] for source in vep_sources]).drop_duplicates(ignore_index=True)

# variants with a single reference base: '<CHROM>:g.<POS><REF>><ALT>'
full_vep_wo_del = full_vep.loc[full_vep['REF'].str.len() == 1].copy()
full_vep_wo_del['VEP_ID'] = (full_vep_wo_del['CHROM'].astype(str) + ':g.' + full_vep_wo_del['POS'].astype(str)
                             + full_vep_wo_del['REF'].astype(str) + '>' + full_vep_wo_del['ALT'].astype(str))

# variants with a longer reference allele: drop the leading anchor base again
# and write '<CHROM>:g.<POS + 1>del<remaining REF bases>'
# .copy() so the new columns are added to an independent frame, not a slice
full_vep_del = full_vep.loc[full_vep['REF'].str.len() > 1].copy()
full_vep_del['vep_REF'] = full_vep_del['REF'].str[1:]
full_vep_del['VEP_ID'] = (full_vep_del['CHROM'].astype(str) + ':g.'
                          + (full_vep_del['POS'].astype(int) + 1).astype(str)
                          + 'del' + full_vep_del['vep_REF'].astype(str))

# DataFrame.append was removed in pandas 2.0; the sort result is now kept
# (sort_index() returns a new frame and was previously discarded)
all_vep = pd.concat([full_vep_wo_del, full_vep_del]).sort_index()

all_vep['VEP_ID'].to_csv(vep_for_in, index = False, header = False)

# with vep_for_in --> go to http://grch37.ensembl.org/Homo_sapiens/Tools/VEP?db=core
# save as 'gene_name.lower()_vep_data'

In [None]:
# Read the VEP result file, keep the rows for the reference transcript and
# attach the original variant columns through VEP_ID. Writes `vep_out`.
# Needs `all_vep` from the cell that produced the VEP input list.
VEP = pd.read_table(vep_data)
VEP = VEP.rename(columns={'#Uploaded_variation': 'VEP_ID', 'Consequence': 'Consequence_VEP',
                          'IMPACT': 'Impact_VEP', 'Existing_variation': 'Existing_variation_VEP',
                          'CADD_PHRED': 'CADD_PHRED_VEP', 'CADD_RAW': 'CADD_RAW_VEP', 'SIFT': 'SIFT_VEP',
                          'PolyPhen': 'PolyPhen_VEP', 'ada_score': 'Ada_score_VEP'})

# '-' marks an empty field in the file
VEP = VEP.replace('-', np.nan)

# transcript accession without version = text before the first '.' of 'Feature';
# .str[0] passes missing values through, whereas indexing each element in a
# Python loop fails on the NaN left behind by the replacement above
VEP['Ref_Trans_VEP'] = VEP['Feature'].str.split('.').str[0]

# only use rows with the reference transcript
df_VEP = VEP.loc[VEP['Ref_Trans_VEP'] == refseqtr].reset_index(drop=True)

new_VEP = pd.merge(df_VEP, all_vep, on = ['VEP_ID'])

# only a few columns in file
new_VEP[['Identifier', 'CHROM', 'POS', 'REF', 'ALT', 'Ref_Trans_VEP', 'Consequence_VEP', 'Impact_VEP',
         'EXON', 'INTRON', 'Protein_position', 'Amino_acids', 'Codons', 'SIFT_VEP', 'PolyPhen_VEP',
         'CADD_PHRED_VEP', 'CADD_RAW_VEP', 'Ada_score_VEP']].to_csv(vep_out, index = False)

# Joining all files

In [None]:
# Reload every per-source table from disk as a DataFrame.
fsyn = pd.read_csv(syn_out)
ffloss = pd.read_csv(floss_out)
fcbp = pd.read_csv(cbp_out)
fga = pd.read_csv(gAD_out)
fvep = pd.read_csv(vep_out)

# gene-specific tables exist only for the BRCA genes
gene_key = gene_name.lower()
if gene_key.startswith('brca'):
    # NCT MASTER variants (any gene whose name starts with 'brca')
    fnct = pd.read_csv((r'data_folders/NCT_MASTER_variants/0504_nct_master_' + gene_key + '.csv'))
if gene_key == 'brca1':
    # Findlay data
    ffindlay = pd.read_csv(r'data_folders/findlay/findlay_data_out.csv')
if gene_key == 'brca2':
    # Richardson et al. data
    frich = pd.read_csv(r'data_folders/richardson_data/0517_richardson.csv')

In [None]:
# Join all annotation tables onto the VEP table and write `joint_out`.
variant_cols = ['Identifier', 'CHROM', 'POS', 'REF', 'ALT']

# keep only VEP rows whose position occurs in the synthetic variant list
m = fvep.loc[fvep['POS'].isin(list(fsyn['POS']))]

# left joins: every VEP row is kept, annotations are added where a variant matches
m1 = m.merge(fsyn, on = variant_cols, how = 'left')
m2 = m1.merge(ffloss, on = variant_cols, how = 'left')
m3 = m2.merge(fcbp, on = variant_cols, how = 'left')
m4 = m3.merge(fga, on = variant_cols, how = 'left')

# Default result for genes without extra data sets. The original else-branch
# had the assignment reversed (`m4 = jnt`), which raised NameError for every
# non-BRCA gene.
jnt = m4
if gene_name.lower().startswith('brca'):
    m5 = m4.merge(fnct, on = variant_cols, how = 'left')
    # also covers a 'brca*' name that is neither brca1 nor brca2
    jnt = m5
    if gene_name.lower() == 'brca1':
        jnt = m5.merge(ffindlay, on = variant_cols, how = 'left')
    elif gene_name.lower() == 'brca2':
        # Richardson data is keyed by protein position / amino-acid exchange
        jnt = m5.merge(frich, on = ['CHROM', 'Protein_position', 'Amino_acids'], how = 'left')

# one row per variant
jnt = jnt.drop_duplicates(variant_cols, ignore_index = True)

# new document with all annotations at specific positions
jnt.to_csv(joint_out, index = False)