In [None]:
import pandas as pd
import pysam as ps
import numpy as np
import os

In [None]:
# hg19 (GRCh37) reference sequence fasta
ref_fasta = ps.FastaFile('reference_GRCh37.fa')

# Directory holding the FLOSSIES database downloads (one CSV file per gene).
flossies_dir = 'flossies_directory'

# Dictionary mapping the upper-cased gene symbol to the raw FLOSSIES DataFrame.
# File names are assumed to look like '<prefix>_<gene>.csv'; the gene symbol is the
# second '_'-separated token of the name without its extension.
# - os.path.splitext removes the extension; str.strip('.csv') would instead strip any
#   leading/trailing '.', 'c', 's' or 'v' characters and truncate symbols such as 'APC'.
# - Entries that are not CSV files (e.g. '.ipynb_checkpoints') are skipped.
# - The listing is sorted so the order of genes (and of the combined table built
#   from this dictionary) is reproducible across file systems.
dfs = {
    os.path.splitext(file)[0].split('_')[1].upper(): pd.read_csv(os.path.join(flossies_dir, file))
    for file in sorted(os.listdir(flossies_dir))
    if file.lower().endswith('.csv')
}

In [None]:
# FLOSSIES lists indels without an anchor base (empty 'Alternate' for deletions, empty
# 'Reference' for insertions), which is not VCF-conform. This function rewrites them as
# VCF-style CHROM, POS, REF, ALT using the hg19 reference sequence.
def FlossiesDataCleaning(df, fasta=None):
    """Return a cleaned copy of a FLOSSIES table with VCF-style variant columns.

    The FLOSSIES columns are renamed ('Chrom' -> 'CHROM', 'Position' -> 'POS',
    'Reference' -> 'REF', 'Alternate' -> 'ALT', plus '_Flossies' suffixes on the
    annotation/frequency columns) and indels are given an anchor base:

    * Deletion (ALT missing): POS is moved one base to the left, ALT becomes the
      reference base at the new POS and REF becomes that base + the deleted bases.
    * Insertion (REF missing): POS is kept, REF becomes the reference base at POS
      and ALT becomes that base + the inserted bases.
      NOTE(review): this assumes FLOSSIES reports the base *before* the insertion
      as the position -- confirm against the FLOSSIES export format.
    * Rows with both REF and ALT present are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw FLOSSIES table. It is not modified.
    fasta : object, optional
        Reference sequence handle exposing
        ``fetch(reference=<str>, start=<0-based>, end=<exclusive>)``.
        Defaults to the notebook-level ``ref_fasta``.

    Returns
    -------
    pandas.DataFrame
        All rows of ``df`` with renamed columns and VCF-style indels, sorted by
        the original index.

    Raises
    ------
    ValueError
        If a row has neither a reference nor an alternate allele, because it
        cannot be classified as insertion or deletion.
    """
    if fasta is None:
        # Fall back to the reference handle opened at notebook level.
        fasta = ref_fasta

    # rename() returns a new frame, so the caller's DataFrame stays untouched.
    dfn = df.rename(columns={'Annotation': 'Consequence_Flossies',
                             'Splice Change': 'Splice_Change_Flossies',
                             'European (n=7325)': 'European_(n=7325)_Flossies',
                             'African (n=2559)': 'African_(n=2559)_Flossies',
                             'Overall Frequency': 'Overall_Frequency_Flossies',
                             'Chrom': 'CHROM', 'Position': 'POS',
                             'Reference': 'REF', 'Alternate': 'ALT'})

    is_del = dfn['ALT'].isna()
    is_ins = dfn['REF'].isna()
    if (is_del & is_ins).any():
        raise ValueError('FLOSSIES rows with both Reference and Alternate missing '
                         'cannot be converted to VCF format')

    def _anchor_bases(frame):
        """Fetch the single reference base at each row's 1-based POS."""
        bases = []
        for chrom, pos in zip(frame['CHROM'], frame['POS']):
            pos = int(pos)
            # fetch() takes a 0-based, end-exclusive interval.
            bases.append(fasta.fetch(reference=str(chrom), start=pos - 1, end=pos))
        return bases

    # Deletions: the anchor base is the one directly before the deleted bases.
    # A single .loc assignment is used on purpose; chained assignment
    # (dfn['POS'].loc[...] = ...) may silently write to a temporary copy.
    if is_del.any():
        dfn.loc[is_del, 'POS'] = dfn.loc[is_del, 'POS'].astype(int).to_numpy() - 1
    df_del = dfn.loc[is_del].copy()
    if not df_del.empty:
        anchors = _anchor_bases(df_del)
        # Build the new REF from the old REF before ALT/REF are overwritten.
        df_del['REF'] = [anchor + deleted for anchor, deleted in zip(anchors, df_del['REF'])]
        df_del['ALT'] = anchors

    # Insertions: the position is kept and the base at POS serves as anchor.
    df_ins = dfn.loc[is_ins].copy()
    if not df_ins.empty:
        anchors = _anchor_bases(df_ins)
        df_ins['ALT'] = [anchor + inserted for anchor, inserted in zip(anchors, df_ins['ALT'])]
        df_ins['REF'] = anchors

    # Everything with both alleles present is already VCF-conform.
    df_snv = dfn.loc[~(is_del | is_ins)]

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # Empty parts are left out so they cannot influence the resulting dtypes.
    parts = [part for part in (df_snv, df_del, df_ins) if not part.empty]
    if not parts:
        return dfn
    return pd.concat(parts).sort_index()

In [None]:
# Flossies Data Cleaning -> new dictionary with one cleaned DataFrame per gene
floss_dict = {gene: FlossiesDataCleaning(gene_df) for gene, gene_df in dfs.items()}

# Stack all per-gene tables into a single frame with a fresh 0..n-1 index.
# A single pd.concat replaces the repeated DataFrame.append calls: append was removed
# in pandas 2.0 and re-copied the growing frame on every iteration.
# pd.concat raises on an empty list, so fall back to an empty frame when no gene was loaded.
floss_df = (pd.concat(list(floss_dict.values()), ignore_index=True)
            if floss_dict else pd.DataFrame())

# Sanity check: no rows were lost or duplicated while combining the genes.
assert len(floss_df) == sum(len(gene_df) for gene_df in floss_dict.values())

In [None]:
# Persist the combined FLOSSIES annotation table as a pickle file in the working directory.
floss_df.to_pickle(path='DataFrame_flossies')