In [2]:
import pandas as pd
import glob
import os, sys

In [3]:
def convert_caf_output_to_snpeff_input(cohort):
    """Convert a cohort's per-chromosome CSV files into snpEff VCF-style input.

    For each chromosome 1-22 the single file matching
    ``<cohort>/*chr<N>.csv`` is read, rows whose 'Variant ID' column equals
    'snv' are kept, and the index label of every kept row is split on '-'
    into chrom / pos / ref / alt. The rows are written, tab-separated and
    without a header, to
    ``<cohort>/snpeff_input/<cohort>.snpeff_input.chr<N>.vcf``. Afterwards the
    per-chromosome files are concatenated with ``cat`` and piped through
    ``sort -k1 -n`` into ``<cohort>.snpeff_input.All_chr.vcf``.

    NOTE(review): this relies on ``pd.read_csv`` yielding an index made of
    'chrom-pos-ref-alt' strings and on the 'Variant ID' column holding the
    variant type -- confirm against the per-chromosome file layout.

    Parameters
    ----------
    cohort : str
        Directory (relative to the working directory) that holds the
        per-chromosome CSV files; also used as the output file-name prefix.

    Raises
    ------
    SystemExit
        If no file, or more than one file, matches a chromosome.
    """
    out_dir = cohort + '/snpeff_input'
    print(out_dir)
    # exist_ok avoids the separate exists() check (and the race it implies)
    os.makedirs(out_dir, exist_ok=True)

    for chrom in range(1, 23):
        print('Chrom:', chrom)
        matches = glob.glob(cohort + '/*chr' + str(chrom) + '.csv')

        if len(matches) > 1:
            sys.exit('More than 1 file matched for chrom ' + str(chrom))
        if not matches:
            # Previously this fell through to cur_file[0] and died with a
            # bare IndexError that did not name the missing chromosome.
            sys.exit('No file matched for chrom ' + str(chrom))
        cur_file = matches[0]

        df = pd.read_csv(cur_file)

        # keep SNVs only
        df = df.loc[df['Variant ID'] == 'snv']

        # Create snpEff vcf input with all unique variants: the index labels
        # become a regular 'index' column that is split into its four parts.
        out_df = pd.DataFrame(index=df.index).reset_index()
        out_df[['chrom', 'pos', 'ref', 'alt']] = out_df['index'].str.split('-', expand=True)

        # placeholder ('.') columns
        for aux_col in ('aux_1', 'aux_2', 'aux_3', 'aux_4'):
            out_df[aux_col] = '.'

        # re-order columns (this also drops the helper 'index' column)
        out_df = out_df[['chrom', 'pos', 'aux_1', 'ref', 'alt', 'aux_2', 'aux_3', 'aux_4']]

        # save to output file
        out_df.to_csv(out_dir + '/' + cohort + '.snpeff_input.chr' + str(chrom) + '.vcf', sep='\t', index=False, header=False)

    # merge output from all chroms into a single file
    os.system('cat '+out_dir+'/'+cohort+'.snpeff_input.chr*.vcf | sort -k1 -n > ' + out_dir+'/'+cohort+'.snpeff_input.All_chr.vcf')

    print('Cohort', cohort, ' is completed.')

In [4]:
# Alternative cohort list, currently commented out. A comma was added after
# 'ASJ_WGS': without it Python would silently concatenate that name with the
# following string literal if this list were ever re-enabled.
#cohorts = ['NFE_20K_WES', 'NFE_43K_WES', 'EAS_WES', 'SAS_WES', 'AFR_WES', 'ASJ_WES',
#           'NFE_20K_WGS', 'NFE_43K_WGS', 'EAS_WGS', 'SAS_WGS', 'AFR_WGS', 'ASJ_WGS',
#           'Maximally-diverse_WES', 'Maximally-diverse_WGS']

# Cohort directories to convert in this run.
cohorts = ['NFE_440k']

# Build the snpEff input files for each listed cohort in turn.
for cohort in cohorts:
    convert_caf_output_to_snpeff_input(cohort)

NFE_440k/snpeff_input
Chrom: 1
Chrom: 2
Chrom: 3
Chrom: 4
Chrom: 5
Chrom: 6
Chrom: 7
Chrom: 8
Chrom: 9
Chrom: 10
Chrom: 11
Chrom: 12
Chrom: 13
Chrom: 14
Chrom: 15
Chrom: 16
Chrom: 17
Chrom: 18
Chrom: 19
Chrom: 20
Chrom: 21
Chrom: 22
Cohort NFE_440k  is completed.


## Parse Diverse 460k cohort (single file)

In [None]:
def convert_single_caf_output_to_snpeff_input(cohort):
    """Convert a cohort's single compiled CSV into one snpEff VCF-style file.

    Reads ``<cohort>/compiled_full_dataset.csv``, keeps the rows whose
    'Variant Type' is 'snv', splits each 'Variant ID' value
    ('chrom-pos-ref-alt') on '-', and writes the rows, tab-separated and
    without a header, to
    ``<cohort>/snpeff_input/<cohort>.snpeff_input.All_chr.vcf``.

    The input is a single file covering all chromosomes, so the result is
    written straight to the combined output file: there are no
    per-chromosome files to create or merge.

    Parameters
    ----------
    cohort : str
        Directory (relative to the working directory) that contains
        ``compiled_full_dataset.csv``; also used as the output file-name
        prefix.
    """
    out_dir = cohort + '/snpeff_input'
    print(out_dir)
    os.makedirs(out_dir, exist_ok=True)

    cur_file = cohort + '/compiled_full_dataset.csv'
    df = pd.read_csv(cur_file)

    # keep SNVs only
    snv_df = df.loc[df['Variant Type'] == 'snv']

    # Create snpEff vcf input with all unique variants. The variant key lives
    # in the 'Variant ID' column (the frame's index is just a row counter),
    # so that column -- not the index -- is what gets split.
    out_df = pd.DataFrame(index=snv_df['Variant ID']).reset_index()
    out_df[['chrom', 'pos', 'ref', 'alt']] = out_df['Variant ID'].str.split('-', expand=True)

    # placeholder ('.') columns
    for aux_col in ('aux_1', 'aux_2', 'aux_3', 'aux_4'):
        out_df[aux_col] = '.'

    # re-order columns (this also drops the helper 'Variant ID' column)
    out_df = out_df[['chrom', 'pos', 'aux_1', 'ref', 'alt', 'aux_2', 'aux_3', 'aux_4']]

    # save to output file; the old code referenced an undefined `chrom` here
    out_df.to_csv(out_dir + '/' + cohort + '.snpeff_input.All_chr.vcf', sep='\t', index=False, header=False)

    print('Cohort', cohort, ' is completed.')

In [2]:
# Cohort directory processed by the cells below (they read this global).
cohort = 'Diverse_460k'

In [5]:
# Output directory for this cohort's snpEff input; created if missing.
out_dir = cohort + '/snpeff_input'
print(out_dir)
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# Load the cohort's single compiled CSV.
cur_file = cohort + '/compiled_full_dataset.csv'
df = pd.read_csv(cur_file)

# keep SNVs only (note: this rebinds `df`, so the unfiltered table is no
# longer available without re-reading the file)
df = df.loc[ df['Variant Type'] == 'snv', ]

Diverse_460k/snpeff_input


In [6]:
# Display the SNV-only table to inspect its columns before building the VCF rows.
df

Unnamed: 0.1,Unnamed: 0,Variant ID,Gene,Function,Variant Type,afr_maf,asj_maf,eas_maf,nfe_400k_maf,sas_maf,Gene ID
0,1,1-69496-G-A,OR4F5,missense_variant,snv,0.003900,,,0.000007,,ENSG00000186092
1,2,1-69511-A-G,OR4F5,missense_variant,snv,0.042900,0.0577,,0.133300,0.235800,ENSG00000186092
2,3,1-69525-C-G,OR4F5,synonymous_variant,snv,0.000154,,,,,ENSG00000186092
3,4,1-69559-G-A,OR4F5,missense_variant,snv,0.000153,,,,,ENSG00000186092
4,5,1-69581-C-G,OR4F5,missense_variant,snv,0.000076,,,,,ENSG00000186092
...,...,...,...,...,...,...,...,...,...,...,...
12398544,12398545,Y-20768970-G-C,RPS4Y2,missense_variant,snv,,,,,0.000216,ENSG00000280969
12398545,12398546,Y-21383590-T-G,PRORY,missense_variant,snv,,,,,0.000203,ENSG00000183146
12398546,12398547,Y-21383591-A-G,PRORY,missense_variant,snv,,,,,0.000203,ENSG00000183146
12398547,12398548,Y-21383611-T-C,PRORY,missense_variant,snv,,,,,0.000203,ENSG00000183146


In [11]:
# Create snpEff vcf input with all unique variants
out_df = pd.DataFrame(index=df['Variant ID'])
out_df.reset_index(inplace=True)
out_df[['chrom', 'pos', 'ref', 'alt']] = out_df['Variant ID'].str.split('-', expand=True)
del out_df['Variant ID']
out_df['aux_1'] = '.'
out_df['aux_2'] = '.'
out_df['aux_3'] = '.'
out_df['aux_4'] = '.'

# re-order columns
out_df = out_df[['chrom', 'pos', 'aux_1', 'ref', 'alt', 'aux_2', 'aux_3', 'aux_4']]

# save to output file
out_df.to_csv(out_dir + '/' + cohort + '.snpeff_input.All_chr.vcf', sep='\t', index=False, header=False)

print('Cohort', cohort, ' is completed.')

Cohort Diverse_460k  is completed.
