In [82]:
import pandas as pd
import re
import numpy as np
def read_vcf(vcf_file, separator = '\t'):
    """Load a VCF file into a DataFrame indexed by the 'ID' column.

    The header row is located with ``recognize_vcf_head``. The 'QUAL',
    'FILTER', 'INFO' and 'FORMAT' columns are dropped, and every column from
    the fifth remaining one onward is reduced to the part of its text that
    matches an ``x/y`` genotype, where ``x`` and ``y`` are a single digit
    or '.'.

    Parameters
    ----------
    vcf_file : str
        Path to the VCF file.
    separator : str, optional
        Field delimiter passed to ``pandas.read_table`` (default: tab).

    Returns
    -------
    pandas.DataFrame
        The table indexed by 'ID' with the genotype columns rewritten.
    """
    genotype_pattern = r'([\d\.]/[\d\.]):*'
    header_row = recognize_vcf_head(vcf_file)
    table = (
        pd.read_table(vcf_file, header=header_row, sep=separator)
        .set_index('ID')
        .drop(['QUAL', 'FILTER', 'INFO', 'FORMAT'], axis=1)
    )
    # After the drop, the first four columns are expected to be the fixed
    # fields; everything after them is treated as a genotype column.
    for sample in table.columns[4:]:
        table[sample] = table[sample].str.extract(genotype_pattern)
    return table
def vcf_to_genepop(vcf_file, popmap,genepop_name):
    """Convert a VCF file into a tab-separated Genepop-style text file.

    The VCF is parsed with ``read_vcf`` and the population map with
    ``read_pop_map``. Each genotype is translated to bases with
    ``change_num_to_base`` and then to numeric codes with
    ``change_base_to_genepop_number``. Loci are labelled
    ``<#CHROM>_<POS - 1>``.

    The output file first receives one line with the locus labels, then, for
    every population in the map, a ``pop`` line followed by one row per
    sample listed for that population (sample name first).

    Parameters
    ----------
    vcf_file : str
        Path to the input VCF file.
    popmap : str
        Path to the population map file.
    genepop_name : str
        Path of the output file; it is overwritten.

    Returns
    -------
    pandas.DataFrame
        The transposed table that was written: the first row is 'CHROM_POS',
        the following rows are samples, and the columns are locus labels.
    """
    pop_map_dict = read_pop_map(popmap)
    vcf = read_vcf(vcf_file)
    locis = vcf.columns[4:]
    for loci in locis:
        # Prefix every genotype with its REF and ALT so the element-wise
        # helper can substitute allele numbers with bases, then encode the
        # bases as two-digit codes.
        rag = vcf['REF'] + vcf['ALT'] + vcf[loci]
        vcf[loci] = rag.apply(
            lambda value: change_base_to_genepop_number(change_num_to_base(value))
        )

    chrom_pos = vcf['#CHROM'].astype(str) + '_' + (vcf['POS'] - 1).astype(str)
    vcf.insert(0, 'CHROM_POS', chrom_pos)
    genepop = vcf.drop(['REF', 'ALT', '#CHROM', 'POS'], axis=1)

    genepop = genepop.T
    # Use this table's own 'CHROM_POS' row as column labels. The previous
    # code read a notebook-level variable here, which fails on a fresh
    # kernel and could pick up labels from an earlier, unrelated run.
    genepop.columns = genepop.iloc[0]
    genepop.head(1).to_csv(genepop_name, header=False, index=False, sep='\t')
    for pop in pop_map_dict:
        # The `with` block closes the file before pandas appends to it.
        with open(genepop_name, 'a') as f:
            f.write('pop' + '\n')
        genepop.loc[pop_map_dict[pop]].to_csv(genepop_name,
                                               header=False,
                                               index=True,
                                               sep='\t',
                                               mode='a')
    print('File was generated')
    return genepop


def change_num_to_base(ref_alt_gen):
    """Translate allele numbers in a REF+ALT+genotype string into bases.

    The first character is taken as the reference base and the second as the
    alternate base. Every '0' is replaced by the reference, every '1' by the
    alternate, '/' separators are removed, and the two leading characters
    are stripped from the result.

    Example: ``'AT0/1'`` becomes ``'AT'``.
    """
    ref_base, alt_base = ref_alt_gen[0], ref_alt_gen[1]
    decoded = (
        ref_alt_gen
        .replace('0', ref_base)
        .replace('1', alt_base)
        .replace('/', '')
    )
    # Drop the REF/ALT prefix, leaving only the translated genotype.
    return decoded[2:]

def change_base_to_genepop_number(genotype):
    """Encode each character of ``genotype`` as a two-digit code.

    Codes: A -> 01, T -> 02, G -> 03, C -> 04, '.' -> 00. The codes are
    concatenated in order. Any other character raises ``KeyError``.
    """
    code_by_base = {'A':'01','T':'02','G':'03','C':'04','.':'00'}
    return ''.join(code_by_base[symbol] for symbol in genotype)
        
    
def read_pop_map(popmap, separator = '\t'):
    """Read a header-less population map into a dict.

    Column 0 holds the values to be grouped and column 1 the population
    label. Duplicate rows are discarded before grouping.

    Parameters
    ----------
    popmap : str or file-like
        Source passed to ``pandas.read_table``.
    separator : str, optional
        Field delimiter (default: tab).

    Returns
    -------
    dict
        Maps each population label, in order of first appearance, to the
        list of column-0 values that carry that label.
    """
    table = pd.read_table(popmap, sep=separator, header=None).drop_duplicates()
    labels = table[1]
    return {
        label: table.loc[labels == label, 0].values.tolist()
        for label in labels.unique()
    }
def recognize_vcf_head(vcf_file):
    """Return the zero-based index of the first line containing 'CHROM'.

    Parameters
    ----------
    vcf_file : str
        Path to the file to scan.

    Returns
    -------
    int
        Number of lines that precede the first line containing 'CHROM'.

    Raises
    ------
    ValueError
        If no line contains 'CHROM'. The previous implementation looped
        forever in that case, because reading past end-of-file keeps
        returning an empty string.
    """
    with open(vcf_file) as file:
        for i, line in enumerate(file):
            if 'CHROM' in line:
                return i
    raise ValueError("no line containing 'CHROM' found in {!r}".format(vcf_file))


In [83]:
# Parse 'data_pop_map.txt' with read_pop_map and keep the resulting
# population -> names dict in `b`.
popmap = 'data_pop_map.txt'
b = read_pop_map(popmap)

In [84]:
# Convert 'Xr-567.vcf' using the populations in 'data_pop_map.txt', write the
# result to the file 'test1', and keep the returned table in `d`.
d = vcf_to_genepop('Xr-567.vcf','data_pop_map.txt','test1')

File was generated


In [80]:
# Display the dict that read_pop_map builds from 'data_pop_map.txt'.
read_pop_map('data_pop_map.txt')

  read_pop_map('data_pop_map.txt')


{1: ['Xr_SBI_12-2',
  'Xr_SBI_26-4',
  'Xr_SBI_28-4',
  'Xr_SBI_29-4',
  'Xr_SBI_30-4',
  'Xr_SBI_31-4',
  'Xr_SBI_32-4',
  'Xr_SBI_33-4',
  'Xr_SBI_37-5',
  'Xr_SBI_38-5',
  'Xr_SBI_39-5',
  'Xr_SBI_41-5',
  'Xr_SBI_44-5'],
 2: ['Xr_SCL_02b-2',
  'Xr_SCL_03',
  'Xr_SCL_04',
  'Xr_SCL_05-1',
  'Xr_SCL_06',
  'Xr_SCL_07-1',
  'Xr_SCL_08',
  'Xr_SCL_09',
  'Xr_SCL_10',
  'Xr_SCL_11',
  'Xr_SCL_12',
  'Xr_SCL_13',
  'Xr_SCL_14',
  'Xr_SCL_15-1',
  'Xr_SCL_17',
  'Xr_SCL_18',
  'Xr_SCL_19',
  'Xr_SCL_20',
  'Xr_SCL_21',
  'Xr_SCL_22',
  'Xr_SCL_23',
  'Xr_SCL_24',
  'Xr_SCL_25',
  'Xr_SCL_26',
  'Xr_SCL_29-5',
  'Xr_SCL_33-5',
  'Xr_SCL_34-5',
  'Xr_SCL_37-5',
  'Xr_SCL_39-5',
  'Xr_SCL_40-5',
  'Xr_SCL_41-5',
  'Xr_SCL_42-5'],
 3: ['Xr_SNI_03',
  'Xr_SNI_04',
  'Xr_SNI_05',
  'Xr_SNI_06',
  'Xr_SNI_07',
  'Xr_SNI_08',
  'Xr_SNI_09',
  'Xr_SNI_10',
  'Xr_SNI_11',
  'Xr_SNI_12',
  'Xr_SNI_13',
  'Xr_SNI_14',
  'Xr_SNI_15',
  'Xr_SNI_16',
  'Xr_SNI_17',
  'Xr_SNI_18',
  'Xr_SNI_19',
  'Xr_SN

In [41]:
# Use the first row of `d` as its column labels (this mutates `d` in place),
# then write the values of the 'CHROM_POS' row to the file 'test' without a
# header or index.
# NOTE(review): this cell's execution count is lower than that of the cell
# that defines the functions, so `d` here comes from an earlier run; confirm
# the notebook still works after Restart Kernel -> Run All.
d.columns = d.iloc[0]
d.loc['CHROM_POS'].to_csv('test', header = False, index  = False )

In [42]:
# Show the 'CHROM_POS' row of `d`.
d.loc['CHROM_POS']

CHROM_POS
43806_70        43806_70
43978_60        43978_60
44472_7          44472_7
44672_60        44672_60
44728_20        44728_20
                 ...    
1422564_83    1422564_83
1422973_66    1422973_66
1428956_26    1428956_26
1437671_62    1437671_62
1474931_50    1474931_50
Name: CHROM_POS, Length: 1473, dtype: object

In [48]:
# Overwrite the file 'test' with the first row of `d`, tab-separated, keeping
# the row label but omitting the header line.
d.head(1).to_csv('test', header = False, index  = True, sep  = '\t')