## Import Packages and Set Directory

In [3]:
import pandas as pd
import numpy as np
import sys
import argparse
import os
# Make the RABvcfs data folder the kernel's working directory.
# NOTE(review): absolute, machine-specific path; os.chdir raises if the
# directory does not exist, so this cell only runs on the author's machine.
os.chdir('/Users/abc6435/Desktop/RABvcfs')

## Functions

In [None]:
#Import Population File
#Tab-separated, no header: column 0 is the population label, column 1 the sample ID.
pop_file="/Users/abc6435/Desktop/RABvcfs/pops.txt"
pops = pd.read_csv(pop_file, sep="\t", header=None)
popA = pops[pops[0]=="popA"][1].astype(str)
popB = pops[pops[0]=="popB"][1].astype(str)

#Import VCF
#The column names come from the '#CHROM' header line; every other '#' line is skipped.
vcf_file="/Users/abc6435/Desktop/RABvcfs/rab_chr29_test.vcf"
cols = None
with open(vcf_file) as file:
    for line in file:
        if line.startswith('#CHROM'):
            cols = line.lstrip("#").strip().split("\t")
            break
if cols is None:
    #Fail with a clear message instead of a NameError on `cols` further down.
    raise ValueError("No '#CHROM' header line found in " + vcf_file)
data = pd.read_csv(vcf_file, sep='\t', comment='#', names=cols)

#Extract Genotype Information
#Keep only the first ':'-separated FORMAT field of every sample column (the genotype string).
loci = data[['CHROM','POS','REF','ALT']]
samples = data.drop(columns=['CHROM', 'POS', 'REF', 'ALT','ID','QUAL','FILTER','INFO','FORMAT'])
samples = samples.apply(lambda col: col.str.split(":").str[0])
alleles = pd.concat([loci,samples], axis=1)

#Count Alleles
#Per site, count the '0' and '1' characters across the genotype strings of each population.
aA = alleles[popA].astype(str)
alleles['popA_0'] = aA.apply(lambda col: col.str.count('0')).sum(axis=1)
alleles['popA_1'] = aA.apply(lambda col: col.str.count('1')).sum(axis=1)
aB = alleles[popB].astype(str)
alleles['popB_0'] = aB.apply(lambda col: col.str.count('0')).sum(axis=1)
alleles['popB_1'] = aB.apply(lambda col: col.str.count('1')).sum(axis=1)

#Simulated Data Boolean
simulated_vcf = True

#Calculate Frequencies (Empirical VCF)
#Sites with no called allele in a population give 0/0 = NaN here.
alleles['f_popA_0'] = alleles['popA_0']/(alleles['popA_0'] + alleles['popA_1'])
alleles['f_popA_1'] = alleles['popA_1']/(alleles['popA_0'] + alleles['popA_1'])
alleles['f_popB_0'] = alleles['popB_0']/(alleles['popB_0'] + alleles['popB_1'])
alleles['f_popB_1'] = alleles['popB_1']/(alleles['popB_0'] + alleles['popB_1'])
#Number of samples with called alleles (allele count / 2).
alleles['nA'] = (alleles['popA_0'] + alleles['popA_1']) / 2
alleles['nB'] = (alleles['popB_0'] + alleles['popB_1']) / 2

#Count Genotypes
#hom = 1/1 or 1|1 ; het = any 0-1 combination, phased or unphased.
alleles['popA_hom'] = aA.apply(lambda col: col.str.count(r'1/1|1\|1')).sum(axis=1)
alleles['popA_het'] = aA.apply(lambda col: col.str.count(r'0/1|0\|1|1/0|1\|0')).sum(axis=1)
alleles['popB_hom'] = aB.apply(lambda col: col.str.count(r'1/1|1\|1')).sum(axis=1)
alleles['popB_het'] = aB.apply(lambda col: col.str.count(r'0/1|0\|1|1/0|1\|0')).sum(axis=1)
freq_cols = ['CHROM','POS','REF','ALT',
             'f_popA_1','f_popB_1',
             'popA_hom', 'popA_het',
             'popB_hom', 'popB_het',
             'nA','nB']

if simulated_vcf:
    #Adjust Frequencies for sites lost/gained (Simulated VCF)
    #A population with no '0' or '1' allele at a site gets frequency 0 and the
    #full population size instead of NaN / 0.
    miss_popA = (alleles['popA_0'] == 0) & (alleles['popA_1'] == 0)
    alleles.loc[miss_popA, 'f_popA_0'] = 0
    alleles.loc[miss_popA, 'f_popA_1'] = 0
    miss_popB = (alleles['popB_0'] == 0) & (alleles['popB_1'] == 0)
    alleles.loc[miss_popB, 'f_popB_0'] = 0
    alleles.loc[miss_popB, 'f_popB_1'] = 0
    alleles.loc[miss_popA, 'nA'] = len(popA)
    #Fix: nB must be patched where popB is missing (the mask used to be miss_popA).
    alleles.loc[miss_popB, 'nB'] = len(popB)
    freq = alleles[freq_cols]
else:
    #Empirical VCF: drop the sites whose frequency is undefined (NaN).
    freq = alleles[freq_cols].dropna(axis="rows")

freq

In [65]:
#Define readvcf()
def readvcf(vcf_file, pop_file, simulated_vcf=False):
    """Read a VCF and return per-site allele frequencies and genotype counts for popA and popB.

    Parameters
    ----------
    vcf_file : str
        Path to a tab-separated VCF. Column names are taken from its '#CHROM' line.
    pop_file : str
        Path to a tab-separated, header-less file whose first column is the
        population label ("popA" / "popB") and whose second column is the sample ID.
    simulated_vcf : bool, default False
        If False, sites with an undefined frequency (NaN) are dropped and only
        sites with nA > 1 and nB > 1 are kept.
        If True, every site is kept; a population with no '0' or '1' allele at a
        site gets frequency 0 and n set to the number of samples in that population.

    Returns
    -------
    pandas.DataFrame
        Columns: CHROM, POS, REF, ALT, f_popA_1, f_popB_1, popA_hom, popA_het,
        popB_hom, popB_het, nA, nB.

    Raises
    ------
    ValueError
        If the VCF contains no '#CHROM' header line.
    """
    #Import Population File
    pops = pd.read_csv(pop_file, sep="\t", header=None)
    popA = pops.loc[pops[0] == "popA", 1].astype(str).tolist()
    popB = pops.loc[pops[0] == "popB", 1].astype(str).tolist()

    #Import VCF
    cols = None
    with open(vcf_file) as file:
        for line in file:
            if line.startswith('#CHROM'):
                cols = line.lstrip("#").strip().split("\t")
                break
    if cols is None:
        raise ValueError("No '#CHROM' header line found in " + str(vcf_file))
    data = pd.read_csv(vcf_file, sep='\t', comment='#', names=cols)

    #Extract Genotype Information
    #Keep only the first ':'-separated field of each sample column (the genotype string).
    loci = data[['CHROM','POS','REF','ALT']]
    samples = data.drop(columns=['CHROM', 'POS', 'REF', 'ALT','ID','QUAL','FILTER','INFO','FORMAT'])
    samples = samples.apply(lambda col: col.str.split(":").str[0])
    alleles = pd.concat([loci,samples], axis=1)

    def count_per_site(genotypes, pattern):
        #Occurrences of the regex `pattern` summed over all samples, per site.
        return genotypes.apply(lambda col: col.str.count(pattern)).sum(axis=1)

    #Count alleles
    gA = alleles[popA].astype(str)
    gB = alleles[popB].astype(str)
    alleles['popA_0'] = count_per_site(gA, '0')
    alleles['popA_1'] = count_per_site(gA, '1')
    alleles['popB_0'] = count_per_site(gB, '0')
    alleles['popB_1'] = count_per_site(gB, '1')

    #Calculate Frequencies (Empirical VCF)
    #A population with no called allele at a site yields 0/0 = NaN.
    totA = alleles['popA_0'] + alleles['popA_1']
    totB = alleles['popB_0'] + alleles['popB_1']
    alleles['f_popA_0'] = alleles['popA_0']/totA
    alleles['f_popA_1'] = alleles['popA_1']/totA
    alleles['f_popB_0'] = alleles['popB_0']/totB
    alleles['f_popB_1'] = alleles['popB_1']/totB
    alleles['nA'] = totA / 2
    alleles['nB'] = totB / 2

    #Count Genotypes
    #Fix: use this call's own genotype frames (gA/gB). The previous code read the
    #notebook globals aA/aB, which only exist if a scratch cell was run first and
    #may belong to a different VCF.
    hom_pattern = r'1/1|1\|1'
    het_pattern = r'0/1|0\|1|1/0|1\|0'
    alleles['popA_hom'] = count_per_site(gA, hom_pattern)
    alleles['popA_het'] = count_per_site(gA, het_pattern)
    alleles['popB_hom'] = count_per_site(gB, hom_pattern)
    alleles['popB_het'] = count_per_site(gB, het_pattern)

    out_cols = ['CHROM','POS','REF','ALT',
                'f_popA_1','f_popB_1',
                'popA_hom', 'popA_het',
                'popB_hom', 'popB_het',
                'nA','nB']

    if not simulated_vcf:
        freq = alleles[out_cols].dropna(axis="rows")
        return freq[(freq["nA"] > 1) & (freq["nB"] > 1)]

    #Adjust Frequencies for sites lost/gained (Simulated VCF)
    miss_popA = (alleles['popA_0'] == 0) & (alleles['popA_1'] == 0)
    alleles.loc[miss_popA, 'f_popA_0'] = 0
    alleles.loc[miss_popA, 'f_popA_1'] = 0
    miss_popB = (alleles['popB_0'] == 0) & (alleles['popB_1'] == 0)
    alleles.loc[miss_popB, 'f_popB_0'] = 0
    alleles.loc[miss_popB, 'f_popB_1'] = 0
    alleles.loc[miss_popA, 'nA'] = len(popA)
    #Fix: nB is patched where popB is missing (the mask used to be miss_popA).
    alleles.loc[miss_popB, 'nB'] = len(popB)
    return alleles[out_cols]

#Define importsites()
def importsites(neutral_file, mutation_file, derived_sites):
    """Restrict the derived sites to those listed in the neutral and mutation site files.

    Both files are read as tab-separated tables with a header row; their
    'chromo' / 'position' columns are renamed to 'CHROM' / 'POS' and each table
    is inner-joined to `derived_sites` on those two keys.

    Prints the number of rows in the mutation join and returns the pair
    (neutral join, mutation join).
    """
    join_keys = ['CHROM', 'POS']
    key_names = {'chromo': 'CHROM', 'position': 'POS'}

    # Read both site tables up front, then join each one to the derived sites.
    site_tables = [
        pd.read_csv(path, sep='\t', header=(0)).rename(columns=key_names)
        for path in (neutral_file, mutation_file)
    ]

    joined = []
    for table in site_tables:
        hits = derived_sites.merge(table, on=join_keys, how='inner', indicator=True)
        hits = hits.loc[hits['_merge'] == 'both'].drop(columns=['_merge'])
        joined.append(hits)
    neu_der, mut_der = joined

    #Report Number of Sites
    print("Number of derived mutations =", len(mut_der))
    return neu_der, mut_der

#Define calcRAB()
def calcRAB(neu_der, mut_der, seed):
    """Return RAB = LAB / LBA for the mutation sites, normalised by neutral sites.

    Seeds NumPy's global RNG with `seed`, takes the first 10000 entries of a
    random permutation of the neutral rows, and computes

        LAB = sum(fA * (1 - fB)) over mut_der / the same sum over the neutral sample
        LBA = sum(fB * (1 - fA)) over mut_der / the same sum over the neutral sample

    where fA / fB are the 'f_popA_1' / 'f_popB_1' columns.
    """
    np.random.seed(seed)
    order = np.random.permutation(len(neu_der))
    neutral_sample = neu_der.iloc[order[:10000]]

    def one_sided(table, carrier, other):
        # Sum over sites of freq(carrier) * (1 - freq(other)).
        return sum(table[carrier] * (1 - table[other]))

    a_col, b_col = 'f_popA_1', 'f_popB_1'
    LAB = one_sided(mut_der, a_col, b_col) / one_sided(neutral_sample, a_col, b_col)
    LBA = one_sided(mut_der, b_col, a_col) / one_sided(neutral_sample, b_col, a_col)
    return LAB / LBA

#Define calcRAB_neu()
def calcRAB_neu(neu_der, seed):
    """Return RAB computed from two random samples of the neutral sites.

    Seeds NumPy's global RNG with `seed`, then draws two independent
    permutations of the neutral rows and keeps the first 10000 entries of each.
    The first sample is used for the numerators of LAB and LBA and the second
    for the denominators; the result is LAB / LBA.
    """
    np.random.seed(seed)
    total = len(neu_der)
    # Two consecutive draws from the same seeded stream: numerator set, then denominator set.
    numer_rows = neu_der.iloc[np.random.permutation(total)[:10000]]
    denom_rows = neu_der.iloc[np.random.permutation(total)[:10000]]

    a_num, b_num = numer_rows['f_popA_1'], numer_rows['f_popB_1']
    a_den, b_den = denom_rows['f_popA_1'], denom_rows['f_popB_1']

    LAB = sum(a_num * (1 - b_num)) / sum(a_den * (1 - b_den))
    LBA = sum(b_num * (1 - a_num)) / sum(b_den * (1 - a_den))
    return LAB / LBA

#Define calcRAB_sub()
def calcRAB_sub(neu_sub, mut_sub):
    """Return RAB = LAB / LBA for already-subsampled neutral and mutation sites.

    Does not seed the RNG: the neutral rows are taken from the first 10000
    entries of a permutation drawn from NumPy's current global random state.
    LAB and LBA are the mutation-site sums of fA*(1-fB) and fB*(1-fA), each
    divided by the same sum over the sampled neutral rows.
    """
    picked = np.random.permutation(len(neu_sub))[:10000]
    neutral_rows = neu_sub.iloc[picked]

    mut_a, mut_b = mut_sub['f_popA_1'], mut_sub['f_popB_1']
    neu_a, neu_b = neutral_rows['f_popA_1'], neutral_rows['f_popB_1']

    lab_num = sum(mut_a * (1 - mut_b))
    lab_den = sum(neu_a * (1 - neu_b))
    lba_num = sum(mut_b * (1 - mut_a))
    lba_den = sum(neu_b * (1 - neu_a))

    LAB = lab_num / lab_den
    LBA = lba_num / lba_den
    return LAB / LBA

#Define samplesites()
def samplesites(sites, psites):
    """Return a random subset of the rows of `sites` without replacement.

    The subset size is len(sites) * psites rounded to the nearest integer.
    Rows are taken from the start of a permutation drawn from NumPy's global
    random state, so the result is in shuffled order.
    """
    total = len(sites)
    keep = int(round(total * psites))
    shuffled = np.random.permutation(total)
    return sites.iloc[shuffled[:keep]]

#Define jackknife()
def jackknife(neu_der, mut_der, psites, iter):
    """Recompute RAB on repeated random subsamples of the sites.

    Parameters
    ----------
    neu_der, mut_der : pandas.DataFrame
        Neutral and mutation site tables, passed to samplesites() and then to
        calcRAB_sub().
    psites : float
        Proportion of rows drawn from each table on every replicate.
    iter : int
        Number of replicates. The name shadows the builtin but is kept so
        existing keyword callers keep working.

    Returns
    -------
    numpy.ndarray
        One RAB value per replicate, in replicate order.

    Notes
    -----
    Nothing is seeded here; replicates draw from NumPy's current global random
    state. The old `if jackknife:` guard was removed: it tested this function
    object, which is always truthy, and would have left `jx` unbound otherwise.
    """
    jx = []
    for _ in range(iter):
        # Keep this draw order (neutral, then mutation) so seeded runs are unchanged.
        neu_sub = samplesites(neu_der, psites)
        mut_sub = samplesites(mut_der, psites)
        jx.append(calcRAB_sub(neu_sub, mut_sub))
    return np.array(jx)
    

## Run

In [71]:
# Input files for this run.
vcf_path = "/Users/abc6435/Desktop/RABvcfs/rab_chr29_test.vcf"
pops_path = "/Users/abc6435/Desktop/RABvcfs/pops.txt"
neutral_path = '/Users/abc6435/Desktop/RABvcfs/intergenic.txt'
mutation_path = '/Users/abc6435/Desktop/RABvcfs/tolerated.txt'

# Per-site frequencies, then the subsets that fall in the neutral / mutation site lists.
der = readvcf(vcf_path, pops_path, simulated_vcf=False)
neutral, mutation = importsites(neutral_path, mutation_path, der)

# Genotype totals across the mutation sites.
genotype_totals = (
    ("PopA Homozygotes = ", 'popA_hom'),
    ("PopA Heterozygotes = ", 'popA_het'),
    ("PopB Homozygotes = ", 'popB_hom'),
    ("PopB Heterozygotes = ", 'popB_het'),
)
for label, column in genotype_totals:
    print(label, mutation[column].sum())

# Call order matters: jackknife() is unseeded and continues from the RNG state
# left behind by calcRAB_neu(neutral, 20).
print("calcRAB", calcRAB(neutral, mutation, 20))
print("calcRAB_neu",calcRAB_neu(neutral, 20))
jx_array=jackknife(neutral, mutation, 0.30, 5)
print("jackknife", jx_array)
np.percentile(jx_array, [2.5, 97.5])

Number of derived mutations = 4
PopA Homozygotes =  1
PopA Heterozygotes =  7
PopB Homozygotes =  1
PopB Heterozygotes =  9
calcRAB 0.7163863178952442
calcRAB_neu 0.9999999999999998
jackknife [0.25666064 0.18805467 0.28849752 0.71764261 0.85804196]


array([0.19491526, 0.84400202])

## Terminal

In [None]:
# Shell commands (run in a terminal): activate the RABvcfs_env virtualenv, then run RABvcfs.py.
# The input file names are relative, so they resolve against the shell's current directory.
# NOTE(review): RABvcfs.py is presumably the script version of the functions in this notebook; confirm.
# NOTE(review): --fileM is nonsynonymous.txt here, while the notebook run above reads tolerated.txt.
source ~/RABvcfs_env/bin/activate
python3 /Users/abc6435/Desktop/KROH/scripts/analysis/slim/rab/RABvcfs.py --vcf "rab_chr29_test.vcf" --pop pops.txt --fileN intergenic.txt --fileM nonsynonymous.txt --seed 34 --psites 0.30 --iter 5 --simulated_vcf

calcRAB 0.9821766816874066
calcRAB_neu 1.021948329297477
jackknife [0.91832788 1.07702961 0.84243057 1.08278136 0.90063503]


array([0.84825102, 1.08220619])