# $R_{AB}$ Executable

## Run Script
- An example of how to run RABmafs.py.
- Package requirements: pandas

`/usr/bin/python3 ./RABmafs.py --popA_mut_sites file1.maf --popB_mut_sites file2.maf --popA_int_sites file3.maf --popB_int_sites file4.maf`

## Define Functions
- First checks file extensions are correct.
- Imports Data.
- Checks that popA and popB have identical mutational and intergenic sites.
- Given that sites are identical, parses the "KnownEM" column into separate vectors.
- Calculates LAB, LBA, and returns RAB. 

In [75]:
#Import Packages
import pandas as pd
import os
import numpy as np
import sys
import argparse

#Define readmafs()
def readmafs(popA_mut_sites, popB_mut_sites, popA_int_sites, popB_int_sites):
    #Checks file extensions (files must end in .maf)
    f1 = popA_mut_sites.split(".")
    if f1[-1] == "gz":
        sys.exit("ERR: MAF file should be unzipped")
    f2 = popB_mut_sites.split(".")
    if f2[-1] == "gz":
        sys.exit("ERR: MAF file should be unzipped")
    f3 = popA_int_sites.split(".")
    if f3[-1] == "gz":
        sys.exit("ERR: MAF file should be unzipped")
    f4 = popB_int_sites.split(".")
    if f4[-1] == "gz":
        sys.exit("ERR: MAF file should be unzipped")

    #Imports Data
    popA_mut = pd.read_csv(popA_mut_sites, sep='\t', header=(0))
    popB_mut = pd.read_csv(popB_mut_sites, sep='\t', header=(0))
    popA_int = pd.read_csv(popA_int_sites, sep='\t', header=(0))
    popB_int = pd.read_csv(popB_int_sites, sep='\t', header=(0))

    #Check Headers
    if not {'chromo', 'position', 'knownEM'}.issubset(popA_mut.columns):
        sys.exit("ERR: Check file headers")
    if not {'chromo', 'position', 'knownEM'}.issubset(popB_mut.columns):
        sys.exit("ERR: Check file headers")
    if not {'chromo', 'position', 'knownEM'}.issubset(popA_int.columns):
        sys.exit("ERR: Check file headers")
    if not {'chromo', 'position', 'knownEM'}.issubset(popB_int.columns):
        sys.exit("ERR: Check file headers")
    
    #Checks Sites in PopA and PopB
    popA_mut_keys = popA_mut['chromo'].astype(str) + "_" + popA_mut['position'].astype(str)
    popB_mut_keys = popB_mut['chromo'].astype(str) + "_" + popB_mut['position'].astype(str)
    popA_int_keys = popA_int['chromo'].astype(str) + "_" + popA_int['position'].astype(str)
    popB_int_keys = popB_int['chromo'].astype(str) + "_" + popB_int['position'].astype(str)
    popA_mut_keys.equals(popB_mut_keys)
    popA_int_keys.equals(popB_int_keys)

    if len(popA_mut_keys)!= len(popB_mut_keys):
        sys.exit("ERR: Number of mutational sites in popA must be identical to popB")
    if len(popA_int_keys)!= len(popB_int_keys):
        sys.exit("ERR: Number of intergenic sites in popA must be identical to popB")
    if not popA_mut_keys.equals(popB_mut_keys):
        sys.exit("ERR: Mutational sites in popA and popB must be identical.")
    if not popA_int_keys.equals(popB_int_keys):
        sys.exit("ERR: Intergenic sites in popA and popB must be identical.")
    else:
        return popA_mut, popB_mut, popA_int, popB_int

#Define parsemafs()
def parsemafs(popA_mut, popB_mut, popA_int, popB_int):
    #Parse Data
    f_AD = popA_mut['knownEM']
    f_BD = popB_mut['knownEM']
    f_AN = popA_int['knownEM']
    f_BN = popB_int['knownEM']
    return f_AD, f_BD, f_AN, f_BN

#Define calcRAB()
def calcRAB(f_AD, f_BD, f_AN, f_BN):
    LAB = sum(f_AD*(1-f_BD))/sum(f_AN*(1-f_BN))
    LBA = sum(f_BD*(1-f_AD))/sum(f_BN*(1-f_BN))
    RAB = LAB/LBA
    return RAB

#Define samplemafs()
def samplemafs(popA_sites, popB_sites, Psites):
    #Subsample Sites
    Nsites = int(round(len(popA_sites) * Psites))
    sites = np.random.permutation(len(popA_sites))[:Nsites]
    popA_sites_subsamp = popA_sites.iloc[sites]
    popB_sites_subsamp = popB_sites.iloc[sites]

    return popA_sites_subsamp, popB_sites_subsamp

#Define jackknife()
def jackknife(popA_mut, popB_mut, popA_int, popB_int, Psites, iter):
    jx = []
    for i in range(iter):
        popA_mut_subsamp, popB_mut_subsamp = samplemafs(popA_mut, popB_mut, Psites)
        popA_int_subsamp, popB_int_subsamp = samplemafs(popA_int, popB_int, Psites)
        
        f_sub_AD, f_sub_BD, f_sub_AN, f_sub_BN = parsemafs(popA_mut_subsamp, 
                                                           popB_mut_subsamp,
                                                           popA_int_subsamp,
                                                           popB_int_subsamp)
        jx.append(calcRAB(f_sub_AD, f_sub_BD, f_sub_AN, f_sub_BN))
    return np.array(jx)

# TEST 
os.chdir('/Users/annamariacalderon/Desktop/')
popA_mut, popB_mut, popA_int, popB_int = readmafs('hKIWA_der_lossoffunction.maf','cKIWA_der_lossoffunction.maf',
         'hKIWA_der_intergenic1.maf','cKIWA_der_intergenic1.maf')
f_AD, f_BD, f_AN, f_BN = parsemafs(popA_mut, popB_mut, popA_int, popB_int)
jx_array = jackknife(popA_mut, popB_mut, popA_int, popB_int, 0.90, 5)
q025, q975 = np.percentile(jx_array, [2.5, 97.5])

print("RAB =", calcRAB(f_AD, f_BD, f_AN, f_BN))
print("2.5% =", q025)
print("97.5% =", q975)


RAB = 0.24029115696196543
2.5% = 0.23785114247492864
97.5% = 0.24101283986596886
