This notebook is best viewed here: https://nbviewer.jupyter.org

This notebook encompasses:
- sending files to start varscan_pipeline on server
- refiltering varscan output to get variety-specific SNPs
- recalculating RD
- maf filtering
- LD pruning to get SNPs for structure estimation in GEA

In [1]:
from pythonimports import *

# copy over fastq and md5 files to compute canada server

In [2]:
# directory holding the files to be transferred
DIR = '/data/fastq/mengmeng/CoAdapTree_DouglasFir/received_2019_Sep10'
# `fs` comes from pythonimports; presumably returns paths in DIR whose name contains `pattern` - TODO confirm
fastqs = fs(DIR, pattern='.fastq')
# display how many fastq files were found
len(fastqs)

352

In [4]:
# build one rsync command per fastq file and keep a copy of the commands on disk
cmdtext = op.join(DIR, 'cp_to_graham_cmds.txt')
with open(cmdtext, 'w') as o:
    cmds = [f'rsync -avz {fastq} graham:/scratch/lindb/DF_pooled/' for fastq in fastqs]
    o.write('\n'.join(cmds))

In [8]:
# `get_client` comes from pythonimports; below, lview is used to submit jobs and dview to push objects
# (presumably an ipyparallel load-balanced view and direct view for the 'default' profile - TODO confirm)
lview, dview = get_client('default')

56 56


In [9]:
def exe(cmd):
    """Run a shell command and return its exit status.

    Positional arguments:
    cmd - str; command line handed to the shell via os.system

    Returns:
    status - int; value returned by os.system (0 on success), so that a failed
             transfer is visible in the job result instead of being silently dropped
    """
    # imported inside the function so that the function is self-contained when it is sent out as a job
    import os
    return os.system(cmd)

In [10]:
# sanity check: there should be one rsync command per fastq file
len(cmds)

352

In [11]:
# send every rsync command through `exe` using lview, then monitor progress
# (`make_jobs` and `watch_async` come from pythonimports; exact semantics not shown here)
jobs = make_jobs(cmds, exe, lview)
watch_async(jobs)

61
352


KeyboardInterrupt: 

In [12]:
# display the path of the file that holds the rsync commands
cmdtext

'/data/fastq/mengmeng/CoAdapTree_DouglasFir/received_2019_Sep10/cp_to_graham_cmds.txt'

In [13]:
# touch the result of every job to make sure no errors occurred
# (presumably `.r` re-raises an exception from a failed job - TODO confirm)
for j in jobs:
    x = j.r

In [18]:
# pick out the rsync commands for the three fastq files that need to be sent again
resend = ['NS.1195.001.D707---D504.DF_p54_cap25_kit3_R1.fastq.gz',
          'NS.1195.001.D707---D504.DF_p54_cap25_kit3_R2.fastq.gz',
          'NS.1195.001.D707---D505.DF_p85_cap27_kit3_R1.fastq.gz']
# the fastq path is the third whitespace-separated field of each rsync command
needed = [cmd for cmd in cmds if op.basename(cmd.split()[2]) in resend]
len(needed)

3

In [20]:
# re-run only the transfers selected in `needed` and monitor progress
jobs = make_jobs(needed, exe, lview)
watch_async(jobs)

3
3


# filter output based on variety

I ran all pops through the pipeline at the same time, but we also want to run analyses on each variety (coastal and interior) separately, so we need to refilter the SNPs.

In [188]:
# (re)connect to the engines; dview is used below to push the filtering functions, lview to submit jobs
lview,dview = get_client()

56 56


In [234]:
# modified from filter_VariantsToTable.py to only pull out baseline-filtered snps based on variety
# modifications are marked with ########## (other than imports)
def pklload(path):
    """Load and return the object stored in the pickle file at `path`.

    NOTE: unpickling can execute arbitrary code - only use this on files you trust.
    """
    import pickle
    # context manager closes the file handle as soon as the object is loaded
    with open(path, 'rb') as handle:
        return pickle.load(handle)
dview['pklload'] = pklload

def get_varscan_names(df, tablefile):                                          ############ added tablefile arg
    """
    Replace varscan's generic 'SampleN' column prefixes with the real sample names.

    Positional arguments:
    df - pandas.dataframe; VariantsToTable output, columns are renamed in place
    tablefile - path to VariantsToTable output, expected at <parentdir>/<pool>/<dir>/<dir>/<file>

    Returns:
    df - the same dataframe with renamed columns
    """
    print('renaming varscan columns ...')
    import os
    # walk three directories up from the file to reach the pool directory
    pooldir = os.path.dirname(os.path.dirname(os.path.dirname(tablefile)))
    pool = os.path.basename(pooldir)
    parentdir = os.path.dirname(pooldir)

    # samps are in the order used to create the varscan cmds (same order as datatable)
    samps = pklload(os.path.join(parentdir, f'{pool}/pkl_files/poolsamps.pkl'))[pool]
    # varscan names the pools Sample1, Sample2, ... in that same order
    generic2samp = {'Sample%s' % number: samp for number, samp in enumerate(samps, start=1)}

    renamed = []
    for col in df:
        if '.' in col:
            gen, rest = col.split(".")
            col = '.'.join([generic2samp[gen], rest])
        renamed.append(col)
    df.columns = renamed
    return df
dview['get_varscan_names'] = get_varscan_names

def load_data(tablefile, variety):
    """
    Read a VariantsToTable file and keep only the columns for one variety.

    Positional arguments:
    tablefile - path to VariantsToTable output - used to find ploidy etc
    variety - key of the notebook-level `varlist` dict; only pools listed there are kept

    Returns:
    df - pandas.dataframe; VariantsToTable output with a 'locus' (CHROM-POS) column,
         restricted to non-pool columns and the pool columns of `variety`
    tf - basename of tablefile
    """
    import os
    import pandas

    tf = os.path.basename(tablefile)

    df = pandas.read_csv(tablefile, sep='\t')
    print(f'{tf} has {len(df.index)} rows (includes multiallelic)')
    # locusID is CHROM-POS
    df['locus'] = ["%s-%s" % pair for pair in zip(df['CHROM'].tolist(), df['POS'].tolist())]
    df = get_varscan_names(df, tablefile)

    # pool columns look like <pool>.<field>; anything without a '.' is kept as is      ### added
    keep = [col for col in df.columns if '.' not in col or col.split(".")[0] in varlist[variety]]
    return df[keep].copy(), tf
dview['load_data'] = load_data

def write_file(tablefile, df, tipe, variety):
    """
    Write filtered pandas.dataframe to file using args to create file name.

    The file is written to '<dirname of tablefile>_<variety>' and named
    '<basename of tablefile minus .txt>_<tipe>_<variety>.txt'.

    Positional arguments:
    tablefile - path to the VariantsToTable output that df was derived from
    df - pandas.dataframe; filtered data to write (tab-delimited, no index)
    tipe - str; label used in the file name (e.g. "SNP")
    variety - str; variety label used in the directory and file name
    """
    import os
#     newfile = tablefile.replace(".txt", f"_{tipe}.txt")                   ########## commented out
    write_dir = os.path.dirname(tablefile) + f"_{variety}"                  ########## added
    # make sure the variety-specific directory exists so the write cannot fail on a missing dir
    os.makedirs(write_dir, exist_ok=True)
    bname = os.path.basename(tablefile).replace(".txt", f"_{tipe}_{variety}.txt")##### added
    newfile = os.path.join(write_dir, bname)                              ########## added
    print(f'{tipe}_path = ', newfile)                                     ########## added

    df.to_csv(newfile, index=False, sep='\t')
    print('finished filtering VariantsToTable file: %s' % newfile)
dview['write_file'] = write_file

def adjust_freqs(smalldf):
    """
    Adjust the FREQ columns of a two-row frame (rows 0 and 1) describing one REF=N locus.

    For every column whose name contains 'GT':
    - if the row-1 genotype is a string other than 'N/N', the row-0 FREQ ("x%") is
      replaced by "(100 - x)%", i.e. expressed with respect to the second alt allele
    - if the row-1 genotype is 'N/N', the row-1 FREQ is set to numpy.nan, and the
      row-0 FREQ is also set to numpy.nan when the row-0 genotype is a string

    Positional arguments:
    smalldf - pandas.dataframe; df with only REF=N, indexed 0 and 1 (modified in place)

    Returns:
    smalldf - the same frame with adjusted freqs
    """
    import pandas
    import numpy
    gtcols = [col for col in smalldf.columns if 'GT' in col]

    for col in gtcols:
        # genotype of this pool on the second row
        gt = smalldf.loc[1, col]
        if isinstance(gt, str):
            # FREQ column of the same pool (text before the first '.' is the pool name)
            freqcol = col.split(".")[0] + '.FREQ'
            if not gt == 'N/N':
                freq = smalldf.loc[0, freqcol]
                if isinstance(freq, str):
                    if "%" in freq:
                        # flip the row-0 frequency: x% -> (100 - x)%
                        newfreq = "%s%%" % (100 - float(freq.split("%")[0]))
                        smalldf.loc[0, freqcol] = newfreq
            else:
                # if gt = N/N, adjust to undefined
                smalldf.loc[1, freqcol] = numpy.nan
        # genotype of this pool on the first row
        gt2 = smalldf.loc[0, col]
        if isinstance(gt2, str):
            # NOTE(review): this tests the row-1 genotype (gt), not gt2 - confirm that is intended
            # (freqcol is always bound here because gt == 'N/N' implies the isinstance branch above ran)
            if gt == 'N/N':
                # if gt = N/N, adjust to undefined
                smalldf.loc[0, freqcol] = numpy.nan
    return smalldf
dview['adjust_freqs'] = adjust_freqs

def get_refn_snps(df, tipe, ndfs=None):
    """
    Isolate polymorphisms with REF=N but two ALT single nucleotide alleles.

    Positional arguments:
    df - pandas.dataframe; current filtered VariantsToTable output
    tipe - str; value of the TYPE column to keep (e.g. "SNP")

    Keyword arguments:
    ndfs - value returned as the second element when no such locus is found

    Returns:
    dfs - list of loci (one-row pandas.dataframes) with REF=N and two ALT alleles,
          freqs adjusted by adjust_freqs and ALT written as "<first ALT>+<second ALT>"
    ndfs - return from pandas.concat(dfs) (or the ndfs argument if dfs is empty)
    """
    import pandas
    # as far as I can tell, crisp output from convert_pooled_vcf.py will not output REF = N
    is_refn = df['REF'] == 'N'
    ndf = df[is_refn].copy()
    ndf = ndf[ndf['TYPE'] == tipe].copy()
    # only loci that appear on exactly two rows are of interest
    ncount = table(ndf['locus'])
    nloci = [locus for locus in ncount if ncount[locus] == 2]
    ndf = ndf[ndf['locus'].isin(nloci)].copy()
    dfs = []
    for locus in uni(ndf['locus']):
        pair = ndf[ndf['locus'] == locus].copy()
        if len(pair.index) != 2:
            continue
        pair.index = range(len(pair.index))
        pair = adjust_freqs(pair)
        pair.loc[0, 'ALT'] = "%s+%s" % (pair.loc[0, 'ALT'], pair.loc[1, "ALT"])
        # keep only the zeroth row, as a one-row dataframe
        dfs.append(pandas.DataFrame(pair.loc[0, :]).T)
    if len(dfs) > 0:
        ndfs = pandas.concat(dfs)
    return (dfs, ndfs)
dview['get_refn_snps'] = get_refn_snps

def keep_snps(df, tf):
    """
    Drop multiallelic loci (CHROM-POS on more than one row) and loci with REF=N.

    Positional arguments:
    df - pandas.dataframe; currently filtered VariantsToTable output
    tf - basename of path to VariantsToTable output
    Returns:
    df - pandas.dataframe; non-multiallelic-filtered VariantsToTable output
    """
    import pandas
    counts = table(df['locus'])
    goodloci = [locus for locus in counts if counts[locus] == 1]
    print(f'{tf} has {len(goodloci)} good loci (non-multiallelic)')

    # a row survives when its locus is on a single line and its REF is known
    single_line = df['locus'].isin(goodloci)
    known_ref = df['REF'] != 'N'
    return df[single_line & known_ref].copy()
dview['keep_snps'] = keep_snps

def filter_missing_data(df, tf, tipe):
    """
    Keep loci with < 25% missing data (remove the others).
    Missing values in the .FREQ columns are counted to assess missing data.

    A locus is kept when its number of missing freqs is below
    max(1, floor(0.25 * number of .FREQ columns)). The lower bound of 1 keeps loci
    without any missing data when there are fewer than four pools (where the floor is 0).

    Positional arguments:
    df - pandas.dataframe; VariantsToTable output
    tf - str; basename of tablefile (unused, kept for a uniform signature)
    tipe - str; one of either "SNP" or "INDEL" (unused, kept for a uniform signature)

    Returns:
    df - pandas.dataframe; missing data-filtered VariantsToTable output, index reset to 0..n-1
    """
    import math
    freqcols = [col for col in df.columns if '.FREQ' in col]
    # never let the threshold reach 0, otherwise `missing < thresh` can never be true
    thresh = max(1, math.floor(0.25 * len(freqcols)))
    # number of missing freqs per locus, counted for all loci at once
    missing = df[freqcols].isna().sum(axis=1)
    df = df[missing < thresh].copy()
    df.index = range(len(df.index))
    return df
dview['filter_missing_data'] = filter_missing_data

def get_copy(df, cols):
    """
    Return a transposed copy of the requested columns, so that `cols` become the index
    and every original row becomes a column. Doing so helps speed things up.
    """
    import pandas
    subset = df[cols]
    return subset.transpose().copy()
dview['get_copy'] = get_copy

def get_variety_freq_cutoffs(variety, ploidy):
    """
    Determine lowfreq, highfreq from the summed ploidy of the pops in one variety.
    Differs from the pipeline's get_freq_cutoffs(tablefile).

    Positional arguments:
    variety - key of the notebook-level `varlist` dict
    ploidy - dict; pop name -> ploidy of that pop

    Returns:
    lowfreq - 1 / (sum of ploidy over the pops of `variety`)
    highfreq - 1 - lowfreq
    """
    total_ploidy = 0
    for pop, popploidy in ploidy.items():
        if pop in varlist[variety]:
            total_ploidy += popploidy
    lowfreq = 1/total_ploidy
    return lowfreq, 1 - lowfreq
dview['get_variety_freq_cutoffs'] = get_variety_freq_cutoffs

def filter_freq(df, tf, tipe, tablefile, variety):
    """
    Keep loci whose global allele frequency lies within [lowfreq, highfreq], and
    store that frequency in the AF column.

    The global frequency of a locus is the ploidy-weighted mean of the pool FREQs that
    are not missing for that locus. Loci with every FREQ missing are dropped.
    lowfreq and highfreq come from get_variety_freq_cutoffs(variety, ploidy).

    Positional arguments:
    df - pandas.dataframe; VariantsToTable output (its index is reset in place)
    tablefile - path to VariantsToTable output - used to find ploidy etc
    tf - str; basename of tablefile
    tipe - str; one of either "SNP" or "INDEL"
    variety - str; variety whose pools determine the frequency cutoffs

    Returns:
    df - pandas.dataframe; freq-filtered VariantsToTable output
    """
    import tqdm
    import pandas
    import os
    import math
    # believe it or not, it's faster to do qual and freq filtering in two steps vs an 'and' statement
#     lowfreq, highfreq = get_freq_cutoffs(tablefile)                                         ############ removed
#     print(f'filtering for global frequency ({lowfreq}, {highfreq})...')                     ############ moved
    # NOTE: this modifies the caller's dataframe (index becomes 0..n-1)
    df.reset_index(drop=True, inplace=True)
    
    # prep for filtering
    freqcols = [col for col in df.columns if '.FREQ' in col]
    # tablefile is expected at <parentdir>/<pool>/<dir>/<dir>/<file>
    pool = os.path.basename(os.path.dirname(os.path.dirname(os.path.dirname(tablefile))))     ############ changed
    parentdir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(tablefile)))) ############
    ploidy = pklload(os.path.join(parentdir, f'{pool}/pkl_files/ploidy.pkl'))[pool]           ############
    lowfreq, highfreq = get_variety_freq_cutoffs(variety, ploidy)                             ############ added
    print(f'filtering for global frequency ({lowfreq}, {highfreq})...')                       ## moved from above
    
    # carry on with poolseq datas
    filtloci = []
    afs = []
    # transposed copy: one column per locus, one row per .FREQ column
    copy = get_copy(df, freqcols)
    for locus in tqdm.tqdm(copy.columns):
        # samp -> freq (as a percentage) for the pools with a non-missing freq at this locus
        freqs = dict((samp.replace(".FREQ",""),freq) for (samp,freq)
                     in copy[locus].str.rstrip('%').astype('float').items()
                     if not math.isnan(freq))  # faster than .str.rstrip('%').astype('float').dropna()
        if len(freqs) > 0:  # avoid loci with all freqs masked (avoid ZeroDivisionError)
            # calc globfreq using the samps/ploidy that are present for this locus
            globfreq = sum([ploidy[samp]*(freq/100)
                            for (samp,freq) in freqs.items()]) / sum([ploidy[samp] for samp in freqs])
            if lowfreq <= globfreq <= highfreq:
                filtloci.append(locus)
                # since we're going in order of rows in df ...
                # ... we can use afs to replace AF col later since we reduce df to filtloci
                afs.append(globfreq)
                # which is about 40x faster than: df.loc[locus, 'AF'] = globfreq
    # NOTE: the message says "MAF >", but the test above is inclusive on both bounds
    print(f'{tf} has {len(filtloci)} {tipe}s that have global MAF > {lowfreq*100}%')
    df = df[df.index.isin(filtloci)].copy()
    df.index = range(len(df.index))
    # afs is in the same order as the rows that were kept
    df['AF'] = afs
    return df
dview['filter_freq'] = filter_freq

def filter_qual(df, tf, tipe, tablefile, variety):
    """
    Mask freqs that have GQ < 20, then filter for missing data and global frequency.

    Positional arguments:
    df - pandas.dataframe; VariantsToTable output (FREQ columns are masked in place)
    tf - str; basename of tablefile
    tipe - str; one of either "SNP" or "INDEL"
    tablefile - path to VariantsToTable output, passed on to filter_freq
    variety - str; variety label, passed on to filter_freq

    Returns: pandas.dataframe; quality-filtered VariantsToTable output
    - FREQ is masked (numpy.nan) if GQ < 20
    - the frequency filter is skipped when no locus passes the missing-data filter
    """
    import tqdm
    import pandas
    import numpy
    quality_cols = [name for name in df.columns if '.GQ' in name]
    print(f'masking bad freqs for {len(quality_cols)} pools...')
    for quality_col in tqdm.tqdm(quality_cols):
        # the FREQ of a pool is undefined wherever the GQ of that pool is below 20
        df.loc[df[quality_col] < 20, quality_col.replace(".GQ", ".FREQ")] = numpy.nan

    print('filtering for missing data ...')
    df = filter_missing_data(df, tf, tipe)

    if len(df.index) == 0:
        print(f'{tf} did not have any {tipe}s that have GQ >= 20 for >= 75% of pops' +
              '\nnot bothering to filter for freq')
        return df

    print(f'{tf} has {len(df.index)} {tipe}s that have GQ >= 20 and < 25% missing data')
    df = filter_freq(df, tf, tipe, tablefile, variety)
    df.index = range(len(df.index))
    return df
dview['filter_qual'] = filter_qual


def main(tablefile, tipe='SNP', parentdir=None, ret=True, variety=None):   ########## changed default args
    """
    Filter one VariantsToTable file for the pools of a single variety.

    Steps: load the table and keep the columns of `variety` (load_data), keep rows with
    TYPE == tipe, set aside REF=N loci with two ALT alleles (get_refn_snps), drop
    multiallelic loci (keep_snps), add the REF=N loci back, filter for quality / missing
    data / frequency (filter_qual, only for varscan SNP tables), then optionally remove
    repeats and paralogs.

    Positional arguments:
    tablefile - path to VariantsToTable output, expected at <parentdir>/<pool>/<dir>/<dir>/<file>

    Keyword arguments:
    tipe - str; one of either "SNP" or "INDEL"
    parentdir - if not None (and tipe == 'SNP'), repeats and paralogs are removed
    ret - if True return the filtered dataframe, otherwise write it with write_file
    variety - str; key of the notebook-level `varlist` dict

    Returns:
    df - filtered pandas.dataframe if ret is True, otherwise None (the file is written instead)
    """
    import pandas
    import os
    # load the data
    df, tf = load_data(tablefile, variety)

    # filter only SNPs
    df = df[df['TYPE'] == tipe].copy()

    # determine loci with REF=N but biallelic otherwise
    dfs = []
    if tipe == 'SNP':
        dfs, ndfs = get_refn_snps(df, tipe)

        # determine which loci are multiallelic
        df = keep_snps(df, tf)

    if len(df.index) == 0:
        # nothing left to filter: hand back (or save) the empty frame and stop here
        if ret is True:
            return df
        write_file(tablefile, df, tipe, variety)
        return None

    # add in loci with REF=N but biallelic otherwise
    if tipe == 'SNP' and len(dfs) > 0:
        print(f'{tf} has {len(ndfs.index)} biallelic {tipe}s with REF=N')
        dfs.append(df)
        df = pandas.concat(dfs)

    # filter for quality and missing data
    df.index = range(len(df.index))
    if 'varscan' in tf and tipe == 'SNP':
        # if we allow to continue for INDEL, each line is treated as a locus (not true for INDEL)
        df = filter_qual(df, tf, tipe, tablefile, variety)

########################################################################################################
    # look for filtering options called at 00_start.py
    if parentdir is not None and tipe == 'SNP':
        # translate stitched (if called at 00_start)   ############## no need to translate for DF

        # the pool name is the directory three levels above the table file           ## added
        pool = os.path.basename(os.path.dirname(os.path.dirname(os.path.dirname(tablefile))))

        # remove repeats (if called at 00_start) - want to remove repeats before paralogs
        df = remove_repeats(df.copy(), parentdir, tablefile, pool, variety)  ###### added variety

        # remove paralog SNPs (if called at 00_start)
        df = remove_paralogs(df.copy(), parentdir, tablefile, pool, variety) ###### added variety
########################################################################################################

    if ret is True:
        print('returning df')
        return df
    else:
        # save
        write_file(tablefile, df, tipe, variety)

# push main and the pythonimports helpers used by the filtering functions (uni, table) to dview
dview['main'] = main
dview['uni'] = uni
from pythonimports import table # in case I use 'table' in an iteration
dview['table'] = table

In [207]:
def remove_paralogs(snps, parentdir, snpspath, pool, variety):
    """
    Remove sites from snptable that are thought to have multiple gene copies align to this position.

    Nothing is removed unless <parentdir>/<pool>/pkl_files/paralog_snps.pkl exists. Unlike the
    pipeline version, the pkl is not read: the paralog file is a fixed path in the Douglas-fir
    reference directory, and the paralogs that were found are written to the
    variety-specific directory next to snpspath.

    # assumes
    # paralog file has 'CHROM' and 'locus' in the header (best if this is the only data, reads in quicker)
    #   where CHROM is the reference chromosome/scaffold
    #   where locus is hyphen-separated CHROM-POS

    # paralog file is created from calling SNPs on haplotype data as diploid
    #   no need to worry about translating stiched -> unstitched if SNPs called on same reference.

    Returns:
    snps - the input if no pkl exists, otherwise a re-indexed copy without the paralog sites
    """
    import os, pandas
    parpkl = os.path.join(parentdir, f'{pool}/pkl_files/paralog_snps.pkl')
    if not os.path.exists(parpkl):
        return snps

    print('Removing paralogs sites ...')
    paralogfile = os.path.join('/data/database/DouglasFir_ref_genome',
                               'DF_mega-varscan_all_bedfiles_SNP_paralog_snps.txt')
    paralogs = pandas.read_table(paralogfile)

    # remove and isolate paralogs from snps
    is_paralog = snps['locus'].isin(paralogs['locus'])
    found_paralogs = snps[is_paralog].copy()
    snps = snps[~is_paralog].copy()
    snps.index = range(len(snps.index))

    # write paralogs to a file in the variety-specific directory
    parafile = os.path.join(os.path.dirname(snpspath) + f"_{variety}",
                            os.path.basename(snpspath).replace(".txt", f"_PARALOGS_{variety}.txt"))
    print('paralog_path = ', parafile)

    found_paralogs.to_csv(parafile, sep='\t', index=False)
    print(f'{os.path.basename(snpspath)} has {len(snps.index)} non-paralog SNPs')
    return snps
dview['remove_paralogs'] = remove_paralogs


def remove_repeats(snps, parentdir, snpspath, pool, variety):
    """
    Remove SNPs that are found to be in repeat-masked regions.

    Nothing is removed unless <parentdir>/<pool>/pkl_files/repeat_regions.pkl exists. The pkl
    itself is not read: the repeat file is a fixed path in the Douglas-fir reference directory.
    SNPs found in repeat regions are written to the variety-specific directory next to snpspath.

    Positional arguments:
    snps - pandas.dataframe of SNPs (needs CHROM/POS or unstitched_chrom/unstitched_pos columns)
    parentdir - directory that contains the pool directory
    snpspath - path of the table the SNPs came from, used to name the output file
    pool - name of the pool directory
    variety - variety label used in the output directory and file name

    Returns:
    snps - the input if no pkl exists, otherwise a re-indexed copy without SNPs in repeat regions

    Raises:
    AssertionError - if a position falls within more than one interval of the repeat file
    
    # assumes
    # that the positions have been translated BEFORE removing repeats
        # took forever to create unstitched repeat regions, don't want to translate repeat file
        # this way I can just use unstitched chrom if reference is stitched
    # repeat file has a header ('CHROM', 'start', 'stop')
    # start and stop positions of repeat regions are 1-based
    """
    import pandas
    import tqdm
    import os
    reppkl = os.path.join(parentdir, f'{pool}/pkl_files/repeat_regions.pkl')
    if os.path.exists(reppkl):
        # read in repeat regions
#         repeatdict = pklload(reppkl)                                             ########## commented out
#         if repeatdict[pool] is not None:                                         ########## commented out
        if True:                                                                   ########## added
            print('Removing repeat regions ...')
            # if user selected translation be applied to this pool
#             repeats = pd.read_csv(repeatdict[pool], sep='\t')                    ########## commented out
            repeats = pandas.read_table('/data/database/DouglasFir_ref_genome/DF_ref_edit_repeats.txt')   #### added
            # figure out if data is from stitched or not
            if 'unstitched_chrom' in snps.columns:
                # then the snps have been translated: stitched -> unstitched
                chromcol = 'unstitched_chrom'
                poscol = 'unstitched_pos'
                print('\tsnps have been translated')
            else:
                # otherwise SNPs were called on unstitched reference
                chromcol = 'CHROM'
                poscol = 'POS'
                print('\tsnps have not been translated')
            # reduce repeats to the chroms that matter (helps speed up lookups)
            repeats = repeats[repeats['CHROM'].isin(snps[chromcol].tolist())].copy()

            # isolate SNPs in repeat regions (repeat_snps collects index labels of snps)
            repeat_snps = []
            for chrom in tqdm.tqdm(uni(snps[chromcol])):
                reps = repeats[repeats['CHROM'] == chrom].copy()
                mysnps = snps[snps[chromcol] == chrom].copy()
                if len(reps.index) > 0 and len(mysnps.index) > 0:
                    for row in mysnps.index:
                        pos = snps.loc[row, poscol]  # index is maintained from snps to mysnsps
                        # intervals with start <= pos <= stop (both ends inclusive)
                        df = reps[reps['stop'].astype(int) >= int(pos)].copy()
                        df = df[df['start'].astype(int) <= int(pos)].copy()
                        if len(df.index) > 0:
                            # a position is expected to fall in at most one repeat interval
                            assert len(df.index) == 1
                            repeat_snps.append(row)

            # save repeats
            # NOTE: the count printed here is the number of SNPs found in repeat regions
            print(f'\tSaving {len(repeat_snps)} repeat regions')
#             repeat_path = snpspath.replace(".txt", "_REPEATS.txt")               ########## comm ented out
            write_dir = os.path.dirname(snpspath) + f"_{variety}"                  ########## added
            bname = os.path.basename(snpspath).replace(".txt", f"_REPEATS_{variety}.txt")### added   
            repeat_path = os.path.join(write_dir, bname)                           ########## added
            print('repeat_path = ', repeat_path)                                   ########## added
            
            myrepeats = snps[snps.index.isin(repeat_snps)].copy()
            myrepeats.to_csv(repeat_path, sep='\t', index=False)

            # remove SNPs in repeat regions
            snps = snps[~snps.index.isin(repeat_snps)].copy()
            snps.index = range(len(snps.index))

            print(f'{os.path.basename(snpspath)} has {len(snps.index)} SNPs outside of repeat regions')

    return snps
dview['remove_repeats'] = remove_repeats

In [5]:
# envdata has variety ID
envdata = pd.read_table('/data/projects/pool_seq/environemental_data/df_std_env-19variables.txt')
envdata = envdata[envdata['our_id'].notna()]  # removes irrelevant pop with our_id=nan

# pool2var: pool -> variety ; varlist: variety -> list of pools (in file order)
pool2var = {}
varlist = {}
for pool, variety in zip(envdata['our_id'], envdata['Variety']):
    pool2var[pool] = variety
    varlist.setdefault(variety, []).append(pool)
# the filtering functions look up varlist as a global, so push it to dview
dview['varlist'] = varlist

In [97]:
# create directories to save files
# one output directory per variety; this is where write_file, remove_repeats and remove_paralogs write
# (`makedir` comes from pythonimports)
for variety in varlist.keys():
    makedir(f'/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/01_unfiltered_{variety}')
    print(variety)

FDC
FDI


In [113]:
# test out filtering
# run the filter on a single bedfile table for the FDC variety
tablefile = '/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/01_unfiltered/DF_pooled_varscan_bedfile_0391_table.txt'
# with ret=False main writes the result to file and returns None, so df is None here
df = main(tablefile, tipe='SNP', parentdir="/data/projects/pool_seq/DF_datasets/DF_pooled_GEA",
          ret=False, variety='FDC')

  0%|          | 0/45 [00:00<?, ?it/s]

DF_pooled_varscan_bedfile_0391_table.txt has 11890 rows (includes multiallelic)
renaming varscan columns ...
DF_pooled_varscan_bedfile_0391_table.txt has 10219 good loci (non-multiallelic)
masking bad freqs for 45 pools...


100%|██████████| 45/45 [00:00<00:00, 643.56it/s]


filtering for missing data ...


100%|██████████| 10216/10216 [00:00<00:00, 15204.98it/s]
  4%|▍         | 199/4571 [00:00<00:02, 1984.03it/s]

DF_pooled_varscan_bedfile_0391_table.txt has 4571 SNPs that have GQ >= 20 and < 25% missing data
filtering for global frequency (0.0002796420581655481, 0.9997203579418344)...


100%|██████████| 4571/4571 [00:02<00:00, 2018.31it/s]


DF_pooled_varscan_bedfile_0391_table.txt has 4137 SNPs that have global MAF > 0.02796420581655481%
Removing repeat regions ...


  0%|          | 0/29 [00:00<?, ?it/s]

	snps have not been translated


100%|██████████| 29/29 [00:05<00:00,  5.19it/s]


	Saving 56 repeat regions
repeat_path =  /data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/01_unfiltered_FDC/DF_pooled_varscan_bedfile_0391_table_REPEATS_FDC.txt
DF_pooled_varscan_bedfile_0391_table.txt has 4081 SNPs outside of repeat regions
Removing paralogs sites ...
paralog_path =  /data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/01_unfiltered_FDC/DF_pooled_varscan_bedfile_0391_table_PARALOGS_FDC.txt
DF_pooled_varscan_bedfile_0391_table.txt has 4076 non-paralog SNPs
SNP_path =  /data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/01_unfiltered_FDC/DF_pooled_varscan_bedfile_0391_table_SNP_FDC.txt
finished filtering VariantsToTable file: /data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/01_unfiltered_FDC/DF_pooled_varscan_bedfile_0391_table_SNP_FDC.txt


#### now do in parallel

In [192]:
# get all of the varscan outputs
# one VariantsToTable file per bedfile (`fs` comes from pythonimports)
files = fs('/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/01_unfiltered',
           endswith='table.txt')
len(files)

932

In [241]:
# filter FDC variety
# one job per table file; ret=False so each job writes its own output file
jobs = []
for f in files:
    jobs.append(lview.apply_async(main, f,
                                  tipe='SNP',
                                  parentdir="/data/projects/pool_seq/DF_datasets/DF_pooled_GEA",
                                  ret=False,
                                  variety='FDC'))
watch_async(jobs)

932
932


In [248]:
# make sure no errors
# touching the result of each job is expected to surface any error raised inside it
for j in jobs:
    x = j.r

In [249]:
# filter FDI variety
# one job per table file; ret=False so each job writes its own output file
fdijobs = []
for f in files:
    fdijobs.append(lview.apply_async(main, f,
                                     tipe='SNP',
                                     parentdir="/data/projects/pool_seq/DF_datasets/DF_pooled_GEA",
                                     ret=False,
                                     variety='FDI'))
watch_async(fdijobs)

932
932


In [250]:
# make sure no errors
# touching the result of each job is expected to surface any error raised inside it
for j in fdijobs:
    x = j.r

In [254]:
# check to see how many files were produced per bedfile
# swap the commented lines below to check the FDC directory instead of the FDI directory
# filtfiles = fs('/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/01_unfiltered_FDC',
#                endswith='.txt')
filtfiles = fs('/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/01_unfiltered_FDI',
               endswith='.txt')
# group the output files by the bedfile id embedded in their name (..._bedfile_<id>_table...)
bedtofiles = {}
for f in filtfiles:
    bed = f.split("bedfile_")[1].split("_table")[0]
    assert float(bed) == int(bed)
    bedtofiles.setdefault(bed, []).append(f)

In [255]:
# flag every input table whose bedfile id does not have exactly three output files
# (the SNP, PARALOGS and REPEATS files). Looping over the input files themselves checks
# every bedfile, including the highest id and any bedfile that produced no output at all,
# and does not rely on the position in `files` matching the bedfile number.
missing = []
for tablepath in files:
    bed = tablepath.split("bedfile_")[1].split("_table")[0]
    if len(bedtofiles.get(bed, [])) != 3:
        missing.append(tablepath)
len(missing)

0

#### combine dataframes

In [1]:
from pythonimports import *

In [257]:
# make new dirs
# one directory per variety for the combined (all bedfiles) tables
for variety in varlist:
    makedir(f'/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/02_baseline_filtered_{variety}')

In [2]:
# connect to the engines again; lview is used below to read files in parallel
lview,dview = get_client()

56 56


In [3]:
def read_df(f):
    """Read the tab-delimited file `f` and return it as a pandas dataframe."""
    import pandas
    return pandas.read_csv(f, sep='\t')

In [6]:
# combine dfs
# for each variety and each file type, read the per-bedfile tables in parallel and write one combined table
# NOTE(review): varlist is not defined in the cells shown since the re-import above - presumably the
# envdata cell was re-run in this session; verify before Restart & Run All
for variety in varlist:
    d = f'/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/01_unfiltered_{variety}'
    dstdir = f'/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/02_baseline_filtered_{variety}'
    assert op.exists(dstdir)
    for tipe in ['SNP', 'PARALOGS', 'REPEATS']:
        files = fs(d, pattern=tipe, endswith='.txt')
        jobs = make_jobs(files, read_df, lview)
        watch_async(jobs)
        # rows are concatenated in the order of `files`
        df = pd.concat([j.r for j in jobs])
        file = op.join(dstdir, f'DF_pooled-varscan_all_bedfiles_{tipe}_{variety}.txt')
        print(variety, tipe, len(files), nrow(df))
        df.to_csv(file, sep='\t', index=False)

932
932
FDI REPEATS 932 282344


# filter for MAF

filter for MAF >= 0.05 for each variety

In [2]:
from pythonimports import *

In [3]:
# connect to the engines; dview receives the chunk-reading functions below, lview runs the jobs
lview,dview = get_client()

56 56


In [15]:
def get_skipto_df(f, skipto, nrows, cols=None, filter_maf=False, **kwargs):
    """Retrieve dataframe in parallel so that all rows are captured when iterating.

    A call returns data rows [skipto, skipto + nrows) of the file (header excluded, 0-based),
    so iterating skipto over range(0, total_rows, nrows) covers every row exactly once,
    including when total_rows is an exact multiple of nrows.

    f = filename to open (tab-delimited, with a header line)
    skipto = number of data rows to skip, rows are read thereafter
    nrows = how many rows to read from f after skipto
    cols = optional column name (str) or list of column names to keep
    filter_maf = if True, return maf_filter(chunk, **kwargs) instead of the raw chunk
    kwargs = passed on to maf_filter (e.g. maf=0.05)
    """
    import pandas

    if skipto == 0:
        df = pandas.read_table(f, nrows=nrows)
    else:
        # file line 0 is the header, so data row i is on file line i + 1:
        # skipping file lines 1..skipto keeps the header and drops the first `skipto` data rows
        df = pandas.read_table(f, skiprows=range(1, skipto + 1), nrows=nrows)

    if cols is not None:
        if isinstance(cols, str):
            cols = [cols]
        df = df[cols].copy()

    if filter_maf is True:
        return maf_filter(df, **kwargs)

    return df
dview['get_skipto_df'] = get_skipto_df

def maf_filter(chunk, maf=0.05, **kwargs):
    """Keep rows whose minor allele frequency is >= maf and add a float MAF column.

    chunk = DataFrame with an 'AF' column (numeric, or strings that parse as floats)
    maf = minimum minor allele frequency to keep
    kwargs = accepted for call compatibility; not used

    MAF is AF when AF <= 0.5 and 1 - AF otherwise. Rows with a missing AF are
    dropped. `chunk` itself is not modified; a filtered copy is returned with
    'MAF' appended as the last column.
    """
    # cast once: AF may have been read as text, and arithmetic on the raw column would fail
    af = chunk['AF'].astype(float)

    # fold frequencies above 0.5 onto the minor allele
    minor = af.where(af <= 0.5, 1 - af)

    # filtering on the computed MAF (rather than on AF against 1 - maf) guarantees
    # the returned MAF values satisfy the threshold even when 1 - maf is subject
    # to floating-point rounding
    keep = minor >= maf

    df = chunk[keep].copy()
    # assign positionally so the result does not depend on index alignment
    df['MAF'] = minor[keep].to_numpy()
    assert not (df['MAF'] < maf).any()

    return df
dview['maf_filter'] = maf_filter

In [42]:
# get linenums for each variety and each type
# linenums maps each combined table's path to its number of data rows:
# the `wc -l` line count minus one for the header line
linenums = {}
for variety in ['FDI', 'FDC']:
    d = f'/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/02_baseline_filtered_{variety}'
    for tipe in ['SNP', 'PARALOGS', 'REPEATS']:
        f = op.join(d, f'DF_pooled-varscan_all_bedfiles_{tipe}_{variety}.txt')
        # IPython shell capture: `out` is the list of output lines from wc
        out = !wc -l $f
        linenums[f] = int(out[0].split()[0])-1
        print(variety, tipe, linenums[f])

FDI SNP 9062679
FDI PARALOGS 1859
FDI REPEATS 282344
FDC SNP 9409666
FDC PARALOGS 1831
FDC REPEATS 291081


#### first filter interior

In [19]:
# read in SNPs in parallel
# the interior (FDI) SNP table is split into 50,000-row chunks; each job reads one chunk
# with get_skipto_df and MAF-filters it (maf=0.05) before returning it
f = '/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/02_baseline_filtered_FDI/DF_pooled-varscan_all_bedfiles_SNP_FDI.txt'
nrows = 50000
jobs = []
count = 0
# linenums[f] is the number of data rows in f, so skipto takes the values 0, nrows, 2*nrows, ...
for skipto in range(0, linenums[f], nrows):
    # NOTE(review): `num` and `count` are not used anywhere else in this cell
    num = str(count).zfill(4)
    jobs.append(lview.apply_async(get_skipto_df, *(f, skipto, nrows), **{'filter_maf':True, 'maf':0.05}))
    count += 1
watch_async(jobs)

182
182


In [20]:
# check for errors
for j in jobs:
    x = j.r

In [21]:
# concat filtered data
interior = pd.concat([j.r for j in jobs])
interior.shape

(2609583, 351)

In [None]:
# since I have this SNP file read in already, go to Recalculate RD section, then come back for coastal variety

In [43]:
linenums

{'/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/02_baseline_filtered_FDI/DF_pooled-varscan_all_bedfiles_SNP_FDI.txt': 9062679,
 '/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/02_baseline_filtered_FDI/DF_pooled-varscan_all_bedfiles_PARALOGS_FDI.txt': 1859,
 '/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/02_baseline_filtered_FDI/DF_pooled-varscan_all_bedfiles_REPEATS_FDI.txt': 282344,
 '/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/02_baseline_filtered_FDC/DF_pooled-varscan_all_bedfiles_SNP_FDC.txt': 9409666,
 '/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/02_baseline_filtered_FDC/DF_pooled-varscan_all_bedfiles_PARALOGS_FDC.txt': 1831,
 '/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/02_baseline_filtered_FDC/DF_pooled-varscan_all_bedfiles_REPEATS_FDC.txt': 291081}

In [47]:
# now do paralogs and repeats
# these two tables are read whole and filtered in the notebook process (no chunking)
fdi_remaining = {}
# positions 1 and 2 of linenums are the FDI PARALOGS and REPEATS paths; this slice relies
# on the order in which linenums was filled (FDI SNP, PARALOGS, REPEATS, then FDC)
for key in keys(linenums)[1:3]:
    df = pd.read_csv(key, sep='\t')
    print(op.basename(key), df.shape)
    # maf_filter is called with its default threshold (maf=0.05)
    fdi_remaining[key] = maf_filter(df)
    print('\t', fdi_remaining[key].shape)

DF_pooled-varscan_all_bedfiles_PARALOGS_FDI.txt (1859, 350)
	 (988, 351)
DF_pooled-varscan_all_bedfiles_REPEATS_FDI.txt (282344, 350)
	 (89609, 351)


In [None]:
# go to recalc RD and then come back for coastal variety paralogs and repeats

#### now do coastal variety

In [31]:
# read in SNPs in parallel
# same procedure as for the interior variety, applied to the coastal (FDC) SNP table:
# 50,000-row chunks, each read with get_skipto_df and MAF-filtered (maf=0.05) by its job
f = '/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/02_baseline_filtered_FDC/DF_pooled-varscan_all_bedfiles_SNP_FDC.txt'
nrows = 50000
# NOTE: `jobs` is rebound here, replacing the list of interior jobs
jobs = []
count = 0
for skipto in range(0, linenums[f], nrows):
    # NOTE(review): `num` and `count` are not used anywhere else in this cell
    num = str(count).zfill(4)
    jobs.append(lview.apply_async(get_skipto_df, *(f, skipto, nrows), **{'filter_maf':True, 'maf':0.05}))
    count += 1
watch_async(jobs)

189
189


In [32]:
# check for errors
for j in jobs:
    x = j.r

In [33]:
# concat filtered data
coastal = pd.concat([j.r for j in jobs])
coastal.shape

(2350673, 375)

In [56]:
keys(linenums)[4:]

['/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/02_baseline_filtered_FDC/DF_pooled-varscan_all_bedfiles_PARALOGS_FDC.txt',
 '/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/02_baseline_filtered_FDC/DF_pooled-varscan_all_bedfiles_REPEATS_FDC.txt']

In [58]:
# now do paralogs and repeats
# these two tables are read whole and filtered in the notebook process (no chunking)
fdc_remaining = {}
# positions 4 and 5 of linenums are the FDC PARALOGS and REPEATS paths; this slice relies
# on the order in which linenums was filled
for key in keys(linenums)[4:]:
    df = pd.read_csv(key, sep='\t')
    print(op.basename(key), df.shape)
    # maf_filter is called with its default threshold (maf=0.05)
    fdc_remaining[key] = maf_filter(df)
    print('\t', fdc_remaining[key].shape)

DF_pooled-varscan_all_bedfiles_PARALOGS_FDC.txt (1831, 374)
	 (1110, 375)
DF_pooled-varscan_all_bedfiles_REPEATS_FDC.txt (291081, 374)
	 (79955, 375)


# Recalculate RD

Looking at our test data (one pool-seq population vs. individual sequencing of the same individuals), AD/DP was consistent with the frequency predicted by GATK, whereas adjusting FREQ to AD / (AD + RD) decreased concordance between the two datasets. To keep the uncorrected and corrected data consistent with each other, I'm setting RD = DP - AD here so that we don't have to make further adjustments in the future.

In [27]:
def recalc_rd(df):
    """Recalculate every per-pool reference depth so that RD = DP - AD.

    For each column named '<pop>.RD', the column is overwritten with
    '<pop>.DP' minus '<pop>.AD'. A KeyError is raised if a matching DP or AD
    column is missing.

    The frame is modified in place and the same object is returned, so the
    frame passed in by the caller also carries the new RD values.
    """
    # match on the suffix rather than a substring, and split on the last dot only,
    # so names that contain extra dots (or '.RD' mid-name) are handled correctly
    rdcols = [col for col in df if col.endswith('.RD')]
    # nb() wraps the iterable; the notebook output shows a progress bar per call
    for col in nb(rdcols):
        pop = col.rsplit('.', 1)[0]
        df[col] = df[f'{pop}.DP'] - df[f'{pop}.AD']
    return df

def save_file(df, f):
    """Write `df` to path `f` as a tab-delimited file without the index; return `f`.

    Kept self-contained so it can also be submitted to an ipcluster engine,
    letting the write run in the background while work continues in the notebook.
    """
    import pandas
    df.to_csv(path_or_buf=f, index=False, sep='\t')
    return f

In [23]:
# make dirs
# newdirs maps variety -> output directory for the MAF-filtered, RD-recalculated tables;
# the value stored is whatever makedir returns (the printed output shows the directory path)
newdirs = {}
for variety in ['FDI', 'FDC']:
    newdirs[variety] = makedir(f'/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/03_maf-p05_RD-recalculated_{variety}')
    print(newdirs[variety])

/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/03_maf-p05_RD-recalculated_FDI
/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/03_maf-p05_RD-recalculated_FDC


#### recalc interior files

In [24]:
# before recalculating SNP data, compare to below
interior.head()

Unnamed: 0,CHROM,POS,REF,ALT,AF,QUAL,TYPE,FILTER,ADP,WT,HET,HOM,NC,DF_p18.GT,DF_p18.GQ,DF_p18.SDP,DF_p18.DP,DF_p18.FREQ,DF_p18.PVAL,DF_p18.AD,DF_p18.RD,DF_p19.GT,DF_p19.GQ,DF_p19.SDP,DF_p19.DP,DF_p19.FREQ,DF_p19.PVAL,DF_p19.AD,DF_p19.RD,DF_p20.GT,DF_p20.GQ,DF_p20.SDP,DF_p20.DP,DF_p20.FREQ,DF_p20.PVAL,DF_p20.AD,DF_p20.RD,DF_p3.GT,DF_p3.GQ,DF_p3.SDP,DF_p3.DP,DF_p3.FREQ,DF_p3.PVAL,DF_p3.AD,DF_p3.RD,DF_p33.GT,DF_p33.GQ,DF_p33.SDP,DF_p33.DP,DF_p33.FREQ,...,DF_p86.GT,DF_p86.GQ,DF_p86.SDP,DF_p86.DP,DF_p86.FREQ,DF_p86.PVAL,DF_p86.AD,DF_p86.RD,DF_p87.GT,DF_p87.GQ,DF_p87.SDP,DF_p87.DP,DF_p87.FREQ,DF_p87.PVAL,DF_p87.AD,DF_p87.RD,DF_p88.GT,DF_p88.GQ,DF_p88.SDP,DF_p88.DP,DF_p88.FREQ,DF_p88.PVAL,DF_p88.AD,DF_p88.RD,DF_p89.GT,DF_p89.GQ,DF_p89.SDP,DF_p89.DP,DF_p89.FREQ,DF_p89.PVAL,DF_p89.AD,DF_p89.RD,DF_p9.GT,DF_p9.GQ,DF_p9.SDP,DF_p9.DP,DF_p9.FREQ,DF_p9.PVAL,DF_p9.AD,DF_p9.RD,DF_p94.GT,DF_p94.GQ,DF_p94.SDP,DF_p94.DP,DF_p94.FREQ,DF_p94.PVAL,DF_p94.AD,DF_p94.RD,locus,MAF
0,jcf7190000000000,77602,G,T,0.822964,-10.0,SNP,PASS,11,6,34,26,21,G/T,84.0,24,24.0,79.17%,3.6826e-09,19.0,5.0,T/T,121.0,26,26.0,92.31%,7.6222e-13,24.0,2.0,./.,,2,,,,,,T/T,87.0,20,20.0,90%,1.6758e-09,18.0,2.0,./.,,5,,,...,G/T,29.0,9,9.0,77.78%,0.0011312,7.0,2.0,T/T,44.0,12,12.0,83.33%,3.3652e-05,10.0,2.0,T/T,52.0,10,10.0,100%,5.4125e-06,10.0,0.0,T/T,52.0,15,15.0,80%,5.2605e-06,12.0,3.0,T/T,39.0,11,11.0,81.82%,0.000111,9.0,2.0,G/T,31.0,12,12.0,66.67%,0.00067304,8.0,4.0,jcf7190000000000-77602,0.177036
2,jcf7190000000000,77657,T,C,0.808995,-10.0,SNP,PASS,16,5,51,26,5,C/C,160.0,43,43.0,81.4%,9.5933e-17,35.0,8.0,C/C,128.0,34,34.0,82.35%,1.349e-13,28.0,6.0,./.,,3,,,,,,C/C,71.0,20,20.0,80%,7.7086e-08,16.0,4.0,C/C,47.0,11,11.0,90.91%,...,C/C,68.0,18,18.0,83.33%,1.4655e-07,15.0,3.0,C/C,81.0,17,17.0,94.12%,7.7134e-09,16.0,1.0,C/C,60.0,15,15.0,86.67%,8.7675e-07,13.0,2.0,T/C,79.0,23,23.0,78.26%,1.1937e-08,18.0,5.0,T/C,49.0,17,17.0,70.59%,1.1e-05,12.0,5.0,T/C,60.0,18,18.0,77.78%,8.0605e-07,14.0,4.0,jcf7190000000000-77657,0.191005
5,jcf7190000000000,77738,T,C,0.804214,-10.0,SNP,PASS,13,2,49,28,8,T/C,103.0,32,32.0,75%,4.1964e-11,24.0,8.0,C/C,126.0,27,27.0,92.59%,2.0853e-13,25.0,2.0,./.,,3,,,,,,C/C,76.0,18,18.0,88.89%,2.0936e-08,16.0,2.0,T/C,29.0,9,9.0,77.78%,...,C/C,71.0,18,18.0,88.24%,7.3277e-08,15.0,2.0,C/C,52.0,10,10.0,100%,5.4125e-06,10.0,0.0,./.,,7,,,,,,C/C,105.0,25,25.0,88%,2.5916e-11,22.0,3.0,T/C,18.0,9,9.0,,0.014706,5.0,4.0,C/C,55.0,14,14.0,85.71%,2.9913e-06,12.0,2.0,jcf7190000000000-77738,0.195786
6,jcf7190000000000,77764,T,C,0.933934,-10.0,SNP,PASS,12,1,28,50,8,C/C,127.0,29,29.0,89.66%,1.6496e-13,26.0,3.0,C/C,117.0,21,21.0,100%,1.8578e-12,21.0,0.0,./.,,4,,,,,,C/C,76.0,14,14.0,100%,2.4927e-08,14.0,0.0,C/C,58.0,11,11.0,100%,...,C/C,71.0,17,17.0,88.24%,7.3277e-08,15.0,2.0,C/C,52.0,10,10.0,100%,5.4125e-06,10.0,0.0,C/C,36.0,9,9.0,88.89%,0.00020568,8.0,1.0,C/C,86.0,18,18.0,94.44%,2.0936e-09,17.0,1.0,C/C,52.0,10,10.0,100%,5e-06,10.0,0.0,C/C,52.0,10,10.0,100%,5.4125e-06,10.0,0.0,jcf7190000000000-77764,0.066066
7,jcf7190000000000,77784,A,G,0.904187,-10.0,SNP,PASS,11,0,32,38,17,G/G,107.0,27,27.0,85.19%,1.6161e-11,23.0,4.0,G/G,111.0,20,20.0,100%,7.2544e-12,20.0,0.0,./.,,3,,,,,,G/G,64.0,12,12.0,100%,3.698e-07,12.0,0.0,G/G,46.0,9,9.0,100%,...,G/G,49.0,13,13.0,84.62%,1.0096e-05,11.0,2.0,G/G,41.0,8,8.0,100%,7.77e-05,8.0,0.0,G/G,46.0,9,9.0,100%,2.0568e-05,9.0,0.0,G/G,34.0,10,10.0,80%,0.00035723,8.0,2.0,G/G,46.0,9,9.0,100%,2.1e-05,9.0,0.0,A/G,37.0,12,12.0,75%,0.00016826,9.0,3.0,jcf7190000000000-77784,0.095813


In [25]:
# recalculate SNP data
# NOTE: recalc_rd assigns into the frame it is given and returns that same frame, so after
# this cell `interior_recalc` and `interior` refer to the same, already-modified object
# (re-displaying `interior` will no longer show the original RD values)
print(interior.shape)
interior_recalc = recalc_rd(interior)
print(interior_recalc.shape)
interior_recalc.head()

 17%|█▋        | 7/42 [00:00<00:00, 69.41it/s]

(2609583, 351)


100%|██████████| 42/42 [00:00<00:00, 74.59it/s]


(2609583, 351)


Unnamed: 0,CHROM,POS,REF,ALT,AF,QUAL,TYPE,FILTER,ADP,WT,HET,HOM,NC,DF_p18.GT,DF_p18.GQ,DF_p18.SDP,DF_p18.DP,DF_p18.FREQ,DF_p18.PVAL,DF_p18.AD,DF_p18.RD,DF_p19.GT,DF_p19.GQ,DF_p19.SDP,DF_p19.DP,DF_p19.FREQ,DF_p19.PVAL,DF_p19.AD,DF_p19.RD,DF_p20.GT,DF_p20.GQ,DF_p20.SDP,DF_p20.DP,DF_p20.FREQ,DF_p20.PVAL,DF_p20.AD,DF_p20.RD,DF_p3.GT,DF_p3.GQ,DF_p3.SDP,DF_p3.DP,DF_p3.FREQ,DF_p3.PVAL,DF_p3.AD,DF_p3.RD,DF_p33.GT,DF_p33.GQ,DF_p33.SDP,DF_p33.DP,DF_p33.FREQ,...,DF_p86.GT,DF_p86.GQ,DF_p86.SDP,DF_p86.DP,DF_p86.FREQ,DF_p86.PVAL,DF_p86.AD,DF_p86.RD,DF_p87.GT,DF_p87.GQ,DF_p87.SDP,DF_p87.DP,DF_p87.FREQ,DF_p87.PVAL,DF_p87.AD,DF_p87.RD,DF_p88.GT,DF_p88.GQ,DF_p88.SDP,DF_p88.DP,DF_p88.FREQ,DF_p88.PVAL,DF_p88.AD,DF_p88.RD,DF_p89.GT,DF_p89.GQ,DF_p89.SDP,DF_p89.DP,DF_p89.FREQ,DF_p89.PVAL,DF_p89.AD,DF_p89.RD,DF_p9.GT,DF_p9.GQ,DF_p9.SDP,DF_p9.DP,DF_p9.FREQ,DF_p9.PVAL,DF_p9.AD,DF_p9.RD,DF_p94.GT,DF_p94.GQ,DF_p94.SDP,DF_p94.DP,DF_p94.FREQ,DF_p94.PVAL,DF_p94.AD,DF_p94.RD,locus,MAF
0,jcf7190000000000,77602,G,T,0.822964,-10.0,SNP,PASS,11,6,34,26,21,G/T,84.0,24,24.0,79.17%,3.6826e-09,19.0,5.0,T/T,121.0,26,26.0,92.31%,7.6222e-13,24.0,2.0,./.,,2,,,,,,T/T,87.0,20,20.0,90%,1.6758e-09,18.0,2.0,./.,,5,,,...,G/T,29.0,9,9.0,77.78%,0.0011312,7.0,2.0,T/T,44.0,12,12.0,83.33%,3.3652e-05,10.0,2.0,T/T,52.0,10,10.0,100%,5.4125e-06,10.0,0.0,T/T,52.0,15,15.0,80%,5.2605e-06,12.0,3.0,T/T,39.0,11,11.0,81.82%,0.000111,9.0,2.0,G/T,31.0,12,12.0,66.67%,0.00067304,8.0,4.0,jcf7190000000000-77602,0.177036
2,jcf7190000000000,77657,T,C,0.808995,-10.0,SNP,PASS,16,5,51,26,5,C/C,160.0,43,43.0,81.4%,9.5933e-17,35.0,8.0,C/C,128.0,34,34.0,82.35%,1.349e-13,28.0,6.0,./.,,3,,,,,,C/C,71.0,20,20.0,80%,7.7086e-08,16.0,4.0,C/C,47.0,11,11.0,90.91%,...,C/C,68.0,18,18.0,83.33%,1.4655e-07,15.0,3.0,C/C,81.0,17,17.0,94.12%,7.7134e-09,16.0,1.0,C/C,60.0,15,15.0,86.67%,8.7675e-07,13.0,2.0,T/C,79.0,23,23.0,78.26%,1.1937e-08,18.0,5.0,T/C,49.0,17,17.0,70.59%,1.1e-05,12.0,5.0,T/C,60.0,18,18.0,77.78%,8.0605e-07,14.0,4.0,jcf7190000000000-77657,0.191005
5,jcf7190000000000,77738,T,C,0.804214,-10.0,SNP,PASS,13,2,49,28,8,T/C,103.0,32,32.0,75%,4.1964e-11,24.0,8.0,C/C,126.0,27,27.0,92.59%,2.0853e-13,25.0,2.0,./.,,3,,,,,,C/C,76.0,18,18.0,88.89%,2.0936e-08,16.0,2.0,T/C,29.0,9,9.0,77.78%,...,C/C,71.0,18,18.0,88.24%,7.3277e-08,15.0,3.0,C/C,52.0,10,10.0,100%,5.4125e-06,10.0,0.0,./.,,7,,,,,,C/C,105.0,25,25.0,88%,2.5916e-11,22.0,3.0,T/C,18.0,9,9.0,,0.014706,5.0,4.0,C/C,55.0,14,14.0,85.71%,2.9913e-06,12.0,2.0,jcf7190000000000-77738,0.195786
6,jcf7190000000000,77764,T,C,0.933934,-10.0,SNP,PASS,12,1,28,50,8,C/C,127.0,29,29.0,89.66%,1.6496e-13,26.0,3.0,C/C,117.0,21,21.0,100%,1.8578e-12,21.0,0.0,./.,,4,,,,,,C/C,76.0,14,14.0,100%,2.4927e-08,14.0,0.0,C/C,58.0,11,11.0,100%,...,C/C,71.0,17,17.0,88.24%,7.3277e-08,15.0,2.0,C/C,52.0,10,10.0,100%,5.4125e-06,10.0,0.0,C/C,36.0,9,9.0,88.89%,0.00020568,8.0,1.0,C/C,86.0,18,18.0,94.44%,2.0936e-09,17.0,1.0,C/C,52.0,10,10.0,100%,5e-06,10.0,0.0,C/C,52.0,10,10.0,100%,5.4125e-06,10.0,0.0,jcf7190000000000-77764,0.066066
7,jcf7190000000000,77784,A,G,0.904187,-10.0,SNP,PASS,11,0,32,38,17,G/G,107.0,27,27.0,85.19%,1.6161e-11,23.0,4.0,G/G,111.0,20,20.0,100%,7.2544e-12,20.0,0.0,./.,,3,,,,,,G/G,64.0,12,12.0,100%,3.698e-07,12.0,0.0,G/G,46.0,9,9.0,100%,...,G/G,49.0,13,13.0,84.62%,1.0096e-05,11.0,2.0,G/G,41.0,8,8.0,100%,7.77e-05,8.0,0.0,G/G,46.0,9,9.0,100%,2.0568e-05,9.0,0.0,G/G,34.0,10,10.0,80%,0.00035723,8.0,2.0,G/G,46.0,9,9.0,100%,2.1e-05,9.0,0.0,A/G,37.0,12,12.0,75%,0.00016826,9.0,3.0,jcf7190000000000-77784,0.095813


In [29]:
# save interior SNP data
# the write is submitted through the load-balanced view instead of being run in the
# notebook process; the resulting async handle is kept in write_jobs
intfile = op.join(newdirs['FDI'], 'DF_pooled-varscan_all_bedfiles_SNP_FDI_maf_RD-recalculated.txt')
write_jobs = [lview.apply_async(save_file, *(interior_recalc, intfile))]

In [48]:
# recalc RD for paralogs and repeats
# fdi_recalc uses the same keys (source file paths) as fdi_remaining; recalc_rd returns the
# frame it was given, so each value is the same object as in fdi_remaining, modified in place
fdi_recalc = {}
for key,df in fdi_remaining.items():
    print(op.basename(key), df.shape)
    fdi_recalc[key] = recalc_rd(df)
    print('\t', fdi_recalc[key].shape)

100%|██████████| 42/42 [00:00<00:00, 3787.51it/s]
100%|██████████| 42/42 [00:00<00:00, 1378.00it/s]

DF_pooled-varscan_all_bedfiles_PARALOGS_FDI.txt (988, 351)
	 (988, 351)
DF_pooled-varscan_all_bedfiles_REPEATS_FDI.txt (89609, 351)
	 (89609, 351)





In [51]:
newdirs

{'FDI': '/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/03_maf-p05_RD-recalculated_FDI',
 'FDC': '/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/03_maf-p05_RD-recalculated_FDC'}

In [54]:
# save paralog and repeats
# output name = source basename with "_maf_RD-recalculated" inserted before ".txt",
# written into the FDI output directory; save_file is called directly here, so these
# writes run in the notebook process
for key,df in fdi_recalc.items():
    dst = op.join(newdirs['FDI'], op.basename(key).replace(".txt", "_maf_RD-recalculated.txt"))
    print(op.basename(dst))
    save_file(df, dst)

DF_pooled-varscan_all_bedfiles_PARALOGS_FDI_maf_RD-recalculated.txt
DF_pooled-varscan_all_bedfiles_REPEATS_FDI_maf_RD-recalculated.txt


#### now recalc coastal files

In [34]:
# before recalculating, compare to below
coastal.head()

Unnamed: 0,CHROM,POS,REF,ALT,AF,QUAL,TYPE,FILTER,ADP,WT,HET,HOM,NC,DF_p1.GT,DF_p1.GQ,DF_p1.SDP,DF_p1.DP,DF_p1.FREQ,DF_p1.PVAL,DF_p1.AD,DF_p1.RD,DF_p2.GT,DF_p2.GQ,DF_p2.SDP,DF_p2.DP,DF_p2.FREQ,DF_p2.PVAL,DF_p2.AD,DF_p2.RD,DF_p23.GT,DF_p23.GQ,DF_p23.SDP,DF_p23.DP,DF_p23.FREQ,DF_p23.PVAL,DF_p23.AD,DF_p23.RD,DF_p24.GT,DF_p24.GQ,DF_p24.SDP,DF_p24.DP,DF_p24.FREQ,DF_p24.PVAL,DF_p24.AD,DF_p24.RD,DF_p25.GT,DF_p25.GQ,DF_p25.SDP,DF_p25.DP,DF_p25.FREQ,...,DF_p93.GT,DF_p93.GQ,DF_p93.SDP,DF_p93.DP,DF_p93.FREQ,DF_p93.PVAL,DF_p93.AD,DF_p93.RD,DF_p95.GT,DF_p95.GQ,DF_p95.SDP,DF_p95.DP,DF_p95.FREQ,DF_p95.PVAL,DF_p95.AD,DF_p95.RD,DF_p96.GT,DF_p96.GQ,DF_p96.SDP,DF_p96.DP,DF_p96.FREQ,DF_p96.PVAL,DF_p96.AD,DF_p96.RD,DF_p97.GT,DF_p97.GQ,DF_p97.SDP,DF_p97.DP,DF_p97.FREQ,DF_p97.PVAL,DF_p97.AD,DF_p97.RD,DF_p98.GT,DF_p98.GQ,DF_p98.SDP,DF_p98.DP,DF_p98.FREQ,DF_p98.PVAL,DF_p98.AD,DF_p98.RD,DF_p17.GT,DF_p17.GQ,DF_p17.SDP,DF_p17.DP,DF_p17.FREQ,DF_p17.PVAL,DF_p17.AD,DF_p17.RD,locus,MAF
0,jcf7190000000000,77738,T,C,0.668265,-10.0,SNP,PASS,13,2,49,28,8,T/C,49.0,17,17.0,70.59%,1.1e-05,12.0,5.0,./.,,6,,,,,,T/C,18.0,8,8.0,,0.012821,5.0,3.0,T/C,32.0,11,11.0,72.73%,0.000516,8.0,3.0,T/C,29.0,15,15.0,53.33%,...,T/C,46.0,20,20.0,60%,2.2548e-05,12.0,8.0,T/C,40.0,15,15.0,66.67%,9.995e-05,10.0,5.0,T/C,50.0,22,22.0,59.09%,9.5813e-06,13.0,9.0,T/C,60.0,18,18.0,77.78%,8.0605e-07,14.0,4.0,./.,,6,,,,,,T/C,32.0,11,11.0,72.73%,0.000516,8.0,3.0,jcf7190000000000-77738,0.331735
1,jcf7190000000000,77764,T,C,0.753734,-10.0,SNP,PASS,12,1,28,50,8,T/C,50.0,16,16.0,75%,8e-06,12.0,4.0,./.,,4,,,,,,./.,,6,,,,,,C/C,49.0,13,13.0,84.62%,1.0096e-05,11.0,2.0,T/C,34.0,15,15.0,60%,...,T/C,53.0,19,19.0,68.42%,5.0106e-06,13.0,6.0,T/C,46.0,15,15.0,73.33%,2.4988e-05,11.0,4.0,T/C,75.0,25,25.0,72%,2.6626e-08,18.0,7.0,C/C,65.0,16,16.0,87.5%,2.5454e-07,14.0,2.0,./.,,4,,,,,,C/C,44.0,12,12.0,83.33%,3.4e-05,10.0,2.0,jcf7190000000000-77764,0.246266
3,jcf7190000000004,54418,T,C,0.36718,-10.0,SNP,PASS,36,1,86,0,0,T/C,39.0,47,47.0,25.53%,0.000109,12.0,35.0,T/C,27.0,20,20.0,40%,0.001638,8.0,12.0,T/C,26.0,31,31.0,25.81%,0.002333,8.0,23.0,T/C,32.0,43,43.0,23.26%,0.00054148,10.0,33.0,T/C,47.0,42,42.0,33.33%,...,T/C,72.0,45,45.0,44.44%,6.2178e-08,20.0,25.0,T/C,83.0,50,50.0,46%,4.3451e-09,23.0,27.0,T/C,29.0,36,36.0,25%,0.0011061,9.0,27.0,T/C,68.0,43,43.0,45.24%,1.3525e-07,19.0,23.0,T/C,76.0,45,45.0,46.67%,2.2206e-08,21.0,24.0,T/T,9.0,14,14.0,,0.11111,3.0,11.0,jcf7190000000004-54418,0.36718
5,jcf7190000000004,54508,A,G,0.080056,-10.0,SNP,PASS,53,67,20,0,0,A/A,89.0,65,65.0,4.62%,0.12209,3.0,62.0,A/A,40.0,32,32.0,6.25%,0.24603,2.0,30.0,A/G,22.0,44,44.0,15.91%,0.006036,7.0,37.0,A/G,21.0,65,65.0,10.77%,0.0065904,7.0,58.0,A/G,15.0,67,67.0,,...,A/A,85.0,67,67.0,5.97%,0.059691,4.0,63.0,A/G,15.0,82,82.0,,0.029351,5.0,77.0,A/A,126.0,68,68.0,0%,1.0,0.0,68.0,A/G,22.0,52,52.0,13.46%,0.0062977,7.0,45.0,A/G,35.0,59,59.0,18.64%,0.0002925,11.0,48.0,A/A,28.0,28,28.0,10.71%,0.11818,3.0,25.0,jcf7190000000004-54508,0.080056
6,jcf7190000000004,54540,A,G,0.333741,-10.0,SNP,PASS,56,1,58,28,0,A/G,56.0,67,67.0,25.37%,2e-06,17.0,50.0,A/G,37.0,36,36.0,30.56%,0.000199,11.0,25.0,A/G,18.0,47,47.0,,0.013188,6.0,41.0,A/G,83.0,67,67.0,35.82%,4.8109e-09,24.0,43.0,A/G,86.0,70,70.0,35.71%,...,A/G,67.0,67,67.0,29.85%,1.7998e-07,20.0,47.0,A/G,58.0,82,82.0,21.95%,1.3391e-06,18.0,64.0,A/G,45.0,69,69.0,20.29%,2.9344e-05,14.0,55.0,A/G,53.0,58,58.0,27.59%,4.6015e-06,16.0,42.0,A/G,88.0,62,62.0,40.32%,1.4168e-09,25.0,37.0,A/G,15.0,26,26.0,,0.02531,5.0,21.0,jcf7190000000004-54540,0.333741


In [36]:
# recalculate coastal SNP data
# NOTE: recalc_rd assigns into the frame it is given and returns that same frame, so after
# this cell `coastal_recalc` and `coastal` refer to the same, already-modified object
print(coastal.shape)
coastal_recalc = recalc_rd(coastal)
print(coastal_recalc.shape)
coastal_recalc.head()

 20%|██        | 9/45 [00:00<00:00, 82.09it/s]

(2350673, 375)


100%|██████████| 45/45 [00:00<00:00, 80.61it/s]


(2350673, 375)


Unnamed: 0,CHROM,POS,REF,ALT,AF,QUAL,TYPE,FILTER,ADP,WT,HET,HOM,NC,DF_p1.GT,DF_p1.GQ,DF_p1.SDP,DF_p1.DP,DF_p1.FREQ,DF_p1.PVAL,DF_p1.AD,DF_p1.RD,DF_p2.GT,DF_p2.GQ,DF_p2.SDP,DF_p2.DP,DF_p2.FREQ,DF_p2.PVAL,DF_p2.AD,DF_p2.RD,DF_p23.GT,DF_p23.GQ,DF_p23.SDP,DF_p23.DP,DF_p23.FREQ,DF_p23.PVAL,DF_p23.AD,DF_p23.RD,DF_p24.GT,DF_p24.GQ,DF_p24.SDP,DF_p24.DP,DF_p24.FREQ,DF_p24.PVAL,DF_p24.AD,DF_p24.RD,DF_p25.GT,DF_p25.GQ,DF_p25.SDP,DF_p25.DP,DF_p25.FREQ,...,DF_p93.GT,DF_p93.GQ,DF_p93.SDP,DF_p93.DP,DF_p93.FREQ,DF_p93.PVAL,DF_p93.AD,DF_p93.RD,DF_p95.GT,DF_p95.GQ,DF_p95.SDP,DF_p95.DP,DF_p95.FREQ,DF_p95.PVAL,DF_p95.AD,DF_p95.RD,DF_p96.GT,DF_p96.GQ,DF_p96.SDP,DF_p96.DP,DF_p96.FREQ,DF_p96.PVAL,DF_p96.AD,DF_p96.RD,DF_p97.GT,DF_p97.GQ,DF_p97.SDP,DF_p97.DP,DF_p97.FREQ,DF_p97.PVAL,DF_p97.AD,DF_p97.RD,DF_p98.GT,DF_p98.GQ,DF_p98.SDP,DF_p98.DP,DF_p98.FREQ,DF_p98.PVAL,DF_p98.AD,DF_p98.RD,DF_p17.GT,DF_p17.GQ,DF_p17.SDP,DF_p17.DP,DF_p17.FREQ,DF_p17.PVAL,DF_p17.AD,DF_p17.RD,locus,MAF
0,jcf7190000000000,77738,T,C,0.668265,-10.0,SNP,PASS,13,2,49,28,8,T/C,49.0,17,17.0,70.59%,1.1e-05,12.0,5.0,./.,,6,,,,,,T/C,18.0,8,8.0,,0.012821,5.0,3.0,T/C,32.0,11,11.0,72.73%,0.000516,8.0,3.0,T/C,29.0,15,15.0,53.33%,...,T/C,46.0,20,20.0,60%,2.2548e-05,12.0,8.0,T/C,40.0,15,15.0,66.67%,9.995e-05,10.0,5.0,T/C,50.0,22,22.0,59.09%,9.5813e-06,13.0,9.0,T/C,60.0,18,18.0,77.78%,8.0605e-07,14.0,4.0,./.,,6,,,,,,T/C,32.0,11,11.0,72.73%,0.000516,8.0,3.0,jcf7190000000000-77738,0.331735
1,jcf7190000000000,77764,T,C,0.753734,-10.0,SNP,PASS,12,1,28,50,8,T/C,50.0,16,16.0,75%,8e-06,12.0,4.0,./.,,4,,,,,,./.,,6,,,,,,C/C,49.0,13,13.0,84.62%,1.0096e-05,11.0,2.0,T/C,34.0,15,15.0,60%,...,T/C,53.0,19,19.0,68.42%,5.0106e-06,13.0,6.0,T/C,46.0,15,15.0,73.33%,2.4988e-05,11.0,4.0,T/C,75.0,25,25.0,72%,2.6626e-08,18.0,7.0,C/C,65.0,16,16.0,87.5%,2.5454e-07,14.0,2.0,./.,,4,,,,,,C/C,44.0,12,12.0,83.33%,3.4e-05,10.0,2.0,jcf7190000000000-77764,0.246266
3,jcf7190000000004,54418,T,C,0.36718,-10.0,SNP,PASS,36,1,86,0,0,T/C,39.0,47,47.0,25.53%,0.000109,12.0,35.0,T/C,27.0,20,20.0,40%,0.001638,8.0,12.0,T/C,26.0,31,31.0,25.81%,0.002333,8.0,23.0,T/C,32.0,43,43.0,23.26%,0.00054148,10.0,33.0,T/C,47.0,42,42.0,33.33%,...,T/C,72.0,45,45.0,44.44%,6.2178e-08,20.0,25.0,T/C,83.0,50,50.0,46%,4.3451e-09,23.0,27.0,T/C,29.0,36,36.0,25%,0.0011061,9.0,27.0,T/C,68.0,43,43.0,45.24%,1.3525e-07,19.0,24.0,T/C,76.0,45,45.0,46.67%,2.2206e-08,21.0,24.0,T/T,9.0,14,14.0,,0.11111,3.0,11.0,jcf7190000000004-54418,0.36718
5,jcf7190000000004,54508,A,G,0.080056,-10.0,SNP,PASS,53,67,20,0,0,A/A,89.0,65,65.0,4.62%,0.12209,3.0,62.0,A/A,40.0,32,32.0,6.25%,0.24603,2.0,30.0,A/G,22.0,44,44.0,15.91%,0.006036,7.0,37.0,A/G,21.0,65,65.0,10.77%,0.0065904,7.0,58.0,A/G,15.0,67,67.0,,...,A/A,85.0,67,67.0,5.97%,0.059691,4.0,63.0,A/G,15.0,82,82.0,,0.029351,5.0,77.0,A/A,126.0,68,68.0,0%,1.0,0.0,68.0,A/G,22.0,52,52.0,13.46%,0.0062977,7.0,45.0,A/G,35.0,59,59.0,18.64%,0.0002925,11.0,48.0,A/A,28.0,28,28.0,10.71%,0.11818,3.0,25.0,jcf7190000000004-54508,0.080056
6,jcf7190000000004,54540,A,G,0.333741,-10.0,SNP,PASS,56,1,58,28,0,A/G,56.0,67,67.0,25.37%,2e-06,17.0,50.0,A/G,37.0,36,36.0,30.56%,0.000199,11.0,25.0,A/G,18.0,47,47.0,,0.013188,6.0,41.0,A/G,83.0,67,67.0,35.82%,4.8109e-09,24.0,43.0,A/G,86.0,70,70.0,35.71%,...,A/G,67.0,67,67.0,29.85%,1.7998e-07,20.0,47.0,A/G,58.0,82,82.0,21.95%,1.3391e-06,18.0,64.0,A/G,45.0,69,69.0,20.29%,2.9344e-05,14.0,55.0,A/G,53.0,58,58.0,27.59%,4.6015e-06,16.0,42.0,A/G,88.0,62,62.0,40.32%,1.4168e-09,25.0,37.0,A/G,15.0,26,26.0,,0.02531,5.0,21.0,jcf7190000000004-54540,0.333741


In [38]:
# save coastal
# the write is submitted through the load-balanced view and its async handle is appended
# to write_jobs, which must already exist from the cell that saved the interior SNP data
coastfile = op.join(newdirs['FDC'], 'DF_pooled-varscan_all_bedfiles_SNP_FDC_maf_RD-recalculated.txt')
write_jobs.append(lview.apply_async(save_file, *(coastal_recalc, coastfile)))

In [60]:
# recalc RD for paralogs and repeats
# fdc_recalc uses the same keys (source file paths) as fdc_remaining; recalc_rd returns the
# frame it was given, so each value is the same object as in fdc_remaining, modified in place
fdc_recalc = {}
for key,df in fdc_remaining.items():
    print(op.basename(key), df.shape)
    fdc_recalc[key] = recalc_rd(df)
    print('\t', fdc_recalc[key].shape)

100%|██████████| 45/45 [00:00<00:00, 3780.47it/s]
100%|██████████| 45/45 [00:00<00:00, 1586.17it/s]

DF_pooled-varscan_all_bedfiles_PARALOGS_FDC.txt (1110, 375)
	 (1110, 375)
DF_pooled-varscan_all_bedfiles_REPEATS_FDC.txt (79955, 375)
	 (79955, 375)





In [63]:
# save paralog and repeats
# output name = source basename with "_maf_RD-recalculated" inserted before ".txt",
# written into the FDC output directory; save_file is called directly here, so these
# writes run in the notebook process
for key,df in fdc_recalc.items():
    dst = op.join(newdirs['FDC'], op.basename(key).replace(".txt", "_maf_RD-recalculated.txt"))
    print(op.basename(dst))
    save_file(df, dst)

DF_pooled-varscan_all_bedfiles_PARALOGS_FDC_maf_RD-recalculated.txt
DF_pooled-varscan_all_bedfiles_REPEATS_FDC_maf_RD-recalculated.txt


In [62]:
dst

'/data/projects/pool_seq/DF_datasets/DF_pooled_GEA/DF_pooled/snpsANDindels/03_maf-p05_RD-recalculated_FDC/DF_pooled-varscan_all_bedfiles_REPEATS_FDC_maf_RD-recalculated.txt'