In [1]:
import numpy as np

import pandas as pd

import csv

import os
import subprocess

import warnings
warnings.filterwarnings("ignore")

# VCF functions

In [2]:
def get_files_in_directory(directory: str):
    """Return the sorted paths of the regular files directly inside `directory`.

    Sub-directories and special files are skipped. Paths are built with
    `os.path.join`, so `directory` may be given with or without a trailing
    path separator.

    Parameters
    ----------
    directory : str
        Directory to list (not searched recursively).

    Returns
    -------
    list of str
        `directory` joined with each file name, sorted alphabetically.
    """
    return sorted(
        os.path.join(directory, filename)
        for filename in os.listdir(directory)
        # keep regular files only (no directories or special files)
        if os.path.isfile(os.path.join(directory, filename))
    )

def get_vcf(vcf_file):
    """Read a tab-delimited VCF file and separate header lines from variant lines.

    Parameters
    ----------
    vcf_file : str
        Path to the VCF file.

    Returns
    -------
    vcf_lines : list of list of str
        One list of tab-separated fields per data (non-'#') line.
    header : list of str
        One string per header line (lines whose first field starts with '#'),
        with all tab-separated fields of the line preserved.
    """
    header = []
    vcf_lines = []

    # errors='ignore' drops undecodable bytes instead of raising
    with open(vcf_file, newline='', errors='ignore') as variants:
        for fields in csv.reader(variants, delimiter='\t'):

            # blank lines (or lines holding only tabs) carry no record
            if not any(fields):
                continue

            if fields[0].startswith("#"):
                # re-join so the '#CHROM ...' column line is kept whole
                header.append('\t'.join(fields))
            else:
                vcf_lines.append(fields)

    return vcf_lines, header


def build_ref_df(ref_vcf):
    """Build a DataFrame of 'reference'/parental SNPs from a VCF file.

    Each data line of the VCF becomes one row holding the chromosome, the
    position (as int), the reference allele and the first listed alternate
    allele.

    Returns a DataFrame with columns
    ["chromosome", "position", "reference", "variant"].
    """
    records, _header = get_vcf(ref_vcf)  # header lines are not needed here

    rows = [
        [fields[0], int(fields[1]), fields[3], fields[4].split(',')[0]]
        for fields in records
    ]

    ref_df = pd.DataFrame(rows, columns=["chromosome", "position", "reference", "variant"])

    return ref_df.reset_index(drop=True)


def build_GATK_snp_df(snp_vcf):
    """Build a per-site DataFrame from a single-sample VCF with GATK genotype calls.

    The file name must follow '<project>-<sample_type>-<sample_number>.<ext>',
    e.g. 'WT-G0-001.vcf' (WT = wild-type, G0 = the original NDJ progeny or F1
    for the progeny of a male, 001 = ID number). Adjust the parsing below if
    your naming convention differs, and keep it consistent with later functions.

    Sites are skipped when the sample column has no GT field, when the genotype
    is a no-call ('./.' or '.|.'), or when no integer DP value is present.

    Parameters
    ----------
    snp_vcf : str
        Path to a VCF whose data lines have exactly 10 tab-separated columns
        (the 8 fixed columns, FORMAT, and one sample column).

    Returns
    -------
    pandas.DataFrame
        Columns: chromosome, position, reference, variant, QUAL, ref_reads,
        variant_reads, DP, sample_type, sample_num.

    Raises
    ------
    IndexError
        If the file name has fewer than three '-'-separated parts.
    ValueError
        If a data line does not have exactly 10 columns.
    """
    snp_lines, _snp_header = get_vcf(snp_vcf)

    name_fields = snp_vcf.split("/")[-1].split(".")[0].split("-")
    sample_type = name_fields[1]  # germ cell of origin / generation
    sample = name_fields[2]       # sample number

    def _read_count(text):
        # a missing count is written as '.'; treat anything non-numeric as zero reads
        return int(text) if text.isdigit() else 0

    snp_data = []

    for fields in snp_lines:

        chrom, position, _id, ref, alt, qual_text, _filter, _info, annotations, annot_data = fields

        # FORMAT keys zipped with the sample's values, e.g. {'GT': '0/1', 'AD': '3,5', 'DP': '8'}
        annotation_dict = dict(zip(annotations.split(':'), annot_data.split(':')))

        GT = annotation_dict.get('GT')
        depth_text = annotation_dict.get('DP', '')

        # sites with no genotype call or no usable read depth carry no information
        if GT is None or GT in ('./.', '.|.') or not depth_text.isdigit():
            continue

        DP = int(depth_text)  # total read number used to call variants

        # QUAL is only meaningful when an actual alternate allele was reported
        variant_called = alt not in ('.', '<NON_REF>')

        if alt == '<NON_REF>':  # replace with the more conventional '.' for "no variant allele"
            alt = '.'
        alt = alt.split(",")[0]  # keep the first listed alternate allele

        if 'AD' in annotation_dict:
            reads = annotation_dict['AD'].split(',')
            ref_reads = _read_count(reads[0])  # reference depth is always listed first
            var_reads = _read_count(reads[1]) if len(reads) > 1 else 0
        else:
            # without an allelic-depth field all reads are attributed to the reference
            ref_reads, var_reads = DP, 0

        qual = 0  # stays zero unless a variant is called
        if variant_called and qual_text != '.':
            qual = float(qual_text)

        snp_data.append([chrom, int(position), ref, alt, qual, ref_reads, var_reads, DP, sample_type, sample])

    snp_df = pd.DataFrame(snp_data, columns=["chromosome", "position", "reference", "variant", 'QUAL',
                                             'ref_reads', 'variant_reads', 'DP', 'sample_type', 'sample_num'])

    return snp_df.reset_index(drop=True)

# Parental and Sample VCF dataframe merging

In [3]:
def align_vcf_dfs(vcf0, vcf1):
    """Align two VCF dataframes so both contain a row for every position.

    Both frames are indexed by "position" and outer-aligned; rows and columns
    missing from one frame are filled with NaN.

    NOTE(review): the alignment key is "position" only, not
    [chromosome, position], so identical coordinates on different chromosomes
    are treated as the same site - confirm whether this is intended.

    Parameters
    ----------
    vcf0, vcf1 : pandas.DataFrame
        Frames that each have a "position" and a "chromosome" column.

    Returns
    -------
    list of pandas.DataFrame
        [aligned vcf0, aligned vcf1], each with "position" restored as a column.
    """
    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0
    (vcf0, vcf1) = vcf0.set_index("position").align(
        vcf1.set_index("position"), fill_value=np.nan
    )
    vcfs = []
    for i in ((vcf0, vcf1), (vcf1, vcf0)):
        vcf = i[0].reset_index()
        hom_vcf = i[1].reset_index()
        # NOTE(review): DataFrame.fillna matches a Series' labels against column
        # names, so this presumably does not copy the other frame's chromosome
        # values into the gaps - verify.
        vcf = vcf.fillna(hom_vcf["chromosome"])
        vcf = vcf[~vcf.index.duplicated(keep="first")]
        # take the position from the other frame wherever it is missing here
        mask = vcf["position"].isnull()
        vcf.loc[mask, "position"] = hom_vcf["position"]
        vcfs.append(vcf)
    return vcfs

def reassign_vcf_reference(vcf0, vcf1):
    """Re-express `vcf0` so that `vcf1` acts as the reference (work in progress).

    According to the inline comments, the result should describe `vcf0` with
    `vcf1` as the reference sequence and `vcf0` as the alternate. The first
    aligned frame (`vcfs[0]`) is modified and returned.

    NOTE(review): several statements below do not look runnable as written
    (see the notes next to each). The comments talk about REF/ALT alleles while
    the code reads the 'ref_reads'/'variant_reads' columns, so the intended
    semantics need to be confirmed before this is fixed.
    """
    # this needs to be done for the two parental references and the samples
    # all dataframes have to include "non-variant sites" (relative to dm6) to get the info for those
    # since they could be variants in w1118

    # make a df of every position in the dataframes
    # NOTE(review): DataFrame.join matches the `on` columns against the *index* of
    # vcf1 and needs lsuffix/rsuffix for overlapping column names; a merge on
    # ['chromosome', 'position'] was presumably intended - confirm.
    all_positions = vcf0.join(vcf1, on=['chromosome', 'position'], how='outer')
    
    vcfs = align_vcf_dfs(vcf0, vcf1)
    # this will return a df where vcf1 is the reference sequence and vcf0 is the alternate
    # copy other vcf's REF column, some rows should be NaN because vcf1 matches dm6 there
    # so for those we will fill with the original REF values
    # NOTE(review): the first argument of np.where is a row-filtered DataFrame rather
    # than a boolean mask, and `.loc['ref_reads']` looks up a row label, not a column.
    # The result is stored as 'REF_x' but later code reads 'ref_reads_x'.
    vcfs[0]['REF_x'] = np.where(
            vcfs[1].loc[vcfs[1]['ref_reads'].isna()],
            vcfs[0].loc['ref_reads'],
            vcfs[1]['ref_reads']
            )

    # if ALT is empty, it should match the original REF at that site
    # NOTE(review): same pattern as above; stored as 'ALT_x' but later code reads 'alt_reads_x'.
    vcfs[0]['ALT_x'] = np.where(
            vcfs[0].loc[vcfs[0]['variant_reads'].isna()],
            vcfs[1].loc['ref_reads'],
            vcfs[0]['variant_reads']
            )

    # NOTE(review): the result of drop() is discarded, and no 'alt_reads' column
    # is created anywhere before this line.
    vcfs[0].drop(['alt_reads', 'ref_reads'], axis=1)
    
    # if ALT matches the REF, use . convention
    # NOTE(review): `'alt_reads_x' == ...` compares the literal string with the
    # column values; vcfs[0]['alt_reads_x'] was presumably intended.
    vcfs[0]['alt_reads'] = np.where(
            vcfs[0].loc['alt_reads_x' == vcfs[0]['ref_reads_x']],
            '.',
            vcfs[0]['alt_reads_x']
            )

    vcfs[0]['ref_reads'] = vcfs[0].loc[:, 'ref_reads_x']
    vcfs[0] = vcfs[0].drop(['alt_reads_x', 'ref_reads_x'], axis=1)

    # get the QUAL and DP information for the newly added ALT positions from the original dataframe
    vcfs[0] = vcfs[0].join(all_positions, on=['chromosome', 'position'], how='left', rsuffix='_right')
    # NOTE(review): drop() and rename() are called without axis=1 / columns=, so they
    # target row labels instead of the columns named here.
    vcfs[0] = vcfs[0].drop(['ref_reads_right', 'variant_reads_right', 'QUAL', 'DP'])
    vcfs[0] = vcfs[0].rename({'QUAL_right': 'QUAL', 'DP_right': 'DP'})

    return vcfs[0]

    
def merge_sample_ref_dfs(dm6_sample_vcf_df, ref_vcf_df):
    """Merge one sample's SNP dataframe onto the reference marker set.

    The sample frame is first passed through `reassign_vcf_reference` so its
    reference matches `ref_vcf_df`. Every marker in `ref_vcf_df` is kept;
    sample sites that are not markers are dropped. Markers without sample data
    get zero read counts, QUAL 0 and the empty variant allele '.'.

    Parameters
    ----------
    dm6_sample_vcf_df : pandas.DataFrame
        Sample frame as produced by `build_GATK_snp_df`.
    ref_vcf_df : pandas.DataFrame
        Reference marker frame with chromosome, position, reference, variant.

    Returns
    -------
    pandas.DataFrame
        One row per reference marker; 'reference_x'/'variant_x' come from the
        reference set and 'variant_y' from the sample.

    Raises
    ------
    IndexError
        If no row carries a sample_type or a string sample_num.
    """
    # change the sample vcf so the reference matches ref_vcf_df
    sample_vcf_df = reassign_vcf_reference(dm6_sample_vcf_df, ref_vcf_df)

    # outer merge on [chromosome, position] so every reference marker is retained
    df_merge = ref_vcf_df.merge(sample_vcf_df, on=['chromosome', 'position'], how='outer')

    # keep only rows that belong to the reference marker set; copy so the
    # column assignments below operate on an independent frame
    df_merge = df_merge[df_merge['reference_x'].notna()].copy()

    # the sample's reference column is redundant after the merge
    df_merge = df_merge.drop(columns=['reference_y'])

    # markers not covered by the sample get zero counts and an empty variant allele
    count_columns = ['QUAL', "ref_reads", "variant_reads", "DP"]
    df_merge[count_columns] = df_merge[count_columns].fillna(0)
    df_merge['variant_y'] = df_merge['variant_y'].fillna('.')

    # refill sample identifiers that became NaN on marker-only rows; any
    # sample_type label is accepted, not only 'G0'/'F1'
    sample_type = df_merge['sample_type'].dropna().iloc[0]
    sample_num = [x for x in df_merge.sample_num.unique() if isinstance(x, str)][0]

    df_merge['sample_type'] = sample_type
    df_merge['sample_num'] = sample_num

    # if the sample variant is not the target marker variant, blank the variant
    # allele and zero the variant reads (one mask keeps both columns in sync)
    mismatch = df_merge['variant_x'] != df_merge['variant_y']
    df_merge.loc[mismatch, 'variant_y'] = '.'
    df_merge.loc[mismatch, 'variant_reads'] = 0.0

    return df_merge


def build_master_dataframe_list(sample_vcf_files: list, alt_parent_file: str, ref_parent_file: str, minimum_dp=50, minimum_var_reads=10, minimum_var_qual=200):
    """Build the master dataframe of all samples merged onto the parental marker set.

    Steps: read both parental VCFs, keep the sites exclusive to each parent,
    re-express them relative to the reference parent, filter the markers, then
    merge every sample VCF onto the filtered marker set and concatenate.

    Parameters
    ----------
    sample_vcf_files : list of str
        Paths of the per-sample VCF files.
    alt_parent_file, ref_parent_file : str
        Paths of the alternate and reference parental VCF files.
    minimum_dp : int
        Minimum total read depth ('DP') required for a reference marker.
    minimum_var_reads : int
        Minimum number of variant reads required for a reference marker.
    minimum_var_qual : float
        Minimum QUAL score required for a reference marker.

    Returns
    -------
    pandas.DataFrame
        All samples concatenated, with integer 'ref_reads'/'variant_reads'.

    NOTE(review): the marker filters read the 'genotype', 'DP', 'variant_reads'
    and 'QUAL' columns, which `build_ref_df` does not create; they are
    presumably expected from `reassign_vcf_reference` - confirm.
    """
    print('building dataframe from reference vcfs...')
    ref_parent_vcf_df = build_ref_df(ref_parent_file)
    alt_parent_vcf_df = build_ref_df(alt_parent_file)

    # anti-join of the parental dataframes to find positions exclusive to each
    print('merging reference vcfs into a single reference...')
    outer_df = alt_parent_vcf_df.merge(ref_parent_vcf_df, how='outer', indicator=True)
    alt_parent_vcf_df_unique = outer_df[(outer_df._merge=='left_only')].drop('_merge', axis=1)
    ref_parent_vcf_df_unique = outer_df[(outer_df._merge=='right_only')].drop('_merge', axis=1)

    # edit the vcf dfs so that the reference parent is the reference
    ref_df = reassign_vcf_reference(alt_parent_vcf_df_unique, ref_parent_vcf_df_unique)

    # drop heterozygous sites (boolean mask; drop() expects labels, not a mask)
    ref_df = ref_df[ref_df['genotype'] != '0/1']

    # each filter is written as ~(col < threshold) so rows with NaN are kept
    print("filtering reference sites: requiring minimum of", minimum_dp, "total reads per site...")
    ref_df_filt1 = ref_df[~(ref_df['DP'] < minimum_dp)]

    print('filtering reference variant SNPs: require a minimum of', minimum_var_reads, 'variant reads...')
    ref_df_filt2 = ref_df_filt1[~(ref_df_filt1['variant_reads'] < minimum_var_reads)]

    print('filtering reference SNPs: require a minimum QUAL score of ', minimum_var_qual, '...')
    ref_df_filt3 = ref_df_filt2[~(ref_df_filt2['QUAL'] < minimum_var_qual)]

    sample_vcf_merges = []

    print('building dataframes from each sample vcf and merging with reference vcf...')

    for sample_file in sample_vcf_files:

        sample_vcf_merges.append(merge_sample_ref_dfs(build_GATK_snp_df(sample_file), ref_df_filt3))

        print(sample_file.split('/')[-1], 'complete...')

    print('building master dataframe of chromosomes...')
    master_recombinant_df = pd.concat(sample_vcf_merges, ignore_index=True)

    # The minimum_* thresholds are applied to the reference markers only; the
    # per-sample rows are deliberately left unfiltered here.

    master_recombinant_df = (
        master_recombinant_df
        .drop(columns=['variant_y'])
        .rename(columns={'reference_x': 'reference', 'variant_x': 'variant'})
        .astype({'ref_reads': int, 'variant_reads': int})
        .reset_index(drop=True)
    )

    print("done!")

    return master_recombinant_df

# TIGER functions

* Methods adapted from Rowan et al.: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4349092/
* original TIGER scripts found at: https://github.com/betharowan/TIGER_Scripts-for-distribution

In [4]:
def get_TIGER_inputs_and_mkdirs(new_tiger_directory: str):
    """Move every '*.tiger_input.<ext>' file from the working directory into its own sample folder.

    For a file named '<sample>.tiger_input.txt' in the current working
    directory, the folder '<new_tiger_directory>/<sample>/' is created and the
    file is moved into it. Directories that already exist are reused, so the
    function can safely be called more than once.

    Parameters
    ----------
    new_tiger_directory : str
        Project directory that will hold one sub-directory per sample.
    """
    os.makedirs(new_tiger_directory, exist_ok=True)

    cwd = os.getcwd()

    for filename in os.listdir(cwd):

        source_path = os.path.join(cwd, filename)
        if not os.path.isfile(source_path):
            continue

        # '<sample>.tiger_input.<ext>' -> second-to-last dot-part is 'tiger_input';
        # names without a dot have no second-to-last part and are skipped
        name_parts = filename.split('.')
        if len(name_parts) < 2 or name_parts[-2] != "tiger_input":
            continue

        sample_dir = os.path.join(new_tiger_directory, name_parts[0])
        os.makedirs(sample_dir, exist_ok=True)

        os.replace(source_path, os.path.join(sample_dir, filename))

def create_TIGER_inputs_and_directory(master_dataframe, new_project_dir: str):
    """Write one TIGER input file per sample and sort the files into a new project folder.

    For every (sample_type, sample_num) combination present in
    `master_dataframe`, '<sample_type>_<sample_num>.tiger_input.txt' is written
    to the working directory as a headerless tab-separated file with the
    columns tiger_chrom, position, reference, ref_reads, variant,
    variant_reads. Afterwards all input files are moved into
    '<new_project_dir>/<sample>/' by `get_TIGER_inputs_and_mkdirs`.

    Nothing is written if `new_project_dir` already exists.

    Parameters
    ----------
    master_dataframe : pandas.DataFrame
        Master dataframe with sample_type, sample_num, chromosome, position,
        reference, ref_reads, variant and variant_reads columns.
    new_project_dir : str
        Project directory to create (with or without a trailing slash).

    Raises
    ------
    KeyError
        If a chromosome name is missing from `tiger_chrom_name_dict`.
    """
    if os.path.exists(new_project_dir):
        print("No TIGER input or folders made. This project folder already exists...")
        return

    tiger_columns = ['tiger_chrom', 'position', 'reference', 'ref_reads', 'variant', 'variant_reads']

    # groupby visits only the sample combinations that actually occur
    for (sample_type, sample_num), sample_df in master_dataframe.groupby(['sample_type', 'sample_num'], sort=False):

        filename = str(sample_type) + "_" + str(sample_num)+".tiger_input.txt"

        # numeric chromosome labels; a direct dict lookup fails loudly on unknown names
        sample_df = sample_df.assign(
            tiger_chrom=[tiger_chrom_name_dict[chrom] for chrom in sample_df['chromosome']]
        )

        sample_df.to_csv(filename, sep="\t", header=False, index=False, columns=tiger_columns)

    # move the files only once, after every input has been written
    get_TIGER_inputs_and_mkdirs(new_tiger_directory=new_project_dir)

def run_TIGER_pipeline(master_dataframe, project_dir: str):
    """Create TIGER inputs if needed, then run TIGER on every unprocessed sample.

    If `project_dir` does not exist yet, the inputs, the project directory and
    the sample sub-directories are created from `master_dataframe`. Every
    sample sub-directory is then handled as follows:

    * input present, '<sample>.CO_estimates.txt' missing -> run 'run_TIGER.sh'
    * input present, output present -> skipped ("already processed")
    * input missing -> a warning is printed

    Entries of `project_dir` that are not directories (e.g. '.DS_Store') are
    ignored.

    Parameters
    ----------
    master_dataframe : pandas.DataFrame
        Master dataframe; only used when the project directory must be created.
    project_dir : str
        Project directory, relative to the working directory (with or without
        a trailing slash).
    """
    if not os.path.isdir(project_dir):  # project folder not made yet

        print('making TIGER inputs...')
        create_TIGER_inputs_and_directory(master_dataframe, new_project_dir=project_dir)

    for sample_dir in sorted(os.listdir(project_dir)):

        # the trailing '' keeps the closing path separator passed to the script
        sample_folder = os.path.join(os.getcwd(), project_dir, sample_dir, '')

        if not os.path.isdir(sample_folder):
            continue

        folder_contents = os.listdir(sample_folder)
        has_input = sample_dir+'.tiger_input.txt' in folder_contents
        has_output = sample_dir+'.CO_estimates.txt' in folder_contents

        if not has_input:  # warn if input not in folder for some reason
            print(sample_dir, 'needs TIGER input...')

        elif has_output:  # input and this TIGER output are both present, skip
            print(sample_dir, 'already processed...')

        else:  # input present but output missing, do TIGER
            result = subprocess.run(['sh', 'run_TIGER.sh', sample_folder, sample_dir])
            if result.returncode != 0:
                print(sample_dir, 'TIGER run failed with exit code', result.returncode)
                    

# Functions for processing TIGER outputs and creating new dataframes

## Processing TIGER outputs to create SNP/marker dataframes

In [5]:
def get_TIGER_files(project_dir: str, file_pattern: str):
    """List the expected TIGER output path for every sample folder of a project.

    For each entry of `project_dir` whose name does not start with '.', the
    path '<project_dir>/<entry>/<entry><file_pattern>' is returned. The paths
    are built from the names only; their existence is not checked.
    """
    return [
        project_dir+'/'+entry+"/"+entry+file_pattern
        for entry in os.listdir(project_dir)
        if entry[0] != '.'  # skip hidden entries
    ]


def create_TIGER_master_df(project_dir: str, file_pattern: str):
    """Build the per-marker dataframe from the TIGER outputs of every sample.

    The resulting frame is relatively large: it holds the genotype and HMM
    state calls for every SNP marker of every chromosome of every sample. It is
    the closest to the 'raw' TIGER output and mostly useful for plotting and
    manual inspection of SNPs and crossover calls.

    Parameters
    ----------
    project_dir : str
        TIGER project directory holding one folder per sample.
    file_pattern : str
        Suffix of the TIGER output to read. The column names below match
        '.CO_estimates.txt'; adjust them if you use a different output file.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all samples, with TIGER's genotype codes replaced by
        parental labels and a 'chrom_id' column ('<Sample>-<chromosome>').
    """
    files_list = get_TIGER_files(project_dir, file_pattern)

    # TIGER reports genotypes with its original Arabidopsis codes (C / L / U);
    # map them to this cross's parental labels. Replace as needed.
    # NOTE(review): these labels shadow the module-level ones, and the alternate
    # label differs from the module-level 'oregonr' - confirm which is intended.
    ref_parental_genotype = "w1118"
    alt_parental_genotype = "Oregon RM"
    genotype_dict = {'CC':ref_parental_genotype, "CL":"het", "LL":alt_parental_genotype, "CU":"u"+ref_parental_genotype, "LU":'u'+alt_parental_genotype, "UN":"unknown", '?':"?"}

    # only these columns hold genotype codes; alleles and sample names are left untouched
    genotype_columns = ["base_geno", "hmm_state1", "hmm_state2"]

    data = []

    for file in files_list:

        df = pd.read_csv(file, sep="\t", header=None, names = ["Sample", "chromosome", "position", "base_geno", "hmm_state1", "hmm_state2",
                                                   "reference", "ref_reads", "variant", "var_reads"])

        df[genotype_columns] = df[genotype_columns].replace(genotype_dict)

        # for our cross scheme, heterozygous and homozygous alternate calls
        # should both be considered the alternate parental genotype
        df['hmm_state1'] = df['hmm_state1'].replace({'het':alt_parental_genotype})

        # unique identifier for each chromosome: (sample)-(chromosome)
        df['chrom_id'] = df['Sample'] + "-" + df['chromosome'].astype(str)

        print(file, 'done')
        data.append(df)

    # one dataframe from all samples
    return pd.concat(data)

In [None]:
def read_master_df(master_df_file):
    """Load a previously saved master dataframe from a parquet file and return it."""
    return pd.read_parquet(master_df_file)

## Processing TIGER outputs to create intervals dataframe

In [6]:
def make_transition_intervals_df(master_df):
    """Insert a 'transition' interval between neighbouring intervals whose HMM state differs.

    These transition intervals are the potential crossover intervals that can
    be classified later as crossover or 'not crossover'. A transition runs
    from the stop of one interval to the start of the next, and its length is
    (next start - previous stop).

    Parameters
    ----------
    master_df : pandas.DataFrame
        State intervals whose columns are, in this order: sample, chromosome,
        start, stop, hmm_state, chrom_id, length. The column order matters
        because transition rows are built positionally.

    Returns
    -------
    pandas.DataFrame
        Per chrom_id, the original intervals plus the transitions, ordered by
        start. An empty copy of `master_df` is returned if it has no rows.
    """
    new_dfs = []

    for chrom_id in master_df.chrom_id.unique():

        df = master_df[master_df['chrom_id']==chrom_id]

        starts = list(df.start)
        stops = list(df.stop)
        states = list(df.hmm_state)

        sample = df['sample'].unique()[0]
        chromosome = df['chromosome'].unique()[0]

        new_intervals = [
            [sample, chromosome, stops[i], starts[i+1], 'transition', chrom_id, (starts[i+1]-stops[i])]
            for i in range(len(stops)-1)
            if states[i] != states[i+1]
        ]

        frames = [df]
        if new_intervals:  # avoid concatenating an empty frame
            frames.append(pd.DataFrame(new_intervals, columns=df.columns))

        # stable sort: a single-marker interval (start == stop) keeps its place
        # before the transition that begins at the same coordinate
        full_intervals_df = pd.concat(frames).sort_values(by='start', kind='mergesort').reset_index(drop=True)

        new_dfs.append(full_intervals_df)

    if not new_dfs:
        return master_df.iloc[0:0].copy()

    return pd.concat(new_dfs)




def create_state_intervals_df(project_dir: str, file_pattern: str):
    """Build the HMM state-interval dataframe from the TIGER outputs of every sample.

    This frame is much smaller than the per-marker dataframe: runs of the same
    HMM state are collapsed into intervals, and a 2-marker 'transition'
    interval is inserted between neighbouring intervals of different state.

    Parameters
    ----------
    project_dir : str
        TIGER project directory holding one folder per sample.
    file_pattern : str
        Suffix of the TIGER output to read; the column names below match
        '.CO_estimates.breaks.txt'.

    Returns
    -------
    pandas.DataFrame
        Columns sample, chromosome, start, stop, hmm_state, chrom_id, length,
        sorted by chrom_id, chromosome and start.

    Raises
    ------
    KeyError
        If a file contains an HMM state code that is not in the mapping below.
    """
    # for our cross scheme, heterozygous ('CL') and homozygous alternate ('LL')
    # calls should both be considered the alternate parental genotype
    genotype_dict = {'CC':ref_parental_genotype, "CL":alt_parental_genotype, "LL":alt_parental_genotype, "CU":"u"+ref_parental_genotype, "LU":'u'+alt_parental_genotype, "UN":"unknown", '?':"?"}

    files_list = get_TIGER_files(project_dir=project_dir, file_pattern=file_pattern)

    data = []

    for file in files_list:

        df = pd.read_csv(file, sep="\t", header=None, names = ["sample", "chromosome", "start", "stop", "hmm_state"])

        # direct dict lookup so an unexpected state code fails loudly
        df['hmm_state'] = [genotype_dict[state] for state in df['hmm_state']]
        df['chrom_id'] = df['sample'] + "-" + df['chromosome'].astype(str)
        df['length'] = df['stop'] - df['start'] + 1  # both end markers are included

        data.append(df)

    master_df = make_transition_intervals_df(pd.concat(data))

    return master_df.sort_values(by=['chrom_id', 'chromosome', 'start']).reset_index(drop=True)


def get_marker_counts_per_interval(master_bases_df, master_intervals_df):
    """Count the SNP markers that fall inside each interval.

    Marker density per interval is needed for later maths and for the
    classification of crossover intervals. A marker belongs to an interval
    when start <= position <= stop on the same chrom_id.

    Parameters
    ----------
    master_bases_df : pandas.DataFrame
        Per-marker frame with 'chrom_id' and 'position' columns.
    master_intervals_df : pandas.DataFrame
        Interval frame with 'chrom_id', 'start' and 'stop' columns. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        The intervals of every chrom_id present in `master_bases_df`, with an
        added 'marker_counts' column.
    """
    new_dfs = []

    for chrom_id in master_bases_df.chrom_id.unique():

        positions = master_bases_df.loc[master_bases_df['chrom_id']==chrom_id, 'position'].to_numpy()

        # explicit copy: the new column is added to an independent frame, not to
        # a filtered view of the caller's dataframe
        intervals_df = master_intervals_df[master_intervals_df['chrom_id']==chrom_id].copy()

        intervals_df['marker_counts'] = [
            int(((positions >= start) & (positions <= stop)).sum())
            for start, stop in zip(intervals_df['start'], intervals_df['stop'])
        ]

        new_dfs.append(intervals_df)

    return pd.concat(new_dfs)

# Ubiquitous items and file names

In [7]:
# All input paths below are resolved relative to the directory the notebook is run from.
work_dir = os.getcwd()

# Chromosome names as they appear in the VCF 'chromosome' column; change these as needed!
# (the 'n2_' prefix of the variable names is kept from an earlier version of this notebook)
n2_fasta_names = ['chr2L', 'chr2R', 'chr3L', 'chr3R', 'chr4', 'chrX']

# TODO: change these
# NOTE(review): one length per entry of n2_fasta_names, in the same order; per the
# TODO above these values still need to be replaced - confirm before relying on them.
n2_chrom_lengths = [15114068, 15311845, 13819453, 17493838, 20953657, 17739129]

# TIGER chromosome numbers (1-6) mapped to lengths, and chromosome names mapped to TIGER numbers
tiger_chrom_len_dict = dict(zip([1,2,3,4,5,6], n2_chrom_lengths))
tiger_chrom_name_dict = dict(zip(n2_fasta_names, [1,2,3,4,5,6]))

# Labels used for the two parental genotypes in the interval dataframes
ref_parental_genotype = 'w1118' #change this to your reference genotype
alt_parental_genotype = 'oregonr' #change this to your alternate parental genotype

# Sample VCFs are read from '<work_dir>/samples/'; parental SNP VCFs from '<work_dir>/references/'
sample_vcfs_path = os.path.join(work_dir, "samples/")
ref_vcf = os.path.join(work_dir, "references/w1118_snps.vcf")
alt_vcf = os.path.join(work_dir, "references/oregonr_snps.vcf")

# Sorted list of every regular file found in the sample directory
sample_vcfs = get_files_in_directory(sample_vcfs_path)

# Run Pipeline: VCF processing > TIGER HMM

In [None]:
# VCF processing and save
master_df_file = "../../data/parquets/progeny.parquet2"
# To rebuild the master dataframe from the VCFs instead of loading the saved copy:
# vcf_master_df = build_master_dataframe_list(sample_vcf_files=sample_vcfs, alt_parent_file=alt_vcf, ref_parent_file=ref_vcf)
vcf_master_df = pd.read_parquet(master_df_file)  # was `master_df`, which is never defined
# Save the dataframe as a gzip-compressed pickle file
save_vcf_data_as = "test_vcfs.pickle.gzip"
vcf_master_df.to_pickle(save_vcf_data_as, compression='gzip')

building dataframe from reference vcfs...


In [None]:
# Run the TIGER scripts to get the HMM outputs. The directory name is given
# without a trailing slash so that it matches the entries of os.listdir() and
# the project_dir used by the cells below.
run_TIGER_pipeline(vcf_master_df, project_dir='TIGER_test_output')

In [None]:
# Generate the TIGER per-marker dataframe from each sample's '.CO_estimates.txt'
# output and save it as a gzip-compressed pickle.
tiger_marker_df = create_TIGER_master_df(project_dir='TIGER_test_output', file_pattern='.CO_estimates.txt')
tiger_marker_df.to_pickle('TIGER_hmm_states.all_markers.pickle.gzip', compression='gzip')

In [None]:
# Build the state/transition intervals from each sample's '.CO_estimates.breaks.txt'
# output, then add the number of markers that fall inside every interval.
tiger_pre_intervals = create_state_intervals_df(project_dir='TIGER_test_output', file_pattern='.CO_estimates.breaks.txt')
tiger_intervals = get_marker_counts_per_interval(tiger_marker_df, tiger_pre_intervals)

# Save the intervals as a gzip-compressed pickle; the last line displays the dataframe.
tiger_intervals.to_pickle('TIGER_hmm_intervals.pickle.gzip', compression='gzip')
tiger_intervals