# Script to make a merged vcf genetic data freeze
Kasia Bryc

July 17, 2024

This notebook is designed to make a file containing a list of all the VCF files for dogs
that have been genotyped or sequenced as part of Darwin's Ark. This includes dogs genotyped on arrays (which occurred early on). The goal is to make a merged dataset that will be used for downstream GWAS analysis as part of the dog behavioral genetics PNAS paper.

In [None]:
import pandas as pd
import os

In [None]:
# Load the breed table from the survey-analysis cache and collect the
# distinct barcodes it contains (as plain Python values).
cache_dir = "/seq/vgb/bryc/darwins_cats/dog_survey_analysis/data/"
breed_file = os.path.join(cache_dir, "breed_data.csv")
breed_data = pd.read_csv(breed_file)
unique_barcodes = breed_data["barcode"].unique()
barcodes = unique_barcodes.tolist()

In [None]:
def _list_vcf_files(directory):
    """Walk *directory* once and return ``(file name, full path)`` for every ``.vcf.gz`` file."""
    vcf_files = []
    for root, _dirs, files in os.walk(directory):
        for name in files:
            if name.endswith(".vcf.gz"):
                vcf_files.append((name, os.path.join(root, name)))
    return vcf_files


def find_vcf_locations(barcodes,
                       directory_to_search="/seq/vgb/rawData/gencove",
                       secondary_directory="/seq/vgb/dd"):
    """Map each barcode to the ``.vcf.gz`` files whose names start with it.

    Each barcode is first looked up under ``directory_to_search``; only when
    nothing is found there is ``secondary_directory`` consulted as a backup.
    Progress is printed per barcode, followed by a summary of how many
    barcodes had no file in either directory.

    Args:
        barcodes: Sized collection of barcodes; each is converted with
            ``str()`` before being compared against file names.
        directory_to_search: Root of the primary directory tree to search.
        secondary_directory: Root of the backup directory tree, used only
            for barcodes with no match in the primary tree.

    Returns:
        A dict mapping every barcode to a list of matching file paths
        (an empty list when no file was found).
    """
    # Index each tree once up front instead of re-walking the filesystem for
    # every barcode; the backup tree is only walked if some barcode needs it.
    primary_vcfs = _list_vcf_files(directory_to_search)
    secondary_vcfs = None

    barcode_to_locations = {}
    total = len(barcodes)
    # Counted across all barcodes, so it must be initialised outside the loop.
    missing = 0
    for i, barcode in enumerate(barcodes, start=1):
        prefix = str(barcode)
        print(f"[{i}/{total}] Looking for barcode: {prefix}")
        # NOTE(review): prefix matching means barcode "123" would also match
        # a file named "1234_*.vcf.gz" -- confirm against the naming scheme.
        locations = [path for name, path in primary_vcfs
                     if name.startswith(prefix)]
        # If no vcf file locations are found, try the other back up directory
        if not locations:
            if secondary_vcfs is None:
                secondary_vcfs = _list_vcf_files(secondary_directory)
            locations = [path for name, path in secondary_vcfs
                         if name.startswith(prefix)]
        if not locations:
            print(f"No location found for barcode {barcode}")
            missing += 1
        barcode_to_locations[barcode] = locations
    print(f"{missing}/{total} barcodes not found")
    return barcode_to_locations

In [None]:
# Search the default primary/backup directories for every barcode's VCF files.
locations_dict = find_vcf_locations(barcodes=barcodes)

In [None]:
# One row per barcode; the 'locations' cell holds that barcode's list of paths.
barcode_location_pairs = list(locations_dict.items())
locations_df = pd.DataFrame(barcode_location_pairs, columns=['barcode', 'locations'])

In [None]:
# Persist the barcode -> locations table to the dated data cache.
locations_df.to_csv(
    path_or_buf="/seq/vgb/dd/data/2024-07-18_data_cache/2024-07-18_dd_vcf_best_search_file_locations.csv"
)