Helper class that handles the extraction set-up: it downloads the UCSC alignment listing and maps species names to the alignment names used in the files

In [7]:
#multiprocessing over one species triplet and many chromosomes
import pandas as pd
import re
import numpy as np
import gzip
import os
import sys
import json

import multiprocessing

from collections import defaultdict
from urllib.request import urlopen
from ChromosomeMutExtractor import get_mutation_counts
'''
To run extraction on your computer you first need to download the files from:
https://hgdownload.soe.ucsc.edu/goldenPath/hg38/multiz100way/.

Run the commands:
mut_extractor = MPMutExtractor(folder, file_names, species, url)
mut_extractor.get_mutation_counts()

Where:
folder: The folder containing the downloaded maf files with the different chromosomes alignment data
file_names: The name of all the files with the data from the folder
species: A String array with the three (scientific) names of the species you want to analyse.
example = ["Ovis aries", "Capra hircus", "Bos taurus"], where Bos taurus is the outgroup
url - no need to specify, but should contain the url of the multiz100way in UCSC
'''



# Multiprocessing mutational count extractor
class MPMutExtractor:
    """
    Resolves species names against the UCSC multiz alignment listing and
    collects the chromosome alignment files that are available locally.

    The methods defined here download the listing page, parse its assembly
    table into common / scientific / alignment names, and map a species
    triplet to the alignment names used inside the alignment files.

    Important Attributes:
    species_names (String list): scientific names of the 3 species to be used (species[2] is the outgroup).
    folder (String): The path of the folder containing alignment files
    chromosomes (String list): entry names in ``folder`` starting with "chr", cut at the first '.'.
    file_names (String list): full paths of those entries, in the same order as ``chromosomes``.
    html (String): decoded text of the page downloaded from ``url``.
    common_names, scientific_names, alignment_names (String lists): parallel
        columns of the assembly table; set on first use by get_species_alignment_names.
    chosen_species (String list): result of the last get_species_alignment_names call.
    """

    # Text markers delimiting the assembly table inside the downloaded page.
    _TABLE_START = "Assemblies used in these alignments:"
    _TABLE_END = "---------------------------------------------------------------"

    def __init__(self,
                 folder = None,
                 chromosomes_folder = "chr_files",
                 species = ("Ovis aries", "Capra hircus", "Bos taurus"),
                 url = "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/multiz100way/"):

        """
        A constructor to initialize a new MPMutExtractor instance.

        Parameters:
        folder : accepted for backward compatibility only; it is not used.
                 The data folder is always <current working dir>/<chromosomes_folder>.
        chromosomes_folder : folder (relative to the current working directory,
                 or absolute) holding the per-chromosome alignment files.
        species : the three scientific names to analyse; the third is the outgroup.
        url : address of the page that contains the assembly table.

        Raises:
        urllib.error.URLError / OSError : if the page cannot be downloaded or
                 the chromosome folder cannot be listed.
        """

        # Close the HTTP response as soon as the body has been read.
        with urlopen(url) as page:
            self.html = page.read().decode("utf-8")

        self.folder = os.path.join(os.getcwd(), chromosomes_folder)
        # List the folder once so that chromosomes and file_names stay index-aligned.
        chromosome_entries = [entry for entry in os.listdir(self.folder) if entry.startswith("chr")]
        self.chromosomes = [entry.split('.')[0] for entry in chromosome_entries]
        self.file_names = [os.path.join(self.folder, entry) for entry in chromosome_entries]
        # Copy so that later mutation never leaks into the caller's (or the default) sequence.
        self.species_names = list(species)


    # returns the data rows of the assembly table found in self.html
    def __assembly_rows(self):
        start = self.html.find(self._TABLE_START)
        if start == -1:
            raise ValueError("Assembly table header not found in the downloaded page")
        start += len(self._TABLE_START)
        # Look for the closing rule only after the header.
        end = self.html.find(self._TABLE_END, start)
        if end == -1:
            raise ValueError("End of the assembly table not found in the downloaded page")
        # Skip blank lines and the '=' separator lines between groups.
        return [line for line in self.html[start:end].splitlines()
                if line.strip() and not line.startswith('=')]


    # finds scientific names
    def __find_scientific_names(self):
        # The scientific name occupies the fixed-width columns 25..55 of each row.
        names = [row[25:56].strip(' ') for row in self.__assembly_rows()]
        self.scientific_names = names
        return names


    #finds names for alignment
    def __find_alignment_names(self):
        # The alignment name is the text between the last '/' that is followed
        # by non-space characters and the whitespace that ends that token.
        names = [re.findall(r"/\S*\s", row)[-1][1:-1] for row in self.__assembly_rows()]
        self.alignment_names = names
        return names

    #find common names
    def __find_common_names(self):
        # The common name occupies the first 25 columns of each row.
        names = [row[:25].strip(' ') for row in self.__assembly_rows()]
        self.common_names = names
        return names

    #gets the alignment names for the 3 species given to it
    def get_species_alignment_names(self, species = None):
        """
        Map three scientific species names to their alignment names.

        Parameters:
        species : optional sequence of three scientific names; when omitted
                  (None) the instance's species_names are used.

        Returns:
        A list with the three alignment names, also stored in self.chosen_species.
        On first use the name tables (common_names, scientific_names,
        alignment_names) are parsed from self.html and stored on the instance.

        Raises:
        ValueError : if the assembly table cannot be located or a species
                     name is not present in it.
        IndexError : if fewer than three species names are supplied.
        """

        required = ("common_names", "scientific_names", "alignment_names")
        if not all(hasattr(self, attr) for attr in required):
            self.__find_common_names()
            self.__find_scientific_names()
            self.__find_alignment_names()

        if species is None:
            species = self.species_names

        scientific_names = list(self.scientific_names)
        alignment_names = list(self.alignment_names)

        chosen_species = []
        for position in range(3):
            name = species[position]
            try:
                # First matching row wins.
                row = scientific_names.index(name)
            except ValueError:
                raise ValueError(f"Species {name!r} was not found in the assembly table") from None
            chosen_species.append(alignment_names[row])

        self.chosen_species = chosen_species
        return chosen_species

All the chosen species triplets

In [8]:
#chosen species for extraction
# Each row is one species triplet given by common name. A later cell looks
# these names up by exact equality against the common names parsed by
# MPMutExtractor, so the spelling here must match that listing exactly.
# NOTE(review): presumably the third entry of each row is the outgroup, as in
# MPMutExtractor's species convention — confirm.
common_species = [['Human', 'Chimp', 'Gorilla'],
                  ['Chimp', 'Gorilla', 'Orangutan'],
                  ['Gorilla', 'Orangutan', 'Gibbon'],
                  ['Orangutan', 'Gibbon', 'Rhesus'],
                  ['Rhesus', 'Crab-eating macaque', 'Baboon'],
                  ['Baboon', 'Crab-eating macaque', 'Green monkey'],
                  ['Baboon', 'Green monkey', 'Gibbon'],
                  ['Mouse', 'Rat', 'Prairie vole'],
                  ['Brush-tailed rat', 'Chinchilla', 'Guinea pig'],
                  ['Chinese hamster', 'Golden hamster', 'Prairie vole'],
                  ['Prairie vole', 'Rat', 'Lesser Egyptian jerboa'],
                  ['Guinea pig', 'Naked mole-rat', 'Lesser Egyptian jerboa'],
                  ['Pacific walrus', 'Weddell seal', 'Panda'],
                  ['Panda', 'Ferret', 'Dog'],
                  ["David's myotis bat", 'Microbat', 'Big brown bat'],
                  ['Microbat', 'Big brown bat', 'Megabat'],
                  ['Black flying-fox', 'Megabat', 'Big brown bat'],
                  ['Sheep', 'Domestic goat', 'Cow'],
                  ['Dolphin', 'Killer whale', 'Tibetan antelope'],
                  ['Bactrian camel', 'Alpaca', 'Tibetan antelope'],
                  ['Tasmanian devil', 'Wallaby', 'Opossum'],
                  ['White-throated sparrow', 'Medium ground finch', 'Collared flycatcher'],
                  ['Zebra finch', 'Medium ground finch', 'Collared flycatcher'],
                  ['Parrot', 'Scarlet macaw', 'Budgerigar'],
                  ['Peregrine falcon', 'Saker falcon', 'Budgerigar'],
                  ['Chicken', 'Turkey', 'Mallard duck'],
                  ['Chinese softshell turtle', 'Spiny softshell turtle', 'Green seaturtle'],
                  ['Zebra mbuna', 'Pundamilia nyererei', "Burton's mouthbreeder"],
                  ['Pundamilia nyererei', "Burton's mouthbreeder", 'Princess of Burundi'],
                  ["Burton's mouthbreeder", 'Princess of Burundi', 'Nile tilapia'],
                  ['Yellowbelly pufferfish', 'Fugu', 'Tetraodon']]
            
                  
# Hand-written scientific names, one triplet per row.
# NOTE(review): appears to parallel common_species row for row — confirm.
# This name is reassigned in a later cell, where scientific_species is rebuilt
# from common_species via the parsed name table; these literal values are
# superseded once that cell has run.
scientific_species = [['Homo sapiens', 'Pan troglodytes', 'Gorilla gorilla gorilla'],
           ['Pan troglodytes', 'Gorilla gorilla gorilla', 'Pongo pygmaeus abelii'],
           ['Gorilla gorilla gorilla', 'Pongo pygmaeus abelii', 'Nomascus leucogenys'],
           ['Pongo pygmaeus abelii', 'Nomascus leucogenys', 'Macaca mulatta'],
           ['Macaca mulatta', 'Macaca fascicularis', 'Papio hamadryas'],
           ['Papio hamadryas', 'Macaca fascicularis', 'Chlorocebus sabaeus'],
           ['Papio hamadryas', 'Chlorocebus sabaeus', 'Nomascus leucogenys'],
           ['Mus musculus', 'Rattus norvegicus', 'Microtus ochrogaster'],
           ['Octodon degus', 'Chinchilla lanigera', 'Cavia porcellus'],
           ['Cricetulus griseus', 'Mesocricetus auratus', 'Microtus ochrogaster'],
           ['Microtus ochrogaster', 'Rattus norvegicus', 'Jaculus jaculus'],
           ['Cavia porcellus', 'Heterocephalus glaber', 'Jaculus jaculus'],
           ['Odobenus rosmarus divergens', 'Leptonychotes weddellii', 'Ailuropoda melanoleuca'],
           ['Ailuropoda melanoleuca', 'Mustela putorius furo', 'Canis lupus familiaris'],
           ['Myotis davidii', 'Myotis lucifugus', 'Eptesicus fuscus'],
           ['Myotis lucifugus', 'Eptesicus fuscus', 'Pteropus vampyrus'],
           ['Pteropus alecto', 'Pteropus vampyrus', 'Eptesicus fuscus'],
           ['Ovis aries', 'Capra hircus', 'Bos taurus'],
           ['Tursiops truncatus', 'Orcinus orca', 'Pantholops hodgsonii'],
           ['Camelus ferus', 'Vicugna pacos', 'Pantholops hodgsonii'],
           ['Sarcophilus harrisii', 'Macropus eugenii', 'Monodelphis domestica'],
           ['Zonotrichia albicollis', 'Geospiza fortis', 'Ficedula albicollis'],
           ['Taeniopygia guttata', 'Geospiza fortis', 'Ficedula albicollis'],
           ['Amazona vittata', 'Ara macao', 'Melopsittacus undulatus'],
           ['Falco peregrinus', 'Falco cherrug', 'Melopsittacus undulatus'],
           ['Gallus gallus', 'Meleagris gallopavo', 'Anas platyrhynchos'],
           ['Pelodiscus sinensis', 'Apalone spinifera', 'Chelonia mydas'],
           ['Maylandia zebra', 'Pundamilia nyererei', 'Haplochromis burtoni'],
           ['Pundamilia nyererei', 'Haplochromis burtoni', 'Neolamprologus brichardi'],
           ['Haplochromis burtoni', 'Neolamprologus brichardi', 'Oreochromis niloticus'],
           ['Takifugu flavidus', 'Takifugu rubripes', 'Tetraodon nigroviridis']]


Getting the common, scientific and alignment names for the species

In [11]:
# Build the extractor and trigger parsing of the assembly table; the call
# below fills mp.common_names, mp.scientific_names and mp.alignment_names.
mp = MPMutExtractor()
mp.get_species_alignment_names()
common_names = np.array(mp.common_names)
scientific_names = np.array(mp.scientific_names)
alignment_names = np.array(mp.alignment_names)

# Row index (first match) of every common name, triplet by triplet; the same
# rows are then read from the scientific-name and alignment-name columns.
_triplet_rows = [[np.flatnonzero(common_names == s)[0] for s in species_triplet] for species_triplet in common_species]
scientific_species = [[scientific_names[row] for row in rows] for rows in _triplet_rows]
alignment_species = [[alignment_names[row] for row in rows] for rows in _triplet_rows]

Creating the arguments file for condor

In [10]:
#Create input file for condor
# One line per triplet: the scientific names joined by ';' (spaces turned
# into '_'), a comma, then the alignment names joined by ';'.
with open("species_arguments.txt", "w") as f:
    for idx, alignment_triplet in enumerate(alignment_species):
        scientific_field = ';'.join(scientific_species[idx]).replace(' ', '_')
        alignment_field = ';'.join(alignment_triplet)
        f.write(f"{scientific_field},{alignment_field}\n")