In [5]:
import os
import pandas as pd
import matplotlib
import ruamel.yaml as YAML
import yaml
import logging
from subprocess import call
from sklearn import linear_model

In [2]:
phenotypes = [
  "LVEDV_automatic",
  "LVESV_automatic",
  "LVSV_automatic",
  "LVEF_automatic",
  "LVM_automatic",
  "RVEDV_automatic",
  "RVESV_automatic",
  "RVSV_automatic",
  "RVEF_automatic",
  "LVEDV_manual",
  "LVESV_manual",
  "LVSV_manual",
  "LVEF_manual",
  "LVM_manual",
  "RVEDV_manual",
  "RVESV_manual",
  "RVSV_manual",
  "RVEF_manual"
]

In [3]:
# copied from http://thoughtsbyclayg.blogspot.com/2008/10/parsing-list-of-numbers-in-python.html
# with a few modifications done afterwards
def parseIntSet(nputstr=""):
  selection = set()
  invalid = set()
  # tokens are comma seperated values
    
  tokens = [x.strip() for x in nputstr.split(',')]
  for i in tokens:
     try:
        # autosomal chromosomes are integers between 1 and 22
        if int(i) <= 22 and int(i) >= 1:
            selection.add(i)
        else:
            invalid.add(i)
     except:
        # if not, then it might be a range
        try:
           token = [int(k.strip()) for k in i.split('-')]
           if len(token) > 1:
              token.sort()
              # we have items seperated by a dash
              # try to build a valid range
              first = token[0]
              last = token[len(token)-1]
              for x in range(first, last+1):
                 selection.add(str(x))
        except:
            if i == "X" or i == "Y":
                selection.add(i)            
            else: 
                # not an int and not a range...
                invalid.add(i)
                
  # Report invalid tokens before returning valid selection
  # print("Invalid set: " + str(invalid))
    
  return selection
# end parseIntSet
    
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False    

In [4]:
class GWAS_Run:
    
    def __init__(self, yaml_config_file):
        
        config = yaml.safe_load(open(yaml_config_file))

        # dir stands for directory
        # f stands for file
        # fp stands for file pattern
        
        ## paths to data
        data_dir = config["data_dirs"]
        genotype_dir = data_dir["genotype"]
        pheno_dir = data_dir["phenotype"]    
        indiv_dir = data_dir["individuals"]
        output_dir = data_dir["output"]
        tmp_dir = data_dir["tmp"]
        
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
            
        if not os.path.exists(tmp_dir):
            os.makedirs(tmp_dir)

        ## file name patterns
        fp_config = config["filename_pattern"]
        self.indiv_f = os.path.join(indiv_dir, fp_config["individuals"])
        self.genotype_fp = os.path.join(genotype_dir, fp_config["genotype"])
        self.bim_fp = os.path.join(genotype_dir, fp_config["bim"])
        self.fam_fp = os.path.join(genotype_dir, fp_config["fam"])
        self.pheno_f = os.path.join(pheno_dir, fp_config["phenotype"])
        self.gwas_f = os.path.join(output_dir, fp_config["gwas"])
        self.pheno_f_tmp = os.path.join(tmp_dir, fp_config["phenotype_tmp"])

        # paths to executable files
        self.sbat_exec = config["exec"]["sbat"]
        self.plink_exec = config["exec"]["plink"]
            
        gwas_run.generate_phenotype_file(phenotypes)
       
        # other parameters of the run
        chromosomes = parseIntSet(str(config["chromosomes"]))
        self.chromosomes = [x for x in chromosomes if is_number(x)]
        if "X" in chromosomes:
            self.chromosomes.append("X")
        if "Y" in chromosomes:
            self.chromosomes.append("Y")
    
        # other options
        self.delete_temp = config.get(["delete_temp"], True)
        self.merge_chromosomes = config.get(["merge_chromosomes"], True)
        self.overwrite_output = config.get(["overwrite_output"], False)
        self.generate_tabix = config.get(["generate_tabix"], False)        
        
        
    def __str__(self):
        return("\n".join([
            "Genotype file pattern: %s" % self.genotype_fp,
            "bim file pattern: %s" % self.bim_fp,
            "fam file pattern: %s" % self.fam_fp,
            "Phenotype file name: %s" % self.pheno_f,
            "GWAS file name: %s" % self.gwas_f
        ]))
    
    
    def generate_phenotype_file(self, phenotypes, tmp=True):
        '''
        Create a phenotype file with the format as required by plink
        IID|FID|phenotype1|phenotype2|...
        '''

        df = pd.read_table(self.pheno_f)
        df.rename(columns={"Subject ID": "IID"}, inplace=True)
        df["FID"] = df["IID"] # duplicate ID column in FID column
        # print(df.head())

        cols = df.columns.to_list()
        cols = [cols[-1]] + [cols[0]] + cols[1:-1]

        # ID|FID|pheno1|pheno2|...
        # os.path.isdir()
        # filter
        cols = [x for x in cols if x in (["IID", "FID"] + phenotypes)]
        not_found = {x for x in phenotypes if x not in cols}
        df = df[cols]
        # raise warning if `not_found` is not empty

        df.to_csv(self.pheno_f_tmp, sep="\t", na_rep="NA", index=False)
        # print(df.head())

        
    def format_genotype_file(origin="bed", destiny="bgen"):
        
        '''
        run shell commands in order to convert from the origin format to the destiny format (the one required by the GWAS tool, e.g. plink or BGENIE)
        '''

        pass
        
        # TODO: add log messages (redirect error output?)
        if (origin, destiny) == ("bed", "bgen"):
            output_f = genotype_f[:-3] + "bgen"
            command = [plink, "--pfile", genotype_f, "--export", "bgen-1.2", "--out", output_f]
        else:
            return

        call(command)

        
    def filter_individuals(ids=None, conditions=None, fields=None, tmp=True):
        '''
        filter by ids, or a set of conditions imposed on a field or fields (e.g. gender, ethnicity and/or age).
        if tmp == True, the files generated herein will be erased in the end
        '''
        pass
    
    
    def execute(self, chromosome, phenotype):
        # TODO: add log messages (redirect error output?)
        if self.covariates is not None:
            command = [self.plink_exec,
                       "--bed", self.genotype_fp.format(chromosome=chromosome),
                       "--bim", self.bim_fp.format(chromosome=chromosome),
                       "--fam", self.fam_fp.format(chromosome=chromosome),
                       "--assoc",
                       "--pheno", self.pheno_f_tmp,
                       "--pheno-name", phenotype,
                       "--out", self.gwas_f.format(phenotype=phenotype, suffix="__chr%s" % chromosome)]
        # return(" ".join(command))
        # command = [plink, "--bed", bed_file, "--bim", bim_file, "--fam", fam_file, "--linear", "assoc", "--pheno", phenotype_f, "--pheno-name", phenotype, "--out", gwas_f]
        
        call(command)
        
    def extract_metadata(fields):
        command = ["ukbconv", ""]
        pass

    def adjust_phenotype(covariates):
        pass

    def manhattan_plot(gwas_results):
        pass

In [1]:
def run(args):
    
    gwas_run = GWAS_Run(args.yaml_config_file)

    for phenotype in phenotypes:

        for chr in gwas_run.chromosomes:
            gwas_run.execute(chr, phenotype)

        if gwas_run.merge_chromosomes:
            with open(gwas_run.gwas_f.format(phenotype=phenotype, suffix="") as merged_fh:
                partitioned_gwas_files = [gwas_run.gwas_fp.format(chromosome=chr) for chr in gwas_run.chromosomes]
                for i, file_chr in enumerate(partitioned_gwas_files):
                    with open(file_chr) as chr_fh:
                        for j, line in enumerate(chr_fh):
                            if (i == 0 and j == 0) or (j != 0):
                                merged_fh.write(line)

    if gwas_run.delete_temp:
        os.rmdir(gwas_run.tmp_dir)

SyntaxError: unexpected EOF while parsing (<ipython-input-1-e0cbc0d78fdf>, line 9)

In [None]:
if __name__ == "__main__":

    import argparse

    parser = argparse.ArgumentParser(description="Harmonize data, run GWAS and generate descriptive plots.")

    parser.add_argument("--yaml_config_file", default="config.yaml")

    args = parser.parse_args()
    
    run(args.yaml_config_file)

In [6]:
import os 
phenotypes = [x.strip() for x in open('run_params/phenotypes.txt')]
print(phenotypes)

FileNotFoundError: [Errno 2] No such file or directory: 'run_params/phenotypes.txt'

In [3]:
os.getcwd()

'/home/rodrigo/Leeds/doctorado/repos/GWAS_pipeline/code'

In [15]:
yaml.safe_load(open("yaml_files/config_test.yaml"))

{'chromosomes': '1-22',
 'phenotypes_list': 'a',
 'individuals': 'only_FID_IID.txt',
 'filename_patterns': ['genotype_patterns.yaml', 'filename_patterns.yaml'],
 'gwas': '../output/gwas/{phenotype}/gwas__{phenotype}{{suffix}}.plink.assoc',
 'suffix': '{}{}{}',
 'exec': {'plink': 'plink'}}

In [29]:
os.chdir("/home/rodrigo/Leeds/doctorado/repos/GWAS_pipeline/code/")
yaml_file = "yaml_files/config_test.yaml"

In [8]:
def unfold_config(yaml_file):
    '''
    This function reads a yaml configuration file
    If some the fields are paths to other yaml files,
    it will load their contents as values of the corresponding keys
    '''
    
    def is_yaml_file(x):
        if isinstance(x, str):
            return x.endswith("yaml") or x.endswith("yml")
        return False
    
    config_dir = os.path.dirname(yaml_file)
    config = yaml.safe_load(open(yaml_file))
    for key in config:
        #TODO: 
        if isinstance(config[key], list) and all([is_yaml_file(x) for x in config[key]]) or is_yaml_file(config[key]):
            dd = {}
            file_names = [x if os.path.exists(x) else os.path.join(config_dir, x) for x in config[key]]
            print(file_names)
            for file_name in file_names:
                dd = {**dd, **yaml.safe_load(open(file_name))}
            config[key] = dd
    return config
                

In [51]:
def is_yaml_file(x):
    if isinstance(x, str):
        return x.endswith("yaml") or x.endswith("yml")
    return False
    
def unfold_config(token):
    '''
    This function reads a yaml configuration file
    If some the fields are paths to other yaml files,
    it will load their contents as values of the corresponding keys
    '''           
    if is_yaml_file(token):
        token = yaml.safe_load(open(token))
    if isinstance(token, dict):
        for k, v in token.items():
            # print("{0}: {1}".format(k, v))
            token[k] = unfold_config(v)
    return token

In [52]:
unfold_config(yaml_file)

{'chromosomes': '1-22',
 'phenotypes_list': 'a',
 'individuals': 'only_FID_IID.txt',
 'filename_patterns': {'genotype': {'bed': '../data/calls/ukb_cal_chr{chromosome}_v2.bed',
   'bim': '../data/calls/ukb_snp_chr{chromosome}_v2.bim',
   'fam': '../data/calls/ukb11350_cal_chr{chromosome}_v2_s488282.fam'},
  'phenotype': {'phenotype_file': '../data/Cardiac_Function_Indexes_11350.tsv',
   'phenotype_file_tmp': '../data/tmp/Cardiac_Function_Indexes_11350_tmp.tsv',
   'covariates': ''},
  'gwas_fp': '../output/gwas/{phenotype}/gwas__{phenotype}{{suffix}}.plink.assoc'},
 'suffix': '{covariate_adjustment}__{inverse_normalization}__{segmentation_method}__{ethnicity}',
 'suffix_tokens': {'covariate_adjustment': {False: None, True: 'Adj'},
  'inverse_normalization': {False: None, True: 'InvNorm'},
  'segmentation_method': {'manual': 'Manual',
   'automatic': 'Auto',
   'manual_plus_automatic': 'Man_And_Auto'},
  'ethnicity': {'white': 'EUR', 'british': 'GBR', 'all': 'All_Ethnicities'}},
 'exec':