# Preliminaries

In [1]:
from pprint import pprint
import os, sys, shlex
# from IPython import embed
from subprocess import call, check_output
# Ask git for the repository root and make it the working directory, so the
# relative paths used by the cells below are resolved from there.
# os.fsdecode (instead of .decode('ascii')) keeps this working when the
# checkout lives under a path containing non-ASCII characters.
repo_rootdir = os.fsdecode(check_output(shlex.split("git rev-parse --show-toplevel")).strip())
os.chdir(repo_rootdir)
# Make the working directory importable. The membership test keeps sys.path
# free of duplicate entries when this cell is executed more than once.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

# Import modules
import pandas as pd
import yaml
import re
from pprint import pprint
from string import Formatter
from copy import deepcopy

from GWAS_pipeline.src.auxiliary import unfold_config
from GWAS_pipeline.src.run_gwas import GWAS_Run
import warnings
import time

from easydict import EasyDict

In [2]:
# Add the relative "CardiacCOMA" directory to the module search path, then
# import the helpers of its utils.mlflow_queries module.
sys.path.append("CardiacCOMA")
from CardiacCOMA.utils.mlflow_queries import (
    list_artifacts,
    get_significant_loci,
    get_metrics_cols,
    get_params_cols,
    get_runs_df,
    get_good_runs,
    summarize_loci_across_runs,
    get_model_pretrained_weights,
)

In [None]:
# Add the relative "GWAS_pipeline" directory to the module search path, then
# import the config/command builders from its main_bgenie module.
sys.path.append("GWAS_pipeline")
from GWAS_pipeline.main_bgenie import (
    extract_formatter_tokens,
    prepare_config,
    adjust_for_covariates,
    build_bgen_command,
    postprocess_gwas_by_region,
)

In [None]:
import mlflow
from mlflow.tracking import MlflowClient

# Tracking URI handed to MLflow. It is a relative path.
# NOTE(review): presumably resolved against the current working directory
# (the repository root after the os.chdir in the first cell) — confirm.
TRACKING_URI = "mlruns"
mlflow.set_tracking_uri(TRACKING_URI)
# The client is created without an explicit tracking URI.
# NOTE(review): presumably it picks up the URI set on the line above —
# confirm against the installed MLflow version.
client = MlflowClient()

## Query runs

In [None]:
# Query the runs selected by get_good_runs with a metric threshold of 2 and
# keep their run ids as a plain Python list.
runs_df = get_good_runs(metric_thres=2)
run_ids = runs_df["run_id"].tolist()

## Build configuration

In [72]:
# Arguments for the GWAS pipeline, gathered in a plain dict.
# "suffix" and "run_id" hold "{...}" placeholder templates; entries set to
# None carry no explicit value here.
args = dict(
    yaml_config_file="GWAS_pipeline/config_files/ref_config.yaml",
    name_rules="GWAS_pipeline/config_files/filename_rules/filename_rules.yaml",
    suffix="{experiment_id}_{run_id}",
    quality_control="quality_control/quality_control.yaml",
    experiment_id="1",
    dry_run=False,
    gwas_software="bgenie",
    phenotype_file=None,
    gwas_file=None,
    phenotypes=None,
    chromosomes=None,
    bgen_sample_file=None,
    covariates=None,
    sample_white_lists=None,
    sample_black_lists=None,
    run_id="{run_id}",
)

In [73]:
# Wrap the arguments in an attribute-accessible EasyDict, with the "{run_id}"
# placeholder replaced by the first run id returned by the query, and let
# prepare_config build the configuration from it.
# (pprint(config) can be used to inspect the result.)
config_to_replace = EasyDict({**args, "run_id": run_ids[0]})
config = prepare_config(config_to_replace)

GWAS_pipeline/config_files/ref_config.yaml
quality_control/quality_control.yaml
/nobackup/scrb/src/GWAS_pipeline/config_files
genotype_patterns/genotype_patterns.yaml
/nobackup/scrb/src/GWAS_pipeline/config_files


In [74]:
# Build one command string per pipeline step from the prepared configuration.
adj_command = adjust_for_covariates(config)
gwas_command = build_bgen_command(config)
postproc_command = postprocess_gwas_by_region(config)

# Step id -> command. The ids are strings because the execution loop matches
# them against the entries of `steps_to_run`.
commands = {
    "1": adj_command,
    "2": gwas_command,
    "3": postproc_command,
}

# Step id -> message printed right before the step is executed.
# In the step-1 message the full stop comes before the trailing newline, so it
# stays on the same line as the sentence it ends.
messages = {
    "1": "\nPreprocessing the phenotype file to perform GWAS on {}.\n".format(config["gwas_software"]),
    "2": "\nSubmitting GWAS jobs to the queue\n",
    "3": "\nConcatenating per-region GWAS files, creating Manhattan plots, Q-Q plots and region-wise summaries\n",
}

In [75]:
# Pipeline steps the execution loop is allowed to run. Only step "1" is
# enabled; steps "2" and "3" are switched off by the trailing comment.
config_to_replace.steps_to_run = ["1"] #,"2","3"]
# False: the execution loop prints each command string before running it.
config_to_replace.no_print_commands = False

In [76]:
# Rebind `args` (so far the plain dict defined in the configuration cell) to
# the EasyDict, so the execution loop can read `args.steps_to_run`,
# `args.no_print_commands` and `args.dry_run` as attributes.
args = config_to_replace

In [77]:
# Execute the selected pipeline steps, in the order in which `commands` lists
# them.
for k, command in commands.items():
    if k not in args.steps_to_run:
        continue
    if not args.no_print_commands:
        print(command)
    if args.dry_run:
        # Dry run: the command is (optionally) shown above but never executed.
        continue
    print(messages[k])
    # `call` reports failure only through its return value. Check it so that a
    # step exiting with a non-zero status stops the loop instead of being
    # silently followed by the next step.
    exit_status = call(shlex.split(command))
    if exit_status != 0:
        raise RuntimeError(
            "Step {} exited with status {}: {}".format(k, exit_status, command)
        )

Rscript src/preprocess_files_for_GWAS.R
--phenotype_file ../CardiacCOMA/mlruns/1/b6a7eedfc0e84b8b9d1f099ed5f158c4/artifacts/output/latent_vector.csv
--columns_to_exclude ID subset
--covariates_config_yaml covariates/std_covariates_PC.yaml
--output_file ../CardiacCOMA/mlruns/1/b6a7eedfc0e84b8b9d1f099ed5f158c4/artifacts/output/latent_vector_adj_10PCs.csv
--gwas_software bgenie
--bgen_sample_file data/transforms/genotypes_by_region/samples.txt


Preprocessing the phenotype file to perform GWAS on bgenie
.


FileNotFoundError: [Errno 2] No such file or directory: 'Rscript': 'Rscript'

In [None]:
# Write the prepared configuration to "config.yaml", placed in the directory
# of the path stored under config["filename_patterns"]["gwas"].
config_yaml_path = os.path.join(os.path.dirname(config["filename_patterns"]["gwas"]), "config.yaml")
# The context manager closes (and thereby flushes) the file once the dump is
# done; a bare open(...) passed straight to yaml.dump is never closed
# explicitly.
with open(config_yaml_path, "w") as config_yaml:
    yaml.dump(config, config_yaml)

In [None]:
# Command-line string for main_bgenie.py, shown as the cell's output.
# The run id is read from `run_ids[0]` (the same value assigned to
# `config_to_replace.run_id` above); a bare `run_id` name is not defined in
# this notebook and would raise NameError on a fresh kernel.
f"python main_bgenie.py --run_id {run_ids[0]} --experiment_id 1 --gwas_software bgenie --steps_to_run 1"