In [1]:
import argparse
import os
import sys
import warnings
from pathlib import Path

import pandas as pd

from scripts.folder_structure import FolderStructure
from scripts.input_processing import InputProcessor
from scripts.variables_manager import VariablesManager
from scripts.mmseqs_clustering import MMseqsClustering
from scripts.gwas import GWASWorkflow
from scripts.processing import Processor

%load_ext autoreload
%autoreload 2
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 40)

# workflow IO
input_dir = "/Users/januszkoszucki/MGG Dropbox/Projects/kleb-prophage-div/2025-02-12_KLEBDATA_LIGHT"
output_dir = "/Users/januszkoszucki/MGG Dropbox/Projects/gwas/2025-03-26_KLEBDATA_LIGHT_GWAS"

SyntaxError: ':' expected after dictionary key (folder_structure.py, line 317)

In [None]:
# Folder structure: build the helper from the workflow input/output directories,
# then fetch the paths and params objects that the later cells pass around.
folder_manager = FolderStructure(input_dir, output_dir)
structure, params = (
    folder_manager.get_paths_dict(),
    folder_manager.get_params_dict(),
)

In [None]:
# input processor
# Builds an InputProcessor from the shared `structure` and `params` objects and
# calls its processing steps one after another, in the order listed below.
inprocessor = InputProcessor(structure, params)
inprocessor.process_bacteria_table()
inprocessor.process_bacteria_iqtree()
inprocessor.process_prophage_table()
inprocessor.process_prophage_proteins()
# The last two steps are invoked with run=False.
# NOTE(review): presumably run=False skips the step's heavy computation —
# confirm against scripts/input_processing.py.
inprocessor.process_function_predictions(run=False)
inprocessor.process_recombinant_depos(run=False)

In [None]:
# Variables manager: one object hands out the argument bundles that the
# clustering and GWAS cells unpack with `*`.
manager = VariablesManager(structure, params)

# Bundles for the clustering stages.
mmseqs_vars = manager.get_mmseqs_vars()
proc_clus_vars = manager.get_process_clustering_vars()
matrix_vars = manager.get_matrix_vars()
functions_vars = manager.get_map_functions_vars()

# Bundles for the two GWAS modes, requested in the same order as before
# (lasso first, then elastic net).
lasso_vars, elastic_net_vars = (
    manager.get_gwas_vars(mode='lasso'),
    manager.get_gwas_vars(mode='elastic_net'),
)

In [None]:
# mmseqs clustering
# The constructor and three of the steps receive their arguments by unpacking
# the *_vars bundles, so the variables-manager cell must have been run first.
clustering = MMseqsClustering(*mmseqs_vars)
# NOTE(review): presumably run=False skips launching the MMseqs run itself —
# confirm against scripts/mmseqs_clustering.py.
clustering.run_mmseqs(run=False)
clustering.clean_clustering()
clustering.process_clustering(*proc_clus_vars)
clustering.compute_matrix(*matrix_vars)
clustering.map_functions(*functions_vars)
clustering.get_alignments()

In [None]:
# gwas
# Builds the GWAS workflow from the shared `structure` and `params` objects.
gwas = GWASWorkflow(structure, params)
gwas.get_input_files()
# NOTE(review): presumably run=False skips recomputing the variant count —
# confirm against scripts/gwas.py.
gwas.compute_n_variants(run=False)
# get_script is called once per GWAS mode (lasso bundle first, then the
# elastic-net bundle) before run_scripts() is invoked.
gwas.get_script(*lasso_vars)
gwas.get_script(*elastic_net_vars)
gwas.run_scripts()

In [None]:
# processor
# Builds a Processor from the shared `structure` and `params` objects and calls
# its four steps in the order listed below.
processor = Processor(structure, params)
processor.concatenate_pyseer()
# NOTE(review): presumably run=False skips recomputing the metrics — confirm
# against scripts/processing.py.
processor.compute_metrics(run=False)
processor.combine_info_bootstrap()
processor.pyseer_hits_with_CI()