In [1]:
# Development convenience: enable IPython's autoreload extension in mode 2 so that
# edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [30]:
### setup ###

### imports

import os
import sys
sys.path.append('/Users/kszczepaniak/Code/phage-pipeline-env/phage-pipeline')
from lib_phage.clustering import cluster_proteins
from lib_phage.utils import setup_dir_tree, fetch_and_rename_protein_ids, build_hhr_table
from lib_phage.logs import check_input_repr_prot_selection, validate_output_repr_prot_selection
from lib_phage.logs import check_input_all_vs_all_HMM, save_params_hhblits, validate_output_hhblits
from lib_phage.logs import validate_create_db, validate_search_all_vs_all
from lib_phage.prot_compare import save_individual_seqs, run_hhblits, build_hh_db, run_all_vs_all

### paths
# NOTE(review): every path in this cell is an absolute location on one developer's
# machine; adjust them before running the notebook anywhere else.

# data sources

# Amino-acid CDS FASTA (gzip-compressed, judging by the extension); handed to
# cluster_proteins and fetch_and_rename_protein_ids in the clustering cell.
cds_all_filepath = '/Users/kszczepaniak/Data/Phage/Rafal_Dropbox/data/input/combined/coding-seqs/cds-aa.fa.gz'

# work dirs
# Root directory handed to every pipeline step. It ends with a trailing slash;
# keep it, since sub-paths may be derived by plain concatenation onto this
# string (TODO confirm inside lib_phage).
work_dir = '/Users/kszczepaniak/Data/Phage/phage-pipeline-workdir-dev/'
# Runs as a side effect of executing this cell. The recorded output reports the
# tmp/, output/, intermediate/ and log/ subtrees -- presumably located under
# work_dir (TODO confirm in lib_phage.utils).
setup_dir_tree(work_dir)

# binaries and libraries

# NOTE(review): mmseqs_binpath is not referenced by any cell visible in this
# section -- confirm whether it is still needed.
mmseqs_binpath  = 'mmseqs'
# Passed to run_hhblits as uniref_db_path.
uniref_db_path  = '/Users/kszczepaniak/Data/Phage/db/UniRef30_2020_06/UniRef30_2020_06'
# Passed to run_hhblits, build_hh_db and run_all_vs_all.
hhsuite_bins    = '/Users/kszczepaniak/Tools/hh-suite/build/bin'
hhsuite_scripts = '/Users/kszczepaniak/Tools/hh-suite/build/scripts'

/tmp/ directory already set up
/tmp/repr-proteins/ directory already set up
/tmp/mmseqs/ directory already set up
/tmp/all-by-all/ directory already set up
/tmp/all-by-all/individual-seqs/ directory already set up
/output/ directory already set up
/output/prot-families/ directory already set up
/output/prot-families/representative/ directory already set up
/output/prot-families/all-by-all/ directory already set up
/intermediate/ directory already set up
/intermediate/prot-families/ directory already set up
/intermediate/prot-families/profiles/ directory already set up
/intermediate/prot-families/all-by-all/ directory already set up
/intermediate/prot-families/db/ directory already set up
/log/ directory already set up


In [31]:
### Get representative proteins ###

# Clustering parameters: forwarded unchanged to cluster_proteins and to
# validate_output_repr_prot_selection below.
cluster_params_min_seqid   = 0.3
cluster_params_sensitivity = 7
cluster_params_coverage    = 0.95

# Build each directory path exactly once. os.path.join yields the same string as
# the former `work_dir + '...'` concatenation when work_dir ends with a slash,
# and still yields a valid path when it does not. Sharing one variable also
# guarantees that the directory that is validated is the directory that
# clustering wrote to.
repr_output_dirpath = os.path.join(work_dir, 'output/prot-families/representative')
mmseqs_tempdir      = os.path.join(work_dir, 'tmp/mmseqs')

# Guard: run the step only if the input check passes. Per the original author's
# note, the check covers input file integrity and warns about overwriting data
# when this step was already executed.
# NOTE(review): called without work_dir, unlike the checks used by the later
# steps -- confirm it inspects the intended directory.
if check_input_repr_prot_selection():

    # perform clustering
    clustering_filepath = cluster_proteins(input_fasta_filepath=cds_all_filepath,
                                           output_dirpath=repr_output_dirpath,
                                           mmseqs_tempdir=mmseqs_tempdir,
                                           cluster_params_min_seqid=cluster_params_min_seqid,
                                           cluster_params_sensitivity=cluster_params_sensitivity,
                                           cluster_params_coverage=cluster_params_coverage,
                                           verbose=True)

    # Returns a pair; judging by the names, a count of representative proteins
    # and the path of a name-mapping table (TODO confirm in lib_phage.utils).
    no_repr_prot, name_table_filepath = fetch_and_rename_protein_ids(work_dir, clustering_filepath, cds_all_filepath)

    # verify output and save log file
    validate_output_repr_prot_selection(work_dir=work_dir,
                                        output_dirpath=repr_output_dirpath,
                                        cluster_params_min_seqid=cluster_params_min_seqid,
                                        cluster_params_sensitivity=cluster_params_sensitivity,
                                        cluster_params_coverage=cluster_params_coverage)

Creating db... Done!
Clustering... Done!
Generating a clustering table... Done!
DEVEL: Restricting input to 16 proteins for fast calculation
Validation success, log file stored.


In [32]:
### Perform all vs all comparison ###
# create profiles for each protein with hhblits #

# hhblits settings used for profile creation
cpu  = 2
n    = 1
mact = 0.35
p    = 90
qid  = 10
cov  = 10

# The settings that are both used by the hhblits run and written to the log,
# collected once so the two calls below receive the same values.
hhblits_profile_params = dict(n=n, mact=mact, p=p, qid=qid, cov=cov)

# Only proceed when the check on the previous step passes.
if check_input_all_vs_all_HMM(work_dir=work_dir):

    # execute current step
    save_individual_seqs(work_dir=work_dir)

    run_hhblits(
        work_dir=work_dir,
        hhsuite_bins=hhsuite_bins,
        hhsuite_scripts=hhsuite_scripts,
        cpu=cpu,
        uniref_db_path=uniref_db_path,
        **hhblits_profile_params,
    )

    # record the parameters in the log (cpu is not among the logged values)
    save_params_hhblits(work_dir=work_dir, **hhblits_profile_params)

This step was already executed. Run validation fucntion with force=True to overwrite.


In [33]:
# build db #

# Gate on the validation of the hhblits output from the previous step.
hhblits_output_valid = validate_output_hhblits(work_dir)

if hhblits_output_valid:
    # NOTE: in the recorded run this cell asked interactively whether to
    # overwrite an existing database, so it may block an unattended "Run All".
    build_hh_db(
        work_dir=work_dir,
        hhsuite_bins=hhsuite_bins,
        hhsuite_scripts=hhsuite_scripts,
        verbose=True,
    )

Database already exists. Overwrite? [y/n]y
Database cleaned.
Concatenated a3m alignments.
Created HMM profiles.
Created column state (CS) sequence database.
DB sorted.
DB successfuly created.


In [157]:
# search all vs all #

# Search settings. These rebind the notebook-level names cpu, n and p that the
# profile-creation cell also assigns (to the same values at present).
cpu  = 2
n    = 1
p    = 90
all_vs_all_params = dict(cpu=cpu, n=n, p=p)

# Run the search only when the database-creation step validates.
if validate_create_db(work_dir=work_dir):

    run_all_vs_all(
        work_dir=work_dir,
        hhsuite_bins=hhsuite_bins,
        hhsuite_scripts=hhsuite_scripts,
        **all_vs_all_params,
    )

Parameters saved, log file stored.


In [26]:
# create results table #

# Confirm that the all-vs-all search step validates before tabulating.
all_vs_all_search_complete = validate_search_all_vs_all(work_dir)

if all_vs_all_search_complete:
    # build the results table from the contents of the working directory
    build_hhr_table(work_dir)
