### Workflow from genomes to families tables
* Setup dir structure
* Cluster ORFs and select representative sequences (reprseqs)
* Create HMM profiles with uniclust
* Create db from set of profiles
* Run all vs all comparison of profiles
* Collect results of all-vs-all into table

This notebook does **not** cover running the profiles against HH-suite-formatted databases.

In [1]:
# Enable the IPython autoreload extension; mode 2 reloads all imported modules
# before each cell execution, so edits to the pipeline library are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised by pandas or the
# pipeline library are hidden as well.
warnings.filterwarnings('ignore')

In [219]:
### setup ###

### imports

import random
import sys
pipeline_env_path = '/Users/kszczepaniak/Code/phage-pipeline-env'
lib_pp_path =       '/Users/kszczepaniak/Code/phage-pipeline-env/phage-pipeline'
sys.path.append(lib_pp_path)
import submodules.pdf.domain_split as ds
from lib_phage.clustering import cluster_proteins
from lib_phage.utils import setup_dir_tree, fetch_and_rename_protein_ids, build_hhr_table
from lib_phage.utils import process_phanotate_output, create_reprseq_profile_from_clustering
from lib_phage.utils import create_bash_script_to_parse_hhr_results, run_parsing_with_bash
from lib_phage.utils import concatenate_parsing_results, clean_clustering_partial_data
from lib_phage.logs import check_input_repr_prot_selection, validate_output_repr_prot_selection
from lib_phage.logs import check_input_all_vs_all_HMM, save_params_hhblits, validate_output_hhblits
from lib_phage.logs import validate_create_db, validate_search_all_vs_all, validate_input_ECF, validate_output_ECF
from lib_phage.prot_compare import save_individual_seqs, run_hhblits, build_hh_db, run_all_vs_all
from lib_phage.ecf_finder_wrapper import load_and_filter_data, store_scan_results

from lib_phage.repr_hits_pairwise import get_prob_cov

### run mode
# Selects which branch the profile-creation and db-building cells below execute,
# and which 'all-by-all/<run_mode>/' output directory is read later on.
run_mode = 'hhblits' # profile creation mode [mmseqs/hhblits]

### paths

# data sources

# cds_all_filepath = '/Users/kszczepaniak/Data/Phage/Rafal_Dropbox/data/input/combined/coding-seqs/cds-aa.fa.gz'
# Input handed to process_phanotate_output in the next step.
phanotate_filepath = '/Users/kszczepaniak/Data/Phage/phage-pipeline-input-dev/seq-test.phanotate.txt'

# work dirs
# Keep the trailing slash: later cells build paths by plain string concatenation
# (e.g. work_dir + 'input/coding-seqs/cds-aa.fa.gz').
work_dir = '/Users/kszczepaniak/Data/Phage/phage-pipeline-workdir-dev/'
# Called at setup time with the work dir; the captured output lists the
# tmp/, input/, output/ and intermediate/ subdirectories it reports on.
setup_dir_tree(work_dir)

# binaries and libraries

# NOTE(review): bare command name, so presumably resolved via PATH — confirm.
mmseqs_binpath  = 'mmseqs'
# Passed to run_hhblits as uniref_db_path in the profile-creation step.
uniref_db_path  = '/Users/kszczepaniak/Data/Phage/db/UniRef30_2020_06/UniRef30_2020_06'
# HH-suite locations passed to run_hhblits, build_hh_db and run_all_vs_all.
hhsuite_bins    = '/Users/kszczepaniak/Tools/hh-suite/build/bin'
hhsuite_scripts = '/Users/kszczepaniak/Tools/hh-suite/build/scripts'

/tmp/ directory already set up
/tmp/repr-proteins/ directory already set up
/tmp/mmseqs/ directory already set up
/tmp/all-by-all/ directory already set up
/tmp/all-by-all/individual-seqs/ directory already set up
/tmp/parse/ directory already set up
/tmp/prot-families/ directory already set up
/tmp/prot-families/pair_table_chunks/ directory already set up
/input/ directory already set up
/input/phanotate/ directory already set up
/input/coding-seqs/ directory already set up
/output/ directory already set up
/output/prot-families/ directory already set up
/output/prot-families/representative/ directory already set up
/output/prot-families/all-by-all/ directory already set up
//output/prot-families/functional/ directory already set up
//output/prot-families/families/ directory already set up
/output/prot-families/all-by-all/hhblits/ directory already set up
/output/prot-families/all-by-all/mmseqs/ directory already set up
/intermediate/ directory already set up
/intermediate/prot-famili

In [8]:
### Create protein db from Phanotate data ###

# translate Phanotate & compress with bgzip
# Inputs: the Phanotate file configured in the setup cell and the pipeline work dir.
# NOTE(review): presumably this writes input/coding-seqs/cds-aa.fa.gz under work_dir,
# which the clustering step reads next — confirm against process_phanotate_output.
process_phanotate_output(phanotate_filepath=phanotate_filepath, work_dir=work_dir)

All fasta translated. File compressed.


In [9]:
### Get representative proteins ###

# clustering thresholds; the same three values go to the clustering call
# and to the validation/log call at the end of the step
cluster_params_min_seqid   = 0.3
cluster_params_sensitivity = 7
cluster_params_coverage    = 0.95

# locations that are referenced more than once below
cds_fasta_filepath = work_dir + 'input/coding-seqs/cds-aa.fa.gz'
repr_output_dirpath = work_dir + 'output/prot-families/representative'

# input integrity check; it also covers the "step already executed" case and
# warns about overwriting existing data
if check_input_repr_prot_selection():

    # cluster the coding sequences with mmseqs
    clustering_filepath, clustering_msa_filepath = cluster_proteins(
        input_fasta_filepath=cds_fasta_filepath,
        output_dirpath=repr_output_dirpath,
        mmseqs_tempdir=work_dir + 'tmp/mmseqs',
        mmseqs_binpath=mmseqs_binpath,
        cluster_params_min_seqid=cluster_params_min_seqid,
        cluster_params_sensitivity=cluster_params_sensitivity,
        cluster_params_coverage=cluster_params_coverage,
        verbose=True,
    )

    # rename protein ids based on the clustering table and the input fasta
    no_repr_prot, name_table_filepath = fetch_and_rename_protein_ids(
        work_dir,
        clustering_filepath,
        cds_fasta_filepath,
    )

    # check the produced files and store the log for this step
    validate_output_repr_prot_selection(
        work_dir=work_dir,
        output_dirpath=repr_output_dirpath,
        cluster_params_min_seqid=cluster_params_min_seqid,
        cluster_params_sensitivity=cluster_params_sensitivity,
        cluster_params_coverage=cluster_params_coverage,
    )

Creating db... Done!
Clustering... Done!
Generating a clustering table... Done!
Creating MSA... Done!
Validation success, log file stored.


In [28]:
### Create profiles from mmseqs clusters ###
# profile for singleton in clustering is the sequence itself

# Uses clustering_filepath and clustering_msa_filepath, which are assigned only
# inside the guarded branch of the representative-protein step; if that check
# did not pass in this session, these names are undefined here.
# NOTE(review): unlike the neighbouring steps, this call is not gated on run_mode,
# although it writes to the '.../profiles/mmseqs' directory — confirm this is intended.
# NOTE(review): cds_all_filepath points at the uncompressed 'cds-aa.fa', while the
# clustering step reads 'cds-aa.fa.gz' — confirm both files exist.
create_reprseq_profile_from_clustering(clustering_filepath, clustering_msa_filepath,
                                       cds_all_filepath = work_dir + 'input/coding-seqs/cds-aa.fa',
                                       profile_outdir = work_dir + 'intermediate/prot-families/profiles/mmseqs')

In [10]:
### Perform all vs all comparison ###
# create profiles for each protein with hhblits #

# profile-creation parameters
cpu  = 2 # max number of CPUs to be used in the step
n    = 1
mact = 0.35
p    = 90
qid  = 10
cov  = 10

# options shared by the hhblits run and the parameter log
hhblits_params = dict(n=n, mact=mact, p=p, qid=qid, cov=cov)

# Only relevant in hhblits mode. The input check is evaluated solely when the
# mode matches (short-circuit), exactly like a nested `if` would.
if run_mode == 'hhblits' and check_input_all_vs_all_HMM(work_dir=work_dir, force=True):

    # write one file per sequence, then build the profiles
    save_individual_seqs(work_dir=work_dir)

    run_hhblits(work_dir=work_dir,
                hhsuite_bins=hhsuite_bins,
                hhsuite_scripts=hhsuite_scripts,
                cpu=cpu,
                uniref_db_path=uniref_db_path,
                **hhblits_params)

    # record the parameters in the step log
    save_params_hhblits(work_dir=work_dir, **hhblits_params)

This will overwrite all data from this step. Proceed? [y/n]y
Clearing /Users/kszczepaniak/Data/Phage/phage-pipeline-workdir-dev/intermediate/prot-families/all-by-all...
Parameters saved, log file stored.


In [30]:
# Build db #

# Decide whether the database should be built:
#   hhblits -> only when the previous (hhblits) step validates
#   mmseqs  -> always
#   other   -> never
if run_mode == 'hhblits':
    db_build_allowed = validate_output_hhblits(work_dir)
elif run_mode == 'mmseqs':
    db_build_allowed = True
else:
    db_build_allowed = False

# the build call itself is identical for both supported modes
if db_build_allowed:
    build_hh_db(work_dir=work_dir,
                hhsuite_bins=hhsuite_bins,
                hhsuite_scripts=hhsuite_scripts,
                verbose=True,
                run_mode=run_mode)


Database already exists. Overwrite? [y/n]y
Database cleaned.
Concatenated a3m alignments.
Created HMM profiles.
Created column state (CS) sequence database.
DB sorted.
DB successfuly created.


In [31]:
# search all vs all #

# all-vs-all search parameters
cpu  = 2 # max number of CPUs to be used in the step
n    = 1
p    = 90

# the search runs only if the db-creation step validates
db_is_ready = validate_create_db(work_dir=work_dir)

if db_is_ready:
    run_all_vs_all(
        work_dir=work_dir,
        hhsuite_bins=hhsuite_bins,
        hhsuite_scripts=hhsuite_scripts,
        cpu=cpu,
        n=n,
        p=p,
        a3m_wildcard='reprseq*a3m',
        run_mode=run_mode,
    )

Parameters saved, log file stored.


In [32]:
# create results table #

# check if previous step complete
# The table is built only when the all-vs-all search validates for the
# currently selected run_mode.
if validate_search_all_vs_all(work_dir, run_mode=run_mode):
    
    # create results table
    # NOTE(review): presumably writes the 'table-hhr.txt' file that the
    # following cells load from the all-by-all output directory — confirm.
    build_hhr_table(work_dir, run_mode=run_mode)


HHblits all-vs-all step complete. Updated status.


In [192]:
# repr-hits-pairwise #

# pandas is imported here because, read top to bottom, no earlier cell brings it
# into scope; without this the cell fails with NameError on a fresh kernel.
import pandas as pd

## define paths
# output_dir is defined here for the same reason: the only other definition sits
# in a cell further down the notebook.
output_dir = work_dir + 'output/'
inter_dir = work_dir + 'intermediate/'
all_by_all_output_dir = output_dir + 'prot-families/all-by-all/' + run_mode + '/'
families_output_dir = output_dir + 'prot-families/families/'

# load hhr_table
# comma-separated table of all-vs-all hits for the selected run_mode
table_hhr_filename = all_by_all_output_dir + 'table-hhr.txt'
table_hhr = pd.read_csv(table_hhr_filename, sep=',')
table_hhr

Unnamed: 0,qname,qstart,qend,qlength,sname,sstart,send,slength,pident,bitscore,eval,prob,pval
0,reprseq00001,1,86,86,reprseq00001,1,86,86,100,246.5,5.300000e-47,100.0,7.700000e-51
1,reprseq00001,30,44,86,reprseq14105,27,41,73,20,25.8,9.300000e-01,56.2,1.100000e-04
2,reprseq00002,1,125,125,reprseq00002,1,125,125,100,374.6,8.800000e-65,100.0,1.300000e-68
3,reprseq00002,1,125,125,reprseq13080,1,125,138,40,364.7,5.900000e-62,100.0,8.200000e-66
4,reprseq00002,1,125,125,reprseq17146,28,158,158,33,367.2,3.400000e-61,100.0,4.600000e-65
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1382435,reprseq34463,55,76,200,reprseq32916,39,60,113,27,29.7,1.100000e+00,51.5,1.500000e-04
1382436,reprseq34464,1,187,187,reprseq34464,1,187,187,100,463.6,1.400000e-72,100.0,2.100000e-76
1382437,reprseq34464,9,183,187,reprseq06995,1,181,181,32,432.2,8.900000e-68,100.0,1.400000e-71
1382438,reprseq34464,6,73,187,reprseq13037,48,109,201,27,34.7,5.900000e-01,63.0,7.000000e-05


In [142]:
## get unique pairs
## unique (qname, sname) id pairs from table_hhr, self-hits excluded

# rows where a sequence hits itself
is_self_hit = table_hhr['qname'] == table_hhr['sname']

# keep only the two id columns of the remaining rows, one row per distinct pair
pair_table = table_hhr.loc[~is_self_hit, ['qname', 'sname']].drop_duplicates()
pair_table

Unnamed: 0,qname,sname
1,reprseq00001,reprseq14105
3,reprseq00002,reprseq13080
4,reprseq00002,reprseq17146
6,reprseq00003,reprseq27929
8,reprseq00004,reprseq08141
...,...,...
1382434,reprseq34463,reprseq04091
1382435,reprseq34463,reprseq32916
1382437,reprseq34464,reprseq06995
1382438,reprseq34464,reprseq13037


In [235]:
#test#
# Development subset: only the first 1000 pairs are carried into the chunking
# and script-generation cells below.
# NOTE(review): switch back to the full pair_table for a production run.
pair_table_test = pair_table[:1000]

In [236]:
# create script to run in parallel
# Number of chunks the pair table is split into; the generated bash script
# starts one background python process per chunk.
n_cores = 4

In [237]:
#!! pair_table_test !!#

# split the pair table into n_cores consecutive sub-tables
chunk_size = len(pair_table_test) // n_cores

# slice bounds: every chunk holds chunk_size rows, except the last one,
# which is open-ended and therefore also takes the remainder
chunk_starts = [k * chunk_size for k in range(n_cores)]
chunk_stops = chunk_starts[1:] + [None]
pair_table_parts = [pair_table_test[lo:hi] for lo, hi in zip(chunk_starts, chunk_stops)]

# one csv per chunk, named by chunk index
parts_dir = work_dir + 'tmp/prot-families/pair_table_chunks/'
for i, pair_table_part in enumerate(pair_table_parts):
    part_filepath = f'{parts_dir}pair-table-{i}.csv'
    pair_table_part.to_csv(part_filepath, index=False)

In [238]:
# bash launcher that processes every sub-table in its own background process
script_filepath =  work_dir + 'tmp/prot-families/run-pairwise-hits.sh'
prob_threshold = 50

# one launch line per chunk: worker script location, then its five arguments
launch_template = 'nohup python3 {}/phage-pipeline/lib_phage/run_hits_pairwise_single_table.py {} {} {} {} {} &\n'

# script header: shebang plus activation of the pipeline environment
script_pieces = [
    '#!/bin/bash\n\n',
    'source ' + pipeline_env_path + '/bin/activate\n',
]

for i, pair_table_part in enumerate(pair_table_parts):
    # path of the chunk csv; it is not part of the command line, the worker
    # only receives work_dir, run_mode and the chunk index
    pair_table_path = parts_dir + 'pair-table-' + str(i) + '.csv'
    script_pieces.append(
        launch_template.format(pipeline_env_path, work_dir, run_mode, i, prob_threshold, lib_pp_path)
    )

cmd = ''.join(script_pieces)

with open(script_filepath, 'w') as script_file:
    script_file.write(cmd)

In [97]:
# there are some paths that are used further in pipeline #

import pandas as pd
import numpy as np

## define paths
output_dir = f'{work_dir}output/'
prot_families_dir = f'{output_dir}prot-families/'
prot_families_temp_dir = f'{work_dir}tmp/prot-families/'

inter_dir = f'{work_dir}intermediate/'
all_by_all_output_dir = f'{output_dir}prot-families/all-by-all/{run_mode}/'
families_output_dir = f'{output_dir}prot-families/families/'

## load representative mappings
repr_dir = f'{work_dir}output/prot-families/representative/'
repr_table_filename = f'{repr_dir}name-table.txt'
repr_table = pd.read_csv(repr_table_filename, sep=',')
nr = len(repr_table)   # number of rows in the name table
nrd = len(str(nr))     # number of decimal digits in that count

## load the all-vs-all hit table for the selected run mode
table_hhr_filename = f'{all_by_all_output_dir}table-hhr.txt'
table_hhr = pd.read_csv(table_hhr_filename, sep=',')
table_hhr

Unnamed: 0,qname,qstart,qend,qlength,sname,sstart,send,slength,pident,bitscore,eval,prob,pval
0,reprseq00001,1,86,86,reprseq00001,1,86,86,100,246.5,5.300000e-47,100.0,7.700000e-51
1,reprseq00001,30,44,86,reprseq14105,27,41,73,20,25.8,9.300000e-01,56.2,1.100000e-04
2,reprseq00002,1,125,125,reprseq00002,1,125,125,100,374.6,8.800000e-65,100.0,1.300000e-68
3,reprseq00002,1,125,125,reprseq13080,1,125,138,40,364.7,5.900000e-62,100.0,8.200000e-66
4,reprseq00002,1,125,125,reprseq17146,28,158,158,33,367.2,3.400000e-61,100.0,4.600000e-65
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1382435,reprseq34463,55,76,200,reprseq32916,39,60,113,27,29.7,1.100000e+00,51.5,1.500000e-04
1382436,reprseq34464,1,187,187,reprseq34464,1,187,187,100,463.6,1.400000e-72,100.0,2.100000e-76
1382437,reprseq34464,9,183,187,reprseq06995,1,181,181,32,432.2,8.900000e-68,100.0,1.400000e-71
1382438,reprseq34464,6,73,187,reprseq13037,48,109,201,27,34.7,5.900000e-01,63.0,7.000000e-05
