Skip to content

Commit

Permalink
added msa_size calculation
Browse files Browse the repository at this point in the history
  • Loading branch information
tkosciol committed Sep 26, 2017
1 parent 618d1d0 commit ea48820
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 32 deletions.
28 changes: 13 additions & 15 deletions microprot/scripts/snakemake_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,17 +48,18 @@ def msa_size(msa_fp):
Parameters
----------
msa_fp : str
File path to an MSA file (a3m or just root of file name)
File path to an MSA file (a3m or root of file name)
Returns
-------
msa_size : int
size of an MSA
"""
# msa_dir, msa_ext = os.path.splitext(os.path.abspath(msa_fp))
# if msa_ext != '.a3m':
# msa_ext = '.a3m'
# with open(''.join([msa_dir, msa_ext]), 'r') as f:
msa_dir, msa_ext = os.path.splitext(os.path.abspath(msa_fp))
if msa_ext != '.a3m':
msa_ext = '.a3m'
msa_fp = ''.join([msa_dir, msa_ext])

with open(msa_fp, 'r') as f:
lines = f.readlines()
msa_size = sum(1 for line in lines if line.startswith('>')) - 1
Expand Down Expand Up @@ -131,18 +132,15 @@ def write_db(fname, step=None, version=1, db_fp='/tmp/protein_db'):
for prot in prots:
prot_name = prot.metadata['id']
timestamp = str(datetime.now()).split('.')[0]
"""
`msa_fp` needs to be retrieved from 1 level up (e.g. 01 folder,
instead of 02 folder. For future consideration. Added as issue #48
"""
# msa_fp = '%s/%s' % (trim(fname, '/'), prot_name)
# msa_size_len = msa_size(msa_fp)

# > protein_name # source # commit_no # timestamp # msa_size
append_idx = '>%s # %s # %i # %s\n' % (prot_name,
msa_size_len = msa_size(fname)

# > protein_name # source # msa_size # commit_no # timestamp
append_idx = '>%s # %s # %i # %i # %s\n' % (prot_name,
step,
msa_size_len,
version,
timestamp)
timestamp
)
with open('%s.index' % db_fp, 'a') as f:
f.write(append_idx)

Expand Down
24 changes: 7 additions & 17 deletions microprot/snakemake/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,6 @@ def copy_out(source_dir='01-PDB', dest_dir='PDB', tempdir='/dev/null'):
if len(X_match) > 0:
X_match_names = [re.sub('.match', '', _match) for _match in X_match
if snakemake_helpers.not_empty(_match)]
# TODO
# fix needed here - returns all files, not only non_empty match
for _match in X_match_names:
shell('rsync -rzq --min-size=1 {_match}.{{match,out,a3m}} \
{config[MICROPROT_OUT]}/{dest_dir}/ 2> /dev/null')
Expand Down Expand Up @@ -193,39 +191,31 @@ rule search_split_CM:
log=log[0])


rule search_Pfam:
rule search_split_Pfam:
input:
config['MICROPROT_TEMP']+'/{seq}/02-CM/{seq}'
output:
temp(config['MICROPROT_TEMP']+'/{seq}/03-search_Pfam/{seq}')
temp(config['MICROPROT_TEMP']+'/{seq}/03-Pfam/{seq}')
log:
config['MICROPROT_TEMP']+'/{seq}/{seq}.log'
config['MICROPROT_TEMP']+'/{seq}/{seq}'
threads: config['THREADS']
run:
if snakemake_helpers.not_empty(input[0]):
shell('echo "pass" > {output}')
else:
shell('echo -e "SEARCH PFAM\n-----------" >> {log}')
logfile = '%s.log' % log[0]
search_x(input[0], output[0],
params=config['search_Pfam']['params'],
n_cpu=config['THREADS'],
dbs=config['search_Pfam']['DB'],
log=log[0], hh='hhsearch',
match_exp='non_match')


rule split_Pfam:
input:
rules.search_Pfam.output[0]
output:
temp(config['MICROPROT_TEMP']+'/{seq}/03-split_Pfam/{seq}')
log:
config['MICROPROT_TEMP']+'/{seq}/{seq}'
run:
if snakemake_helpers.not_empty(input[0]):
if snakemake_helpers.not_empty(output[0]):
shell('echo "pass" > {output}')
else:
split_x(input[0], output[0],
split_x(output[0], output[0],
step="Pfam",
frag_len=config['split_Pfam']['params']['min_fragment_length'],
e_val=config['split_Pfam']['params']['max_evalue'],
Expand All @@ -235,7 +225,7 @@ rule split_Pfam:

rule MSA_hhblits:
input:
config['MICROPROT_TEMP']+'/{seq}/03-split_Pfam/{seq}'
config['MICROPROT_TEMP']+'/{seq}/03-Pfam/{seq}'
output:
temp(config['MICROPROT_TEMP']+'/{seq}/04-MSA_hhblits/{seq}')
log:
Expand Down

0 comments on commit ea48820

Please sign in to comment.