Skip to content

Commit

Permalink
feat(pfam/tigrfam): Add PFAM/TIGRFAM output parsers.
Browse files Browse the repository at this point in the history
  • Loading branch information
aaronmussig committed Mar 29, 2022
1 parent 625c597 commit 35c0755
Show file tree
Hide file tree
Showing 5 changed files with 162 additions and 0 deletions.
13 changes: 13 additions & 0 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,19 @@ This has been written with the intention of personal use, but feel free to use/c

diamond/index

.. toctree::
:caption: PFAM
:titlesonly:

pfam/index


.. toctree::
:caption: TIGRFAM
:titlesonly:

tigrfam/index


.. toctree::
:caption: Utility
Expand Down
12 changes: 12 additions & 0 deletions docs/source/pfam/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
****
PFAM
****

Helper functions
----------------

.. autofunction:: magna.pfam.read_pfam


.. autofunction:: magna.pfam.read_pfam_tophit

13 changes: 13 additions & 0 deletions docs/source/tigrfam/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
*******
TIGRFAM
*******


Helper functions
----------------

.. autofunction:: magna.tigrfam.read_tigrfam


.. autofunction:: magna.tigrfam.read_tigrfam_tophit

59 changes: 59 additions & 0 deletions magna/pfam/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import numpy as np
import pandas as pd


def read_pfam(path: str) -> pd.DataFrame:
    """Read the PFAM file into a typed DataFrame.

    Blank lines and lines starting with ``#`` are skipped. Every remaining
    line is split on runs of whitespace and is expected to yield one field
    per output column.

    Args:
        path: The path to the PFAM file.

    Returns:
        A DataFrame with one row per data line. Columns are, in order:
        ``seq_id``, ``aln_start``, ``aln_end``, ``env_start``, ``env_end``,
        ``hmm_acc``, ``hmm_name``, ``type``, ``hmm_start``, ``hmm_end``,
        ``hmm_length``, ``bit_score``, ``e_value``, ``significance`` and
        ``clan``, converted to the dtypes declared in ``dtype`` below.
    """
    dtype = {
        'seq_id': object,
        'aln_start': np.uintc,
        'aln_end': np.uintc,
        'env_start': np.uintc,
        'env_end': np.uintc,
        'hmm_acc': object,
        'hmm_name': object,
        'type': object,
        'hmm_start': np.uintc,
        'hmm_end': np.uintc,
        'hmm_length': np.float64,
        'bit_score': np.float64,
        'e_value': np.float64,
        'significance': np.float64,
        'clan': object,
    }
    rows = []
    with open(path, 'r') as f:
        # Iterate the handle directly so the file is streamed line by line
        # rather than loaded in full by readlines().
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            rows.append(line.split())
    df = pd.DataFrame(rows, columns=list(dtype))
    # Passing the mapping as ``columns`` only names the columns; the values
    # are still the raw strings from split(), so cast them explicitly.
    return df.astype(dtype)


def read_pfam_tophit(path: str) -> pd.DataFrame:
    """Read the PFAM tophit file into a typed, one-row-per-hit DataFrame.

    The first line is treated as a header and discarded. Each following
    line holds a gene identifier, a tab, and a ``;``-separated list of hits,
    where every hit is ``accession,e-value,bit-score``. Blank lines, lines
    with no hit field, and empty hit entries (such as after a trailing
    ``;``) are skipped.

    Args:
        path: The path to the PFAM tophit file.

    Returns:
        A DataFrame with the columns ``seq_id``, ``pfam_acc``, ``e_value``
        and ``bit_score``; the last two are ``float64``.

    Raises:
        ValueError: If a line contains more than one tab, or a hit does not
            consist of exactly three comma-separated fields.
    """
    dtype = {
        'seq_id': object,
        'pfam_acc': object,
        'e_value': np.float64,
        # float64 rather than an unsigned integer so fractional bit scores
        # convert cleanly; this matches the bit_score dtype in read_pfam.
        'bit_score': np.float64,
    }
    rows = []
    with open(path, 'r') as f:
        f.readline()  # discard the header line
        for line in f:
            line = line.strip()
            if not line:
                continue
            fields = line.split('\t')
            if len(fields) == 1:
                # strip() removed a trailing tab: this gene has no hits.
                continue
            gene_id, hits = fields
            for hit in hits.split(';'):
                if not hit:
                    continue
                pfam_acc, e_val, bit_score = hit.split(',')
                rows.append([gene_id, pfam_acc, e_val, bit_score])
    df = pd.DataFrame(rows, columns=list(dtype))
    # The mapping only names the columns above; cast the raw strings here.
    return df.astype(dtype)
65 changes: 65 additions & 0 deletions magna/tigrfam/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import numpy as np
import pandas as pd


def read_tigrfam(path: str) -> pd.DataFrame:
    """Read the TIGRFAM file into a typed DataFrame.

    Blank lines and lines starting with ``#`` are skipped. Every remaining
    line is split on runs of whitespace. Field 0 becomes ``seq_id``, field 3
    becomes ``hmm_acc``, fields 4 through 17 become the fourteen numeric
    columns, and any remaining fields are re-joined with single spaces to
    form ``description``. Fields 1 and 2 are not used.

    Args:
        path: The path to the TIGRFAM file.

    Returns:
        A DataFrame with one row per data line and the columns listed in
        ``dtype`` below, converted to the declared dtypes.

    Raises:
        IndexError: If a data line has fewer than four fields.
    """
    dtype = {
        'seq_id': object,
        'hmm_acc': object,
        'full_seq_e_value': np.float64,
        'full_seq_score': np.float64,
        'full_seq_bias': np.float64,
        'best_domain_e_value': np.float64,
        'best_domain_score': np.float64,
        'best_domain_bias': np.float64,
        'exp': np.float64,
        'reg': np.float64,
        'clu': np.float64,
        'ov': np.float64,
        'env': np.float64,
        'dom': np.float64,
        'rep': np.float64,
        'inc': np.float64,
        'description': object,
    }
    rows = []
    with open(path, 'r') as f:
        # Iterate the handle directly so the file is streamed line by line
        # rather than loaded in full by readlines().
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            cols = line.split()
            # The description may itself contain spaces, so everything from
            # field 18 onward is folded back into a single string.
            rows.append([cols[0], cols[3], *cols[4:18], ' '.join(cols[18:])])
    df = pd.DataFrame(rows, columns=list(dtype))
    # Passing the mapping as ``columns`` only names the columns; the values
    # are still the raw strings from split(), so cast them explicitly.
    return df.astype(dtype)


def read_tigrfam_tophit(path: str) -> pd.DataFrame:
    """Read the TIGRFAM tophit file into a typed, one-row-per-hit DataFrame.

    The first line is treated as a header and discarded. Each following
    line holds a gene identifier, a tab, and a ``;``-separated list of hits,
    where every hit is ``accession,e-value,bit-score``. Blank lines, lines
    with no hit field, and empty hit entries (such as after a trailing
    ``;``) are skipped.

    Args:
        path: The path to the TIGRFAM tophit file.

    Returns:
        A DataFrame with the columns ``seq_id``, ``pfam_acc``, ``e_value``
        and ``bit_score``; the last two are ``float64``.

    Raises:
        ValueError: If a line contains more than one tab, or a hit does not
            consist of exactly three comma-separated fields.
    """
    dtype = {
        'seq_id': object,
        # NOTE(review): this column holds the hit accession from a TIGRFAM
        # file but is named 'pfam_acc'; the name is kept unchanged so that
        # existing callers keep working - confirm whether a rename is wanted.
        'pfam_acc': object,
        'e_value': np.float64,
        # float64 rather than an unsigned integer so fractional bit scores
        # convert cleanly; integer-valued scores still parse as floats.
        'bit_score': np.float64,
    }
    rows = []
    with open(path, 'r') as f:
        f.readline()  # discard the header line
        for line in f:
            line = line.strip()
            if not line:
                continue
            fields = line.split('\t')
            if len(fields) == 1:
                # strip() removed a trailing tab: this gene has no hits.
                continue
            gene_id, hits = fields
            for hit in hits.split(';'):
                if not hit:
                    continue
                hit_acc, e_val, bit_score = hit.split(',')
                rows.append([gene_id, hit_acc, e_val, bit_score])
    df = pd.DataFrame(rows, columns=list(dtype))
    # The mapping only names the columns above; cast the raw strings here.
    return df.astype(dtype)

0 comments on commit 35c0755

Please sign in to comment.