feat(pfam/tigrfam): Add PFAM/TIGRFAM output parsers.

aaronmussig · Mar 29, 2022 · 35c0755 · 35c0755
1 parent 625c597
commit 35c0755
Show file tree

Hide file tree

Showing 5 changed files with 162 additions and 0 deletions.
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -31,6 +31,19 @@ This has been written with the intention of personal use, but feel free to use/c
 
     diamond/index
 
+.. toctree::
+    :caption: PFAM
+    :titlesonly:
+
+    pfam/index
+
+
+.. toctree::
+    :caption: TIGRFAM
+    :titlesonly:
+
+    tigrfam/index
+
 
 .. toctree::
    :caption: Utility

diff --git a/docs/source/pfam/index.rst b/docs/source/pfam/index.rst
@@ -0,0 +1,12 @@
+****
+PFAM
+****
+
+Helper functions
+----------------
+
+.. autofunction:: magna.pfam.read_pfam
+
+
+.. autofunction:: magna.pfam.read_pfam_tophit
+
diff --git a/docs/source/tigrfam/index.rst b/docs/source/tigrfam/index.rst
@@ -0,0 +1,13 @@
+*******
+TIGRFAM
+*******
+
+
+Helper functions
+----------------
+
+.. autofunction:: magna.tigrfam.read_tigrfam
+
+
+.. autofunction:: magna.tigrfam.read_tigrfam_tophit
+
diff --git a/magna/pfam/__init__.py b/magna/pfam/__init__.py
@@ -0,0 +1,59 @@
+import numpy as np
+import pandas as pd
+
+
+def read_pfam(path: str) -> pd.DataFrame:
+    """Read a PFAM output file into a typed DataFrame.
+
+    Blank lines and lines starting with ``#`` are skipped. Every other line
+    is split on whitespace and its fields are assigned, in order, to the
+    columns listed in ``dtype`` below.
+
+    Args:
+        path: The path to the PFAM file.
+
+    Returns:
+        One row per retained line, with each column cast to the type
+        declared in ``dtype``. An input without data lines yields an empty
+        frame that still carries those columns and types.
+
+    Raises:
+        ValueError: If the rows do not fit the declared columns, or a field
+            in a numeric column is not valid numeric text.
+    """
+    dtype = {
+        'seq_id': object,
+        'aln_start': np.uintc,
+        'aln_end': np.uintc,
+        'env_start': np.uintc,
+        'env_end': np.uintc,
+        'hmm_acc': object,
+        'hmm_name': object,
+        'type': object,
+        'hmm_start': np.uintc,
+        'hmm_end': np.uintc,
+        'hmm_length': np.float64,
+        'bit_score': np.float64,
+        'e_value': np.float64,
+        'significance': np.float64,
+        'clan': object,
+    }
+    rows = []
+    with open(path, 'r') as f:
+        # Iterate the handle directly rather than loading every line first.
+        for line in f:
+            line = line.strip()
+            if not line or line.startswith('#'):
+                continue
+            rows.append(line.split())
+    # Passing the mapping as ``columns`` only names the columns; the explicit
+    # ``astype`` is what converts the parsed text into the declared types.
+    return pd.DataFrame(rows, columns=list(dtype)).astype(dtype)
+
+
+def read_pfam_tophit(path: str) -> pd.DataFrame:
+    """Read a PFAM tophit file into a typed, one-hit-per-row DataFrame.
+
+    The first line of the file is discarded. Each remaining non-blank line
+    must hold a gene identifier and a hit list separated by a single tab;
+    the hit list is ``;``-separated and each hit is three ``,``-separated
+    fields (accession, e-value, bit score).
+
+    Args:
+        path: The path to the PFAM tophit file.
+
+    Returns:
+        One row per hit with the columns ``seq_id``, ``pfam_acc``,
+        ``e_value`` and ``bit_score``, cast to the types in ``dtype``.
+
+    Raises:
+        ValueError: If a non-blank line is not ``<gene>\\t<hits>``, a hit
+            does not have exactly three fields, or a numeric field is not
+            valid numeric text.
+    """
+    dtype = {
+        'seq_id': object,
+        'pfam_acc': object,
+        'e_value': np.float64,
+        # NOTE(review): float64 rather than an unsigned integer, matching the
+        # ``bit_score`` column of ``read_pfam``; an integer cast would reject
+        # score text containing a decimal point. Confirm against real files.
+        'bit_score': np.float64,
+    }
+    rows = []
+    with open(path, 'r') as f:
+        f.readline()  # Discard the first line.
+        for line in f:
+            line = line.strip()
+            if not line:
+                # A blank (e.g. trailing) line would otherwise break the
+                # two-value unpacking below.
+                continue
+            gene_id, hits = line.split('\t')
+            for hit in hits.split(';'):
+                pfam_acc, e_val, bit_score = hit.split(',')
+                rows.append([gene_id, pfam_acc, e_val, bit_score])
+    # ``columns`` only names the columns; ``astype`` applies the types.
+    return pd.DataFrame(rows, columns=list(dtype)).astype(dtype)
diff --git a/magna/tigrfam/__init__.py b/magna/tigrfam/__init__.py
@@ -0,0 +1,65 @@
+import numpy as np
+import pandas as pd
+
+
+def read_tigrfam(path: str) -> pd.DataFrame:
+    """Read a TIGRFAM output file into a typed DataFrame.
+
+    Blank lines and lines starting with ``#`` are skipped. Every other line
+    is split on whitespace; field 0 becomes ``seq_id``, field 3 becomes
+    ``hmm_acc``, fields 4 to 17 fill the fourteen numeric columns, and all
+    remaining fields are re-joined with single spaces as ``description``.
+    Fields 1 and 2 are not kept.
+
+    Args:
+        path: The path to the TIGRFAM file.
+
+    Returns:
+        One row per retained line, with each column cast to the type
+        declared in ``dtype``.
+
+    Raises:
+        IndexError: If a retained line has fewer than four fields.
+        ValueError: If the rows do not fit the declared columns, or a field
+            in a numeric column is not valid numeric text.
+    """
+    dtype = {
+        'seq_id': object,
+        'hmm_acc': object,
+        'full_seq_e_value': np.float64,
+        'full_seq_score': np.float64,
+        'full_seq_bias': np.float64,
+        'best_domain_e_value': np.float64,
+        'best_domain_score': np.float64,
+        'best_domain_bias': np.float64,
+        'exp': np.float64,
+        'reg': np.float64,
+        'clu': np.float64,
+        'ov': np.float64,
+        'env': np.float64,
+        'dom': np.float64,
+        'rep': np.float64,
+        'inc': np.float64,
+        'description': object,
+    }
+    rows = []
+    with open(path, 'r') as f:
+        for line in f:
+            line = line.strip()
+            if not line or line.startswith('#'):
+                continue
+            cols = line.split()
+            # The description is free text that was broken apart by the
+            # whitespace split, so it is stitched back together here.
+            rows.append([cols[0], cols[3], *cols[4:18], ' '.join(cols[18:])])
+    # ``columns`` only names the columns; ``astype`` applies the types.
+    return pd.DataFrame(rows, columns=list(dtype)).astype(dtype)
+
+
+def read_tigrfam_tophit(path: str) -> pd.DataFrame:
+    """Read a TIGRFAM tophit file into a typed, one-hit-per-row DataFrame.
+
+    The first line of the file is discarded. Each remaining non-blank line
+    must hold a gene identifier and a hit list separated by a single tab;
+    the hit list is ``;``-separated and each hit is three ``,``-separated
+    fields (accession, e-value, bit score).
+
+    Args:
+        path: The path to the TIGRFAM tophit file.
+
+    Returns:
+        One row per hit with the columns ``seq_id``, ``pfam_acc``,
+        ``e_value`` and ``bit_score``, cast to the types in ``dtype``.
+
+    Raises:
+        ValueError: If a non-blank line is not ``<gene>\\t<hits>``, a hit
+            does not have exactly three fields, or a numeric field is not
+            valid numeric text.
+    """
+    dtype = {
+        'seq_id': object,
+        # NOTE(review): the accession column is called ``pfam_acc`` although
+        # this reader is for TIGRFAM; the name is kept so existing callers
+        # that select this column keep working.
+        'pfam_acc': object,
+        'e_value': np.float64,
+        # NOTE(review): float64 rather than an unsigned integer, because an
+        # integer cast would reject score text containing a decimal point.
+        # Confirm against real files.
+        'bit_score': np.float64,
+    }
+    rows = []
+    with open(path, 'r') as f:
+        f.readline()  # Discard the first line.
+        for line in f:
+            line = line.strip()
+            if not line:
+                # A blank (e.g. trailing) line would otherwise break the
+                # two-value unpacking below.
+                continue
+            gene_id, hits = line.split('\t')
+            for hit in hits.split(';'):
+                hmm_acc, e_val, bit_score = hit.split(',')
+                rows.append([gene_id, hmm_acc, e_val, bit_score])
+    # ``columns`` only names the columns; ``astype`` applies the types.
+    return pd.DataFrame(rows, columns=list(dtype)).astype(dtype)