fix(datasets): Reduced the gunc dataset to just RefSeq/GenBank genomes.

aaronmussig · Feb 2, 2022 · d432765 · d432765
1 parent d74eccf
commit d432765
Showing 1 changed file with 16 additions and 6 deletions.
diff --git a/magna/dataset/gunc/__init__.py b/magna/dataset/gunc/__init__.py
@@ -1,6 +1,5 @@
 import os
 import tempfile
-
 import numpy as np
 import pandas as pd
 
@@ -19,27 +18,38 @@ def __init__(self):
         self.df = self._read()
 
     def _read(self):
+        print('Note: Only the RefSeq and GenBank results are used.')
         return pd.read_feather(self.path)
 
     @staticmethod
     def _read_tsv(path):
         dtype = {
+            'genome': np.object,
             'n_genes_called': np.uintc,
             'n_genes_mapped': np.uintc,
             'n_contigs': np.uintc,
+            'taxonomic_level': np.object,
             'proportion_genes_retained_in_major_clades': np.float16,
             'genes_retained_index': np.float16,
             'clade_separation_score': np.float16,
             'contamination_portion': np.float16,
             'n_effective_surplus_clades': np.float16,
             'mean_hit_identity': np.float16,
             'reference_representation_score': np.float16,
+            'pass.GUNC': np.object,
+            'study': np.object,
         }
-        converters = {'pass.GUNC': lambda x: x == 'True'}
-        print(f'Note: Line 1,934,122 is skipped as GMGC.SAMEA2623756.bin.19 (GMGC unfiltered) is NaN.')
-        df = pd.read_csv(path, sep='\t', index_col=False, dtype=dtype,
-                         converters=converters, skiprows=[1934122])
-        return df
+        rows = list()
+        allowed_studies = frozenset({'GenBank', 'RefSeq'})
+        with open(path, 'r') as f:
+            header = {k: i for i, k in enumerate(
+                f.readline().strip().split('\t'))}
+            study_idx = header['study']
+            for line in f.readlines():
+                cols = line.strip().split('\t')
+                if cols[study_idx] in allowed_studies:
+                    rows.append(cols)
+        return pd.DataFrame(rows, columns=dtype)
 
     def _download(self):
         with tempfile.TemporaryDirectory() as tmpdir: