Skip to content

Commit

Permalink
fix(datasets): Reduced the gunc dataset to just RefSeq/GenBank genomes.
Browse files Browse the repository at this point in the history
  • Loading branch information
aaronmussig committed Feb 2, 2022
1 parent d74eccf commit d432765
Showing 1 changed file with 16 additions and 6 deletions.
22 changes: 16 additions & 6 deletions magna/dataset/gunc/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import os
import tempfile

import numpy as np
import pandas as pd

Expand All @@ -19,27 +18,38 @@ def __init__(self):
self.df = self._read()

def _read(self):
    """Load the dataset from the Feather file at ``self.path``.

    A notice is printed first to remind the user that only the RefSeq and
    GenBank results are included.

    Returns:
        The ``pandas.DataFrame`` read from ``self.path``.
    """
    print('Note: Only the RefSeq and GenBank results are used.')
    frame = pd.read_feather(self.path)
    return frame

@staticmethod
def _read_tsv(path):
    """Parse the GUNC results TSV, keeping only GenBank and RefSeq rows.

    The first line of the file is treated as a tab-separated header and is
    used only to locate the ``study`` column. Every following non-blank line
    is split on tabs and kept when its ``study`` value is ``GenBank`` or
    ``RefSeq``.

    Args:
        path: Path to the tab-separated GUNC results file.

    Returns:
        A ``pandas.DataFrame`` with one row per retained line. Columns are
        labelled positionally with the keys of ``dtype`` below, and every
        value is left as the string that was read from the file.

    Raises:
        KeyError: If the header line has no ``study`` column.
    """
    # Expected columns, in file order. Only the keys are used (as column
    # labels); the types document the intended schema and are not applied.
    # NOTE(review): values (including 'pass.GUNC') stay as strings - confirm
    # whether callers expect them to be cast to these types.
    dtype = {
        'genome': object,
        'n_genes_called': np.uintc,
        'n_genes_mapped': np.uintc,
        'n_contigs': np.uintc,
        'taxonomic_level': object,
        'proportion_genes_retained_in_major_clades': np.float16,
        'genes_retained_index': np.float16,
        'clade_separation_score': np.float16,
        'contamination_portion': np.float16,
        'n_effective_surplus_clades': np.float16,
        'mean_hit_identity': np.float16,
        'reference_representation_score': np.float16,
        'pass.GUNC': object,
        'study': object,
    }
    allowed_studies = frozenset({'GenBank', 'RefSeq'})
    rows = []
    with open(path, 'r') as f:
        header = {k: i for i, k in enumerate(
            f.readline().strip().split('\t'))}
        study_idx = header['study']
        # Stream the file line by line instead of loading it all at once.
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            # Remove only the line terminator: a full strip() would also
            # drop leading/trailing empty fields and misalign the columns.
            cols = line.rstrip('\n').split('\t')
            if cols[study_idx] in allowed_studies:
                rows.append(cols)
    return pd.DataFrame(rows, columns=list(dtype))

def _download(self):
with tempfile.TemporaryDirectory() as tmpdir:
Expand Down

0 comments on commit d432765

Please sign in to comment.