Parse data sources when reading lineage CSV file

apriha · Sep 9, 2018 · d554a60
1 parent 8b60140
commit d554a60
Show file tree

Hide file tree

Showing 3 changed files with 34 additions and 10 deletions.
diff --git a/lineage/individual.py b/lineage/individual.py
@@ -371,7 +371,7 @@ def _add_snps(self, snps, discrepant_snp_positions_threshold,
             return discrepant_positions, discrepant_genotypes
 
         build = snps.build
-        source = snps.source
+        source = [s.strip() for s in snps.source.split(',')]
 
         if not snps.build_detected:
             print('build not detected, assuming build {}'.format(snps.build))
@@ -385,7 +385,7 @@ def _add_snps(self, snps, discrepant_snp_positions_threshold,
         snps = self._double_single_alleles(snps.snps, 'X')
 
         if self._snps is None:
-            self._source.append(source)
+            self._source.extend(source)
             self._snps = snps
         else:
             common_snps = self._snps.join(snps, how='inner', rsuffix='_added')
@@ -439,7 +439,7 @@ def _add_snps(self, snps, discrepant_snp_positions_threshold,
                 return discrepant_positions, discrepant_genotypes
 
             # add new SNPs
-            self._source.append(source)
+            self._source.extend(source)
             self._snps = self._snps.combine_first(snps)
             self._snps.loc[discrepant_genotypes.index, 'genotype'] = np.nan
 

diff --git a/lineage/snps.py b/lineage/snps.py
@@ -135,7 +135,7 @@ def _read_raw_data(self, file):
             elif first_line.startswith('RSID'):
                 return self._read_ftdna(file)
             elif 'lineage' in first_line:
-                return self._read_lineage_csv(file)
+                return self._read_lineage_csv(file, comments)
             elif first_line.startswith('rsid'):
                 return self._read_generic_csv(file)
             else:
@@ -256,26 +256,34 @@ def _read_ancestry(file):
         return sort_snps(df), 'AncestryDNA'
 
     @staticmethod
-    def _read_lineage_csv(file):
+    def _read_lineage_csv(file, comments):
         """ Read and parse CSV file generated by lineage.
 
         Parameters
         ----------
         file : str
             path to file
+        comments : str
+            comments at beginning of file
 
         Returns
         -------
         pandas.DataFrame
             individual's genetic data normalized for use with `lineage`
         str
-            name of data source
+            name of data source(s)
         """
+        source = ''
+        for comment in comments.split('\n'):
+            if 'Source(s):' in comment:
+                source = comment.split('Source(s):')[1].strip()
+                break
+
         df = pd.read_csv(file, comment='#', header=0, na_values='--',
                          names=['rsid', 'chrom', 'pos', 'genotype'],
                          index_col=0, dtype={'chrom': object, 'pos': np.int64})
 
-        return sort_snps(df), 'lineage'
+        return sort_snps(df), source
 
     @staticmethod
     def _read_generic_csv(file):

diff --git a/tests/test_individual.py b/tests/test_individual.py
@@ -120,12 +120,28 @@ def test_snps_ancestry(l, generic_snps):
     pd.testing.assert_frame_equal(ind.snps, generic_snps)
 
 
-def test_source_lineage(l):
-    ind = l.create_individual('', 'tests/input/chromosomes.csv')
+def test_source_lineage_file(l):
+    ind = l.create_individual('', 'tests/input/GRCh37.csv')
     assert ind.source == 'generic'
+    ind.load_snps('tests/input/23andme.txt')
+    assert ind.source == 'generic, 23andMe'
     file = ind.save_snps()
     ind_saved_snps = l.create_individual('', file)
-    assert ind_saved_snps.source == 'lineage'
+    assert ind_saved_snps.source == 'generic, 23andMe'
+    pd.testing.assert_frame_equal(ind.snps, ind_saved_snps.snps)
+
+
+def test_source_lineage_file_gzip(l):
+    ind = l.create_individual('', 'tests/input/GRCh37.csv')
+    assert ind.source == 'generic'
+    ind.load_snps('tests/input/23andme.txt')
+    assert ind.source == 'generic, 23andMe'
+    file = ind.save_snps()
+    with open(file, 'rb') as f_in:
+        with gzip.open(file + '.gz', 'wb') as f_out:
+            shutil.copyfileobj(f_in, f_out)
+    ind_saved_snps = l.create_individual('', file + '.gz')
+    assert ind_saved_snps.source == 'generic, 23andMe'
     pd.testing.assert_frame_equal(ind.snps, ind_saved_snps.snps)