Skip to content

Commit

Permalink
Parse data sources when reading lineage CSV file
Browse files (browse the repository at this point in the history)
  • Loading branch information
apriha committed Sep 9, 2018
1 parent 8b60140 commit d554a60
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 10 deletions.
6 changes: 3 additions & 3 deletions lineage/individual.py
Expand Up @@ -371,7 +371,7 @@ def _add_snps(self, snps, discrepant_snp_positions_threshold,
return discrepant_positions, discrepant_genotypes

build = snps.build
source = snps.source
source = [s.strip() for s in snps.source.split(',')]

if not snps.build_detected:
print('build not detected, assuming build {}'.format(snps.build))
Expand All @@ -385,7 +385,7 @@ def _add_snps(self, snps, discrepant_snp_positions_threshold,
snps = self._double_single_alleles(snps.snps, 'X')

if self._snps is None:
self._source.append(source)
self._source.extend(source)
self._snps = snps
else:
common_snps = self._snps.join(snps, how='inner', rsuffix='_added')
Expand Down Expand Up @@ -439,7 +439,7 @@ def _add_snps(self, snps, discrepant_snp_positions_threshold,
return discrepant_positions, discrepant_genotypes

# add new SNPs
self._source.append(source)
self._source.extend(source)
self._snps = self._snps.combine_first(snps)
self._snps.loc[discrepant_genotypes.index, 'genotype'] = np.nan

Expand Down
16 changes: 12 additions & 4 deletions lineage/snps.py
Expand Up @@ -135,7 +135,7 @@ def _read_raw_data(self, file):
elif first_line.startswith('RSID'):
return self._read_ftdna(file)
elif 'lineage' in first_line:
return self._read_lineage_csv(file)
return self._read_lineage_csv(file, comments)
elif first_line.startswith('rsid'):
return self._read_generic_csv(file)
else:
Expand Down Expand Up @@ -256,26 +256,34 @@ def _read_ancestry(file):
return sort_snps(df), 'AncestryDNA'

@staticmethod
def _read_lineage_csv(file):
def _read_lineage_csv(file, comments):
""" Read and parse CSV file generated by lineage.
Parameters
----------
file : str
path to file
comments : str
comments at beginning of file
Returns
-------
pandas.DataFrame
individual's genetic data normalized for use with `lineage`
str
name of data source
name of data source(s)
"""
source = ''
for comment in comments.split('\n'):
if 'Source(s):' in comment:
source = comment.split('Source(s):')[1].strip()
break

df = pd.read_csv(file, comment='#', header=0, na_values='--',
names=['rsid', 'chrom', 'pos', 'genotype'],
index_col=0, dtype={'chrom': object, 'pos': np.int64})

return sort_snps(df), 'lineage'
return sort_snps(df), source

@staticmethod
def _read_generic_csv(file):
Expand Down
22 changes: 19 additions & 3 deletions tests/test_individual.py
Expand Up @@ -120,12 +120,28 @@ def test_snps_ancestry(l, generic_snps):
pd.testing.assert_frame_equal(ind.snps, generic_snps)


def test_source_lineage(l):
ind = l.create_individual('', 'tests/input/chromosomes.csv')
def test_source_lineage_file(l):
ind = l.create_individual('', 'tests/input/GRCh37.csv')
assert ind.source == 'generic'
ind.load_snps('tests/input/23andme.txt')
assert ind.source == 'generic, 23andMe'
file = ind.save_snps()
ind_saved_snps = l.create_individual('', file)
assert ind_saved_snps.source == 'lineage'
assert ind_saved_snps.source == 'generic, 23andMe'
pd.testing.assert_frame_equal(ind.snps, ind_saved_snps.snps)


def test_source_lineage_file_gzip(l):
    """Check that data sources survive a save / gzip / reload round trip."""
    ind = l.create_individual('', 'tests/input/GRCh37.csv')
    assert ind.source == 'generic'

    # after loading a second file, both sources are reported together
    ind.load_snps('tests/input/23andme.txt')
    assert ind.source == 'generic, 23andMe'

    saved_path = ind.save_snps()
    gzipped_path = saved_path + '.gz'

    # compress the saved file, then build a new individual from the .gz copy
    with open(saved_path, 'rb') as plain, \
            gzip.open(gzipped_path, 'wb') as packed:
        shutil.copyfileobj(plain, packed)

    reloaded = l.create_individual('', gzipped_path)
    assert reloaded.source == 'generic, 23andMe'
    pd.testing.assert_frame_equal(ind.snps, reloaded.snps)


Expand Down

0 comments on commit d554a60

Please sign in to comment.