Refactor database build (#62)
standage committed Jun 24, 2020
1 parent 475bcc5 commit e70ef36
Showing 35 changed files with 12,512 additions and 1,830 deletions.
1 change: 1 addition & 0 deletions dbbuild/.gitignore
@@ -1 +1,2 @@
hg38.fasta*
+databases/
34 changes: 16 additions & 18 deletions dbbuild/README.md
@@ -14,25 +14,28 @@ Alternatively, you can submit a pull request to the MicroHapDB GitHub repository

## Rebuilding the Database From Scratch: The Short Version

-You'll need the following software:
+Building MicroHapDB from scratch requires installing several software packages.
+Conda provides the most convenient way to install these.

-- Python 3
-- [Pandas][]
-- [Snakemake][]
-- [pyfaidx][]
+```
+conda create -c bioconda --name microhapdb -y python=3.7 pandas snakemake pyfaidx rsidx scikit-allel
+conda activate microhapdb
+```

-And the following data:
+Next, building MicroHapDB from scratch also requires the human reference genome, dbSNP, and the 1000 Genomes Project Phase 3 data.
+These can be downloaded and indexed using the `prep-dbs.sh` script.
+**Note**: this can take several hours, depending on the speed of the Internet connection, computer processors, and other factors.

-- Human reference genome (version GRCh38; http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz)
+```
+git clone https://github.com/bioforensics/MicroHapDB.git  # If you haven't already done so
+cd MicroHapDB/dbbuild/
+./prep-dbs.sh
+```

-With these dependencies installed, clone the MicroHapDB git repository and use Snakemake to rebuild the database in the `dbbuild/` directory.
+Finally, with these data sets in place, MicroHapDB can be built using `Snakemake`.

```
git clone https://github.com/bioforensics/MicroHapDB.git
cd MicroHapDB/dbbuild/
-wget http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz
-gunzip hg38.fa.gz
-snakemake --config refr=hg38.fa -p tables
+snakemake --configfile config.json --cores 1 -p tables
```

If the Snakemake build process completes successfully, copy the newly created database tables to MicroHapDB's main data directory to complete the update!
@@ -199,8 +202,3 @@ So for example, for marker `mh07PK-38311`:
...and therefore the PermID is `MHDBM-3ae6dc1b`.

MicroHapDB uses the [pearhash](https://github.com/ze-phyr-us/pearhash) library under the MIT license to compute Pearson hashes.
-
-
-[Pandas]: https://pandas.pydata.org
-[Snakemake]: https://snakemake.readthedocs.io/en/stable/
-[pyfaidx]: https://github.com/mdshw5/pyfaidx
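
The PermID mentioned in the README excerpt above is derived from a Pearson hash of the marker definition. As a rough illustration, here is a minimal 4-byte Pearson hash in Python; the permutation table and the input string being hashed are invented for this sketch (MicroHapDB's actual table comes from the pearhash library):

```
# Minimal sketch of a 4-byte Pearson hash, in the style used for MHDBM
# PermIDs. The permutation table and the input string below are
# illustrative assumptions, not MicroHapDB's actual values.
import random

rng = random.Random(1234)     # fixed seed so the table is reproducible
TABLE = list(range(256))
rng.shuffle(TABLE)


def pearson_hash(data, length=4):
    """Return a hex digest of `length` bytes; assumes non-empty input."""
    digest = bytearray()
    for i in range(length):
        h = TABLE[(data[0] + i) % 256]  # perturb the first byte per digest byte
        for byte in data[1:]:
            h = TABLE[h ^ byte]
        digest.append(h)
    return digest.hex()


# Hypothetical marker definition string (chromosome plus offsets); the
# real PermID for mh07PK-38311 is MHDBM-3ae6dc1b, per the README above.
print('MHDBM-' + pearson_hash(b'chr7:52677450,52677456,52677462,52677508'))
```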
95 changes: 36 additions & 59 deletions dbbuild/Snakefile
@@ -30,12 +30,22 @@ def merge_markers(markerfiles, sourcefiles):
    return data


-def write_variant_map(data, output):
-    print('Variant', 'Marker', sep='\t', file=output)
+def construct_variant_map(data):
+    varlist, markerlist = list(), list()
    for n, row in data.iterrows():
-        marker = row['Name']
-        for variant in row['VarRef'].split(','):
-            print(variant, marker, sep='\t', file=output)
+        marker = row.Name
+        numoffsets = row.Offsets.count(',') + 1
+        numrsids = row.VarRef.count(',') + 1
+        if numoffsets != numrsids:
+            message = 'marker "{name:s}" has {no:d} variants but only {nv:d} rsids'.format(
+                name=marker, no=numoffsets, nv=numrsids
+            )
+            print('WARNING:', message, file=sys.stderr)
+        for variant in row.VarRef.split(','):
+            varlist.append(variant)
+            markerlist.append(marker)
+    outdata = pandas.DataFrame({'Variant': varlist, 'Marker': markerlist})
+    return outdata.sort_values('Variant')


def populate_idmap(data):
@@ -92,62 +102,30 @@ def add_marker_permids(data, refr):
    data['PermID'] = pandas.Series(permids)


-def compute_marker_ae(data, freqs):
-    aes = list()
-    pops1kgp = set([
-        'CHB', 'JPT', 'CHS', 'CDX', 'KHV', 'CEU', 'TSI', 'FIN', 'GBR', 'IBS', 'YRI', 'LWK', 'GWD',
-        'MSL', 'ESN', 'ASW', 'ACB', 'MXL', 'PUR', 'CLM', 'PEL', 'GIH', 'PJL', 'BEB', 'STU', 'ITU'
-    ])
-    for n, row in data.iterrows():
-        if n > 0 and n % 25 == 0:
-            print('...calculated AvgAe for', n, 'markers')
-        marker = row.Name
-        freqs_1kgp = freqs[(freqs.Marker == marker) & (freqs.Population.isin(pops1kgp))]
-        if len(freqs_1kgp) == 0:
-            print(
-                '[WARNING] no frequency data for marker ', marker, ', not computing AvgAe',
-                sep='', file=sys.stderr
-            )
-            aes.append(None)
-            continue
-        ae_per_pop = list()
-        for pop in pops1kgp:
-            ae_recip = 0.0
-            for haplotype in freqs[freqs.Marker == marker].Allele.unique():
-                nallelesfreq = haplotype.count(',')
-                nallelesmarker = row.Offsets.count(',')
-                if nallelesfreq != nallelesmarker:
-                    message = 'allele count disagreement for {marker:s}:'.format(marker=marker)
-                    message += ' {:d} vs {:d};'.format(nallelesmarker, nallelesfreq)
-                    message += ' ({:s})'.format(haplotype)
-                    raise ValueError(message)
-            for freq in freqs[(freqs.Marker == marker) & (freqs.Population == pop)].Frequency:
-                ae_recip += freq ** 2
-            assert ae_recip > 0.0, marker
-            ae = 1.0 / ae_recip
-            ae_per_pop.append(ae)
-        assert len(ae_per_pop) == 26
-        avg_ae = sum(ae_per_pop) / len(ae_per_pop)
-        aes.append('{:.04f}'.format(avg_ae))
-    return data.assign(AvgAe=pandas.Series(aes))
-
-
def sort_and_clean(data):
    data['ChromSort'] = data.Chrom.apply(lambda c: '0X' if c == 'chrX' else int(c[3:]))
    data['TempPos'] = data.Offsets.apply(lambda os: int(os.split(',')[0]))
    data = data.sort_values(['ChromSort', 'TempPos'])
    data.drop(columns=['Xref', 'VarRef', 'ChromSort', 'TempPos'], inplace=True)
    nr = data.drop_duplicates(subset=('Name', 'Chrom', 'Offsets'))
    assert(len(nr.Name) == len(nr.Name.unique()))  # If markers are defined in more than one place, make sure the definitions are identical.
-    return data[['Name', 'PermID', 'Reference', 'Chrom', 'Offsets', 'AvgAe', 'Source']]
+    return data[['Name', 'PermID', 'Reference', 'Chrom', 'Offsets', 'AvgAe', 'In', 'Source']]


+def add_avgae(data, aefile):
+    aes = pandas.read_csv(aefile, sep='\t')
+    avgae = {'Marker': list(), 'AvgAe': list()}
+    for marker, mdata in aes.groupby('Marker'):
+        assert len(mdata) == 26
+        meanae = mdata.Ae.mean()
+        avgae['Marker'].append(marker)
+        avgae['AvgAe'].append(meanae)
+    return data.join(pandas.DataFrame(avgae).set_index('Marker'), on='Name')
+
+
def add_informativeness(data, informfile):
-    info = pandas.read_csv(informfile, sep='\t')
-    data['In'] = None
-    for n, row in info.iterrows():
-        data.loc[data.Name == row.Marker, 'In'] = row.Informativeness
-    return data[['Name', 'PermID', 'Reference', 'Chrom', 'Offsets', 'AvgAe', 'In', 'Source']]
+    info = pandas.read_csv(informfile, sep='\t').rename(columns={'Informativeness': 'In'})
+    return data.join(info.set_index('Marker'), on='Name')


SOURCES = [os.path.basename(file) for file in glob('sources/*') if os.path.isdir(file)]
@@ -209,7 +187,7 @@ rule frequencies:
rule markers:
    input:
        config['refr'],
-        'frequency.tsv',
+        'sources/1kgp/marker-aes.tsv',
        'sources/1kgp/marker-informativeness.tsv',
        expand('sources/{source}/marker.tsv', source=SOURCES),
        expand('sources/{source}/source.txt', source=SOURCES),
@@ -218,26 +196,25 @@ rule markers:
        varmap='variantmap.tsv',
        idmap='marker-idmap.tsv'
    run:
-        freqfile = input[1]
+        aefile = input[1]
        informfile = input[2]
        input = input[3:]
        numsources = int(len(input) / 2)
        markerfiles = input[:numsources]
        sourcefiles = input[numsources:]

        data = merge_markers(markerfiles, sourcefiles)
-        with open(output.varmap, 'w') as fh:
-            write_variant_map(data, fh)
+        varmap = construct_variant_map(data)
+        varmap.to_csv(output.varmap, sep='\t', index=False)
        add_marker_permids(data, config['refr'])
-        freqs = pandas.read_csv(freqfile, sep='\t')
-        data = compute_marker_ae(data, freqs)

        idmap = populate_idmap(data)
        idmap.to_csv(output.idmap, sep='\t', index=False)

-        data = sort_and_clean(data)
+        data = add_avgae(data, aefile)
        data = add_informativeness(data, informfile)
-        data.to_csv(output.table, sep='\t', index=False)
+        data = sort_and_clean(data)
+        data.to_csv(output.table, sep='\t', index=False, float_format='%.4f')


rule idmap:
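
The net effect of this Snakefile refactor: the expensive inline `compute_marker_ae` pass over `frequency.tsv` is replaced by precomputed per-population Ae values in `sources/1kgp/marker-aes.tsv`, which `add_avgae` simply averages (26 populations per marker). The quantity itself is unchanged: for each population, Ae = 1 / sum(p_i^2) over that population's haplotype frequencies, and AvgAe is the mean across populations. A toy sketch with invented frequencies:

```
# Toy illustration of the AvgAe calculation this refactor preserves:
# Ae = 1 / sum(p_i^2) per population, then the mean across populations.
# The frequencies below are made up for demonstration purposes only.
freqs_by_pop = {
    'CEU': [0.50, 0.30, 0.20],
    'YRI': [0.40, 0.40, 0.20],
    'CHB': [0.70, 0.20, 0.10],
}


def effective_num_alleles(freqs):
    """Ae is the reciprocal of homozygosity: 1 / sum of squared frequencies."""
    return 1.0 / sum(f ** 2 for f in freqs)


aes = [effective_num_alleles(f) for f in freqs_by_pop.values()]
avg_ae = sum(aes) / len(aes)
print('{:.4f}'.format(avg_ae))  # AvgAe over these three toy populations
```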
6 changes: 6 additions & 0 deletions dbbuild/config.json
@@ -0,0 +1,6 @@
+{
+    "refr": "databases/hg38.fa",
+    "dbsnp37": "databases/dbSNP/dbSNP_GRCh37.vcf.gz",
+    "dbsnp38": "databases/dbSNP/dbSNP_GRCh38.vcf.gz",
+    "1kgp_dir": "databases/1000Genomes"
+}
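
This new config file replaces the old `--config refr=hg38.fa` command-line override and points at the files that `prep-dbs.sh` downloads into `databases/`, matching the README's `snakemake --configfile config.json --cores 1 -p tables` invocation. For orientation, a minimal sketch of how a Snakefile can consume such a file; the indexing rule below is an invented example, not a rule from this commit:

```
# Minimal sketch of consuming config.json from a Snakefile; this rule is
# illustrative only and does not appear in the actual MicroHapDB Snakefile.
configfile: 'config.json'

rule index_reference:
    input:
        config['refr']
    output:
        config['refr'] + '.fai'
    run:
        from pyfaidx import Fasta
        Fasta(input[0])  # pyfaidx writes the .fai index as a side effect
```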