# Example usage

To use `dnarecords` in a project:

In [1]:
import dnarecords

# Print the version string exposed by the imported package.
print(dnarecords.__version__)

0.1.4


To transform your genomics data into DNARecords:

In [8]:
import dnarecords as dr


# Initialise Hail through the dnarecords helper; the returned handle is used as `hl` below.
hl = dr.helper.DNARecordsUtils.init_hail()

# Fetch the 1KG example dataset into /tmp/1kg and load it as a MatrixTable.
hl.utils.get_1kg('/tmp/1kg')
mt = hl.read_matrix_table('/tmp/1kg/1kg.mt')

# Add a per-entry `dosage` field computed from the PL field.
mt = mt.annotate_entries(dosage=hl.pl_dosage(mt.PL))

# Write the dosage entries as DNARecords. The flags below request both layouts
# (sample-wise and variant-wise) in both formats (TFRecord and Parquet),
# replacing whatever already exists at the destination (write_mode='overwrite').
dnarecords_path = '/tmp/dnarecords'
writer = dr.writer.DNARecordsWriter(mt.dosage)
writer.write(dnarecords_path, sparse=True, sample_wise=True, variant_wise=True,
             tfrecord_format=True, parquet_format=True,
             write_mode='overwrite', gzip=True)

print(f'DNARecords created at {dnarecords_path}')

DNARecords createt at /tmp/dnarecords
2022-05-11 11:12:54 Hail: INFO: 1KG files found


To read your DNARecords dataset as **TensorFlow Datasets**:

In [11]:
import dnarecords as dr
import tensorflow as tf

# Location of the DNARecords dataset to read.
dnarecords_path = '/tmp/dnarecords'
reader = dr.reader.DNARecordsReader(dnarecords_path)

# Build the sample-wise dataset and print only its first element.
samplewise_ds = reader.sample_wise_dataset()
tf.print(next(iter(samplewise_ds)))

# Build the variant-wise dataset and print only its first element.
variantwise_ds = reader.variant_wise_dataset()
tf.print(next(iter(variantwise_ds)))

{'chr1': 'SparseTensor(indices=[[0]
 [2]
 [4]
 ...
 [906]
 [907]
 [908]], values=[0.336072147 0.00498687895 0.0593509413 ... 0.99874264 0.0306534301 1.88818419], shape=[909])',
 'chr10': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [529]
 [530]
 [531]], values=[0.0593509413 0.00315230922 0.99941957 ... 0.0593509413 1.79924 0.0306534301], shape=[532])',
 'chr11': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [567]
 [568]
 [569]], values=[0.200760186 0.015601662 0.136806905 ... 0.015601662 1.92641246 1.94064903], shape=[570])',
 'chr12': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [563]
 [564]
 [565]], values=[1.00395739 1.96934652 0.0593509413 ... 0.200760037 1.00158238 0.999999821], shape=[566])',
 'chr13': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [333]
 [334]
 [335]], values=[0.015601662 1.00000501 1.92641246 ... 0.0306534301 0.0593509413 0.200760469], shape=[336])',
 'chr14': 'SparseTensor(indices=[[0]
 [1]
 [2]
 ...
 [334]
 [335]
 [336]], values=[0.136806905 0.00788068399 0.00788068399 

To read your DNARecords dataset as **PySpark DataFrames**:

In [13]:
import dnarecords as dr

# Location of the DNARecords dataset to read.
dnarecords_path = '/tmp/dnarecords'
reader = dr.reader.DNASparkReader(dnarecords_path)

# Load the sample-wise records and display the first 2 rows.
samplewise_df = reader.sample_wise_dnarecords()
samplewise_df.show(2)

# Load the variant-wise records and display the first 2 rows.
variantwise_df = reader.variant_wise_dnarecords()
variantwise_df.show(2)

+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+-----------------+--------------------+--------------------+----------------+----------------+--------------------+--------------------+-----------------+----------------+--------------------+---+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+-----------------+----------------+--------------------+----------------+-----------------+--------------------+--------------------+-----------------+----------------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+-----------------+-------