Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ADAM-2127] Update python doc per GenomicRdd --> GenomicDataset change #2128

Merged
merged 1 commit into from
Mar 1, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 50 additions & 50 deletions adam-python/bdgenomics/adam/rdd.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,14 @@

GenomicDataset
VCFSupportingGenomicDataset
AlignmentRecordRDD
CoverageRDD
FeatureRDD
FragmentRDD
GenotypeRDD
NucleotideContigFragmentRDD
VariantRDD
VariantContextRDD
AlignmentRecordDataset
CoverageDataset
FeatureDataset
FragmentDataset
GenotypeDataset
NucleotideContigFragmentDataset
VariantDataset
VariantContextDataset
"""

import logging
Expand Down Expand Up @@ -154,17 +154,17 @@ def filterByOverlappingRegions(self, querys):

return self._replaceRdd(self._jvmRdd.filterByOverlappingRegions(javaRrs))

def union(self, rdds):
def union(self, datasets):
"""
Unions together multiple genomic datasets.

:param list rdds: The RDDs to union into this RDD.
:return: Returns a new RDD containing the union of this RDD and the other RDDs.
:param list datasets: The datasets to union into this dataset.
:return: Returns a new genomic dataset containing the union of this and the other datasets.
"""


return self._replaceRdd(self._jvmRdd.union(map(lambda x: x._jvmRdd,
rdds)))
datasets)))


def _wrapTransformation(self,
Expand Down Expand Up @@ -306,7 +306,7 @@ def pipe(self,
convFnInst))


def broadcastRegionJoin(self, genomicRdd, flankSize=0):
def broadcastRegionJoin(self, genomicDataset, flankSize=0):
"""
Performs a broadcast inner join between this genomic dataset and another genomic dataset.

Expand All @@ -316,19 +316,19 @@ def broadcastRegionJoin(self, genomicRdd, flankSize=0):
is an inner join, all values that do not overlap a value from the other
genomic dataset are dropped.

:param GenomicDataset genomicRdd: The right genomic dataset in the join.
:param GenomicDataset genomicDataset: The right genomic dataset in the join.
:param int flankSize: Sets a flankSize for the distance between elements to be
joined. If set to 0, an overlap is required to join two elements.
:return: Returns a new genomic dataset containing all pairs of keys that
overlapped in the genomic coordinate space.
"""

return GenomicDataset(self._jvmRdd.broadcastRegionJoin(genomicRdd._jvmRdd,
return GenomicDataset(self._jvmRdd.broadcastRegionJoin(genomicDataset._jvmRdd,
flankSize),
self.sc)


def rightOuterBroadcastRegionJoin(self, genomicRdd, flankSize=0):
def rightOuterBroadcastRegionJoin(self, genomicDataset, flankSize=0):
"""
Performs a broadcast right outer join between this genomic dataset and another genomic dataset.

Expand All @@ -340,20 +340,20 @@ def rightOuterBroadcastRegionJoin(self, genomicRdd, flankSize=0):
not overlap any values in the left genomic dataset, it will be paired with a `None`
in the product of the join.

:param GenomicDataset genomicRdd: The right genomic dataset in the join.
:param GenomicDataset genomicDataset: The right genomic dataset in the join.
:param int flankSize: Sets a flankSize for the distance between elements to be
joined. If set to 0, an overlap is required to join two elements.
:return: Returns a new genomic dataset containing all pairs of keys that
overlapped in the genomic coordinate space, and all keys from the
right genomic dataset that did not overlap a key in the left genomic dataset.
"""

return GenomicDataset(self._jvmRdd.rightOuterBroadcastRegionJoin(genomicRdd._jvmRdd,
return GenomicDataset(self._jvmRdd.rightOuterBroadcastRegionJoin(genomicDataset._jvmRdd,
flankSize),
self.sc)


def broadcastRegionJoinAndGroupByRight(self, genomicRdd, flankSize=0):
def broadcastRegionJoinAndGroupByRight(self, genomicDataset, flankSize=0):
"""
Performs a broadcast inner join between this genomic dataset and another genomic dataset.

Expand All @@ -363,19 +363,19 @@ def broadcastRegionJoinAndGroupByRight(self, genomicRdd, flankSize=0):
is an inner join, all values that do not overlap a value from the other
genomic dataset are dropped.

:param GenomicDataset genomicRdd: The right genomic dataset in the join.
:param GenomicDataset genomicDataset: The right genomic dataset in the join.
:param int flankSize: Sets a flankSize for the distance between elements to be
joined. If set to 0, an overlap is required to join two elements.
:return: Returns a new genomic dataset containing all pairs of keys that
overlapped in the genomic coordinate space.
"""

return GenomicDataset(self._jvmRdd.broadcastRegionJoinAndGroupByRight(genomicRdd._jvmRdd,
return GenomicDataset(self._jvmRdd.broadcastRegionJoinAndGroupByRight(genomicDataset._jvmRdd,
flankSize),
self.sc)


def rightOuterBroadcastRegionJoinAndGroupByRight(self, genomicRdd, flankSize=0):
def rightOuterBroadcastRegionJoinAndGroupByRight(self, genomicDataset, flankSize=0):
"""
Performs a broadcast right outer join between this genomic dataset and another genomic dataset.
In a broadcast join, the left side of the join (broadcastTree) is broadcast to
Expand All @@ -386,20 +386,20 @@ def rightOuterBroadcastRegionJoinAndGroupByRight(self, genomicRdd, flankSize=0):
not overlap any values in the left genomic dataset, it will be paired with a `None`
in the product of the join.

:param GenomicDataset genomicRdd: The right genomic dataset in the join.
:param GenomicDataset genomicDataset: The right genomic dataset in the join.
:param int flankSize: Sets a flankSize for the distance between elements to be
joined. If set to 0, an overlap is required to join two elements.
:return: Returns a new genomic dataset containing all pairs of keys that
overlapped in the genomic coordinate space, and all keys from the
right genomic dataset that did not overlap a key in the left genomic dataset.
"""

return GenomicDataset(self._jvmRdd.rightOuterBroadcastRegionJoinAndGroupByRight(genomicRdd._jvmRdd,
return GenomicDataset(self._jvmRdd.rightOuterBroadcastRegionJoinAndGroupByRight(genomicDataset._jvmRdd,
flankSize),
self.sc)


def shuffleRegionJoin(self, genomicRdd, flankSize=0):
def shuffleRegionJoin(self, genomicDataset, flankSize=0):
"""
Performs a sort-merge inner join between this genomic dataset and another genomic dataset.

Expand All @@ -409,18 +409,18 @@ def shuffleRegionJoin(self, genomicRdd, flankSize=0):
overlap function. Since this is an inner join, all values that do not
overlap a value from the other genomic dataset are dropped.

:param GenomicDataset genomicRdd: The right genomic dataset in the join.
:param GenomicDataset genomicDataset: The right genomic dataset in the join.
:param int flankSize: Sets a flankSize for the distance between elements to be
joined. If set to 0, an overlap is required to join two elements.
:return: Returns a new genomic dataset containing all pairs of keys that
overlapped in the genomic coordinate space.
"""

return GenomicDataset(self._jvmRdd.shuffleRegionJoin(genomicRdd._jvmRdd, flankSize),
return GenomicDataset(self._jvmRdd.shuffleRegionJoin(genomicDataset._jvmRdd, flankSize),
self.sc)


def rightOuterShuffleRegionJoin(self, genomicRdd, flankSize=0):
def rightOuterShuffleRegionJoin(self, genomicDataset, flankSize=0):
"""
Performs a sort-merge right outer join between this genomic dataset and another genomic dataset.

Expand All @@ -432,19 +432,19 @@ def rightOuterShuffleRegionJoin(self, genomicRdd, flankSize=0):
If a value from the right genomic dataset does not overlap any values in the left
genomic dataset, it will be paired with a `None` in the product of the join.

:param GenomicDataset genomicRdd: The right genomic dataset in the join.
:param GenomicDataset genomicDataset: The right genomic dataset in the join.
:param int flankSize: Sets a flankSize for the distance between elements to be
joined. If set to 0, an overlap is required to join two elements.
:return: Returns a new genomic dataset containing all pairs of keys that
overlapped in the genomic coordinate space, and all keys from the
right genomic dataset that did not overlap a key in the left genomic dataset.
"""

return GenomicDataset(self._jvmRdd.rightOuterShuffleRegionJoin(genomicRdd._jvmRdd, flankSize),
return GenomicDataset(self._jvmRdd.rightOuterShuffleRegionJoin(genomicDataset._jvmRdd, flankSize),
self.sc)


def leftOuterShuffleRegionJoin(self, genomicRdd, flankSize=0):
def leftOuterShuffleRegionJoin(self, genomicDataset, flankSize=0):
"""
Performs a sort-merge left outer join between this genomic dataset and another genomic dataset.

Expand All @@ -456,19 +456,19 @@ def leftOuterShuffleRegionJoin(self, genomicRdd, flankSize=0):
If a value from the left genomic dataset does not overlap any values in the right
genomic dataset, it will be paired with a `None` in the product of the join.

:param GenomicDataset genomicRdd: The right genomic dataset in the join.
:param GenomicDataset genomicDataset: The right genomic dataset in the join.
:param int flankSize: Sets a flankSize for the distance between elements to be
joined. If set to 0, an overlap is required to join two elements.
:return: Returns a new genomic dataset containing all pairs of keys that
overlapped in the genomic coordinate space, and all keys from the
left genomic dataset that did not overlap a key in the right genomic dataset.
"""

return GenomicDataset(self._jvmRdd.leftOuterShuffleRegionJoin(genomicRdd._jvmRdd, flankSize),
return GenomicDataset(self._jvmRdd.leftOuterShuffleRegionJoin(genomicDataset._jvmRdd, flankSize),
self.sc)


def leftOuterShuffleRegionJoinAndGroupByLeft(self, genomicRdd, flankSize=0):
def leftOuterShuffleRegionJoinAndGroupByLeft(self, genomicDataset, flankSize=0):
"""
Performs a sort-merge left outer join between this genomic dataset and another genomic dataset,
followed by a groupBy on the left value.
Expand All @@ -481,19 +481,19 @@ def leftOuterShuffleRegionJoinAndGroupByLeft(self, genomicRdd, flankSize=0):
If a value from the left genomic dataset does not overlap any values in the right
genomic dataset, it will be paired with an empty Iterable in the product of the join.

:param GenomicDataset genomicRdd: The right genomic dataset in the join.
:param GenomicDataset genomicDataset: The right genomic dataset in the join.
:param int flankSize: Sets a flankSize for the distance between elements to be
joined. If set to 0, an overlap is required to join two elements.
:return: Returns a new genomic dataset containing all pairs of keys that
overlapped in the genomic coordinate space, and all keys from the
left genomic dataset that did not overlap a key in the right genomic dataset.
"""

return GenomicDataset(self._jvmRdd.leftOuterShuffleRegionJoinAndGroupByLeft(genomicRdd._jvmRdd, flankSize),
return GenomicDataset(self._jvmRdd.leftOuterShuffleRegionJoinAndGroupByLeft(genomicDataset._jvmRdd, flankSize),
self.sc)


def fullOuterShuffleRegionJoin(self, genomicRdd, flankSize=0):
def fullOuterShuffleRegionJoin(self, genomicDataset, flankSize=0):
"""
Performs a sort-merge full outer join between this genomic dataset and another genomic dataset.

Expand All @@ -504,19 +504,19 @@ def fullOuterShuffleRegionJoin(self, genomicRdd, flankSize=0):
genomic dataset does not overlap any values in the other genomic dataset, it will be paired with
a `None` in the product of the join.

:param GenomicDataset genomicRdd: The right genomic dataset in the join.
:param GenomicDataset genomicDataset: The right genomic dataset in the join.
:param int flankSize: Sets a flankSize for the distance between elements to be
joined. If set to 0, an overlap is required to join two elements.
:return: Returns a new genomic dataset containing all pairs of keys that
overlapped in the genomic coordinate space, and values that did not
overlap will be paired with a `None`.
"""

return GenomicDataset(self._jvmRdd.fullOuterShuffleRegionJoin(genomicRdd._jvmRdd, flankSize),
return GenomicDataset(self._jvmRdd.fullOuterShuffleRegionJoin(genomicDataset._jvmRdd, flankSize),
self.sc)


def rightOuterShuffleRegionJoinAndGroupByLeft(self, genomicRdd, flankSize=0):
def rightOuterShuffleRegionJoinAndGroupByLeft(self, genomicDataset, flankSize=0):
"""
Performs a sort-merge right outer join between this genomic dataset and another genomic dataset,
followed by a groupBy on the left value, if not null.
Expand All @@ -529,7 +529,7 @@ def rightOuterShuffleRegionJoinAndGroupByLeft(self, genomicRdd, flankSize=0):
right genomic dataset that did not overlap a value from the left genomic dataset are placed into
a length-1 Iterable with a `None` key.

:param GenomicDataset genomicRdd: The right genomic dataset in the join.
:param GenomicDataset genomicDataset: The right genomic dataset in the join.
:param int flankSize: Sets a flankSize for the distance between elements to be
joined. If set to 0, an overlap is required to join two elements.
:return: Returns a new genomic dataset containing all pairs of keys that
Expand All @@ -538,11 +538,11 @@ def rightOuterShuffleRegionJoinAndGroupByLeft(self, genomicRdd, flankSize=0):
right genomic dataset that did not overlap an item in the left genomic dataset.
"""

return GenomicDataset(self._jvmRdd.rightOuterShuffleRegionJoinAndGroupByLeft(genomicRdd._jvmRdd, flankSize),
return GenomicDataset(self._jvmRdd.rightOuterShuffleRegionJoinAndGroupByLeft(genomicDataset._jvmRdd, flankSize),
self.sc)


def shuffleRegionJoinAndGroupByLeft(self, genomicRdd, flankSize=0):
def shuffleRegionJoinAndGroupByLeft(self, genomicDataset, flankSize=0):
"""
Performs a sort-merge inner join between this genomic dataset and another genomic dataset,
followed by a groupBy on the left value.
Expand All @@ -553,15 +553,15 @@ def shuffleRegionJoinAndGroupByLeft(self, genomicRdd, flankSize=0):
overlap function. In the same operation, we group all values by the left
item in the genomic dataset.

:param GenomicDataset genomicRdd: The right genomic dataset in the join.
:param GenomicDataset genomicDataset: The right genomic dataset in the join.
:param int flankSize: Sets a flankSize for the distance between elements to be
joined. If set to 0, an overlap is required to join two elements.
:return: Returns a new genomic dataset containing all pairs of keys that
overlapped in the genomic coordinate space, grouped together by
the value they overlapped in the left genomic dataset.
"""

return GenomicDataset(self._jvmRdd.shuffleRegionJoinAndGroupByLeft(genomicRdd._jvmRdd, flankSize),
return GenomicDataset(self._jvmRdd.shuffleRegionJoinAndGroupByLeft(genomicDataset._jvmRdd, flankSize),
self.sc)


Expand Down Expand Up @@ -952,7 +952,7 @@ def countKmers(self, kmerLength):
Cuts reads into _k_-mers, and then counts the number of occurrences of each _k_-mer.

:param int kmerLength: The value of _k_ to use for cutting _k_-mers.
:return: Returns an RDD containing k-mer/count pairs.
:return: Returns a DataFrame containing k-mer/count pairs.
:rtype: DataFrame containing "kmer" string and "count" long.
"""

Expand All @@ -968,7 +968,7 @@ def sortReadsByReferencePosition(self):
put at the end and sorted by read name. Contigs are ordered
lexicographically by name.

:return: Returns a new RDD containing sorted reads.
:return: Returns a new genomic dataset containing sorted reads.
:rtype: bdgenomics.adam.rdd.AlignmentRecordDataset
"""
return AlignmentRecordDataset(self._jvmRdd.sortReadsByReferencePosition(),
Expand Down Expand Up @@ -1070,7 +1070,7 @@ def realignIndelsFromKnownIndels(self,
"""
Realigns indels using a consensus-based heuristic from prior called INDELs.

:param bdgenomics.adam.rdd.VariantDataset knownIndels: An RDD of previously
:param bdgenomics.adam.rdd.VariantDataset knownIndels: A genomic dataset of previously
called INDEL variants.
:param bool isSorted: If the input data is sorted, setting this
parameter to true avoids a second sort.
Expand Down Expand Up @@ -1175,8 +1175,8 @@ def reassembleReadPairs(self,
Reassembles read pairs from two sets of unpaired reads.

The assumption is that the two sets were _originally_ paired together.
The RDD that this is called on should be the RDD with the first read
from the pair.
The genomic dataset that this is called on should be the genomic dataset
with the first read from the pair.

:param pyspark.rdd.RDD secondPairRdd: The rdd containing the second read
from the pairs.
Expand Down