Skip to content

Commit

Permalink
SO-4440 #resolve #time 4h
Browse files Browse the repository at this point in the history
  • Loading branch information
cmark committed Dec 3, 2020
1 parent 245c1c7 commit e149a4d
Show file tree
Hide file tree
Showing 2 changed files with 104 additions and 29 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,10 @@ public static Expression matchTermOriginal(String term) {
return exactMatch(Fields.TERM_ORIGINAL, term);
}

/**
 * Builds an expression that accepts a description document when its
 * {@code termOriginal} field value equals ANY member of {@code terms}.
 * Bulk counterpart of the single-term {@code matchTermOriginal(String)} overload.
 *
 * @param terms the candidate original terms; a single match qualifies
 * @return the resulting query {@link Expression}
 */
public static Expression matchTermOriginal(Iterable<String> terms) {
    final String field = Fields.TERM_ORIGINAL;
    return matchAny(field, terms);
}

/**
 * Builds an expression that matches the given regular expression against the
 * {@code termOriginal} field of description documents.
 *
 * @param regex the regular expression to evaluate
 * @return the resulting query {@link Expression}
 */
public static Expression matchTermRegex(String regex) {
    final String field = Fields.TERM_ORIGINAL;
    return regexp(field, regex);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package scripts;

import java.util.function.Supplier

import com.b2international.index.Hits
import com.b2international.index.aggregations.Aggregation
import com.b2international.index.aggregations.AggregationBuilder
Expand All @@ -16,47 +18,116 @@ import com.b2international.snowowl.snomed.common.SnomedTerminologyComponentConst
import com.b2international.snowowl.snomed.datastore.index.entry.SnomedConceptDocument
import com.b2international.snowowl.snomed.datastore.index.entry.SnomedDescriptionIndexEntry
import com.b2international.snowowl.snomed.datastore.index.entry.SnomedDocument
import com.google.common.base.Suppliers
import com.google.common.collect.HashMultimap
import com.google.common.collect.Lists
import com.google.common.collect.Multimap
import com.google.common.collect.Sets

// NOTE(review): this span is a GitHub diff rendering with the +/- markers stripped,
// so REMOVED (old) and ADDED (new) lines are interleaved below. It is not directly
// runnable as-is; comments mark which revision each fragment belongs to.
// Old revision: plainly-typed searcher lookup.
RevisionSearcher searcher = ctx.service(RevisionSearcher.class)
// New revision: same lookup, declared with 'def', plus a Groovy-literal issue set.
def RevisionSearcher searcher = ctx.service(RevisionSearcher.class)
def Set<ComponentIdentifier> issues = []

// Old revision: eagerly scrolled all active concept IDs into a Set up front.
Iterable<Hits<String>> activeConceptBatches = searcher.scroll(Query.select(String.class)
// New revision: defers the same collection behind a memoized Supplier so the
// (potentially large) ID scroll only runs if a branch actually needs it.
def Supplier<Set<String>> activeConceptIds = Suppliers.memoize({
def Set<String> conceptIds = []

searcher.scroll(Query.select(String.class)
.from(SnomedConceptDocument.class)
.fields(SnomedConceptDocument.Fields.ID)
.where(SnomedConceptDocument.Expressions.active())
// Old batch size vs. new, larger batch size (fewer scroll round-trips).
.limit(10_000)
.limit(100_000)
.build())

// Old revision: accumulate batches into a locally declared Set.
Set<String> activeConceptIds = Sets.newHashSet()

activeConceptBatches.each({ conceptBatch ->
activeConceptIds.addAll(conceptBatch.getHits())
// New revision: equivalent accumulation inside the memoized closure.
.each { Hits<String> conceptBatch ->
conceptIds.addAll(conceptBatch.getHits())
}
return conceptIds
})

// NOTE(review): interleaved diff continues — old aggregation-based query setup is
// mixed with the new unpublished-only duplicate detection below.
// Old revision: single query over active FSNs of active concepts, later aggregated.
ExpressionBuilder activeFsnExpression = Expressions.builder()
.filter(SnomedDescriptionIndexEntry.Expressions.active())
.filter(SnomedDescriptionIndexEntry.Expressions.type(Concepts.FULLY_SPECIFIED_NAME))
.filter(SnomedDescriptionIndexEntry.Expressions.concepts(activeConceptIds))

// In the old revision this if-block only narrowed the filter to unpublished docs;
// in the new revision the same 'if' opens a dedicated branch (closed by the
// '} else {' further below) with its own scroll-based detection.
if (params.isUnpublishedOnly) {
activeFsnExpression.filter(SnomedDocument.Expressions.effectiveTime(EffectiveTimes.UNSET_EFFECTIVE_TIME))
}

// Old revision: bucket FSNs by original term, flagging buckets with >= 2 members.
Aggregation<String> activeDescriptionsByOriginalTerm = searcher
.aggregate(AggregationBuilder.bucket("ruleSnomedCommon2", String.class, SnomedDescriptionIndexEntry.class)
.query(activeFsnExpression.build())
.onFieldValue(SnomedDescriptionIndexEntry.Fields.TERM_ORIGINAL)
.fields(SnomedDescriptionIndexEntry.Fields.ID)
.minBucketSize(2))

// New revision: term -> description IDs map for manual duplicate grouping.
def Multimap<String, String> descriptionsByTerm = HashMultimap.create()

// load all unpublished FSNs with their terms first
searcher.scroll(Query.select(String[].class)
.from(SnomedDescriptionIndexEntry.class)
.fields(SnomedDescriptionIndexEntry.Fields.ID, SnomedDescriptionIndexEntry.Fields.TERM, SnomedDescriptionIndexEntry.Fields.CONCEPT_ID)
.where(
Expressions.builder()
.filter(SnomedDescriptionIndexEntry.Expressions.active())
.filter(SnomedDescriptionIndexEntry.Expressions.type(Concepts.FULLY_SPECIFIED_NAME))
.filter(SnomedDocument.Expressions.effectiveTime(EffectiveTimes.UNSET_EFFECTIVE_TIME))
.build()
)
.limit(10_000)
.build())
.each { Hits<String[]> descriptionsToCheck ->

// Hit layout follows the .fields(...) order above: [0]=ID, [1]=TERM, [2]=CONCEPT_ID.
descriptionsToCheck.each { descriptionToCheck ->
// Only consider descriptions whose concept is active (lazy supplier resolved here).
if (activeConceptIds.get().contains(descriptionToCheck[2])) {
descriptionsByTerm.put(descriptionToCheck[1], descriptionToCheck[0])
}
}

}

// Old revision: issues list declaration (superseded by the 'def Set' above).
List<ComponentIdentifier> issues = Lists.newArrayList()
// then check the duplication between all unpublished terms by checking larger than 1 buckets
descriptionsByTerm.keySet().each { term ->
def descriptions = descriptionsByTerm.get(term)
// report all buckets with more than 1 item
if (descriptions.size() > 1) {
descriptions.each { description ->
issues.add(ComponentIdentifier.of(SnomedTerminologyComponentConstants.DESCRIPTION_NUMBER, description))
}
}
}

// then scroll through all possible duplicates among published terms
// New revision: a published FSN whose term equals any unpublished term is also a
// duplicate, so query published docs restricted to the already-collected term set.
searcher.search(Query.select(String[].class)
.from(SnomedDescriptionIndexEntry.class)
.fields(SnomedDescriptionIndexEntry.Fields.ID, SnomedDescriptionIndexEntry.Fields.CONCEPT_ID)
.where(
Expressions.builder()
.filter(SnomedDescriptionIndexEntry.Expressions.active())
.filter(SnomedDescriptionIndexEntry.Expressions.type(Concepts.FULLY_SPECIFIED_NAME))
.filter(SnomedDescriptionIndexEntry.Expressions.matchTermOriginal(descriptionsByTerm.keySet())) // send in all unpublished terms
.mustNot(SnomedDocument.Expressions.effectiveTime(EffectiveTimes.UNSET_EFFECTIVE_TIME)) // only published
.build()
)
.limit(10_000)
.build())
.each { Hits<String[]> publishedTermsBatch ->
// all returned terms are duplicate of an unpublished term
// Hit layout follows the .fields(...) order above: [0]=ID, [1]=CONCEPT_ID.
publishedTermsBatch.each { publishedTerm ->
if (activeConceptIds.get().contains(publishedTerm[1])) {
issues.add(ComponentIdentifier.of(SnomedTerminologyComponentConstants.DESCRIPTION_NUMBER, publishedTerm[0]))
}
}
}


// Closes the 'if (params.isUnpublishedOnly)' branch opened earlier (new revision).
} else {
// published and unpublished FSNs both count, use aggregation to gather all possible terms
// TODO eval performance diff of scroll through all sorted via term vs aggregation
searcher
.aggregate(AggregationBuilder.bucket("rule664", String.class, SnomedDescriptionIndexEntry.class)
.query(
Expressions.builder()
.filter(SnomedDescriptionIndexEntry.Expressions.active())
.filter(SnomedDescriptionIndexEntry.Expressions.type(Concepts.FULLY_SPECIFIED_NAME))
.filter(SnomedDescriptionIndexEntry.Expressions.concepts(activeConceptIds.get()))
.build()
)
.onFieldValue(SnomedDescriptionIndexEntry.Fields.TERM_ORIGINAL)
.fields(SnomedDescriptionIndexEntry.Fields.ID)
// Buckets with fewer than 2 members cannot contain duplicates.
.minBucketSize(2))
.getBuckets()
.values()
// Every member of a surviving bucket shares its term with at least one other FSN.
.each { bucket ->
bucket.each { id ->
issues.add(ComponentIdentifier.of(SnomedTerminologyComponentConstants.DESCRIPTION_NUMBER, id))
}
}
}

// Old revision: flatten the single aggregation's buckets into the issues list.
activeDescriptionsByOriginalTerm.getBuckets().values().each({ bucket ->
bucket.each({ id ->
issues.add(ComponentIdentifier.of(SnomedTerminologyComponentConstants.DESCRIPTION_NUMBER, id))
})
})

// Old return (List) vs. new return (Set de-duplicated, converted back to a List).
return issues
return issues.toList()

0 comments on commit e149a4d

Please sign in to comment.