Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SO-2127 snomed rf2 importer issues #122

Merged
merged 30 commits into from
Jan 6, 2017
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
54b1ff8
SO-2127: fixing issues with RF2 importer
cmark Nov 24, 2016
3fe1824
SO-1782: store taxonomy defect information in data objects...
cmark Nov 25, 2016
0a46423
SO-2127: fix NPE when getting datatype for cd member rows
cmark Dec 6, 2016
c44cee9
SO-2127: skip cycle detection during import
cmark Dec 6, 2016
df2aa25
SO-2127: improve performance of in-memory ID service
cmark Dec 6, 2016
ce44373
SO-2127: extract relevant taxonomy information for nodes and edges...
cmark Dec 6, 2016
fb31a71
SO-2127: support bulk methods in import time component lookup impls
cmark Dec 6, 2016
7ebd046
SO-2127: refactor RF2 importer to support batch processing
cmark Dec 6, 2016
9613dd5
SO-2127: disable error reporting functionality
cmark Dec 6, 2016
923c222
SO-2127: batch RF2 processing improvements
cmark Dec 6, 2016
d38dcf8
SO-2127: get temp CDOResource outside the for loop (lang.refset import)
cmark Dec 6, 2016
0f9b4a1
SO-2127: remove unnecessary setId calls from member importers
cmark Dec 6, 2016
6de7228
Merge remote-tracking branch 'origin/develop' into issue/SO-2127-snom…
cmark Dec 7, 2016
51b9bc3
SO-2127: fix concrete domain member index entry building
cmark Dec 7, 2016
0331d77
SO-2127: use log message arguments instead of string concatenation
cmark Dec 7, 2016
c59ecd7
SO-2127: run index purge/optimize before creating version...
cmark Dec 7, 2016
9b96893
SO-2127: remove commit notification from notification queue
cmark Dec 8, 2016
a01efba
SO-2127: enqueue commit notifications when enabled in tx context
cmark Dec 8, 2016
59e33b5
SO-2127: remove temporary resource form lang. refset importer
cmark Dec 8, 2016
35eefec
SO-2127: change importer logger names to snomed.importer.rf2
cmark Dec 8, 2016
f7a3f56
SO-2127: reduce document load times during import
cmark Dec 9, 2016
8934bf7
SO-2127: rename constant from ON to TAB_SPLITTER
cmark Jan 4, 2017
07770db
SO-2127: fix comments in import time lookup services
cmark Jan 5, 2017
d2f5bce
SO-2127: include stack trace in log messages
cmark Jan 5, 2017
d3e48b6
SO-2127: use rows.clear instead of recreating List instance
cmark Jan 5, 2017
4105458
SO-2127: simplify processing of rows
cmark Jan 5, 2017
e3e04ed
SO-2127: rename component creation template methods
cmark Jan 5, 2017
efecf53
SO-2127: generate available identifiers in bulk load method for symmetry
cmark Jan 5, 2017
1b80174
SO-2127: avoid setting core component identifier twice
cmark Jan 5, 2017
b780da1
SO-2127: rename field extended to hasMapTargetDescription
cmark Jan 5, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -528,7 +528,6 @@ public Boolean execute(RevisionSearcher index) throws IOException {
final String message = String.format("Validation encountered %s issue(s).", defects.size());
LogUtils.logImportActivity(IMPORT_LOGGER, requestingUserId, branchPath, message);

final String lineSeparator = System.getProperty("line.separator");
final Map<String, BufferedWriter> defectWriters = newHashMap();
try {
for (SnomedValidationDefect defect : defects) {
Expand All @@ -537,10 +536,7 @@ public Boolean execute(RevisionSearcher index) throws IOException {
if (!defectWriters.containsKey(defectsFile)) {
defectWriters.put(defectsFile, Files.newWriter(new File(defectsFile), Charsets.UTF_8));
}
for (String lineDefect : defect.getDefects()) {
defectWriters.get(defectsFile).write(defect.getDefectType().name() + "\t" + lineDefect);
defectWriters.get(defectsFile).write(lineSeparator);
}
defect.writeTo(defectWriters.get(defectsFile));
}
} finally {
for (BufferedWriter writer : defectWriters.values()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,15 @@
package com.b2international.snowowl.snomed.importer.rf2.validation;

import static com.b2international.snowowl.snomed.common.ContentSubType.SNAPSHOT;
import static com.google.common.collect.Sets.newHashSet;
import static com.google.common.collect.Lists.newArrayListWithExpectedSize;
import static java.util.Collections.emptySet;
import static org.slf4j.LoggerFactory.getLogger;

import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;

Expand All @@ -36,12 +37,14 @@
import com.b2international.snowowl.snomed.SnomedConstants.Concepts;
import com.b2international.snowowl.snomed.datastore.index.entry.SnomedRelationshipIndexEntry;
import com.b2international.snowowl.snomed.datastore.taxonomy.InvalidRelationship;
import com.b2international.snowowl.snomed.datastore.taxonomy.InvalidRelationship.MissingConcept;
import com.b2international.snowowl.snomed.datastore.taxonomy.SnomedTaxonomyBuilder;
import com.b2international.snowowl.snomed.datastore.taxonomy.SnomedTaxonomyBuilderResult;
import com.b2international.snowowl.snomed.importer.net4j.DefectType;
import com.b2international.snowowl.snomed.importer.net4j.ImportConfiguration;
import com.b2international.snowowl.snomed.importer.net4j.SnomedIncompleteTaxonomyValidationDefect;
import com.b2international.snowowl.snomed.importer.net4j.SnomedValidationDefect;
import com.b2international.snowowl.snomed.importer.net4j.TaxonomyDefect;
import com.b2international.snowowl.snomed.importer.rf2.RepositoryState;
import com.b2international.snowowl.snomed.importer.rf2.util.Rf2FileModifier;
import com.google.common.collect.ArrayListMultimap;
Expand All @@ -56,6 +59,19 @@
public class SnomedTaxonomyValidator {

private static final Logger LOGGER = getLogger(SnomedTaxonomyValidator.class);

private static final Comparator<String> EFFECTIVE_TIME_COMPARATOR = new Comparator<String>() {
@Override
public int compare(String o1, String o2) {
// consider empty greater than non-empty
if (o1.isEmpty() && !o2.isEmpty()) {
return 1;
} else if (!o1.isEmpty() && o2.isEmpty()) {
return -1;
}
return o1.compareTo(o2);
}
};

// new RF2 state
private final File conceptsFile;
Expand Down Expand Up @@ -112,105 +128,17 @@ public Collection<SnomedValidationDefect> validate() {
*/
private Collection<SnomedValidationDefect> doValidate() {
try {
final Rf2BasedSnomedTaxonomyBuilder builder = Rf2BasedSnomedTaxonomyBuilder.newInstance(new SnomedTaxonomyBuilder(conceptIds, statements), characteristicType);

final Multimap<String, InvalidRelationship> invalidRelationships = ArrayListMultimap.create();

if (snapshot) {

LOGGER.info("Validating SNOMED CT ontology based on the given RF2 release files...");


if (hasConceptImport()) {
final String conceptFilePath = removeConceptHeader();
builder.applyNodeChanges(conceptFilePath);
}

if (hasRelationshipImport()) {
final String relationshipFilePath = removeRelationshipHeader();
builder.applyEdgeChanges(relationshipFilePath);
}

final SnomedTaxonomyBuilderResult result = builder.build();
if (!result.getStatus().isOK()) {
invalidRelationships.putAll("", result.getInvalidRelationships());
}
} else {

LOGGER.info("Validating SNOMED CT ontology based on the given RF2 release files...");

final Map<String, File> conceptFiles = hasConceptImport() ? Rf2FileModifier.split(conceptsFile) : ImmutableMap.<String, File>of();
final Map<String, File> relationshipFiles = hasRelationshipImport() ? Rf2FileModifier.split(relationshipsFile) : ImmutableMap.<String, File>of();

final List<String> effectiveTimes = ImmutableSortedSet.<String>naturalOrder()
.addAll(conceptFiles.keySet())
.addAll(relationshipFiles.keySet())
.build()
.asList();


for (final String effectiveTime : effectiveTimes) {
LOGGER.info("Validating concepts and relationships from '" + effectiveTime + "'...");

final File conceptFile = conceptFiles.get(effectiveTime);
final File relationshipFile = relationshipFiles.get(effectiveTime);

builder.applyNodeChanges(getFilePath(conceptFile));
builder.applyEdgeChanges(getFilePath(relationshipFile));
final SnomedTaxonomyBuilderResult result = builder.build();
if (!result.getStatus().isOK()) {
invalidRelationships.putAll(effectiveTime, result.getInvalidRelationships());
}
}
}

final Multimap<String, InvalidRelationship> invalidRelationships = processTaxonomy();
if (!invalidRelationships.isEmpty()) {
final Collection<String> defects = newHashSet();
final Collection<String> conceptIdsToInactivate = newHashSet();

final Collection<TaxonomyDefect> defects = newArrayListWithExpectedSize(invalidRelationships.size());
for (final String effectiveTime : invalidRelationships.keySet()) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that ArrayListMultimap's key set might not preserve the ordering used in processTaxonomy. (Not sure if we are relying on it here.)

for (final InvalidRelationship invalidRelationship: invalidRelationships.get(effectiveTime)) {
final String sourceId = Long.toString(invalidRelationship.getSourceId());
final String destinationId = Long.toString(invalidRelationship.getDestinationId());

if (conceptIds.contains(invalidRelationship.getDestinationId())) {
conceptIdsToInactivate.add(destinationId);
}

if (conceptIds.contains(invalidRelationship.getSourceId())) {
conceptIdsToInactivate.add(sourceId);
}

final StringBuilder sb = new StringBuilder();
sb.append("IS A relationship");
sb.append(" '" + invalidRelationship.getRelationshipId() + "'");
sb.append(" has a missing or inactive ");

switch (invalidRelationship.getMissingConcept()) {
case DESTINATION:
sb.append("destination concept");
sb.append(" '" + destinationId);
break;
case SOURCE:
sb.append("source concept");
sb.append(" '" + sourceId);
break;
default:
throw new IllegalStateException("Unexpected missing concept type '" + invalidRelationship.getMissingConcept() + "'.");
}

sb.append("' in effectiveTime ");
sb.append("".equals(effectiveTime) ? "Unpublished/Undefined" : effectiveTime);
sb.append("'.");

defects.add(sb.toString());
defects.add(new TaxonomyDefect(invalidRelationship.getRelationshipId(), effectiveTime, invalidRelationship.getMissingConcept() == MissingConcept.DESTINATION ? TaxonomyDefect.Type.MISSING_DESTINATION : TaxonomyDefect.Type.MISSING_SOURCE, invalidRelationship.getMissingConceptId()));
}
}

final SnomedValidationDefect defect = new SnomedIncompleteTaxonomyValidationDefect(relationshipsFile.getName(), defects, conceptIdsToInactivate);
return Collections.<SnomedValidationDefect>singleton(defect);
return Collections.singleton(new SnomedIncompleteTaxonomyValidationDefect(relationshipsFile.getName(), defects));
}

} catch (final IOException e) {
LOGGER.error("Validation failed.", e);
return Collections.<SnomedValidationDefect>singleton(new SnomedValidationDefect(relationshipsFile.getName(), DefectType.IO_PROBLEM, Collections.<String>emptySet()));
Expand All @@ -220,6 +148,59 @@ private Collection<SnomedValidationDefect> doValidate() {
return emptySet();
}

private Multimap<String, InvalidRelationship> processTaxonomy() throws IOException {
final Rf2BasedSnomedTaxonomyBuilder builder = Rf2BasedSnomedTaxonomyBuilder.newInstance(new SnomedTaxonomyBuilder(conceptIds, statements), characteristicType);
final Multimap<String, InvalidRelationship> invalidRelationships = ArrayListMultimap.create();
if (snapshot) {

LOGGER.info("Validating SNOMED CT ontology based on the given RF2 release files...");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This output could be changed slightly, so that we know immediately that one or the other branch was hit (snapshot or non-snapshot validation); in this case, we don't have to check whether additional lines appear after this one saying Validating taxonomy in '20020131'... if the validator uses non-snapshot mode.



if (hasConceptImport()) {
final String conceptFilePath = removeConceptHeader();
builder.applyNodeChanges(conceptFilePath);
}

if (hasRelationshipImport()) {
final String relationshipFilePath = removeRelationshipHeader();
builder.applyEdgeChanges(relationshipFilePath);
}

final SnomedTaxonomyBuilderResult result = builder.build();
if (!result.getStatus().isOK()) {
invalidRelationships.putAll("", result.getInvalidRelationships());
}
} else {

LOGGER.info("Validating SNOMED CT ontology based on the given RF2 release files...");

final Map<String, File> conceptFiles = hasConceptImport() ? Rf2FileModifier.split(conceptsFile) : ImmutableMap.<String, File>of();
final Map<String, File> relationshipFiles = hasRelationshipImport() ? Rf2FileModifier.split(relationshipsFile) : ImmutableMap.<String, File>of();

final List<String> effectiveTimes = ImmutableSortedSet.orderedBy(EFFECTIVE_TIME_COMPARATOR)
.addAll(conceptFiles.keySet())
.addAll(relationshipFiles.keySet())
.build()
.asList();


for (final String effectiveTime : effectiveTimes) {
LOGGER.info("Validating taxonomy in '{}'...", effectiveTime);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some extra formatting for the "unpublished" effective time would be nice ✨


final File conceptFile = conceptFiles.get(effectiveTime);
final File relationshipFile = relationshipFiles.get(effectiveTime);

builder.applyNodeChanges(getFilePath(conceptFile));
builder.applyEdgeChanges(getFilePath(relationshipFile));
final SnomedTaxonomyBuilderResult result = builder.build();
if (!result.getStatus().isOK()) {
invalidRelationships.putAll(effectiveTime, result.getInvalidRelationships());
}
}
}
return invalidRelationships;
}

private String getFilePath(@Nullable final File file) {
return null == file ? null : file.getPath();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@
*/
package com.b2international.snowowl.snomed.importer.net4j;

import static com.google.common.base.Preconditions.checkNotNull;

import java.io.IOException;
import java.io.Writer;
import java.util.Collection;
import java.util.Collections;

/**
* Representation of a validation defect about incomplete taxonomy.
Expand All @@ -26,23 +27,49 @@
public class SnomedIncompleteTaxonomyValidationDefect extends SnomedValidationDefect {

private static final long serialVersionUID = -4826632877877405283L;
private final Collection<TaxonomyDefect> taxonomyDefects;

private Collection<String> conceptIdsToInactivate;
public SnomedIncompleteTaxonomyValidationDefect(final String filePath, final Collection<TaxonomyDefect> taxonomyDefects) {
super(filePath, DefectType.INCONSISTENT_TAXONOMY, Collections.singleton("fake"));
this.taxonomyDefects = taxonomyDefects;
}

public SnomedIncompleteTaxonomyValidationDefect(final String filePath, final Collection<String> defects,
final Collection<String> conceptIdsToInactivate) {

super(filePath, DefectType.INCONSISTENT_TAXONOMY, defects);
this.conceptIdsToInactivate = checkNotNull(conceptIdsToInactivate, "conceptIdsToInactivate");
public Collection<TaxonomyDefect> getTaxonomyDefects() {
return taxonomyDefects;
}

/**
* Returns with a collection of conflicting concept IDs. If these concepts
* are inactivated one can fix the incomplete taxonomy defect.
* @return the conceptIdsToInactivate the concept IDs to inactivate.
*/
public Collection<String> getConceptIdsToInactivate() {
return conceptIdsToInactivate;
@Override
public void writeTo(Writer writer) throws IOException {
// writer header
writer.write("defectType");
writer.write(TAB);
writer.write("id");
writer.write(TAB);
writer.write("effectiveTime");
writer.write(TAB);
writer.write("taxonomyDefectType");
writer.write(TAB);
writer.write("conceptId");
writer.write(TAB);
writer.write(LE);
for (TaxonomyDefect defect : getTaxonomyDefects()) {
// defectType
writer.write(getDefectType().name());
writer.write(TAB);
// id
writer.write(Long.toString(defect.getRelationshipId()));
writer.write(TAB);
// effectiveTime
writer.write(defect.getEffectiveTime());
writer.write(TAB);
// tax.def.type
writer.write(defect.getType().name());
writer.write(TAB);
// conceptId
writer.write(Long.toString(defect.getMissingConceptId()));
writer.write(TAB);
writer.write(LE);
}
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

import java.io.IOException;
import java.io.Serializable;
import java.io.Writer;
import java.util.Collection;

import com.google.common.collect.ImmutableList;
Expand All @@ -29,6 +31,8 @@
public class SnomedValidationDefect implements Serializable, Comparable<SnomedValidationDefect> {

private static final long serialVersionUID = 1L;
protected static final String TAB = "\t";
protected static final String LE = System.getProperty("line.separator");

private final String fileName;
private final DefectType defectType;
Expand Down Expand Up @@ -57,5 +61,19 @@ public final Collection<String> getDefects() {
public int compareTo(SnomedValidationDefect o) {
return getDefectType().compareTo(o.getDefectType());
}

/**
* Writes this {@link SnomedValidationDefect SNOMED CT RF2 validation defect} to a writer.
* @param writer
* @throws IOException
*/
public void writeTo(Writer writer) throws IOException {
for (String lineDefect : getDefects()) {
writer.write(getDefectType().name());
writer.write(TAB);
writer.write(lineDefect);
writer.write(LE);
}
}

}