Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Track const config values in analytics #10120

Merged
merged 17 commits into from
Feb 17, 2022
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@

package io.airbyte.scheduler.persistence.job_tracker;

import static java.util.Collections.emptyMap;
import static java.util.Collections.singletonMap;
import static java.util.stream.Collectors.toMap;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
Expand All @@ -28,12 +31,13 @@
import io.airbyte.scheduler.models.Job;
import io.airbyte.scheduler.persistence.JobPersistence;
import io.airbyte.scheduler.persistence.WorkspaceHelper;
import io.airbyte.validation.json.JsonSchemaValidator;
import io.airbyte.validation.json.JsonValidationException;
import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.UUID;

public class JobTracker {
Expand Down Expand Up @@ -118,16 +122,21 @@ public void trackSync(final Job job, final JobState jobState) {
Preconditions.checkArgument(allowedJob, "Job type " + configType + " is not allowed!");
final long jobId = job.getId();
final UUID connectionId = UUID.fromString(job.getScope());
final UUID sourceDefinitionId = configRepository.getSourceDefinitionFromConnection(connectionId).getSourceDefinitionId();
final UUID destinationDefinitionId = configRepository.getDestinationDefinitionFromConnection(connectionId).getDestinationDefinitionId();
final StandardSourceDefinition sourceDefinition = configRepository.getSourceDefinitionFromConnection(connectionId);
final UUID sourceDefinitionId = sourceDefinition.getSourceDefinitionId();
final StandardDestinationDefinition destinationDefinition = configRepository.getDestinationDefinitionFromConnection(connectionId);
final UUID destinationDefinitionId = destinationDefinition.getDestinationDefinitionId();

final Map<String, Object> jobMetadata = generateJobMetadata(String.valueOf(jobId), configType, job.getAttemptsCount());
final Map<String, Object> jobAttemptMetadata = generateJobAttemptMetadata(job.getId(), jobState);
final Map<String, Object> sourceDefMetadata = generateSourceDefinitionMetadata(sourceDefinitionId);
final Map<String, Object> destinationDefMetadata = generateDestinationDefinitionMetadata(destinationDefinitionId);
final Map<String, Object> syncMetadata = generateSyncMetadata(connectionId);
final Map<String, Object> stateMetadata = generateStateMetadata(jobState);
final Map<String, Object> syncConfigMetadata = generateSyncConfigMetadata(job.getConfig());
final Map<String, Object> syncConfigMetadata = generateSyncConfigMetadata(
job.getConfig(),
sourceDefinition.getSpec().getConnectionSpecification(),
destinationDefinition.getSpec().getConnectionSpecification());

final UUID workspaceId = workspaceHelper.getWorkspaceForJobIdIgnoreExceptions(jobId);
track(workspaceId,
Expand All @@ -142,18 +151,20 @@ public void trackSync(final Job job, final JobState jobState) {
});
}

private Map<String, Object> generateSyncConfigMetadata(final JobConfig config) {
private Map<String, Object> generateSyncConfigMetadata(final JobConfig config,
final JsonNode sourceConfigSchema,
final JsonNode destinationConfigSchema) {
if (config.getConfigType() == ConfigType.SYNC) {
final JsonNode sourceConfiguration = config.getSync().getSourceConfiguration();
final JsonNode destinationConfiguration = config.getSync().getDestinationConfiguration();

final Map<String, Object> sourceMetadata = configToMetadata(CONFIG + ".source", sourceConfiguration);
final Map<String, Object> destinationMetadata = configToMetadata(CONFIG + ".destination", destinationConfiguration);
final Map<String, Object> sourceMetadata = configToMetadata(CONFIG + ".source", sourceConfiguration, sourceConfigSchema);
final Map<String, Object> destinationMetadata = configToMetadata(CONFIG + ".destination", destinationConfiguration, destinationConfigSchema);
final Map<String, Object> catalogMetadata = getCatalogMetadata(config.getSync().getConfiguredAirbyteCatalog());

return MoreMaps.merge(sourceMetadata, destinationMetadata, catalogMetadata);
} else {
return Collections.emptyMap();
return emptyMap();
}
}

Expand All @@ -168,30 +179,130 @@ private Map<String, Object> getCatalogMetadata(final ConfiguredAirbyteCatalog ca
return output;
}

protected static Map<String, Object> configToMetadata(final String jsonPath, final JsonNode config) {
/**
* Flattens a config into a map. Uses the schema to determine which fields are const (i.e.
* non-sensitive). Non-const, non-boolean values are replaced with {@link #SET} to avoid leaking
* potentially-sensitive information.
* <p>
* anyOf/allOf schemas are treated as non-const values. These aren't (currently) used in config
* schemas anyway.
*
* @param jsonPath A prefix to add to all the keys in the returned map, with a period (`.`)
* separator
* @param schema The JSON schema that {@code config} conforms to
*/
protected static Map<String, Object> configToMetadata(final String jsonPath, final JsonNode config, final JsonNode schema) {
final Map<String, Object> metadata = configToMetadata(config, schema);
// Prepend all the keys with the root jsonPath
// But leave the values unchanged
final Map<String, Object> output = new HashMap<>();
mergeMaps(output, jsonPath, metadata);
return output;
}

if (config.isObject()) {
final ObjectNode node = (ObjectNode) config;
for (final Iterator<Map.Entry<String, JsonNode>> it = node.fields(); it.hasNext();) {
final var entry = it.next();
final var field = entry.getKey();
final var fieldJsonPath = jsonPath + "." + field;
final var child = entry.getValue();

if (child.isBoolean()) {
output.put(fieldJsonPath, child.asBoolean());
} else if (!child.isNull()) {
if (child.isObject()) {
output.putAll(configToMetadata(fieldJsonPath, child));
} else if (!child.isTextual() || (child.isTextual() && !child.asText().isEmpty())) {
output.put(fieldJsonPath, SET);
/**
* Does the actually interesting bits of configToMetadata. If config is an object, returns a
* flattened map. If config is _not_ an object (i.e. it's a primitive string/number/etc, or it's an
* array) then returns a map of {null: toMetadataValue(config)}.
*/
private static Map<String, Object> configToMetadata(final JsonNode config, final JsonNode schema) {
if (schema.hasNonNull("const")) {
// If this schema is a const, then just dump it into a map:
// * If it's an object, flatten it
// * Otherwise, do some basic conversions to value-ish data.
// It would be a weird thing to declare const: null, but in that case we don't want to report null
// anyway, so explicitly use hasNonNull.
return flatten(config);
} else if (schema.has("oneOf")) {
// If this schema is a oneOf, then find the first sub-schema which the config matches
// and use that sub-schema to convert the config to a map
final JsonSchemaValidator validator = new JsonSchemaValidator();
for (final Iterator<JsonNode> it = schema.get("oneOf").elements(); it.hasNext();) {
final JsonNode subSchema = it.next();
if (validator.test(subSchema, config)) {
return configToMetadata(config, subSchema);
}
}
// If we didn't match any of the subschemas, then something is wrong. Bail out silently.
return emptyMap();
} else if (config.isObject()) {
// If the schema is not a oneOf, but the config is an object (i.e. the schema has "type": "object")
// then we need to recursively convert each field of the object to a map.
// Note: this doesn't handle additionalProperties, so if a config contains properties that are not
// explicitly declared in the schema, they will be ignored.
Copy link
Contributor

@ChristopheDuong ChristopheDuong Feb 8, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be worth reporting the names of additionalProperties not declared in the schema instead of ignoring them?

If we studied the metadata sent to segment on those, we could have a sense of what's missing in the connector specs maybe?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

realized that I might as well just handle additionalProperties in a smarter way, can you take a look at this? e74066d#diff-8ef120819d2dd157a1e8d1838a49917772872273fefb9719b56aa303174581abL235-R261

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM!

final Map<String, Object> output = new HashMap<>();
final JsonNode maybeProperties = schema.get("properties");
if (maybeProperties != null) {
for (final Iterator<Entry<String, JsonNode>> it = config.fields(); it.hasNext();) {
final Entry<String, JsonNode> entry = it.next();
final String field = entry.getKey();
final JsonNode value = entry.getValue();
if (maybeProperties.hasNonNull(field)) {
mergeMaps(output, field, configToMetadata(value, maybeProperties.get(field)));
}
}
}
return output;
} else if (config.isBoolean()) {
return singletonMap(null, config.asBoolean());
} else if ((!config.isTextual() && !config.isNull()) || (config.isTextual() && !config.asText().isEmpty())) {
// This is either non-textual (e.g. integer, array, etc) or non-empty text
return singletonMap(null, SET);
} else {
// Otherwise, this is an empty string, so just ignore it
return emptyMap();
}
}

return output;
/**
* Naively flattens a JsonNode, or dumps it into a {null: node} map. Does _not_ mask values; ONLY
* use this if you're sure that node does NOT contain potentially-sensitive data.
*/
private static Map<String, Object> flatten(final JsonNode node) {
edgao marked this conversation as resolved.
Show resolved Hide resolved
if (node.isObject()) {
final Map<String, Object> output = new HashMap<>();
for (final Iterator<Entry<String, JsonNode>> it = node.fields(); it.hasNext();) {
final Entry<String, JsonNode> entry = it.next();
final String field = entry.getKey();
final JsonNode value = entry.getValue();
mergeMaps(output, field, flatten(value));
}
return output;
} else {
final Object metadataValue;
if (node.isBoolean()) {
metadataValue = node.asBoolean();
} else if (node.isLong()) {
metadataValue = node.asLong();
} else if (node.isDouble()) {
metadataValue = node.asDouble();
} else if (node.isValueNode() && !node.isNull()) {
metadataValue = node.asText();
} else {
// Fallback handling for e.g. arrays
metadataValue = node.toString();
}
return singletonMap(null, metadataValue);
}
}

/**
* Prepend all keys in subMap with prefix, then merge that map into originalMap.
* <p>
* If subMap contains a null key, then instead it is replaced with prefix. I.e. {null: value} is
* treated as {prefix: value} when merging into originalMap.
*/
private static void mergeMaps(final Map<String, Object> originalMap, final String prefix, final Map<String, Object> subMap) {
originalMap.putAll(subMap.entrySet().stream().collect(toMap(
e -> {
final String key = e.getKey();
if (key != null) {
return prefix + "." + key;
} else {
return prefix;
}
},
Entry::getValue)));
}

private Map<String, Object> generateSyncMetadata(final UUID connectionId) throws ConfigNotFoundException, IOException, JsonValidationException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@
"empty_string": "",
"null_value": null,
"one_of": {
"some_key": 100
}
"type_key": "foo",
"some_key": 100,
"some_undeclared_key": "ignored"
},
"const_object": {
"sub_key": "bar",
"sub_array": [1, 2, 3],
"sub_object": {
"sub_sub_key": "baz"
}
},
"const_null": null
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
{
"type": "object",
"properties": {
"username": {
"type": "string"
},
"password": {
"type": "string"
},
"has_ssl": {
"type": "boolean"
},
"empty_string": {
"type": "string"
},
"null_value": {
"type": "null"
},
"one_of": {
"type": "object",
"oneOf": [
{
"type": "object",
"properties": {
"type_key": {
"const": "foo"
},
"some_key": {
"type": "integer"
}
}
}
]
},
"const_object": {
"const": {
"sub_key": "bar",
"sub_array": [1, 2, 3],
"sub_object": {
"sub_sub_key": "baz"
}
}
},
"const_null": {
"const": null
}
}
}
Loading