Skip to content

Commit

Permalink
Web console: add arrayOfDoublesSketch and other small fixes (#13486)
Browse files Browse the repository at this point in the history
* add padding and keywords

* add arrayOfDoubles

* Update docs/development/extensions-core/datasketches-tuple.md

Co-authored-by: Charles Smith <techdocsmith@gmail.com>

* Update docs/development/extensions-core/datasketches-tuple.md

Co-authored-by: Charles Smith <techdocsmith@gmail.com>

* Update docs/development/extensions-core/datasketches-tuple.md

Co-authored-by: Charles Smith <techdocsmith@gmail.com>

* Update docs/development/extensions-core/datasketches-tuple.md

Co-authored-by: Charles Smith <techdocsmith@gmail.com>

* Update docs/development/extensions-core/datasketches-tuple.md

Co-authored-by: Charles Smith <techdocsmith@gmail.com>

* partition hint

* fix docs

Co-authored-by: Charles Smith <techdocsmith@gmail.com>
  • Loading branch information
vogievetsky and techdocsmith committed Dec 7, 2022
1 parent c7229fc commit 9679f6a
Show file tree
Hide file tree
Showing 15 changed files with 143 additions and 24 deletions.
43 changes: 38 additions & 5 deletions docs/development/extensions-core/datasketches-tuple.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,19 +39,52 @@ druid.extensions.loadList=["druid-datasketches"]
"name" : <output_name>,
"fieldName" : <metric_name>,
"nominalEntries": <number>,
"numberOfValues" : <number>,
"metricColumns" : <array of strings>
"metricColumns" : <array of strings>,
"numberOfValues" : <number>
}
```

|property|description|required?|
|--------|-----------|---------|
|type|This String should always be "arrayOfDoublesSketch"|yes|
|name|A String for the output (result) name of the calculation.|yes|
|name|String representing the output column to store sketch values.|yes|
|fieldName|A String for the name of the input field.|yes|
|nominalEntries|Parameter that determines the accuracy and size of the sketch. Higher k means higher accuracy but more space to store sketches. Must be a power of 2. See the [Theta sketch accuracy](https://datasketches.apache.org/docs/Theta/ThetaErrorTable) for details. |no, defaults to 16384|
|numberOfValues|Number of values associated with each distinct key. |no, defaults to 1|
|metricColumns|If building sketches from raw data, an array of names of the input columns containing numeric values to be associated with each distinct key.|no, defaults to empty array|
|metricColumns|When building sketches from raw data, an array of names of the input columns that contain numeric values to associate with each distinct key. If not provided, assumes `fieldName` is an `arrayOfDoublesSketch`.|no, if not provided `fieldName` is assumed to be an `arrayOfDoublesSketch`|
|numberOfValues|Number of values associated with each distinct key. |no, defaults to the length of `metricColumns` if provided and 1 otherwise|

You can use the `arrayOfDoublesSketch` aggregator to:

- Build a sketch from raw data. In this case, set `metricColumns` to an array.
- Build a sketch from an existing `ArrayOfDoubles` sketch. In this case, leave `metricColumns` unset and set the `fieldName` to an `ArrayOfDoubles` sketch with `numberOfValues` doubles. At ingestion time, you must base64 encode `ArrayOfDoubles` sketches.

#### Example on top of raw data

Compute a theta sketch of unique users. For each user, store the `added` and `deleted` scores. The new sketch column will be called `users_theta`.

```json
{
"type": "arrayOfDoublesSketch",
"name": "users_theta",
"fieldName": "user",
"nominalEntries": 16384,
"metricColumns": ["added", "deleted"]
}
```

#### Example ingesting a precomputed sketch column

Ingest a sketch column called `user_sketches` that has a base64 encoded value of two doubles in its array and store it in a column called `users_theta`.

```json
{
"type": "arrayOfDoublesSketch",
"name": "users_theta",
"fieldName": "user_sketches",
"nominalEntries": 16384,
"numberOfValues": 2
}
```

### Post Aggregators

Expand Down
3 changes: 3 additions & 0 deletions web-console/lib/keywords.js
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ exports.SQL_KEYWORDS = [
'REPLACE INTO',
'OVERWRITE',
'RETURNING',
'OVER',
'PARTITION BY',
'WINDOW',
];

exports.SQL_EXPRESSION_PARTS = [
Expand Down
4 changes: 1 addition & 3 deletions web-console/script/create-sql-docs.js
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,7 @@ function convertMarkdownToHtml(markdown) {
// Convert markdown to HTML
markdown = snarkdown(markdown);

return markdown
.replace(/<br \/>/g, '<br /><br />') // Double up the <br>s
.replace(/<a[^>]*>(.*?)<\/a>/g, '$1'); // Remove links
return markdown.replace(/<a[^>]*>(.*?)<\/a>/g, '$1'); // Remove links
}

const readDoc = async () => {
Expand Down
4 changes: 2 additions & 2 deletions web-console/src/bootstrap/react-table-defaults.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,12 @@ export function bootstrapReactTable() {
.map((row: any) => row[column.id]);
const previewCount = countBy(previewValues);
return (
<span>
<div className="default-aggregated">
{Object.keys(previewCount)
.sort()
.map(v => `${v} (${previewCount[v]})`)
.join(', ')}
</span>
</div>
);
},
defaultPageSize: 20,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ ORDER BY "start" DESC`;
intervals = await queryDruidSql({
query: SegmentTimeline.getSqlQuery(startDate, endDate),
});
datasources = uniq(intervals.map(r => r.datasource));
datasources = uniq(intervals.map(r => r.datasource).sort());
} else if (capabilities.hasCoordinatorAccess()) {
const startIso = startDate.toISOString();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ export function externalConfigToIngestQueryPattern(
config: ExternalConfig,
isArrays: boolean[],
timeExpression: SqlExpression | undefined,
partitionedByHint: string | undefined,
): IngestQueryPattern {
return {
destinationTableName: guessDataSourceNameFromInputSource(config.inputSource) || 'data',
Expand All @@ -71,7 +72,7 @@ export function externalConfigToIngestQueryPattern(
mainExternalConfig: config,
filters: [],
dimensions: externalConfigToInitDimensions(config, isArrays, timeExpression),
partitionedBy: timeExpression ? 'day' : 'all',
partitionedBy: partitionedByHint || (timeExpression ? 'day' : 'all'),
clusteredBy: [],
};
}
Expand Down
43 changes: 43 additions & 0 deletions web-console/src/druid-models/metric-spec/metric-spec.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ export const METRIC_SPEC_FIELDS: Field<MetricSpec>[] = [
// Should the first / last aggregators become usable at ingestion time, reverse the changes made in:
// https://github.com/apache/druid/pull/10794
'thetaSketch',
'arrayOfDoublesSketch',
{
group: 'HLLSketch',
suggestions: ['HLLSketchBuild', 'HLLSketchMerge'],
Expand All @@ -104,6 +105,7 @@ export const METRIC_SPEC_FIELDS: Field<MetricSpec>[] = [
'doubleMax',
'floatMax',
'thetaSketch',
'arrayOfDoublesSketch',
'HLLSketchBuild',
'HLLSketchMerge',
'quantilesDoublesSketch',
Expand Down Expand Up @@ -178,6 +180,47 @@ export const METRIC_SPEC_FIELDS: Field<MetricSpec>[] = [
</>
),
},
// arrayOfDoublesSketch
{
name: 'nominalEntries',
type: 'number',
defined: typeIs('arrayOfDoublesSketch'),
defaultValue: 16384,
info: (
<>
<p>
Parameter that determines the accuracy and size of the sketch. Higher k means higher
accuracy but more space to store sketches.
</p>
<p>Must be a power of 2.</p>
<p>
See the{' '}
<ExternalLink href="https://datasketches.apache.org/docs/Theta/ThetaErrorTable">
Theta sketch accuracy
</ExternalLink>{' '}
for details.
</p>
</>
),
},
{
name: 'metricColumns',
type: 'string-array',
defined: typeIs('arrayOfDoublesSketch'),
info: (
<>
If building sketches from raw data, an array of names of the input columns containing
numeric values to be associated with each distinct key.
</>
),
},
{
name: 'numberOfValues',
type: 'number',
defined: typeIs('arrayOfDoublesSketch'),
placeholder: 'metricColumns length or 1',
info: <>Number of values associated with each distinct key.</>,
},
// HLLSketchBuild & HLLSketchMerge
{
name: 'lgK',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,13 +82,19 @@ export class WorkbenchQuery {
externalConfig: ExternalConfig,
isArrays: boolean[],
timeExpression: SqlExpression | undefined,
partitionedByHint: string | undefined,
): WorkbenchQuery {
return new WorkbenchQuery({
queryContext: {},
queryParts: [
WorkbenchQueryPart.fromQueryString(
ingestQueryPatternToQuery(
externalConfigToIngestQueryPattern(externalConfig, isArrays, timeExpression),
externalConfigToIngestQueryPattern(
externalConfig,
isArrays,
timeExpression,
partitionedByHint,
),
).toString(),
),
],
Expand Down
9 changes: 7 additions & 2 deletions web-console/src/helpers/execution/sql-task-execution.ts
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,14 @@ export async function reattachTaskExecution(
option: ReattachTaskQueryOptions,
): Promise<Execution | IntermediateQueryState<Execution>> {
const { id, cancelToken, preserveOnTermination } = option;
let execution = await getTaskExecution(id, undefined, cancelToken);
let execution: Execution;

execution = await updateExecutionWithDatasourceExistsIfNeeded(execution, cancelToken);
try {
execution = await getTaskExecution(id, undefined, cancelToken);
execution = await updateExecutionWithDatasourceExistsIfNeeded(execution, cancelToken);
} catch (e) {
throw new Error(`Reattaching to query failed due to: ${e.message}`);
}

if (execution.isFullyComplete()) return execution;

Expand Down
4 changes: 4 additions & 0 deletions web-console/src/react-table/react-table-extra.scss
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,8 @@
}
}
}

// Padding applied to any element carrying the `default-aggregated` class.
.default-aggregated {
padding: 10px 5px;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ export const SqlDataLoaderView = React.memo(function SqlDataLoaderView(
{ inputSource, inputFormat, signature },
isArrays,
timeExpression,
undefined,
),
).toString(),
queryContext: {
Expand All @@ -167,6 +168,7 @@ export const SqlDataLoaderView = React.memo(function SqlDataLoaderView(
{ inputSource, inputFormat, signature },
isArrays,
timeExpression,
undefined,
),
).toString(),
});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import { Classes, Dialog } from '@blueprintjs/core';
import { SqlExpression } from 'druid-query-toolkit';
import React, { useState } from 'react';

import { ExternalConfig } from '../../../druid-models';
import { ExternalConfig, InputFormat, InputSource } from '../../../druid-models';
import { InputFormatStep } from '../input-format-step/input-format-step';
import { InputSourceStep } from '../input-source-step/input-source-step';

Expand All @@ -32,20 +32,27 @@ export interface ConnectExternalDataDialogProps {
config: ExternalConfig,
isArrays: boolean[],
timeExpression: SqlExpression | undefined,
partitionedByHint: string | undefined,
): void;
onClose(): void;
}

/**
 * In-progress external data configuration held in the dialog's local state.
 *
 * Every field is optional, so an empty object is a valid value.
 */
interface ExternalConfigStep {
// Input source selected so far, if any.
inputSource?: InputSource;
// Input format selected so far, if any.
inputFormat?: InputFormat;
// Optional partitioning hint carried alongside the source and format.
// NOTE(review): presumably a granularity name such as 'month' — confirm against callers.
partitionedByHint?: string;
}

export const ConnectExternalDataDialog = React.memo(function ConnectExternalDataDialog(
props: ConnectExternalDataDialogProps,
) {
const { initExternalConfig, onClose, onSetExternalConfig } = props;

const [externalConfigStep, setExternalConfigStep] = useState<Partial<ExternalConfig>>(
const [externalConfigStep, setExternalConfigStep] = useState<ExternalConfigStep>(
initExternalConfig || {},
);

const { inputSource, inputFormat } = externalConfigStep;
const { inputSource, inputFormat, partitionedByHint } = externalConfigStep;

return (
<Dialog
Expand All @@ -65,6 +72,7 @@ export const ConnectExternalDataDialog = React.memo(function ConnectExternalData
{ inputSource, inputFormat, signature },
isArrays,
timeExpression,
partitionedByHint,
);
onClose();
}}
Expand All @@ -76,8 +84,8 @@ export const ConnectExternalDataDialog = React.memo(function ConnectExternalData
<InputSourceStep
initInputSource={inputSource}
mode="sampler"
onSet={(inputSource, inputFormat) => {
setExternalConfigStep({ inputSource, inputFormat });
onSet={(inputSource, inputFormat, partitionedByHint) => {
setExternalConfigStep({ inputSource, inputFormat, partitionedByHint });
}}
/>
)}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ export interface ExampleInput {
description: string;
inputSource: InputSource;
inputFormat?: InputFormat;
partitionedByHint?: string;
}

const TRIPS_INPUT_FORMAT: InputFormat = {
Expand Down Expand Up @@ -122,6 +123,7 @@ export const EXAMPLE_INPUTS: ExampleInput[] = [
],
},
inputFormat: TRIPS_INPUT_FORMAT,
partitionedByHint: 'month',
},
{
name: 'NYC Taxi cabs (all files)',
Expand Down Expand Up @@ -206,6 +208,7 @@ export const EXAMPLE_INPUTS: ExampleInput[] = [
],
},
inputFormat: TRIPS_INPUT_FORMAT,
partitionedByHint: 'month',
},
{
name: 'FlightCarrierOnTime (1 month)',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,11 @@ const ROWS_TO_SAMPLE = 50;
export interface InputSourceStepProps {
initInputSource: Partial<InputSource> | undefined;
mode: 'sampler' | 'msq';
onSet(inputSource: InputSource, inputFormat: InputFormat): void;
onSet(
inputSource: InputSource,
inputFormat: InputFormat,
partitionedByHint: string | undefined,
): void;
}

export const InputSourceStep = React.memo(function InputSourceStep(props: InputSourceStepProps) {
Expand Down Expand Up @@ -169,7 +173,11 @@ export const InputSourceStep = React.memo(function InputSourceStep(props: InputS
useEffect(() => {
const guessedInputFormat = guessedInputFormatState.data;
if (!guessedInputFormat) return;
onSet(exampleInput?.inputSource || (inputSource as any), guessedInputFormat);
onSet(
exampleInput?.inputSource || (inputSource as any),
guessedInputFormat,
exampleInput?.partitionedByHint,
);
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [guessedInputFormatState]);

Expand Down
9 changes: 7 additions & 2 deletions web-console/src/views/workbench-view/workbench-view.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -324,9 +324,14 @@ export class WorkbenchView extends React.PureComponent<WorkbenchViewProps, Workb

return (
<ConnectExternalDataDialog
onSetExternalConfig={(externalConfig, isArrays, timeExpression) => {
onSetExternalConfig={(externalConfig, isArrays, timeExpression, partitionedByHint) => {
this.handleNewTab(
WorkbenchQuery.fromInitExternalConfig(externalConfig, isArrays, timeExpression),
WorkbenchQuery.fromInitExternalConfig(
externalConfig,
isArrays,
timeExpression,
partitionedByHint,
),
'Ext ' + guessDataSourceNameFromInputSource(externalConfig.inputSource),
);
}}
Expand Down

0 comments on commit 9679f6a

Please sign in to comment.