Skip to content

Commit

Permalink
Add correct yaml files for real time data(#6787) (#6916)
Browse files Browse the repository at this point in the history
  • Loading branch information
GSharayu committed May 14, 2021
1 parent 1a38329 commit e379292
Show file tree
Hide file tree
Showing 13 changed files with 330 additions and 4 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
{
"dimensionFieldSpecs": [
{
"dataType": "INT",
"name": "generationNumber"
},
{
"dataType": "STRING",
"name": "stringDimSV1"
},
{
"dataType": "STRING",
"name": "stringDimSV2"
},
{
"dataType": "LONG",
"name": "longDimSV1"
},
{
"dataType": "LONG",
"name": "longDimSV2"
},
{
"dataType": "STRING",
"name": "stringDimMV1",
"singleValueField": false
},
{
"dataType": "STRING",
"name": "stringDimMV2",
"singleValueField": false
},
{
"dataType": "INT",
"name": "intDimMV1",
"singleValueField": false
},
{
"dataType": "INT",
"name": "intDimMV2",
"singleValueField": false
},
{
"dataType": "STRING",
"maxLength": 1000,
"name": "textDim1"
},
{
"dataType": "BYTES",
"name": "bytesDimSV1"
},
{
"dataType": "STRING",
"name": "mapDim1__KEYS",
"singleValueField": false
},
{
"dataType": "INT",
"name": "mapDim1__VALUES",
"singleValueField": false
},
{
"dataType": "STRING",
"name": "mapDim2json"
}
],
"metricFieldSpecs": [
{
"dataType": "INT",
"name": "intMetric1"
},
{
"dataType": "LONG",
"name": "longMetric1"
},
{
"dataType": "FLOAT",
"name": "floatMetric1"
},
{
"dataType": "DOUBLE",
"name": "doubleMetric1"
}
],
"dateTimeFieldSpecs" : [
{
"name" : "HoursSinceEpoch",
"dataType" : "INT",
"format" : "1:HOURS:EPOCH",
"granularity": "1:HOURS"
}
],
"schemaName": "FeatureTest2"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# HoursSinceEpoch generationNumber stringDimSV1 stringDimSV2 longDimSV1 longDimSV2 stringDimMV1 stringDimMV2 intDimMV1 intDimMV2 textDim1 mapDim1__KEYS mapDim1__VALUES mapDim2json intMetric1 longMetric1 floatMetric1 doubleMetric1
123456,__GENERATION_NUMBER__,"s1-0",s2-0,1,2,m1-0-0;m1-0-1,m2-0-0;m2-0-1,3;4,6;7,Java C++ Python,01a0bc,k1;k2;k3;k4;k5,1;1;2;2;2,"{""k1"":1,""k2"":1,""k3"":2,""k4"":2,""k5"":2}",10,11,12.1,13.1
123456,__GENERATION_NUMBER__,"s1-0",s2-0,1,2,m1-0-0;m1-0-1,m2-0-0;m2-0-1,3;4,6;7,Java C++ Python,4877625602,k1;k2;k3;k4;k5,3;3;3;3;3,"{""k1"":3,""k2"":3,""k3"":3,""k4"":3,""k5"":3}",10,11,12.1,13.1, # Dupliate of row 0 1
123456,__GENERATION_NUMBER__,s1-2,s2-2,11,21,m1-2-0;m1-2-1,m2-2-0;m2-2-1,32;42,62;72,Java C++ golang,13225573e3f5,k1;k2;k3;k4;k5,4;5;6;7;7,"{""k1"":4,""k2"":5,""k3"":6,""k4"":7,""k5"":7}",10,21,22.1,23.10
123456,__GENERATION_NUMBER__,s1-2,s2-2,11,21,m1-3-0;m1-3-1,m2-3-0;m2-3-1,32;42,62;72,Java C++ golang,deadbeef,k1;k2;k3;k4;k5,7;7;7;7;7,"{""k1"":7,""k2"":7,""k3"":7,""k4"":7,""k5"":7}",10,21,22.1,23.10, # All sv cols same as prev
123456,__GENERATION_NUMBER__,s1-4,s2-4,41,22,m1-2-0;m1-2-1,m2-2-0;m2-2-1,42;52,72;82,Java C++ golang,deed0507,k1;k2;k3;k4;k5,7;7;8;8;8,"{""k1"":7,""k2"":7,""k3"":8,""k4"":8,""k5"":8}",14,24,24.1,24.10, # All mv cols same as row 2
123456,__GENERATION_NUMBER__,s1-5,,,32,m1-5-0,m2-2-0,,92;22,golang shell bash,,k1;k2;k3;k4;k5,7;7;7;7;7,"{""k1"":7,""k2"":7,""k3"":7,""k4"":7,""k5"":7}",,24,,24.10, # Default values for some columns
123456,__GENERATION_NUMBER__,s1-6,s2-6,7611,7621,m1-5-0;m1-5-1;m1-5-2,m2-2-0,392;462,6662;782,C++ golang python,deed0507,k1;k2;k3;k4;k5,7;8;9;10;20,"{""k1"":7,""k2"":8,""k3"":9,""k4"":10,""k5"":20}",101,251,262.1,263.10, # 3 values in MV
123456,__GENERATION_NUMBER__,s1-6,s2-6,7611,7621,m1-5-0;m1-5-1;m1-5-2,m2-2-0,392;462,6662;782,C++ golang python,deed0507,k1;k2;k3;k4;k5,7;8;9;10;20,"{""k1"":7,""k2"":8,""k3"":9,""k4"":10,""k5"":20}",2147483647,251,262.1,263.10, # MAX_INT in int metric
123456,__GENERATION_NUMBER__,s1-6,s2-6,7611,7621,m1-5-0;m1-5-1;m1-5-2,m2-2-0,392;462,6662;782,C++ golang python,deed0507,k1;k2;k3;k4;k5,7;8;9;10;20,"{""k1"":7,""k2"":8,""k3"":9,""k4"":10,""k5"":20}",2147483647,251,262.1,263.10, # MAX_INT in int metric
123456,__GENERATION_NUMBER__,s1-7,s2-7,6766,6777,m1-6-0;m1-6-1;m1-6-2;m1-6-3,m2-6-0;m2-6-1,392;462,6662;782,golang Java,d54d0507,k1;k2;k3;k4;k5,31;31;32;32;32,"{""k1"":31,""k2"":31,""k3"":32,""k4"":32,""k5"":32}",87,251,262.10,263.10
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"streamType": "kafka",
"stream.kafka.consumer.type": "simple",
"topicName": "PinotRealtimeFeatureTest1Event",
"topicName": "PinotRealtimeFeatureTest2Event",
"partitionColumn": "longDimSV1",
"numPartitions": "1",
"stream.kafka.consumer.prop.auto.offset.reset": "smallest"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"replication": "1",
"retentionTimeUnit": "",
"retentionTimeValue": "",
"schemaName": "FeatureTest1",
"schemaName": "FeatureTest2",
"segmentAssignmentStrategy": "BalanceNumSegmentAssignmentStrategy",
"segmentPushFrequency": "daily",
"segmentPushType": "APPEND",
Expand All @@ -40,8 +40,10 @@
"segmentFormatVersion": "v3",
"sortedColumn": [],
"streamConfigs": {
"realtime.segment.flush.threshold.size": "63",
"realtime.segment.flush.threshold.time": "1h",
"streamType": "kafka",
"stream.kafka.topic.name": "PinotRealtimeFeatureTest1Event",
"stream.kafka.topic.name": "PinotRealtimeFeatureTest2Event",
"stream.kafka.consumer.type": "simple",
"stream.kafka.decoder.class.name": "org.apache.pinot.plugin.stream.kafka.KafkaJSONMessageDecoder",
"stream.kafka.consumer.factory.class.name": "org.apache.pinot.plugin.stream.kafka20.KafkaConsumerFactory",
Expand All @@ -50,7 +52,7 @@
"stream.kafka.consumer.prop.auto.offset.reset": "largest"
}
},
"tableName": "FeatureTest1_REALTIME",
"tableName": "FeatureTest2",
"tableType": "REALTIME",
"tenants": {
"broker": "DefaultTenant",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

# Selection
SELECT longDimSV1, intDimMV1 FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__ AND (stringDimSV1 != 's1-6' AND longDimSV1 BETWEEN 10 AND 1000 OR (intDimMV1 < 42 AND stringDimMV2 IN ('m2-0-0', 'm2-2-0') AND intDimMV2 NOT IN (6,72))) ORDER BY longDimSV1, intDimMV1
SELECT longDimSV1, intDimMV1 FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__ AND (stringDimSV1 != 's1-6' AND longDimSV1 BETWEEN 10 AND 1000 OR (intDimMV1 < 42 AND stringDimMV2 IN ('m2-0-0', 'm2-2-0') AND intDimMV2 NOT IN (6,72))) LIMIT 5
SELECT longDimSV1, intDimMV1 FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__ AND (stringDimSV1 != 's1-6' AND longDimSV1 BETWEEN 10 AND 1000 OR (intDimMV1 < 42 AND stringDimMV2 IN ('m2-0-0', 'm2-2-0') AND intDimMV2 NOT IN (6,72))) LIMIT 200

# Aggregation
SELECT count(*) FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__
SELECT sum(intMetric1), sumMV(intDimMV1), min(intMetric1), minMV(intDimMV2), max(longDimSV1), maxMV(intDimMV1) FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__
SELECT count(longDimSV1), countMV(intDimMV1), avg(floatMetric1), avgMV(intDimMV2), minMaxRange(doubleMetric1), minMaxRangeMV(intDimMV2) FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__
SELECT percentile(longDimSV1, 80), percentileMV(intDimMV1, 90), percentileEst(longDimSV1, 80), percentileEstMV(intDimMV1, 90), percentileTDigest(longDimSV1, 80), percentileTDigestMV(intDimMV1, 90) FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__
SELECT percentile(longDimSV1, 80.01), percentileMV(intDimMV1, 99.99), percentileEst(longDimSV1, 80.01), percentileEstMV(intDimMV1, 99.99), percentileTDigest(longDimSV1, 80.01), percentileTDigestMV(intDimMV1, 99.99) FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__
SELECT distinctCount(longDimSV1), distinctCountMV(intDimMV1), distinctCountHLL(longDimSV1), distinctCountHLLMV(intDimMV1) FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__

# Selection & Filtering & Grouping on Aggregation
SELECT longDimSV1, intDimMV1, count(*) FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__ AND (stringDimSV1 != 's1-6' AND longDimSV1 BETWEEN 10 AND 1000 OR (intDimMV1 < 42 AND stringDimMV2 IN ('m2-0-0', 'm2-2-0') AND intDimMV2 NOT IN (6,72))) GROUP BY longDimSV1, intDimMV1 ORDER BY longDimSV1, intDimMV1 LIMIT 20
SELECT longDimSV1, intDimMV1, sum(intMetric1), sumMV(intDimMV1), min(intMetric1), minMV(intDimMV2), max(longDimSV1), maxMV(intDimMV1) FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__ AND (stringDimSV1 != 's1-6' AND longDimSV1 BETWEEN 10 AND 1000 OR (intDimMV1 < 42 AND stringDimMV2 IN ('m2-0-0', 'm2-2-0') AND intDimMV2 NOT IN (6,72))) GROUP BY longDimSV1, intDimMV1 ORDER BY longDimSV1, intDimMV1 LIMIT 20
SELECT longDimSV1, intDimMV1, count(longDimSV1), countMV(intDimMV1), avg(floatMetric1), avgMV(intDimMV2), minMaxRange(doubleMetric1), minMaxRangeMV(intDimMV2) FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__ AND (stringDimSV1 != 's1-6' AND longDimSV1 BETWEEN 10 AND 1000 OR (intDimMV1 < 42 AND stringDimMV2 IN ('m2-0-0', 'm2-2-0') AND intDimMV2 NOT IN (6,72))) GROUP BY longDimSV1, intDimMV1 ORDER BY longDimSV1, intDimMV1 LIMIT 20
SELECT longDimSV1, intDimMV1, percentile(longDimSV1, 80), percentileMV(intDimMV1, 90), percentileEst(longDimSV1, 80), percentileEstMV(intDimMV1, 90), percentileTDigest(longDimSV1, 80), percentileTDigestMV(intDimMV1, 90) FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__ AND (stringDimSV1 != 's1-6' AND longDimSV1 BETWEEN 10 AND 1000 OR (intDimMV1 < 42 AND stringDimMV2 IN ('m2-0-0', 'm2-2-0') AND intDimMV2 NOT IN (6,72))) GROUP BY longDimSV1, intDimMV1 ORDER BY longDimSV1, intDimMV1 LIMIT 20
SELECT longDimSV1, intDimMV1, percentile(longDimSV1, 80.01), percentileMV(intDimMV1, 99.99), percentileEst(longDimSV1, 80.01), percentileEstMV(intDimMV1, 99.99), percentileTDigest(longDimSV1, 80.01), percentileTDigestMV(intDimMV1, 99.99) FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__ AND (stringDimSV1 != 's1-6' AND longDimSV1 BETWEEN 10 AND 1000 OR (intDimMV1 < 42 AND stringDimMV2 IN ('m2-0-0', 'm2-2-0') AND intDimMV2 NOT IN (6,72))) GROUP BY longDimSV1, intDimMV1 ORDER BY longDimSV1, intDimMV1 LIMIT 20
SELECT longDimSV1, intDimMV1, distinctCount(longDimSV1), distinctCountMV(intDimMV1), distinctCountHLL(longDimSV1), distinctCountHLLMV(intDimMV1) FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__ AND (stringDimSV1 != 's1-6' AND longDimSV1 BETWEEN 10 AND 1000 OR (intDimMV1 < 42 AND stringDimMV2 IN ('m2-0-0', 'm2-2-0') AND intDimMV2 NOT IN (6,72))) GROUP BY longDimSV1, intDimMV1 ORDER BY longDimSV1, intDimMV1 LIMIT 20

# Transformation Functions
SELECT DISTINCT add(longDimSV1, sub(longDimSV2, 3)), mod(intMetric1, 10), div(doubleMetric1, mult(floatMetric1, 5)) FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__ ORDER BY add(longDimSV1, sub(longDimSV2, 3)) DESC, mod(intMetric1, 10), div(doubleMetric1, mult(floatMetric1, 5))
SELECT DISTINCT floor(sqrt(doubleMetric1)), ceil(ln(longDimSV1)), exp(mod(abs(longDimSV2), 3)) FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__ ORDER BY floor(sqrt(doubleMetric1)), ceil(ln(longDimSV1)), exp(mod(abs(longDimSV2), 3)) DESC
SELECT arrayLength(intDimMV1), arrayLength(valueIn(stringDimMV2, 'm2-2-0', 'm2-3-0')), count(*) FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__ GROUP BY arrayLength(intDimMV1), arrayLength(valueIn(stringDimMV2, 'm2-2-0', 'm2-3-0'))
SELECT valueIn(intDimMV1, 3, 32), count(*) FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__ GROUP BY valueIn(intDimMV1, 3, 32)
SELECT DISTINCT upper(stringDimSV1), lower(textDim1), reverse(stringDimSV2), ltrim(substr(textDim1, 4, 9)), rtrim(substr(textDim1, 4, 9)) FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__ ORDER BY upper(stringDimSV1)
SELECT DISTINCT stringDimSV2, replace(stringDimSV2, 'foo', 'bar'), codePoint(stringDimSV2), rpad(stringDimSV2, 11, 'abc'), lpad(stringDimSV2, 11, 'xyz') from FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__ AND(strPos(stringDimSV2, '2', 2) > 1 OR startsWith(stringDimSV2, 'foo') = 'true') ORDER BY codePoint(stringDimSV2), replace(stringDimSV2, 'foo', 'bar')

# Groovy Scripts
SELECT DISTINCT longDimSV1, longDimSV2, groovy('{"returnType":"LONG","isSingleValue":true}', 'arg0 + arg1', longDimSV1, longDimSV2) FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__
SELECT count(*), groovy('{"returnType":"STRING", "isSingleValue":true}', 'def result; if (arg0 < 0) { result = "NEGATIVE"; } else if (arg0 < 10) { result = "SMALL";} else if (arg0 < 50) { result = "MEDIUM";} else{result = "LARGE"}; return result', longDimSV1) FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__ GROUP BY groovy('{"returnType":"STRING", "isSingleValue":true}', 'def result; if (arg0 < 0) { result = "NEGATIVE"; } else if (arg0 < 10) { result = "SMALL";} else if (arg0 < 50) { result = "MEDIUM";} else{result = "LARGE"}; return result', longDimSV1) ORDER BY count(*) DESC
SELECT DISTINCT groovy('{"returnType":"INT","isSingleValue":true}', 'arg0.toList().max()', intDimMV1) FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__ ORDER BY groovy('{"returnType":"INT","isSingleValue":true}', 'arg0.toList().max()', intDimMV1)

# Json Map
SELECT DISTINCT stringDimSV1, mapValue(mapDim1__KEYS, 'k1', mapDim1__VALUES), mapValue(mapDim1__KEYS, 'k4', mapDim1__VALUES) FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__ LIMIT 20
SELECT DISTINCT stringDimSV1, jsonExtractScalar(mapDim2json, '$.k1', 'INT'), jsonExtractScalar(mapDim2json, '$.k4', 'INT') FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__ ORDER BY stringDimSV1, jsonExtractScalar(mapDim2json, '$.k1', 'INT'), jsonExtractScalar(mapDim2json, '$.k4', 'INT') LIMIT 20
SELECT jsonExtractScalar(mapDim2json, '$.k1', 'INT'), max(jsonExtractScalar(mapDim2json, '$.k2', 'INT')), min(jsonExtractScalar(mapDim2json, '$.k3', 'INT')) FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__ GROUP BY jsonExtractScalar(mapDim2json, '$.k1', 'INT') ORDER BY max(jsonExtractScalar(mapDim2json, '$.k2', 'INT')) LIMIT 20

# Misc
SELECT longDimSV1, count(*) FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__ AND REGEXP_LIKE(textDim1, '^Java.*') GROUP BY longDimSV1
SELECT stringDimMV2, count(*) FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__ AND REGEXP_LIKE(stringDimMV2, 'm2.*0') GROUP BY stringDimMV2 LIMIT 20
SELECT stringDimSV1, longDimSV1, intDimMV1, intDimMV2 FROM FeatureTest2 WHERE generationNumber = __GENERATION_NUMBER__ AND (stringDimSV1 != 's1-6' AND longDimSV1 BETWEEN 10 AND 1000 OR (intDimMV1 < 42 AND stringDimMV2 IN ('m2-0-0', 'm2-2-0') AND intDimMV2 NOT IN (6,72))) ORDER BY longDimSV1 DESC, stringDimSV1 LIMIT 100

0 comments on commit e379292

Please sign in to comment.