Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

index_parallel with single dimension partitioning doesn't honour maxRowsPerSegment/targetRowsPerSegment and creates skewed segments #10693

Open
prabcs opened this issue Dec 18, 2020 · 3 comments

Comments

@prabcs
Copy link
Contributor

prabcs commented Dec 18, 2020

Affected Version

0.20.0

Description

Click here to see ingest spec with maxRowsPerSegment
{
  "type": "index_parallel",
  "spec": {
    "dataSchema": {
      "dataSource": "test_s_d_prabh_1",
      "timestampSpec": {
        "column": "timestamp",
        "format": "iso",
        "missingValue": null
      },
      "dimensionsSpec": {
        "dimensions": [
          {
            "type": "string",
            "name": "countryIsoCode",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": true
          },
          {
            "type": "string",
            "name": "countryName",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": true
          },
          {
            "type": "string",
            "name": "flags",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": true
          },
          {
            "type": "string",
            "name": "isAnonymous",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": true
          },
          {
            "type": "string",
            "name": "isMinor",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": true
          },
          {
            "type": "string",
            "name": "isNew",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": true
          },
          {
            "type": "string",
            "name": "isRobot",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": true
          },
          {
            "type": "string",
            "name": "isUnpatrolled",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": true
          },
          {
            "type": "string",
            "name": "metroCode",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": true
          },
          {
            "type": "string",
            "name": "namespace",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": true
          },
          {
            "type": "string",
            "name": "regionIsoCode",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": true
          }
        ],
        "dimensionExclusions": [
          "sum_commentLength",
          "added",
          "count",
          "delta",
          "sum_deleted",
          "deltaBucket",
          "deleted",
          "sum_deltaBucket",
          "commentLength",
          "sum_added",
          "timestamp",
          "sum_delta"
        ]
      },
      "metricsSpec": [
        {
          "type": "count",
          "name": "count"
        },
        {
          "type": "longSum",
          "name": "sum_added",
          "fieldName": "added",
          "expression": null
        },
        {
          "type": "longSum",
          "name": "sum_commentLength",
          "fieldName": "commentLength",
          "expression": null
        },
        {
          "type": "longSum",
          "name": "sum_deleted",
          "fieldName": "deleted",
          "expression": null
        },
        {
          "type": "longSum",
          "name": "sum_delta",
          "fieldName": "delta",
          "expression": null
        },
        {
          "type": "longSum",
          "name": "sum_deltaBucket",
          "fieldName": "deltaBucket",
          "expression": null
        }
      ],
      "granularitySpec": {
        "type": "uniform",
        "segmentGranularity": "DAY",
        "queryGranularity": "DAY",
        "rollup": true,
        "intervals": [
          "2015-01-01T00:00:00.000Z/2017-01-01T00:00:00.000Z"
        ]
      },
      "transformSpec": {
        "filter": null,
        "transforms": []
      }
    },
    "ioConfig": {
      "type": "index_parallel",
      "inputSource": {
        "type": "http",
        "uris": [
          "https://druid.apache.org/data/wikipedia.json.gz"
        ],
        "httpAuthenticationUsername": null,
        "httpAuthenticationPassword": null
      },
      "inputFormat": {
        "type": "json",
        "flattenSpec": {
          "useFieldDiscovery": true,
          "fields": []
        },
        "featureSpec": {}
      },
      "appendToExisting": false
    },
    "tuningConfig": {
      "type": "index_parallel",
      "maxRowsPerSegment": 10,
      "maxRowsInMemory": 1000000,
      "maxBytesInMemory": 0,
      "maxTotalRows": null,
      "numShards": null,
      "splitHintSpec": null,
      "partitionsSpec": {
        "type": "single_dim",
        "targetRowsPerSegment": null,
        "maxRowsPerSegment": 10,
        "partitionDimension": "namespace",
        "assumeGrouped": false
      },
      "indexSpec": {
        "bitmap": {
          "type": "roaring",
          "compressRunOnSerialization": true
        },
        "dimensionCompression": "lz4",
        "metricCompression": "lz4",
        "longEncoding": "longs",
        "segmentLoader": null
      },
      "indexSpecForIntermediatePersists": {
        "bitmap": {
          "type": "roaring",
          "compressRunOnSerialization": true
        },
        "dimensionCompression": "lz4",
        "metricCompression": "lz4",
        "longEncoding": "longs",
        "segmentLoader": null
      },
      "maxPendingPersists": 0,
      "forceGuaranteedRollup": true,
      "reportParseExceptions": false,
      "pushTimeout": 0,
      "segmentWriteOutMediumFactory": null,
      "maxNumConcurrentSubTasks": 30,
      "maxRetry": 3,
      "taskStatusCheckPeriodMs": 1000,
      "chatHandlerTimeout": "PT10S",
      "chatHandlerNumRetries": 5,
      "maxNumSegmentsToMerge": 100,
      "totalNumMergeTasks": 10,
      "logParseExceptions": false,
      "maxParseExceptions": 2147483647,
      "maxSavedParseExceptions": 0,
      "buildV9Directly": true,
      "partitionDimensions": []
    }
  }
}

After ingesting the above spec in Druid 0.20.0, check the resulting datasource with this query:

select * from sys.segments 
where datasource = 'test_s_d_prabh_1'
order by "num_rows" desc

You'll notice the skewed segment sizes and number of rows.

Screen Shot 2020-12-13 at 10 51 13 PM

Note that, per the docs, this setting is a soft maximum, not a hard limit:

maxRowsPerSegment | Soft max for the number of rows to include in a partition


Now run another ingestion with the following spec, which uses targetRowsPerSegment instead:

Click here to see the ingest spec with that new config
{
  "type": "index_parallel",
  "spec": {
    "dataSchema": {
      "dataSource": "test_s_d_prabh_2",
      "timestampSpec": {
        "column": "timestamp",
        "format": "iso",
        "missingValue": null
      },
      "dimensionsSpec": {
        "dimensions": [
          {
            "type": "string",
            "name": "countryIsoCode",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": true
          },
          {
            "type": "string",
            "name": "countryName",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": true
          },
          {
            "type": "string",
            "name": "flags",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": true
          },
          {
            "type": "string",
            "name": "isAnonymous",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": true
          },
          {
            "type": "string",
            "name": "isMinor",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": true
          },
          {
            "type": "string",
            "name": "isNew",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": true
          },
          {
            "type": "string",
            "name": "isRobot",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": true
          },
          {
            "type": "string",
            "name": "isUnpatrolled",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": true
          },
          {
            "type": "string",
            "name": "metroCode",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": true
          },
          {
            "type": "string",
            "name": "namespace",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": true
          },
          {
            "type": "string",
            "name": "regionIsoCode",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": true
          }
        ],
        "dimensionExclusions": [
          "sum_commentLength",
          "added",
          "count",
          "delta",
          "sum_deleted",
          "deltaBucket",
          "deleted",
          "sum_deltaBucket",
          "commentLength",
          "sum_added",
          "timestamp",
          "sum_delta"
        ]
      },
      "metricsSpec": [
        {
          "type": "count",
          "name": "count"
        },
        {
          "type": "longSum",
          "name": "sum_added",
          "fieldName": "added",
          "expression": null
        },
        {
          "type": "longSum",
          "name": "sum_commentLength",
          "fieldName": "commentLength",
          "expression": null
        },
        {
          "type": "longSum",
          "name": "sum_deleted",
          "fieldName": "deleted",
          "expression": null
        },
        {
          "type": "longSum",
          "name": "sum_delta",
          "fieldName": "delta",
          "expression": null
        },
        {
          "type": "longSum",
          "name": "sum_deltaBucket",
          "fieldName": "deltaBucket",
          "expression": null
        }
      ],
      "granularitySpec": {
        "type": "uniform",
        "segmentGranularity": "DAY",
        "queryGranularity": "DAY",
        "rollup": true,
        "intervals": [
          "2015-01-01T00:00:00.000Z/2017-01-01T00:00:00.000Z"
        ]
      },
      "transformSpec": {
        "filter": null,
        "transforms": []
      }
    },
    "ioConfig": {
      "type": "index_parallel",
      "inputSource": {
        "type": "http",
        "uris": [
          "https://druid.apache.org/data/wikipedia.json.gz"
        ],
        "httpAuthenticationUsername": null,
        "httpAuthenticationPassword": null
      },
      "inputFormat": {
        "type": "json",
        "flattenSpec": {
          "useFieldDiscovery": true,
          "fields": []
        },
        "featureSpec": {}
      },
      "appendToExisting": false
    },
    "tuningConfig": {
      "type": "index_parallel",
      "maxRowsPerSegment": 30,
      "maxRowsInMemory": 1000000,
      "maxBytesInMemory": 0,
      "maxTotalRows": null,
      "numShards": null,
      "splitHintSpec": null,
      "partitionsSpec": {
        "type": "single_dim",
        "targetRowsPerSegment": 20,
        "maxRowsPerSegment": null,
        "partitionDimension": "namespace",
        "assumeGrouped": false
      },
      "indexSpec": {
        "bitmap": {
          "type": "roaring",
          "compressRunOnSerialization": true
        },
        "dimensionCompression": "lz4",
        "metricCompression": "lz4",
        "longEncoding": "longs",
        "segmentLoader": null
      },
      "indexSpecForIntermediatePersists": {
        "bitmap": {
          "type": "roaring",
          "compressRunOnSerialization": true
        },
        "dimensionCompression": "lz4",
        "metricCompression": "lz4",
        "longEncoding": "longs",
        "segmentLoader": null
      },
      "maxPendingPersists": 0,
      "forceGuaranteedRollup": true,
      "reportParseExceptions": false,
      "pushTimeout": 0,
      "segmentWriteOutMediumFactory": null,
      "maxNumConcurrentSubTasks": 10,
      "maxRetry": 3,
      "taskStatusCheckPeriodMs": 1000,
      "chatHandlerTimeout": "PT10S",
      "chatHandlerNumRetries": 5,
      "maxNumSegmentsToMerge": 100,
      "totalNumMergeTasks": 10,
      "logParseExceptions": false,
      "maxParseExceptions": 2147483647,
      "maxSavedParseExceptions": 0,
      "buildV9Directly": true,
      "partitionDimensions": []
    }
  }
}

Now, run this query:

select * from sys.segments 
where datasource = 'test_s_d_prabh_2'
order by "num_rows" desc

You'll still notice skewed segments in terms of size and number of rows.

Screen Shot 2020-12-13 at 11 12 36 PM

@vpeack
Copy link

vpeack commented May 8, 2021

Hi,

I'm facing the exact same issue (my dimension values are not evenly distributed). Have you found a way to make it work?

Thanks a lot

@stale
Copy link

stale bot commented May 1, 2022

This issue has been marked as stale due to 280 days of inactivity. It will be closed in 4 weeks if no further activity occurs. If this issue is still relevant, please simply write any comment. Even if closed, you can still revive the issue at any time or discuss it on the dev@druid.apache.org list. Thank you for your contributions.

@stale stale bot added the stale label May 1, 2022
@toughrogrammer
Copy link

toughrogrammer commented Nov 2, 2022

Same issue here.
I'm on Druid 0.23.0.

@github-actions github-actions bot removed the stale label Oct 5, 2023
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

3 participants