
First commit.

1 parent 1b4f64b commit 322d3d128e32ed2b86f78b5f540ac324af9ab1fb @cartershanklin committed Dec 11, 2013
Showing with 4,191 additions and 1 deletion.
  1. +60 −1 README.md
  2. +31 −0 ddl/bin_flat/add_partition_inventory.sql
  3. +50 −0 ddl/bin_flat/add_partition_store_sales.sql
  4. +32 −0 ddl/bin_flat/analyze.sql
  5. +9 −0 ddl/bin_flat/call_center.sql
  6. +9 −0 ddl/bin_flat/catalog_page.sql
  7. +9 −0 ddl/bin_flat/catalog_returns.sql
  8. +9 −0 ddl/bin_flat/catalog_sales.sql
  9. +9 −0 ddl/bin_flat/customer.sql
  10. +9 −0 ddl/bin_flat/customer_address.sql
  11. +9 −0 ddl/bin_flat/customer_demographics.sql
  12. +9 −0 ddl/bin_flat/date_dim.sql
  13. +5 −0 ddl/bin_flat/drop_tmp_tables.sql
  14. +9 −0 ddl/bin_flat/household_demographics.sql
  15. +9 −0 ddl/bin_flat/income_band.sql
  16. +9 −0 ddl/bin_flat/inventory.sql
  17. +9 −0 ddl/bin_flat/item.sql
  18. +9 −0 ddl/bin_flat/promotion.sql
  19. +9 −0 ddl/bin_flat/reason.sql
  20. +9 −0 ddl/bin_flat/ship_mode.sql
  21. +9 −0 ddl/bin_flat/store.sql
  22. +9 −0 ddl/bin_flat/store_returns.sql
  23. +9 −0 ddl/bin_flat/store_sales.sql
  24. +9 −0 ddl/bin_flat/time_dim.sql
  25. +9 −0 ddl/bin_flat/warehouse.sql
  26. +9 −0 ddl/bin_flat/web_page.sql
  27. +9 −0 ddl/bin_flat/web_returns.sql
  28. +9 −0 ddl/bin_flat/web_sales.sql
  29. +9 −0 ddl/bin_flat/web_site.sql
  30. +31 −0 ddl/bin_partitioned/analyze.sql
  31. +9 −0 ddl/bin_partitioned/call_center.sql
  32. +9 −0 ddl/bin_partitioned/catalog_page.sql
  33. +103 −0 ddl/bin_partitioned/catalog_returns.sql
  34. +117 −0 ddl/bin_partitioned/catalog_sales.sql
  35. +9 −0 ddl/bin_partitioned/customer.sql
  36. +9 −0 ddl/bin_partitioned/customer_address.sql
  37. +9 −0 ddl/bin_partitioned/customer_demographics.sql
  38. +9 −0 ddl/bin_partitioned/date_dim.sql
  39. +5 −0 ddl/bin_partitioned/drop_tmp_tables.sql
  40. +9 −0 ddl/bin_partitioned/household_demographics.sql
  41. +9 −0 ddl/bin_partitioned/income_band.sql
  42. +55 −0 ddl/bin_partitioned/inventory.sql
  43. +39 −0 ddl/bin_partitioned/item.sql
  44. +9 −0 ddl/bin_partitioned/promotion.sql
  45. +9 −0 ddl/bin_partitioned/reason.sql
  46. +9 −0 ddl/bin_partitioned/ship_mode.sql
  47. +9 −0 ddl/bin_partitioned/store.sql
  48. +89 −0 ddl/bin_partitioned/store_returns.sql
  49. +95 −0 ddl/bin_partitioned/store_sales.sql
  50. +9 −0 ddl/bin_partitioned/time_dim.sql
  51. +9 −0 ddl/bin_partitioned/warehouse.sql
  52. +9 −0 ddl/bin_partitioned/web_page.sql
  53. +97 −0 ddl/bin_partitioned/web_returns.sql
  54. +117 −0 ddl/bin_partitioned/web_sales.sql
  55. +9 −0 ddl/bin_partitioned/web_site.sql
  56. +40 −0 ddl/text/call_center.sql
  57. +18 −0 ddl/text/catalog_page.sql
  58. +37 −0 ddl/text/catalog_returns.sql
  59. +44 −0 ddl/text/catalog_sales.sql
  60. +28 −0 ddl/text/customer.sql
  61. +23 −0 ddl/text/customer_address.sql
  62. +19 −0 ddl/text/customer_demographics.sql
  63. +38 −0 ddl/text/date_dim.sql
  64. +15 −0 ddl/text/household_demographics.sql
  65. +12 −0 ddl/text/income_band.sql
  66. +14 −0 ddl/text/inventory.sql
  67. +32 −0 ddl/text/item.sql
  68. +29 −0 ddl/text/promotion.sql
  69. +12 −0 ddl/text/reason.sql
  70. +15 −0 ddl/text/ship_mode.sql
  71. +39 −0 ddl/text/store.sql
  72. +30 −0 ddl/text/store_returns.sql
  73. +33 −0 ddl/text/store_sales.sql
  74. +20 −0 ddl/text/time_dim.sql
  75. +23 −0 ddl/text/warehouse.sql
  76. +23 −0 ddl/text/web_page.sql
  77. +34 −0 ddl/text/web_returns.sql
  78. +44 −0 ddl/text/web_sales.sql
  79. +36 −0 ddl/text/web_site.sql
  80. +4 −0 sample-queries/README.md
  81. +33 −0 sample-queries/query12.sql
  82. +45 −0 sample-queries/query13.sql
  83. +18 −0 sample-queries/query15.sql
  84. +30 −0 sample-queries/query17.sql
  85. +34 −0 sample-queries/query18.sql
  86. +26 −0 sample-queries/query19.sql
  87. +29 −0 sample-queries/query20.sql
  88. +26 −0 sample-queries/query21.sql
  89. +16 −0 sample-queries/query22.sql
  90. +21 −0 sample-queries/query26.sql
  91. +25 −0 sample-queries/query27.sql
  92. +19 −0 sample-queries/query28.sql
  93. +19 −0 sample-queries/query3.sql
  94. +19 −0 sample-queries/query32.sql
  95. +32 −0 sample-queries/query34.sql
  96. +37 −0 sample-queries/query39.sql
  97. +24 −0 sample-queries/query40.sql
  98. +20 −0 sample-queries/query42.sql
  99. +19 −0 sample-queries/query43.sql
  100. +17 −0 sample-queries/query45.sql
  101. +35 −0 sample-queries/query46.sql
  102. +60 −0 sample-queries/query48.sql
  103. +72 −0 sample-queries/query49.sql
  104. +30 −0 sample-queries/query50.sql
  105. +20 −0 sample-queries/query52.sql
  106. +15 −0 sample-queries/query55.sql
  107. +48 −0 sample-queries/query58.sql
  108. +97 −0 sample-queries/query64.sql
  109. +119 −0 sample-queries/query66.sql
  110. +20 −0 sample-queries/query67.sql
  111. +31 −0 sample-queries/query68.sql
  112. +21 −0 sample-queries/query7.sql
  113. +28 −0 sample-queries/query70.sql
  114. +43 −0 sample-queries/query71.sql
  115. +32 −0 sample-queries/query72.sql
  116. +30 −0 sample-queries/query73.sql
  117. +24 −0 sample-queries/query76.sql
  118. +24 −0 sample-queries/query79.sql
  119. +13 −0 sample-queries/query82.sql
  120. +15 −0 sample-queries/query84.sql
  121. +78 −0 sample-queries/query85.sql
  122. +36 −0 sample-queries/query87.sql
  123. +101 −0 sample-queries/query88.sql
  124. +29 −0 sample-queries/query89.sql
  125. +24 −0 sample-queries/query90.sql
  126. +25 −0 sample-queries/query91.sql
  127. +23 −0 sample-queries/query92.sql
  128. +18 −0 sample-queries/query93.sql
  129. +19 −0 sample-queries/query94.sql
  130. +24 −0 sample-queries/query95.sql
  131. +14 −0 sample-queries/query96.sql
  132. +23 −0 sample-queries/query97.sql
  133. +31 −0 sample-queries/query98.sql
  134. +27 −0 settings/init.sql
  135. +14 −0 settings/load.sql
  136. +22 −0 tpcds-gen/Makefile
  137. +20 −0 tpcds-gen/README.md
  138. +86 −0 tpcds-gen/pom.xml
  139. +218 −0 tpcds-gen/src/main/java/org/notmysock/tpcds/GenTable.java
  140. +21 −0 tpcds-gen/tpcds-buffered.patch
  141. +22 −0 tpcds-gen/tpcds-strcpy.patch
  142. +41 −0 tpcds-setup-sandbox.sh
  143. +41 −0 tpcds-setup.sh
@@ -1,4 +1,63 @@
hive-testbench
==============
-Testbench for experimenting with Apache Hive at any data scale.
+A testbench for experimenting with Apache Hive at any data scale.
+
+Overview
+========
+
+The hive-testbench is a data generator and a set of queries that let you experiment with Apache Hive at scale. The testbench lets you measure baseline Hive performance on large datasets and provides an easy way to see the impact of Hive tuning parameters and advanced settings.
+
+Prerequisites
+=============
+
+You will need:
+* A Linux-based HDP cluster (or Sandbox).
+* Between 15 minutes and 6 hours to generate data (depending on the Scale Factor you choose and available hardware).
+
+Install and Setup
+=================
+
+- Optional: Install a Tez-capable version of Hive.
+ If you want to compare and contrast Hive on Map/Reduce versus Hive on Tez, install a version of Hive that works with Tez. For now that means installing the [Stinger Phase 3 Beta](http://www.hortonworks.com). Hive 13+, when released, will also work.
+
+- Step 1: Prepare your environment.
+ Before you begin, you will need to install gcc, flex, bison and maven on your system. These are needed to compile the data generation program and package it to run inside Hadoop. They only need to be installed on one node of your Hadoop cluster.
+
+ On Ubuntu systems you can install all of these via "sudo apt-get install gcc flex bison maven".
+ On RHEL / CentOS, most of these are available; start with "sudo yum install gcc flex bison". Maven must be installed the old-fashioned way, by downloading it from http://maven.apache.org/download.cgi. Alternatively, the script "installMaven.sh" might work for you.
+
+- Step 2: Compile and package the data generator.
+ Data will be generated using the "dsdgen" program from the [TPC-DS](http://www.tpc.org/tpcds/) benchmark suite. It will first be compiled, then packaged in a JAR file so it can be run on all nodes in your Hadoop cluster.
+
+ To compile and package the generator, run make from inside the tpcds-gen directory (this uses the Makefile shipped with the testbench).
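+
+ Example: cd tpcds-gen && make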
+
+- Step 3: Create a working directory in HDFS.
+ Before generating data, create a directory in HDFS that will hold the generated data. This directory can be removed later if you need the space. You will need this path later, during data generation.
+
+ Example: hadoop fs -mkdir /tmp/tpcds-staging
+
+- Step 4: Decide how much data you want to generate.
+ You need to decide on a "Scale Factor", which represents how much data you will generate. Scale Factor roughly translates to gigabytes, so a Scale Factor of 100 is about 100 gigabytes and a Scale Factor of 1000 is about one terabyte. Decide how much data you want and keep it in mind for the next step. If you have a cluster of 4-10 nodes, or just want to experiment at a smaller scale, Scale 200 (200 GB) is a good starting point. If you have a large cluster, you may want to choose Scale 1000 (1 TB) or more.
+
+- Step 5: Generate and load the data.
+ In this step you will do the actual data generation.
+
+- Step 5a: Generate data on a Hadoop cluster.
+ Use this approach if you want to try Hive out at scale. It assumes you have multiple physical Hadoop nodes with plenty of RAM. All tables will be created, and the large tables will be partitioned by date and bucketed, which improves performance for queries that can take advantage of partition pruning or SMB joins.
+
+ Example: ./tpcds-setup.sh 200 /tmp/tpcds-staging
+
+- Step 5b: Generate data on a Sandbox.
+ Use this approach if you want to try Hive or Hive/Tez in a Sandbox environment. This is for experimentation only, so do not generate too much data this way; 20 GB or less is appropriate. This approach does not partition data.
+
+ Example: ./tpcds-setup.sh 10 /tmp/tpcds-staging
+
+- Step 6: Run queries.
+ More than 50 sample TPC-DS queries are included for you to try. You can use the Hive CLI, Beeline or the SQL tool of your choice.
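+
+ Example: hive --database tpcds_bin_partitioned_orc_200 -f sample-queries/query55.sql
+ (The database name here is illustrative; check the output of tpcds-setup.sh for the name of the database it actually created.)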
+
+Feedback
+========
+If you have questions, comments or problems, visit the [Hortonworks Hive forum](http://www.hortonworks.com).
+
+If you have improvements, pull requests are accepted.
@@ -0,0 +1,31 @@
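+-- Load the flat (unpartitioned) inventory data into the date-partitioned
+-- inventory table using dynamic partitioning. The settings below raise the
+-- dynamic-partition and file-creation limits so a single load can create
+-- one partition per date.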
+set hive.enforce.bucketing=true;
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.exec.max.dynamic.partitions.pernode=1000000;
+set hive.exec.max.dynamic.partitions=1000000;
+set hive.exec.max.created.files=1000000;
+set hive.metastore.uris=;
+
+create database if not exists ${DB};
+use ${DB};
+
+drop table if exists inventory_part;
+
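+-- Temporarily move the generated flat files under ${TMP_DIR} so the
+-- external staging table below can read them.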
+dfs -mv ${LOCATION} ${TMP_DIR}/part;
+
+create external table inventory_part
+(
+ inv_item_sk int,
+ inv_warehouse_sk int,
+ inv_quantity_on_hand int,
+ inv_date string
+)
+row format serde '${SERDE}'
+stored as ${FILE}
+location '${TMP_DIR}';
+
+insert into table inventory partition(inv_date)
+select * from inventory_part;
+
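+-- Move the flat files back to their original location now that the load is done.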
+dfs -mv ${TMP_DIR}/part ${LOCATION};
+
+drop table inventory_part;
@@ -0,0 +1,50 @@
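+-- Same staging pattern as add_partition_inventory.sql: load the flat
+-- store_sales data into the store_sales table, dynamically partitioned
+-- by ss_sold_date.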
+set hive.enforce.bucketing=true;
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.exec.max.dynamic.partitions.pernode=1000000;
+set hive.exec.max.dynamic.partitions=1000000;
+set hive.exec.max.created.files=1000000;
+set hive.metastore.uris=;
+
+create database if not exists ${DB};
+use ${DB};
+
+drop table if exists store_sales_part;
+
+dfs -mv ${LOCATION} ${TMP_DIR}/part;
+
+create external table store_sales_part
+(
+ ss_sold_time_sk int,
+ ss_item_sk int,
+ ss_customer_sk int,
+ ss_cdemo_sk int,
+ ss_hdemo_sk int,
+ ss_addr_sk int,
+ ss_store_sk int,
+ ss_promo_sk int,
+ ss_ticket_number int,
+ ss_quantity int,
+ ss_wholesale_cost float,
+ ss_list_price float,
+ ss_sales_price float,
+ ss_ext_discount_amt float,
+ ss_ext_sales_price float,
+ ss_ext_wholesale_cost float,
+ ss_ext_list_price float,
+ ss_ext_tax float,
+ ss_coupon_amt float,
+ ss_net_paid float,
+ ss_net_paid_inc_tax float,
+ ss_net_profit float,
+ ss_sold_date string
+)
+row format serde '${SERDE}'
+stored as ${FILE}
+location '${TMP_DIR}';
+
+insert into table store_sales partition(ss_sold_date)
+select * from store_sales_part;
+
+dfs -mv ${TMP_DIR}/part ${LOCATION};
+
+drop table store_sales_part;
@@ -0,0 +1,32 @@
+use ${DB};
+ADD JAR file://${mysql_jar};
+
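+-- Gather table-level statistics so the optimizer has row counts and sizes to work with.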
+ANALYZE TABLE date_dim COMPUTE STATISTICS;
+ANALYZE TABLE time_dim COMPUTE STATISTICS;
+ANALYZE TABLE item COMPUTE STATISTICS;
+ANALYZE TABLE customer COMPUTE STATISTICS;
+ANALYZE TABLE customer_demographics COMPUTE STATISTICS;
+ANALYZE TABLE household_demographics COMPUTE STATISTICS;
+ANALYZE TABLE customer_address COMPUTE STATISTICS;
+ANALYZE TABLE store COMPUTE STATISTICS;
+ANALYZE TABLE promotion COMPUTE STATISTICS;
+ANALYZE TABLE web_site COMPUTE STATISTICS;
+
+ANALYZE TABLE inventory COMPUTE STATISTICS;
+ANALYZE TABLE store_sales COMPUTE STATISTICS;
+ANALYZE TABLE store_returns COMPUTE STATISTICS;
+ANALYZE TABLE web_sales COMPUTE STATISTICS;
+ANALYZE TABLE web_returns COMPUTE STATISTICS;
+ANALYZE TABLE catalog_sales COMPUTE STATISTICS;
+ANALYZE TABLE catalog_returns COMPUTE STATISTICS;
+
+ANALYZE TABLE web_page COMPUTE STATISTICS;
+ANALYZE TABLE income_band COMPUTE STATISTICS;
+ANALYZE TABLE call_center COMPUTE STATISTICS;
+ANALYZE TABLE ship_mode COMPUTE STATISTICS;
+ANALYZE TABLE reason COMPUTE STATISTICS;
+ANALYZE TABLE catalog_page COMPUTE STATISTICS;
+ANALYZE TABLE warehouse COMPUTE STATISTICS;
@@ -0,0 +1,9 @@
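+-- Copy call_center from the source schema into ${DB}, stored with the
+-- configured serde and file format. The other per-table scripts in this
+-- directory follow the same pattern.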
+create database if not exists ${DB};
+use ${DB};
+
+drop table if exists call_center;
+
+create table call_center
+row format serde '${SERDE}'
+stored as ${FILE}
+as select * from ${SOURCE}.call_center;
@@ -0,0 +1,9 @@
+create database if not exists ${DB};
+use ${DB};
+
+drop table if exists catalog_page;
+
+create table catalog_page
+row format serde '${SERDE}'
+stored as ${FILE}
+as select * from ${SOURCE}.catalog_page;
@@ -0,0 +1,9 @@
+create database if not exists ${DB};
+use ${DB};
+
+drop table if exists catalog_returns;
+
+create table catalog_returns
+row format serde '${SERDE}'
+stored as ${FILE}
+as select * from ${SOURCE}.catalog_returns;
@@ -0,0 +1,9 @@
+create database if not exists ${DB};
+use ${DB};
+
+drop table if exists catalog_sales;
+
+create table catalog_sales
+row format serde '${SERDE}'
+stored as ${FILE}
+as select * from ${SOURCE}.catalog_sales;
@@ -0,0 +1,9 @@
+create database if not exists ${DB};
+use ${DB};
+
+drop table if exists customer;
+
+create table customer
+row format serde '${SERDE}'
+stored as ${FILE}
+as select * from ${SOURCE}.customer;
@@ -0,0 +1,9 @@
+create database if not exists ${DB};
+use ${DB};
+
+drop table if exists customer_address;
+
+create table customer_address
+row format serde '${SERDE}'
+stored as ${FILE}
+as select * from ${SOURCE}.customer_address;
@@ -0,0 +1,9 @@
+create database if not exists ${DB};
+use ${DB};
+
+drop table if exists customer_demographics;
+
+create table customer_demographics
+row format serde '${SERDE}'
+stored as ${FILE}
+as select * from ${SOURCE}.customer_demographics;
@@ -0,0 +1,9 @@
+create database if not exists ${DB};
+use ${DB};
+
+drop table if exists date_dim;
+
+create table date_dim
+row format serde '${SERDE}'
+stored as ${FILE}
+as select * from ${SOURCE}.date_dim;
@@ -0,0 +1,5 @@
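+-- Drop any temporary tables left over from data loading.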
+create database if not exists ${DB};
+use ${DB};
+
+drop table if exists store_sales_tmp;
+drop table if exists inventory_tmp;
@@ -0,0 +1,9 @@
+create database if not exists ${DB};
+use ${DB};
+
+drop table if exists household_demographics;
+
+create table household_demographics
+row format serde '${SERDE}'
+stored as ${FILE}
+as select * from ${SOURCE}.household_demographics;
@@ -0,0 +1,9 @@
+create database if not exists ${DB};
+use ${DB};
+
+drop table if exists income_band;
+
+create table income_band
+row format serde '${SERDE}'
+stored as ${FILE}
+as select * from ${SOURCE}.income_band;
@@ -0,0 +1,9 @@
+create database if not exists ${DB};
+use ${DB};
+
+drop table if exists inventory;
+
+create table inventory
+row format serde '${SERDE}'
+stored as ${FILE}
+as select * from ${SOURCE}.inventory;
@@ -0,0 +1,9 @@
+create database if not exists ${DB};
+use ${DB};
+
+drop table if exists item;
+
+create table item
+row format serde '${SERDE}'
+stored as ${FILE}
+as select * from ${SOURCE}.item;
@@ -0,0 +1,9 @@
+create database if not exists ${DB};
+use ${DB};
+
+drop table if exists promotion;
+
+create table promotion
+row format serde '${SERDE}'
+stored as ${FILE}
+as select * from ${SOURCE}.promotion;
@@ -0,0 +1,9 @@
+create database if not exists ${DB};
+use ${DB};
+
+drop table if exists reason;
+
+create table reason
+row format serde '${SERDE}'
+stored as ${FILE}
+as select * from ${SOURCE}.reason;
@@ -0,0 +1,9 @@
+create database if not exists ${DB};
+use ${DB};
+
+drop table if exists ship_mode;
+
+create table ship_mode
+row format serde '${SERDE}'
+stored as ${FILE}
+as select * from ${SOURCE}.ship_mode;
@@ -0,0 +1,9 @@
+create database if not exists ${DB};
+use ${DB};
+
+drop table if exists store;
+
+create table store
+row format serde '${SERDE}'
+stored as ${FILE}
+as select * from ${SOURCE}.store;
@@ -0,0 +1,9 @@
+create database if not exists ${DB};
+use ${DB};
+
+drop table if exists store_returns;
+
+create table store_returns
+row format serde '${SERDE}'
+stored as ${FILE}
+as select * from ${SOURCE}.store_returns;
@@ -0,0 +1,9 @@
+create database if not exists ${DB};
+use ${DB};
+
+drop table if exists store_sales;
+
+create table store_sales
+row format serde '${SERDE}'
+stored as ${FILE}
+as select * from ${SOURCE}.store_sales;
@@ -0,0 +1,9 @@
+create database if not exists ${DB};
+use ${DB};
+
+drop table if exists time_dim;
+
+create table time_dim
+row format serde '${SERDE}'
+stored as ${FILE}
+as select * from ${SOURCE}.time_dim;
@@ -0,0 +1,9 @@
+create database if not exists ${DB};
+use ${DB};
+
+drop table if exists warehouse;
+
+create table warehouse
+row format serde '${SERDE}'
+stored as ${FILE}
+as select * from ${SOURCE}.warehouse;
@@ -0,0 +1,9 @@
+create database if not exists ${DB};
+use ${DB};
+
+drop table if exists web_page;
+
+create table web_page
+row format serde '${SERDE}'
+stored as ${FILE}
+as select * from ${SOURCE}.web_page;