[CARBONDATA-2434] Add ExternalTableExample and LuceneDataMapExample
In preparation for the 1.4.0 release.

This closes #2268
chenliang613 authored and ravipesala committed May 8, 2018
1 parent 09feb9c commit d5da9a1
Showing 4 changed files with 234 additions and 6 deletions.
examples/spark2/src/main/scala/org/apache/carbondata/examples/ExternalTableExample.scala
@@ -0,0 +1,104 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.carbondata.examples

import java.io.File

import org.apache.spark.sql.{CarbonEnv, SaveMode, SparkSession}

import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.metadata.CarbonTableIdentifier
import org.apache.carbondata.core.util.CarbonProperties
import org.apache.carbondata.examples.util.ExampleUtils

/**
 * This example shows how to create an external table that points to
 * the location of an existing carbon table.
 */
object ExternalTableExample {

  def main(args: Array[String]) {
    val spark = ExampleUtils.createCarbonSession("ExternalTableExample")
    exampleBody(spark)
    spark.close()
  }

  def exampleBody(spark: SparkSession): Unit = {

    CarbonProperties.getInstance()
      .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, "yyyy/MM/dd")

    // Create origin_table
    spark.sql("DROP TABLE IF EXISTS origin_table")
    spark.sql(
      s"""
         | CREATE TABLE origin_table(
         |   shortField SHORT,
         |   intField INT,
         |   bigintField LONG,
         |   doubleField DOUBLE,
         |   stringField STRING,
         |   timestampField TIMESTAMP,
         |   decimalField DECIMAL(18,2),
         |   dateField DATE,
         |   charField CHAR(5),
         |   floatField FLOAT
         | )
         | STORED BY 'carbondata'
       """.stripMargin)

    val rootPath = new File(this.getClass.getResource("/").getPath
      + "../../../..").getCanonicalPath
    val path = s"$rootPath/examples/spark2/src/main/resources/data.csv"

    // Load the CSV file 4 times; each load adds 10 rows
    // scalastyle:off
    (1 to 4).foreach(_ => spark.sql(
      s"""
         | LOAD DATA LOCAL INPATH '$path'
         | INTO TABLE origin_table
         | OPTIONS('HEADER'='true', 'COMPLEX_DELIMITER_LEVEL_1'='#')
       """.stripMargin))
    // scalastyle:on

    // Should show 40 rows
    spark.sql("SELECT count(*) FROM origin_table").show()

    // Resolve the physical store location of origin_table
    val origin_table_path = CarbonEnv.getTablePath(Some("default"), "origin_table")(spark)

    // Create external_table pointing at origin_table's store location
    spark.sql("DROP TABLE IF EXISTS external_table")
    spark.sql("CREATE EXTERNAL TABLE external_table STORED BY 'carbondata'" +
      s" LOCATION '$origin_table_path'")
    // Should show 40 rows, the same data as origin_table
    spark.sql("SELECT count(*) FROM external_table").show()

    // Load into origin_table 2 more times
    (1 to 2).foreach(_ => spark.sql(
      s"""
         | LOAD DATA LOCAL INPATH '$path'
         | INTO TABLE origin_table
         | OPTIONS('HEADER'='true', 'COMPLEX_DELIMITER_LEVEL_1'='#')
       """.stripMargin))

    // external_table reads the same files, so it now shows 60 rows
    spark.sql("SELECT count(*) FROM external_table").show()

    // Drop tables
    spark.sql("DROP TABLE IF EXISTS origin_table")
    spark.sql("DROP TABLE IF EXISTS external_table")
  }
}
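When running the example interactively (before the final DROP TABLE statements), a standard Spark SQL command can confirm that external_table shares origin_table's files; a minimal sketch, not part of this commit:

// Hypothetical check: DESCRIBE FORMATTED prints the table's Location,
// which should match the path returned by CarbonEnv.getTablePath above.
spark.sql("DESCRIBE FORMATTED external_table").show(50, false)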
examples/spark2/src/main/scala/org/apache/carbondata/examples/LuceneDataMapExample.scala
@@ -0,0 +1,116 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.carbondata.examples

import org.apache.spark.sql.{SaveMode, SparkSession}

import org.apache.carbondata.examples.util.ExampleUtils


/**
 * This example shows how to create and use a Lucene datamap for text search.
 */
object LuceneDataMapExample {

  def main(args: Array[String]) {
    val spark = ExampleUtils.createCarbonSession("LuceneDataMapExample")
    exampleBody(spark)
    spark.close()
  }

  def exampleBody(spark: SparkSession): Unit = {

    // Build the test data. Increase the row count for a more obvious comparison;
    // note that loading more than 100 MB of data can take 10+ minutes.
    import scala.util.Random

    import spark.implicits._
    val r = new Random()
    val df = spark.sparkContext.parallelize(1 to 10 * 10 * 1000)
      .map(x => ("which test" + r.nextInt(10000) + " good" + r.nextInt(10),
        "who and name" + x % 8, "city" + x % 50, x % 60))
      .toDF("id", "name", "city", "age")

spark.sql("DROP TABLE IF EXISTS personTable")
df.write.format("carbondata")
.option("tableName", "personTable")
.option("compress", "true")
.mode(SaveMode.Overwrite).save()

    // Create a lucene datamap on personTable, indexing the id and name columns
    spark.sql(
      s"""
         | CREATE DATAMAP IF NOT EXISTS dm ON TABLE personTable
         | USING 'lucene'
         | DMProperties('INDEX_COLUMNS'='id , name')
       """.stripMargin)

    // Build the lucene index for the data already loaded
    spark.sql("REFRESH DATAMAP dm ON TABLE personTable")

    // 1. Compare query performance with and without the lucene datamap

    def time(code: => Unit): Double = {
      val start = System.currentTimeMillis()
      code
      // return the elapsed time in seconds
      (System.currentTimeMillis() - start).toDouble / 1000
    }

    val time_without_lucenedatamap = time {
      spark.sql(
        s"""
           | SELECT count(*)
           | FROM personTable WHERE id LIKE '% test1 %'
         """.stripMargin).show()
    }

    val time_with_lucenedatamap = time {
      spark.sql(
        s"""
           | SELECT count(*)
           | FROM personTable WHERE TEXT_MATCH('id:test1')
         """.stripMargin).show()
    }

    // TEXT_MATCH uses the lucene index to prune the data to scan,
    // so the second query should be noticeably faster
    // scalastyle:off
    println("time for query with lucene datamap: " + time_with_lucenedatamap)
    println("time for query without lucene datamap: " + time_without_lucenedatamap)
    // scalastyle:on

    // 2. Search for rows whose id contains "test1" but not "good1"
    spark.sql(
      s"""
         | SELECT id, name
         | FROM personTable WHERE TEXT_MATCH('id:test1 -id:good1')
       """.stripMargin).show(100)

    // 3. TEXT_MATCH_WITH_LIMIT returns at most the given number of matching rows
    spark.sql(
      s"""
         | SELECT id, name
         | FROM personTable WHERE TEXT_MATCH_WITH_LIMIT('id:test1', 10)
       """.stripMargin).show()

    spark.sql("DROP TABLE IF EXISTS personTable")
  }
}
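TEXT_MATCH hands its argument to Lucene's query parser, so boolean operators and wildcards across the indexed columns also work; a minimal sketch, not part of this commit, assuming the personTable and dm datamap defined above (run before the final DROP TABLE):

// Hypothetical query combining both indexed columns with Lucene syntax:
spark.sql(
  s"""
     | SELECT id, name
     | FROM personTable WHERE TEXT_MATCH('id:test1 AND name:who*')
   """.stripMargin).show()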
examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateDataMapExample.scala
@@ -65,10 +65,6 @@ object PreAggregateDataMapExample {
       LOAD DATA LOCAL INPATH '$testData' into table mainTable
       """)
 
-    spark.sql("""
-      select * from mainTable
-      """)
-
     spark.sql(s"""
       LOAD DATA LOCAL INPATH '$testData' into table mainTable_other
       """)
@@ -152,16 +148,20 @@ object PreAggregateDataMapExample {
 
     // 2. Compare the performance: with pre-aggregate VS the main table
 
-    // build test data, if set the data is larger than 100M, it will take 10+ mins.
+    // Build the test data. Increase the row count for a more obvious comparison;
+    // note that loading more than 100 MB of data can take 10+ minutes.
 
     import spark.implicits._
 
     import scala.util.Random
     val r = new Random()
-    val df = spark.sparkContext.parallelize(1 to 10 * 1000 * 1000)
+    val df = spark.sparkContext.parallelize(1 to 10 * 10 * 1000)
       .map(x => ("No." + r.nextInt(100000), "name" + x % 8, "city" + x % 50, x % 60))
       .toDF("ID", "name", "city", "age")
 
+    // Create table with pre-aggregate
+    spark.sql("DROP TABLE IF EXISTS personTable")
+    spark.sql("DROP TABLE IF EXISTS personTableWithoutAgg")
     df.write.format("carbondata")
       .option("tableName", "personTable")
       .option("compress", "true")
examples/spark2/src/test/scala/org/apache/carbondata/examplesCI/RunExamples.scala
@@ -105,4 +105,12 @@ class RunExamples extends QueryTest with BeforeAndAfterAll {
   test("TimeSeriesPreAggregateTableExample") {
     TimeSeriesPreAggregateTableExample.exampleBody(spark)
   }
+
+  test("LuceneDataMapExample") {
+    LuceneDataMapExample.exampleBody(spark)
+  }
+
+  test("ExternalTableExample") {
+    ExternalTableExample.exampleBody(spark)
+  }
 }
