From e593d704cad18e897fd1187861855f389ed5184e Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Tue, 11 Jul 2017 22:51:27 +0900 Subject: [PATCH 1/2] Add SQL docs for hivemall-spark --- docs/gitbook/SUMMARY.md | 2 + docs/gitbook/spark/binaryclass/a9a_sql.md | 159 +++++++++++++++++++++ docs/gitbook/spark/regression/e2006_sql.md | 151 +++++++++++++++++++ 3 files changed, 312 insertions(+) create mode 100644 docs/gitbook/spark/binaryclass/a9a_sql.md create mode 100644 docs/gitbook/spark/regression/e2006_sql.md diff --git a/docs/gitbook/SUMMARY.md b/docs/gitbook/SUMMARY.md index 539cea13b..f228cfe39 100644 --- a/docs/gitbook/SUMMARY.md +++ b/docs/gitbook/SUMMARY.md @@ -178,9 +178,11 @@ * [Binary Classification](spark/binaryclass/index.md) * [a9a Tutorial for DataFrame](spark/binaryclass/a9a_df.md) + * [a9a Tutorial for SQL](spark/binaryclass/a9a_sql.md) * [Regression](spark/binaryclass/index.md) * [E2006-tfidf regression Tutorial for DataFrame](spark/regression/e2006_df.md) + * [E2006-tfidf regression Tutorial for SQL](spark/regression/e2006_sql.md) * [Generic features](spark/misc/misc.md) * [Top-k Join processing](spark/misc/topk_join.md) diff --git a/docs/gitbook/spark/binaryclass/a9a_sql.md b/docs/gitbook/spark/binaryclass/a9a_sql.md new file mode 100644 index 000000000..7d43837d5 --- /dev/null +++ b/docs/gitbook/spark/binaryclass/a9a_sql.md @@ -0,0 +1,159 @@ + + +a9a +=== +http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#a9a + +Data preparation +================ + +```sh +$ wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a9a +$ wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a9a.t +``` + +```scala +scala> :paste +spark.read.format("libsvm").load("a9a") + .select($"label", to_hivemall_features($"features").as("features")) + .createOrReplaceTempView("rawTrainTable") + +// `label` must be [0.0, 1.0] +sql(""" + CREATE OR REPLACE TEMPORARY VIEW trainTable AS + SELECT rescale(label, -1.0, 1.0) AS label, features + 
FROM rawTrainTable +""") + +scala> spark.table("trainTable").printSchema +root + |-- label: float (nullable = true) + |-- features: vector (nullable = true) + +scala> :paste +spark.read.format("libsvm").load("a9a.t") + .select($"label", to_hivemall_features($"features").as("features")) + .createOrReplaceTempView("rawTestTable") + +sql(""" + CREATE OR REPLACE TEMPORARY VIEW testTable AS + SELECT + rowid() AS rowid, + rescale(label, -1.0, 1.0) AS target, + features + FROM + rawTestTable +""") + +// Caches data to fix row IDs +sql("CACHE TABLE testTable") + +sql(""" + CREATE OR REPLACE TEMPORARY VIEW testTable_exploded AS + SELECT + rowid, + target, + extract_feature(ft) AS feature, + extract_weight(ft) AS value + FROM ( + SELECT + rowid, + target, + explode(features) AS ft + FROM + testTable + ) +""") + +scala> spark.table("testTable_exploded").printSchema +root + |-- rowid: string (nullable = true) + |-- target: float (nullable = true) + |-- feature: string (nullable = true) + |-- value: double (nullable = true) +``` + +Tutorials +================ + +[Logistic Regression] +--- + +#Training + +```scala +scala> :paste +sql(""" + CREATE OR REPLACE TEMPORARY VIEW modelTable AS + SELECT + feature, AVG(weight) AS weight + FROM ( + SELECT + train_logistic_regr(add_bias(features), label) AS (feature, weight) + FROM + trainTable + ) + GROUP BY + feature +""") +``` + +#Test + +```scala +scala> :paste +sql(""" + CREATE OR REPLACE TEMPORARY VIEW predicted AS + SELECT + rowid, + CASE + WHEN sigmoid(sum(weight * value)) > 0.50 THEN 1.0 + ELSE 0.0 + END AS predicted + FROM + testTable_exploded t LEFT OUTER JOIN modelTable m + ON t.feature = m.feature + GROUP BY + rowid +""") +``` + +#Evaluation + +```scala +val num_test_instances = spark.table("testTable").count + +sql(s""" + SELECT + count(1) / $num_test_instances AS eval + FROM + predicted p INNER JOIN testTable t + ON p.rowid = t.rowid + WHERE + p.predicted = t.target +""") + ++------------------+ +| eval| ++------------------+ +|0.8327921286841418| ++------------------+ 
+``` + diff --git a/docs/gitbook/spark/regression/e2006_sql.md b/docs/gitbook/spark/regression/e2006_sql.md new file mode 100644 index 000000000..ae871ebc1 --- /dev/null +++ b/docs/gitbook/spark/regression/e2006_sql.md @@ -0,0 +1,152 @@ + + +E2006 +=== +http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression.html#E2006-tfidf + +Data preparation +================ + +```sh +$ wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.train.bz2 +$ wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.test.bz2 +``` + +```scala +scala> :paste +spark.read.format("libsvm").load("E2006.train.bz2") + .select($"label", to_hivemall_features($"features").as("features")) + .createOrReplaceTempView("rawTrainTable") + +// `label` must be [0.0, 1.0] +sql(""" + CREATE OR REPLACE TEMPORARY VIEW trainTable AS + SELECT rescale(label, -7.899578, -0.51940954) AS label, features + FROM rawTrainTable +""") + +scala> spark.table("trainTable").printSchema +root + |-- label: float (nullable = true) + |-- features: vector (nullable = true) + +scala> :paste +spark.read.format("libsvm").load("E2006.test.bz2") + .select($"label", to_hivemall_features($"features").as("features")) + .createOrReplaceTempView("rawTestTable") + +sql(""" + CREATE OR REPLACE TEMPORARY VIEW testTable AS + SELECT + rowid() AS rowid, + rescale(label, -7.899578, -0.51940954) AS target, + features + FROM + rawTestTable +""") + +// Caches data to fix row IDs +sql("CACHE TABLE testTable") + +sql(""" + CREATE OR REPLACE TEMPORARY VIEW testTable_exploded AS + SELECT + rowid, + target, + extract_feature(ft) AS feature, + extract_weight(ft) AS value + FROM ( + SELECT + rowid, + target, + explode(features) AS ft + FROM + testTable + ) +""") + +scala> spark.table("testTable_exploded").printSchema +root + |-- rowid: string (nullable = true) + |-- target: float (nullable = true) + |-- feature: string (nullable = true) + |-- value: double (nullable = true) +``` + +Tutorials +================ + +[AROWe2] +--- + +#Training + +```scala 
+scala> :paste +sql(""" + CREATE OR REPLACE TEMPORARY VIEW modelTable AS + SELECT + feature, AVG(weight) AS weight + FROM ( + SELECT + train_arowe2_regr(add_bias(features), label) AS (feature, weight) + FROM + trainTable + ) + GROUP BY + feature +""") +``` + +#Test + +```scala +scala> :paste +sql(""" + CREATE OR REPLACE TEMPORARY VIEW predicted AS + SELECT + rowid, sum(weight * value) AS predicted + FROM + testTable_exploded t LEFT OUTER JOIN modelTable m + ON t.feature = m.feature + GROUP BY + rowid +""") +``` + +#Evaluation + +```scala +scala> :paste +sql(s""" + SELECT + AVG(target), AVG(predicted) + FROM + predicted p INNER JOIN testTable t + ON p.rowid = t.rowid +""") + ++------------------+------------------+ +| avg(target)| avg(predicted)| ++------------------+------------------+ +|0.5489154884487879|0.6030108853227014| ++------------------+------------------+ +``` + From eb3f3f794da4c3c1a319bc930f8bca8707a23d3e Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Wed, 12 Jul 2017 08:58:47 +0900 Subject: [PATCH 2/2] Fix minor comments --- docs/gitbook/spark/binaryclass/a9a_sql.md | 12 ++++++++---- docs/gitbook/spark/regression/e2006_sql.md | 12 ++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/docs/gitbook/spark/binaryclass/a9a_sql.md b/docs/gitbook/spark/binaryclass/a9a_sql.md index 7d43837d5..06734d9d7 100644 --- a/docs/gitbook/spark/binaryclass/a9a_sql.md +++ b/docs/gitbook/spark/binaryclass/a9a_sql.md @@ -35,10 +35,14 @@ park.read.format("libsvm").load("a9a") .select($"label", to_hivemall_features($"features").as("features")) .createOrReplaceTempView("rawTrainTable") +val (max, min) = sql("SELECT MAX(label), MIN(label) FROM rawTrainTable").collect.map { + case Row(max: Double, min: Double) => (max, min) +}.head + // `label` must be [0.0, 1.0] -sql(""" +sql(s""" CREATE OR REPLACE TEMPORARY VIEW trainTable AS - SELECT rescale(label, -1.0, 1.0) AS label, features + SELECT rescale(label, $min, $max) AS label, features FROM 
rawTrainTable """) @@ -52,11 +56,11 @@ spark.read.format("libsvm").load("a9a.t") .select($"label", to_hivemall_features($"features").as("features")) .createOrReplaceTempView("rawTestTable") -sql(""" +sql(s""" CREATE OR REPLACE TEMPORARY VIEW testTable AS SELECT rowid() AS rowid, - rescale(label, -1.0, 1.0) AS target, + rescale(label, $min, $max) AS target, features FROM rawTestTable diff --git a/docs/gitbook/spark/regression/e2006_sql.md b/docs/gitbook/spark/regression/e2006_sql.md index ae871ebc1..48477d147 100644 --- a/docs/gitbook/spark/regression/e2006_sql.md +++ b/docs/gitbook/spark/regression/e2006_sql.md @@ -35,10 +35,14 @@ spark.read.format("libsvm").load("E2006.train.bz2") .select($"label", to_hivemall_features($"features").as("features")) .createOrReplaceTempView("rawTrainTable") +val (max, min) = sql("SELECT MAX(label), MIN(label) FROM rawTrainTable").collect.map { + case Row(max: Double, min: Double) => (max, min) +}.head + // `label` must be [0.0, 1.0] -sql(""" +sql(s""" CREATE OR REPLACE TEMPORARY VIEW trainTable AS - SELECT rescale(label, -7.899578, -0.51940954) AS label, features + SELECT rescale(label, $min, $max) AS label, features FROM rawTrainTable """) @@ -52,11 +56,11 @@ spark.read.format("libsvm").load("E2006.test.bz2") .select($"label", to_hivemall_features($"features").as("features")) .createOrReplaceTempView("rawTestTable") -sql(""" +sql(s""" CREATE OR REPLACE TEMPORARY VIEW testTable AS SELECT rowid() AS rowid, - rescale(label, -7.899578, -0.51940954) AS target, + rescale(label, $min, $max) AS target, features FROM rawTestTable