From e593d704cad18e897fd1187861855f389ed5184e Mon Sep 17 00:00:00 2001
From: Takeshi Yamamuro <yamamuro@apache.org>
Date: Tue, 11 Jul 2017 22:51:27 +0900
Subject: [PATCH 1/2] Add SQL docs for hivemall-spark

---
 docs/gitbook/SUMMARY.md                    |   2 +
 docs/gitbook/spark/binaryclass/a9a_sql.md  | 159 +++++++++++++++++++++
 docs/gitbook/spark/regression/e2006_sql.md | 151 +++++++++++++++++++
 3 files changed, 312 insertions(+)
 create mode 100644 docs/gitbook/spark/binaryclass/a9a_sql.md
 create mode 100644 docs/gitbook/spark/regression/e2006_sql.md

diff --git a/docs/gitbook/SUMMARY.md b/docs/gitbook/SUMMARY.md
index 539cea13b..f228cfe39 100644
--- a/docs/gitbook/SUMMARY.md
+++ b/docs/gitbook/SUMMARY.md
@@ -178,9 +178,11 @@
 
 * [Binary Classification](spark/binaryclass/index.md)
     * [a9a Tutorial for DataFrame](spark/binaryclass/a9a_df.md)
+    * [a9a Tutorial for SQL](spark/binaryclass/a9a_sql.md)
 
 * [Regression](spark/binaryclass/index.md)
     * [E2006-tfidf regression Tutorial for DataFrame](spark/regression/e2006_df.md)
+    * [E2006-tfidf regression Tutorial for SQL](spark/regression/e2006_sql.md)
 
 * [Generic features](spark/misc/misc.md)
     * [Top-k Join processing](spark/misc/topk_join.md)
diff --git a/docs/gitbook/spark/binaryclass/a9a_sql.md b/docs/gitbook/spark/binaryclass/a9a_sql.md
new file mode 100644
index 000000000..7d43837d5
--- /dev/null
+++ b/docs/gitbook/spark/binaryclass/a9a_sql.md
@@ -0,0 +1,159 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+a9a
+===
+http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#a9a
+
+Data preparation
+================
+
+```sh
+$ wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a9a
+$ wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a9a.t
+```
+
+```scala
+scala> :paste
+park.read.format("libsvm").load("a9a")
+  .select($"label", to_hivemall_features($"features").as("features"))
+  .createOrReplaceTempView("rawTrainTable")
+
+// `label` must be [0.0, 1.0]
+sql("""
+  CREATE OR REPLACE TEMPORARY VIEW trainTable AS
+    SELECT rescale(label, -1.0, 1.0) AS label, features
+      FROM rawTrainTable
+""")
+
+scala> trainDf.printSchema
+root
+ |-- label: float (nullable = true)
+ |-- features: vector (nullable = true)
+
+scala> :paste
+spark.read.format("libsvm").load("a9a.t")
+  .select($"label", to_hivemall_features($"features").as("features"))
+  .createOrReplaceTempView("rawTestTable")
+
+sql("""
+  CREATE OR REPLACE TEMPORARY VIEW testTable AS
+    SELECT
+        rowid() AS rowid,
+        rescale(label, -1.0, 1.0) AS target,
+        features
+      FROM
+        rawTestTable
+""")
+
+// Caches data to fix row IDs
+sql("CACHE TABLE testTable")
+
+sql("""
+  CREATE OR REPLACE TEMPORARY VIEW testTable_exploded AS
+    SELECT
+        rowid,
+        target,
+        extract_feature(ft) AS feature,
+        extract_weight(ft) AS value
+      FROM (
+        SELECT
+            rowid,
+            target,
+            explode(features) AS ft
+          FROM
+            testTable
+        )
+""")
+
+scala> testDf.printSchema
+root
+ |-- rowid: string (nullable = true)
+ |-- target: float (nullable = true)
+ |-- feature: string (nullable = true)
+ |-- value: double (nullable = true)
+```
+
+Tutorials
+================
+
+[Logistic Regression]
+---
+
+#Training
+
+```scala
+scala> :paste
+sql("""
+  CREATE OR REPLACE TEMPORARY VIEW modelTable AS
+    SELECT
+        feature, AVG(weight) AS weight
+      FROM (
+        SELECT
+            train_logistic_regr(add_bias(features), label) AS (feature, weight)
+          FROM
+            trainTable
+          )
+      GROUP BY
+        feature
+""")
+```
+
+#Test
+
+```scala
+scala> :paste
+sql("""
+  CREATE OR REPLACE TEMPORARY VIEW predicted AS
+    SELECT
+        rowid,
+        CASE
+          WHEN sigmoid(sum(weight * value)) > 0.50 THEN 1.0
+          ELSE 0.0
+        END AS predicted
+      FROM
+        testTable_exploded t LEFT OUTER JOIN modelTable m
+          ON t.feature = m.feature
+      GROUP BY
+        rowid
+""")
+```
+
+#Evaluation
+
+```scala
+val num_test_instances = spark.table("testTable").count
+
+sql(s"""
+  SELECT
+      count(1) / $num_test_instances AS eval
+    FROM
+      predicted p INNER JOIN testTable t
+        ON p.rowid = t.rowid
+    WHERE
+      p.predicted = t.target
+""")
+
++------------------+
+|              eval|
++------------------+
+|0.8327921286841418|
++------------------+
+```
+
diff --git a/docs/gitbook/spark/regression/e2006_sql.md b/docs/gitbook/spark/regression/e2006_sql.md
new file mode 100644
index 000000000..ae871ebc1
--- /dev/null
+++ b/docs/gitbook/spark/regression/e2006_sql.md
@@ -0,0 +1,151 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+E2006
+===
+http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression.html#E2006-tfidf
+
+Data preparation
+================
+
+```sh
+$ wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.train.bz2
+$ wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.test.bz2
+```
+
+```scala
+scala> :paste
+spark.read.format("libsvm").load("E2006.train.bz2")
+  .select($"label", to_hivemall_features($"features").as("features"))
+  .createOrReplaceTempView("rawTrainTable")
+
+// `label` must be [0.0, 1.0]
+sql("""
+  CREATE OR REPLACE TEMPORARY VIEW trainTable AS
+    SELECT rescale(label, -7.899578, -0.51940954) AS label, features
+      FROM rawTrainTable
+""")
+
+scala> trainDf.printSchema
+root
+ |-- label: float (nullable = true)
+ |-- features: vector (nullable = true)
+
+scala> :paste
+spark.read.format("libsvm").load("E2006.test.bz2")
+  .select($"label", to_hivemall_features($"features").as("features"))
+  .createOrReplaceTempView("rawTestTable")
+
+sql("""
+  CREATE OR REPLACE TEMPORARY VIEW testTable AS
+    SELECT
+        rowid() AS rowid,
+        rescale(label, -7.899578, -0.51940954) AS target,
+        features
+      FROM
+        rawTestTable
+""")
+
+// Caches data to fix row IDs
+sql("CACHE TABLE testTable")
+
+sql("""
+  CREATE OR REPLACE TEMPORARY VIEW testTable_exploded AS
+    SELECT
+        rowid,
+        target,
+        extract_feature(ft) AS feature,
+        extract_weight(ft) AS value
+      FROM (
+        SELECT
+            rowid,
+            target,
+            explode(features) AS ft
+          FROM
+            testTable
+""")
+
+scala> df.printSchema
+root
+ |-- rowid: string (nullable = true)
+ |-- target: float (nullable = true)
+ |-- feature: string (nullable = true)
+ |-- value: double (nullable = true)
+```
+
+Tutorials
+================
+
+[AROWe2]
+---
+
+#Training
+
+```scala
+scala> :paste
+sql("""
+  CREATE OR REPLACE TEMPORARY VIEW modelTable AS
+    SELECT
+        feature, AVG(weight) AS weight
+      FROM (
+        SELECT
+            train_arowe2_regr(add_bias(features), label) AS (feature, weight)
+          FROM
+            trainTable
+        )
+      GROUP BY
+        feature
+""")
+```
+
+#Test
+
+```scala
+scala> :paste
+sql("""
+  CREATE OR REPLACE TEMPORARY VIEW predicted AS
+    SELECT
+        rowid, sum(weight * value) AS predicted
+      FROM
+        testTable_exploded t LEFT OUTER JOIN modelTable m
+          ON t.feature = m.feature
+      GROUP BY
+        rowid
+""")
+```
+
+#Evaluation
+
+```scala
+scala> :paste
+sql(s"""
+  SELECT
+      AVG(target), AVG(predicted)
+    FROM
+      predicted p INNER JOIN testTable t
+        ON p.rowid = t.rowid
+""")
+
++------------------+------------------+
+|       avg(target)|    avg(predicted)|
++------------------+------------------+
+|0.5489154884487879|0.6030108853227014|
++------------------+------------------+
+```
+

From eb3f3f794da4c3c1a319bc930f8bca8707a23d3e Mon Sep 17 00:00:00 2001
From: Takeshi Yamamuro <yamamuro@apache.org>
Date: Wed, 12 Jul 2017 08:58:47 +0900
Subject: [PATCH 2/2] Fix minor comments

---
 docs/gitbook/spark/binaryclass/a9a_sql.md  | 12 ++++++++----
 docs/gitbook/spark/regression/e2006_sql.md | 12 ++++++++----
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/docs/gitbook/spark/binaryclass/a9a_sql.md b/docs/gitbook/spark/binaryclass/a9a_sql.md
index 7d43837d5..06734d9d7 100644
--- a/docs/gitbook/spark/binaryclass/a9a_sql.md
+++ b/docs/gitbook/spark/binaryclass/a9a_sql.md
@@ -35,10 +35,14 @@ park.read.format("libsvm").load("a9a")
   .select($"label", to_hivemall_features($"features").as("features"))
   .createOrReplaceTempView("rawTrainTable")
 
+val (max, min) = sql("SELECT MAX(label), MIN(label) FROM rawTrainTable").collect.map {
+  case Row(max: Double, min: Double) => (max, min)
+}.head
+
 // `label` must be [0.0, 1.0]
-sql("""
+sql(s"""
   CREATE OR REPLACE TEMPORARY VIEW trainTable AS
-    SELECT rescale(label, -1.0, 1.0) AS label, features
+    SELECT rescale(label, $min, $max) AS label, features
       FROM rawTrainTable
 """)
 
@@ -52,11 +56,11 @@ spark.read.format("libsvm").load("a9a.t")
   .select($"label", to_hivemall_features($"features").as("features"))
   .createOrReplaceTempView("rawTestTable")
 
-sql("""
+sql(s"""
   CREATE OR REPLACE TEMPORARY VIEW testTable AS
     SELECT
         rowid() AS rowid,
-        rescale(label, -1.0, 1.0) AS target,
+        rescale(label, $min, $max) AS target,
         features
       FROM
         rawTestTable
diff --git a/docs/gitbook/spark/regression/e2006_sql.md b/docs/gitbook/spark/regression/e2006_sql.md
index ae871ebc1..48477d147 100644
--- a/docs/gitbook/spark/regression/e2006_sql.md
+++ b/docs/gitbook/spark/regression/e2006_sql.md
@@ -35,10 +35,14 @@ spark.read.format("libsvm").load("E2006.train.bz2")
   .select($"label", to_hivemall_features($"features").as("features"))
   .createOrReplaceTempView("rawTrainTable")
 
+val (max, min) = sql("SELECT MAX(label), MIN(label) FROM rawTrainTable").collect.map {
+  case Row(max: Double, min: Double) => (max, min)
+}.head
+
 // `label` must be [0.0, 1.0]
-sql("""
+sql(s"""
   CREATE OR REPLACE TEMPORARY VIEW trainTable AS
-    SELECT rescale(label, -7.899578, -0.51940954) AS label, features
+    SELECT rescale(label, $min, $max) AS label, features
       FROM rawTrainTable
 """)
 
@@ -52,11 +56,11 @@ spark.read.format("libsvm").load("E2006.test.bz2")
   .select($"label", to_hivemall_features($"features").as("features"))
   .createOrReplaceTempView("rawTestTable")
 
-sql("""
+sql(s"""
   CREATE OR REPLACE TEMPORARY VIEW testTable AS
     SELECT
         rowid() AS rowid,
-        rescale(label, -7.899578, -0.51940954) AS target,
+        rescale(label, $min, $max) AS target,
         features
       FROM
         rawTestTable