In [1]:
%load_ext autoreload
%env SPARK_HOME=/usr/hdp/current/spark2-client

import findspark
findspark.init()

print('findspark initialized ...')

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col, column, max, min

spark = SparkSession.builder.appName('mlonspark')\
    .config('spark.jars', '/opt/dev/target/ml-on-spark-1.0.jar')\
    .getOrCreate()

print('pyspark ready ...')

env: SPARK_HOME=/usr/hdp/current/spark2-client
findspark initialized ...
pyspark ready ...


In [2]:
# Read the training ratings from parquet and show the column layout.
train = spark.read.parquet("/data/books/ratings-train.parquet")
train.printSchema()

root
 |-- User-ID: string (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- Book-Rating: string (nullable = true)
 |-- rating: float (nullable = true)
 |-- bookId: integer (nullable = true)
 |-- Location: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- userId: integer (nullable = true)



In [3]:
%autoreload
from mlonspark.alternating_least_square import AlternatingLeastSquare

als = AlternatingLeastSquare()\
    .setUserCol("userId")\
    .setItemCol("bookId")\
    .setRatingCol("rating")

print(als)

AlternatingLeastSquare_42ccb8345fc760bac93d


In [4]:
# Fit the configured estimator on the training ratings and print the result
# returned by fit().
model = als.fit(train)
print(model)

AlternatingLeastSquare_42ccb8345fc760bac93d


In [15]:
# Fit on `train`, the only ratings DataFrame this notebook loads before this
# point. The previous argument, `ratings`, is not defined by any earlier cell,
# so on a fresh kernel (Restart & Run All) this cell raised NameError.
# NOTE(review): this now repeats the fit performed in the cell above on the
# same data; delete this cell if a second fit is not actually needed.
model = als.fit(train)
print(model)

AlternatingLeastSquare_402e91a7de769379c407


In [8]:
# Persist the fitted model to disk; overwrite() is requested on the writer
# before saving to the target path.
(
    model.write()
    .overwrite()
    .save('/data/books/model.alsmodel')
)

In [10]:
%autoreload
from mlonspark.alternating_least_square import AlternatingLeastSquareModel

model = AlternatingLeastSquareModel.read().load('/data/books/model.alsmodel')

In [11]:
# Print the reloaded model's string form to inspect what was read from disk.
print(model)

AlternatingLeastSquare_42ccb8345fc760bac93d


In [12]:
# Read the held-out ratings from parquet and show the column layout.
test = spark.read.parquet("/data/books/ratings-test.parquet")
test.printSchema()

root
 |-- User-ID: string (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- Book-Rating: string (nullable = true)
 |-- rating: float (nullable = true)
 |-- bookId: integer (nullable = true)
 |-- Location: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- userId: integer (nullable = true)



In [14]:
# Run the reloaded model over the test ratings, then inspect the result's
# schema and its first 10 rows.
# NOTE(review): the schema recorded below is identical to the test input
# schema, i.e. no prediction column is visible in the output; confirm whether
# the model's transform() is expected to append one.
predictions = model.transform(test)
predictions.printSchema()
predictions.show(10)

root
 |-- User-ID: string (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- Book-Rating: string (nullable = true)
 |-- rating: float (nullable = true)
 |-- bookId: integer (nullable = true)
 |-- Location: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- userId: integer (nullable = true)

+-------+----------+-----------+------+------+--------------------+----+------+
|User-ID|      ISBN|Book-Rating|rating|bookId|            Location| Age|userId|
+-------+----------+-----------+------+------+--------------------+----+------+
| 100001|0425182673|          0|   0.0| 36672|grimstad, aust-ag...|NULL|100001|
| 100004|0345339711|          0|   0.0|   780|san ysidro, calif...|   0|100004|
| 100004|0425083837|         10|  10.0|  7061|san ysidro, calif...|   0|100004|
| 100004|0439064872|         10|  10.0|  3459|san ysidro, calif...|   0|100004|
| 100004|059035342X|         10|  10.0|  2143|san ysidro, calif...|   0|100004|
| 100009|0060392452|          8|   8.0|  2802|