In [1]:
%load_ext autoreload
%env SPARK_HOME=/usr/hdp/current/spark2-client

import findspark
findspark.init()

print('findspark initialized ...')

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col, column, max, min

spark = SparkSession.builder.appName('mlonspark')\
    .config('spark.jars', '/opt/dev/target/ml-on-spark-1.0.jar')\
    .getOrCreate()

print('pyspark ready ...')

env: SPARK_HOME=/usr/hdp/current/spark2-client
findspark initialized ...
pyspark ready ...


In [13]:
%autoreload
from mlonspark.alternating_least_square import AlternatingLeastSquare

als = AlternatingLeastSquare()\
    .setUserCol("userId")\
    .setItemCol("itemId")\
    .setRatingCol("rating")

print(als)

AlternatingLeastSquare_402e91a7de769379c407


In [2]:
books = spark.read.format("csv")\
    .option("sep",";")\
    .option("header", "true")\
    .load("/data/books/BX-Books.csv")

books.printSchema()

from pyspark.sql.functions import monotonically_increasing_id 

identifiedBooks = books.withColumn('bookId', monotonically_increasing_id());
identifiedBooks = identifiedBooks.select(['bookId', 'ISBN'])
identifiedBooks.show(10)

root
 |-- ISBN: string (nullable = true)
 |-- Book-Title: string (nullable = true)
 |-- Book-Author: string (nullable = true)
 |-- Year-Of-Publication: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- Image-URL-S: string (nullable = true)
 |-- Image-URL-M: string (nullable = true)
 |-- Image-URL-L: string (nullable = true)

+------+----------+
|bookId|      ISBN|
+------+----------+
|     0|0195153448|
|     1|0002005018|
|     2|0060973129|
|     3|0374157065|
|     4|0393045218|
|     5|0399135782|
|     6|0425176428|
|     7|0671870432|
|     8|0679425608|
|     9|074322678X|
+------+----------+
only showing top 10 rows



In [11]:
ratings = spark.read.format("csv")\
    .option("sep",";")\
    .option("header", "true")\
    .load("/data/books/BX-Book-Ratings.csv")

print("Pocet hodnoceni = %i" % ratings.count())


joinExp = ratings['ISBN'] == identifiedBooks['ISBN']
ratings = ratings.join(identifiedBooks, joinExp, 'inner')

ratings = ratings\
    .withColumn('userId', ratings['User-ID'].cast('int'))\
    .withColumn('itemId', ratings['bookId'].cast('int'))\
    .withColumn('rating', ratings['Book-Rating'].cast('float'))

print("Pocet hodnoceni = %i" % ratings.count())




Pocet hodnoceni = 1149780
Pocet hodnoceni = 1031175


In [4]:
identifiedBooks.where("ISBN = '0155061224'").show()

+----------+----------+
|    bookId|      ISBN|
+----------+----------+
|8590016614|0155061224|
+----------+----------+



In [12]:
ratings.printSchema()
ratings.show(10)


root
 |-- User-ID: string (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- Book-Rating: string (nullable = true)
 |-- bookId: long (nullable = false)
 |-- ISBN: string (nullable = true)
 |-- userId: integer (nullable = true)
 |-- itemId: integer (nullable = false)
 |-- rating: float (nullable = true)

+-------+----------+-----------+----------+----------+------+------+------+
|User-ID|      ISBN|Book-Rating|    bookId|      ISBN|userId|itemId|rating|
+-------+----------+-----------+----------+----------+------+------+------+
| 276725|034545104X|          0|      2966|034545104X|276725|  2966|   0.0|
| 276726|0155061224|          5|8590016614|0155061224|276726| 82022|   5.0|
| 276727|0446520802|          0|     11054|0446520802|276727| 11054|   0.0|
| 276729|052165615X|          3|8590037639|052165615X|276729|103047|   3.0|
| 276729|0521795028|          6|8590037640|0521795028|276729|103048|   6.0|
| 276733|2080674722|          0|    123645|2080674722|276733|123645|   0.0|
| 2

In [None]:
print("Pocet propojenych hodnoceni = %i" % ratingsConverted.count())
print("Pocet hodnoceni s item id = %i" % ratingsConverted.where('bookId is not null').count())
print("Pocet hodnoceni s user id = %i" % ratingsConverted.where('userId is not null').count())

In [15]:
ratingsConverted.show(10)

+-------+----------+-----------+------+----------+------+
|User-ID|      ISBN|Book-Rating|userId|    itemId|rating|
+-------+----------+-----------+------+----------+------+
| 276725|034545104X|          0|276725|      null|   0.0|
| 276726|0155061224|          5|276726| 155061224|   5.0|
| 276727|0446520802|          0|276727| 446520802|   0.0|
| 276729|052165615X|          3|276729|      null|   3.0|
| 276729|0521795028|          6|276729| 521795028|   6.0|
| 276733|2080674722|          0|276733|2080674722|   0.0|
| 276736|3257224281|          8|276736|      null|   8.0|
| 276737|0600570967|          6|276737| 600570967|   6.0|
| 276744|038550120X|          7|276744|      null|   7.0|
| 276745| 342310538|         10|276745| 342310538|  10.0|
+-------+----------+-----------+------+----------+------+
only showing top 10 rows



AttributeError: 'NoneType' object has no attribute 'toPandas'

In [15]:
model = als.fit(ratings)
print(model)

AlternatingLeastSquare_402e91a7de769379c407


In [23]:
model.write().overwrite().save('/data/books/model')