In [1]:
# Point the kernel at the Spark 2 client install. This is set before
# findspark.init() is called — presumably so findspark can locate Spark.
%env SPARK_HOME=/usr/hdp/current/spark2-client

import findspark
# Called before `import pyspark`; presumably this is what makes the pyspark
# package importable in this kernel — TODO confirm.
findspark.init()

import pyspark
from pyspark.sql import SparkSession
# NOTE(review): `max` and `min` imported here replace the Python builtins of
# the same name for every later cell in this notebook. Consider
# `from pyspark.sql import functions as F` instead.
from pyspark.sql.functions import expr, col, column, max, min

# Obtain the session shared by all following cells (an existing session is
# reused if one is already running, otherwise a new one is created).
spark = SparkSession.builder.getOrCreate()
print(spark)

env: SPARK_HOME=/usr/hdp/current/spark2-client
<pyspark.sql.session.SparkSession object at 0x7f0330651940>


In [2]:
# Read the semicolon-separated users file with its header row; no schema is
# supplied, so every column arrives as a string.
users_raw = (
    spark.read.format("csv")
    .options(sep=";", header="true")
    .load("/data/books/BX-Users.csv")
)

users_raw.printSchema()

# Keep only the id and age, cast to integers and renamed to camelCase.
# Values that cannot be parsed as an integer come out as null.
users = users_raw.select(
    *[
        users_raw[source].cast("integer").alias(target)
        for source, target in (("User-ID", "userId"), ("Age", "age"))
    ]
)
users.printSchema()
users.show(5)

root
 |-- User-ID: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Age: string (nullable = true)

root
 |-- userId: integer (nullable = true)
 |-- age: integer (nullable = true)

+------+----+
|userId| age|
+------+----+
|     1|null|
|     2|  18|
|     3|null|
|     4|  17|
|     5|null|
+------+----+
only showing top 5 rows



In [8]:
from pyspark.ml.feature import StringIndexer

# Read the semicolon-separated ratings file with its header row; no schema is
# supplied, so every column arrives as a string.
ratings_raw = (
    spark.read.format("csv")
    .option("sep", ";")
    .option("header", "true")
    .load("/data/books/BX-Book-Ratings.csv")
)

ratings_raw.printSchema()
print(ratings_raw.count())

# ISBNs are identifiers, not numbers. Casting them to integer yields null for
# any value containing a non-digit character (e.g. an "X" check digit) or
# exceeding the 32-bit range, and a null filter would then silently discard
# those ratings. Keep the ISBN as a string here and index it below instead.
# Rows whose user id or rating cannot be parsed are dropped up front, which
# also guarantees the indexer never sees a null ISBN.
ratings_typed = ratings_raw.select(
    ratings_raw['User-ID'].cast('integer').alias('userId'),
    ratings_raw['ISBN'].alias('isbn'),
    ratings_raw['Book-Rating'].cast('integer').alias('rating'),
).dropna(subset=['userId', 'isbn', 'rating'])

# Map each distinct ISBN string to a dense numeric index. The fitted model is
# kept in a variable so item ids can be translated back to ISBNs later
# (isbn_indexer.labels[itemId]).
isbn_indexer = StringIndexer(inputCol='isbn', outputCol='itemIndex').fit(ratings_typed)

# StringIndexer emits doubles; cast to integer so the resulting frame keeps
# the (userId: int, itemId: int, rating: int) schema.
ratings_indexed = isbn_indexer.transform(ratings_typed)
ratings = ratings_indexed.select(
    'userId',
    ratings_indexed['itemIndex'].cast('integer').alias('itemId'),
    'rating',
)
ratings.printSchema()

# Null filtering stays in the DataFrame API: no Python round trip through the
# RDD, and the integer column types are preserved instead of being re-inferred.
# NOTE(review): in the Book-Crossing data a rating of 0 reportedly marks
# implicit feedback — confirm whether those rows belong in an explicit model.
ratings_filtered = ratings.dropna(subset=['userId', 'itemId', 'rating'])


root
 |-- User-ID: string (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- Book-Rating: string (nullable = true)

1149780
root
 |-- userId: integer (nullable = true)
 |-- itemId: integer (nullable = true)
 |-- rating: integer (nullable = true)

DataFrame[userId: int, itemId: int, rating: int]
DataFrame[userId: bigint, itemId: bigint, rating: bigint]


In [9]:
from pyspark.ml.recommendation import ALS

# Configure the alternating-least-squares recommender on the
# (userId, itemId, rating) columns of the cleaned ratings frame.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="userId",
    itemCol="itemId",
    ratingCol="rating",
    # ALS starts from randomly initialised factors; pin the seed so that a
    # Restart & Run All reproduces the same model.
    seed=42,
)

# Train the factorisation model on the null-free ratings.
alsModel = als.fit(ratings_filtered)