In [4]:
%load_ext autoreload
%env SPARK_HOME=/usr/hdp/current/spark2-client

import findspark
findspark.init()

print('findspark initialized ...')

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col, column, max, min

spark = SparkSession.builder.appName('mlonspark')\
    .config('spark.executor.instances', '7')\
    .config('spark.jars', '/opt/dev/target/ml-on-spark-1.0.jar')\
    .getOrCreate()

print('pyspark ready ...')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
env: SPARK_HOME=/usr/hdp/current/spark2-client
findspark initialized ...
pyspark ready ...


In [5]:
train = spark.read.load("/data/lastfm-dataset-360K/coo-data-train.parquet")
train.printSchema()
print(train.rdd.getNumPartitions())

root
 |-- userId: integer (nullable = true)
 |-- artistId: integer (nullable = true)
 |-- listenCount: float (nullable = true)
 |-- scaled-by-user: float (nullable = true)
 |-- scaled-by-artist: float (nullable = true)

8


In [6]:
%autoreload
from mlonspark.alternating_least_square import AlternatingLeastSquare
from pyspark.ml.recommendation import ALS

als = ALS()\
    .setUserCol("userId")\
    .setItemCol("artistId")\
    .setRatingCol("scaled-by-artist")\
    .setAlpha(1.0)\
    .setNumUserBlocks(7)\
    .setNumItemBlocks(7)\
    .setMaxIter(10)\

In [7]:
model = als.fit(train)
print(model)

ALS_48c890458f46ab43dd68


In [8]:
test = spark.read.load("/data/lastfm-dataset-360K/coo-data-test.parquet")
print(test.rdd.getNumPartitions())

7


In [9]:
predictions = model.transform(test)
predictions.printSchema()
predictions.show(10)

root
 |-- userId: integer (nullable = true)
 |-- artistId: integer (nullable = true)
 |-- listenCount: float (nullable = true)
 |-- scaled-by-user: float (nullable = true)
 |-- scaled-by-artist: float (nullable = true)
 |-- prediction: float (nullable = false)

+------+--------+-----------+--------------+----------------+----------+
|userId|artistId|listenCount|scaled-by-user|scaled-by-artist|prediction|
+------+--------+-----------+--------------+----------------+----------+
| 38304|     148|       23.0|     1.3435115|       1.1165468| 1.4010674|
|153214|     148|      166.0|      1.438134|       2.9683454| 3.9653459|
| 54065|     148|      229.0|     3.4793928|       3.7841725| 1.9848043|
|144880|     148|      145.0|     1.1409639|        2.696403|  2.299455|
|263510|     148|       59.0|     1.1044855|       1.5827338| 1.6094072|
| 32705|     148|       77.0|     2.7674417|       1.8158274| 1.4124448|
|177946|     148|       25.0|     1.2352941|        1.142446| 1.4811388|
| 72977|

In [11]:
from pyspark.sql.functions import isnan
nan = predictions.where(isnan(col("listenCount")))  
nan.count()

0

In [12]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import isnan

evaluator = RegressionEvaluator()\
    .setMetricName("rmse")\
    .setLabelCol("listenCount")\
    .setPredictionCol("prediction")

predictionsFiltered = predictions.where(~isnan(col("prediction")))                                
rmse = evaluator.evaluate(predictionsFiltered)

print("RMSE = %f" % rmse)

RMSE = 641.327225


In [13]:
#histogram = train.select(train["listenCount"]).rdd.map(lambda x : x[0]).histogram(100)

In [14]:
#%matplotlib inline
#
#import numpy as np
#import matplotlib.pyplot as plt
#
#def create_hist(rdd_histogram_data):
#    """Given an RDD.histogram, plot a pyplot histogram"""
#    heights = np.array(rdd_histogram_data[1])
#    full_bins = rdd_histogram_data[0]
#    mid_point_bins = full_bins[:-1]
#    widths = [abs(i - j) for i, j in zip(full_bins[:-1], full_bins[1:])]
#    bar = plt.bar(mid_point_bins, heights, width=widths, color='b')
#    return bar
#
#create_hist(histogram)

In [15]:
from pyspark.ml import Pipeline
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator()\
    .setMetricName("rmse")\
    .setLabelCol("scaled-by-artist")\
    .setPredictionCol("prediction")

pipeline = Pipeline().setStages([als])
params = ParamGridBuilder()\
    .addGrid(als.maxIter, range(2, 20, 2))\
    .build()    

cv = TrainValidationSplit()\
    .setTrainRatio(0.75)\
    .setEstimator(als)\
    .setEstimatorParamMaps(params)\
    .setEvaluator(evaluator)

model = cv.fit(train)

In [16]:
model.validationMetrics

[nan, nan, nan, nan, nan, nan, nan, nan, nan]

In [17]:
from pyspark.ml import Pipeline
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator


pipeline = Pipeline().setStages([als])
params = ParamGridBuilder()\
    .addGrid(als.rank, range(2, 20, 2))\
    .build()    

cv = TrainValidationSplit()\
    .setTrainRatio(0.75)\
    .setEstimator(als)\
    .setEstimatorParamMaps(params)\
    .setEvaluator(evaluator)

rankModel = cv.fit(train)

In [18]:
rankModel.validationMetrics

[nan, nan, nan, nan, nan, nan, nan, nan, nan]

In [19]:
%autoreload
from mlonspark.alternating_least_square import AlternatingLeastSquare
from pyspark.ml.recommendation import ALS

#    
als = ALS()\
    .setUserCol("userId")\
    .setItemCol("artistId")\
    .setRatingCol("scaled-by-artist")\
    .setNumUserBlocks(7)\
    .setNumItemBlocks(7)\
    .setMaxIter(10)\
    .setRank(15)\
    .setImplicitPrefs(False)\

model = als.fit(train)
print(model)



ALS_4fa9a871c38887e12f3b


In [20]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import isnan

predictions = model.transform(test)

evaluator = RegressionEvaluator()\
    .setMetricName("rmse")\
    .setLabelCol("scaled-by-artist")\
    .setPredictionCol("prediction")

predictionsFiltered = predictions.where(~isnan(col("prediction")))                                
rmse = evaluator.evaluate(predictionsFiltered)

print("RMSE = %f" % rmse)

RMSE = 0.942539
