In [1]:
# Load the IPython autoreload extension.
# NOTE(review): no `%autoreload <mode>` line follows in this cell -- confirm
# whether a reload mode was meant to be enabled.
%load_ext autoreload
# Set SPARK_HOME in this kernel's environment (the value is echoed in the cell output).
%env SPARK_HOME=/usr/hdp/current/spark2-client

# findspark.init() is called before the pyspark imports below; presumably it
# makes the SPARK_HOME python libraries importable -- keep this ordering.
import findspark
findspark.init()
print('findspark initialized ...')

import pyspark
from pyspark.sql import SparkSession
# NOTE: `max` and `min` imported here rebind the Python builtins of the same
# name in the notebook namespace for every cell that runs after this one.
from pyspark.sql.functions import expr, col, column, max, min

env: SPARK_HOME=/usr/hdp/current/spark2-client
findspark initialized ...


In [2]:
def initspark():
    """Build (or reuse) the SparkSession used by this notebook.

    The session is named 'mlonspark' and asks for 3 executor instances.
    Prints a short readiness message and returns the session.
    """
    builder = SparkSession.builder
    builder = builder.appName('mlonspark')
    builder = builder.config('spark.executor.instances', '3')
    session = builder.getOrCreate()
    print('pyspark ready ...')
    return session

In [3]:
from pyspark.ml.recommendation import ALS
def initalg(val):
    """Return an implicit-feedback ALS estimator whose alpha is ``val``.

    Columns are fixed to userId / artistId / stdCountPos, rank is 25 and
    the regularisation parameter is 0.0; only alpha varies per call.
    """
    estimator = (
        ALS()
        # input columns
        .setUserCol("userId")
        .setItemCol("artistId")
        .setRatingCol("stdCountPos")
        # fixed hyper-parameters
        .setImplicitPrefs(True)
        .setRank(25)
        .setRegParam(0.0)
        # the swept hyper-parameter
        .setAlpha(val)
    )
    return estimator


In [4]:
import matplotlib as mpl
import matplotlib.pyplot as plt

def plot(res):
    """Draw train and test RMSE curves for a set of sweep results.

    res: iterable of rows indexable as row[0] (x value), row[1] (train
    RMSE) and row[2] (test RMSE). A new 8x4 figure is opened on every call.
    """
    rows = list(res)
    xs = [row[0] for row in rows]
    train_curve = [row[1] for row in rows]
    test_curve = [row[2] for row in rows]

    plt.figure(figsize=(8,4))
    # (y values, line format, legend label) for each curve, in draw order
    series = (
        (train_curve, "g-", r"$TRAIN$"),
        (test_curve, "r:", r"TEST"),
    )
    for ys, fmt, legend_label in series:
        plt.plot(xs, ys, fmt, linewidth=2, label=legend_label)
    plt.legend(loc="upper left", fontsize=15)
    plt.ylabel("RMSE", fontsize=18)

In [5]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import isnan

def runtest(rng, **model_kargs):
    """Sweep ALS over the values in ``rng`` and collect train/test RMSE.

    For every value a fresh Spark session is started, the train and test
    parquet sets are loaded, an ALS model is built with ``initalg(value)``
    and fitted on the train set, and RMSE against ``stdCountPos`` is
    computed on both sets. Rows whose prediction is NaN are dropped before
    evaluation. The running results are re-plotted after every value.

    Args:
        rng: iterable of values handed one at a time to ``initalg``.
        **model_kargs: extra ALS parameters (e.g. ``maxIter=20``) applied on
            top of the ``initalg`` settings via ``ALS.setParams``. These
            were previously accepted but silently ignored.

    Returns:
        list of ``[value, trainRmse, testRmse]`` rows, one per value.
    """
    res = []
    for i in rng:
        spark = initspark()
        try:
            train = spark.read.load("/data/lastfm-dataset-360K/data-filtered-std-pos-train.parquet")
            test = spark.read.load("/data/lastfm-dataset-360K/data-filtered-std-pos-test.parquet")

            evaluator = RegressionEvaluator()\
                .setMetricName("rmse")\
                .setLabelCol("stdCountPos")\
                .setPredictionCol("prediction")

            alg = initalg(i)
            if model_kargs:
                # setParams only touches the keywords that are passed, so the
                # initalg defaults stay in place for everything else.
                alg.setParams(**model_kargs)
            model = alg.fit(train)

            trainPredictions = model.transform(train)
            trainPredictionsFiltered = trainPredictions.where(~isnan(col("prediction")))
            trainRmse = evaluator.evaluate(trainPredictionsFiltered)

            testPredictions = model.transform(test)
            testPredictionsFiltered = testPredictions.where(~isnan(col("prediction")))
            testRmse = evaluator.evaluate(testPredictionsFiltered)

            res.append([i, trainRmse, testRmse])
            plot(res)
        finally:
            # Always release the session, even when loading, fitting or
            # evaluation fails, so a failed run does not leave it running.
            spark.stop()
    return res

In [None]:
%matplotlib inline

import numpy as np
rng = np.arange(0.0, 10.0, 1.0).tolist()
res = runtest(rng)
print(res)

In [None]:
# Split the sweep results into one list per column:
# swept value, train RMSE, test RMSE.
x = [v[0] for v in res]
yTrain = [v[1] for v in res]
yTest = [v[2] for v in res]

print(yTest)

In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

plt.figure(figsize=(8,4))
plt.plot(x, yTrain, "g-", linewidth=2, label=r"$TRAIN$")
plt.plot(x, yTest, "r:", linewidth=2, label=r"TEST")
plt.legend(loc="upper left", fontsize=15)
plt.ylabel("RMSE", fontsize=18)