<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-the-data" data-toc-modified-id="Load-the-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load the data</a></span></li><li><span><a href="#Train-test-split" data-toc-modified-id="Train-test-split-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Train test split</a></span></li></ul></div>

In [1]:
import numpy as np
import pandas as pd
import pyspark
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf # @udf("integer") def myfunc(x,y): return x - y
from pyspark.sql import functions as F # stddev format_number date_format, dayofyear, when
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

print([(x.__name__,x.__version__) for x in [np, pd, pyspark]])

spark = pyspark.sql.SparkSession.builder.appName('bhishan').getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc) # spark_df = sqlContext.createDataFrame(pandas_df)
sc.setLogLevel("INFO")

[('numpy', '1.17.1'), ('pandas', '0.25.1'), ('pyspark', '2.4.4')]


In [2]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# Load the data

In [3]:
df = spark.read.csv('../data/movielens_ratings.csv',header=True,inferSchema=True)
df.cache()

print(df.count())
print(len(df.columns))
df.printSchema()

pd.DataFrame(df.take(5), columns=df.columns)

1501
3
root
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- userId: integer (nullable = true)



Unnamed: 0,movieId,rating,userId
0,2,3.0,0
1,3,1.0,0
2,5,2.0,0
3,9,4.0,0
4,11,1.0,0


In [4]:
df.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



# Train test split

In [5]:
train, test = df.randomSplit([0.8,0.2],seed=100)

In [6]:
als = ALS(regParam=0.01, userCol='userId',itemCol='movieId',
         ratingCol='rating')

In [7]:
model = als.fit(train)

In [8]:
preds = model.transform(test)

In [9]:
preds.show()

+-------+------+------+-----------+
|movieId|rating|userId| prediction|
+-------+------+------+-----------+
|     31|   1.0|    26| 0.41790342|
|     31|   1.0|    18|  2.1023593|
|     85|   1.0|    15|  2.2282162|
|     85|   3.0|    21|  2.6699085|
|     65|   2.0|     5|  3.7254877|
|     53|   1.0|    12|  -1.853287|
|     53|   1.0|    23|  0.7427913|
|     53|   1.0|    25|  0.5582905|
|     78|   1.0|    28|-0.44315022|
|     78|   1.0|    27|  0.9579795|
|     78|   1.0|    22| 0.71688056|
|     78|   1.0|    13|-0.27455226|
|     78|   1.0|    20|  0.6869983|
|     78|   1.0|    17|  1.0022442|
|     78|   1.0|     4|-0.45394632|
|     78|   1.0|    11|  0.4635147|
|     81|   1.0|    22|  -3.016086|
|     81|   1.0|    16| 0.70613337|
|     81|   1.0|    15| 0.17786703|
|     28|   1.0|    27|   1.675967|
+-------+------+------+-----------+
only showing top 20 rows



In [10]:
evaluator = RegressionEvaluator(metricName='rmse',labelCol='rating',
                               predictionCol='prediction')

In [12]:
rmse = evaluator.evaluate(preds)
rmse

1.9567692314143337

In [13]:
single_user = test.filter(test['userId']==11).select(['movieId','userId'])
single_user.show()

+-------+------+
|movieId|userId|
+-------+------+
|      0|    11|
|     12|    11|
|     16|    11|
|     45|    11|
|     47|    11|
|     50|    11|
|     76|    11|
|     78|    11|
|     88|    11|
+-------+------+



In [14]:
rec = model.transform(single_user)
rec.orderBy('prediction',ascending=False).show()

+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|     76|    11|  5.242566|
|     12|    11| 4.5766935|
|     88|    11| 1.0118527|
|     16|    11| 1.0098516|
|     50|    11| 0.8270397|
|      0|    11| 0.7781983|
|     78|    11| 0.4635147|
|     47|    11|0.19547299|
|     45|    11|0.17380475|
+-------+------+----------+

