In [25]:
import findspark
findspark.init()

import os
PROJECT_HOME = os.path.abspath(os.curdir)
print(PROJECT_HOME)

import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType 

/home/noobcoder/0_Project/school/BigData/DemoNov29


In [53]:
pd.set_option('display.max_colwidth', 500)

In [3]:
SPARK_MASTER_HOST = os.environ.get('SPARK_MASTER_HOST', 'localhost')
sc = SparkSession.builder\
    .master(f'spark://{SPARK_MASTER_HOST}:7077')\
    .appName('Quiz04_5')\
    .config('spark.executor.memory', '512m')\
    .getOrCreate()

# TASK 05: Recommender systems with ALS

## LOAD DATA

In [26]:
data = sc.sparkContext.textFile(f'file://{PROJECT_HOME}/data/sample_movielens_data.txt')\
    .map(lambda line: str(line).strip().split('::'))\
    .toDF(['user_id', 'movie_id', 'rating'])\
    .withColumn('user_id', F.col('user_id').cast(IntegerType()))\
    .withColumn('movie_id', F.col('movie_id').cast(IntegerType()))\
    .withColumn('rating', F.col('rating').cast(IntegerType()))

data.show(n=20)

+-------+--------+------+
|user_id|movie_id|rating|
+-------+--------+------+
|      0|       2|     3|
|      0|       3|     1|
|      0|       5|     2|
|      0|       9|     4|
|      0|      11|     1|
+-------+--------+------+
only showing top 5 rows



## Create training & Test data

In [56]:
(train_data, test_data) = data.randomSplit([0.8, 0.2])

## Create ALS Model

In [21]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [62]:
als = ALS(maxIter=5, regParam=0.01, userCol='user_id', ratingCol='rating', itemCol='movie_id')

als.setColdStartStrategy('drop')
model = als.fit(train_data)

## Generate prediction using RegressionEvaluator with `r2` metric

In [63]:
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='movie_id', metricName='r2')
predictions = model.transform(test_data)

r2 = evaluator.evaluate(predictions)
predictions.show(n=10)

+-------+--------+------+----------+
|user_id|movie_id|rating|prediction|
+-------+--------+------+----------+
|     27|      31|     1| 1.8055043|
|      8|      31|     3| 2.2396927|
|     25|      31|     2|0.44899365|
|     14|      31|     3| 2.0162518|
|     18|      31|     1| 0.2124876|
|     12|      85|     1| 0.9194294|
|      1|      85|     3| 1.0778955|
|     13|      85|     1| 1.2044089|
|     16|      85|     5| 2.8538754|
|     20|      85|     2| 1.9367349|
+-------+--------+------+----------+
only showing top 10 rows



## RECOMMEND TOP `5` MOVIES FOR EACH USER

In [64]:
user_recs = model.recommendForAllUsers(5)
user_recs.toPandas().head(10)

Unnamed: 0,user_id,recommendations
0,28,"[(46, 7.011728763580322), (22, 6.188665390014648), (8, 5.105101108551025), (12, 4.890925407409668), (34, 4.101345062255859)]"
1,26,"[(38, 5.215191841125488), (30, 5.102921009063721), (22, 4.955678939819336), (7, 4.826887130737305), (51, 4.33873176574707)]"
2,27,"[(38, 4.2677998542785645), (47, 3.5196070671081543), (18, 3.368234634399414), (19, 3.063934803009033), (27, 3.0505995750427246)]"
3,12,"[(17, 4.942808628082275), (27, 4.918002128601074), (64, 4.886162281036377), (88, 4.772541046142578), (40, 4.508386611938477)]"
4,22,"[(74, 4.979816436767578), (30, 4.97975492477417), (88, 4.883581161499023), (51, 4.751498699188232), (62, 4.034212589263916)]"
5,1,"[(30, 5.546263694763184), (10, 4.254742622375488), (65, 4.240033149719238), (38, 4.198899269104004), (7, 4.0509209632873535)]"
6,13,"[(41, 3.6926305294036865), (83, 3.199340343475342), (92, 2.9805986881256104), (47, 2.868960380554199), (72, 2.865938901901245)]"
7,6,"[(51, 6.390158176422119), (74, 6.313582897186279), (92, 5.353476047515869), (98, 5.062294006347656), (41, 4.957209587097168)]"
8,16,"[(54, 4.919881820678711), (90, 4.893517017364502), (51, 4.822025775909424), (1, 4.655332088470459), (22, 4.108940601348877)]"
9,3,"[(46, 4.935356140136719), (80, 4.011600971221924), (88, 3.82204532623291), (77, 3.3174386024475098), (30, 3.139488935470581)]"


## RECOMMEND TOP `5` USERS OF EACH MOVIE

In [66]:
movie_recs = model.recommendForAllItems(5)
movie_recs.toPandas().head(10)

Unnamed: 0,movie_id,recommendations
0,31,"[(12, 3.917371988296509), (2, 2.683123826980591), (7, 2.4745254516601562), (8, 2.2396926879882812), (14, 2.016251802444458)]"
1,85,"[(6, 2.995903730392456), (16, 2.8538753986358643), (24, 2.3338565826416016), (25, 2.108302116394043), (20, 1.936734914779663)]"
2,65,"[(23, 4.959682941436768), (1, 4.240033149719238), (11, 3.3888955116271973), (18, 3.2355968952178955), (10, 2.60775089263916)]"
3,53,"[(8, 4.8919148445129395), (0, 4.375528335571289), (16, 3.9404916763305664), (7, 3.261551856994629), (4, 3.2231810092926025)]"
4,78,"[(12, 1.1849931478500366), (1, 1.0519057512283325), (8, 1.0452980995178223), (2, 1.04275643825531), (22, 1.0292414426803589)]"
5,34,"[(28, 4.101345062255859), (9, 3.2221744060516357), (3, 2.764707088470459), (21, 2.7040958404541016), (1, 2.5107083320617676)]"
6,81,"[(23, 4.132896900177002), (11, 4.023634433746338), (26, 3.6265156269073486), (1, 2.9930005073547363), (18, 2.883845090866089)]"
7,28,"[(28, 2.320584774017334), (26, 2.3127448558807373), (15, 1.971734642982483), (20, 1.8434174060821533), (24, 1.834133267402649)]"
8,76,"[(0, 3.487499952316284), (18, 3.0076258182525635), (3, 3.0007224082946777), (24, 2.790271520614624), (21, 2.606706142425537)]"
9,26,"[(0, 3.074420928955078), (15, 2.9228017330169678), (8, 2.7477529048919678), (10, 2.74411678314209), (26, 2.599515914916992)]"
