In [1]:
from pyspark.mllib.recommendation import Rating, ALS, MatrixFactorizationModel

In [4]:
## Load the raw ratings file
## `sc` is not defined in the visible cells -- presumably the SparkContext
## injected by the PySpark kernel; confirm before running on a fresh kernel.
data = sc.textFile("data/ratings.csv")
# Peek at the first three lines (the first one is the CSV header).
data.take(3)

['userId,movieId,rating,timestamp',
 '1,31,2.5,1260759144',
 '1,1029,3.0,1260759179']

In [5]:
## Drop the header row
# The first line of the file is the column header; keep every line that
# is not equal to it so that only rating records remain.
header = data.first()
cust_data = data.filter(lambda line: line != header)
cust_data.take(3)

['1,31,2.5,1260759144', '1,1029,3.0,1260759179', '1,1061,3.0,1260759182']

In [6]:
## Check the number of records left after removing the header
cust_data.count()

100004

In [7]:
## Keep only the first three columns of cust_data
# Each record is a comma-separated line; the fourth field (timestamp)
# is discarded, leaving [userId, movieId, rating] as strings.
split_data = cust_data.map(
    lambda line: line.split(",")[:3]
)
split_data.take(3)

[['1', '31', '2.5'], ['1', '1029', '3.0'], ['1', '1061', '3.0']]

In [8]:
## Build the Rating RDD
# Convert the string fields to the types Rating carries:
# integer user id, integer product (movie) id, float rating.
rating_data = split_data.map(
    lambda fields: Rating(
        user=int(fields[0]),
        product=int(fields[1]),
        rating=float(fields[2]),
    )
)
rating_data.take(3)

[Rating(user=1, product=31, rating=2.5),
 Rating(user=1, product=1029, rating=3.0),
 Rating(user=1, product=1061, rating=3.0)]

In [9]:
## Check the number of records in the Rating RDD
# Expected to equal the cust_data count, i.e. no rows were lost while parsing.
rating_data.count()

100004

In [10]:
## Count distinct members (users)
# Rating is a named tuple, so the user id is read by field name.
rating_data.map(lambda r: r.user).distinct().count()

671

In [11]:
## Count distinct movies
# Rating is a named tuple, so the movie id is read from its `product` field.
rating_data.map(lambda r: r.product).distinct().count()

9066

In [12]:
## Train an ALS model on explicit ratings
# rank       -- number of latent factors per user / movie
# iterations -- number of ALS iterations to run
# seed       -- fixed so that factor initialisation, and therefore the
#               recommendations produced from this model, are repeatable
#               after a kernel restart (ALS.train is otherwise randomly seeded).
ALS_RANK = 10
ALS_ITERATIONS = 5
ALS_SEED = 42
als_model = ALS.train(
    rating_data,
    rank = ALS_RANK,
    iterations = ALS_ITERATIONS,
    seed = ALS_SEED,
)

In [13]:
## Recommend movies for a member (user: member ID, num: number of movies to recommend)
als_model.recommendProducts(user = 100, num = 5)

[Rating(user=100, product=3676, rating=8.81179152427627),
 Rating(user=100, product=1192, rating=8.1519390367354),
 Rating(user=100, product=6954, rating=7.685306590823332),
 Rating(user=100, product=547, rating=7.631926791365951),
 Rating(user=100, product=8908, rating=7.415074037569325)]

In [15]:
## Look up the predicted rating for a movie recommended to the member
# Arguments are (user ID, movie ID).
als_model.predict(100, 3676)

8.81179152427627

In [16]:
## Recommend a specific movie to members (product: movie ID, num: number of members to return)
als_model.recommendUsers(product = 3676, num = 5)

[Rating(user=310, product=3676, rating=14.113581397300395),
 Rating(user=645, product=3676, rating=10.501804545778466),
 Rating(user=66, product=3676, rating=10.443026267359226),
 Rating(user=301, product=3676, rating=10.211777918160678),
 Rating(user=651, product=3676, rating=9.956818827260248)]