In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# https://pythonhosted.org/pywebhdfs/

In [2]:
import pyspark
from pyspark.ml.feature import Word2Vec
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.ml import Pipeline, PipelineModel
import os
 


In [3]:
os.environ['PYSPARK_PYTHON'] = '/home/hduser/anaconda3/bin/python'
# The value is the python command of the version required to start the master and worker in the Linux system
os.environ['PYSPARK_DRIVER_PYTHON'] = r"C:\Users\billy\anaconda3\python.exe"
# The value is the spark directory in the local windows system
os.environ['SPARK_HOME'] = 'C:/spark'
# The value is the local IP, and the IP required to establish a connection, to prevent connection failure when multiple network cards
#os.environ['SPARK_LOCAL_IP'] = '192.168.56.1'
os.environ['HADOOP_HOME'] = "D:/hadoop-3.3.0"
os.environ['HADOOP_CONF_DIR'] = "/usr/local/hadoop/etc/hadoop"

In [4]:
conf  = pyspark.SparkConf().setAppName('sql').setMaster('spark://192.168.133.4:7077').set(
    "spark.submit.deployMode","client").set('spark.driver.memory','2g').set(
        'spark.executor.memory', '2g').set('spark.executor.cores', 1).set(
        'spark.network.timeout', 600).set('spark.executor.heartbeatInterval', 120).set(
    'spark.cores.max', 4).set("spark.driver.host","192.168.133.1").set("spark.driver.port","9999")#.set('spark.python.profile','true')
sc = pyspark.SparkContext(conf=conf)

In [5]:
path = "hdfs://192.168.133.4:9000/user/hduser/"

In [6]:
RawUserRDD = sc.textFile(path+"data/u.data")
print(RawUserRDD.count()

In [8]:
RawUserRDD.first()

'196\t242\t3\t881250949'

In [10]:
from pyspark.mllib.recommendation import Rating

In [59]:
rawRatings = RawUserRDD.map(lambda line:line.split("\t")[:3])
rawRatings.take(5)

[['196', '242', '3'],
 ['186', '302', '3'],
 ['22', '377', '1'],
 ['244', '51', '2'],
 ['166', '346', '1']]

In [60]:
# 準備 ALS 需要的資料格式
ratingsRDD = rawRatings.map(lambda x:(x[0],x[1],x[2]))
ratingsRDD.take(5)

[('196', '242', '3'),
 ('186', '302', '3'),
 ('22', '377', '1'),
 ('244', '51', '2'),
 ('166', '346', '1')]

In [61]:
# Check number of Users & Movies
numUsers = ratingsRDD.map(lambda x:x[0]).distinct().count()
print('numUsers: {}'.format(numUsers))
numMovies = ratingsRDD.map(lambda x:x[1]).distinct().count()
print('numMovies: {}'.format(numMovies))

numUsers: 943
numMovies: 1682


In [62]:
from pyspark.mllib.recommendation import ALS

In [108]:
model = ALS.train(ratingsRDD,100,10,0.01)

In [112]:
# recommend for movies
model.recommendProducts(100,5)
# recommend for users
model.recommendUsers(product=200,num=5)
#predict rating of user:100 with movies 1141 
model.predict(100,1141)

[Rating(user=100, product=50, rating=5.264741865346331),
 Rating(user=100, product=205, rating=5.255808183772364),
 Rating(user=100, product=12, rating=5.248360601754374),
 Rating(user=100, product=408, rating=5.245821371231437),
 Rating(user=100, product=173, rating=5.214595123176999)]

[Rating(user=550, product=200, rating=5.824029028146284),
 Rating(user=592, product=200, rating=5.613487663848079),
 Rating(user=686, product=200, rating=5.607321231537176),
 Rating(user=119, product=200, rating=5.459150811076884),
 Rating(user=939, product=200, rating=5.406562954822373)]

1.723600984798437

In [113]:
# read movies data
itemRDD = sc.textFile(path+"data/u.item")
itemRDD.count()
itemRDD.take(5)
# split and take the information of the movies
movieTitle = itemRDD.map(lambda line:line.split("|")).map(lambda x:(int(x[0]),x[1])).collectAsMap()
movieTitle[5]

1682

['1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0',
 '2|GoldenEye (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?GoldenEye%20(1995)|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0',
 '3|Four Rooms (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0',
 '4|Get Shorty (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995)|0|1|0|0|0|1|0|0|1|0|0|0|0|0|0|0|0|0|0',
 '5|Copycat (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Copycat%20(1995)|0|0|0|0|0|0|1|0|1|0|0|0|0|0|0|0|1|0|0']

'Copycat (1995)'

In [117]:
def MovieRecommendation(model,movieTitle,inputUserID):
    recommendMovie = model.recommendProducts(inputUserID,5)
    for product in recommendMovie:
        print('User:{}, Recommend Product:{}, Recommend Rating:{}'.format(product[0], movieTitle[product[1]],product[2]))  
    return recommendMovie

recommendMovie = MovieRecommendation(model,movieTitle,100)

User:100, Recommend Product:Star Wars (1977), Recommend Rating:5.264741865346331
User:100, Recommend Product:Patton (1970), Recommend Rating:5.255808183772364
User:100, Recommend Product:Usual Suspects, The (1995), Recommend Rating:5.248360601754374
User:100, Recommend Product:Close Shave, A (1995), Recommend Rating:5.245821371231437
User:100, Recommend Product:Princess Bride, The (1987), Recommend Rating:5.214595123176999


In [118]:
def UserRecommendation(model,movieTitle,inputMovie):
    recommendUser = model.recommendUsers(product=inputMovie,num=5)
    
    for User in recommendUser:
        print('Recommend Product:{} to User:{},  Recommend Rating:{}'.format(movieTitle[inputMovie],User[0],User[2]))  
    return recommendUser

recommendUser= UserRecommendation(model,movieTitle,5)

Recommend Product:Copycat (1995) to User:332,  Recommend Rating:4.990003695073672
Recommend Product:Copycat (1995) to User:256,  Recommend Rating:4.979165538453725
Recommend Product:Copycat (1995) to User:270,  Recommend Rating:4.967893439611046
Recommend Product:Copycat (1995) to User:546,  Recommend Rating:4.921540234611819
Recommend Product:Copycat (1995) to User:907,  Recommend Rating:4.893374981388492


In [119]:
# save models
def SaveModel(sc):
    try:
        model.save(sc,path+"ALSmodel")
        print("save Model as ALSmodel")
    except Exception:
        print("Model is already exists, please delete first then save model")

In [79]:
from pyspark.mllib.recommendation import MatrixFactorizationModel

In [121]:
# load models we saved
model_test = MatrixFactorizationModel.load(sc,path+"ALSmodel")
model_test.recommendProducts(100,5)

[Rating(user=100, product=1120, rating=5.7459085417701665),
 Rating(user=100, product=1169, rating=5.432230823340303),
 Rating(user=100, product=863, rating=5.42895510992681),
 Rating(user=100, product=958, rating=5.217840353650354),
 Rating(user=100, product=1286, rating=5.206188785027526)]