In [None]:
import os
import sys
import json
import pyspark
from pyspark.sql import SparkSession

# Initialise the SparkContext. getOrCreate() reuses a context that is already running,
# so re-executing this cell does not fail with
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))




In [None]:
#Demonstrate how to load a dataset suitable for recommendation systems into a PySpark DataFrame.

# Keys that every review record must contain in order to be kept.
fields2 = ['product_id', 'user_id', 'review', 'profile_name', 'helpfulness', 'score', 'time']


def validate(line):
    """Return True only if `line` contains every key listed in `fields2`.

    `line` is any container that supports the `in` operator; in this notebook
    it is the dict parsed from one JSON record.
    """
    # all() stops at the first missing field and only answers True once every
    # field has been checked (the previous version returned after the first one).
    return all(field in line for field in fields2)

# Load the file as an RDD holding one string per line.
reviews_raw = sc.textFile('../Desktop/BDA/movies_1.json')

# Every line is parsed with json.loads(); only the parsed records for which
# validate() returns True are kept.
parsed_reviews = reviews_raw.map(json.loads)
reviews = parsed_reviews.filter(validate)

# Mark the RDD for caching; cache() returns the RDD itself, which the notebook
# shows as the cell output.
reviews.cache()


PythonRDD[2] at RDD at PythonRDD.scala:53

In [None]:
# Show one parsed review record; take(1) returns a list holding at most one element.
reviews.take(1)

[{'user_id': 'A141HP4LYPWMSR',
  'product_id': 'B003AI2VGA',
  'review': 'Synopsis: On the daily trek from Juarez, Mexico to El Paso, Texas an ever increasing number of female workers are found raped and murdered in the surrounding desert. Investigative reporter Karina Danes (Minnie Driver) arrives from Los Angeles to pursue the story and angers both the local police and the factory owners who employee the undocumented aliens with her pointed questions and relentless quest for the truth.<br /><br />Her story goes nationwide when a young girl named Mariela (Ana Claudia Talancon) survives a vicious attack and walks out of the desert crediting the Blessed Virgin for her rescue. Her story is further enhanced when the "Wounds of Christ" (stigmata) appear in her palms. She also claims to have received a message of hope for the Virgin Mary and soon a fanatical movement forms around her to fight against the evil that holds such a stranglehold on the area.<br /><br />Critique: Possessing a life

In [None]:
# Dataset statistics: number of distinct users, distinct movies and total reviews.
# Only the id is needed to count distinct values, so project it first and use
# distinct() rather than groupBy(), which would shuffle and group every full
# review record under its key just to count the keys.
num_movies = reviews.map(lambda entry: entry['product_id']).distinct().count()
num_users = reviews.map(lambda entry: entry['user_id']).distinct().count()
num_entries = reviews.count()

print("There are " + str(num_users) + " unique users")
print(str(num_movies) + " unique movies have been reviewed.")
print("The total number of reviews are " + str(num_entries))

There are 36409 unique users
1539 unique movies have been reviewed.
The total number of reviews are 50000


In [None]:
#2 Implement a PySpark script that splits the data and trains a recommendation model
import math
import numpy
import hashlib
from pyspark.mllib.recommendation import ALS

def get_hash(s):
    """Map an identifier to a stable integer in the range [0, 10**8).

    The result is the SHA-1 digest of `s`, read as an integer, modulo 10**8.

    `s` may be `bytes` (as before) or `str`; a `str` is encoded as UTF-8 first,
    so get_hash('abc') == get_hash(b'abc').

    Note: the output range is only 10**8 wide, so distinct inputs can collide.
    """
    if isinstance(s, str):
        s = s.encode('utf-8')
    return int(hashlib.sha1(s).hexdigest(), 16) % (10 ** 8)

def _to_rating(entry):
    """Turn a review dict into a (user hash, product hash, integer score) tuple."""
    user = get_hash(entry['user_id'].encode('utf-8'))
    product = get_hash(entry['product_id'].encode('utf-8'))
    return (user, product, int(entry['score']))


def _split_bucket(rating):
    """Last decimal digit of (user hash + product hash); decides the split side."""
    return (rating[0] + rating[1]) % 10


ratings = reviews.map(_to_rating)

# Deterministic split on the bucket digit: digits 2-9 go to training, digits 0-1 to test.
# This gives 80%/20% only if the ten digits are equally likely, which is not guaranteed here.
train_data = ratings.filter(lambda rating: _split_bucket(rating) >= 2)
test_data = ratings.filter(lambda rating: _split_bucket(rating) < 2)

# Report the size of each split.
print("Number of train samples: " + str(train_data.count()))
print("Number of test samples: " + str(test_data.count()))


Number of train samples: 39992
Number of test samples: 10008


In [None]:
#3) Implement a PySpark script using the ALS algorithm for collaborative filtering.

# Training hyperparameters.
rank = 20        # number of latent factors in the factorisation
num_iter = 10    # number of ALS iterations
als_seed = 100   # fixed seed so that repeated runs train the same model

# Train the ALS (alternating least squares) model on the training ratings.
# Without a seed the factor initialisation is random and results differ between runs.
model = ALS.train(train_data, rank, num_iter, seed=als_seed)

In [None]:
#Implement code to evaluate the performance of the recommendation model using appropriate metrics

# Converts every element of `lines` to float and returns the results as a new list.
def convertToFloat(lines):
    """Return a new list holding float(x) for each x in `lines`, in order."""
    return [float(value) for value in lines]

# (user, product) pairs of the test split: the inputs whose rating is to be predicted.
unknown = test_data.map(lambda entry: (int(entry[0]), int(entry[1])))
# Predict a rating for every pair and key each prediction by (user, product).
predictions = model.predictAll(unknown).map(lambda r: ((int(r[0]), int(r[1])), r[2]))
# Key the true ratings the same way and join: ((user, product), (true, predicted)).
true_and_predictions = test_data.map(lambda r: ((int(r[0]), int(r[1])), r[2])).join(predictions)

# Mean squared error: the mean of (true - predicted)**2 over all joined pairs.
# The whole difference is squared and the prediction is kept as a float.
# Previously the reduce/count ran inside the map lambda, only the prediction
# was squared, and it was truncated to an int.
MSE = true_and_predictions.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean()

# Display the first 10 ((user, product), (true, predicted)) entries.
true_and_predictions.take(10)

[((65965270, 62577830), (5, 0.42535904188643614)),
 ((7383110, 62577830), (5, 0.4786121202368883)),
 ((22477285, 58302865), (4, 0.3876043496097198)),
 ((5479805, 58302865), (5, -0.13021188161427588)),
 ((39998009, 30926631), (3, -2.6047266112667753)),
 ((56482098, 109252), (3, 0.8697726509634622)),
 ((89302666, 43077444), (3, 0.7084200093233983)),
 ((6480656, 80115934), (5, 4.976821873707983)),
 ((19486884, 36531906), (4, 0.9414270128294593)),
 ((99927280, 47280600), (5, -0.9020214859888254))]

In [None]:
# Recommendation system using the DataFrame API (same pipeline as the RDD-based code above).
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,BooleanType,DoubleType

# Obtain a SparkSession through the builder; getOrCreate() returns an existing session if there is one.
# NOTE(review): if a SparkContext is already running it is presumably reused, in which case
# master("local[1]") would have no effect; confirm.
session_builder = SparkSession.builder.master("local[1]").appName("PySpark Read JSON")
spark = session_builder.getOrCreate()

# Read the JSON file into a DataFrame, then print its schema and its first rows.
dataframe = spark.read.json('../Desktop/BDA/movies_1.json')
dataframe.printSchema()
dataframe.show()

root
 |-- helpfulness: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- profile_name: string (nullable = true)
 |-- review: string (nullable = true)
 |-- score: double (nullable = true)
 |-- summary: string (nullable = true)
 |-- time: long (nullable = true)
 |-- user_id: string (nullable = true)

+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+
|helpfulness|product_id|        profile_name|              review|score|             summary|      time|       user_id|
+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+
|        7/7|B003AI2VGA|Brian E. Erland "...|Synopsis: On the ...|  3.0|"There Is So Much...|1182729600|A141HP4LYPWMSR|
|        4/4|B003AI2VGA|          Grady Harp|THE VIRGIN OF JUA...|  3.0|Worthwhile and Im...|1181952000|A328S9RN3U5M68|
|       8/10|B003AI2VGA|Chrissy K. McVay ...|The scenes in thi...|  5.0|This m

In [None]:
# Drop rows that contain null values. DataFrames are immutable: dropna() returns a
# new DataFrame, so the name must be rebound or the cleaned result is discarded.
dataframe = dataframe.dropna()
dataframe.show()

+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+
|helpfulness|product_id|        profile_name|              review|score|             summary|      time|       user_id|
+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+
|        7/7|B003AI2VGA|Brian E. Erland "...|Synopsis: On the ...|  3.0|"There Is So Much...|1182729600|A141HP4LYPWMSR|
|        4/4|B003AI2VGA|          Grady Harp|THE VIRGIN OF JUA...|  3.0|Worthwhile and Im...|1181952000|A328S9RN3U5M68|
|       8/10|B003AI2VGA|Chrissy K. McVay ...|The scenes in thi...|  5.0|This movie needed...|1164844800|A1I7QGUDP043DG|
|        1/1|B003AI2VGA|        golgotha.gov|THE VIRGIN OF JUA...|  3.0|distantly based o...|1197158400|A1M5405JH9THP9|
|        1/1|B003AI2VGA|KerrLines "&#34;M...|Informationally, ...|  3.0|"What's going on ...|1188345600| ATXL536YX71TR|
|        0/0|B003AI2VGA|abra "a devoted 

In [None]:
#train/test split and training process
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder, CrossValidator

# ALS requires non-null integer ids and a non-null rating. The ids in this data are
# alphanumeric strings (e.g. 'A141HP4LYPWMSR'), so casting them straight to IntegerType
# produces null and ALS aborts with "user_id Ids MUST NOT be Null".
# Map every distinct id string to an integer index instead.
ratings_df = dataframe.dropna(subset=['user_id', 'product_id', 'score'])
for id_col, index_col in [('user_id', 'user_idx'), ('product_id', 'product_idx')]:
    # Fit on the full data (before splitting) so ids that end up only in the
    # test split still receive an index.
    indexer = StringIndexer(inputCol=id_col, outputCol=index_col)
    ratings_df = indexer.fit(ratings_df).transform(ratings_df)
    # StringIndexer emits double values; store them as integers for ALS.
    ratings_df = ratings_df.withColumn(index_col, ratings_df[index_col].cast(IntegerType()))

#split training and test set
train_df, test_df = ratings_df.randomSplit(weights=[0.8, 0.2], seed=100)

#display the number of entries in each df
print("The training set has " + str(train_df.count()) + " entries")
print("The test set has " + str(test_df.count()) + " entries")

# coldStartStrategy='drop' removes rows whose prediction is NaN (ids unseen in a fold),
# so the evaluator never receives NaN. The seed makes training reproducible.
als = ALS(userCol='user_idx', itemCol='product_idx', ratingCol='score',
          coldStartStrategy='drop', nonnegative=True, seed=100)
#ParamGridBuilder is used to evaluate ALS for multiple hyperparameter combinations
param_grid = ParamGridBuilder().addGrid(als.rank, [1, 20, 30]).addGrid(als.maxIter, [20]).addGrid(als.regParam, [.05, .15]).build()

# labelCol must name the rating column of the data, which is 'score' (there is no 'rating' column).
evaluator = RegressionEvaluator(metricName='rmse', labelCol='score', predictionCol='prediction')

cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=3, seed=100)

#fit the cross validator and print the hyperparameters of the best model
model = cv.fit(train_df)
best_model = model.bestModel
# avgMetrics[i] is the cross-validated RMSE obtained with param_grid[i]; lower is better.
best_index = min(range(len(model.avgMetrics)), key=lambda i: model.avgMetrics[i])
best_params = param_grid[best_index]
print('rank: ', best_model.rank)
print('MaxIter: ', best_params[als.maxIter])
print('RegParam: ', best_params[als.regParam])



The training set has 39954 entries
The test set has 10046 entries


Py4JJavaError: An error occurred while calling o4713.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 400.0 failed 1 times, most recent failure: Lost task 0.0 in stage 400.0 (TID 1313) (lpcp-23 executor driver): java.lang.RuntimeException: user_id Ids MUST NOT be Null
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.project_doConsume_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$SliceIterator.hasNext(Iterator.scala:268)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at scala.collection.AbstractIterator.to(Iterator.scala:1431)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1431)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1431)
	at org.apache.spark.rdd.RDD.$anonfun$take$2(RDD.scala:1462)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2303)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:840)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2785)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2721)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2720)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2720)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1206)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2984)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2923)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2912)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:971)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2263)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2284)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
	at org.apache.spark.rdd.RDD.$anonfun$take$1(RDD.scala:1462)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:405)
	at org.apache.spark.rdd.RDD.take(RDD.scala:1435)
	at org.apache.spark.rdd.RDD.$anonfun$isEmpty$1(RDD.scala:1572)
	at scala.runtime.java8.JFunction0$mcZ$sp.apply(JFunction0$mcZ$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:405)
	at org.apache.spark.rdd.RDD.isEmpty(RDD.scala:1572)
	at org.apache.spark.ml.recommendation.ALS$.train(ALS.scala:975)
	at org.apache.spark.ml.recommendation.ALS.$anonfun$fit$1(ALS.scala:737)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.recommendation.ALS.fit(ALS.scala:714)
	at jdk.internal.reflect.GeneratedMethodAccessor139.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: java.lang.RuntimeException: user_id Ids MUST NOT be Null
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.project_doConsume_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$SliceIterator.hasNext(Iterator.scala:268)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at scala.collection.AbstractIterator.to(Iterator.scala:1431)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1431)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at scala.collection.AbstractIterator.toArray(Iterator.scala:1431)
	at org.apache.spark.rdd.RDD.$anonfun$take$2(RDD.scala:1462)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2303)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	... 1 more
