## Binod Suman Academy YouTube

## Dataset download from https://www.kaggle.com/zygmunt/goodbooks-10k/download

In [93]:
import pyspark as ps
from pyspark.sql import SQLContext
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from pyspark.sql import Row
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import udf, col, when
import numpy as np

# to show all predicted book image
from IPython.display import Image
from IPython.display import display

In [5]:
# Create a spark session 
# create s spark context

spark = ps.sql.SparkSession.builder \
            .master("local") \
            .appName("Youtube demo") \
            .getOrCreate() 
            
sc = spark.sparkContext 
sqlContext = SQLContext(sc)

In [7]:
# Create dataframe for rating dataset
# inferSchema: automatically infer column types. 
# We want header of dataframe.

ratings_df = spark.read.csv('goodbooks-10k/ratings.csv',header=True, inferSchema=True)
ratings_df.printSchema()

root
 |-- book_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- rating: integer (nullable = true)



In [8]:
ratings_df.show(5)

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      1|    314|     5|
|      1|    439|     3|
|      1|    588|     5|
|      1|   1169|     4|
|      1|   1185|     4|
+-------+-------+------+
only showing top 5 rows



In [9]:
# Creating data frame from book dataset csv
books_df = spark.read.csv('goodbooks-10k/books.csv',header=True, inferSchema=True) 
books_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- book_id: integer (nullable = true)
 |-- best_book_id: integer (nullable = true)
 |-- work_id: integer (nullable = true)
 |-- books_count: integer (nullable = true)
 |-- isbn: string (nullable = true)
 |-- isbn13: double (nullable = true)
 |-- authors: string (nullable = true)
 |-- original_publication_year: double (nullable = true)
 |-- original_title: string (nullable = true)
 |-- title: string (nullable = true)
 |-- language_code: string (nullable = true)
 |-- average_rating: string (nullable = true)
 |-- ratings_count: string (nullable = true)
 |-- work_ratings_count: string (nullable = true)
 |-- work_text_reviews_count: string (nullable = true)
 |-- ratings_1: double (nullable = true)
 |-- ratings_2: integer (nullable = true)
 |-- ratings_3: integer (nullable = true)
 |-- ratings_4: integer (nullable = true)
 |-- ratings_5: integer (nullable = true)
 |-- image_url: string (nullable = true)
 |-- small_image_url: string (nullable = true)


In [10]:
books_df.show(1)

+---+-------+------------+-------+-----------+---------+----------------+---------------+-------------------------+----------------+--------------------+-------------+--------------+-------------+------------------+-----------------------+---------+---------+---------+---------+---------+--------------------+--------------------+
| id|book_id|best_book_id|work_id|books_count|     isbn|          isbn13|        authors|original_publication_year|  original_title|               title|language_code|average_rating|ratings_count|work_ratings_count|work_text_reviews_count|ratings_1|ratings_2|ratings_3|ratings_4|ratings_5|           image_url|     small_image_url|
+---+-------+------------+-------+-----------+---------+----------------+---------------+-------------------------+----------------+--------------------+-------------+--------------+-------------+------------------+-----------------------+---------+---------+---------+---------+---------+--------------------+--------------------+
|  1

In [11]:
# ALS requires all input should be numeric. But no need to change as we have all data in numeric.
training_df, validation_df = ratings_df.randomSplit([.8, .2])

In [12]:
# These tuninar parameter will be using in ALS
# Watch YOUTUBE Video
iterations = 10
regularization_parameter = 0.1
rank= 4
errors = []
err = 0

In [14]:
als = ALS(maxIter=iterations, regParam=regularization_parameter, rank=4, userCol="user_id", itemCol="book_id", ratingCol="rating")
model = als.fit(training_df)
predictions = model.transform(validation_df)
new_predictions = predictions.filter(col('prediction') != np.nan)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(new_predictions)
print("Root-mean-square error = " + str(rmse))



Root-mean-square error = 0.8958079141064279


In [15]:
als = ALS(maxIter=iterations, regParam=regularization_parameter, rank=5, userCol="user_id", itemCol="book_id", ratingCol="rating")
model = als.fit(training_df)
predictions = model.transform(validation_df)
new_predictions = predictions.filter(col('prediction') != np.nan)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(new_predictions)
print("Root-mean-square error = " + str(rmse))




Root-mean-square error = 0.8992570281172305


In [16]:
for rank in range(4,10):
    als = ALS(maxIter=iterations, regParam=regularization_parameter, rank=rank, userCol="user_id", itemCol="book_id", ratingCol="rating")
    model = als.fit(training_df)
    predictions = model.transform(validation_df)
    new_predictions = predictions.filter(col('prediction') != np.nan)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
    rmse = evaluator.evaluate(new_predictions)
    print("Rank :", rank, " Root-mean-square error = " + str(rmse))

Rank : 4  Root-mean-square error = 0.8958079141064279
Rank : 5  Root-mean-square error = 0.8992570281172305
Rank : 6  Root-mean-square error = 0.9015580965812328
Rank : 7  Root-mean-square error = 0.9087950708936399
Rank : 8  Root-mean-square error = 0.9077128348054919
Rank : 9  Root-mean-square error = 0.9132873140456141


In [None]:
# Build a CrossValidator to evaluate over the same 
# range of ranks with different regularization 
# constants. 

In [19]:
als = ALS(maxIter=iterations, regParam=regularization_parameter, rank=rank, userCol="user_id", itemCol="book_id", ratingCol="rating")
paramGrid = ParamGridBuilder() \
    .addGrid(als.regParam, [0.1, 0.01, 0.18]) \
    .addGrid(als.rank, range(4, 6)) \
    .build()
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
crossval = CrossValidator(estimator=als,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5)
cvModel = crossval.fit(training_df)

In [20]:
cvModel_pred = cvModel.transform(validation_df)
cvModel_pred = cvModel_pred.filter(col('prediction') != np.nan)
rmse = evaluator.evaluate(cvModel_pred)
print("the rmse for optimal grid parameters with cross validation is: {}".format(rmse))

the rmse for optimal grid parameters with cross validation is: 0.8958079141064279


In [21]:
# Build final model with Rank 4 and Lambda 0.18
final_als = ALS(maxIter=10, regParam=0.1, rank=4, userCol="user_id", itemCol="book_id", ratingCol="rating")
final_model = final_als.fit(training_df)

In [84]:
# show 10 predictions
predictions = final_model.transform(validation_df)
predictions.show(n = 10)
# Predictin is very much match with actual ratings

+-------+-------+------+----------+
|book_id|user_id|rating|prediction|
+-------+-------+------+----------+
|    148|  35982|     3| 3.4614282|
|    148|   3922|     3| 3.8240075|
|    148|   6630|     3| 3.1852236|
|    148|  32055|     3| 3.1400247|
|    148|   7001|     4| 3.7838688|
|    148|  14603|     4| 3.4999962|
|    148|  33065|     3| 3.6070657|
|    148|  42404|     5| 3.9058194|
|    148|  50999|     3|  3.183998|
|    148|  40167|     5| 3.1608467|
+-------+-------+------+----------+
only showing top 10 rows



In [108]:
predictions.join(books_df, "book_id").select("user_id","title","prediction").show(5)



+-------+----------+----------+
|user_id|     title|prediction|
+-------+----------+----------+
|  19526|Lysistrata| 4.5223827|
|  43689|Lysistrata| 4.1192284|
|  37449|Lysistrata| 4.6407695|
|  34404|Lysistrata| 4.2335396|
|  50313|Lysistrata| 4.3338776|
+-------+----------+----------+
only showing top 5 rows



In [90]:
for_one_user = predictions.filter(col("user_id")==35982).join(books_df, "book_id").select("user_id","title","image_url","prediction")

for_one_user.count()


5

In [91]:
for_one_user.show()

+-------+--------------------+--------------------+----------+
|user_id|               title|           image_url|prediction|
+-------+--------------------+--------------------+----------+
|  35982|The Innocent Man:...|https://s.gr-asse...| 3.4493642|
|  35982|The Lost Continen...|https://images.gr...| 2.4551258|
|  35982|Another Bullshit ...|https://s.gr-asse...|  3.948512|
|  35982|The Portrait of a...|https://images.gr...|  3.552127|
|  35982|The Lord of the R...|https://s.gr-asse...| 3.8522704|
+-------+--------------------+--------------------+----------+



In [105]:
for book in for_one_user.take(10):
    print(book.title)
    display(Image(url=book.image_url))

The Innocent Man: Murder and Injustice in a Small Town


The Lost Continent: Travels in Small Town America


Another Bullshit Night in Suck City


The Portrait of a Lady


The Lord of the Rings: Weapons and Warfare


In [22]:
# Generate top 5 books recommendations for each user
userRecomments = final_model.recommendForAllUsers(5)
# Generate top 5 user recommendations for each movie
bookRecomments = final_model.recommendForAllItems(5)

In [24]:
userRecomments.printSchema()

root
 |-- user_id: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- book_id: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)



In [25]:
userRecomments.select("user_id","recommendations.book_id").show(10, False)

+-------+------------------------------+
|user_id|book_id                       |
+-------+------------------------------+
|148    |[6590, 9076, 562, 862, 6920]  |
|463    |[8109, 3124, 4336, 3082, 3395]|
|471    |[977, 5175, 2520, 5339, 5604] |
|496    |[1338, 7537, 2236, 8854, 3885]|
|833    |[3953, 1338, 6613, 2236, 3885]|
|1088   |[8109, 3628, 7254, 9024, 5207]|
|1238   |[7401, 4868, 852, 5919, 4778] |
|1342   |[7401, 4154, 4868, 6902, 3628]|
|1580   |[6590, 6920, 5207, 862, 3628] |
|1645   |[6590, 7401, 3953, 852, 9076] |
+-------+------------------------------+
only showing top 10 rows



In [26]:
bookRecomments.select("book_id","recommendations.user_id").show(10, False)

+-------+-----------------------------------+
|book_id|user_id                            |
+-------+-----------------------------------+
|1580   |[23536, 20327, 30585, 23677, 20879]|
|4900   |[15566, 38076, 23536, 21791, 9551] |
|5300   |[23536, 27659, 20629, 15449, 10137]|
|6620   |[23536, 24883, 13570, 17113, 38217]|
|7240   |[23536, 41168, 27329, 51253, 16210]|
|7340   |[23536, 20327, 16210, 24883, 41168]|
|7880   |[23536, 21720, 41569, 47983, 12359]|
|9900   |[23536, 20327, 12402, 415, 18510]  |
|471    |[23536, 20327, 16210, 41168, 34474]|
|1591   |[23536, 20327, 41168, 415, 38948]  |
+-------+-----------------------------------+
only showing top 10 rows



In [27]:
# Generate top 10 Book recommendations for a specified set of users
users = ratings_df.select("user_id").distinct().limit(3);
users.show()

+-------+
|user_id|
+-------+
|  32592|
|  19984|
|  35982|
+-------+



In [28]:
userSubsetRecs = model.recommendForUserSubset(users, 10)

In [29]:
userSubsetRecs.select("user_id","recommendations.book_id").show(10, False)

+-------+-----------------------------------------------------------+
|user_id|book_id                                                    |
+-------+-----------------------------------------------------------+
|32592  |[862, 3282, 9842, 5207, 7401, 9076, 4868, 6009, 1788, 464] |
|35982  |[3628, 7254, 9024, 516, 6590, 9516, 3124, 5853, 4149, 2520]|
|19984  |[8976, 8854, 9076, 3952, 6590, 2441, 4868, 862, 1029, 7844]|
+-------+-----------------------------------------------------------+



In [31]:
# Generate top 10 user recommendations for a specified set of books
movies = ratings_df.select("book_id").distinct().limit(3)
movies.show()


+-------+
|book_id|
+-------+
|    148|
|    463|
|    471|
+-------+



In [32]:
movieSubSetRecs = model.recommendForItemSubset(movies, 10)

In [33]:
movieSubSetRecs.select("book_id","recommendations.user_id").show(10, False)

+-------+---------------------------------------------------------------------+
|book_id|user_id                                                              |
+-------+---------------------------------------------------------------------+
|471    |[51190, 50307, 42246, 10664, 31900, 8963, 12236, 12749, 12353, 18033]|
|463    |[37357, 18033, 22551, 42979, 28800, 30653, 49484, 49918, 49328, 3228]|
|148    |[10664, 47531, 51190, 8963, 19406, 8958, 40328, 19167, 17371, 1560]  |
+-------+---------------------------------------------------------------------+



In [36]:
# Predicetin for a user and match with to-read list
book_ids = [860,1524,2885,2914,5297,7397,8802,9506]
user_ids = [4917,4917,4917,4917,4917,4917,4917,4917]
new_user_preds = sqlContext.createDataFrame(zip(book_ids, user_ids), schema=['book_id','user_id'])

In [37]:
new_user_preds.show()

+-------+-------+
|book_id|user_id|
+-------+-------+
|    860|   4917|
|   1524|   4917|
|   2885|   4917|
|   2914|   4917|
|   5297|   4917|
|   7397|   4917|
|   8802|   4917|
|   9506|   4917|
+-------+-------+



In [38]:
new_predictions = model.transform(new_user_preds)
new_predictions.show()

+-------+-------+----------+
|book_id|user_id|prediction|
+-------+-------+----------+
|   2914|   4917|  3.819562|
|    860|   4917| 3.9312704|
|   2885|   4917| 3.5516005|
|   7397|   4917| 3.9508529|
|   8802|   4917| 3.4887486|
|   9506|   4917| 3.0527444|
|   1524|   4917| 4.0575905|
|   5297|   4917| 3.6285481|
+-------+-------+----------+



## Binod Suman Academy YouTube