In [1]:
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions  import *
from pyspark.sql.types import *

In [2]:
# Spark configuration for this notebook, collected on a single SparkConf so the
# session is described in exactly one place.
# NOTE(review): with a local[*] master the executor memory/cores settings may
# not take effect — confirm against the Spark configuration documentation.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("Books")
    .set("spark.executor.memory", "6G")
    .set("spark.driver.memory", "2G")
    .set("spark.executor.cores", "4")
    .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .set("spark.default.parallelism", "4")
)

# Create (or reuse) a SparkSession rather than a bare SparkContext.
# The application name comes from `conf` ("Books"). Do not additionally call
# .appName(...) on the builder: it is applied after .config(conf=conf) and
# would replace the name configured above with a second, conflicting one.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/23 18:25:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Read the book catalogue: a semicolon-delimited CSV whose first row holds the
# column names. Then print the first rows as a quick sanity check.
books_reader = spark.read.option("delimiter", ";").option("header", "true")
books_df = books_reader.csv('./BX-CSV-Dump/BX-Books.csv')
books_df.show()

+----------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|      ISBN|          Book-Title|         Book-Author|Year-Of-Publication|           Publisher|         Image-URL-S|         Image-URL-M|         Image-URL-L|
+----------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|0195153448| Classical Mythology|  Mark P. O. Morford|               2002|Oxford University...|http://images.ama...|http://images.ama...|http://images.ama...|
|0002005018|        Clara Callan|Richard Bruce Wright|               2001|HarperFlamingo Ca...|http://images.ama...|http://images.ama...|http://images.ama...|
|0060973129|Decision in Normandy|        Carlo D'Este|               1991|     HarperPerennial|http://images.ama...|http://images.ama...|http://images.ama...|
|0374157065|Flu: The Story of...|    Gina Bari

In [5]:
# Project only the title column and print its first rows.
book_titles = books_df.select('Book-Title')
book_titles.show()

+--------------------+
|          Book-Title|
+--------------------+
| Classical Mythology|
|        Clara Callan|
|Decision in Normandy|
|Flu: The Story of...|
|The Mummies of Ur...|
|The Kitchen God's...|
|What If?: The Wor...|
|     PLEADING GUILTY|
|Under the Black F...|
|Where You'll Find...|
|Nights Below Stat...|
|Hitler's Secret B...|
|  The Middle Stories|
|            Jane Doe|
|A Second Chicken ...|
|The Witchfinder (...|
|More Cunning Than...|
|Goodbye to the Bu...|
|       The Testament|
|Beloved (Plume Co...|
+--------------------+
only showing top 20 rows



In [6]:
# Number of different values found in the Publisher column; the count is the
# cell's last expression so the notebook displays it.
distinct_publishers = books_df.select('Publisher').distinct()
distinct_publishers.count()

                                                                                

16807

In [7]:
# Expose the catalogue to Spark SQL under the view name 'Table', then fetch the
# Publisher column through a SQL statement instead of the DataFrame API.
books_df.createOrReplaceTempView('Table')
publisher_query = "SELECT Publisher from Table"
df = spark.sql(publisher_query)
df.show()

+--------------------+
|           Publisher|
+--------------------+
|Oxford University...|
|HarperFlamingo Ca...|
|     HarperPerennial|
|Farrar Straus Giroux|
|W. W. Norton &amp...|
|    Putnam Pub Group|
|Berkley Publishin...|
|          Audioworks|
|        Random House|
|            Scribner|
|     Emblem Editions|
|       Citadel Press|
|House of Anansi P...|
|          Mira Books|
|Health Communicat...|
|Brilliance Audio ...|
|Kensington Publis...|
|      River City Pub|
|                Dell|
|               Plume|
+--------------------+
only showing top 20 rows



In [10]:
# Load the raw ratings file (semicolon-delimited, first row is the header).
ratings_reader = spark.read.option("delimiter", ";").option("header", "true")
user_ratings_df = ratings_reader.csv('./BX-CSV-Dump/BX-Book-Ratings.csv')

# User-ID, ISBN and Book-Rating arrive as strings; cast each of them to int in
# turn, always reading the source value from the raw frame.
ratings_df = user_ratings_df
for column_name in ("User-ID", "ISBN", "Book-Rating"):
    ratings_df = ratings_df.withColumn(
        column_name, user_ratings_df[column_name].cast(IntegerType())
    )

# Discard every row that holds a null in any column.
# NOTE(review): ISBN values that are not plain 32-bit integers (for example
# ones ending in 'X') presumably become null in the cast above and are removed
# here — confirm that losing those ratings is acceptable.
ratings_df = ratings_df.na.drop()
ratings_df.show()


+-------+----------+-----------+
|User-ID|      ISBN|Book-Rating|
+-------+----------+-----------+
| 276726| 155061224|          5|
| 276727| 446520802|          0|
| 276729| 521795028|          6|
| 276733|2080674722|          0|
| 276737| 600570967|          6|
| 276745| 342310538|         10|
| 276746| 425115801|          0|
| 276746| 449006522|          0|
| 276746| 553561618|          0|
| 276746| 786013990|          0|
| 276746| 786014512|          0|
| 276747|  60517794|          9|
| 276747| 451192001|          0|
| 276747| 609801279|          0|
| 276747| 671537458|          9|
| 276747| 679776818|          8|
| 276747| 943066433|          7|
| 276747|1570231028|          0|
| 276747|1885408226|          7|
| 276748| 747558167|          6|
+-------+----------+-----------+
only showing top 20 rows



In [12]:
# Configure the ALS collaborative-filtering estimator on the rating triples
# (user = User-ID, item = ISBN, rating = Book-Rating). coldStartStrategy="drop"
# is passed through unchanged from the original configuration.
# NOTE(review): many displayed ratings are 0; if 0 marks an unrated/implicit
# interaction rather than a real score, filter those rows or revisit the model
# settings — confirm against the dataset description.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="User-ID",
    itemCol="ISBN",
    ratingCol="Book-Rating",
    coldStartStrategy="drop",
    seed=42,  # fixed seed so repeated runs start from the same initial factors
)

# Fit the model to the ratings.
dataframemodel = als.fit(ratings_df)


                                                                                

In [14]:
# Keep only the ratings submitted by user 17.
ratings = ratings_df.filter(ratings_df['User-ID'] == 17)

# Attach the catalogue details of every book that user rated (matched on ISBN)
# and print a readable subset of the columns.
same_isbn = ratings.ISBN == books_df.ISBN
rated_books = books_df.join(ratings, same_isbn)
rated_books.select(
    'User-ID',
    'Book-Title',
    'Book-Author',
    'Year-Of-Publication',
    'Book-Rating',
).show()


                                                                                

+-------+--------------------+-----------------+-------------------+-----------+
|User-ID|          Book-Title|      Book-Author|Year-Of-Publication|Book-Rating|
+-------+--------------------+-----------------+-------------------+-----------+
|     17|OUT OF THE SILENT...|       C.S. Lewis|               1996|          0|
|     17|Prelude to Founda...|     ISAAC ASIMOV|               1989|          0|
|     17|             Prophet| Frank E. Peretti|               1992|          3|
|     17|     Winter Solstice|Rosamunde Pilcher|               2001|          0|
|     17| Death in the Clouds|  Agatha Christie|               1997|          7|
|     17|Piercing the Dark...| Frank E. Peretti|               1989|          6|
|     17|Bant/Spec.Last of...|    Louis L'Amour|               1987|          5|
+-------+--------------------+-----------------+-------------------+-----------+

