In [1]:
from pyspark.sql import SparkSession, Row, functions
import os

In [2]:
def loadMovieNames(path='/Users/ayusman/migrate/hadooop/hadoop-basics/ml-100k/u.item'):
    """Build a lookup table from movie ID to movie title.

    Reads a pipe-delimited item file in which the first field of each
    record is the numeric movie ID and the second field is the title.

    Args:
        path: Location of the item file. Defaults to the path this notebook
            was originally written against, so existing ``loadMovieNames()``
            calls keep working unchanged.

    Returns:
        dict mapping ``int`` movie ID to ``str`` title (the title is stored
        exactly as it appears in the file, without trimming).

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the first field of a non-blank line is not an integer.
        IndexError: if a non-blank line contains no '|' separator.
    """
    movie_names = {}
    # The file is decoded as ISO-8859-1 (Latin-1), as in the original code.
    with open(path, encoding="ISO-8859-1") as f:
        for line in f:
            # A blank line (e.g. a stray newline at the end of the file)
            # carries no record; skip it instead of failing in int().
            if not line.strip():
                continue
            fields = line.split('|')
            movie_names[int(fields[0])] = fields[1]

    return movie_names

In [3]:
def parseInput(line):
    """Convert one whitespace-separated ratings line into a Spark Row.

    The second token is taken as the movie ID (int) and the third as the
    rating (float); a constant 1.0 is appended as the third value. The Row
    is built positionally, i.e. without field names.
    """
    tokens = line.split()
    movie_id = int(tokens[1])
    rating = float(tokens[2])
    return Row(movie_id, rating, 1.0)

In [4]:
# Create the Spark session (or reuse one that already exists) used by every cell below.
sparkSession = (
    SparkSession.builder
    .appName("spark dataset")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/28 13:16:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/01/28 13:16:02 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/01/28 13:16:02 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [5]:
# Build the movie-ID -> title lookup dictionary by reading the u.item file.
movieNames = loadMovieNames()

In [6]:
# Show the current working directory of the notebook process.
os.getcwd()

'/Users/ayusman/migrate/hadooop/hadoop-basics/Spark'

In [7]:
# Load the raw ratings file as an RDD with one element per text line.
RATINGS_PATH = '/Users/ayusman/migrate/hadooop/hadoop-basics/ml-100k/u.data'
lines = sparkSession.sparkContext.textFile(RATINGS_PATH)

In [8]:
# Turn every raw text line into a Row via parseInput, giving an RDD of Rows.
movies = lines.map(parseInput)

In [9]:
# Create a DataFrame from the RDD of Rows. The Rows carry no field names,
# so the columns get the default names _1, _2 and _3.
movieDataset = sparkSession.createDataFrame(movies)

In [10]:
# Preview the DataFrame; show() prints the top 20 rows here.
movieDataset.show()

+----+---+---+
|  _1| _2| _3|
+----+---+---+
| 242|3.0|1.0|
| 302|3.0|1.0|
| 377|1.0|1.0|
|  51|2.0|1.0|
| 346|1.0|1.0|
| 474|4.0|1.0|
| 265|2.0|1.0|
| 465|5.0|1.0|
| 451|3.0|1.0|
|  86|3.0|1.0|
| 257|2.0|1.0|
|1014|5.0|1.0|
| 222|5.0|1.0|
|  40|3.0|1.0|
|  29|3.0|1.0|
| 785|3.0|1.0|
| 387|5.0|1.0|
| 274|2.0|1.0|
|1042|4.0|1.0|
|1184|2.0|1.0|
+----+---+---+
only showing top 20 rows



In [11]:
# Average rating per movie ID.
# Column key: _1 = movie ID, _2 = rating, _3 = count
ratingsGroupedByMovie = movieDataset.groupBy("_1")
averageRating = ratingsGroupedByMovie.avg("_2")

In [12]:
# Peek at three rows of the per-movie average ratings.
averageRating.take(3)

[Row(_1=474, avg(_2)=4.252577319587629),
 Row(_1=29, avg(_2)=2.6666666666666665),
 Row(_1=26, avg(_2)=3.452054794520548)]

In [13]:
# Number of ratings per movie ID (_1).
groupedForCount = movieDataset.groupBy("_1")
counts = groupedForCount.count()

In [14]:
# Peek at three rows of the per-movie rating counts.
counts.take(3)

[Row(_1=474, count=194), Row(_1=29, count=114), Row(_1=26, count=73)]

In [15]:
# Combine the rating count and the average rating for each movie by joining on the movie-ID column.
averageAndcounts = counts.join(averageRating, on="_1")

In [16]:
# Peek at two rows of the joined result (movie ID, count, average rating).
averageAndcounts.take(2)

[Row(_1=474, count=194, avg(_2)=4.252577319587629),
 Row(_1=29, count=114, avg(_2)=2.6666666666666665)]

In [17]:
# Best movies: the ten rows with the highest average rating.
# NOTE: no minimum rating count is applied, so movies with only a handful
# of ratings can fill this list.
top10 = averageAndcounts.orderBy("avg(_2)", ascending=False).take(10)

In [18]:
# Display the ten collected rows (movie ID, count, average rating).
top10

[Row(_1=1536, count=1, avg(_2)=5.0),
 Row(_1=1189, count=3, avg(_2)=5.0),
 Row(_1=1599, count=1, avg(_2)=5.0),
 Row(_1=1293, count=3, avg(_2)=5.0),
 Row(_1=814, count=1, avg(_2)=5.0),
 Row(_1=1653, count=1, avg(_2)=5.0),
 Row(_1=1500, count=2, avg(_2)=5.0),
 Row(_1=1122, count=1, avg(_2)=5.0),
 Row(_1=1201, count=1, avg(_2)=5.0),
 Row(_1=1467, count=2, avg(_2)=5.0)]

In [19]:
# Look up each top-10 movie ID in the movieNames dictionary and print the
# title next to its average rating.
for movieId, ratingCount, avgRating in top10:
    print(movieNames[movieId], avgRating)

Aiqing wansui (1994) 5.0
Prefontaine (1997) 5.0
Someone Else's America (1995) 5.0
Star Kid (1997) 5.0
Great Day in Harlem, A (1994) 5.0
Entertaining Angels: The Dorothy Day Story (1996) 5.0
Santa with Muscles (1996) 5.0
They Made Me a Criminal (1939) 5.0
Marlene Dietrich: Shadow and Light (1996)  5.0
Saint of Fort Washington, The (1993) 5.0


In [20]:
# Worst movies: the ten rows with the lowest average rating.
# NOTE: no minimum rating count is applied, so movies with only a handful
# of ratings can fill this list.
lowestFirst = averageAndcounts.orderBy("avg(_2)", ascending=True)
last10 = lowestFirst.take(10)
last10

[Row(_1=1571, count=1, avg(_2)=1.0),
 Row(_1=858, count=3, avg(_2)=1.0),
 Row(_1=1570, count=1, avg(_2)=1.0),
 Row(_1=830, count=1, avg(_2)=1.0),
 Row(_1=439, count=5, avg(_2)=1.0),
 Row(_1=1343, count=1, avg(_2)=1.0),
 Row(_1=1559, count=1, avg(_2)=1.0),
 Row(_1=1374, count=2, avg(_2)=1.0),
 Row(_1=1548, count=1, avg(_2)=1.0),
 Row(_1=852, count=1, avg(_2)=1.0)]

In [25]:
# Print title, number of ratings and average rating for each of the ten lowest-rated movies.
for movieId, ratingCount, avgRating in last10:
    print(movieNames[movieId], ratingCount, avgRating)

Touki Bouki (Journey of the Hyena) (1973) 1 1.0
Amityville: Dollhouse (1996) 3 1.0
Quartier Mozart (1992) 1 1.0
Power 98 (1995) 1 1.0
Amityville: A New Generation (1993) 5 1.0
Lotto Land (1995) 1 1.0
Hostile Intentions (1994) 1 1.0
Falling in Love Again (1980) 2 1.0
The Courtyard (1995) 1 1.0
Bloody Child, The (1996) 1 1.0


In [89]:
# Stop the Spark session now that the analysis is finished.
sparkSession.stop()