# Movie Similarities Project - Faizan Wali Bhutto

In [4]:
# importing needed libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Bind `spark` explicitly so the notebook survives Restart & Run All on any kernel.
# getOrCreate() hands back the session the kernel already provides (if any)
# and only builds a new one when none exists.
spark = SparkSession.builder.getOrCreate()

print("Spark session initialized:", spark)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Spark session initialized: <pyspark.sql.session.SparkSession object at 0x7f5b6abb4df0>

In [14]:
# S3 prefix that holds ratings.dat and movies.dat
s3_path = "s3://faizans-bucket-asgn-6/ml-10M/"

# Explicit DDL schemas instead of inferSchema=True:
#   * inference needs an extra full pass over each file before the real read;
#   * column names and types are pinned here rather than depending on the data,
#     so no follow-up renaming of _c0.._cN is needed.
# timestamp is BIGINT so epoch seconds can never overflow a 32-bit INT.
ratings_schema = "userId INT, movieId INT, rating DOUBLE, timestamp BIGINT"
movies_schema = "movieId INT, title STRING, genres STRING"

# Both files are headerless and use "::" as the field separator.
ratings = spark.read.csv(
    s3_path + "ratings.dat",
    sep="::",
    schema=ratings_schema,
    header=False,
)

movies = spark.read.csv(
    s3_path + "movies.dat",
    sep="::",
    schema=movies_schema,
    header=False,
)

# Peek at the first rows of each DataFrame to verify the load
ratings.show(5)
movies.show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|    122|   5.0|838985046|
|     1|    185|   5.0|838983525|
|     1|    231|   5.0|838983392|
|     1|    292|   5.0|838983421|
|     1|    316|   5.0|838983392|
+------+-------+------+---------+
only showing top 5 rows

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows

### Prepping the data for similarity calculation

In [15]:
from pyspark.sql.functions import col, lit

# Ratings given to Toy Story (movieId 1): one row per rating, keeping only the
# user and the rating itself. The rating column is renamed so it does not clash
# with the generic "rating" column after the join below.
toy_story_ratings = (
    ratings
    .where(col("movieId") == 1)
    .select("userId", col("rating").alias("toy_story_rating"))
)

# Every rating made by a user who also rated Toy Story, carried alongside
# that user's Toy Story rating.
co_rated = ratings.join(toy_story_ratings, on="userId")

# Attach titles via movieId.
co_rated_with_titles = co_rated.join(movies, on="movieId")

# Shape the columns into the required output layout:
#   "Movie Name"     - constant, always the Toy Story title
#   "Similar Movies" - title of the co-rated movie
user_movie_ratings = co_rated_with_titles.select(
    lit("Toy Story (1995)").alias("Movie Name"),
    col("title").alias("Similar Movies"),
    col("rating"),
    col("toy_story_rating"),
)

# Sanity check: first few rows
user_movie_ratings.show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+--------------------+------+----------------+
|      Movie Name|      Similar Movies|rating|toy_story_rating|
+----------------+--------------------+------+----------------+
|Toy Story (1995)|    Toy Story (1995)|   5.0|             5.0|
|Toy Story (1995)|Sense and Sensibi...|   5.0|             5.0|
|Toy Story (1995)|   Get Shorty (1995)|   4.0|             5.0|
|Toy Story (1995)|City of Lost Chil...|   5.0|             5.0|
|Toy Story (1995)|12 Monkeys (Twelv...|   5.0|             5.0|
+----------------+--------------------+------+----------------+
only showing top 5 rows

### Calculating the similarity scores

In [16]:
from pyspark.sql.functions import sum, sqrt, count, lit

# Namespaced access to Spark SQL functions, so this cell does not depend on the
# bare name `sum` (which the cell's import line rebinds away from Python's built-in).
from pyspark.sql import functions as F

# Tunables for this analysis
TARGET_MOVIE_ID = 1                  # same id used to build toy_story_ratings
TARGET_TITLE = "Toy Story (1995)"    # label shown in the "Movie Name" column
MIN_CO_RATINGS = 10                  # minimum number of shared raters to keep a movie

# Pair every rating with the same user's Toy Story rating.
co_ratings = ratings.join(toy_story_ratings, "userId")

# Cosine-similarity components, aggregated per movieId.
# movieId is the dataset's key; a title is only a display string and is not
# guaranteed to be unique, so grouping on it could merge distinct movies.
# Titles are attached after aggregation, so that join touches one row per
# movie instead of one row per rating.
similarity_components = (
    co_ratings
    .groupBy("movieId")
    .agg(
        F.sum(F.col("rating") * F.col("toy_story_rating")).alias("dot_product"),
        F.sqrt(F.sum(F.col("rating") * F.col("rating"))).alias("rating_norm"),
        F.sqrt(F.sum(F.col("toy_story_rating") * F.col("toy_story_rating"))).alias("toy_story_norm"),
        F.count("*").alias("co_rating_count"),  # number of users who rated both movies
    )
    .join(
        movies.select("movieId", F.col("title").alias("Similar Movies")),
        "movieId",
    )
)

# Keep movies with enough co-ratings, then compute
# score = dot_product / (rating_norm * toy_story_norm)
similarity_scores = (
    similarity_components
    .filter(F.col("co_rating_count") >= MIN_CO_RATINGS)
    .withColumn(
        "score",
        F.col("dot_product") / (F.col("rating_norm") * F.col("toy_story_norm")),
    )
)

# Drop the target movie itself (by id, not by title string) and select the
# columns in the required output format. The title is a secondary sort key so
# that equal scores come back in a stable order.
similar_movies = (
    similarity_scores
    .filter(F.col("movieId") != TARGET_MOVIE_ID)
    .select(
        F.lit(TARGET_TITLE).alias("Movie Name"),
        F.col("Similar Movies"),
        F.col("score"),
    )
    .orderBy(F.col("score").desc(), F.col("Similar Movies").asc())
)

# 10 most similar movies to Toy Story
similar_movies.show(10)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+--------------------+------------------+
|      Movie Name|      Similar Movies|             score|
+----------------+--------------------+------------------+
|Toy Story (1995)|In Old Chicago (1...|0.9966215554619151|
|Toy Story (1995)|Marooned in Iraq ...|0.9946998598171956|
|Toy Story (1995)|They Died with Th...|0.9930166580182394|
|Toy Story (1995)|Standard Operatin...|0.9924827921220798|
|Toy Story (1995)|    Kings Row (1942)|0.9919860408658672|
|Toy Story (1995)|Desperate Hours, ...|0.9902073326897796|
|Toy Story (1995)|Piece of the Acti...|0.9896856126772272|
|Toy Story (1995)|  Road to Rio (1947)|0.9895503677685183|
|Toy Story (1995)|       Hawaii (1966)| 0.989278067592776|
|Toy Story (1995)|Lone Wolf and Cub...|0.9889210628340293|
+----------------+--------------------+------------------+
only showing top 10 rows