<a href="https://colab.research.google.com/github/aravindgopisetty/BDA-ASSIGNMENT/blob/main/BDA_ASSIGNMENT2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Install Java (required for Spark)
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Download Spark 3.3.2 (check for the latest version at https://spark.apache.org/downloads.html)
!wget -q https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz

# Unzip Spark
!tar xf spark-3.3.2-bin-hadoop3.tgz

# Set environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.2-bin-hadoop3"

# Install findspark to locate Spark
!pip install -q findspark

In [5]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import Row
import pyspark.sql.functions as F
# Create the Spark session for this notebook (or reuse one that already exists),
# running in local mode with the "local[*]" master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("MovieRecommendation")
    .getOrCreate()
)

# Display the session summary as the cell output.
spark

In [6]:

!wget -q http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!unzip -q ml-latest-small.zip
!ls ml-latest-small

links.csv  movies.csv  ratings.csv  README.txt	tags.csv


In [7]:

# Read the two MovieLens tables, taking column names from the header row and
# letting Spark infer the column types.
movies = spark.read.csv(
    "ml-latest-small/movies.csv",
    header=True,
    inferSchema=True,
)
ratings = spark.read.csv(
    "ml-latest-small/ratings.csv",
    header=True,
    inferSchema=True,
)

# Preview the first rows of each table.
print("Movies:")
movies.show(5, truncate=False)

print("\nRatings:")
ratings.show(5)

# Number of ratings per movie, most-rated first.
ratings_grouped = ratings.groupBy("movieId").count()
ratings_grouped.orderBy(F.desc("count")).show(5)

# Attach titles/genres to the counts; the left join keeps movies that have no ratings.
movie_ratings = movies.join(ratings_grouped, on="movieId", how="left")
movie_ratings.orderBy(F.desc("count")).show(5, truncate=False)

Movies:
+-------+----------------------------------+-------------------------------------------+
|movieId|title                             |genres                                     |
+-------+----------------------------------+-------------------------------------------+
|1      |Toy Story (1995)                  |Adventure|Animation|Children|Comedy|Fantasy|
|2      |Jumanji (1995)                    |Adventure|Children|Fantasy                 |
|3      |Grumpier Old Men (1995)           |Comedy|Romance                             |
|4      |Waiting to Exhale (1995)          |Comedy|Drama|Romance                       |
|5      |Father of the Bride Part II (1995)|Comedy                                     |
+-------+----------------------------------+-------------------------------------------+
only showing top 5 rows


Ratings:
+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4

In [8]:

# Train an ALS collaborative-filtering model and score it on held-out ratings.

# Fixed seed shared by the train/test split and the ALS initialisation so that
# "Restart & Run All" reproduces the same model and the same reported RMSE.
SEED = 42

# 80% of the ratings for training, 20% held out for evaluation.
(training, test) = ratings.randomSplit([0.8, 0.2], seed=SEED)

als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    # Users/movies that appear only in the test split get NaN predictions;
    # "drop" removes those rows so the RMSE below is not NaN.
    coldStartStrategy="drop",
    seed=SEED,
)

model = als.fit(training)

# Predict ratings for the held-out rows and measure the error against the true ratings.
predictions = model.transform(test)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)

rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse:.2f}")

Root-mean-square error = 1.09


In [9]:

# Produce top-N recommendations from the trained model and print them, with
# titles, for a single user.

# How many recommendations to generate per user / per movie.
NUM_RECS = 10

user_recs = model.recommendForAllUsers(NUM_RECS)
movie_recs = model.recommendForAllItems(NUM_RECS)

# The user whose recommendations are printed below.
user_id = 1

user_recs_df = user_recs.filter(user_recs.userId == user_id).collect()

if user_recs_df:
    # Each entry carries a movieId and the predicted rating.
    rec_movies = user_recs_df[0].recommendations

    # Look up all recommended titles in ONE Spark job instead of running a
    # separate filter/collect per movie inside the loop.
    rec_ids = [rec.movieId for rec in rec_movies]
    title_by_id = {
        row.movieId: row.title
        for row in movies
        .filter(F.col("movieId").isin(rec_ids))
        .select("movieId", "title")
        .collect()
    }

    print(f"\nTop {NUM_RECS} recommendations for user {user_id}:")
    # Iterate over rec_movies (not the lookup result) to keep the model's ranking order.
    for rec in rec_movies:
        # Fall back to a placeholder rather than raising if a movie id has no
        # entry in the movies table.
        movie_title = title_by_id.get(rec.movieId, f"<unknown movie {rec.movieId}>")
        print(f"{movie_title} (predicted rating: {rec.rating:.2f})")
else:
    print(f"No recommendations for user {user_id} (possibly new user)")


Top 10 recommendations for user 1:
Yojimbo (1961) (predicted rating: 6.37)
Guess Who's Coming to Dinner (1967) (predicted rating: 6.35)
Shadowlands (1993) (predicted rating: 6.22)
Sophie Scholl: The Final Days (Sophie Scholl - Die letzten Tage) (2005) (predicted rating: 6.17)
Simple Plan, A (1998) (predicted rating: 6.16)
On the Waterfront (1954) (predicted rating: 6.12)
Jetée, La (1962) (predicted rating: 6.11)
Kelly's Heroes (1970) (predicted rating: 6.07)
Grave of the Fireflies (Hotaru no haka) (1988) (predicted rating: 6.03)
Dead Again (1991) (predicted rating: 5.97)
