<a href="https://colab.research.google.com/github/amitgundad/pySpark/blob/master/Movie_Recommendation_System_with_BigData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The goal of this sample is to extract insights from a large dataset with the help of Big Data frameworks (Spark, Hadoop) and machine learning techniques (e.g. classification, collaborative filtering, clustering, frequent pattern mining).

In [None]:
# Install a headless Java 8 JDK quietly (-qq); the command's stdout is discarded.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Download the Spark 3.0.0 / Hadoop 3.2 build (change the version number if needed;
# the archive name used by tar and the SPARK_HOME path below must then change too)
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz

# Extract the downloaded Spark archive into the current folder
!tar xf spark-3.0.0-bin-hadoop3.2.tgz

# Export JAVA_HOME and SPARK_HOME for this process.
# NOTE(review): SPARK_HOME assumes the archive was extracted under /content
# (i.e. the notebook's working directory is /content) - confirm on non-Colab hosts.
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop3.2"


# Install findspark using pip (quiet mode)
!pip install -q findspark

In [None]:
# Findspark makes the PySpark installation referenced by SPARK_HOME importable
# (SPARK_HOME is set in the install cell to spark-3.0.0-bin-hadoop3.2)
import findspark
findspark.init()

# Start Apache Spark Session & Context
import pyspark
from pyspark.sql import SQLContext

# getOrCreate() returns the already-running context when this cell is executed
# again, instead of failing because only one SparkContext may exist per process.
# The app name is only applied when a new context is actually created.
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setAppName('sd701-RecoSys-Models'))
sqlContext = SQLContext(sc)

print('Master : ', sc.master)
print('Cores  : ', sc.defaultParallelism)

Master :  local[*]
Cores  :  2


In [None]:
# spark.sql.autoBroadcastJoinThreshold is the maximum table size that Spark will
# broadcast automatically in a join; setting it to -1 disables automatic broadcasting.
sqlContext.sparkSession.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

In [None]:
!pip install koalas

Collecting koalas
  Downloading koalas-1.8.2-py3-none-any.whl (390 kB)
[?25l[K     |▉                               | 10 kB 23.6 MB/s eta 0:00:01[K     |█▊                              | 20 kB 27.3 MB/s eta 0:00:01[K     |██▌                             | 30 kB 12.3 MB/s eta 0:00:01[K     |███▍                            | 40 kB 9.3 MB/s eta 0:00:01[K     |████▏                           | 51 kB 5.3 MB/s eta 0:00:01[K     |█████                           | 61 kB 5.9 MB/s eta 0:00:01[K     |█████▉                          | 71 kB 5.6 MB/s eta 0:00:01[K     |██████▊                         | 81 kB 6.3 MB/s eta 0:00:01[K     |███████▌                        | 92 kB 4.8 MB/s eta 0:00:01[K     |████████▍                       | 102 kB 5.2 MB/s eta 0:00:01[K     |█████████▎                      | 112 kB 5.2 MB/s eta 0:00:01[K     |██████████                      | 122 kB 5.2 MB/s eta 0:00:01[K     |███████████                     | 133 kB 5.2 MB/s eta 0:00:01[K  

In [None]:
# Default Packages (available by Default in Google Colab)
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import random
from pprint import pprint
from matplotlib.lines import Line2D

# Downloaded Packages (not available by Default)
import databricks.koalas

# PySpark Utilities
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.evaluation import RegressionEvaluator, BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics

# Random Seed
SEED = 1492

# Set-up
# Matplotlib 3.6 renamed its bundled 'seaborn' style sheet to 'seaborn-v0_8' and
# later releases dropped the old name, so pick whichever this runtime provides.
PLOT_STYLE = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
plt.style.use(PLOT_STYLE)



In [None]:
# Mount Google Drive at /content/drive; the dataset paths used by this notebook
# point inside that mount.
from google.colab import drive
drive.mount('/content/drive');

Mounted at /content/drive


In [None]:
# List the project folder on the mounted Drive to check that the dataset is there.
# NOTE(review): the path is relative, so this presumes the working directory is /content - confirm.
!ls "drive/My Drive/Big Data Analytics/Sample Program"

ml-25m	ml-25m.zip


In [None]:
# Both locations live under one project folder on the mounted Drive; keeping the
# shared prefix in a single place means a folder move is a one-line change.
# The path is relative to the notebook's working directory.
PROJECT_DIR = "drive/My Drive/Big Data Analytics/Sample Program"

DATA_PATH = PROJECT_DIR + "/ml-25m"        # extracted ml-25m dataset folder
RESULTS_PATH = PROJECT_DIR + "/Results"    # presumably where outputs are written - confirm

In [None]:
class MovieLensDatasets(object):
  """Holds the MovieLens DataFrames together with a personal IMDb ratings export.

  Attributes set by ``__init__``:
    ratings:         ratings DataFrame (truncated to ``debugLimit`` rows when ``debug``).
    movies:          movies DataFrame, stored as given.
    links:           links DataFrame; ``preprocessing`` reads its ``movieId``/``imdbId`` columns.
    personalRatings: personal ratings DataFrame; ``preprocessing`` reads its
                     ``Const`` and ``Your Rating`` columns.
    users:           distinct ``userId`` values of ``ratings``.
  """

  # A rating is labelled positive (ratingsBinary == 1.0) when it is strictly above this value.
  RATING_THRESHOLD = 2.5
  # userId under which the personal ratings are inserted into the ratings DataFrame.
  PERSONAL_USER_ID = 0

  def __init__(self, ratings, movies, links, personalRatings, debug=True, debugLimit=10000):
    """Store the input DataFrames and derive the distinct users.

    Args:
      ratings: DataFrame with at least a ``userId`` column.
      movies: movies DataFrame.
      links: links DataFrame.
      personalRatings: personal ratings DataFrame.
      debug: when True, only the first ``debugLimit`` rows of ``ratings`` are kept.
      debugLimit: row cap applied to ``ratings`` in debug mode.
    """
    # Load Existing Data
    if debug:
      ratings = ratings.limit(debugLimit)

    self.ratings = ratings
    self.movies = movies
    self.links = links
    self.personalRatings = personalRatings

    # Create New DataFrame
    self.users = ratings.select('userId').distinct()

  def preprocessing(self):
    """Cast the ratings, merge in the personal ratings and add the scaled/binary columns.

    Replaces ``self.ratings`` and ``self.personalRatings`` in place. Meant to be
    called once: afterwards ``self.personalRatings`` no longer has the ``Const`` /
    ``Your Rating`` columns that this method selects.
    """
    # Preprocess MovieLens Ratings
    self.ratings = self.ratings \
      .withColumn('rating', F.col('rating').cast('double')) \
      .drop('timestamp') \
      .withColumn('userId', F.col('userId').cast('int')) \
      .withColumn('movieId', F.col('movieId').cast('int'))

    # Preprocess Personal IMDb Ratings To MovieLens Ratings
    personal = self.personalRatings \
      .select(['Const', 'Your Rating']) \
      .withColumnRenamed('Const', 'imdbId') \
      .withColumnRenamed('Your Rating', 'personalRating')

    # Halve the personal rating (presumably IMDb's 10-point scale to the 5-star
    # scale - confirm) and drop the first two characters of the IMDb id
    # (presumably the 'tt' prefix - confirm) so it can be matched against links.imdbId.
    personal = personal \
      .withColumn('personalRating', F.col('personalRating').cast('double') * 0.5) \
      .withColumn('imdbId', F.expr("substr(imdbId, 3)")) \
      .join(self.links.select('movieId', 'imdbId'), ['imdbId'], how='inner')

    # Insert IMDb Ratings into MovieLens Ratings Dataset.
    # The ids are emitted as ints, matching the casts applied to self.ratings above;
    # a string userId here would widen the unioned userId column to string.
    # union() matches columns by position, hence the explicit column order.
    self.personalRatings = personal.select(
      F.lit(self.PERSONAL_USER_ID).cast('int').alias('userId'),
      F.col('movieId').cast('int').alias('movieId'),
      F.col('personalRating').alias('rating'))
    self.ratings = self.ratings.union(self.personalRatings)

    # Binarize MovieLens Ratings: 1.0 when rating > 2.5 (i.e. >= 3.0 on a half-star
    # scale), else 0.0. Native column expressions avoid the Python UDF round trip,
    # and a null rating stays null instead of raising inside a UDF.
    scaled = F.col('ratingsScaled')
    self.ratings = self.ratings \
      .withColumn('ratingsScaled', F.col('rating') - self.RATING_THRESHOLD) \
      .withColumn('ratingsBinary',
                  F.when(scaled > 0.0, F.lit(1.0)).when(scaled <= 0.0, F.lit(0.0)))

  def get_ratings(self):
    """Return the current ratings DataFrame."""
    return self.ratings

  def get_movies(self):
    """Return the movies DataFrame."""
    return self.movies

  # Displaying Null Values
  @staticmethod
  def spark_df_display_null_values(sparkDf):
    """Print, for every column of ``sparkDf``, the count of NaN values and of null values."""
    # F.when(cond, c) with a string c yields a non-null literal for matching rows
    # and null otherwise, so F.count() counts exactly the matching rows.
    print('NaN values ?')
    sparkDf.select([F.count(F.when(F.isnan(c), c)).alias(c) for c in sparkDf.columns]).show()

    print('Null values ?')
    sparkDf.select([F.count(F.when(F.isnull(c), c)).alias(c) for c in sparkDf.columns]).show()


# Module-level alias so the helper can also be called without the class prefix.
spark_df_display_null_values = MovieLensDatasets.spark_df_display_null_values
