### Importing needed Libraries

In [166]:
import pyspark
import pandas as pd
import re
from pyspark.sql import SparkSession

###Mounting Google Drive for Accessing Data

In [167]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Creating Spark Seesion

In [168]:
spark = SparkSession.builder.master("local[*]") \
                    .appName('Movie Recommendation Engine') \
                    .getOrCreate()

### Loading Movies DataSet into Spark DataFrame

In [189]:
movies = spark.read. \
      option("header","true"). \
      option("inferSchema","true"). \
      csv('/content/drive/MyDrive/Colab Notebooks/movies.csv')

In [190]:
movies.count()

62423

In [191]:
movies.drop_duplicates()

DataFrame[movieId: int, title: string, genres: string]

### Cleaning movies titles using spark and regex 

In [192]:
def clean_title(title):
  """
    This function will clean movie title by removing all special chars from title except space and alphanumeric chars
  """
  return re.sub("[^a-zA-Z0-9 ]","",title)

In [193]:
 # Example 
 print(clean_title("Vishal--- ====@#$%barvaliya?"))

Vishal barvaliya


In [194]:
from pyspark.sql.functions import col,udf
from pyspark.sql.types import StringType

Before cleaning movie titles lets check some titles with paranthases then we will compare same result after cleaning it

In [195]:
#movies.filter(col('title').contains(')')).show(truncate=False)

In [196]:
clean_titles = udf(clean_title,StringType())

In [197]:
movies = movies.withColumn("title", clean_titles('title'))

After cleaning movie titles we do not have any record left with paranthases which means data is cleaned perfectly

In [198]:
#movies.filter(col('title').contains(')')).show(truncate=False)

In [199]:
movies = movies.drop_duplicates(subset=['title'])

In [203]:
movies_df = movies.toPandas()

In [204]:
movies_df  = movies_df[list(movies_df.columns[~movies_df.columns.duplicated()])]

Using Vectorizer on title to create search engine first

In [205]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf = vectorizer.fit_transform(movies_df['title'])

Creating Search Function

In [206]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [255]:
def search(title):
  title = clean_title(title)
  query_vec = vectorizer.transform([title])
  similarity = cosine_similarity(query_vec,tfidf)
  indices = np.argpartition(similarity, -5)[-5:]
  res = movies_df.iloc[indices[0]][::-1][:5]
  return res

In [256]:
import ipywidgets as w
from IPython.display import display


In [257]:
movie_input = w.Text(
    description = "Movie Title:",
    disabled = False
)

movie_list = w.Output()
def on_type(data):
  with movie_list:
    movie_list.clear_output()
    title = data["new"]
    if len(title) > 5:
      display(search(title))

movie_input.observe(on_type,names='value')
display(movie_input,movie_list)

Text(value='', description='Movie Title:')

Output()

In [254]:
ratings = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/ratings.csv")

#Creating Recommendation System

In [303]:
def find_similar_movies(movieId):
  similar_users = ratings[(ratings['movieId'] == movieid) & (ratings['rating'] >= 4) ]['userId'].unique()
  similar_user_recs = ratings[(ratings['userId'].isin(similar_users) & (ratings['rating'] > 4))]['movieId']

  similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
  similar_user_recs = similar_user_recs[similar_user_recs > .10]

  all_users = ratings[(ratings['movieId'].isin(similar_user_recs.index)) & (ratings['rating'] > 4)]
  all_users_recs = all_users['movieId'].value_counts() / len(all_users['userId'].unique())

  rec_percentages = pd.concat([similar_user_recs, all_users_recs],axis=1)
  rec_percentages.columns = ['similar','all']
  rec_percentages['score'] = rec_percentages['similar'] / rec_percentages['all']
  rec_percentages = rec_percentages.sort_values('score',ascending=False)
  return rec_percentages.head(10).merge(movies_df, left_index=True, right_on="movieId")[['score','title','genres']]

In [304]:
find_similar_movies(1)

Unnamed: 0,score,title,genres
11933,158.486182,Hulk 2003,Action|Adventure|Sci-Fi
12576,42.157324,Incredible Hulk The 2008,Action|Sci-Fi
54098,26.932292,SpiderMan 3 2007,Action|Adventure|Sci-Fi|Thriller|IMAX
41781,24.299781,Hancock 2008,Action|Adventure|Comedy|Crime|Fantasy
11211,22.377852,Hellboy 2004,Action|Adventure|Fantasy|Horror
24238,20.650733,Terminator 3 Rise of the Machines 2003,Action|Adventure|Sci-Fi
61915,20.111741,XMen Origins Wolverine 2009,Action|Sci-Fi|Thriller
44727,18.181971,King Kong 2005,Action|Adventure|Drama|Fantasy|Thriller
36298,17.86854,Chronicles of Riddick The 2004,Action|Sci-Fi|Thriller
28620,17.532425,Transporter The 2002,Action|Crime


In [306]:
movie_input_name = w.Text(
    Description = "Movie title",
    disabled = False
)
recommendation_list = w.Output()
def on_type(data):
  recommendation_list.clear_output()
  title = data['new']
  if len(title) > 5:
    res = search(title) 
    movie_id = res.iloc[0]["movieId"]
    display( find_similar_movies(movie_id))

movie_input_name.observe(on_type,names='value') 
display(movie_input_name, recommendation_list)


Text(value='')

Output()

Unnamed: 0,score,title,genres
11933,158.486182,Hulk 2003,Action|Adventure|Sci-Fi
12576,42.157324,Incredible Hulk The 2008,Action|Sci-Fi
54098,26.932292,SpiderMan 3 2007,Action|Adventure|Sci-Fi|Thriller|IMAX
41781,24.299781,Hancock 2008,Action|Adventure|Comedy|Crime|Fantasy
11211,22.377852,Hellboy 2004,Action|Adventure|Fantasy|Horror
24238,20.650733,Terminator 3 Rise of the Machines 2003,Action|Adventure|Sci-Fi
61915,20.111741,XMen Origins Wolverine 2009,Action|Sci-Fi|Thriller
44727,18.181971,King Kong 2005,Action|Adventure|Drama|Fantasy|Thriller
36298,17.86854,Chronicles of Riddick The 2004,Action|Sci-Fi|Thriller
28620,17.532425,Transporter The 2002,Action|Crime


Unnamed: 0,score,title,genres
11933,158.486182,Hulk 2003,Action|Adventure|Sci-Fi
12576,42.157324,Incredible Hulk The 2008,Action|Sci-Fi
54098,26.932292,SpiderMan 3 2007,Action|Adventure|Sci-Fi|Thriller|IMAX
41781,24.299781,Hancock 2008,Action|Adventure|Comedy|Crime|Fantasy
11211,22.377852,Hellboy 2004,Action|Adventure|Fantasy|Horror
24238,20.650733,Terminator 3 Rise of the Machines 2003,Action|Adventure|Sci-Fi
61915,20.111741,XMen Origins Wolverine 2009,Action|Sci-Fi|Thriller
44727,18.181971,King Kong 2005,Action|Adventure|Drama|Fantasy|Thriller
36298,17.86854,Chronicles of Riddick The 2004,Action|Sci-Fi|Thriller
28620,17.532425,Transporter The 2002,Action|Crime


Unnamed: 0,score,title,genres
11933,158.486182,Hulk 2003,Action|Adventure|Sci-Fi
12576,42.157324,Incredible Hulk The 2008,Action|Sci-Fi
54098,26.932292,SpiderMan 3 2007,Action|Adventure|Sci-Fi|Thriller|IMAX
41781,24.299781,Hancock 2008,Action|Adventure|Comedy|Crime|Fantasy
11211,22.377852,Hellboy 2004,Action|Adventure|Fantasy|Horror
24238,20.650733,Terminator 3 Rise of the Machines 2003,Action|Adventure|Sci-Fi
61915,20.111741,XMen Origins Wolverine 2009,Action|Sci-Fi|Thriller
44727,18.181971,King Kong 2005,Action|Adventure|Drama|Fantasy|Thriller
36298,17.86854,Chronicles of Riddick The 2004,Action|Sci-Fi|Thriller
28620,17.532425,Transporter The 2002,Action|Crime


Unnamed: 0,similar,all,score,movieId,title,genres
11933,0.302968,0.001912,158.486182,6534,Hulk 2003,Action|Adventure|Sci-Fi
12576,0.136131,0.003229,42.157324,60040,Incredible Hulk The 2008,Action|Sci-Fi
54098,0.106448,0.003952,26.932292,52722,SpiderMan 3 2007,Action|Adventure|Sci-Fi|Thriller|IMAX
41781,0.11566,0.00476,24.299781,60074,Hancock 2008,Action|Adventure|Comedy|Crime|Fantasy
11211,0.16172,0.007227,22.377852,7373,Hellboy 2004,Action|Adventure|Fantasy|Horror
24238,0.128966,0.006245,20.650733,6537,Terminator 3 Rise of the Machines 2003,Action|Adventure|Sci-Fi
61915,0.145343,0.007227,20.111741,68319,XMen Origins Wolverine 2009,Action|Sci-Fi|Thriller
44727,0.122825,0.006755,18.181971,41569,King Kong 2005,Action|Adventure|Drama|Fantasy|Thriller
36298,0.117707,0.006587,17.86854,8371,Chronicles of Riddick The 2004,Action|Sci-Fi|Thriller
28620,0.103378,0.005896,17.532425,5574,Transporter The 2002,Action|Crime
