# **Data Description**


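The data consists of 105339 ratings applied over 10329 movies. The average rating is 3.5, and the minimum and maximum ratings are 0.5 and 5, respectively. There are 668 users who have given their ratings for 149532 movies. (Note: the copies of the files loaded in this notebook contain 100836 ratings for 9742 movies, so the figures above may describe a different release of the dataset.)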

- There are two data files which are provided:

 Movies.csv

  - movieId: ID assigned to a movie
  - title: Title of a movie
  - genres: pipe separated list of movie genres.


 Ratings.csv

  - userId: ID assigned to a user
  - movieId: ID assigned to a movie
  - rating: rating by a user to a movie
  - timestamp: time at which the rating was provided.

In [None]:
#Import Libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer #To convert categorical data into vector to check their similarity with each other
from sklearn.metrics.pairwise import cosine_similarity  #Compare the genres with one another
from ipywidgets import * #For interactive widgets


In [None]:
#import the csv files
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')


In [None]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
#Check for null values
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [None]:
movies.shape

(9742, 3)

In [None]:
ratings.info()
ratings.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


(100836, 4)

In [None]:
#Null values not present

In [None]:
ratings.describe()  #Ratings range from 0.5 to 5 - avg rating is about 3.5

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [None]:
#Different Genres available
# Split the pipe-separated genre string into a list of genres.
# Only string values are split, so re-running this cell leaves already-split
# lists untouched (calling .str.split on lists would replace them with NaN).
movies['genres'] = movies['genres'].apply(
    lambda value: value.split('|') if isinstance(value, str) else value
)

In [None]:
movies.explode('genres')

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children
0,1,Toy Story (1995),Comedy
0,1,Toy Story (1995),Fantasy
...,...,...,...
9738,193583,No Game No Life: Zero (2017),Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Animation


In [None]:
movies2 = movies.explode('genres')
movies2.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children
0,1,Toy Story (1995),Comedy
0,1,Toy Story (1995),Fantasy


In [None]:
#Unique genres available
movies2['genres'].unique()

array(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Mystery', 'Sci-Fi', 'War', 'Musical', 'Documentary', 'IMAX',
       'Western', 'Film-Noir', '(no genres listed)'], dtype=object)

In [None]:
movies2['genres'].nunique()

20

In [None]:
#Removing (no genres listed)
movies2 = movies2[movies2['genres']!='(no genres listed)']

In [None]:
movies2['genres'].unique()

array(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Mystery', 'Sci-Fi', 'War', 'Musical', 'Documentary', 'IMAX',
       'Western', 'Film-Noir'], dtype=object)

In [None]:
movies2['genres'].nunique()

19

In [None]:
#Group dataframe based on genres and get the average rating for each genre
#merge movies2 and ratings df
merged_data = pd.merge(ratings,movies2,on='movieId',how='inner')

In [None]:

# Per-genre summary: number of distinct titles, mean rating and number of ratings.
# Named aggregation gives every statistic its own flat, self-describing column
# (a dict-of-lists aggregation yields two-level columns that are awkward to label).
merged_data.groupby('genres').agg(
    unique_movie_count=('title', 'nunique'),
    mean_rating=('rating', 'mean'),
    rating_count=('rating', 'size'),
)

Unnamed: 0_level_0,unique_movie_count,mean_ratings,mean_ratings
Unnamed: 0_level_1,nunique,mean,size
genres,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Action,1827,3.447984,30635
Adventure,1262,3.508609,24161
Animation,610,3.629937,6988
Children,664,3.412956,9208
Comedy,3752,3.384721,39053
Crime,1195,3.658294,16681
Documentary,438,3.797785,1219
Drama,4347,3.656184,41928
Fantasy,778,3.491001,11834
Film-Noir,85,3.920115,870


In [None]:
#Popularity Recommender System
# Mean rating and number of ratings for every (genre, title) pair
popularity = (
    merged_data
    .groupby(['genres', 'title'])['rating']
    .agg(['mean', 'size'])
    .reset_index()
)
popularity.columns = ['Genres', 'Title', 'Average_ratings', 'Number_of_Ratings']
popularity

Unnamed: 0,Genres,Title,Average_ratings,Number_of_Ratings
0,Action,'71 (2014),4.000000,1
1,Action,'Hellboy': The Seeds of Creation (2004),4.000000,1
2,Action,"10th Victim, The (La decima vittima) (1965)",4.000000,1
3,Action,12 Rounds (2009),3.000000,3
4,Action,13 Assassins (Jûsan-nin no shikaku) (2010),4.000000,4
...,...,...,...,...
21997,Western,Winds of the Wasteland (1936),3.500000,1
21998,Western,Wyatt Earp (1994),3.095238,21
21999,Western,Young Guns (1988),3.100000,25
22000,Western,Young Guns II (1990),3.000000,11


In [None]:
#genre = action
#threshold = 50 (the filter below is strict: it keeps movies rated more than 50 times)
#top = 7
popularity[(popularity['Genres']=='Action')&(popularity['Number_of_Ratings']>50)].sort_values('Average_ratings',ascending=False).head(7)

Unnamed: 0,Genres,Title,Average_ratings,Number_of_Ratings
551,Action,Fight Club (1999),4.272936,218
374,Action,"Dark Knight, The (2008)",4.238255,149
1255,Action,"Princess Bride, The (1987)",4.232394,142
1505,Action,Star Wars: Episode IV - A New Hope (1977),4.231076,251
92,Action,Apocalypse Now (1979),4.219626,107
1506,Action,Star Wars: Episode V - The Empire Strikes Back...,4.21564,211
1279,Action,Raiders of the Lost Ark (Indiana Jones and the...,4.2075,200


In [None]:
#Popularity recommender system
def TopNPopularMovies(genre,threshold,topN):
    """Return the best-rated movies of one genre that have enough ratings.

    Reads the module-level ``merged_data`` frame (columns ``genres``,
    ``title`` and ``rating``), one row per (rating, genre) pair.

    Parameters
    ----------
    genre : str
        Genre to recommend from; an unknown genre yields an empty table.
    threshold : int
        Minimum number of ratings a movie needs to be considered. The bound
        is inclusive: a movie with exactly ``threshold`` ratings qualifies.
    topN : int
        Maximum number of movies to return. Negative values are treated as 0.

    Returns
    -------
    pandas.DataFrame
        Columns ``Sno.``, ``Movie Title``, ``Average Movie rating`` and
        ``Number of Reviews``, ordered by average rating (highest first) and
        indexed 0..n-1.
    """
    # Restrict to the requested genre first so only its ratings are aggregated.
    genre_ratings = merged_data.loc[merged_data['genres'] == genre, ['title', 'rating']]

    stats = genre_ratings.groupby('title')['rating'].agg(['mean', 'size']).reset_index()
    stats.columns = ['Movie Title', 'Average Movie rating', 'Number of Reviews']

    top = (
        stats[stats['Number of Reviews'] >= threshold]
        # stable sort keeps tied movies in a deterministic (title) order
        .sort_values('Average Movie rating', ascending=False, kind='mergesort')
        # head() with a negative count would return "all but the last n" rows
        .head(max(topN, 0))
        .reset_index(drop=True)
    )
    # reset_index returned a fresh frame, so adding the serial number is safe
    top.insert(0, 'Sno.', list(range(1, len(top) + 1)))
    return top




In [None]:
#test case 1
genre = 'Adventure'
threshold = 50
topN = 18
TopNPopularMovies(genre=genre,threshold=threshold,topN=topN)

Unnamed: 0,Sno.,Movie Title,Average Movie rating,Number of Reviews
0,1,"Princess Bride, The (1987)",4.232394,142
1,2,Star Wars: Episode IV - A New Hope (1977),4.231076,251
2,3,Star Wars: Episode V - The Empire Strikes Back...,4.21564,211
3,4,Raiders of the Lost Ark (Indiana Jones and the...,4.2075,200
4,5,North by Northwest (1959),4.184211,57
5,6,Monty Python and the Holy Grail (1975),4.161765,136
6,7,Spirited Away (Sen to Chihiro no kamikakushi) ...,4.155172,87
7,8,City of God (Cidade de Deus) (2002),4.146667,75
8,9,"Good, the Bad and the Ugly, The (Buono, il bru...",4.145833,72
9,10,Star Wars: Episode VI - Return of the Jedi (1983),4.137755,196


In [None]:
#Content based Recommender System
#tfidf vectorizer
movies2.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children
0,1,Toy Story (1995),Comedy
0,1,Toy Story (1995),Fantasy


In [None]:
# One row per title, with that title's genres joined into a single space-separated string
movies3 = (
    movies2
    .groupby('title')['genres']
    .agg(' '.join)
    .reset_index()
)
movies3.head()

Unnamed: 0,title,genres
0,'71 (2014),Action Drama Thriller War
1,'Hellboy': The Seeds of Creation (2004),Action Adventure Comedy Documentary Fantasy
2,'Round Midnight (1986),Drama Musical
3,'Salem's Lot (2004),Drama Horror Mystery Thriller
4,'Til There Was You (1997),Drama Romance


In [None]:
# TfidfVectorizer instance used to turn each movie's genre string into a numeric vector:
# word-level analysis, 1- to 3-word n-grams, English stop words removed.
tf = TfidfVectorizer(analyzer = 'word',ngram_range=(1, 3), stop_words='english')
tf

In [None]:
tf_matrix=tf.fit_transform(movies3['genres'])

In [None]:
cosine_sim = cosine_similarity(tf_matrix,tf_matrix)
cosine_sim

array([[1.        , 0.02470055, 0.03161934, ..., 0.09683986, 0.        ,
        0.        ],
       [0.02470055, 1.        , 0.        , ..., 0.03412688, 0.02122325,
        0.02560568],
       [0.03161934, 0.        , 1.        , ..., 0.        , 0.        ,
        0.36165945],
       ...,
       [0.09683986, 0.03412688, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.02122325, 0.        , ..., 0.        , 1.        ,
        0.05931276],
       [0.        , 0.02560568, 0.36165945, ..., 0.        , 0.05931276,
        1.        ]])

In [None]:
#final function
def recommendation_genre(movie_df,similarity_matrix,movie_title,topN):
    """Return the ``topN`` movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_df : pandas.DataFrame
        Frame with a ``title`` column. Row *i* (by position) must correspond
        to row/column *i* of ``similarity_matrix``.
    similarity_matrix : array-like of shape (n_movies, n_movies)
        Pairwise similarity scores between the rows of ``movie_df``.
    movie_title : str
        Exact title of the movie to find neighbours for.
    topN : int
        Maximum number of recommendations. Negative values are treated as 0.

    Returns
    -------
    pandas.DataFrame
        Columns ``Sno.`` and ``Movie Title``, most similar first, indexed
        0..n-1. The target movie itself is never included.

    Raises
    ------
    KeyError
        If ``movie_title`` does not occur in ``movie_df['title']``.
    """
    # Positions (not labels) of the rows holding the target title.
    is_target = (movie_df['title'] == movie_title).to_numpy()
    target_rows = np.flatnonzero(is_target)
    if target_rows.size == 0:
        raise KeyError(f"Movie title not found: {movie_title!r}")

    scores = np.asarray(similarity_matrix[target_rows[0]], dtype=float).ravel()
    # Stable descending sort: equally similar movies keep their row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the target explicitly instead of assuming it is ranked first;
    # other movies with identical genres tie with it at similarity 1.0.
    ranked = ranked[~np.isin(ranked, target_rows)][:max(topN, 0)]

    return pd.DataFrame({
        'Sno.': np.arange(1, len(ranked) + 1),
        'Movie Title': movie_df['title'].iloc[ranked].to_numpy(),
    })


In [None]:
#test case
recommendation_genre(movie_df=movies3,similarity_matrix=cosine_sim,movie_title = 'Shrek the Third (2007)',topN = 10)

Unnamed: 0,Sno.,Movie Title
0,1,Antz (1998)
1,2,Asterix and the Vikings (Astérix et les Viking...
2,3,"Emperor's New Groove, The (2000)"
3,4,Moana (2016)
4,5,"Monsters, Inc. (2001)"
5,6,"Tale of Despereaux, The (2008)"
6,7,The Good Dinosaur (2015)
7,8,Toy Story (1995)
8,9,Toy Story 2 (1999)
9,10,Turbo (2013)


In [None]:
set(movies2['genres'])

{'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [None]:
#interactive widgets

In [None]:

#popularity
#inputs
# Sorted list (rather than a set) so the dropdown order is the same on every run
genres = Dropdown(options=sorted(movies2['genres'].unique()),description="Genres",style={'description_width':'initial'})
num_reviews = IntText(description = "Minimum Reviews",style = {"description_width":'initial'})
num_recommendations_1 = IntText(description = "Number of Recommendations",style = {"description_width":'initial'})

#tabs
b1 = Button(description = "Recommend",style = {"description_width":'initial'})
h1 = HBox([num_reviews,num_recommendations_1])
popularity_tab = VBox([genres,h1,b1])

#content based system
title = Textarea(description = "Movie Title",style = {"description_width":'initial'})
num_recommendations_2 = IntText(description = "Number of Recommendations",style = {"description_width":'initial'})

#tabs
h2 = HBox([title,num_recommendations_2])
b2 = Button(description = "Recommend",style = {"description_width":'initial'})
content_tab = VBox([h2,b2])

#creating final tabs
tabs = [popularity_tab,content_tab]
# Tab is already in scope through the ipywidgets star import
wid = Tab(children=tabs)

#set the titles to the tab
names = ['Popularity Based Recommender','Content Based Recommender']
# Plain loop for the side effect; loop names chosen so they cannot be confused
# with the `title` Textarea defined above
for tab_index, tab_name in enumerate(names):
    wid.set_title(tab_index, tab_name)

display(wid)

Tab(children=(VBox(children=(Dropdown(description='Genres', options=('Mystery', 'Musical', 'Comedy', 'IMAX', '…

In [None]:
#setting up events to respond when clicked upon

# Most recent recommendation table. Starts as None so that inspecting `output`
# before any button has been clicked does not raise NameError.
output = None

def b1_clicked(b):
  """Store the popularity-based recommendations for the current widget values in `output`."""
  global output
  output = TopNPopularMovies(genre=genres.value,threshold=num_reviews.value,topN=num_recommendations_1.value)
b1.on_click(b1_clicked)

#content
def b2_clicked(b):
  """Store the content-based recommendations for the entered title in `output`."""
  global output
  # Titles are matched exactly, so drop stray spaces/newlines typed into the Textarea
  movie_title = title.value.strip()
  output = recommendation_genre(movie_df=movies3,similarity_matrix=cosine_sim,movie_title = movie_title,topN = num_recommendations_2.value)
b2.on_click(b2_clicked)

In [None]:
display(wid)

Tab(children=(VBox(children=(Dropdown(description='Genres', options=('Mystery', 'Musical', 'Comedy', 'IMAX', '…

In [None]:
output

Unnamed: 0,Sno.,Movie Title,Average Movie rating,Number of Reviews
0,1,Star Wars: Episode IV - A New Hope (1977),4.231076,251
1,2,Star Wars: Episode V - The Empire Strikes Back...,4.21564,211
2,3,"Matrix, The (1999)",4.192446,278
3,4,Eternal Sunshine of the Spotless Mind (2004),4.160305,131
4,5,Star Wars: Episode VI - Return of the Jedi (1983),4.137755,196
5,6,Blade Runner (1982),4.100806,124
6,7,Inception (2010),4.066434,143
7,8,WALL·E (2008),4.057692,104
8,9,Back to the Future (1985),4.038012,171
9,10,"Prestige, The (2006)",4.005556,90
