In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from ipywidgets import *

In [8]:
movies=pd.read_csv('movies.csv')
rating=pd.read_csv('ratings.csv')


In [9]:
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [10]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [11]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10329 entries, 0 to 10328
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  10329 non-null  int64 
 1   title    10329 non-null  object
 2   genres   10329 non-null  object
dtypes: int64(1), object(2)
memory usage: 242.2+ KB


In [12]:
movies.shape

(10329, 3)

In [13]:
rating.shape

(105339, 4)

In [14]:
rating.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,105339.0,105339.0,105339.0,105339.0
mean,364.924539,13381.312477,3.51685,1130424000.0
std,197.486905,26170.456869,1.044872,180266000.0
min,1.0,1.0,0.5,828565000.0
25%,192.0,1073.0,3.0,971100800.0
50%,383.0,2497.0,3.5,1115154000.0
75%,557.0,5991.0,4.0,1275496000.0
max,668.0,149532.0,5.0,1452405000.0


In [15]:
movies['genres']=movies['genres'].str.split("|")

In [16]:
movies2=movies.explode('genres')
movies2.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children
0,1,Toy Story (1995),Comedy
0,1,Toy Story (1995),Fantasy


In [17]:
movies2=movies2[movies2['genres']!='(no genres listed)']

In [18]:
movies2['genres'].unique()

array(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Mystery', 'Sci-Fi', 'IMAX', 'War', 'Musical', 'Documentary',
       'Western', 'Film-Noir'], dtype=object)

In [19]:
#grp data based on genres to get avg rating of genres
merge_data=pd.merge(rating,movies2,on=['movieId'],how='inner')
merge_data

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,16,4.0,1217897793,Casino (1995),Crime
1,1,16,4.0,1217897793,Casino (1995),Drama
2,1,24,1.5,1217895807,Powder (1995),Drama
3,1,24,1.5,1217895807,Powder (1995),Sci-Fi
4,1,32,4.0,1217896246,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery
...,...,...,...,...,...,...
281892,668,143385,4.0,1446388585,Bridge of Spies (2015),Drama
281893,668,143385,4.0,1446388585,Bridge of Spies (2015),Thriller
281894,668,144976,2.5,1448656898,Bone Tomahawk (2015),Horror
281895,668,144976,2.5,1448656898,Bone Tomahawk (2015),Western


In [20]:
merge_data.groupby('genres').agg({"title":"nunique","rating":['mean','size']}).rename(columns={"title":"unique_movie_count","rating":"mean_rating"})

Unnamed: 0_level_0,unique_movie_count,mean_rating,mean_rating
Unnamed: 0_level_1,nunique,mean,size
genres,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Action,1736,3.45145,31205
Adventure,1164,3.518027,23076
Animation,400,3.63535,5966
Children,540,3.439429,8098
Comedy,3513,3.420996,38055
Crime,1440,3.642392,18291
Documentary,415,3.643035,1206
Drama,5217,3.650266,46960
Fantasy,670,3.500459,10889
Film-Noir,195,3.913636,1210


In [21]:
popularity = merge_data.groupby(['genres', 'title']).agg({"rating": ['mean', 'size']}).reset_index()
popularity.columns = ['genres', 'title', 'Average Ratings', 'Number of ratings']
popularity

Unnamed: 0,genres,title,Average Ratings,Number of ratings
0,Action,'71 (2014),3.500000,1
1,Action,'Hellboy': The Seeds of Creation (2004),3.000000,1
2,Action,10 to Midnight (1983),2.500000,1
3,Action,12 Rounds (2009),2.875000,4
4,Action,13 Assassins (JÃ»san-nin no shikaku) (2010),3.500000,5
...,...,...,...,...
23093,Western,Wyatt Earp (1994),3.200000,30
23094,Western,Young Guns (1988),3.375000,36
23095,Western,Young Guns II (1990),3.083333,12
23096,Western,Young Ones (2014),2.000000,1


In [22]:
# popularity recommender System
popularity[(popularity['genres']=='Action')&(popularity['Number of ratings']==50 )]

Unnamed: 0,genres,title,Average Ratings,Number of ratings
1148,Action,Pirates of the Caribbean: Dead Man's Chest (2006),3.46,50


In [23]:
#genre=action
#threshold=50
#top=7
popularity[(popularity['genres']=='Action')&(popularity['Number of ratings']>=50)].sort_values(by=['Average Ratings'], ascending=False).head(7)

Unnamed: 0,genres,title,Average Ratings,Number of ratings
1179,Action,Princess Mononoke (Mononoke-hime) (1997),4.384615,52
1076,Action,North by Northwest (1959),4.273973,73
975,Action,"Matrix, The (1999)",4.264368,261
1433,Action,Star Wars: Episode V - The Empire Strikes Back...,4.22807,228
1331,Action,Seven Samurai (Shichinin no samurai) (1954),4.217742,62
1199,Action,Raiders of the Lost Ark (Indiana Jones and the...,4.212054,224
747,Action,Inception (2010),4.18932,103


In [24]:
def TopNPopularMovies(genre,threshold,topN):
    popularity = merge_data.groupby(['genres', 'title']).agg({"rating": ['mean', 'size']}).reset_index()
    popularity.columns = ['genres', 'title', 'Average_Ratings', 'Number_of_ratings']

    #filter data
    topNrecommentions=popularity[(popularity['genres']==genre)&(popularity['Number_of_ratings']>=threshold)].sort_values(by=['Average_Ratings'], ascending=False).head(topN)

    #out
    topNrecommentions['Sno.']=range(0,len(topNrecommentions)) # Creating a serial number column
    #Rename columns to desired names
    topNrecommentions.columns=['Genres','Movie Title','Average_Movie_Rating','Number_of_Review', 'Sno']  #Include 'Sno.' to match existing columns
    return topNrecommentions[['Sno','Movie Title','Average_Movie_Rating','Number_of_Review']] # Selecting desired columns for output

In [25]:
genre='Action'
threshold=50
topN=8
TopNPopularMovies(genre=genre,threshold=threshold,topN=topN)

Unnamed: 0,Sno,Movie Title,Average_Movie_Rating,Number_of_Review
1179,0,Princess Mononoke (Mononoke-hime) (1997),4.384615,52
1076,1,North by Northwest (1959),4.273973,73
975,2,"Matrix, The (1999)",4.264368,261
1433,3,Star Wars: Episode V - The Empire Strikes Back...,4.22807,228
1331,4,Seven Samurai (Shichinin no samurai) (1954),4.217742,62
1199,5,Raiders of the Lost Ark (Indiana Jones and the...,4.212054,224
747,6,Inception (2010),4.18932,103
1432,7,Star Wars: Episode IV - A New Hope (1977),4.188645,273


In [26]:
#content reconmentaion sysytem

In [27]:
movies2.head()


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children
0,1,Toy Story (1995),Comedy
0,1,Toy Story (1995),Fantasy


In [28]:
movies3=movies2.groupby('title').agg({'genres':lambda x: " ".join(x)}).reset_index()
movies3.head()

Unnamed: 0,title,genres
0,'71 (2014),Action Drama Thriller War
1,'Hellboy': The Seeds of Creation (2004),Action Adventure Comedy Documentary Fantasy
2,'Round Midnight (1986),Drama Musical
3,'Til There Was You (1997),Drama Romance
4,"'burbs, The (1989)",Comedy


In [29]:
tf=TfidfVectorizer(analyzer='word',ngram_range=(1,3),stop_words='english')
tf

In [30]:
tf_matrix=tf.fit_transform(movies3['genres'])

In [31]:
cosine_sim=cosine_similarity(tf_matrix,tf_matrix)
cosine_sim

array([[1.        , 0.02677945, 0.02931913, ..., 0.10229517, 0.        ,
        0.        ],
       [0.02677945, 1.        , 0.        , ..., 0.03626651, 0.02411583,
        0.02863994],
       [0.02931913, 0.        , 1.        , ..., 0.        , 0.        ,
        0.35526663],
       ...,
       [0.10229517, 0.03626651, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.02411583, 0.        , ..., 0.        , 1.        ,
        0.07090711],
       [0.        , 0.02863994, 0.35526663, ..., 0.        , 0.07090711,
        1.        ]])

In [32]:

def recommendation_genre(movie_df, similarity_matrix, movie_title, topN):
  #indices for all movies
  indices=pd.Series(movies3.index, index=movies3['title'])
  #index of target movie
  #Strip whitespace, including tabs, from the movie title
  movie_title = movie_title.strip()
  index=indices[movie_title]
  #cosine similarity scores
  cosine_scores=list(enumerate(similarity_matrix[index]))
  cosine_scores=sorted(cosine_scores, key=lambda x:x[1], reverse=True) [1:topN+2]
  #extract the matching mvoies
  matched=[i[0] for i in cosine_scores ]
  matching_df=movie_df.iloc[matched]
  #filter out the Target movie
  matching_df=matching_df[matching_df['title' ] != movie_title]
  #output
  matching_df.rename(columns={'title':'Movie Title'}, inplace=True)
  matching_df['Sno.']=range(1, len(matching_df)+1)
  matching_df.index=range(0,len(matching_df))
  return matching_df[['Sno.','Movie Title' ]]. head(topN)

In [33]:
#test
recommendation_genre(movie_df=movies3,similarity_matrix=cosine_sim,movie_title='Toy Story (1995)',topN=10)

Unnamed: 0,Sno.,Movie Title
0,1,Antz (1998)
1,2,Asterix and the Vikings (AstÃ©rix et les Viking...
2,3,"Boxtrolls, The (2014)"
3,4,DuckTales: The Movie - Treasure of the Lost La...
4,5,"Emperor's New Groove, The (2000)"
5,6,"Monsters, Inc. (2001)"
6,7,Shrek the Third (2007)
7,8,"Tale of Despereaux, The (2008)"
8,9,Toy Story 2 (1999)
9,10,Turbo (2013)


In [34]:

#Interactive widgets
#input
genre = Dropdown(
    options=list(set(movies2['genres'])),
    description="Genres",
    style={"description_width": "initial"}
)
num_reviews = IntText(
    description="Minimum Reviews",
    style={"description_width": "initial"}
)
num_rec_popularity = IntText(
    description="Number of Recommendations",
    style={"description_width": "initial"}
)
b1 = Button(description="Recommend me", style={"description_width": "initial"})
h1 = HBox([num_reviews, num_rec_popularity])
popularity_tab = VBox([genre, h1, b1])

# Content-Based Recommendations
title = Textarea(
    description="Movie Title",
    style={"description_width": "initial"}
)
num_rec_content = IntText(
    description="Number of Recommendations",
    style={"description_width": "initial"}
)
b2 = Button(description="Recommend me", style={"description_width": "initial"})
h2 = HBox([title, num_rec_content])
content_tab = VBox([h2, b2])

# Creating Tabs
tabs = [popularity_tab, content_tab]
wid = Tab(tabs)

# Set titles for the tabs
names = ['Popularity Based Recommendations', 'Content Based Recommendations']
[wid.set_title(i, name) for i, name in enumerate(names)]

# Display the Tabs
display(wid)


Tab(children=(VBox(children=(Dropdown(description='Genres', options=('Mystery', 'Crime', 'Western', 'Sci-Fi', â€¦

In [35]:
Dropdown(Option=list(set(movies2['genres'])),description="Genres",style={"description_width":"initial"})

Dropdown(description='Genres', options=(), style=DescriptionStyle(description_width='initial'), value=None)

In [36]:
dropdown = Dropdown(
    options=list(set(movies2['genres'])),
    description="Genres",
    style={"description_width": "initial"}
)

# Display Dropdown
display(dropdown)

Dropdown(description='Genres', options=('Mystery', 'Crime', 'Western', 'Sci-Fi', 'Children', 'Horror', 'Musicaâ€¦

In [37]:
def b1_clicked(b):
  global output
  output=TopNPopularMovies(genre=dropdown.value,threshold=num_reviews.value,topN=num_rec_popularity.value)
b1.on_click(b1_clicked)

def b2_clicked(b):
  global output
  result=recommendation_genre(movie_df=movies3,similarity_matrix=cosine_sim,movie_title=title.value,topN=num_rec_content.value)
  output=result
b2.on_click(b2_clicked)

In [38]:
display(wid)

Tab(children=(VBox(children=(Dropdown(description='Genres', options=('Mystery', 'Crime', 'Western', 'Sci-Fi', â€¦

In [43]:
output

Unnamed: 0,Sno.,Movie Title
0,1,"4th Man, The (Fourth Man, The) (Vierde man, De..."
1,2,8MM (1999)
2,3,Abandoned (2010)
3,4,All Good Things (2010)
