<a href="https://colab.research.google.com/github/bbqgonewrong/Movie-Recommendation-system/blob/main/Movie_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Importing libraries

In [60]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import ipywidgets as widgets
from IPython.display import display

# Function to clean the movie titles

In [3]:
def clean_data(title):
  """Return `title` with every character other than a-z, A-Z and 0-9 replaced by a space."""
  non_alphanumeric = re.compile('[^a-zA-Z0-9]')
  return non_alphanumeric.sub(' ', title)

In [4]:
#Load the movie catalogue from movies.csv (read relative to the working directory)
movies = pd.read_csv('movies.csv')

In [5]:
#Preview the first rows to check the movieId, title and genres columns
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
#Add a clean_title column: each title with every non-alphanumeric character turned into a space
movies['clean_title'] = movies['title'].apply(clean_data)

#Building TF-IDF matrix

In [7]:
#ngram_range=(1,2) makes the vocabulary cover single words and pairs of adjacent words
vectorizer = TfidfVectorizer(ngram_range=(1,2))
#Fit the vocabulary on the cleaned titles and build one TF-IDF row per movie
tfidf = vectorizer.fit_transform(movies['clean_title'])

# Search function: vectorize the search term and return the most similar titles

In [8]:
def search(title, top_n=5):
  """Return the catalogue rows whose titles best match `title`, best match first.

  The query is cleaned with `clean_data`, transformed with the fitted
  `vectorizer`, and compared against every row of `tfidf` by cosine similarity.

  Parameters
  ----------
  title : str
      Free-text movie title to look up.
  top_n : int, default 5
      Maximum number of rows to return.

  Returns
  -------
  pandas.DataFrame
      Up to `top_n` rows of `movies`, ordered by descending similarity, so
      `.iloc[0]` is the closest match.
  """
  title = clean_data(title)
  query_vec = vectorizer.transform([title])
  similarity = cosine_similarity(query_vec,tfidf).flatten() #one score per movie

  #Never ask for more rows than exist; argpartition raises on an out-of-range kth
  top_n = min(top_n, len(similarity))
  if top_n <= 0:
    return movies.iloc[[]]

  #argpartition only guarantees the top_n best scores land in the last top_n
  #slots; it does not order them, so they are ranked explicitly below
  candidates = np.argpartition(similarity,-top_n)[-top_n:]
  indices = candidates[np.argsort(similarity[candidates])[::-1]]
  return movies.iloc[indices]

# Creating an interactive text widget for title search using ipywidgets

In [61]:
#Text box the user types a movie title into
movie_input = widgets.Text(
    value = 'Toy Story',
    description = 'Movie Title: ',
    disabled = False,
)
#Output area that holds the matching movies
movie_list = widgets.Output()

def on_type(data):
  """Refresh the output area with the search results for the widget's new value."""
  with movie_list:
    movie_list.clear_output()
    title = data['new']
    #Inputs of five characters or fewer are not searched
    if len(title) <= 5:
      return
    display(search(title))

#Call on_type whenever the text box's 'value' changes
movie_input.observe(on_type,names = 'value')

display(movie_input,movie_list)

Text(value='Toy Story', description='Movie Title: ')

Output()

# Extending the recommendation engine with the movie ratings from ratings.csv

In [64]:
#Load the ratings table from ratings.csv and preview its first rows
ratings = pd.read_csv('ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [14]:
#Check the dtype pandas inferred for each ratings column
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

#Checking users who like the same movies as the end-user

In [16]:
#Example movie for the walkthrough; movieId 1 is Toy Story (1995) in the movies preview
movie_id = 1

In [23]:
#Users who rated the chosen movie 5 are treated as having liked it
#NOTE: find_similar_movies further down uses rating > 4 for this same step
is_target_movie = ratings['movieId']==movie_id
is_top_rating = ratings['rating']>=5
similar_users = ratings.loc[is_target_movie & is_top_rating,'userId'].unique()


In [25]:
#Show the user ids collected above
similar_users

array([    36,     75,     86, ..., 162518, 162519, 162530])

#Viewing the movies which the similar users liked

In [30]:
#Every rating above 4 given by one of the similar users; keep only the movieId
by_similar_user = ratings['userId'].isin(similar_users)
is_liked = ratings['rating']>4
similar_users_rec = ratings.loc[by_similar_user & is_liked,'movieId']

In [None]:
#Inspect the movieIds liked by the similar users (one entry per qualifying rating)
similar_users_rec

In [36]:
#Fraction of the similar users who liked each movie
liked_share = similar_users_rec.value_counts() / len(similar_users)
#Keep only the movies liked by more than 10% of them
similar_users_rec = liked_share[liked_share > .1]

In [37]:
#Share of similar users who liked each movie, limited to movies above the 10% cut-off
similar_users_rec

1       1.000000
318     0.414556
260     0.404561
356     0.347253
296     0.342663
          ...   
1259    0.102991
7361    0.101881
1206    0.101362
1307    0.101066
1208    0.100918
Name: movieId, Length: 92, dtype: float64

# How much do all users like those titles? For the movies liked by more than 10% of the similar users, find how much all users in the dataset liked them.

In [41]:
#Ratings above 4, from any user, for the movies shortlisted above
is_shortlisted = ratings['movieId'].isin(similar_users_rec.index)
is_liked = ratings['rating']>4
all_users = ratings[is_shortlisted & is_liked]

In [43]:
#Preview the first rows of the filtered ratings
all_users.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172


# What fraction of all users liked each of those movies

In [44]:
#Denominator: distinct users who liked at least one shortlisted movie
n_all_users = len(all_users['userId'].unique())
#Fraction of those users who liked each shortlisted movie
all_users_rec = all_users['movieId'].value_counts() / n_all_users

In [45]:
#Share of the users in all_users who liked each shortlisted movie
all_users_rec

318      0.345282
296      0.287220
2571     0.246217
356      0.237370
593      0.227930
           ...   
1387     0.047886
1307     0.046195
745      0.037362
78499    0.035445
2355     0.025316
Name: movieId, Length: 92, dtype: float64

#Creating a recommendation score

In [46]:
#Put both shares side by side, aligned on movieId, one named column each
rec_percentages = pd.concat(
    [similar_users_rec,all_users_rec],
    axis = 1,
    keys = ['similar','all'],
)

In [50]:
#NOTE: the saved output already shows a 'score' column, but that column is only
#added by the later cell (In [49]); on a fresh top-to-bottom run this displays
#just 'similar' and 'all'
rec_percentages

Unnamed: 0,similar,all,score
1,1.000000,0.125844,7.946323
3114,0.295498,0.054186,5.453383
2355,0.124685,0.025316,4.925186
78499,0.138161,0.035445,3.897906
588,0.233674,0.068117,3.430480
...,...,...,...
58559,0.160743,0.147779,1.087725
79132,0.129424,0.132559,0.976349
7361,0.101881,0.105172,0.968704
2959,0.205020,0.218656,0.937638


In [49]:
#Score = share among similar users / share among all users; a higher score
#means a better recommendation, so the best-scoring movies are sorted to the top
rec_percentages = (
    rec_percentages
    .assign(score = lambda frame: frame['similar'] / frame['all'])
    .sort_values('score',ascending = False)
)

In [52]:
#Attach title and genres to the first ten rows (the highest scores after the
#sort above) by matching the movieId index against movies['movieId']
rec_percentages.head(10).merge(movies,left_index = True,right_on='movieId')

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.125844,7.946323,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.295498,0.054186,5.453383,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.124685,0.025316,4.925186,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bug s Life A 1998
14813,0.138161,0.035445,3.897906,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
580,0.233674,0.068117,3.43048,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
587,0.198949,0.060514,3.287671,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
33,0.158226,0.052696,3.002602,34,Babe (1995),Children|Drama,Babe 1995
4780,0.210647,0.071444,2.94841,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
1047,0.143418,0.049202,2.914882,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
729,0.108322,0.037362,2.899227,745,Wallace & Gromit: A Close Shave (1995),Animation|Children|Comedy,Wallace Gromit A Close Shave 1995


#Building a recommendation function in order to do all this at once

In [59]:
def find_similar_movies(movie_id, liked_above=4, min_share=.10, top_n=10):
  """Recommend movies favoured by the users who liked `movie_id`.

  A rating counts as a "like" when it is strictly greater than `liked_above`.
  Each candidate movie is scored as
  (share of similar users who liked it) / (share of all users who liked it),
  so a high score marks a movie liked by this audience far more than by users
  in general.

  Parameters
  ----------
  movie_id : int
      movieId of the movie to base the recommendations on.
  liked_above : float, default 4
      Ratings strictly above this value count as a like.
  min_share : float, default 0.10
      A movie is kept only if more than this fraction of the similar users
      liked it.
  top_n : int, default 10
      Number of highest-scoring movies to return.

  Returns
  -------
  pandas.DataFrame
      Columns 'score', 'title' and 'genres', best score first. It is empty
      when nobody liked `movie_id`. The queried movie itself is not removed
      from the result.
  """
  #Filter to liked ratings once; every later step only looks at these rows
  liked = ratings[ratings['rating'] > liked_above]

  #Users who liked the queried movie
  similar_users = liked.loc[liked['movieId']==movie_id,'userId'].unique()
  if len(similar_users) == 0:
    #Nothing to score, and the share below would divide by zero
    return pd.DataFrame(columns=['score','title','genres'])

  #Share of those users who liked each movie, keeping movies above min_share
  similar_users_rec = liked.loc[liked['userId'].isin(similar_users),'movieId']
  similar_users_rec = similar_users_rec.value_counts() / len(similar_users)
  similar_users_rec = similar_users_rec[similar_users_rec > min_share]

  #Share of all users (those who liked at least one shortlisted movie) who
  #liked each shortlisted movie
  all_users = liked[liked['movieId'].isin(similar_users_rec.index)]
  all_users_rec = all_users['movieId'].value_counts()/len(all_users['userId'].unique())

  #Creating the recommendation score
  rec_percentages = pd.concat([similar_users_rec,all_users_rec],axis = 1)
  rec_percentages.columns = ['similar','all']
  #Higher the score, better the recommendations
  rec_percentages['score'] = rec_percentages['similar']/ rec_percentages['all']
  rec_percentages = rec_percentages.sort_values('score',ascending = False)
  return rec_percentages.head(top_n).merge(movies,left_index = True,right_on='movieId')[['score','title','genres']]

#Creating a widget in order to return the recommendations

In [65]:
#Text box for the movie title the recommendations are based on
movie_name_input = widgets.Text(value = 'Toy Story',
                                description= 'Movie Title:',
                                disabled = False)

#Output area that holds the recommendation table
recommendation_list = widgets.Output()

def on_type_recommend(data):
  """Refresh the recommendations for the title matching the widget's new value."""
  with recommendation_list:
    recommendation_list.clear_output()
    title = data['new']
    #Inputs of five characters or fewer are not searched
    if len(title) <= 5:
      return
    #The first row returned by search is used as the movie to recommend from
    first_match = search(title).iloc[0]
    display(find_similar_movies(first_match['movieId']))

#Call on_type_recommend whenever the text box's 'value' changes
movie_name_input.observe(on_type_recommend,names = 'value')

display(movie_name_input,recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()

#Next Steps:


*   Genres
*   Use genres together with users
*   Use metadata
*   Export it to create a project using Flask and React

