# **<u>Machine Learning Internship Project at Bharat Intern - Movies Recommendation System (Task 2)</u>**

# *In this project walkthrough, we create a movie recommendation system using Jupyter, Python, and Pandas. By the end, we'll be able to type the name of a movie into an input box and instantly get recommendations for other movies we might like. We'll start with the MovieLens 25M dataset, which contains movie reviews and ratings. Then we'll build a search engine to find a specific movie title in our data. Finally, we'll create a recommendation engine that recommends specific movies.*

# <i>Import required libraries</i>

In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ipywidgets as widgets
from IPython.display import display


## Loading the movies data

In [2]:
# Read the movie catalogue into a DataFrame. The path is absolute, so it has to be
# adjusted when the notebook runs somewhere the file is stored elsewhere.
movies = pd.read_csv("/content/movies.csv")
# Summarise columns, non-null counts and dtypes as a quick sanity check on the load.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [4]:
# Preview the first rows to see what each column looks like.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


# ***Preprocessing*** :-

# Checking null values

In [13]:
# Count missing values per column.
# NOTE(review): the recorded output already lists `clean_title`, so this cell was last
# executed after the title-cleaning cell further down; on a fresh top-to-bottom run
# that column will not exist yet at this point.
movies.isnull().sum()

movieId        0
title          0
genres         0
clean_title    0
dtype: int64

# Checking duplicate values

In [15]:
# Count rows that are exact duplicates (all columns equal) of an earlier row.
movies.duplicated().sum()

0

**Nice — there are no missing or duplicate values.**

## ***Cleaning title column***

In [5]:
def clean_title(title):
    """Return ``title`` stripped of everything except ASCII letters, digits and spaces.

    Punctuation such as the parentheses around the release year is removed,
    e.g. ``"Toy Story (1995)"`` becomes ``"Toy Story 1995"``. Letters outside
    a-z / A-Z (accented characters, for instance) are removed as well.
    """
    return re.compile("[^a-zA-Z0-9 ]").sub("", title)

In [6]:
# Store a punctuation-free copy of every title in a new column; the TF-IDF
# vectorizer is fitted on this column rather than on the raw titles.
movies["clean_title"] = movies["title"].apply(clean_title)

In [8]:
# Preview the frame again to compare `title` with the new `clean_title` column.
movies.head()

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


## *Creating a tfidf matrix*

In [9]:
# ngram_range=(1, 2) makes the vectorizer index single words as well as two-word sequences.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Fit on the cleaned titles and keep the resulting matrix; search() compares each
# query vector against it.
tfidf = vectorizer.fit_transform(movies["clean_title"])

# ***Creating a search function***

In [10]:

def search(title, top_n=5):
    """Return the rows of ``movies`` whose titles best match ``title``, best match first.

    The query is cleaned with ``clean_title``, vectorised with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title to look up.
    top_n : int, default 5
        Maximum number of matches to return. Fewer rows come back when
        ``movies`` holds fewer than ``top_n`` entries.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies`` ordered by descending similarity, so
        ``.iloc[0]`` is the closest match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more candidates than there are rows; argpartition raises otherwise.
    k = min(top_n, similarity.size)
    if k <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the k largest values land in the last k
    # slots -- their order inside that slice is unspecified. Sort the slice
    # explicitly so the ranking really is best-first.
    top = np.argpartition(similarity, -k)[-k:]
    ranked = top[np.argsort(similarity[top])[::-1]]

    return movies.iloc[ranked]

# ***Building an interactive search box with Jupyter***

In [11]:
# Text box for the query plus an output area that the handler below renders into.
movie_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)
movie_list = widgets.Output()


def on_type(data):
    """Refresh the search results whenever the text box value changes.

    ``data`` is the change notification passed in by ``observe``; its ``"new"``
    entry holds the current text. Earlier results are always cleared, and a
    new search runs only when the text is longer than five characters.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        display(search(query))


# Fire the handler on every change of the box's `value` trait.
movie_input.observe(on_type, names='value')

display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [21]:
# Example movie id used to walk through the recommendation logic one cell at a time.
movie_id = 89745

# Row of `movies` for the example id (not referenced again in the cells shown here).
movie = movies[movies["movieId"] == movie_id]

In [17]:
# Read the user ratings into a DataFrame. The path is absolute, so it has to be
# adjusted when the notebook runs somewhere the file is stored elsewhere.
ratings = pd.read_csv("/content/ratings.csv")
# Summarise columns, non-null counts and dtypes as a quick sanity check on the load.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1104160 entries, 0 to 1104159
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   userId     1104160 non-null  int64  
 1   movieId    1104160 non-null  int64  
 2   rating     1104160 non-null  float64
 3   timestamp  1104160 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 33.7 MB


In [19]:
# Preview the first rows: one row per (userId, movieId) rating.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [18]:
# Count rows that are exact duplicates (all columns equal) of an earlier row.
ratings.duplicated().sum()

0

## Finding similar users who liked the same movie

In [25]:
# Users who rated the example movie above 4; they stand in for "people with similar taste".
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()

# Movie ids of everything those users rated above 4. There is one entry per rating,
# so a movie liked by many of them appears many times.
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]



In [24]:
# Share of the similar users who liked each movie. The list of liked movie ids is
# rebuilt from `ratings` here instead of reusing the previous value of
# `similar_user_recs`, so this cell gives the same result however many times it is run.
similar_user_recs = (
    ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    .value_counts()
    / len(similar_users)
)

# Keep only movies liked by more than 10% of the similar users.
similar_user_recs = similar_user_recs[similar_user_recs > .10]

## Finding how much all users like these movies

In [26]:
# Ratings above 4, from any user, for the candidate movies kept in `similar_user_recs`.
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

# Share of those users who liked each candidate: the baseline the similar-user
# share is compared against.
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [28]:
# Put the two shares side by side as columns, aligned on the Series index.
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [29]:
# NOTE(review): the recorded output shows whole numbers in `similar` where fractions
# are expected, which suggests the cells above were executed out of order when it was
# captured; restart the kernel and run all cells before trusting it.
rec_percentages

Unnamed: 0,similar,all
3741,318,0.024164
3742,527,0.021066
3743,541,
3744,589,0.006196
3745,741,0.016109
...,...,...
1103576,195159,
1103578,195497,
1103579,197691,
1103580,197711,


#  Creating a recommendation score

In [30]:
# Ratio of the share among similar users to the share among all users; a value
# above 1 means the similar users like the movie more often than users overall.
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [31]:
# Highest score first.
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [32]:
# Attach title and genres to the ten highest-scoring rows by matching the index of
# `rec_percentages` against `movies["movieId"]`.
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
14972,179881,0.00062,290327934.0,79299,"No. 1 Ladies' Detective Agency, The (2008)",Comedy|Crime|Mystery,No 1 Ladies Detective Agency The 2008
12418,170875,0.00062,275792250.0,59988,"Boys and Girls Guide to Getting Down, The (2006)",Comedy,Boys and Girls Guide to Getting Down The 2006
9174,160378,0.00062,258850092.0,27334,Sound and Fury (2000),Documentary,Sound and Fury 2000
11290,160271,0.00062,258677394.0,50066,Sweet Land (2005),Drama|Romance,Sweet Land 2005
34962,157296,0.00062,253875744.0,146654,Nous trois ou rien (2015),Comedy|Drama,Nous trois ou rien 2015
3739,136449,0.00062,220228686.0,3841,Air America (1990),Action|Comedy,Air America 1990
3736,134853,0.00062,217652742.0,3838,Phantasm III: Lord of the Dead (1994),Horror,Phantasm III Lord of the Dead 1994
18022,134130,0.00062,216485820.0,94126,Bullhead (Rundskop) (2011),Crime|Drama,Bullhead Rundskop 2011
3733,122904,0.00062,198367056.0,3835,"Crush, The (1993)",Thriller,Crush The 1993
3731,122886,0.00062,198338004.0,3833,"Brain That Wouldn't Die, The (1962)",Horror|Sci-Fi,Brain That Wouldnt Die The 1962


# **Building a recommendation function**

In [33]:
def find_similar_movies(movie_id, min_rating=4, min_share=.10, top_n=10):
    """Recommend movies that fans of ``movie_id`` like more often than users in general.

    Steps:

    1. Find the users who rated ``movie_id`` above ``min_rating``.
    2. For every movie, compute the share of those users who also rated it
       above ``min_rating`` and keep movies whose share exceeds ``min_share``.
    3. For the kept movies, compute the share of all users (among those who
       liked at least one kept movie) who rated each above ``min_rating``.
    4. Score each movie as share-among-similar-users / share-among-all-users
       and return the highest scores.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    min_rating : float, default 4
        A rating counts as a "like" only when it is strictly greater than this.
    min_share : float, default 0.10
        Minimum share of similar users (exclusive) that must like a movie for
        it to be considered.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, highest score first.
        Empty when no user rated ``movie_id`` above ``min_rating``.
    """
    columns = ["score", "title", "genres"]

    # Filter the high ratings once; every later step only looks at these rows.
    liked = ratings[ratings["rating"] > min_rating]

    similar_users = liked[liked["movieId"] == movie_id]["userId"].unique()
    if len(similar_users) == 0:
        # Nobody liked this movie: nothing to recommend, and nothing to divide by.
        return pd.DataFrame(columns=columns)

    similar_user_recs = liked[liked["userId"].isin(similar_users)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")[columns]

# **Creating an interactive recommendation widget**

In [35]:
# Text box for the movie title plus an output area for the recommendations table.
movie_name_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)
recommendation_list = widgets.Output()


def on_type(data):
    """Show recommendations for the first search result of the typed title.

    ``data`` is the change notification passed in by ``observe``; its ``"new"``
    entry holds the current text. Earlier output is always cleared, and the
    lookup runs only when the text is longer than five characters.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        # The first row returned by search() is used as the movie to recommend from.
        matches = search(query)
        first_match_id = matches.iloc[0]["movieId"]
        display(find_similar_movies(first_match_id))


# Fire the handler on every change of the box's `value` trait.
movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()