# Movies Recommender System:
### Objective:-
   The objective of this project is to develop a movie recommender system. 
   The recommender system aims to provide top 5 similar genres movies as recommender based on the 
   selected movie using cosine similarity method.

- Importing all required libraries

In [459]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import HTML
from bs4 import BeautifulSoup
pd.set_option('display.max_columns', 500)


- Importing datasets and assigning new variable name

In [460]:
credits=pd.read_csv('tmdb_5000_credits.csv')
movies=pd.read_csv('tmdb_5000_movies.csv')
movies_complete=pd.read_csv('movies_complete.csv')


In [461]:
df_mc=movies_complete[['title','budget_musd','revenue_musd','poster_path']]
df_mc.shape

(44691, 4)

- Checking whether Url path is diplaying the correct images. 

In [462]:
subset=df_mc[['title','poster_path']].head(5)
HTML(subset.to_html(escape=False))


Unnamed: 0,title,poster_path
0,Toy Story,
1,Jumanji,
2,Grumpier Old Men,
3,Waiting to Exhale,
4,Father of the Bride Part II,


# Data preprocessing

- A function to extract only web adress from given full src

In [463]:
# Function to extract the full URL
def extract_full_url(html_script):
    soup = BeautifulSoup(html_script, 'html.parser')
    img_tag = soup.find('img')
    if img_tag and 'src' in img_tag.attrs:
        return img_tag['src']


In [464]:
subset

Unnamed: 0,title,poster_path
0,Toy Story,<img src='http://image.tmdb.org/t/p/w185//uXDf...
1,Jumanji,<img src='http://image.tmdb.org/t/p/w185//vgpX...
2,Grumpier Old Men,<img src='http://image.tmdb.org/t/p/w185//1FSX...
3,Waiting to Exhale,<img src='http://image.tmdb.org/t/p/w185//4wjG...
4,Father of the Bride Part II,<img src='http://image.tmdb.org/t/p/w185//lf9R...


In [465]:
subset['extracted_url'] = subset['poster_path'].apply(extract_full_url)

In [466]:
subset['poster_path'][0],subset['extracted_url'][0]

("<img src='http://image.tmdb.org/t/p/w185//uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg' style='height:100px;'>",
 'http://image.tmdb.org/t/p/w185//uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg')

In [467]:
movies.shape, credits.shape

((4803, 20), (4803, 4))

- Removing null values.

In [468]:
df_mc.isnull().sum()

title               0
budget_musd     35837
revenue_musd    37306
poster_path       224
dtype: int64

In [469]:
df_mc.dropna(subset=['poster_path'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mc.dropna(subset=['poster_path'],inplace=True)


In [470]:
df_mc['poster_path'] = df_mc['poster_path'].apply(extract_full_url)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mc['poster_path'] = df_mc['poster_path'].apply(extract_full_url)


- Merging dataset into one

In [471]:
movies=movies.merge(df_mc, on=['title'])

In [472]:
# movies.merge(credits, left_on='id', right_on='movie_id')
movies=movies.merge(credits, on='title')


In [473]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,budget_musd,revenue_musd,poster_path,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,237.0,2787.965087,http://image.tmdb.org/t/p/w185//btnl50ZDJDSCal...,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


# Taking only those columns which is required.

In [474]:
# genres, id, keywords, title, overview, cast, crew,poster_path

In [475]:
movies=movies[['movie_id','title','overview','genres','keywords','cast','crew','poster_path']]

In [476]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,poster_path
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",http://image.tmdb.org/t/p/w185//btnl50ZDJDSCal...


In [477]:
movies.isnull().sum()

movie_id       0
title          0
overview       1
genres         0
keywords       0
cast           0
crew           0
poster_path    0
dtype: int64

In [478]:
movies.dropna(inplace=True)


In [479]:
movies.duplicated().sum()

0

In [480]:
movies['genres'][0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

- #  Function for extracting names

In [481]:
import ast

def convert(obj):
    L=[]
    n=1
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L    


In [482]:
movies['genres']=movies['genres'].apply(lambda x: convert(x))
movies['keywords']=movies['keywords'].apply(lambda x: convert(x))


In [483]:
def cast_name(obj):
    L=[]
    n=1
    for i in ast.literal_eval(obj):
        if n<=3:
            L.append(i['name'])
            n=n+1
        else:
            break 
    return L
cast_name(movies['cast'][0])       

['Sam Worthington', 'Zoe Saldana', 'Sigourney Weaver']

In [484]:
movies['cast']=movies['cast'].apply(cast_name)

In [485]:
def Director_name(obj):
    L=[]
    for i in ast.literal_eval(obj):
        if i['job']=='Director':
            L.append(i['name'])
    return L
Director_name(movies['crew'][0])       

['James Cameron']

In [486]:
movies['crew']=movies['crew'].apply( lambda x: Director_name(x))

In [487]:
movies.columns

Index(['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew',
       'poster_path'],
      dtype='object')

In [488]:
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])
# movies['genres']


In [489]:
movies['overview']=movies['overview'].apply( lambda x: x.split())

# Creating tags column

In [490]:
# movies['overview']
movies['tags']=movies['overview']+movies['genres']+movies['keywords']+movies['cast']+movies['crew']
movies.head()


Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,poster_path,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],http://image.tmdb.org/t/p/w185//btnl50ZDJDSCal...,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],http://image.tmdb.org/t/p/w185//oVh3REsCwJwmrT...,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],http://image.tmdb.org/t/p/w185//672kUEMtTHcaVY...,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan],http://image.tmdb.org/t/p/w185//vzvKcPQ4o7TjWe...,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton],http://image.tmdb.org/t/p/w185//7GSSyUUgUEXm1r...,"[John, Carter, is, a, war-weary,, former, mili..."


In [491]:
new_df= movies[['movie_id','title','tags','poster_path']]
new_df

Unnamed: 0,movie_id,title,tags,poster_path
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...",http://image.tmdb.org/t/p/w185//btnl50ZDJDSCal...
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...",http://image.tmdb.org/t/p/w185//oVh3REsCwJwmrT...
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...",http://image.tmdb.org/t/p/w185//672kUEMtTHcaVY...
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...",http://image.tmdb.org/t/p/w185//vzvKcPQ4o7TjWe...
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...",http://image.tmdb.org/t/p/w185//7GSSyUUgUEXm1r...
...,...,...,...,...
5478,9367,El Mariachi,"[El, Mariachi, just, wants, to, play, his, gui...",http://image.tmdb.org/t/p/w185//zZYP9wJW6XAm80...
5479,72766,Newlyweds,"[A, newlywed, couple's, honeymoon, is, upended...",http://image.tmdb.org/t/p/w185//eINKU1ewT7B8sd...
5480,231617,"Signed, Sealed, Delivered","[""Signed,, Sealed,, Delivered"", introduces, a,...",http://image.tmdb.org/t/p/w185//6BVCgmhLeSTF8n...
5481,126186,Shanghai Calling,"[When, ambitious, New, York, attorney, Sam, is...",http://image.tmdb.org/t/p/w185//2a1q1RTxspKxGW...


In [492]:
new_df['tags']=new_df['tags'].apply( lambda x : " ".join(x))
new_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply( lambda x : " ".join(x))


Unnamed: 0,movie_id,title,tags,poster_path
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",http://image.tmdb.org/t/p/w185//btnl50ZDJDSCal...
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",http://image.tmdb.org/t/p/w185//oVh3REsCwJwmrT...
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,http://image.tmdb.org/t/p/w185//672kUEMtTHcaVY...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,http://image.tmdb.org/t/p/w185//vzvKcPQ4o7TjWe...
4,49529,John Carter,"John Carter is a war-weary, former military ca...",http://image.tmdb.org/t/p/w185//7GSSyUUgUEXm1r...
...,...,...,...,...
5478,9367,El Mariachi,El Mariachi just wants to play his guitar and ...,http://image.tmdb.org/t/p/w185//zZYP9wJW6XAm80...
5479,72766,Newlyweds,A newlywed couple's honeymoon is upended by th...,http://image.tmdb.org/t/p/w185//eINKU1ewT7B8sd...
5480,231617,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...",http://image.tmdb.org/t/p/w185//6BVCgmhLeSTF8n...
5481,126186,Shanghai Calling,When ambitious New York attorney Sam is sent t...,http://image.tmdb.org/t/p/w185//2a1q1RTxspKxGW...


In [493]:
new_df['tags']=new_df['tags'].apply( lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply( lambda x: x.lower())


# Creating vector for given tags column

In [494]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 5000, stop_words='english')

In [495]:
vectors=cv.fit_transform(new_df['tags']).toarray()
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [496]:
cv.get_feature_names_out()


array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

# Stemming function

In [497]:
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [498]:
ps.stem('loving')

'love'

In [499]:
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)    

In [500]:
new_df['tags']=new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(stem)


# Creating cosine_similarity for vector

In [501]:
from sklearn.metrics.pairwise import cosine_similarity

In [502]:
similarity=cosine_similarity(vectors)

In [503]:
list(enumerate(similarity[0]))


[(0, 1.0000000000000002),
 (1, 0.08603090020146065),
 (2, 0.0582716546748065),
 (3, 0.038778336716474064),
 (4, 0.17521916101261562),
 (5, 0.11357771260606365),
 (6, 0.019389168358237032),
 (7, 0.019389168358237032),
 (8, 0.17119059581558146),
 (9, 0.06243905410544627),
 (10, 0.07421560439929402),
 (11, 0.11295649894498103),
 (12, 0.07894736842105264),
 (13, 0.09197090092274487),
 (14, 0.04120428217151646),
 (15, 0.04120428217151646),
 (16, 0.04120428217151646),
 (17, 0.11128297681493143),
 (18, 0.04891159880445185),
 (19, 0.07894736842105264),
 (20, 0.07894736842105264),
 (21, 0.1442149876003076),
 (22, 0.10743376064838502),
 (23, 0.08749572785196143),
 (24, 0.055824219567359015),
 (25, 0.08471737420873576),
 (26, 0.08471737420873576),
 (27, 0.08471737420873576),
 (28, 0.08471737420873576),
 (29, 0.06917144638660747),
 (30, 0.09365858115816941),
 (31, 0.0533380747062665),
 (32, 0.0533380747062665),
 (33, 0.0533380747062665),
 (34, 0.051298917604257706),
 (35, 0.051298917604257706),
 (

In [504]:
sorted(list(enumerate(similarity[0])), reverse=True, key = lambda x: x[1])[1:6]

[(2923, 0.25334729596907),
 (690, 0.25038669783359574),
 (1771, 0.24455799402225925),
 (651, 0.24283093212859141),
 (1473, 0.2421000623531261)]

# Recommender function

In [505]:
def recommend(movies):
    movie_index=new_df[new_df['title']==movies].index[0]
    distances = similarity[movie_index]
    movies_list=sorted(list(enumerate(distances)), reverse=True, key = lambda x: x[1])[1:6] 
    
    for i in movies_list:
        print( new_df.iloc[i[0]].title)
        # print(i[0])
    

In [506]:
recommend('Batman Begins')

The Dark Knight
The Dark Knight
The Dark Knight Rises
Batman
Batman


# Importing pickle in order to save our created model

In [507]:
import pickle

In [508]:
pickle.dump(new_df, open('movies.pkl','wb'))

In [509]:
pickle.dump(new_df.to_dict(), open('movies_dict.pkl','wb'))

In [510]:
pickle.dump(similarity, open('similarity.pkl','wb'))