In [None]:
import pandas as pd

In [None]:
# Folder that holds the two TMDB 5000 CSV exports. Keeping it in one place means
# relocating the data is a one-line change.
# NOTE(review): this is a Colab Google Drive path; no mount step is visible in
# this notebook, so Drive is presumably mounted beforehand - confirm.
DATA_DIR = "/content/drive/MyDrive/Study/Projects/Movie Recommender System"

# data_1: per-movie credits (cast / crew), data_2: per-movie metadata.
data_1 = pd.read_csv(f"{DATA_DIR}/tmdb_5000_credits.csv")
data_2 = pd.read_csv(f"{DATA_DIR}/tmdb_5000_movies.csv")

In [None]:
# Preview the first row of the credits table to see which columns it provides.
data_1.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [None]:
# Preview the first row of the movies metadata table to see which columns it provides.
data_2.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [None]:
# Join on the TMDB id rather than on "title": titles are not guaranteed to be
# unique, and a title join cross-matches every pair of films that share a title,
# producing extra rows that combine one film's metadata with another's credits.
# The credits table's own "title" column is dropped first so the merged frame
# keeps a single, unsuffixed "title" (the one from the movies table).
movies = data_2.merge(data_1.drop(columns="title"), left_on="id", right_on="movie_id")

# Keep only the columns used to build the recommendation tags.
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
movies.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [None]:
# Count missing values per column to decide how to handle them.
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [None]:
# Drop rows that contain missing values, then renumber the index 0..n-1 so row
# labels keep matching row positions: the similarity matrix computed later is
# addressed purely by position. Rebinding instead of using inplace=True also
# keeps this cell safe to re-run.
movies = movies.dropna().reset_index(drop=True)

In [None]:
from ast import literal_eval

def cast_preprocessing(cell, top_n=3):
  """Extract the names of the first `top_n` cast entries from a cast cell.

  Parameters
  ----------
  cell : str
    Text of a list of dicts, each with a "name" key, written in a syntax that
    `ast.literal_eval` can parse.
  top_n : int or None, default 3
    How many leading entries to keep. The default reproduces the previous
    hard-coded behaviour; None keeps every entry.

  Returns
  -------
  list of str
    The "name" values, in the order they appear in the cell.
  """
  cast_entries = literal_eval(cell)
  return [entry["name"] for entry in cast_entries[:top_n]]

def crew_preprocessing(cell):
  """Return the names of all crew entries whose "job" is exactly "Director".

  `cell` is the text of a list of dicts (each with "job" and "name" keys) that
  `ast.literal_eval` can parse. Names are returned in their original order; the
  result is an empty list when no entry is a director.
  """
  return [
      member['name']
      for member in literal_eval(cell)
      if member["job"] == "Director"
  ]

def genres_keywords_preprocessing(cell):
  """Parse a genres or keywords cell and return the "name" of every entry.

  `cell` is the text of a list of dicts (each with a "name" key) that
  `ast.literal_eval` can parse. The order of the entries is preserved.
  """
  parsed_entries = literal_eval(cell)
  return list(map(lambda entry: entry["name"], parsed_entries))

def collapse(cell):
  """Lower-case every string in `cell` and strip out its space characters.

  Turns multi-word names into single tokens (e.g. "Sam Worthington" becomes
  "samworthington") so that each name is treated as one word downstream.
  Returns a new list; `cell` itself is not modified.
  """
  return [text.lower().replace(" ", "") for text in cell]

import nltk
# Download the NLTK stop-word corpus; overview_preprocessing reads its English list.
nltk.download('stopwords')

def _english_stopwords():
  """Return NLTK's English stop words as a frozenset, fetching them only once."""
  cached = getattr(_english_stopwords, "_cached", None)
  if cached is None:
    cached = frozenset(nltk.corpus.stopwords.words('english'))
    # Memoise on the function object so repeated calls skip the corpus lookup.
    _english_stopwords._cached = cached
  return cached

def overview_preprocessing(cell):
  """Tokenise an overview string and drop English stop words.

  Commas and full stops are removed, the text is lower-cased and split on
  whitespace, and every token found in NLTK's English stop-word list is
  discarded. Other punctuation is left attached to its token.

  Parameters
  ----------
  cell : str
    The raw overview text.

  Returns
  -------
  list of str
    The remaining tokens, in their original order.
  """
  # Previously the stop-word list was fetched on every call and each token was
  # tested against that list; a cached set gives the same result with
  # constant-time lookups and a single corpus read.
  stopword_set = _english_stopwords()
  words = cell.replace(",", "").replace(".", "").lower().split()
  return [word for word in words if word not in stopword_set]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Columns holding stringified lists of dicts, mapped to the parser that pulls
# the relevant names out of each cell.
list_column_parsers = {
    "cast": cast_preprocessing,
    "crew": crew_preprocessing,
    "genres": genres_keywords_preprocessing,
    "keywords": genres_keywords_preprocessing,
}
for column_name, parse_cell in list_column_parsers.items():
  # Parse the cell into a list of names, then squash each name into one token.
  movies[column_name] = movies[column_name].apply(parse_cell).apply(collapse)

# The overview is free text, so it gets its own tokeniser / stop-word filter.
movies["overview"] = movies["overview"].apply(overview_preprocessing)

In [None]:
# After preprocessing, "crew" only holds director names, so name it accordingly;
# "movie_id" is shortened to "id".
movies = movies.rename(columns={"movie_id": "id", "crew": "directors"})

In [None]:
# Inspect the first two rows after preprocessing and renaming.
movies.head(2)

Unnamed: 0,id,title,overview,genres,keywords,cast,directors
0,19995,Avatar,"[22nd, century, paraplegic, marine, dispatched...","[action, adventure, fantasy, sciencefiction]","[cultureclash, future, spacewar, spacecolony, ...","[samworthington, zoesaldana, sigourneyweaver]",[jamescameron]
1,285,Pirates of the Caribbean: At World's End,"[captain, barbossa, long, believed, dead, come...","[adventure, fantasy, action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[johnnydepp, orlandobloom, keiraknightley]",[goreverbinski]


In [None]:
# Concatenate the token lists of each row, in this order, into one list per
# movie, then flatten that list into a single space-separated string.
tag_columns = ["overview", "genres", "keywords", "cast", "directors"]
combined_tokens = movies[tag_columns[0]]
for column_name in tag_columns[1:]:
  combined_tokens = combined_tokens + movies[column_name]
movies["tags"] = combined_tokens.map(" ".join)

# From here on only the identifier, the title and the tag string are needed.
movies = movies[["id", "title", "tags"]]

movies.head(2)

Unnamed: 0,id,title,tags
0,19995,Avatar,22nd century paraplegic marine dispatched moon...
1,285,Pirates of the Caribbean: At World's End,captain barbossa long believed dead come back ...


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Turn every movie's tag string into a TF-IDF weighted term vector, using the
# vectorizer's default settings; row i of the matrix corresponds to the i-th
# row of `movies`.
tfidfv = TfidfVectorizer()
tfidf_matrix = tfidfv.fit_transform(movies["tags"])

# Sanity check: the matrix should have exactly one row per movie.
print(tfidf_matrix.shape)
print(len(movies))

(4806, 35892)
4806


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all movie vectors. Entry [i, j] compares
# the i-th and j-th rows of tfidf_matrix, i.e. it is addressed by row position.
similarity = cosine_similarity(tfidf_matrix)

# Expect a square matrix with one row and one column per movie.
similarity.shape

(4806, 4806)

In [None]:
def recommender(movie_name, number_of_recommendations):
  """Return the titles most similar to `movie_name`, best match first.

  Parameters
  ----------
  movie_name : str
    Exact title to look up in `movies["title"]`; if several rows carry this
    title, the first one is used.
  number_of_recommendations : int
    Maximum number of titles to return. Values <= 0 give an empty list.

  Returns
  -------
  list of str
    Titles ordered by decreasing similarity to the query movie. Movies with
    equal scores keep their row order. The query row itself is never included.

  Raises
  ------
  IndexError
    If `movie_name` does not occur in `movies["title"]`.
  """
  # Work with row *positions* throughout: `similarity` is a plain matrix whose
  # rows and columns follow the row order of `movies`, whereas the DataFrame's
  # index labels may contain gaps (for example after rows have been dropped).
  titles = movies["title"].tolist()
  try:
    movie_position = titles.index(movie_name)
  except ValueError:
    raise IndexError(f"movie {movie_name!r} not found in movies['title']") from None

  scores = similarity[movie_position]

  # Rank positions instead of score values. Looking a score value back up
  # resolves ties to the same first position, which would repeat one title and
  # hide the others. sorted() is stable, also with reverse=True.
  ranked_positions = sorted(
      range(len(titles)),
      key=lambda position: scores[position],
      reverse=True,
  )

  # Remove the query movie by position rather than assuming it sorts first.
  candidates = [position for position in ranked_positions if position != movie_position]

  return [titles[position] for position in candidates[:max(number_of_recommendations, 0)]]

In [None]:
# Example: ask for the five titles most similar to "Titanic".
recommender("Titanic", 5)

['Raise the Titanic',
 'Poseidon',
 'Ghost Ship',
 'Pirates of the Caribbean: On Stranger Tides',
 'In the Heart of the Sea']

In [None]:
# Example: ask for the two titles most similar to one of the Pirates of the Caribbean films.
recommender("Pirates of the Caribbean: On Stranger Tides", 2)

["Pirates of the Caribbean: Dead Man's Chest",
 'Pirates of the Caribbean: The Curse of the Black Pearl']