[![GitHub Repository](https://img.shields.io/badge/GitHub-Repository-black?style=for-the-badge&logo=github)](https://github.com/anmmashud/Recommendation_System_TMDB_5000_Movies)

[![Live Preview](https://img.shields.io/badge/Live%20Preview-Streamlit-brightgreen?style=for-the-badge&logo=streamlit)](https://recommendation-system-tmdb-5000-movies.streamlit.app/)


In [1]:
import numpy as np
import pandas as pd

# make a copy of any dataframe
import copy

# convert to string format
import ast

# for stemming words
import nltk
from nltk.stem.porter import PorterStemmer

# for vectorization
from sklearn.feature_extraction.text import CountVectorizer

# for calculating cosine distance/ similarity
from sklearn.metrics.pairwise import cosine_similarity

# pickle format
import pickle

# importing all files from kaggel directory
# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

df1 = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv')
df2 = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv')

# converting column index `movie_id` to `id`
df2['id'] = df2['movie_id']

# there are two title in 2 different dataset
# we will convert one for tags and one for indexing
df1['title_tag'] = df1['title']

# this will create new column but 2 title columns are still exits
# delete one otherwise they will conflict during merging
df2 = df2.drop(columns=['title'])

# merge datasets
df = df1.merge(df2, on='id')

# creating new dataframe for our requirement
movies = df[['id','title','title_tag','overview','genres',
             'keywords','original_language','production_companies',
             'tagline','cast','crew']]
movies['overview'] = movies['overview'].fillna(" ")
movies['tagline'] = movies['tagline'].fillna(" ")

# ===================== #
# Feature Engeneering 
# ===================== #

# first make a copy
movies2 = movies.copy()

# function fr genres column
def convert_genres(obj):
    List = []
    for i in ast.literal_eval(obj):
        List.append(f"gen_{i['name']}")
    text = ", ".join(List)
    text = text.lower().replace(" ","").replace(","," ")
    return text

# function fr keywords column
def convert_keywords(obj):
    List = []
    for i in ast.literal_eval(obj):
        List.append(i['name'])
    text = ", ".join(List)
    text = text.lower().replace(" ","").replace(",", " ")
    return text

# function for productin companies
def convert_pc(obj):
    List = []
    for i in ast.literal_eval(obj):
        List.append(i['name'])
    text = ", ".join(List)
    text = text.lower().replace(" ","").replace(","," ").replace("."," ").replace("  "," ")
    return text

# funciton for extracting top 5 cast
def top_cast(obj):
    data = ast.literal_eval(obj)
    top5 = data[:5]
    top5_cast = [] 
    for member in top5:
        name = member['name'].replace(' ','').replace('"'," ").replace("."," ").replace("-","").lower()
        character = member['character'].replace(' ','').replace('"'," ").replace("."," ").replace("-","").lower()
        top5_cast.append(f"{name} {character}")
    return " ".join(top5_cast) 

# for extrating crew (director)
def fetch_director(obj):
    data = ast.literal_eval(obj)
    director = []
    for i in data:
        if i['job'] == 'Director':
            name = i['name'].replace(' ','').replace('"'," ").replace("."," ").replace("-","").lower()
            director.append(f"{name}")
    return " ".join(director)

# applying function 
movies2['genres'] = movies2['genres'].apply(convert_genres)
movies2['keywords'] = movies2['keywords'].apply(convert_keywords)
movies2['production_companies'] = movies2['production_companies'].apply(convert_pc)
movies2['cast'] = movies2['cast'].apply(top_cast)
movies2['crew'] = movies2['crew'].apply(fetch_director)

# converting original language
movies2['original_language'] = "ol_" + movies2['original_language'].astype(str)

# converting all columns into list format to concate them with each other
columns = ['title_tag','overview','genres','keywords',
           'original_language','production_companies',
           'tagline','cast','crew']
for i in columns:
    movies2[i] = movies2[i].apply(lambda x:x.lower().split())

# concating all columns into `tags` column
movies2["tags"] = movies2['title_tag'] + movies2['overview'] + movies2['genres'] + movies2['keywords'] + movies2['original_language'] + movies2['production_companies'] + movies2['tagline'] + movies2['cast'] + movies2['crew']

# creating a fresh dataframe with tags
movies_tags = pd.DataFrame(movies2[['id', 'title', 'tags']])

# coverting tags column into string
movies_tags['tags'] = movies_tags['tags'].apply(lambda x: ' '.join(x))

# making a function to clean the column
def clean_text(text):
    for ch in ['(', ')', '.', ',','-']:     # remove these characters completely
        text = text.replace(ch, '')
    text = ' '.join(text.split())       # replace multiple spaces with single space because it joinning only words with single space
    return text

# applying function on tags column to remove extra symbols
movies_tags['tags'] = movies_tags['tags'].apply(clean_text)

# Stemming text to reduce words size and remove similar like words
ps = PorterStemmer()

# creating function to apply stemming to all row's `tags` column
def stem(text):
    a = []
    for i in text.split():
        a.append(ps.stem(i))
    return " ".join(a)

# applyting stem on tags
movies_stem = movies_tags.copy()
movies_stem['tags'] = movies_stem['tags'].apply(stem)

# ===================== #
# Vectorizing
# ===================== #

# creating another dataframe for vectorizing 
vector = movies_stem.copy()

# fixing stop words on english
cv = CountVectorizer(stop_words='english')

# covert into vector
vector = cv.fit_transform(movies_stem['tags']).toarray()

# calculate conine similarity to determine the similar movies
similarity = cosine_similarity(vector)

# ===================== #
# Final recommendation
# ===================== #

# Function to fine the index of top10 similar movies, fetching names
def recommend(movie):
    movie_index = movies2[movies2['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)),reverse=True, key = lambda x:x[1])[1:21] # we can remove [0] as we did it n `distance`

    for i in movies_list:
        print(movies2.iloc[i[0]].title)

print("\n================================\n\n✅ All Tranning Compleate....!\n..........................")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['overview'] = movies['overview'].fillna(" ")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['tagline'] = movies['tagline'].fillna(" ")




✅ All Tranning Compleate....!
..........................


<h1 align="center">Testing</h1>

In [2]:
recommend("Pirates of the Caribbean: At World's End")

Pirates of the Caribbean: Dead Man's Chest
Pirates of the Caribbean: The Curse of the Black Pearl
Pirates of the Caribbean: On Stranger Tides
The Pirates! In an Adventure with Scientists!
Cutthroat Island
9
G-Force
The Prophecy
The Pirate
Shipwrecked
Megiddo: The Omega Code 2
VeggieTales: The Pirates Who Don't Do Anything
Pan
Waterworld
Pocahontas
Nim's Island
The Dead Girl
Journey to the Center of the Earth
Noah
Thor


In [4]:
movies2.head(2)

Unnamed: 0,id,title,title_tag,overview,genres,keywords,original_language,production_companies,tagline,cast,crew,tags
0,19995,Avatar,[avatar],"[in, the, 22nd, century,, a, paraplegic, marin...","[gen_action, gen_adventure, gen_fantasy, gen_s...","[cultureclash, future, spacewar, spacecolony, ...",[ol_en],"[ingeniousfilmpartners, twentiethcenturyfoxfil...","[enter, the, world, of, pandora.]","[samworthington, jakesully, zoesaldana, neytir...",[jamescameron],"[avatar, in, the, 22nd, century,, a, paraplegi..."
1,285,Pirates of the Caribbean: At World's End,"[pirates, of, the, caribbean:, at, world's, end]","[captain, barbossa,, long, believed, to, be, d...","[gen_adventure, gen_fantasy, gen_action]","[ocean, drugabuse, exoticisland, eastindiatrad...",[ol_en],"[waltdisneypictures, jerrybruckheimerfilms, se...","[at, the, end, of, the, world,, the, adventure...","[johnnydepp, captainjacksparrow, orlandobloom,...",[goreverbinski],"[pirates, of, the, caribbean:, at, world's, en..."


In [None]:
import pickle
pickle.dump(movies2.to_dict(),open('movies_dict.pkl','wb'))

In [None]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))

In [None]:
movies2['id'].values

In [5]:
df.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,spoken_languages,status,tagline,title,vote_average,vote_count,title_tag,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
