In [None]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd
import numpy as np

# Data Acquisition

In [None]:
# Both CSV files are fetched from the same Kaggle dataset, so the handle is
# named once and the two files are loaded in order (movies first, then credits).
DATASET_HANDLE = "tmdb/tmdb-movie-metadata"
movies_file_path = "tmdb_5000_movies.csv"
credits_file_path = "tmdb_5000_credits.csv"

movies, credits = (
  kagglehub.load_dataset(KaggleDatasetAdapter.PANDAS, DATASET_HANDLE, csv_name)
  for csv_name in (movies_file_path, credits_file_path)
)

In [None]:
# Merge both dataframes into one.
# Join on the movie id instead of the title: titles are not guaranteed to be
# unique, and a title join pairs every film with the credits of every other
# film sharing its name, producing spurious extra rows.
# NOTE(review): assumes `credits` exposes the id as `movie_id` and carries its
# own `title` column (dropped here to avoid title_x/title_y suffixes) --
# confirm against credits.columns.
movies = movies.merge(
    credits.drop(columns=["title"]),
    left_on="id",
    right_on="movie_id",
)

In [None]:
# Print a summary of the merged frame (column names, non-null counts, dtypes).
movies.info()

In [None]:
# Keep only the identifier, the title and the columns that feed the tags.
KEEP_COLUMNS = ['id', 'title', 'genres', 'keywords', 'overview', 'cast', 'crew']
movies = movies[KEEP_COLUMNS]

# Preprocessing

In [None]:
# Check for null values, then drop the affected rows.
null_counts = movies.isnull().sum()

# Rebind instead of mutating in place, and renumber the rows 0..n-1 so that
# index labels keep matching row positions (the similarity lookup further
# down addresses rows by position).
movies = movies.dropna().reset_index(drop=True)

# A notebook cell only renders its last expression, so the per-column null
# counts (taken before the drop) go last to actually be displayed.
null_counts

In [None]:
# Re-count missing values per column to confirm none are left.
movies.isna().sum()

In [None]:
# Check and remove duplicates, if any.
# Count fully identical rows first, then actually drop them (previously the
# cell only counted). The index is renumbered so labels stay equal to row
# positions.
duplicate_count = movies.duplicated().sum()
movies = movies.drop_duplicates().reset_index(drop=True)

# Displayed as the cell output: number of duplicate rows that were removed.
duplicate_count

In [None]:
# Peek at the raw, not-yet-parsed genres and keywords values of one row.
# A notebook cell only renders its last expression, so both values are
# returned together as a tuple instead of as two separate statements.
sample_row = movies.iloc[1]
(sample_row.genres, sample_row.keywords)
# sample_row.cast  # uncomment to inspect the raw cast value instead

In [None]:
# Give the genres, keywords, cast and crew columns a proper structure
# (parse their string values into Python lists).

# helper functions
import ast
def convert(obj):
    """Parse a stringified list of dicts and return each entry's 'name'.

    Args:
        obj: String holding a Python-literal list of dicts, each with a
            'name' key (parsed with ``ast.literal_eval``).

    Returns:
        List of the 'name' values, in their original order.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

def fetch_director(obj):
  """Return the first crew member whose job is 'Director', as a list.

  Args:
    obj: String holding a Python-literal list of dicts, each with 'job'
      and 'name' keys (parsed with ``ast.literal_eval``).

  Returns:
    A one-element list with the first director's name, or an empty list
    when no entry has the job 'Director'.
  """
  for member in ast.literal_eval(obj):
    if member['job'] == 'Director':
      # Only the first director is kept.
      return [member['name']]
  return []


In [None]:
# Turn each raw genres string into a list of genre names.
movies['genres'] = movies['genres'].map(convert)

In [None]:
# Turn each raw keywords string into a list of keyword names.
movies['keywords'] = movies['keywords'].map(convert)

In [None]:
# Reduce each raw crew string to a list holding the first director's name.
movies['crew'] = movies['crew'].map(fetch_director)

In [None]:
# Split each overview on whitespace into a list of words.
movies['overview'] = movies['overview'].map(str.split)

In [None]:
# cast is still the raw string at this point: unlike genres, keywords and
# crew it was never parsed, so iterating over it would yield single
# characters instead of names. Parse it into a list of names first.
# The isinstance guard keeps the cell safe to re-run once cast holds lists.
# NOTE(review): assumes every cast entry has a 'name' key, as convert()
# requires -- confirm against the raw data.
movies['cast'] = movies['cast'].apply(
    lambda raw: convert(raw) if isinstance(raw, str) else raw
)

# remove white spaces inside every list entry, so that a multi-word name
# becomes one token once the lists are joined into the tags text
for column in ['cast', 'crew', 'genres', 'keywords']:
    movies[column] = movies[column].apply(
        lambda names: [name.replace(" ", "") for name in names]
    )

In [None]:
# Concatenate the per-movie lists, left to right, into a single 'tags' list.
tag_columns = ['overview', 'genres', 'keywords', 'cast', 'crew']
combined = movies[tag_columns[0]]
for column in tag_columns[1:]:
    combined = combined + movies[column]
movies['tags'] = combined

In [None]:
# getting our new df
# .copy() makes new_df an independent frame: the following cells assign to
# new_df['tags'], which on a bare slice of `movies` raises
# SettingWithCopyWarning and is not guaranteed to behave consistently.
new_df = movies[['id', 'title', 'tags']].copy()

In [None]:
# Collapse each list of tag tokens into one space-separated string.
new_df['tags'] = new_df['tags'].map(" ".join)

In [None]:
# Lower-case the tag text.
new_df['tags'] = new_df['tags'].str.lower()

## Stemming

In [None]:
from nltk.stem.porter import PorterStemmer

In [None]:
# Single shared stemmer instance; stem() below calls ps.stem on every word.
ps = PorterStemmer()

In [None]:
def stem(text):
  """Stem every whitespace-separated word of ``text`` with ``ps``.

  Args:
    text: String of words separated by whitespace.

  Returns:
    The stemmed words joined back together with single spaces.
  """
  return " ".join(ps.stem(word) for word in text.split())

In [None]:
# Replace every word of each tag string with its stem.
new_df['tags'] = new_df['tags'].map(stem)

# Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words vectorizer: the vocabulary is capped at MAX_FEATURES terms and
# scikit-learn's built-in English stop-word list is applied.
MAX_FEATURES = 5000
cv = CountVectorizer(stop_words='english', max_features=MAX_FEATURES)

In [None]:
# Fit the vocabulary on the tags and count term occurrences per movie, then
# expand the result into a dense array.
tag_counts = cv.fit_transform(new_df['tags'])
vectors = tag_counts.toarray()

# Model

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Pairwise cosine similarity between all rows of `vectors`:
# similarity[i][j] compares row i with row j.
similarity = cosine_similarity(vectors)

In [None]:
def recommend(movie, top_n=5):
  """Print the titles of the movies most similar to ``movie``.

  Args:
    movie: Exact title to look up in ``new_df['title']``.
    top_n: How many recommendations to print (default 5).

  Raises:
    IndexError: If no row of ``new_df`` has the given title.
  """
  # Locate the movie by row *position*, not by index label: `similarity` is
  # addressed positionally, and new_df's labels are not guaranteed to be
  # 0..n-1 once rows have been dropped.
  matches = np.flatnonzero((new_df['title'] == movie).to_numpy())
  if matches.size == 0:
    raise IndexError(f"movie {movie!r} not found in new_df['title']")
  movie_pos = int(matches[0])

  distances = similarity[movie_pos]
  ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)

  # Exclude the query movie explicitly instead of assuming it sorts first
  # (ties or an all-zero vector can place it elsewhere in the ranking).
  neighbours = [pos for pos, _ in ranked if pos != movie_pos][:top_n]

  for pos in neighbours:
    print(new_df['title'].iloc[pos])

In [None]:
# Example: print the recommendations for the title 'Avatar'.
recommend('Avatar')