**Imports and Data Loading**

In [53]:
import numpy as np
import pandas as pd
import ast

In [54]:
# Load dataset from Google Drive
movies = pd.read_csv('/content/drive/MyDrive/Movie Recommendar System DataSets/tmdb_5000_movies.csv')
credits = pd.read_csv('/content/drive/MyDrive/Movie Recommendar System DataSets/tmdb_5000_credits.csv')

**Merge DataFrames**

In [11]:
# Merge movies and credits dataframes on 'title' column
movies = movies.merge(credits,on='title')

**Data Preprocessing**

In [13]:
# Select relevant columns

movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [14]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4809 non-null   int64 
 1   title     4809 non-null   object
 2   overview  4806 non-null   object
 3   genres    4809 non-null   object
 4   keywords  4809 non-null   object
 5   cast      4809 non-null   object
 6   crew      4809 non-null   object
dtypes: int64(1), object(6)
memory usage: 263.1+ KB


In [16]:
# Check for missing values
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [17]:
# Drop rows with missing values in 'overview'
movies.dropna(inplace=True)

In [18]:
# Check for duplicates
movies.duplicated().sum()

0

**Data Transformation**

In [19]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [20]:
# Convert genres, keywords, cast, and crew columns from JSON string to list of names
def convert(obj):
  L =[]
  for i in ast.literal_eval(obj):
    L.append(i['name'])
  return L

In [21]:
movies['genres'] = movies['genres'].apply(convert)

In [22]:
movies['keywords'] = movies['keywords'].apply(convert)

In [23]:
def convert_cast(obj):
  L = []
  counter = 0
  for i in ast.literal_eval(obj):
    if counter != 3:
      L.append(i['name'])
      counter+=1
    else:
      break
  return L

In [24]:
movies['cast'] = movies['cast'].apply(convert_cast)

In [25]:
def fetch_director(obj):
  L = []
  for i in ast.literal_eval(obj):
    if i['job'] == 'Director':
      L.append(i['name'])
      break
  return L

In [26]:
movies['crew'] = movies['crew'].apply(fetch_director)

**Tokenize and clean text data**

In [28]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [29]:
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [30]:
# Combine all tags into a single 'tags' column
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

**Final Dataset Preparation**

In [31]:
# Select final columns for recommendation
final_df = movies[['movie_id','title','tags']]

In [32]:
final_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."


In [33]:
final_df['tags'] = final_df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['tags'] = final_df['tags'].apply(lambda x:" ".join(x))


In [34]:
# Convert tags list to lowercase and join into a single string
final_df['tags'] = final_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['tags'] = final_df['tags'].apply(lambda x:x.lower())


In [35]:
final_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


**NLP and Machine Learning Setup**

In [36]:
# Install and import necessary libraries
!pip uninstall -y nltk scikit-learn
!pip install nltk==3.5 scikit-learn==0.24.2

Found existing installation: nltk 3.8.1
Uninstalling nltk-3.8.1:
  Successfully uninstalled nltk-3.8.1
Found existing installation: scikit-learn 1.2.2
Uninstalling scikit-learn-1.2.2:
  Successfully uninstalled scikit-learn-1.2.2
Collecting nltk==3.5
  Downloading nltk-3.5.zip (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting scikit-learn==0.24.2
  Downloading scikit-learn-0.24.2.tar.gz (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mPreparing metadata [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [

In [37]:
!pip install nltk scikit-learn

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn
  Downloading scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nltk, scikit-learn
Successfully installed nltk-3.8.1 scikit-learn-1.5.0


In [38]:
import nltk
import sklearn

print("nltk version:", nltk.__version__)
print("sklearn version:", sklearn.__version__)

nltk version: 3.8.1
sklearn version: 1.5.0


In [39]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [40]:
def stem(text):
  y = []
  for i in text.split():
    y.append(ps.stem(i))
  return " ".join(y)

In [41]:
final_df['tags'] = final_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['tags'] = final_df['tags'].apply(stem)


In [42]:
# Vectorize text using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english') #cv-object

In [43]:
vectors = cv.fit_transform(final_df['tags']).toarray()

In [44]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0])

In [45]:
# Compute cosine similarity matrix
from sklearn.metrics.pairwise import cosine_similarity

In [46]:
similarity = cosine_similarity(vectors)

**Recommendation Function and Model Persistence**

In [47]:
# Recommendation function
def recommend(movie):
  movie_index = final_df[final_df['title'] == movie].index[0]
  distances = similarity[movie_index]
  movies_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]

  for i in movies_list:
    print(final_df.iloc[i[0]].title)

In [48]:
recommend('Pacific Rim')

Broken Arrow
Red Tails
The Time Machine
Oblivion
Firefox


**Save final_df and similarity matrix using pickle**


In [49]:
import pickle

In [50]:
pickle.dump(final_df,open('movies.pkl','wb'))

In [52]:
pickle.dump(similarity,open('similarity.pkl','wb'))