In [20]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem import WordNetLemmatizer

In [None]:
# Load the dataset
df = pd.read_csv('tmdb_5000_movies.csv')

In [None]:
# Replace NaN values with empty strings
df['overview'] = df['overview'].fillna('')

In [None]:
# Create the lemmatizer object
lemmatizer = WordNetLemmatizer()

In [None]:
# Define a function to preprocess the text
def preprocess(text):
    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Lemmatize each token
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join the lemmatized tokens back into a single string
    preprocessed_text = ' '.join(lemmatized_tokens)

    return preprocessed_text

In [None]:
# Apply the preprocessing function to the overview column
df['overview'] = df['overview'].apply(preprocess)

In [None]:
# Create the TF-IDF vectorizer object
tfidf_vectorizer = TfidfVectorizer()

In [None]:
# Fit the vectorizer object to the preprocessed text
tfidf_matrix = tfidf_vectorizer.fit_transform(df['overview'])

In [22]:
print(tfidf_matrix.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [None]:
# Define a function to get recommendations based on movie title
def get_recommendations(title):
    # Get the index of the movie with the specified title
    movie_index = df[df['title'] == title].index.values[0]

    # Get the pairwise similarity scores of all movies with that movie
    similarity_scores = list(enumerate(cosine_sim[movie_index]))

    # Sort the movies based on the similarity scores
    sorted_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    print("Sorted scores:")
    print(sorted_scores)

    # Get the movie indices of the top similar movies
    top_indices = [i[0] for i in sorted_scores[1:6]] # Top 5 similar movies

    # Return the top similar movies
    return df['title'].iloc[top_indices]


In [21]:
# Test the model with a sample movie title
title = 'The Dark Knight Rises'
print('Recommendations for movie:', title)
print(get_recommendations(title))

Recommendations for movie: The Dark Knight Rises
Sorted scores:
[(3, 1.0), (65, 0.3411299023806399), (299, 0.3024870431113532), (428, 0.29955275760269007), (1359, 0.2828938599002321), (3854, 0.20868998351544238), (119, 0.19420297455913563), (2507, 0.17584676920587108), (210, 0.15346458416689748), (1181, 0.13950812972911003), (9, 0.1365976529968677), (1398, 0.1224545801630694), (1068, 0.11831892541050165), (1984, 0.11560394695952919), (879, 0.11444988565793487), (2193, 0.1088199996827923), (979, 0.10490151486797258), (56, 0.1033919788104789), (160, 0.10261250133756737), (1349, 0.10163320048529488), (3942, 0.10152770074658561), (2416, 0.10140764184344911), (1246, 0.10138036889498243), (286, 0.10037919324504875), (790, 0.1000459776363481), (2850, 0.09894135785006644), (198, 0.09588085053775502), (1253, 0.09474715883808713), (1766, 0.09407033221109312), (1491, 0.09335354998573411), (1492, 0.09335006222595493), (1274, 0.09225118877442827), (2921, 0.09222843160011068), (296, 0.09182850293799