# In [76]:
import ast  # For dealing with JSON-like strings
import pickle  # For saving and loading data
from ast import literal_eval  # For safely evaluating literal expressions

import numpy as np
import pandas as pd  # For loading and manipulating the tabular movie data
from nltk.stem.porter import PorterStemmer  # For word stemming
from sklearn.feature_extraction.text import CountVectorizer  # For converting text to numerical vectors
from sklearn.metrics.pairwise import cosine_similarity  # For calculating similarity scores
# Load the movie metadata and the credits table from disk.
movies = pd.read_csv(r"C:\Users\Aman Chaturvedi\Downloads\tmdb\tmdb_5000_movies.csv")
credits = pd.read_csv(r"C:\Users\Aman Chaturvedi\Downloads\tmdb\tmdb_5000_credits.csv")

# Join the two tables on the shared 'title' column, then keep only the
# columns the recommender uses.
# NOTE(review): a title-based join emits one row per matching pair, so a
# title repeated in either file produces extra rows -- confirm this is
# acceptable or switch to an id-based join.
movies = movies.merge(credits, on='title')
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Remove rows with missing data. Reassigning (instead of inplace=True on a
# column selection) avoids pandas' chained-assignment warnings further down.
movies = movies.dropna()

# Remove exact duplicate rows (previously they were only counted).
movies = movies.drop_duplicates()

# Renumber rows 0..n-1 so that index labels equal row positions; the
# similarity matrix built later is addressed by position.
movies = movies.reset_index(drop=True)

# Extract and convert relevant information from JSON-like columns (genres, keywords, cast, crew)
def convert(text):
    """Return the 'name' value of every entry in a stringified list of dicts.

    Args:
        text: A string holding a Python/JSON-like literal, e.g.
            '[{"id": 1, "name": "Action"}]'.

    Returns:
        A list with the 'name' field of each entry, in the original order.

    Raises:
        ValueError / SyntaxError: if *text* is not a valid literal.
        KeyError: if an entry has no 'name' key.
    """
    records = ast.literal_eval(text)
    return [record['name'] for record in records]

# Replace each JSON-like string column with the plain list of names it holds
columns_to_convert = ['genres', 'keywords', 'cast', 'crew']
for col in columns_to_convert:
    movies[col] = movies[col].map(convert)

# Break every overview into a list of whitespace-separated words
movies['overview'] = movies['overview'].apply(str.split)

# Concatenate the per-movie word/name lists into a single 'tags' list
tag_columns = ['overview', 'genres', 'keywords', 'cast', 'crew']
movies['tags'] = (
    movies['overview']
    + movies['genres']
    + movies['keywords']
    + movies['cast']
    + movies['crew']
)

# 'new' keeps everything except the columns that were folded into 'tags'
new = movies.drop(tag_columns, axis=1)

# Text-preprocessing helpers: a Porter stemmer and a bag-of-words vectorizer
# limited to the 5000 most frequent terms, with English stop words removed
ps = PorterStemmer()
cv = CountVectorizer(stop_words='english', max_features=5000)

# Define a function to perform word stemming
def stem(tag_list):
    """Return a new list holding the Porter stem of each word in *tag_list*.

    The order of the words is preserved; the input is not modified.
    """
    stemmed_words = []
    for token in tag_list:
        stemmed_words.append(ps.stem(token))
    return stemmed_words

# Stem every tag word, then collapse each movie's word list into one
# space-separated string (the input format CountVectorizer expects)
new['tags'] = new['tags'].apply(lambda words: ' '.join(stem(words)))

# Turn the tag strings into a dense movie-by-term count matrix
vector = cv.fit_transform(new['tags']).toarray()

# Pairwise cosine similarity between every pair of movie tag vectors
similarity = cosine_similarity(vector)

# To recommend similar movies based on input movie title
def recommend(movie):
    """Print the titles of the five movies most similar to *movie*.

    The lookup is an exact match against the 'title' column of ``new``.
    If the title is absent, a "not found" message is printed instead.

    Args:
        movie: Title of the movie to find recommendations for.

    Returns:
        None. Results are written to standard output, one title per line.
    """
    # Work with row *positions*, not index labels: ``similarity`` is a plain
    # array aligned with the row order of ``new``, and the labels of ``new``
    # may have gaps after rows were dropped during cleaning.
    matches = np.flatnonzero((new['title'] == movie).to_numpy())
    if matches.size == 0:
        print(f"'{movie}' not found in the movie database.")
        return

    position = int(matches[0])
    titles = new['title'].to_numpy()

    # Rank every movie by similarity to the query, highest first.
    ranked = sorted(
        enumerate(similarity[position]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Skip the query movie explicitly instead of assuming it sorts first
    # (another movie with identical tags could tie with it).
    top_matches = [pos for pos, _ in ranked if pos != position][:5]
    for pos in top_matches:
        print(titles[pos])

# Serialize and save data using pickle for later use.
# Each file is opened in a `with` block so its handle is flushed and closed
# even if pickling fails part-way through.
with open('movie.pkl', 'wb') as fh:
    pickle.dump(new, fh)
with open('movie_dict.pkl', 'wb') as fh:
    pickle.dump(new.to_dict(), fh)
with open('similarity.pkl', 'wb') as fh:
    pickle.dump(similarity, fh)

# Quick manual check of the recommender. The title lookup in recommend() is
# an exact, case-sensitive comparison.
recommend('The avengers')

# Output: 'The avengers' not found in the movie database.
