In [9]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the movie metadata CSV straight from its public GitHub URL
# (this cell needs network access).
dataset_url = "https://raw.githubusercontent.com/rashida048/Some-NLP-Projects/master/movie_dataset.csv"
df = pd.read_csv(dataset_url)

# A notebook cell renders only its last expression, so a bare `df.head()`
# in the middle of the cell is silently discarded. Print the shape and
# leave the preview as the final expression so both are visible.
print(df.shape)
df.head()

In [None]:
# Columns used by the content-based recommender.
features = [
    'title',
    'genres',
    'director',
    'cast',
    'keywords',
    'overview',
]

# Keep only those columns, then show how many values are missing in each.
df = df.loc[:, features]
df.isna().sum()

In [None]:
# Replace every missing entry with an empty string so the text columns
# can be concatenated without propagating NaN.
df = df.fillna(value="")

In [17]:
# Build one space-separated text field per movie from the descriptive
# columns (the title itself is deliberately not part of the text).
df['Combined'] = df['genres'].str.cat(
    df[['director', 'cast', 'keywords', 'overview']],
    sep=' ',
)

In [18]:
# Turn the combined text into TF-IDF vectors, ignoring English stop words.
vectorizer = TfidfVectorizer(
    stop_words="english",
)
feature_matrix = vectorizer.fit_transform(raw_documents=df["Combined"])

In [19]:
# Pairwise cosine similarity between every pair of movies; with Y left as
# None the rows of feature_matrix are compared against themselves.
similarity_matrix = cosine_similarity(
    X=feature_matrix,
    Y=None,
    dense_output=True,
)

In [20]:
# Function to get movie recommendations
def recommend_movies(movie_title, num_recommendations=5, movies=None, similarity=None):
    """Return the titles most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact title to look up in the ``title`` column.
    num_recommendations : int, default 5
        Maximum number of titles to return. Zero or a negative value
        yields an empty list.
    movies : pandas.DataFrame, optional
        Frame with a ``title`` column. Defaults to the notebook-level ``df``.
    similarity : array-like, optional
        Square similarity matrix whose row/column ``i`` corresponds to the
        ``i``-th row (by position) of ``movies``. Defaults to the
        notebook-level ``similarity_matrix``.

    Returns
    -------
    list or str
        Titles ordered from most to least similar, never including the
        queried row itself. If the title is absent, the string
        ``"Movie not found in dataset."`` is returned instead.
    """
    if movies is None:
        movies = df
    if similarity is None:
        similarity = similarity_matrix

    # Work with row *positions*, not index labels: the similarity matrix is
    # positional, so a label from a filtered/re-indexed frame would point at
    # the wrong row (or past the end of the matrix).
    positions = np.flatnonzero((movies['title'] == movie_title).to_numpy())
    if positions.size == 0:
        return "Movie not found in dataset."
    idx = int(positions[0])

    scores = np.asarray(similarity[idx], dtype=float).ravel()

    # Stable descending sort keeps tied scores in their original row order.
    order = np.argsort(-scores, kind='stable')

    # Drop the query row explicitly instead of assuming it sorted first:
    # duplicate entries (score 1.0) or an all-zero vector can put another
    # row ahead of it.
    order = order[order != idx]

    # Clamp so a negative request cannot turn into a near-full slice.
    limit = max(int(num_recommendations), 0)
    return movies['title'].to_numpy()[order[:limit]].tolist()

In [21]:
# Demo: look up recommendations for one known title and print them.
movie_name = "The Avengers"
similar_titles = recommend_movies(movie_name)
print(f"Movies similar to {movie_name}: {similar_titles}")

Movies similar to The Avengers: ['Avengers: Age of Ultron', 'Iron Man 2', 'Captain America: The Winter Soldier', 'Captain America: Civil War', 'X-Men']
