In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
# Load the movie table; the relative path is resolved against the current
# working directory.
data = pd.read_csv("movie_dataset.csv")

# Columns whose missing values are blanked out before any string handling.
selected_features = [
    'title',
    'genres',
    'keywords',
    'tagline',
    'overview',
]

In [8]:
# Swap NaN for the empty string in every selected column in one shot, so the
# string operations applied to these columns never have to deal with NaN.
data[selected_features] = data[selected_features].fillna('')

In [10]:
# One space-separated text document per movie, built from the descriptive
# columns; this is the column handed to the TF-IDF vectoriser.
data['combined_features'] = data['genres'].str.cat(
    data[['keywords', 'tagline', 'overview']],
    sep=' ',
)

In [12]:
# TF-IDF weighting over the combined text, ignoring scikit-learn's built-in
# English stop-word list.
vectorizer = TfidfVectorizer(
    stop_words='english',
)

# Learn the vocabulary and produce one feature row per movie in a single pass.
feature_vectors = vectorizer.fit_transform(
    raw_documents=data['combined_features'],
)

In [14]:
# Pairwise cosine similarity of every movie against every other movie
# (Y=None means "compare X with itself"); row i holds the scores for movie i.
similarity = cosine_similarity(X=feature_vectors, Y=None)

In [16]:
def recommend(movie_name, top_n=10):
    """Print the titles of the movies most similar to ``movie_name``.

    The title lookup is case-insensitive and uses the first matching row.
    Candidates are ranked, highest first, by the matching row of the
    module-level ``similarity`` matrix; titles come from the module-level
    ``data`` frame. The queried movie itself is never listed.

    Parameters
    ----------
    movie_name : str
        Title to look up in ``data['title']``.
    top_n : int, default 10
        Maximum number of recommendations to print.

    Returns
    -------
    str or None
        ``"Movie not found in database."`` when no title matches; otherwise
        ``None`` once the recommendations have been printed.
    """
    titles = data['title']

    # Lower-case the title column once and keep *positional* hits:
    # `similarity` rows are addressed by position, not by index label, so
    # using `.index[0]` would break on any non-default DataFrame index.
    is_match = (titles.str.lower() == movie_name.lower()).to_numpy()
    hits = is_match.nonzero()[0]
    if hits.size == 0:
        return "Movie not found in database."
    index = int(hits[0])

    # Drop the query row explicitly instead of assuming it sorts first: a
    # movie whose feature text is empty has self-similarity 0, and an exact
    # duplicate at a lower position ties at 1.0, so "skip the first result"
    # can leave the query movie inside its own recommendations.
    candidates = [
        (position, score)
        for position, score in enumerate(similarity[index])
        if position != index
    ]
    # sorted() is stable, so equal scores keep their original row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    print(f"\nTop {top_n} movies similar to: {titles.iloc[index]}\n")
    for position, _score in candidates[:top_n]:
        print(titles.iloc[position])

In [18]:
# Example query: list the closest matches for "Avatar".
recommend(movie_name="Avatar")


Top 10 movies similar to: Avatar

Lifeforce
Moonraker
Gattaca
Trekkies
Apollo 18
The Inhabited Island
Cargo
Gravity
Lockout
Space Pirate Captain Harlock
