In [None]:
!pip install pandas numpy scikit-learn surprise nltk

### User Rating Prediction Based on Custom Input
Approach:

Filters the catalogue to movies whose genres, keywords, and cast contain the given search terms<br>
Uses collaborative filtering (SVD) to predict the user's rating for each match and returns the five highest predictions

In [None]:
import pandas as pd
import numpy as np
import ast
from surprise import SVD, Dataset, Reader

# Read credits.csv, skipping rows the parser cannot tokenize.
try:
    credits = pd.read_csv("credits.csv", low_memory=False, encoding="utf-8", on_bad_lines="skip")
except pd.errors.ParserError:
    print("Error reading credits.csv. Trying alternative parsing...")

    # Fallback: copy only the physical lines that contain an even number of
    # double quotes into credits_fixed.csv. This is a line-based heuristic, so a
    # quoted field that spans several lines is dropped as well.
    # The file is streamed line by line instead of being loaded with readlines().
    with open("credits.csv", "r", encoding="utf-8") as source_file, \
            open("credits_fixed.csv", "w", encoding="utf-8") as fixed_file:
        fixed_file.writelines(
            line for line in source_file if line.count('"') % 2 == 0
        )

    # Keep skipping malformed rows here too: the quote filter does not guarantee
    # that every remaining row has the expected number of fields.
    credits = pd.read_csv(
        "credits_fixed.csv", low_memory=False, encoding="utf-8", on_bad_lines="skip"
    )

# Load datasets
movies = pd.read_csv("movies_metadata.csv", low_memory=False, on_bad_lines="skip")
ratings = pd.read_csv("ratings.csv", low_memory=False, on_bad_lines="skip")
keywords = pd.read_csv("keywords.csv", low_memory=False, on_bad_lines="skip")

# Convert 'id' columns to a consistent numeric type before merging;
# values that cannot be parsed become NaN.
movies['id'] = pd.to_numeric(movies['id'], errors='coerce')
keywords['id'] = pd.to_numeric(keywords['id'], errors='coerce')
credits['id'] = pd.to_numeric(credits['id'], errors='coerce')

# A left merge repeats a movie row once for every right-hand row with the same
# key, and pandas also matches NaN keys with each other. Keep exactly one
# keywords/credits row per valid id so the merge cannot multiply movie rows or
# pair up rows whose id failed to parse.
keywords_by_id = keywords.dropna(subset=["id"]).drop_duplicates(subset="id")
credits_by_id = credits.dropna(subset=["id"]).drop_duplicates(subset="id")

# Now perform the merge
movies = movies.merge(keywords_by_id, on="id", how="left").merge(credits_by_id, on="id", how="left")
# Convert genre and keywords to readable format
def parse_column(text):
    """Flatten a stringified list of ``{"name": ...}`` dicts into one string.

    Parameters
    ----------
    text : str
        Python-literal text such as ``"[{'id': 28, 'name': 'Action'}]"``.
        Any other value (for example NaN) is treated as unparseable.

    Returns
    -------
    str
        The ``name`` values joined by single spaces, or ``""`` when ``text``
        cannot be evaluated as a list of dicts that each carry a ``name`` key.
    """
    try:
        return " ".join(item["name"] for item in ast.literal_eval(text))
    # Only the failures literal_eval / item access can produce are treated as
    # "no data"; KeyboardInterrupt and SystemExit are no longer swallowed.
    except (ValueError, SyntaxError, TypeError, KeyError, MemoryError, RecursionError):
        return ""

# Replace the stringified lists in 'genres' and 'keywords' with plain
# space-separated names.
movies["genres"] = movies["genres"].apply(parse_column)
movies["keywords"] = movies["keywords"].apply(parse_column)

# Train collaborative filtering model on every available rating.
# The rating scale is taken from the observed minimum and maximum.
reader = Reader(rating_scale=(ratings["rating"].min(), ratings["rating"].max()))
data = Dataset.load_from_df(ratings[["userId", "movieId", "rating"]], reader)
trainset = data.build_full_trainset()
# SVD initialises its factors randomly; a fixed seed makes the fitted model and
# every prediction below reproducible across kernel restarts.
svd = SVD(random_state=42)
svd.fit(trainset)

# Function to predict user rating based on input
def predict_user_rating(user_id, genre, keyword, actor):
    """Predict how ``user_id`` would rate the movies matching the given terms.

    A movie matches when ``genre``, ``keyword`` and ``actor`` each occur
    (case-insensitively, as literal substrings) in its ``genres``,
    ``keywords`` and ``cast`` columns respectively.

    Parameters
    ----------
    user_id :
        Raw user id passed to ``svd.predict``.
    genre, keyword, actor : str
        Search terms; matched literally, not as regular expressions.

    Returns
    -------
    pandas.DataFrame or str
        Up to five rows with ``title`` and ``predicted_rating``, sorted by
        descending prediction, or the string ``"No matching movies found."``
        when nothing matches.

    NOTE(review): predictions are looked up with ``movies["id"]`` while the
    model was trained on ``ratings["movieId"]``; this assumes both columns use
    the same id space — TODO confirm, and map the ids first if they do not.
    """
    # regex=False: the terms are user input, so characters such as "(" or "+"
    # must be matched literally instead of being parsed as a pattern.
    matches = (
        movies["genres"].str.contains(genre, case=False, na=False, regex=False)
        & movies["keywords"].str.contains(keyword, case=False, na=False, regex=False)
        & movies["cast"].str.contains(actor, case=False, na=False, regex=False)
    )

    # Work on an explicit copy so adding a column does not write to a slice of
    # the shared `movies` frame (SettingWithCopyWarning).
    filtered_movies = movies.loc[matches, ["id", "title"]].copy()

    if filtered_movies.empty:
        return "No matching movies found."

    numeric_ids = pd.to_numeric(filtered_movies["id"], errors="coerce")

    def _estimate(movie_id):
        # The id column is float after numeric coercion, so a string check like
        # "862.0".isdigit() rejects every id. Validate numerically instead and
        # fall back to 0 for missing, negative or non-integral ids.
        if pd.isna(movie_id) or movie_id < 0 or movie_id != int(movie_id):
            return 0
        return svd.predict(user_id, int(movie_id)).est

    filtered_movies["predicted_rating"] = numeric_ids.apply(_estimate)

    # Return top-rated predictions
    return filtered_movies[["title", "predicted_rating"]].sort_values(
        by="predicted_rating", ascending=False
    ).head(5)

# Example usage: prints either the table of top predictions for user 1 or the
# "No matching movies found." message when no movie matches all three terms.
print(predict_user_rating(1, "Action", "hero", "Tom Cruise"))


No matching movies found.


### Recommend Movies User Will Like (Top 10 Rated by User)
Approach:<br>
This function takes the 10 movies a given user rated highest and, for each of them, recommends the most similar movies by TF-IDF cosine similarity over genres, keywords, and overview.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# One text document per movie: genres, then keywords, then the overview
# (an empty string stands in for a missing overview).
movies["combined_features"] = (
    movies["genres"]
    .add(" ")
    .add(movies["keywords"])
    .add(" ")
    .add(movies["overview"].fillna(""))
)

# Turn the documents into TF-IDF vectors, ignoring English stop words.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(movies["combined_features"])

# Pairwise cosine similarity between all movies; with a single argument the
# matrix is compared against itself.
# NOTE(review): the result has one row and one column per movie, so memory use
# grows quadratically with the number of movies.
cosine_sim = cosine_similarity(tfidf_matrix)

# Function to recommend movies based on top-rated movies by a user
def recommend_movies_from_user_ratings(user_id):
    """Recommend movies similar in content to the user's highest-rated ones.

    For each of the user's 10 highest-rated movies that can be found in
    ``movies``, the two most similar other movies (by ``cosine_sim``) are
    collected.

    Parameters
    ----------
    user_id :
        Value matched against ``ratings["userId"]``.

    Returns
    -------
    list
        Up to 10 distinct titles in the order they were collected; empty when
        the user has no ratings or none of the rated movies is in ``movies``.

    NOTE(review): rated movies are looked up via ``movies["id"] == movieId``,
    which assumes both columns use the same id space — TODO confirm.
    """
    # Get top 10 highest-rated movies by user
    user_ratings = ratings[ratings["userId"] == user_id].sort_values(
        by="rating", ascending=False
    ).head(10)

    # Compare ids numerically: the id column is numeric, so comparing it with
    # str(movie_id) never matches. Computed once, outside the loop.
    catalogue_ids = pd.to_numeric(movies["id"], errors="coerce").to_numpy()

    recommended_movies = []
    for movie_id in user_ratings["movieId"]:
        # Row positions (not index labels), because cosine_sim is positional.
        positions = np.flatnonzero(catalogue_ids == movie_id)
        if positions.size == 0:
            continue
        position = positions[0]

        # Stable descending order of similarity; the movie itself is removed
        # explicitly instead of assuming it always sorts first.
        ranked = np.argsort(-np.asarray(cosine_sim[position]), kind="stable")
        neighbours = [other for other in ranked if other != position][:2]

        for other in neighbours:
            title = movies.iloc[other]["title"]
            if pd.notna(title):
                recommended_movies.append(title)

    # dict.fromkeys removes duplicates while keeping a deterministic order
    # (set() would return an arbitrary subset of 10).
    return list(dict.fromkeys(recommended_movies))[:10]

# Example usage: prints the list of recommended titles for user 1.
print(recommend_movies_from_user_ratings(1))


### Hybrid Recommendation (Users Like You Have Also Liked)
Approach:<br>
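This function finds the five users whose rating patterns are most similar to the given user's (cosine similarity over the user–movie rating matrix) and recommends the movies those users rated highest on average.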

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# User-item matrix: one row per user, one column per movie, holding the rating.
# Movies a user has not rated are filled with 0.
user_movie_ratings = (
    ratings
    .pivot(values="rating", columns="movieId", index="userId")
    .fillna(0)
)

# Cosine similarity between every pair of user rows.
user_similarity = cosine_similarity(user_movie_ratings)

# Function to get recommendations based on similar users
def users_like_you_recommendations(user_id):
    """Recommend movies that the most similar users rated highly.

    The five users with the highest ``user_similarity`` to ``user_id`` are
    selected, their ratings are averaged per movie, and the ten best-rated
    movies that ``user_id`` has not rated yet are returned.

    Parameters
    ----------
    user_id :
        Label looked up in ``user_movie_ratings.index``.

    Returns
    -------
    list or str
        Titles ordered from highest to lowest mean rating (only movies present
        in ``movies``), or the string ``"User not found"`` when ``user_id`` is
        not in the user-item matrix.

    NOTE(review): titles are looked up via ``movies["id"]`` using
    ``ratings["movieId"]`` values, which assumes both columns use the same id
    space — TODO confirm.
    """
    if user_id not in user_movie_ratings.index:
        return "User not found"

    # Rank all users by similarity (stable, descending) and drop the target
    # user explicitly: with ties, slicing off the first entry could keep the
    # user themself and discard a real neighbour.
    user_idx = user_movie_ratings.index.get_loc(user_id)
    scores = user_similarity[user_idx]
    ranked = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)
    neighbour_positions = [pos for pos in ranked if pos != user_idx][:5]  # Top 5 similar users
    similar_user_ids = [user_movie_ratings.index[pos] for pos in neighbour_positions]

    # Ratings by the similar users, excluding movies the target user already rated.
    already_rated = ratings.loc[ratings["userId"] == user_id, "movieId"]
    similar_users_ratings = ratings[
        ratings["userId"].isin(similar_user_ids)
        & ~ratings["movieId"].isin(already_rated)
    ]

    # Ten movies with the highest mean rating among the similar users.
    top_movie_ids = (
        similar_users_ratings.groupby("movieId")["rating"]
        .mean()
        .sort_values(ascending=False)
        .head(10)
        .index
    )

    # Return titles in ranking order (one per movie id) rather than in the
    # order the rows happen to appear in `movies`.
    rank_of = {movie_id: rank for rank, movie_id in enumerate(top_movie_ids)}
    hits = movies[movies["id"].isin(top_movie_ids)].drop_duplicates(subset="id")
    hits = hits.assign(_rank=hits["id"].map(rank_of.get)).sort_values("_rank")
    return hits["title"].dropna().tolist()

# Example usage: prints a list of titles, or "User not found" when user 1 is
# not present in the user-item matrix.
print(users_like_you_recommendations(1))
