In [95]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movielens-1m/movies.csv
/kaggle/input/movielens-1m/ratings.csv
/kaggle/input/movielens-1m/users.csv


# Import libraries

In [96]:
#Import Libraries (Updated)
import pandas as pd
import numpy as np
import re
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise.model_selection import train_test_split as surprise_train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
%matplotlib inline

# Set visualization style
plt.style.use('seaborn')
sns.set_palette('pastel')

  plt.style.use('seaborn')


# Load data and Explore Data

In [None]:
#Load Data 
movies = pd.read_csv("/kaggle/input/movielens-1m/movies.csv", sep=",", engine="python", encoding="latin-1",header=0,
                     names=["movieId", "title", "genres"])
ratings = pd.read_csv("/kaggle/input/movielens-1m/ratings.csv", sep=",", engine="python", encoding="latin-1",header=0,
                      names=["userId", "movieId", "rating", "timestamp"])
users = pd.read_csv("/kaggle/input/movielens-1m/users.csv", sep=",", engine="python", encoding="latin-1",header=0,
                    names=["userId", "gender", "age", "occupation", "zip"])


In [None]:
print("Movies Dataset Shape:", movies.shape)
print("Ratings Dataset Shape:", ratings.shape)
print("Users Dataset Shape:", users.shape)

In [None]:
movies.head()

In [None]:
ratings.head()

In [None]:
users.head()

In [None]:
movies = movies[movies['title'] != "title"]

In [None]:
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)', expand=False)
movies['year'] = pd.to_numeric(movies['year'], errors='coerce')

print(movies[['title', 'year']].head(10))


In [None]:
df = ratings.merge(movies, on="movieId", how="left")

df = df.merge(users, on="userId", how="left")


df.head()


# Data Preprocessing

In [None]:
def clean_title(title):
    if isinstance(title, str):
        return re.sub("[^a-zA-Z0-9 ]", "", title)   # remove special chars
    else:
        return ""

In [None]:
movies = movies.dropna(subset=["title"])

movies_clean = movies.copy()
movies_clean["title"] = movies_clean["title"].apply(clean_title)

In [None]:

movies_clean["genres"] = movies_clean["genres"].fillna("").astype(str)

movies_clean["genres"] = movies_clean["genres"].str.split("|")

movies_clean = movies_clean[movies_clean["genres"].apply(lambda x: "(no genres listed)" not in x)]

movies_clean["genres_text"] = movies_clean["genres"].apply(lambda x: " ".join(x))


In [None]:
ratings_clean = ratings.drop("timestamp", axis=1)

In [None]:
print(f"✅ Clean Movies Dataset: {movies_clean.shape[0]} movies")
print(f"✅ Clean Ratings Dataset: {ratings_clean.shape[0]} ratings")
print(f"✅ Unique Users: {ratings_clean['userId'].nunique()}")

In [None]:
print(df.isnull().sum())

In [None]:
num_cols = df.select_dtypes(include=np.number).columns.tolist()
cat_cols = df.select_dtypes(include=['object']).columns.tolist()

for col in num_cols:
    mean_val = df[col].median()
    df[col].fillna(mean_val, inplace=True)

for col in cat_cols:
    mode_val = df[col].mode()[0]
    df[col].fillna(mode_val, inplace=True)

In [None]:
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [None]:
#Final Validation
if df is not None:
    print("\n🔍 Final Data Overview:")
    print(f"Total records: {len(df)}")
    print(f"Columns ({len(df.columns)}): {list(df.columns)}")
    print("\n📊 Sample Data:")
    display(df.head(3))
    
    print("\n🧮 Basic Stats:")
    print(df[['rating', 'age', 'year']].describe())

# EDA (Exploratory Data Analysis) and Visualization

In [None]:
#Basic Data Overview
print("📊 Data Overview:")
print("1. Users Data:")
print(users.info())
print("\n2. Movies Data:")
print(movies.info())
print("\n3. Ratings Data:")
print(ratings.info())

In [None]:
#Ratings Distribution Analysis
plt.figure(figsize=(10, 6))
sns.countplot(x='rating', data=ratings)
plt.title('Distribution of Movie Ratings', fontsize=14)
plt.xlabel('Rating Score (1-5)')
plt.ylabel('Count')
plt.show()

# Calculate statistics
print("\n📈 Ratings Statistics:")
print(ratings['rating'].describe())

In [None]:
 #User Demographics Analysis
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Gender Distribution
sns.countplot(x='gender', data=users, ax=ax1)
ax1.set_title('Gender Distribution')

# Age Distribution
sns.histplot(x='age', data=users, bins=20, ax=ax2)
ax2.set_title('age Distribution')

plt.tight_layout()
plt.show()

In [None]:
#Movie Genres Analysis
# Count individual genres
genre_counts = movies['genres'].str.split('|').explode().value_counts()

plt.figure(figsize=(12, 8))
genre_counts.plot(kind='barh')
plt.title('Most Common Movie Genres', fontsize=14)
plt.xlabel('Number of Movies')
plt.ylabel('Genre')
plt.show()

In [None]:
# Word Cloud for Movie Titles
# Combine all titles into one text
all_titles = ' '.join(movies['title'].values)

# Generate word cloud
wordcloud = WordCloud(width=800, height=400, 
                     background_color='white').generate(all_titles)

plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Most Frequent Words in Movie Titles', fontsize=14)
plt.show()

In [None]:
# User Engagement Analysis
user_activity = ratings['userId'].value_counts()

plt.figure(figsize=(12, 6))
sns.histplot(user_activity, bins=50)
plt.title('Distribution of Ratings per User')
plt.xlabel('Number of Ratings Given')
plt.ylabel('Number of Users')
plt.show()

print("\nTop 5 Most Active Users:")
print(user_activity.head())

In [None]:
# Movie Popularity Analysis
movie_popularity = ratings['movieId'].value_counts()

plt.figure(figsize=(12, 6))
sns.histplot(movie_popularity, bins=50)
plt.title('Distribution of Ratings per Movie')
plt.xlabel('Number of Ratings Received')
plt.ylabel('Number of Movies')
plt.show()

print("\nTop 5 Most Rated Movies:")
top_movies = movie_popularity.head().index.tolist()
print(movies[movies['movieId'].isin(top_movies)]['title'])

# Building User-Based Collaborative Filtering

In [None]:
# Make sure the rating is numbers.
ratings['rating'] = pd.to_numeric(ratings['rating'], errors='coerce')

# User-Item Matrix
user_item_matrix = ratings.pivot_table(
    index='userId', 
    columns='movieId', 
    values='rating'
)

print("User-Item Matrix built successfully!")
print("Shape:", user_item_matrix.shape)


In [None]:
def normalize_ratings(user_item_matrix):
    """
    Normalizing user ratings by removing the average rating of each user 
    improves the accuracy of the similarity calculation between users.
    """
    user_means = user_item_matrix.mean(axis=1)
    normalized_matrix = user_item_matrix.sub(user_means, axis=0)
    return normalized_matrix.fillna(0)

#  Applay Normalization on the matrix 
user_item_normalized = normalize_ratings(user_item_matrix)
user_item_filled = user_item_normalized.fillna(0)

print("Data normalized successfully!")

In [None]:
# Fill NaN with 0 for similarity calc
user_item_filled = user_item_matrix.fillna(0)

# Compute user-user similarity
user_similarity = cosine_similarity(user_item_filled)
user_similarity_df = pd.DataFrame(user_similarity, 
                                  index=user_item_matrix.index, 
                                  columns=user_item_matrix.index)


print("User Similarity Matrix ready!")

In [None]:
# Compute item-item similarity
item_similarity = cosine_similarity(user_item_filled.T)
item_similarity_df = pd.DataFrame(item_similarity, 
                                  index=user_item_matrix.columns, 
                                  columns=user_item_matrix.columns)

print("Item Similarity Matrix ready!")


In [None]:
def handle_new_user(user_id, user_item_matrix, movies, top_n=5):
    """
    Dealing with new users by recommending 
    the most popular or highest-rated movies.
    """
    if user_id not in user_item_matrix.index:
        print(f"User {user_id} is new. Returning popular movies.")
        
        #Finding the highest-rated movies
        movie_stats = ratings.groupby('movieId')['rating'].agg(['count', 'mean'])
        movie_stats = movie_stats[movie_stats['count'] > 10]  # على الأقل 10 تقييمات
        
        # Ranking movies based on average rating and number of ratings.
        movie_stats['score'] = movie_stats['mean'] * np.log1p(movie_stats['count'])
        top_movies = movie_stats.sort_values('score', ascending=False).head(top_n)
        
        return movies[movies['movieId'].isin(top_movies.index)]['title'].tolist()
    return None

In [None]:
def recommend_movies_user(user_id, user_item_matrix, user_similarity_df, movies, top_n=5):
    """
 user_id: The user ID for recommendations
 user_item_matrix: User × Movie matrix
 user_similarity_df: User similarity matrix
 movies: Movies table
 (movieId → title)top_n: Number of recommendations
    """
    
    #Checking the new user
    popular_movies = handle_new_user(user_id, user_item_matrix, movies, top_n)
    if popular_movies is not None:
        return popular_movies
    
    #Ratings of this user
    user_ratings = user_item_matrix.loc[user_id]
    
    #Users similar to this user
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:11]  # top 10 neighbors
    
    # Weighted ratings prediction
    weighted_scores = pd.Series(dtype=float)
    for neighbor_id, sim_score in similar_users.items():
        neighbor_ratings = user_item_matrix.loc[neighbor_id]
        
        # Weigh the rating of the neighbor by similarity.
        weighted_scores = weighted_scores.add(neighbor_ratings * sim_score, fill_value=0)
    
    # Normalize by sum of similarities
    sim_sums = similar_users.sum()
    predicted_ratings = weighted_scores / sim_sums
    
    #Remove movies already watched by user
    already_seen = user_ratings[user_ratings.notna()].index
    predicted_ratings = predicted_ratings.drop(already_seen, errors='ignore')
    
    #Get top recommendations
    top_movies = predicted_ratings.sort_values(ascending=False).head(top_n)
    
    return movies.set_index("movieId").loc[top_movies.index]["title"].tolist()

In [None]:
def improved_item_based_recommendations(movie_input, movies, item_similarity_df, top_n=5, min_similarity=0.2):
    """
    Enhanced item-based recommendation system with better handling
    """
    try:
        # Handle both movie ID and movie title inputs
        if isinstance(movie_input, str):
            # Search for movie by title
            movie_match = movies[movies['title'].str.contains(movie_input, case=False, na=False)]
            if len(movie_match) == 0:
                print(f"Movie '{movie_input}' not found. Returning popular movies instead.")
                return get_popular_movies(movies, top_n)
            movie_id = movie_match.iloc[0]['movieId']
        else:
            # Use movie ID directly
            movie_id = movie_input
        
        # Check if movie exists in similarity matrix
        if movie_id not in item_similarity_df.columns:
            print(f"Movie ID {movie_id} not in similarity matrix. Returning popular movies.")
            return get_popular_movies(movies, top_n)
        
        # Get similarity scores with minimum threshold
        sim_scores = item_similarity_df[movie_id]
        sim_scores = sim_scores[sim_scores >= min_similarity]
        
        # Remove the movie itself and sort results
        sim_scores = sim_scores.drop(movie_id, errors='ignore').sort_values(ascending=False)
        
        if len(sim_scores) == 0:
            # Fallback to popular movies if no similar movies found
            print("No similar movies found. Returning popular movies.")
            return get_popular_movies(movies, top_n)
        
        # Get top recommendations
        top_movies = sim_scores.head(top_n).index
        return movies[movies['movieId'].isin(top_movies)][['movieId', 'title', 'genres']]
        
    except Exception as e:
        print(f"Error in recommendation: {e}")
        return get_popular_movies(movies, top_n)

def get_popular_movies(movies, top_n=5):
    """
    Get popular movies as fallback recommendation
    """
    # Calculate movie popularity based on rating count and average
    movie_stats = ratings.groupby('movieId')['rating'].agg(['count', 'mean'])
    movie_stats = movie_stats[movie_stats['count'] > 10]  # Minimum 10 ratings
    movie_stats['score'] = movie_stats['mean'] * np.log1p(movie_stats['count'])
    top_movies = movie_stats.sort_values('score', ascending=False).head(top_n)
    
    return movies[movies['movieId'].isin(top_movies.index)][['movieId', 'title', 'genres']]

# First, let's find the Toy Story movie ID
toy_story_movies = movies[movies['title'].str.contains('Toy Story', case=False, na=False)]
if not toy_story_movies.empty:
    toy_story_id = toy_story_movies.iloc[0]['movieId']
    print(f"Toy Story found with ID: {toy_story_id}")
    
    # Test the improved function
    improved_item_recs = improved_item_based_recommendations(toy_story_id, movies, item_similarity_df, 5)
    print("Improved item-based recommendations:")
    print(improved_item_recs)
else:
    print("Toy Story not found in the database")

In [None]:

# Experience with an existing user
sample_user = 1
print("Recommendations for Existing User", sample_user)
print(recommend_movies_user(sample_user, user_item_matrix, user_similarity_df, movies, top_n=5))

#Experience with a new user (not present in the data)
new_user_id = max(user_item_matrix.index) + 1  #Create a new user ID
print(f"\n Recommendations for New User {new_user_id}")
print(recommend_movies_user(new_user_id, user_item_matrix, user_similarity_df, movies, top_n=5))

#Item-based recommendation function (remains the same)
def recommend_similar_movies(movie, movies, item_similarity_df, top_n=10):
    """
  movie: The movieId can be either an int or a title (string)
  movies: A table of movies (movieId → title) 
  item_similarity_df: A matrix of similarity between movies
  top_n: The number of movies that will be returned
    """
    
    # If input is title → get its ID
    if isinstance(movie, str):
        if movie not in movies["title"].values:
            raise ValueError("Movie title not found!")
        movie_id = movies[movies["title"] == movie]["movieId"].values[0]
    else:
        movie_id = movie
    
    # Similarity scores
    sim_scores = item_similarity_df[movie_id].sort_values(ascending=False).drop(movie_id)
    
    # Top N similar
    top_movies = sim_scores.head(top_n).index
    return movies[movies["movieId"].isin(top_movies)][["movieId", "title"]]

print("\n Item-based recommendations for movie ID 1:")
print(recommend_similar_movies(1, movies, item_similarity_df, top_n=5))

print("\n Item-based recommendations for 'Toy Story (1995)':")
print(recommend_similar_movies("Toy Story (1995)", movies, item_similarity_df, top_n=5))

In [None]:

def evaluate_precision_at_k(user_id, user_item_matrix, user_similarity_df, movies, k=5):
    """
    Evaluating the accuracy of recommendations using Precision@K
    
    """
    #Data splitting into training and testing
    user_ratings = user_item_matrix.loc[user_id]
    rated_movies = user_ratings[user_ratings.notna()].index
    
    if len(rated_movies) < 10:  #You need sufficient evaluations for the assessment.
        return None
    
    #Random splitting
    train_movies, test_movies = train_test_split(rated_movies, test_size=0.2, random_state=42)
    
    #Creating a user-item matrix for training
    train_matrix = user_item_matrix.copy()
    train_matrix.loc[user_id, test_movies] = np.nan
    
    #Recalculate similarity based on training data.
    train_normalized = normalize_ratings(train_matrix)
    train_filled = train_normalized.fillna(0)
    train_similarity = cosine_similarity(train_filled)
    train_similarity_df = pd.DataFrame(train_similarity, 
                                      index=train_matrix.index, 
                                      columns=train_matrix.index)
    
    
    recommendations = recommend_movies_user(user_id, train_matrix, train_similarity_df, movies, top_n=k)
    recommended_movies = movies[movies['title'].isin(recommendations)]['movieId'].values
    
    # calculate Precision@K
    relevant_items = set(test_movies)
    recommended_items = set(recommended_movies)
    relevant_and_recommended = relevant_items.intersection(recommended_items)
    
    return len(relevant_and_recommended) / k
    
#evaluation System for multiple users
def evaluate_system(user_item_matrix, user_similarity_df, movies, n_users=10, k=5):
    precisions = []
    evaluated_users = 0
    
    for user_id in user_item_matrix.index[:n_users]:
        precision = evaluate_precision_at_k(user_id, user_item_matrix, user_similarity_df, movies, k)
        if precision is not None:
            precisions.append(precision)
            evaluated_users += 1
    
    if evaluated_users > 0:
        avg_precision = np.mean(precisions)
        print(f"Average Precision@{k} for {evaluated_users} users: {avg_precision:.4f}")
        return avg_precision
    else:
        print("Not enough data for evaluation")
        return None

In [None]:

def improved_svd_recommendations(user_id, top_n=5):
    """
    Using the Surprise library to improve SVD
    """
    #Preparing the data for Surprise format
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
    
    #Using SVD with regularization
    algo = SVD(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02)
    
    #Training on the full data
    trainset = data.build_full_trainset()
    algo.fit(trainset)
    
    #Generate recommendations
    user_inner_id = trainset.to_inner_uid(user_id)
    user_ratings = trainset.ur[user_inner_id]
    
    #Getting predictions for all movies
    predictions = []
    for movie_inner_id in range(trainset.n_items):
        pred = algo.predict(user_id, trainset.to_raw_iid(movie_inner_id))
        predictions.append((pred.iid, pred.est))
    
    #Sorting recommendations
    predictions.sort(key=lambda x: x[1], reverse=True)

    #Exclude the movies that the user has watched.
    user_watched = [trainset.to_raw_iid(movie_inner_id) for movie_inner_id, _ in user_ratings]
    recommendations = [movie for movie, rating in predictions if movie not in user_watched]
    
    return movies[movies['movieId'].isin(recommendations[:top_n])]['title'].tolist()

#Testing the improved SVD
print("Improved SVD Recommendations:")
improved_svd_recs = improved_svd_recommendations(1, 5)
print(improved_svd_recs)

In [None]:
def matrix_factorization_recommendations(user_id, user_item_matrix, movies, top_n=5):
    """
    Generating recommendations using matrix factorization (SVD)
    """
    try:
        #Converting the array to numerical values
        R = user_item_matrix.fillna(0).values
        
        # Applay SVD
        U, sigma, Vt = np.linalg.svd(R, full_matrices=False)
        sigma = np.diag(sigma)
        
        #Rebuild the array with predictions
        predicted_ratings = np.dot(np.dot(U, sigma), Vt)
        predicted_df = pd.DataFrame(predicted_ratings, 
                                   index=user_item_matrix.index, 
                                   columns=user_item_matrix.columns)
        
        # Getting predictions for the user
        user_predictions = predicted_df.loc[user_id]
        
        # Remove the movies that the user has already watched.
        user_ratings = user_item_matrix.loc[user_id]
        already_rated = user_ratings[user_ratings.notna()].index
        user_predictions = user_predictions.drop(already_rated, errors='ignore')
        
        # Getting the best recommendations
        top_movies = user_predictions.sort_values(ascending=False).head(top_n)
        
        return movies.set_index("movieId").loc[top_movies.index]["title"]
    
    except Exception as e:
        print(f"SVD Error: {e}")
        # Going back to the recommendations based on the user.
        return recommend_movies_user(user_id, user_item_matrix, user_similarity_df, movies, top_n)

def recommend_movies(user_id, method='user_based', top_n=5):
    """
    A unified recommendation function that supports multiple methods
    """
    if method == 'user_based':
        return recommend_movies_user(user_id, user_item_matrix, user_similarity_df, movies, top_n)
    elif method == 'item_based':
        # To obtain based on items, we need a reference movie.
        # We use the movie Toy Story as a default.
        return recommend_similar_movies(1, movies, item_similarity_df, top_n)
    elif method == 'svd':
        return matrix_factorization_recommendations(user_id, user_item_matrix, movies, top_n)
    else:
        raise ValueError("Method must be 'user_based', 'item_based', or 'svd'")

def recommend_similar_movies(movie_id, movies, item_similarity_df, top_n=5):
    """
    The recommendation based on similar movies
    """
    try:
        # Getting similarity scores
        sim_scores = item_similarity_df[movie_id].sort_values(ascending=False)
        
        #Removing the film
        sim_scores = sim_scores.drop(movie_id, errors='ignore')
        
        # Getting the best similar movies
        top_movies = sim_scores.head(top_n).index
        
        return movies[movies['movieId'].isin(top_movies)][["movieId", "title"]]
    
    except Exception as e:
        print(f"Item-based recommendation error: {e}")
        # Back to popular movies
        movie_stats = ratings.groupby('movieId')['rating'].agg(['count', 'mean'])
        movie_stats = movie_stats[movie_stats['count'] > 10]
        movie_stats['score'] = movie_stats['mean'] * np.log1p(movie_stats['count'])
        top_movies = movie_stats.sort_values('score', ascending=False).head(top_n)
        return movies[movies['movieId'].isin(top_movies.index)][["movieId", "title"]]

def recommend_movies_user(user_id, user_item_matrix, user_similarity_df, movies, top_n=5):
    """
   The recommendation is 
   based on similar users.
    """
    try:
        # User Ratings
        user_ratings = user_item_matrix.loc[user_id]
        
        # similar_users
        similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:11]

        # Calculation of Weighted Ratings
        weighted_scores = pd.Series(dtype=float)
        for neighbor_id, sim_score in similar_users.items():
            neighbor_ratings = user_item_matrix.loc[neighbor_id]
            weighted_scores = weighted_scores.add(neighbor_ratings * sim_score, fill_value=0)
        
        # Normalization of results
        sim_sums = similar_users.sum()
        predicted_ratings = weighted_scores / sim_sums
        
        # Remove watched movies
        already_seen = user_ratings[user_ratings.notna()].index
        predicted_ratings = predicted_ratings.drop(already_seen, errors='ignore')
        
        # Getting the best recommendations
        top_movies = predicted_ratings.sort_values(ascending=False).head(top_n)
        
        return movies.set_index("movieId").loc[top_movies.index]["title"].tolist()
    
    except Exception as e:
        print(f"User-based recommendation error: {e}")
        #Back to the popular movies
        movie_stats = ratings.groupby('movieId')['rating'].agg(['count', 'mean'])
        movie_stats = movie_stats[movie_stats['count'] > 10]
        movie_stats['score'] = movie_stats['mean'] * np.log1p(movie_stats['count'])
        top_movies = movie_stats.sort_values('score', ascending=False).head(top_n)
        return movies[movies['movieId'].isin(top_movies.index)]['title'].tolist()



In [132]:
# Activate the evaluation function.
print("\n" + "="*50)
print("Evaluate the system's performance")
print("="*50)

# Evaluation of the system on the first 15 users
avg_precision = evaluate_system(user_item_matrix, user_similarity_df, movies, n_users=15, k=5)
print(f"Precision@5 Average : {avg_precision:.4f}")

# Try SVD Recommendation 
print("\n" + "="*50)
print("Recommendations using Singular Value Decomposition (SVD)")
print("="*50)

# Try SVD  for an Existing User
try:
    svd_recommendations = matrix_factorization_recommendations(1, user_item_matrix, movies, top_n=5)
    print(f"recommendations SVD for user 1: {svd_recommendations.tolist()}")
except Exception as e:
    print(f" Error in SVD: {e}")

# Unified recommendation function test
print("\n" + "="*50)
print("Unified recommendation function test")
print("="*50)


# Searching for the Toy Story movie ID
toy_story_id = movies[movies['title'].str.contains('Toy Story')]['movieId'].values[0]
print(f"Toy Story movie ID: {toy_story_id}")

# Testing All Recommendation Methods
methods = ['user_based', 'item_based', 'svd']

for method in methods:
    try:
        if method == 'item_based':
            # To recommend based on items, we need to pass the movie ID.
            recommendations = recommend_similar_movies(toy_story_id, movies, item_similarity_df, top_n=5)
            print(f"\n Recommendations {method} based on Toy Story:")
            print(recommendations)
        else:
            recommendations = recommend_movies(1, method=method, top_n=5)
            print(f"\n Recommendations {method} For user 1: {recommendations}")
    except Exception as e:
        print(f" Error in {method}: {e}")


#Comparison of different recommendation methods
print("\n" + "="*50)
print(" Comparison of different recommendation methods")
print("="*50)

#Comparison of recommendations for user 1 using different methods
user_id = 1

print(f"User: {user_id}")
print("-" * 30)

# recommendation based on users
user_based_recs = recommend_movies(user_id, 'user_based', 5)
print(f"recommendation based on users: {user_based_recs}")

# Recommendation based on items (using a favorite movie of the user)
# Let's assume that the movie Toy Story is one of his favorite films.
item_based_recs = recommend_similar_movies(toy_story_id, movies, item_similarity_df, 5)
print(f"Recommendation based on items ( based on Toy Story):")
print(item_based_recs)

# Recommendation using SVD
svd_recs = recommend_movies(user_id, 'svd', 5)
print(f"Recommendation using SVD : {svd_recs}")


 Recommendations svd For user 1: movieId
2                Jumanji (1995)
142     Shadows (Cienie) (1988)
10             GoldenEye (1995)
1380              Grease (1978)
1614            In & Out (1997)
Name: title, dtype: object

 Comparison of different recommendation methods
User: 1
------------------------------
recommendation based on users: ['Little Mermaid, The (1989)', 'Jungle Book, The (1967)', 'Sleeping Beauty (1959)', 'Lion King, The (1994)', 'Fantasia (1940)']
Recommendation based on items ( based on Toy Story):
      movieId                      title
584       588             Aladdin (1992)
1245     1265       Groundhog Day (1993)
1250     1270  Back to the Future (1985)
2286     2355       Bug's Life, A (1998)
3045     3114         Toy Story 2 (1999)
Recommendation using SVD : movieId
2                Jumanji (1995)
142     Shadows (Cienie) (1988)
10             GoldenEye (1995)
1380              Grease (1978)
1614            In & Out (1997)
Name: title, dtype: object


In [133]:
def analyze_precision_issues():
    """
    A detailed analysis of the relatively low accuracy reasons.
    """
    # Study of the distribution of ratings
    rating_distribution = ratings['rating'].value_counts().sort_index()
    print("Distribution of ratings:")
    print(rating_distribution)
    
    # Study the number of ratings per user
    user_rating_counts = ratings.groupby('userId')['movieId'].count()
    print(f"\n Average ratings per user: {user_rating_counts.mean():.2f}")
    print(f" Min user ratings : {user_rating_counts.min()}")
    print(f" Max user ratings: {user_rating_counts.max()}")
    
    #Sparsity analysis of the matrix
    matrix_size = user_item_matrix.shape[0] * user_item_matrix.shape[1]
    non_na_count = user_item_matrix.count().sum()
    sparsity = 1 - (non_na_count / matrix_size)
    print(f"\nSparsity matrix: {sparsity:.4f} ({sparsity*100:.2f}%)")
    
    return sparsity

matrix_sparsity = analyze_precision_issues()

Distribution of ratings:
rating
1     56174
2    107557
3    261197
4    348971
5    226310
Name: count, dtype: int64

 Average ratings per user: 165.60
 Min user ratings : 20
 Max user ratings: 2314

Sparsity matrix: 0.9553 (95.53%)


In [134]:
train_ratings, test_ratings = train_test_split(ratings, test_size=0.2, random_state=42)

In [135]:

def evaluate_svd_with_surprise_fast():
    """
    SVD evaluation using the Surprise library.
    """
    # prepar Data
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
    
    # Split Data
    trainset, testset = surprise_train_test_split(data, test_size=0.2, random_state=42)
    
    # Training Model
    algo = SVD(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02)
    algo.fit(trainset)
    
    # predictions and Evaluation
    predictions = algo.test(testset)
    
 
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    
     #Calculating Precision@K more efficiently
    def precision_at_k_fast(predictions, k=5, threshold=4):
        """
        Calculating Precision@K  
        """
        # Creating a dictionary for quick access to real ratings.
        true_ratings_dict = {}
        for uid, iid, true_r, est, _ in predictions:
            if uid not in true_ratings_dict:
                true_ratings_dict[uid] = {}
            true_ratings_dict[uid][iid] = true_r
        
        # Aggregating predictions for each user
        user_predictions = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            user_predictions[uid].append((iid, est))
        
        # Calculating Precision@K for all users 
        precisions = []
        for uid, preds in user_predictions.items():
            
            # Sort the predictions in descending order
            preds.sort(key=lambda x: x[1], reverse=True)
            
            # Top K Recommendations
            top_k = preds[:k]
            
            # Calculating Hits
            hits = 0
            for iid, est in top_k:
                true_rating = true_ratings_dict[uid].get(iid, None)
                if true_rating is not None and true_rating >= threshold:
                    hits += 1
            
            precision = hits / k
            precisions.append(precision)
        
        return np.mean(precisions) if precisions else 0
    
    # Calculating Precision@K  
    precision = precision_at_k_fast(predictions, k=5, threshold=4)
    print(f"Precision@5: {precision:.4f}")
    
    return rmse, mae, precision

rmse, mae, precision = evaluate_svd_with_surprise_fast()

RMSE: 0.8733
MAE:  0.6846
RMSE: 0.8733
MAE: 0.6846
Precision@5: 0.7872


In [136]:
def final_recommendation_system(user_id, top_n=10):
    """
    The final system that combines power and precision.

    """
    #For new users: Popular movies
    if user_id not in ratings['userId'].values:
        return get_popular_movies(movies, top_n)
    
    #For old users: SVD with Surprise
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
    
    #Using the best parameters
    algo = SVD(n_factors=100, n_epochs=25, lr_all=0.007, reg_all=0.1)
    trainset = data.build_full_trainset()
    algo.fit(trainset)
    
    #recommendations
    testset = trainset.build_anti_testset()
    user_testset = [x for x in testset if x[0] == user_id]
    predictions = algo.test(user_testset)
    
    # Sort and Top recommendations
    predictions.sort(key=lambda x: x.est, reverse=True)
    top_predictions = predictions[:top_n]
    
    #Getting the movie titles
    recommended_ids = [int(pred.iid) for pred in top_predictions]
    recommended_movies = movies[movies['movieId'].isin(recommended_ids)]
    
    return recommended_movies[['movieId', 'title', 'genres']]

#Try the final system
print("final recommendations for user 1:")
final_recommendations = final_recommendation_system(1, 10)
print(final_recommendations)

final recommendations for user 1:
      movieId                                              title  \
52         53                                    Lamerica (1994)   
315       318                   Shawshank Redemption, The (1994)   
892       904                                 Rear Window (1954)   
1132     1148                         Wrong Trousers, The (1993)   
1242     1262                           Great Escape, The (1963)   
1950     2019  Seven Samurai (The Magnificent Seven) (Shichin...   
2128     2197                                   Firelight (1997)   
2434     2503                            Apple, The (Sib) (1998)   
2836     2905                                     Sanjuro (1962)   
3269     3338                             For All Mankind (1989)   

                genres  
52               Drama  
315              Drama  
892   Mystery|Thriller  
1132  Animation|Comedy  
1242     Adventure|War  
1950      Action|Drama  
2128             Drama  
2434             

In [137]:
def recommend_movies_user(user_id, user_item_matrix, user_similarity_df, movies, top_n=5):
    """
    Recommendations based on the user with support for the required number of recommendations - modified version.
    """
    try:
        # user ratings
        user_ratings = user_item_matrix.loc[user_id]
        
        # similar_users
        similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:11]
        
        # Calculate weighted_scores
        weighted_scores = pd.Series(dtype=float)
        for neighbor_id, sim_score in similar_users.items():
            neighbor_ratings = user_item_matrix.loc[neighbor_id]
            weighted_scores = weighted_scores.add(neighbor_ratings * sim_score, fill_value=0)
        
        sim_sums = similar_users.sum()
        predicted_ratings = weighted_scores / sim_sums
        
        # Remove Already seen Movies 
        already_seen = user_ratings[user_ratings.notna()].index
        predicted_ratings = predicted_ratings.drop(already_seen, errors='ignore')
        
        #Getting the best recommendations - use top_n here
        top_movies = predicted_ratings.sort_values(ascending=False).head(top_n)
        
        return movies.set_index("movieId").loc[top_movies.index]["title"].tolist()
    
    except Exception as e:
        print(f"Error in user-based recommendations: {e}")
        # Fallback to popular movies
        movie_stats = ratings.groupby('movieId')['rating'].agg(['count', 'mean'])
        movie_stats = movie_stats[movie_stats['count'] > 10]
        movie_stats['score'] = movie_stats['mean'] * np.log1p(movie_stats['count'])
        top_movies = movie_stats.sort_values('score', ascending=False).head(top_n)
        return movies[movies['movieId'].isin(top_movies.index)]['title'].tolist()

In [138]:
!pip install gradio



In [145]:
import gradio as gr
import pandas as pd
import numpy as np

def item_based_recommendation_gradio(movie_input, top_n=5, min_similarity=0.1):
    """
    Item-based recommendation system for Gradio
    """
    try:
        # Find the movie
        if movie_input.isdigit():
            # If movie ID is entered
            movie_id = int(movie_input)
            movie_info = movies[movies['movieId'] == movie_id]
            if len(movie_info) == 0:
                return "❌ No movie found with this ID", None
            movie_title = movie_info.iloc[0]['title']
            movie_id = movie_info.iloc[0]['movieId']
        else:
            # If movie name is entered
            movie_match = movies[movies['title'].str.contains(movie_input, case=False, na=False)]
            if len(movie_match) == 0:
                return "❌ No movie found with this name", None
            movie_id = movie_match.iloc[0]['movieId']
            movie_title = movie_match.iloc[0]['title']
        
        # Check if movie exists in similarity matrix
        if movie_id not in item_similarity_df.columns:
            return "❌ This movie is not in the database", None
        
        # Get similar movies
        similar_movies = item_similarity_df[movie_id].sort_values(ascending=False)
        
        # Remove the movie itself and apply similarity threshold
        similar_movies = similar_movies.drop(movie_id, errors='ignore')
        similar_movies = similar_movies[similar_movies >= min_similarity]
        
        if len(similar_movies) == 0:
            return "❌ Not enough similar movies found", None
        
        # Get top recommendations
        top_recommendations = similar_movies.head(top_n)
        
        # Create results table
        results = []
        for similar_movie_id, similarity_score in top_recommendations.items():
            movie_info = movies[movies['movieId'] == similar_movie_id].iloc[0]
            results.append({
                "Movie": movie_info['title'],
                "Similarity Score": f"{similarity_score:.3f}",
                "Genres": movie_info['genres'],
                "Movie ID": similar_movie_id
            })
        
        results_df = pd.DataFrame(results)
        return f"🎯 Top {len(results)} movies similar to '{movie_title}':", results_df
        
    except Exception as e:
        return f"❌ Error occurred: {e}", None

def user_based_recommendation_gradio(user_id, top_n=5):
    """
    User-based recommendation system for Gradio
    """
    try:
        user_id = int(user_id)
        
        # Check if user exists
        if user_id not in ratings['userId'].values:
            # For new users, show popular movies
            movie_stats = ratings.groupby('movieId')['rating'].agg(['count', 'mean'])
            movie_stats = movie_stats[movie_stats['count'] > 10]
            movie_stats['score'] = movie_stats['mean'] * np.log1p(movie_stats['count'])
            top_movies = movie_stats.sort_values('score', ascending=False).head(top_n)
            popular_movies = movies[movies['movieId'].isin(top_movies.index)]
            
            results = []
            for _, movie in popular_movies.iterrows():
                results.append({
                    "Movie": movie['title'],
                    "Genres": movie['genres'],
                    "Rating": f"{top_movies.loc[movie['movieId'], 'mean']:.2f}",
                    "Number of Ratings": int(top_movies.loc[movie['movieId'], 'count'])
                })
            
            results_df = pd.DataFrame(results)
            return f"🎯 Popular movies for new user {user_id}:", results_df
        
        # User-based recommendation
        recommendations = recommend_movies_user(user_id, user_item_matrix, user_similarity_df, movies, top_n)
        
        results = []
        for movie_title in recommendations:
            movie_info = movies[movies['title'] == movie_title].iloc[0]
            results.append({
                "Movie": movie_info['title'],
                "Genres": movie_info['genres'],
                "Movie ID": movie_info['movieId']
            })
        
        results_df = pd.DataFrame(results)
        return f"🎯 Top {top_n} recommendations for user {user_id}:", results_df
        
    except Exception as e:
        return f"❌ Error occurred: {e}", None

# Create Gradio interface
with gr.Blocks(title="Movie Recommendation System", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎬 Movie Recommendation System")
    gr.Markdown("## Choose your preferred recommendation method")
    
    with gr.Tab("Item-Based Recommendations"):
        gr.Markdown("### 🔍 Find movies similar to your favorite")
        with gr.Row():
            movie_input = gr.Textbox(label="Movie Name or ID", placeholder="Enter 'Toy Story' or '1'")
            top_n_similar = gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Number of Recommendations")
        
        movie_btn = gr.Button("Find Similar Movies")
        movie_output_text = gr.Textbox(label="Result")
        movie_output_df = gr.Dataframe(label="Recommended Movies")
    
    with gr.Tab("User-Based Recommendations"):
        gr.Markdown("### 👤 Get personalized recommendations based on your preferences")
        with gr.Row():
            user_input = gr.Textbox(label="User ID", placeholder="Enter user ID (e.g., 1)")
            top_n_user = gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Number of Recommendations")
        
        user_btn = gr.Button("Get Recommendations")
        user_output_text = gr.Textbox(label="Result")
        user_output_df = gr.Dataframe(label="Recommended Movies")
    
    with gr.Tab("Explore Movies"):
        gr.Markdown("### 📊 Explore the movie database")
        genre_filter = gr.Dropdown(label="Select Genre", choices=sorted(movies['genres'].str.split('|').explode().unique()))
        explore_btn = gr.Button("Show Movies")
        explore_output = gr.Dataframe(label="Movies")
        
        def explore_movies(genre):
            if genre:
                filtered_movies = movies[movies['genres'].str.contains(genre, na=False)]
                return filtered_movies[['title', 'genres', 'movieId']].head(20)
            return movies[['title', 'genres', 'movieId']].head(20)
    
    # Connect buttons to functions
    movie_btn.click(
        fn=item_based_recommendation_gradio,
        inputs=[movie_input, top_n_similar],
        outputs=[movie_output_text, movie_output_df]
    )
    
    user_btn.click(
        fn=user_based_recommendation_gradio,
        inputs=[user_input, top_n_user],
        outputs=[user_output_text, user_output_df]
    )
    
    explore_btn.click(
        fn=explore_movies,
        inputs=[genre_filter],
        outputs=[explore_output]
    )
    
    gr.Markdown("---")
    gr.Markdown("### 💡 Examples to try:")
    gr.Markdown("- **Movies**: 'Toy Story', 'Titanic'  , 'Avengers' ")
    gr.Markdown("- **Movie IDs**: 1, 318, 858, 260")
    gr.Markdown("- **User IDs**: 1, 5, 10, 15")
    gr.Markdown("- **Genres**: Drama, Comedy, Action, Adventure")

# Run the application
if __name__ == "__main__":
    demo.launch(share=True)  # share=True to create a public link     

* Running on local URL:  http://127.0.0.1:7867
* Running on public URL: https://8b161f11b1b926021e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [142]:
import joblib
from surprise import dump

def save_recommendation_model():
    """
    save model 
    """
    #  train model
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
    algo = SVD(n_factors=100, n_epochs=25, lr_all=0.007, reg_all=0.1)
    trainset = data.build_full_trainset()
    algo.fit(trainset)
    
    # save model
    dump.dump('movie_recommender_model', algo=algo)
    
    #Save reference data
    joblib.dump({
        'movies': movies,
        'ratings': ratings,
        'user_item_matrix': user_item_matrix
    }, 'movie_data.pkl')
    
    print("The model and data have been saved successfully.")

def load_recommendation_model():
    """
    Download the saved model
    """
    try:
        algo = dump.load('movie_recommender_model')[1]
        data = joblib.load('movie_data.pkl')
        return algo, data['movies'], data['ratings'], data['user_item_matrix']
    except:
        print(" No saved model found")
        return None, None, None, None

save_recommendation_model()

The model and data have been saved successfully.
