In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of each file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for name in file_names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movielens-100k-dataset/ml-100k/u.occupation
/kaggle/input/movielens-100k-dataset/ml-100k/u1.base
/kaggle/input/movielens-100k-dataset/ml-100k/u.info
/kaggle/input/movielens-100k-dataset/ml-100k/u4.test
/kaggle/input/movielens-100k-dataset/ml-100k/u.item
/kaggle/input/movielens-100k-dataset/ml-100k/README
/kaggle/input/movielens-100k-dataset/ml-100k/u1.test
/kaggle/input/movielens-100k-dataset/ml-100k/ua.test
/kaggle/input/movielens-100k-dataset/ml-100k/u.data
/kaggle/input/movielens-100k-dataset/ml-100k/u5.test
/kaggle/input/movielens-100k-dataset/ml-100k/mku.sh
/kaggle/input/movielens-100k-dataset/ml-100k/u5.base
/kaggle/input/movielens-100k-dataset/ml-100k/u.user
/kaggle/input/movielens-100k-dataset/ml-100k/ub.base
/kaggle/input/movielens-100k-dataset/ml-100k/u4.base
/kaggle/input/movielens-100k-dataset/ml-100k/u2.test
/kaggle/input/movielens-100k-dataset/ml-100k/ua.base
/kaggle/input/movielens-100k-dataset/ml-100k/u3.test
/kaggle/input/movielens-100k-dataset/ml-100k/u.

# Description
Task 5:  Movie Recommendation System Description
 
 Description:
 * Dataset (Recommended): MovieLens 100K Dataset (Kaggle)
 * Build a system that recommends movies based on user similarity
 * Use a user-item matrix to compute similarity scores
 * Recommend top-rated unseen movies for a given user
 * Evaluate performance using precision at K
 
 Tools & Libraries:
 * Python
 * Pandas
 * NumPy
 * Scikit-learn

 Covered Topics: 
 Recommendation systems |  Similarity-based modeling 

Bonus:
 * Implement item-based collaborative filtering.
 * Try matrix factorization (SVD).

# Import Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score

# Load Dataset

In [3]:
# u.data is parsed as tab-delimited text. Column names are supplied here;
# the second field (the item id) is named movie_id in this notebook.
ratings = pd.read_csv(
    "/kaggle/input/movielens-100k-dataset/ml-100k/u.data",
    delimiter="\t",
    names=["user_id", "movie_id", "rating", "timestamp"],
)

In [4]:
# u.item is parsed as pipe-delimited, latin-1 encoded text with no header row,
# so the resulting columns are labelled 0, 1, 2, ...
movies = pd.read_csv(
    "/kaggle/input/movielens-100k-dataset/ml-100k/u.item",
    delimiter="|",
    header=None,
    encoding="latin-1",
)

In [5]:
# Keep only the first two columns (movie id and title) and give them names.
# Selecting by position instead of by the labels 0 and 1 keeps this cell
# re-runnable: once the columns are renamed, the labels 0 and 1 no longer exist.
movies = movies.iloc[:, :2].copy()
movies.columns = ["movie_id", "title"]

In [6]:
print("Ratings shape:", ratings.shape)
print("Movies shape:", movies.shape)

# Only the final expression of a cell is rendered automatically, so a bare
# `ratings.head()` in the middle of the cell is silently discarded.
# Display both previews explicitly.
display(ratings.head())
display(movies.head())

Ratings shape: (100000, 4)
Movies shape: (1682, 2)


Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


# Build User-Item Matrix

In [None]:
# Reshape the long ratings table into a user x movie grid.
# User/movie pairs with no rating are filled with 0.
user_item_matrix = (
    ratings
    .pivot_table(values="rating", index="user_id", columns="movie_id")
    .fillna(0)
)

print("\nUser-Item Matrix shape:", user_item_matrix.shape)

# Compute User Similarity

In [None]:
# Cosine similarity between every pair of user rows, then labelled with the
# user ids on both axes so it can be looked up by id.
user_similarity = cosine_similarity(user_item_matrix)
user_similarity_df = pd.DataFrame(
    data=user_similarity,
    columns=user_item_matrix.index,
    index=user_item_matrix.index,
)

print("\nUser Similarity Matrix shape:", user_similarity_df.shape)

# Recommend Movies

In [None]:
def recommend_movies(user_id, k=5, top_n=10):
    """
    Recommend up to ``top_n`` unseen movies for a user via user-based
    collaborative filtering.

    The ``k`` users most similar to ``user_id`` (according to
    ``user_similarity_df``) are selected and their rows of
    ``user_item_matrix`` are averaged per movie. A 0 entry ("not rated")
    counts as 0 in that average, so movies rated by more neighbours score
    higher. Movies the target user has not rated are then ranked by that
    average.

    Parameters
    ----------
    user_id : label
        A user id present in ``user_similarity_df`` / ``user_item_matrix``.
    k : int, default 5
        Number of nearest neighbours to average over.
    top_n : int, default 10
        Maximum number of movies to return.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``movies`` ordered from best to worst score.
        Fewer than ``top_n`` rows are returned when the neighbours rated
        fewer than ``top_n`` movies the user has not seen.

    Raises
    ------
    KeyError
        If ``user_id`` is not a known user.
    """
    # Remove the target user by label instead of assuming they sort first:
    # another user with similarity 1.0 could otherwise take that slot and
    # leave the target user inside their own neighbourhood.
    neighbour_ids = user_similarity_df[user_id].drop(user_id).nlargest(k).index

    avg_ratings = user_item_matrix.loc[neighbour_ids].mean(axis=0)

    unseen = user_item_matrix.loc[user_id] == 0
    # An average of 0 means no neighbour rated the movie, so there is no
    # evidence for recommending it.
    candidates = avg_ratings[unseen & (avg_ratings > 0)]
    recommendations = candidates.nlargest(top_n)

    # An `isin` filter would return rows in catalogue order and lose the
    # ranking; an inner merge keeps the order of the ranked ids on the left.
    ranked_ids = pd.DataFrame({"movie_id": recommendations.index})
    return ranked_ids.merge(movies, on="movie_id", how="inner")

# Demo: five suggestions for user 10, drawn from their ten nearest neighbours
print("\nTop Recommendations for User 10:")
print(recommend_movies(10, k=10, top_n=5))

# Evaluation: Precision@K

In [None]:
def precision_at_k(user_id, k=5, top_n=10, holdout_frac=0.2, like_threshold=4):
    """
    Precision of the top ``top_n`` user-based recommendations for one user,
    measured against a held-out slice of that user's own ratings.

    Recommendations only ever contain movies that are absent from the profile
    they were built from. Relevance therefore has to be judged on ratings the
    recommender did not see: comparing against ratings that are in the
    profile can never produce a hit. The user's most recent ratings (by
    ``timestamp``) are hidden, recommendations are generated from the
    remaining profile, and a recommendation counts as a hit when it is a
    hidden movie rated at least ``like_threshold``.

    Parameters
    ----------
    user_id : label
        A user id present in ``user_item_matrix``.
    k : int, default 5
        Number of nearest neighbours used to build the recommendations.
    top_n : int, default 10
        Number of recommendations to evaluate (the "K" of precision@K).
    holdout_frac : float, default 0.2
        Fraction of the user's ratings to hide. At least one rating is hidden
        and at least one is kept.
    like_threshold : int or float, default 4
        Minimum rating for a hidden movie to count as relevant.

    Returns
    -------
    float
        hits / number of recommendations, or 0.0 when the user has fewer than
        two ratings or nothing could be recommended.

    Raises
    ------
    KeyError
        If ``user_id`` is not a known user.
    """
    # Copy so that masking the held-out ratings never touches the shared matrix.
    train_profile = user_item_matrix.loc[user_id].copy()

    history = ratings[ratings.user_id == user_id].sort_values(
        "timestamp", kind="mergesort"
    )
    if len(history) < 2:
        return 0.0

    n_holdout = int(round(len(history) * holdout_frac))
    n_holdout = min(max(n_holdout, 1), len(history) - 1)
    held_out = history.tail(n_holdout)
    liked_movies = set(held_out.loc[held_out.rating >= like_threshold, "movie_id"])

    # Hide the held-out ratings from the profile used to find neighbours.
    train_profile.loc[held_out["movie_id"].to_numpy()] = 0

    # Cosine similarity between the masked profile and every other user.
    others = user_item_matrix.drop(index=user_id)
    other_values = others.to_numpy(dtype=float)
    target = train_profile.to_numpy(dtype=float)
    dots = other_values @ target
    denom = np.linalg.norm(other_values, axis=1) * np.linalg.norm(target)
    sims = np.divide(dots, denom, out=np.zeros_like(dots), where=denom > 0)
    neighbour_ids = pd.Series(sims, index=others.index).nlargest(k).index

    avg_ratings = others.loc[neighbour_ids].mean(axis=0)
    # Candidates are movies unseen in the masked profile that at least one
    # neighbour has rated.
    candidates = avg_ratings[(train_profile == 0) & (avg_ratings > 0)]
    recommended_ids = set(candidates.nlargest(top_n).index)

    if not recommended_ids:
        return 0.0

    return len(recommended_ids & liked_movies) / len(recommended_ids)

# Demo: evaluate user 10 with ten neighbours and five recommendations
print("\nPrecision@5 for User 10:", precision_at_k(user_id=10, k=10, top_n=5))

# Bonus: Item-Based CF

In [None]:
# Transposing puts movies on the rows, so this is the cosine similarity
# between every pair of movies. The labelled copy carries movie ids on both axes.
item_similarity = cosine_similarity(user_item_matrix.T)
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    columns=user_item_matrix.columns,
    index=user_item_matrix.columns,
)

def recommend_movies_item_based(user_id, top_n=5):
    """
    Recommend up to ``top_n`` unseen movies using item-based collaborative
    filtering.

    Each movie's score is the dot product of the user's rating row with that
    movie's column of ``item_similarity``, i.e. similarity to the movies the
    user rated, weighted by those ratings. Movies the user has already rated
    are removed before ranking.

    Parameters
    ----------
    user_id : label
        A user id present in ``user_item_matrix``.
    top_n : int, default 5
        Maximum number of movies to return.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``movies`` ordered from highest to lowest score.

    Raises
    ------
    KeyError
        If ``user_id`` is not a known user.
    """
    user_ratings = user_item_matrix.loc[user_id]
    scores = pd.Series(
        np.dot(user_ratings.to_numpy(), item_similarity),
        index=user_item_matrix.columns,
    )

    # Exclude movies the user has already rated.
    watched = user_ratings[user_ratings > 0].index
    scores = scores.drop(watched, errors="ignore")

    top_scores = scores.nlargest(top_n)

    # An `isin` filter would return rows in catalogue order and lose the
    # ranking; an inner merge keeps the order of the ranked ids on the left.
    ranked_ids = pd.DataFrame({"movie_id": top_scores.index})
    return ranked_ids.merge(movies, on="movie_id", how="inner")

# Demo: five item-based suggestions for user 10
print("\nItem-Based Recommendations for User 10:")
print(recommend_movies_item_based(user_id=10, top_n=5))

# Bonus: Matrix Factorization (SVD)

In [None]:
from sklearn.decomposition import TruncatedSVD

# 20-component truncated SVD of the user-item matrix, seeded so that
# repeated runs give the same factors.
svd = TruncatedSVD(random_state=42, n_components=20)

# The transformed matrix is used as the per-user factors (rows follow
# user_item_matrix); the transposed components are the per-movie factors.
user_factors = matrix_svd = svd.fit_transform(user_item_matrix)
item_factors = svd.components_.T

def recommend_movies_svd(user_id, top_n=5):
    """
    Recommend up to ``top_n`` unseen movies using the SVD factors.

    A movie's score is the dot product of its row in ``item_factors`` with
    the user's row in ``user_factors``. Movies the user has already rated
    (non-zero in ``user_item_matrix``) are excluded before ranking.

    Parameters
    ----------
    user_id : label
        A user id present in ``user_item_matrix``.
    top_n : int, default 5
        Maximum number of movies to return.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``movies`` ordered from highest to lowest score.

    Raises
    ------
    KeyError
        If ``user_id`` is not a known user.
    """
    # Rows of user_factors follow the row order of user_item_matrix, so look
    # up the position from its index. `user_id - 1` is only right when the
    # ids happen to be exactly 1..N in order.
    row_pos = user_item_matrix.index.get_loc(user_id)
    user_vector = user_factors[row_pos]

    scores = pd.Series(
        np.dot(item_factors, user_vector), index=user_item_matrix.columns
    )

    # Exclude movies the user has already rated.
    unseen = user_item_matrix.loc[user_id] == 0
    top_scores = scores[unseen].nlargest(top_n)

    # An `isin` filter would return rows in catalogue order and lose the
    # ranking; an inner merge keeps the order of the ranked ids on the left.
    ranked_ids = pd.DataFrame({"movie_id": top_scores.index})
    return ranked_ids.merge(movies, on="movie_id", how="inner")

# Demo: five SVD-based suggestions for user 10
print("\nSVD-Based Recommendations for User 10:")
print(recommend_movies_svd(user_id=10, top_n=5))