##  Task 5:  Movie Recommendation System Description

In [None]:
# Load libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.decomposition import TruncatedSVD

In [28]:
ratings_path = 'ml-100k/u.data'
movies_path = 'ml-100k/u.item'

# Load ratings
ratings = pd.read_csv(ratings_path, sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp'])

# Load movies (only need id and title)
movies = pd.read_csv(movies_path, sep='|', encoding='latin-1', usecols=[0, 1], names=['movie_id', 'title'])
movies['movie_id'] = movies['movie_id'].astype(int)  # Ensure int type

# Merge for movie titles
ratings = pd.merge(ratings, movies, on='movie_id')

# Display info
print(ratings.head())
print(ratings.info())
print(f"Unique users: {ratings['user_id'].nunique()}, Unique movies: {ratings['movie_id'].nunique()}")

   user_id  movie_id  rating  timestamp                       title
0      196       242       3  881250949                Kolya (1996)
1      186       302       3  891717742    L.A. Confidential (1997)
2       22       377       1  878887116         Heavyweights (1994)
3      244        51       2  880606923  Legends of the Fall (1994)
4      166       346       1  886397596         Jackie Brown (1997)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   user_id    100000 non-null  int64 
 1   movie_id   100000 non-null  int64 
 2   rating     100000 non-null  int64 
 3   timestamp  100000 non-null  int64 
 4   title      100000 non-null  object
dtypes: int64(4), object(1)
memory usage: 3.8+ MB
None
Unique users: 943, Unique movies: 1682


In [29]:
# Check sparsity
total_possible = ratings['user_id'].nunique() * ratings['movie_id'].nunique()
sparsity = (len(ratings) / total_possible) * 100
print(f"Sparsity: {sparsity:.2f}%")

Sparsity: 6.30%


In [30]:
# Split into train/test
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

In [32]:
# User-item matrix (train data)
user_item_matrix = train_data.pivot_table(index='user_id', columns='movie_id', values='rating')
user_item_matrix = user_item_matrix.fillna(0)
print(user_item_matrix.shape)

(943, 1653)


In [34]:
user_similarity = cosine_similarity(user_item_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)
print(user_similarity_df.head())

user_id       1         2         3         4         5         6         7    \
user_id                                                                         
1        1.000000  0.136196  0.030424  0.026203  0.284613  0.331412  0.319056   
2        0.136196  1.000000  0.114644  0.168220  0.093128  0.162165  0.095848   
3        0.030424  0.114644  1.000000  0.346894  0.000000  0.085071  0.032829   
4        0.026203  0.168220  0.346894  1.000000  0.011848  0.051287  0.075209   
5        0.284613  0.093128  0.000000  0.011848  1.000000  0.168527  0.298438   

user_id       8         9         10   ...       934       935       936  \
user_id                                ...                                 
1        0.274139  0.083486  0.281396  ...  0.277459  0.084849  0.205849   
2        0.091360  0.149476  0.125701  ...  0.149359  0.268977  0.320095   
3        0.053875  0.060177  0.052552  ...  0.021713  0.017707  0.154299   
4        0.142100  0.060465  0.035202  ...  0.034908

In [35]:
def recommend_movies(user_id, n=5, k_similar=10):
    # Get similar users (top K excluding self)
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:k_similar+1].index
    
    # Ratings from similar users
    similar_ratings = user_item_matrix.loc[similar_users]
    
    # Weighted average ratings (by similarity)
    sim_scores = user_similarity_df.loc[similar_users, user_id]
    predicted_ratings = np.dot(similar_ratings.T, sim_scores) / sim_scores.sum()
    
    # DataFrame of predictions
    predictions = pd.Series(predicted_ratings, index=user_item_matrix.columns)
    
    # Unseen movies (user's ratings == 0)
    unseen = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] == 0].index
    
    # Top N unseen with highest predicted ratings
    recommendations = predictions[unseen].sort_values(ascending=False)[:n]
    
    # Create DF from recommendations and merge with movies
    rec_df = pd.DataFrame({'movie_id': recommendations.index.astype(int), 'predicted_rating': recommendations.values})
    rec_movies = rec_df.merge(movies, on='movie_id', how='left')
    
    # Check for 'title' (raise if missing)
    if 'title' not in rec_movies.columns:
        print("Merge failed. movies columns:", movies.columns.tolist())
        print("rec_df movie_id sample:", rec_df['movie_id'].head())
        print("movies movie_id sample:", movies['movie_id'].head())
        raise KeyError("'title' column missing after merge. Check movie_id matches/types or movies load.")
    
    return rec_movies[['title', 'predicted_rating']]

# Example: Recommend for user 1
print(recommend_movies(user_id=1, n=5))

                                    title  predicted_rating
0                            Fargo (1996)          3.802328
1                       Get Shorty (1995)          3.611206
2               Return of the Jedi (1983)          3.596395
3  One Flew Over the Cuckoo's Nest (1975)          3.512580
4       E.T. the Extra-Terrestrial (1982)          3.504424


In [38]:
movies = pd.read_csv('ml-100k/u.item', sep='|', encoding='latin-1', usecols=[0, 1], names=['movie_id', 'title'])
print(movies.columns.tolist())  # Should be ['movie_id', 'title']
print(movies.head())  # Check data
print(movies['movie_id'].dtype)  # int64

['movie_id', 'title']
   movie_id              title
0         1   Toy Story (1995)
1         2   GoldenEye (1995)
2         3  Four Rooms (1995)
3         4  Get Shorty (1995)
4         5     Copycat (1995)
int64


In [36]:
def recommend_movies(user_id, n=5, k_similar=10):
    # Get similar users (top K excluding self)
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:k_similar+1].index
    
    # Ratings from similar users
    similar_ratings = user_item_matrix.loc[similar_users]
    
    # Weighted average ratings (by similarity)
    sim_scores = user_similarity_df.loc[similar_users, user_id]
    predicted_ratings = np.dot(similar_ratings.T, sim_scores) / sim_scores.sum()
    
    # DataFrame of predictions
    predictions = pd.Series(predicted_ratings, index=user_item_matrix.columns)
    
    # Unseen movies (user's ratings == 0)
    unseen = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] == 0].index
    
    # Top N unseen with highest predicted ratings
    recommendations = predictions[unseen].sort_values(ascending=False)[:n]
    
    # Create DF from recommendations and merge with movies
    rec_df = pd.DataFrame({'movie_id': recommendations.index.astype(int), 'predicted_rating': recommendations.values})
    rec_movies = rec_df.merge(movies, on='movie_id', how='left')
    
    # Check for 'title' (raise if missing)
    if 'title' not in rec_movies.columns:
        print("Merge failed. movies columns:", movies.columns.tolist())
        print("rec_df movie_id sample:", rec_df['movie_id'].head())
        print("movies movie_id sample:", movies['movie_id'].head())
        raise KeyError("'title' column missing after merge. Check movie_id matches/types or movies load.")
    
    return rec_movies[['title', 'predicted_rating']]

# Example: Recommend for user 1
print(recommend_movies(user_id=1, n=5))

                                    title  predicted_rating
0                            Fargo (1996)          3.802328
1                       Get Shorty (1995)          3.611206
2               Return of the Jedi (1983)          3.596395
3  One Flew Over the Cuckoo's Nest (1975)          3.512580
4       E.T. the Extra-Terrestrial (1982)          3.504424


## BONUS

In [37]:
svd = TruncatedSVD(n_components=50, random_state=42)
matrix_reduced = svd.fit_transform(user_item_matrix)

# Reconstructed matrix
predicted_matrix = np.dot(matrix_reduced, svd.components_)
predicted_df = pd.DataFrame(predicted_matrix, index=user_item_matrix.index, columns=user_item_matrix.columns)

def recommend_svd(user_id, n=5):
    predictions = predicted_df.loc[user_id]
    unseen = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] == 0].index
    recommendations = predictions[unseen].sort_values(ascending=False)[:n]
    
    # Create DF from recommendations and merge
    rec_df = pd.DataFrame({'movie_id': recommendations.index.astype(int), 'predicted_rating': recommendations.values})
    rec_movies = rec_df.merge(movies, on='movie_id', how='left')
    
    # Check for 'title'
    if 'title' not in rec_movies.columns:
        print("Merge failed in SVD.")
        raise KeyError("'title' column missing after merge.")
    
    return rec_movies[['title', 'predicted_rating']]

# Example
print(recommend_svd(user_id=1, n=5))

                        title  predicted_rating
0                 Heat (1995)          3.278345
1  Blues Brothers, The (1980)          3.035114
2           Piano, The (1993)          2.823149
3       Reservoir Dogs (1992)          2.717629
4         My Left Foot (1989)          2.705065
