# Movie Recommendation System using MovieLens 100K Dataset
## Objectives
- Load and explore the MovieLens 100K dataset
- Build User-User collaborative filtering
- Recommend top-rated unseen movies to a user
- Evaluate using Precision@K
- (Bonus) Implement Item-Item filtering and SVD-based matrix factorization

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD

## Load and Explore Dataset

In [3]:
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols)

m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure',
          'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
          'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=m_cols, encoding='latin-1')

ratings.head(), movies.head()

(   user_id  movie_id  rating  timestamp
 0      196       242       3  881250949
 1      186       302       3  891717742
 2       22       377       1  878887116
 3      244        51       2  880606923
 4      166       346       1  886397596,
    movie_id              title release_date  video_release_date  \
 0         1   Toy Story (1995)  01-Jan-1995                 NaN   
 1         2   GoldenEye (1995)  01-Jan-1995                 NaN   
 2         3  Four Rooms (1995)  01-Jan-1995                 NaN   
 3         4  Get Shorty (1995)  01-Jan-1995                 NaN   
 4         5     Copycat (1995)  01-Jan-1995                 NaN   
 
                                             IMDb_URL  unknown  Action  \
 0  http://us.imdb.com/M/title-exact?Toy%20Story%2...        0       0   
 1  http://us.imdb.com/M/title-exact?GoldenEye%20(...        0       1   
 2  http://us.imdb.com/M/title-exact?Four%20Rooms%...        0       0   
 3  http://us.imdb.com/M/title-exact?Get%20Shor

## Create User-Item Rating Matrix

In [4]:
user_item_matrix = ratings.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
user_item_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## User-User Collaborative Filtering

In [6]:
def recommend_user_based(user_id, num_recommendations=5):
    sim_scores = user_similarity_df[user_id].sort_values(ascending=False).drop(user_id)
    top_neighbors = sim_scores.index[:10]
    
    neighbors_ratings = user_item_matrix.loc[top_neighbors]
    neighbor_weights = sim_scores[top_neighbors].values.reshape(-1, 1)
    
    weighted_sum = np.dot(neighbor_weights.T, neighbors_ratings.fillna(0))
    sum_weights = np.abs(neighbor_weights).sum()
    preds = weighted_sum.flatten() / sum_weights

    recommendations = pd.Series(preds, index=user_item_matrix.columns)


    already_rated = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] > 0].index
    recommendations = recommendations.drop(already_rated)

    top_movies = recommendations.nlargest(num_recommendations).index
    return movies.set_index('movie_id').loc[top_movies]['title']

# Test
print(recommend_user_based(1, 5))


movie_id
318                              Schindler's List (1993)
474    Dr. Strangelove or: How I Learned to Stop Worr...
655                                   Stand by Me (1986)
423                    E.T. the Extra-Terrestrial (1982)
403                                        Batman (1989)
Name: title, dtype: object


## Evaluation: Precision@K

In [7]:
def precision_at_k(predictions, actual, k=5):
    pred_k = predictions.iloc[:k]
    return np.intersect1d(pred_k.index, actual).shape[0] / k

## Bonus: Item-Item Collaborative Filtering

In [8]:
item_similarity = cosine_similarity(user_item_matrix.T)
item_similarity_df = pd.DataFrame(item_similarity, index=user_item_matrix.columns, columns=user_item_matrix.columns)

def recommend_item_based(user_id, num_recommendations=5):
    user_ratings = user_item_matrix.loc[user_id]
    scores = item_similarity_df.dot(user_ratings).div(item_similarity_df.sum(axis=1))
    scores = scores.drop(user_ratings[user_ratings > 0].index)
    top_items = scores.nlargest(num_recommendations).index
    return movies.set_index('movie_id').loc[top_items]['title']

recommend_item_based(1, 5)

movie_id
1618                      King of New York (1990)
1682    Scream of Stone (Schrei aus Stein) (1991)
1130                        Jupiter's Wife (1994)
1526                               Witness (1985)
1619                       All Things Fair (1996)
Name: title, dtype: object

## Bonus: Matrix Factorization (SVD)

In [9]:
svd = TruncatedSVD(n_components=20, random_state=42)
matrix = svd.fit_transform(user_item_matrix)
corr = np.dot(matrix, matrix.T)
svd_sim_df = pd.DataFrame(corr, index=user_item_matrix.index, columns=user_item_matrix.index)
svd_sim_df.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2216.334261,351.098427,99.245953,118.239727,1043.763076,1431.109653,2283.109806,632.911258,153.645922,1396.883201,...,1202.128946,198.535987,729.218275,286.106308,469.072861,239.810274,769.185487,220.728806,438.681691,1155.358236
2,351.098427,446.323942,133.642943,98.417865,92.225771,374.756971,220.27981,93.855814,92.122058,302.362411,...,212.511652,213.924114,549.369213,256.786928,357.675454,207.499338,299.626879,120.711611,186.817483,147.175443
3,99.245953,133.642943,167.256175,111.98297,3.04994,93.244894,109.404939,56.433731,32.7968,89.025468,...,27.709752,27.084764,167.632505,44.100085,104.184777,17.350375,139.832469,38.539748,137.376984,32.815137
4,118.239727,98.417865,111.98297,104.44474,47.611137,78.710143,150.165686,88.002699,36.996514,93.090784,...,61.670855,37.691265,133.830963,45.817738,125.672187,27.687573,136.955407,55.970612,104.864867,69.932366
5,1043.763076,92.225771,3.04994,47.611137,858.448226,555.354959,1316.488111,315.775917,60.468419,540.627961,...,745.395928,75.712328,210.322672,82.792773,258.494433,98.12883,387.280616,125.910303,214.555971,557.136967
