In [1]:
import pandas as pd
import numpy as np
import zipfile
import io
import requests
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# URL for the MovieLens 100K dataset
url = "https://files.grouplens.org/datasets/movielens/ml-100k.zip"

# Download the dataset
response = requests.get(url)

# Extract the dataset
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    z.extractall("ml-100k")


In [2]:
# Load the data files into dataframes
ratings = pd.read_csv('ml-100k/ml-100k/u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
movies = pd.read_csv('ml-100k/ml-100k/u.item', sep='|', encoding='latin-1', names=['item_id', 'title'] + [f'col_{i}' for i in range(22)], usecols=[0, 1])

# Display the first few rows of each dataframe
print("Ratings DataFrame:")
display(ratings.head())

print("\nMovies DataFrame:")
movies.head()


Ratings DataFrame:


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596



Movies DataFrame:


Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [3]:
merged_df = pd.merge(ratings, movies, on='item_id')

# Display the first few rows of the merged DataFrame
merged_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [4]:
# Create the user-item matrix
user_item_matrix = merged_df.pivot_table(index='user_id', columns='title', values='rating')

# Fill NaN values with 0 or some other placeholder (e.g., user average)
user_item_matrix = user_item_matrix.fillna(0)

# Display the user-item matrix
user_item_matrix.head()


title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,2.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0


In [5]:
# Apply Truncated SVD
svd = TruncatedSVD(n_components=20, random_state=42)
latent_matrix = svd.fit_transform(user_item_matrix)

# Display the reduced dimensionality matrix
print(latent_matrix.shape)


(943, 20)


In [6]:
# Compute the cosine similarity between users
user_similarity = cosine_similarity(latent_matrix)

# Create a DataFrame for easy lookup
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

# Display the user similarity matrix
user_similarity_df.head()


user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.352421,0.159204,0.241997,0.763114,0.694862,0.702747,0.617133,0.517059,0.707255,...,0.7405,0.258406,0.480642,0.421826,0.353962,0.301545,0.682209,0.427033,0.423546,0.730744
2,0.352421,1.0,0.495369,0.462954,0.149889,0.401112,0.153989,0.203086,0.698293,0.340898,...,0.283101,0.608623,0.826804,0.841416,0.600674,0.571739,0.580854,0.524044,0.399996,0.211992
3,0.159204,0.495369,1.0,0.852333,0.010574,0.161575,0.120671,0.199159,0.40814,0.167913,...,0.061299,0.127099,0.414695,0.248562,0.288375,0.078764,0.447042,0.28331,0.478056,0.075444
4,0.241997,0.462954,0.852333,1.0,0.15853,0.175902,0.208786,0.390941,0.57112,0.215603,...,0.17688,0.21994,0.412746,0.319548,0.432099,0.152847,0.546821,0.497414,0.46473,0.201268
5,0.763114,0.149889,0.010574,0.15853,1.0,0.43473,0.645055,0.490171,0.329671,0.435519,...,0.734086,0.153042,0.230862,0.198867,0.310798,0.194146,0.539832,0.392111,0.330369,0.565436


In [7]:
def recommend_movies(user_id, user_item_matrix, user_similarity_df, n_recommendations=10):
    # Get the ratings of the user
    user_ratings = user_item_matrix.loc[user_id]

    # Get similar users sorted by similarity score
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)

    # Calculate weighted ratings from similar users
    weighted_ratings = np.dot(similar_users, user_item_matrix)
    recommendation_scores = weighted_ratings / similar_users.sum()

    # Exclude movies the user has already rated
    user_watched_movies = user_ratings[user_ratings > 0].index
    recommendation_scores = pd.Series(recommendation_scores, index=user_item_matrix.columns)
    recommendation_scores = recommendation_scores.drop(user_watched_movies)

    # Return the top N recommended movies
    top_recommendations = recommendation_scores.sort_values(ascending=False).head(n_recommendations)
    return top_recommendations

# Example: Recommend top 10 movies for user with ID 196
recommended_movies = recommend_movies(196, user_item_matrix, user_similarity_df, n_recommendations=10)
print("Top 10 movie recommendations:")
print(recommended_movies)


Top 10 movie recommendations:
title
Star Wars (1977)                    2.710642
Fargo (1996)                        2.321488
Return of the Jedi (1983)           2.149164
Contact (1997)                      2.097875
Raiders of the Lost Ark (1981)      1.892970
Godfather, The (1972)               1.870347
Toy Story (1995)                    1.847340
Silence of the Lambs, The (1991)    1.835729
Pulp Fiction (1994)                 1.772905
Scream (1996)                       1.758794
dtype: float64


In [8]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.
