In [1]:
import pandas as pd  
import numpy as np   

import matplotlib.pyplot as plt 
import seaborn as sns            

from sklearn.metrics.pairwise import cosine_similarity  

from sklearn.model_selection import train_test_split  


# Load Data

In [2]:
ratings_columns = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=ratings_columns)

ratings.to_csv('ratings.csv', index=False)


ratings.head()


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
movies_columns = ['item_id', 'title', 'release_date', 'video_release_date',
                  'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation',
                  'Children', 'Comedy', 'Crime', 'Documentary', 'Drama',
                  'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
                  'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

movies = pd.read_csv('ml-100k/u.item', sep='|', names=movies_columns, encoding='latin-1')

movies = movies[['item_id', 'title', 'release_date']]

movies.to_csv('movies.csv', index=False)


movies.head()



Unnamed: 0,item_id,title,release_date
0,1,Toy Story (1995),01-Jan-1995
1,2,GoldenEye (1995),01-Jan-1995
2,3,Four Rooms (1995),01-Jan-1995
3,4,Get Shorty (1995),01-Jan-1995
4,5,Copycat (1995),01-Jan-1995


In [4]:
user_columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=user_columns)

users.to_csv('users.csv', index=False)

users.head()


Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [5]:
ratings_with_titles = ratings.merge(movies, on='item_id')
ratings_with_titles.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title,release_date
0,196,242,3,881250949,Kolya (1996),24-Jan-1997
1,186,302,3,891717742,L.A. Confidential (1997),01-Jan-1997
2,22,377,1,878887116,Heavyweights (1994),01-Jan-1994
3,244,51,2,880606923,Legends of the Fall (1994),01-Jan-1994
4,166,346,1,886397596,Jackie Brown (1997),01-Jan-1997


## User Item Matrix

In [6]:
user_item_matrix = ratings_with_titles.pivot_table(index='user_id', columns='title', values='rating')

In [7]:
user_item_matrix = user_item_matrix.fillna(0)

user_item_matrix.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,2.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0


## Cosine Similarity

In [8]:
user_sim_matrix = cosine_similarity(user_item_matrix)


user_similarity_df = pd.DataFrame(user_sim_matrix,
    index=user_item_matrix.index,
    columns=user_item_matrix.index)


user_similarity_df.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.168937,0.048388,0.064561,0.37967,0.429682,0.443097,0.320079,0.078385,0.377733,...,0.372213,0.11986,0.26986,0.193343,0.197949,0.118722,0.315064,0.149086,0.181612,0.399432
2,0.168937,1.0,0.113393,0.179694,0.073623,0.242106,0.108604,0.104257,0.16247,0.161273,...,0.147095,0.310661,0.363328,0.410725,0.322713,0.231096,0.228793,0.162911,0.175273,0.106732
3,0.048388,0.113393,1.0,0.349781,0.021592,0.074018,0.067423,0.084419,0.062039,0.066217,...,0.033885,0.043453,0.16714,0.071288,0.126278,0.026758,0.164539,0.102899,0.136757,0.02699
4,0.064561,0.179694,0.349781,1.0,0.031804,0.068431,0.091507,0.18806,0.101284,0.060859,...,0.054615,0.036784,0.133619,0.196561,0.146058,0.030202,0.196858,0.152041,0.171538,0.058752
5,0.37967,0.073623,0.021592,0.031804,1.0,0.238636,0.374733,0.24893,0.056847,0.201427,...,0.340183,0.08058,0.095284,0.081053,0.148607,0.071612,0.239955,0.139595,0.153799,0.313941


In [9]:
target_user_id = 10

similarities = user_similarity_df[target_user_id].sort_values(ascending=False)

similar_users = similarities.drop(index=target_user_id)

top_similar_users = similar_users.head(5)
top_similar_users


user_id
474    0.556142
6      0.554851
234    0.543002
308    0.538171
537    0.533409
Name: 10, dtype: float64

In [10]:
user_ratings = user_item_matrix.loc[target_user_id]

unwatched_movies = user_ratings[user_ratings.isna()].index.tolist()
unwatched_movies[:5]  

[]

In [11]:
top_users_ratings = user_item_matrix.loc[top_similar_users.index]

movie_scores = {}

for movie in unwatched_movies:
    ratings = top_users_ratings[movie]
    valid_ratings = ratings.dropna()

    if len(valid_ratings) > 0:
        movie_scores[movie] = valid_ratings.mean()


recommendations = pd.DataFrame(movie_scores.items(), columns=['title', 'predicted_rating'])
recommendations = recommendations.sort_values(by='predicted_rating', ascending=False)
recommendations.head(10)


Unnamed: 0,title,predicted_rating


# Model training

In [12]:
train_ratings, test_ratings = train_test_split(
    ratings_with_titles,
    test_size=0.2, 
    shuffle=True,      
    random_state=42       
)

print("Train size:", train_ratings.shape)
print("Test size:", test_ratings.shape)

Train size: (80000, 6)
Test size: (20000, 6)


In [13]:
user_item_matrix_train = train_ratings.pivot_table(
    index='user_id',      
    columns='item_id',  
    values='rating'
)
user_item_matrix_train = user_item_matrix_train.fillna(0)



In [14]:
user_sim_matrix = cosine_similarity(user_item_matrix_train)
user_similarity_df = pd.DataFrame(user_sim_matrix,
    index=user_item_matrix_train.index,
    columns=user_item_matrix_train.index)

In [15]:
def recommend_items(user_id, user_item_matrix, similarity_matrix, k=5, n_recommendations=5):
    
    if user_id not in user_item_matrix.index:
        print("User not found.")
        return []

    
    user_sim_scores = similarity_matrix.loc[user_id]

   
    similar_users = user_sim_scores.drop(user_id).sort_values(ascending=False).head(k)

    
    similar_users_ratings = user_item_matrix.loc[similar_users.index]

    
    weighted_ratings = similar_users_ratings.T.dot(similar_users) / similar_users.sum()

   
    user_rated_items = user_item_matrix.loc[user_id]
    unrated_items = user_rated_items[user_rated_items == 0].index  

   
    recommendations = weighted_ratings.loc[unrated_items].sort_values(ascending=False).head(n_recommendations)
    return recommendations.index.tolist()

In [16]:
recommended_ids = recommend_items(
    user_id=10,
    user_item_matrix=user_item_matrix_train,
    similarity_matrix=user_similarity_df,
)

print("Recommended item IDs:", recommended_ids)


Recommended item IDs: [515, 523, 50, 191, 211]


In [17]:
def precision_at_k(recommendations, test_data, user_id, k=5):
    # Get the set of item_ids that the user actually rated in the test set
    actual_items = test_data[test_data['user_id'] == user_id]['item_id'].tolist()

    if not actual_items:
        return None

    # Count how many of the top-k recommendations are in the actual items
    relevant_recommendations = [item for item in recommendations[:k] if item in actual_items]

    return len(relevant_recommendations) / k

In [18]:
recs = recommend_items(
    user_id=10,
    user_item_matrix=user_item_matrix_train,
    similarity_matrix=user_similarity_df,
    n_recommendations=5
)

print("Recommended items:", recs)
print("Precision at k=5:", precision_at_k(recs, test_ratings,user_id=10, k=5))

Recommended items: [515, 523, 50, 191, 211]
Precision at k=5: 0.6
