In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean

# --- Global Setup and Data Loading ---
# Column names for the MovieLens u.data file (tab-separated, no header row).
r_cols = ['user_id', 'item_id', 'rating', 'timestamp']
# Load the ratings. The relative path is resolved against the current working
# directory, so the 'ml-100k' folder must sit next to wherever this is run.
try:
    ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols, encoding='latin-1')
except FileNotFoundError as err:
    # Stop with a real exception instead of print() + exit(): exit() is a helper
    # meant for interactive shells, and every statement below needs `ratings`,
    # so the run has to halt right here with a visible traceback.
    raise FileNotFoundError(
        "Error: ml-100k/u.data file not found. Please ensure the dataset is downloaded and in the correct path."
    ) from err

# Create the user-item utility matrix: one row per user_id, one column per
# item_id, with the rating as the cell value. (user, item) pairs that have no
# rating are left as NaN.
utility_matrix = ratings.pivot_table(index='user_id', columns='item_id', values='rating')

# --- Problem 1.1 Start ---
print("--- Starting Problem 1.1 ---")

# Step 1: mean-center every user's row, so each rating is expressed relative
#         to that user's own average rating.
user_mean_ratings = utility_matrix.mean(axis="columns")
centered_utility_matrix = utility_matrix.sub(user_mean_ratings, axis="index")

# Step 2: replace the missing (unrated) entries with 0 before computing
#         similarities. In centered data a 0 stands for "exactly at the
#         user's average".
centered_utility_matrix_filled = centered_utility_matrix.fillna(0)

# Step 3: pairwise cosine similarity between all user rows, wrapped in a
#         DataFrame labelled by user_id on both axes for easy lookup.
user_similarity_cosine = cosine_similarity(centered_utility_matrix_filled)
user_similarity_df = pd.DataFrame(
    data=user_similarity_cosine,
    index=centered_utility_matrix_filled.index,
    columns=centered_utility_matrix_filled.index,
)

# 4. Find the 10 most similar users for user 1.
user_1_id = 1
if user_1_id in user_similarity_df.index:
    user_1_similarities = user_similarity_df.loc[user_1_id]
    # Rank every user (user 1 included) by similarity, highest first.
    similar_users_to_user_1 = user_1_similarities.sort_values(ascending=False)
    # Exclude user 1 by label rather than by skipping position 0: the user's own
    # entry is not guaranteed to sort first (e.g. when another user ties at the
    # top similarity), and slicing by position would then keep user 1 as their
    # own neighbour while discarding a genuine one.
    top_10_similar_users_for_user_1 = similar_users_to_user_1.drop(labels=user_1_id).head(10)
    print(f"\nTop 10 most similar users to user {user_1_id} (and their similarity scores):\n{top_10_similar_users_for_user_1}")

    # 5. Predict rating for item 508 for user 1.
    #    The prediction is the plain (unweighted) mean of the *original*,
    #    un-centered ratings that the neighbours gave to the item.
    item_id_to_predict = 508
    similar_user_ids = top_10_similar_users_for_user_1.index

    if item_id_to_predict in utility_matrix.columns:
        # Get ratings for item 508 from the original utility matrix.
        ratings_for_item_508_by_similar_users = utility_matrix.loc[similar_user_ids, item_id_to_predict]
        # Consider only neighbours who actually rated the item (NaN = not rated).
        ratings_for_item_508_by_similar_users_rated = ratings_for_item_508_by_similar_users.dropna()

        if not ratings_for_item_508_by_similar_users_rated.empty:
            expected_rating_for_user_1 = ratings_for_item_508_by_similar_users_rated.mean()
            print(f"\nRatings for item {item_id_to_predict} by similar users (only showing rated):\n{ratings_for_item_508_by_similar_users_rated}")
            print(f"Expected rating for item {item_id_to_predict} for user {user_1_id} (based on simple average of similar users' ratings): {expected_rating_for_user_1:.4f}")
        else:
            # No neighbour rated the item, so there is nothing to average.
            print(f"\nNone of the top 10 similar users have rated item {item_id_to_predict}. Cannot predict.")
    else:
        print(f"\nItem {item_id_to_predict} not found in the dataset.")
else:
    print(f"\nUser {user_1_id} not found in the dataset.")

print("--- Problem 1.1 Finished ---")
print("\n" + "="*50 + "\n") # Separator

# --- Problem 1.2 Start ---
print("--- Starting Problem 1.2 ---")

# Step 1: build the profiles (mean-centered rating rows) of the two candidate
#         users. Unrated items are filled with 0.
user_id_A, user_id_B = 200, 15
item_id_target = 95  # the movie being considered for recommendation

# A profile stays None when the user has no row in the centered matrix;
# the comparison code further down checks for that.
user_A_profile_centered = None
if user_id_A not in centered_utility_matrix.index:
    print(f"\nUser {user_id_A} not found in the dataset. Problem 1.2 cannot proceed for this user.")
else:
    user_A_profile_centered = centered_utility_matrix.loc[user_id_A].fillna(0)
    print(f"\nUser {user_id_A} centered profile (first 10 items):\n{user_A_profile_centered.head(10)}")

user_B_profile_centered = None
if user_id_B not in centered_utility_matrix.index:
    print(f"\nUser {user_id_B} not found in the dataset. Problem 1.2 cannot proceed for this user.")
else:
    user_B_profile_centered = centered_utility_matrix.loc[user_id_B].fillna(0)
    print(f"\nUser {user_id_B} centered profile (first 10 items):\n{user_B_profile_centered.head(10)}")


# Step 2: idealized preference vector for the target item -- all zeros except
#         a 1 in the target item's column, i.e. a profile that "only likes"
#         that one item.
num_all_items = len(utility_matrix.columns)
ideal_item_target_preference_vector = np.zeros(num_all_items)

if item_id_target not in utility_matrix.columns:
    print(f"\nTarget Item {item_id_target} not found in the dataset. Problem 1.2 cannot proceed.")
else:
    # Column labels are item ids, so translate the id into its positional index.
    item_column_position = utility_matrix.columns.get_loc(item_id_target)
    ideal_item_target_preference_vector[item_column_position] = 1
    print(f"\nCreated idealized preference vector for item {item_id_target} (set to 1 at position {item_column_position}).")

    # Step 3: cosine similarity and Euclidean distance between each available
    #         user profile and the idealized item vector.
    # cosine_similarity works on 2-D inputs, hence the single-row reshapes.
    ideal_vec_reshaped = ideal_item_target_preference_vector.reshape(1, -1)

    results_A = None
    if user_A_profile_centered is not None:
        user_A_vec = user_A_profile_centered.values.reshape(1, -1)
        cosine_sim_A_item_target = cosine_similarity(user_A_vec, ideal_vec_reshaped)[0, 0]
        euclidean_dist_A_item_target = euclidean(
            user_A_profile_centered.values, ideal_item_target_preference_vector
        )
        results_A = (cosine_sim_A_item_target, euclidean_dist_A_item_target)
        print(f"\nUser {user_id_A} vs. Idealized Preference for Item {item_id_target}:")
        print(f"  Cosine Similarity: {cosine_sim_A_item_target:.4f}")
        print(f"  Euclidean Distance: {euclidean_dist_A_item_target:.4f}")

    results_B = None
    if user_B_profile_centered is not None:
        user_B_vec = user_B_profile_centered.values.reshape(1, -1)
        cosine_sim_B_item_target = cosine_similarity(user_B_vec, ideal_vec_reshaped)[0, 0]
        euclidean_dist_B_item_target = euclidean(
            user_B_profile_centered.values, ideal_item_target_preference_vector
        )
        results_B = (cosine_sim_B_item_target, euclidean_dist_B_item_target)
        print(f"\nUser {user_id_B} vs. Idealized Preference for Item {item_id_target}:")
        print(f"  Cosine Similarity: {cosine_sim_B_item_target:.4f}")
        print(f"  Euclidean Distance: {euclidean_dist_B_item_target:.4f}")

    # Step 4: pick the recommendation target. The higher cosine similarity wins;
    #         the Euclidean distance (smaller is better) only breaks exact ties.
    print("\nRecommendation Decision:")
    if results_A is not None and results_B is not None:
        sim_A, sim_B = results_A[0], results_B[0]
        if sim_A > sim_B:
            print(f"The recommender system would likely suggest movie {item_id_target} to User {user_id_A} (higher cosine similarity).")
        elif sim_B > sim_A:
            print(f"The recommender system would likely suggest movie {item_id_target} to User {user_id_B} (higher cosine similarity).")
        else:
            # Cosine similarities are equal: fall back to the distances.
            dist_A, dist_B = results_A[1], results_B[1]
            if dist_A < dist_B:
                print(f"Both users have equal cosine similarity, but User {user_id_A} has a smaller Euclidean distance, suggesting preference for User {user_id_A}.")
            elif dist_B < dist_A:
                print(f"Both users have equal cosine similarity, but User {user_id_B} has a smaller Euclidean distance, suggesting preference for User {user_id_B}.")
            else:
                print(f"Both users have similar or identical cosine similarity and Euclidean distance to movie {item_id_target}.")
    elif results_A is not None:
        print(f"Only User {user_id_A}'s data is available for comparison. The system would lean towards User {user_id_A} if their similarity is positive.")
    elif results_B is not None:
        print(f"Only User {user_id_B}'s data is available for comparison. The system would lean towards User {user_id_B} if their similarity is positive.")
    else:
        print(f"Neither User {user_id_A} nor User {user_id_B} are in the dataset. Cannot make a recommendation decision.")

print("--- Problem 1.2 Finished ---")

--- Starting Problem 1.1 ---

Top 10 most similar users to user 1 (and their similarity scores):
user_id
773    0.204792
868    0.202321
592    0.196592
880    0.195801
429    0.190661
276    0.187476
916    0.186358
222    0.182415
457    0.182253
8      0.180891
Name: 1, dtype: float64

Ratings for item 508 by similar users (only showing rated):
user_id
592    5.0
880    4.0
429    4.0
276    5.0
222    3.0
Name: 508, dtype: float64
Expected rating for item 508 for user 1 (based on simple average of similar users' ratings): 4.2000
--- Problem 1.1 Finished ---


--- Starting Problem 1.2 ---

User 200 centered profile (first 10 items):
item_id
1     0.967593
2    -0.032407
3     0.000000
4     0.000000
5     0.000000
6     0.000000
7    -0.032407
8    -0.032407
9    -0.032407
10    0.000000
Name: 200, dtype: float64

User 15 centered profile (first 10 items):
item_id
1    -1.875
2     0.000
3     0.000
4     0.000
5     0.000
6     0.000
7    -1.875
8     0.000
9     1.125
10    0.000
