Dataframe Creation

In [62]:
import pandas as pd
import numpy as np
import os 

# Default folder the loader functions below read from (their `data_dir` default).
directory = 'ml-100k'
# Column names applied to the tab-separated `u.data` ratings file.
rating_col = ['user_id', 'item_id', 'rating', 'timestamp']
# Column names applied to the pipe-separated `u.item` file; `load_movie_titles`
# keeps only the first three of them via `usecols`.
movie_col = ['movie_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url', 
    'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 
    'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

def load_full_data(data_dir=directory):
    """Read the ``u.data`` ratings file from ``data_dir`` into a DataFrame.

    The file is parsed as tab-separated, latin-1 encoded text whose columns
    are named after ``rating_col``. The ``timestamp`` column is converted
    from epoch seconds to pandas datetimes.

    Args:
        data_dir: Folder that contains ``u.data``.

    Returns:
        DataFrame with the ``rating_col`` columns and a datetime ``timestamp``.
    """
    ratings_path = os.path.join(data_dir, 'u.data')
    ratings = pd.read_csv(ratings_path, sep='\t', names=rating_col, encoding='latin-1')
    # Replace the raw integer column in place (same position) with datetimes.
    return ratings.assign(timestamp=pd.to_datetime(ratings['timestamp'], unit='s'))

def load_movie_titles(data_dir=directory):
    """Read id, title and release date of every movie listed in ``u.item``.

    The file is parsed as pipe-separated, latin-1 encoded text. All columns
    are named after ``movie_col`` but only three of them are loaded.

    Args:
        data_dir: Folder that contains ``u.item``.

    Returns:
        DataFrame with columns ``movie_id``, ``movie_title``, ``release_date``.
    """
    wanted_columns = ['movie_id', 'movie_title', 'release_date']
    item_path = os.path.join(data_dir, 'u.item')
    return pd.read_csv(
        item_path,
        sep='|',
        names=movie_col,
        encoding='latin-1',
        usecols=wanted_columns,
    )


Testing Split

In [63]:
def split_data_custom(df, user_sample_frac=0.20, rating_sample_frac=0.20, random_state=42):
    """Hold out a fraction of the ratings of a random subset of users.

    A fraction ``user_sample_frac`` of the distinct users is drawn as "test
    users". For each test user, a fraction ``rating_sample_frac`` of their
    ratings goes to the test set; all remaining rows (other users plus the
    rest of the test users' ratings) form the training set.

    Four counts are printed: number of test users, rows in ``df``, rows in
    the training set and rows in the test set.

    Args:
        df: Ratings frame with a ``user_id`` column. Index labels are assumed
            to be unique, because held-out rows are removed by label.
        user_sample_frac: Share of distinct users that become test users.
        rating_sample_frac: Share of each test user's ratings to hold out.
        random_state: Seed for both the user draw and the per-user sampling.

    Returns:
        Tuple ``(train_set, test_set)`` of DataFrames with the columns of ``df``.
    """
    # A private generator yields the same draw as seeding NumPy's global RNG
    # with `random_state`, but does not reset random state for the rest of
    # the notebook.
    rng = np.random.RandomState(random_state)

    unique_users = df['user_id'].unique()

    num_test_users = int(len(unique_users) * user_sample_frac)
    test_user_ids = rng.choice(unique_users, size=num_test_users, replace=False)

    is_test_user = df['user_id'].isin(test_user_ids)

    train_non_test_users = df[~is_test_user].copy()

    potential_test_data = df[is_test_user].copy()

    # Sample group by group instead of `groupby(...).apply(...)`: applying a
    # function to groups that still contain the grouping column is deprecated
    # in pandas, and iterating keeps `user_id` in the sampled rows.
    sampled_per_user = [
        user_ratings.sample(frac=rating_sample_frac, random_state=random_state)
        for _, user_ratings in potential_test_data.groupby('user_id')
    ]
    if sampled_per_user:
        test_set = pd.concat(sampled_per_user)
    else:
        # No test users were drawn: return an empty frame with the same columns.
        test_set = potential_test_data.iloc[0:0]

    train_test_users = potential_test_data.drop(test_set.index)

    train_set = pd.concat([train_non_test_users, train_test_users])


    print(len(test_user_ids))
    print(len(df))
    print(len(train_set))
    print(len(test_set))

    return train_set, test_set

Utility-Matrix Creation

In [64]:
def create_utility_matrix(train_df):
    """Build a mean-centered user x item rating matrix.

    Ratings are pivoted to one row per ``user_id`` and one column per
    ``item_id``. Each user's mean over the items they rated is subtracted
    from their row, and cells without a rating are then set to 0.

    Args:
        train_df: Frame with ``user_id``, ``item_id`` and ``rating`` columns.

    Returns:
        Tuple ``(centered_matrix, user_means)``: the zero-filled centered
        matrix and a Series of per-user mean ratings indexed by ``user_id``.
    """
    ratings_wide = train_df.pivot(index='user_id', columns='item_id', values='rating')
    # The mean skips missing cells, so it covers only the items a user rated.
    per_user_mean = ratings_wide.mean(axis='columns')
    centered = ratings_wide.subtract(per_user_mean, axis='index')
    return centered.fillna(0), per_user_mean

# Load the ratings and the movie metadata from the default data folder.
full_df = load_full_data()
movies_df = load_movie_titles()

# Split with the default fractions and seed; the function prints four counts.
train_df, test_df = split_data_custom(full_df)

# Centered, zero-filled user x item matrix and the per-user means removed from it.
utility_matrix_centered, user_means = create_utility_matrix(train_df)

# Quick look at the matrix shape, its top-left corner and the held-out rows.
print(utility_matrix_centered.shape)
print(utility_matrix_centered.iloc[:5, :5])
print(test_df.head())

188
100000
95954
4046
(943, 1677)
item_id         1         2         3         4         5
user_id                                                  
1        1.389706 -0.610294  0.389706 -0.610294 -0.610294
2        0.290323  0.000000  0.000000  0.000000  0.000000
3        0.000000  0.000000  0.000000  0.000000  0.000000
4        0.000000  0.000000  0.000000  0.000000  0.000000
5        1.125714  0.125714  0.000000  0.000000  0.000000
       user_id  item_id  rating           timestamp
3929        10      203       4 1997-10-26 18:52:47
8999        10      164       4 1997-10-26 18:08:53
70178       10      617       5 1997-10-26 18:56:00
29093       10      274       4 1997-10-26 18:08:53
57611       10      478       5 1997-10-26 18:03:24


  test_set = potential_test_data.groupby('user_id', group_keys=False).apply(


Cosine Similarity-based KNN

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
import numpy as np

def calculate_similarity_matrix(utility_matrix):
    """Compute pairwise cosine similarity between the rows of a matrix.

    Args:
        utility_matrix: DataFrame whose rows are the entities to compare.

    Returns:
        Square DataFrame of similarities, labelled on both axes with the
        index of ``utility_matrix``.
    """
    row_labels = utility_matrix.index
    return pd.DataFrame(
        cosine_similarity(utility_matrix),
        index=row_labels,
        columns=row_labels,
    )

def predict_rating(user_id, item_id, utility_matrix, sim_df, user_means, k=20, default_rating=3.5):
    """Predict a rating with user-based k-nearest-neighbour filtering.

    The prediction is the user's mean rating plus the similarity-weighted
    average of the centered ratings that the ``k`` most similar users gave
    to the item, clipped to the range [1, 5].

    A neighbour counts as having rated the item when their centered value
    is non-zero. A rating exactly equal to the neighbour's own mean is
    therefore indistinguishable from "not rated" and is ignored.

    Args:
        user_id: User to predict for.
        item_id: Item to predict.
        utility_matrix: Centered, zero-filled user x item DataFrame.
        sim_df: Square user x user similarity DataFrame.
        user_means: Series (or dict) mapping user id to mean rating.
        k: Maximum number of neighbours to use.
        default_rating: Value returned when nothing is known about the user.

    Returns:
        The predicted rating. Falls back to the user's mean when the item is
        unknown or no neighbour is usable, and to ``default_rating`` when the
        user is unknown as well.
    """
    if item_id not in utility_matrix.columns:
        return user_means.get(user_id, default_rating)

    if user_id not in sim_df.index:
        return default_rating

    baseline = user_means.get(user_id, default_rating)

    item_ratings = utility_matrix[item_id]
    rated_users = item_ratings[item_ratings != 0].index

    # Narrow down to users who rated the item (excluding the target user)
    # BEFORE sorting. This sorts only the candidates rather than the whole
    # similarity row, and the ranking no longer depends on the element order
    # returned by Index.intersection.
    candidates = sim_df.columns.intersection(rated_users)
    candidates = candidates[candidates != user_id]

    top_k_neighbors = (
        sim_df.loc[user_id, candidates]
        .sort_values(ascending=False)
        .head(k)
    )

    if len(top_k_neighbors) == 0:
        return baseline

    neighbor_ratings_centered = utility_matrix.loc[top_k_neighbors.index, item_id].to_numpy()
    neighbor_sims = top_k_neighbors.to_numpy()
    numerator = np.dot(neighbor_ratings_centered, neighbor_sims)
    # Absolute values keep the weights normalised when similarities are negative.
    denominator = np.sum(np.abs(neighbor_sims))

    if denominator == 0:
        return baseline

    pred_rating = baseline + (numerator / denominator)

    return np.clip(pred_rating, 1, 5)

Similarity Matrix

In [66]:

sim_df = calculate_similarity_matrix(utility_matrix_centered)

k_values = [5, 10, 20]
results = {}

# Score every held-out rating once per neighbourhood size and keep MSE/RMSE.
for k in k_values:
    predictions = []
    actuals = []

    # Walk the three needed columns in lockstep instead of materialising a
    # Series for each row.
    held_out = zip(test_df['user_id'], test_df['item_id'], test_df['rating'])
    for user, item, actual in held_out:
        pred = predict_rating(user, item, utility_matrix_centered, sim_df, user_means, k=k)
        predictions.append(pred)
        actuals.append(actual)

    mse = mean_squared_error(actuals, predictions)
    rmse = np.sqrt(mse)
    results[k] = {'mse': mse, 'rmse': rmse}
    print(f"k={k}: rmse={rmse:.4f}")

  ret = a @ b
  ret = a @ b
  ret = a @ b


k=5: rmse=0.9524
k=10: rmse=0.9195
k=20: rmse=0.9108


Results

In [67]:
# Summary of the error metrics collected for every neighbourhood size.
for k, metrics in results.items():
    print(f"k={k}: mse={metrics['mse']:.4f}, rmse={metrics['rmse']:.4f}")

# `predictions` is whatever the evaluation loop left behind, i.e. the run for
# the last entry of `k_values`.
test_df_with_pred = test_df.assign(
    predicted=predictions,
    error=lambda frame: frame['predicted'] - frame['rating'],
    abs_error=lambda frame: frame['error'].abs(),
)

# Rows with the smallest and the largest absolute error.
shown_columns = ['user_id', 'item_id', 'rating', 'predicted']
best_pred = test_df_with_pred.loc[test_df_with_pred['abs_error'].idxmin()]
worst_pred = test_df_with_pred.loc[test_df_with_pred['abs_error'].idxmax()]

print("Highest accuracy")
print(best_pred[shown_columns])

print("Lowest accuracy")
print(worst_pred[shown_columns])

k=5: mse=0.9070, rmse=0.9524
k=10: mse=0.8455, rmse=0.9195
k=20: mse=0.8295, rmse=0.9108
Highest accuracy
user_id       10
item_id      127
rating         5
predicted    5.0
Name: 17971, dtype: object
Lowest accuracy
user_id      850
item_id       98
rating         1
predicted    5.0
Name: 93567, dtype: object


Item-based Collaborative Filtering

In [68]:
def predict_item_based(user_id, item_id, utility_matrix, item_sim_df, k=20,
                       user_means=None, default_rating=3.5):
    """Predict a rating with item-based k-nearest-neighbour filtering.

    The prediction is the user's mean rating plus the similarity-weighted
    average of the user's centered ratings for the ``k`` items most similar
    to ``item_id``, clipped to the range [1, 5].

    An item counts as rated by the user when its centered value is non-zero.
    A rating exactly equal to the user's mean is therefore ignored.

    Args:
        user_id: User to predict for.
        item_id: Item to predict.
        utility_matrix: Centered, zero-filled user x item DataFrame.
        item_sim_df: Square item x item similarity DataFrame.
        k: Maximum number of similar items to use.
        user_means: Series (or dict) mapping user id to mean rating. When
            omitted, the notebook-level ``user_means`` variable is used so
            existing calls keep working.
        default_rating: Value returned when nothing is known about the user.

    Returns:
        The predicted rating. Falls back to the user's mean when the item is
        unknown or no similar rated item is usable (as ``predict_rating``
        does), and to ``default_rating`` when the user is unknown.

    Raises:
        NameError: If ``user_means`` is neither passed nor defined globally.
    """
    if user_means is None:
        # Backward compatibility with callers that rely on the notebook global.
        user_means = globals().get('user_means')
        if user_means is None:
            raise NameError(
                "user_means must be passed to predict_item_based or defined at notebook level"
            )

    # An unknown user has no row to look up; previously this raised KeyError.
    if user_id not in utility_matrix.index:
        return default_rating

    baseline = user_means.get(user_id, default_rating)

    if item_id not in item_sim_df.index:
        return baseline

    user_ratings = utility_matrix.loc[user_id]
    rated_items = user_ratings[user_ratings != 0].index

    # Filter to the items this user rated (excluding the target item) BEFORE
    # sorting, so the ranking does not depend on the element order returned
    # by Index.intersection and only the candidates are sorted.
    candidates = item_sim_df.index.intersection(rated_items)
    candidates = candidates[candidates != item_id]

    top_k_items = (
        item_sim_df.loc[candidates, item_id]
        .sort_values(ascending=False)
        .head(k)
    )

    if len(top_k_items) == 0:
        return baseline

    neighbor_ratings = utility_matrix.loc[user_id, top_k_items.index].to_numpy()
    neighbor_sims = top_k_items.to_numpy()
    numerator = np.dot(neighbor_ratings, neighbor_sims)
    # Absolute values keep the weights normalised when similarities are negative.
    denominator = np.sum(np.abs(neighbor_sims))

    if denominator == 0:
        return baseline

    pred_rating = baseline + (numerator / denominator)

    return np.clip(pred_rating, 1, 5)

# Utility Matrix + Cosine Similarity

# Items become rows so that cosine similarity compares items with each other.
item_utility_matrix = utility_matrix_centered.T
item_sim_matrix = cosine_similarity(item_utility_matrix)
item_labels = item_utility_matrix.index
item_sim_df = pd.DataFrame(item_sim_matrix, index=item_labels, columns=item_labels)

k = 20
ib_predictions = []
ib_actuals = []

# Score every held-out rating with the item-based predictor.
for user, item, actual in zip(test_df['user_id'], test_df['item_id'], test_df['rating']):
    # Items absent from the training matrix get the neutral 3.5 guess.
    pred = (
        predict_item_based(user, item, utility_matrix_centered, item_sim_df, k=k)
        if item in item_sim_df.index
        else 3.5
    )
    ib_predictions.append(pred)
    ib_actuals.append(actual)

ib_mse = mean_squared_error(ib_actuals, ib_predictions)
ib_rmse = np.sqrt(ib_mse)

print(f"item based for k=20: mse={ib_mse:.4f}, rmse={ib_rmse:.4f}")
print(f"user based for k=20: mse={results[20]['mse']:.4f}, rmse={results[20]['rmse']:.4f}")


  ret = a @ b
  ret = a @ b
  ret = a @ b


item based for k=20: mse=0.8065, rmse=0.8980
user based for k=20: mse=0.8295, rmse=0.9108
