In [108]:
import pandas as pd
import numpy as np
import scipy.stats
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

In [106]:
data = pd.read_csv('ml-100k/u.data',header= None , sep = '\t',names=['user_id' , 'item_id' , 'rating' , 'timestamp'])

In [107]:
data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [145]:
data.describe()

Unnamed: 0,user_id,item_id,rating,timestamp
count,100000.0,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986,883528900.0
std,266.61442,330.798356,1.125674,5343856.0
min,1.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448700.0
50%,447.0,322.0,4.0,882826900.0
75%,682.0,631.0,4.0,888260000.0
max,943.0,1682.0,5.0,893286600.0


In [109]:
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

In [123]:
x =train_data[train_data['user_id']==1]
x[x['item_id']==2]

Unnamed: 0,user_id,item_id,rating,timestamp
23171,1,2,3,876893171


In [139]:
num_users = data['user_id'].nunique()
num_items = data['item_id'].nunique()
user_item_matrix = np.zeros((num_users, num_items))

In [140]:
for row in train_data.itertuples():
    user_item_matrix[row.user_id - 1, row.item_id - 1] = row.rating

In [141]:
user_item_matrix

array([[0., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [142]:
row_averages = np.ma.mean(np.ma.masked_equal(user_item_matrix, 0), axis=1).data
column_averages = np.ma.mean(np.ma.masked_equal(user_item_matrix, 0), axis=0).data            

array([3.6875    , 3.80434783, 2.8       , 4.5       , 2.86231884,
       3.61309524, 3.91343284, 3.74074074, 4.06666667, 4.2       ,
       3.5034965 , 4.43589744, 3.15098039, 4.01190476, 2.92682927,
       4.36538462, 3.13043478, 3.87946429, 3.625     , 3.16666667,
       2.65517241, 3.26732673, 3.6446281 , 4.32692308, 4.01538462,
       2.97647059, 3.28571429, 3.7       , 3.56666667, 3.69444444,
       3.73333333, 3.39393939, 3.66666667, 4.05      , 3.        ,
       4.05882353, 3.66666667, 3.80392157, 3.88235294, 2.87878788,
       3.83333333, 3.76666667, 3.70348837, 3.60526316, 3.58974359,
       4.04347826, 3.61111111, 3.7755102 , 2.69822485, 3.47619048,
       3.64705882, 4.26530612, 3.71428571, 3.69090909, 3.28571429,
       3.70063694, 3.48809524, 3.86324786, 3.93442623, 4.07386364,
       2.75      , 3.37288136, 3.09589041, 3.61077844, 3.88888889,
       3.5       , 3.95      , 2.82758621, 3.83928571, 3.5047619 ,
       3.78125   , 3.83783784, 3.8627451 , 3.64285714, 3.21875

In [148]:
column_averages.shape

(1682,)

In [143]:
for i in range(len(user_item_matrix)):
    for j in range(len(user_item_matrix[i])):
        if user_item_matrix[i][j]!=0:
            user_item_matrix[i][j] = user_item_matrix[i][j] - row_averages[i]

# collabrative filter based on items similarity

In [154]:
user_similarity = cosine_similarity(user_item_matrix)
predicted_ratings = []
actual_ratings = []

for row in val_data.itertuples():
    user_id = row.user_id - 1
    item_id = row.item_id - 1
    actual_rating = row.rating
    similar_items = np.argsort(user_similarity[user_id])[-100:]
    mean_all = 3.529860
    predicted_rating = np.mean(user_item_matrix[user_id, similar_items]) +mean_all+column_averages[item_id]-mean_all+row_averages[user_id]-mean_all
    predicted_ratings.append(predicted_rating)
    actual_ratings.append(actual_rating)

predicted_ratings = np.array(predicted_ratings)
actual_ratings = np.array(actual_ratings)
mae = mean_absolute_error(actual_ratings, predicted_ratings)
rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))

print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)

Mean Absolute Error: 0.7613221108926315
Root Mean Squared Error: 0.9735438980351925


## part 2 

In [155]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
ratings_data = pd.read_csv('ml-100k/u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
movies_data = pd.read_csv('ml-100k/u.item', sep='|', encoding='latin-1', header=None)

In [156]:
movies_data.columns = ['item_id', 'title', 'release_date', 'video_release_date', 'imdb_url', 'unknown',
'Action',
'Adventure',
'Animation',
'Childrens',
'Comedy',
'Crime',
'Documentary',
'Drama',
'Fantasy',
'Film-Noir',
'Horror',
'Musical',
'Mystery',
'Romance',
'Sci-Fi',
'Thriller',
'War',
'Western']
merged_data = pd.merge(ratings_data, movies_data[['item_id', 'title', 'unknown',
'Action',
'Adventure',
'Animation',
'Childrens',
'Comedy',
'Crime',
'Documentary',
'Drama',
'Fantasy',
'Film-Noir',
'Horror',
'Musical',
'Mystery',
'Romance',
'Sci-Fi',
'Thriller',
'War',
'Western']], on='item_id')

In [165]:
genre_columns = movies_data.columns[5:-1]
genres = movies_data[genre_columns]
item_similarity = cosine_similarity(genres, genres)

Index(['unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
       'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War'],
      dtype='object')

In [168]:
predicted_ratings = []
actual_ratings = []
for row in val_data.itertuples():
    user_id = row.user_id - 1
    item_id = row.item_id - 1
    actual_rating = row.rating
    similar_items = np.argsort(item_similarity[item_id])[-100:]
    mean_all = 3.529860
    predicted_rating = np.mean(user_item_matrix[user_id, similar_items]) +mean_all+column_averages[item_id]-mean_all+row_averages[user_id]-mean_all
    predicted_ratings.append(predicted_rating)
    actual_ratings.append(actual_rating)

predicted_ratings = np.array(predicted_ratings)
actual_ratings = np.array(actual_ratings)
mae = mean_absolute_error(actual_ratings, predicted_ratings)
rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))

print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)

Mean Absolute Error: 0.7595334732505445
Root Mean Squared Error: 0.9704709003748508


# part3

In [175]:
hybrid_predictions = []

for row in val_data.itertuples():
    user_id = row.user_id - 1
    item_id = row.item_id - 1
    similar_users = np.argsort(user_similarity[user_id])[-100:]
    collaborative_prediction = np.mean(user_item_matrix[similar_users, item_id])
    similar_items = np.argsort(item_similarity[item_id])[-100:]
    content_based_prediction = np.mean(user_item_matrix[user_id, similar_items])
    mean_all = 3.529860
    hybrid_prediction = 0.6 * collaborative_prediction + 0.4 * content_based_prediction+mean_all+column_averages[item_id]-mean_all+row_averages[user_id]-mean_all
    hybrid_predictions.append(hybrid_prediction)
actual_ratings = val_data['rating']
hybrid_predictions = np.array(hybrid_predictions)
mae = mean_absolute_error(actual_ratings, hybrid_predictions)
rmse = np.sqrt(mean_squared_error(actual_ratings, hybrid_predictions))
print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", rmse)

Mean Absolute Error (MAE): 0.7573446110649472
Root Mean Squared Error (RMSE): 0.9721680151203498


# part4

In [211]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=20, random_state=42)
user_factors = svd.fit_transform(user_item_matrix)
item_factors = svd.components_.T
item_similarity = cosine_similarity(item_factors)
hybrid_predictions = []

for row in val_data.itertuples():
    user_id = row.user_id - 1
    item_id = row.item_id - 1
    similar_items = np.argsort(item_similarity[item_id])[-100:]
    content_based_prediction = np.mean(user_item_matrix[user_id, similar_items])
    hybrid_prediction = 0.8 * item_factors[item_id] @ user_factors[user_id] + 0.2 * content_based_prediction+mean_all+column_averages[item_id]-mean_all+row_averages[user_id]-mean_all
    hybrid_predictions.append(hybrid_prediction)

actual_ratings = val_data['rating']
hybrid_predictions = np.array(hybrid_predictions)

mae = mean_absolute_error(actual_ratings, hybrid_predictions)
rmse = np.sqrt(mean_squared_error(actual_ratings, hybrid_predictions))

print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", rmse)

Mean Absolute Error (MAE): 1.288803190354534
Root Mean Squared Error (RMSE): 1.5933181951144897


In [195]:
val_data

Unnamed: 0,user_id,item_id,rating,timestamp
75721,877,381,4,882677345
80184,815,602,3,878694269
19864,94,431,4,891721716
76699,416,875,2,876696938
92991,500,182,2,883873556
...,...,...,...,...
32595,72,591,5,880035708
29313,523,393,5,883702411
37862,606,287,4,880921656
53421,650,612,4,891369656


In [218]:
from sklearn.cluster import KMeans
user_item_svd = svd.fit_transform(user_item_matrix)
kmeans = KMeans(n_clusters=5, random_state=42)
cluster_labels = kmeans.fit_predict(genre_matrix)
hybrid_predictions = []

for row in val_data.itertuples():
    user_id = row.user_id - 1
    item_id = row.item_id - 1
    collaborative_prediction = item_factors[item_id] @ user_factors[user_id]
    target_cluster = cluster_labels[item_id]
    cluster_items = np.where(cluster_labels == target_cluster)[0]
    cluster_ratings = user_item_matrix[user_id, cluster_items]
    cluster_average = np.mean(cluster_ratings)
    hybrid_prediction = 0.7 * collaborative_prediction + 0.3 * cluster_average+mean_all+column_averages[item_id]-mean_all+row_averages[user_id]-mean_all

    hybrid_predictions.append(hybrid_prediction)
    top_items = np.argsort(hybrid_predictions)[-k:]  
    true_items = user_item_matrix[user_id, :].nonzero()[0]  

    num_common_items = len(set(top_items) & set(true_items))
    precision = num_common_items / k
    recall = num_common_items / len(true_items)

    precision_sum += precision
    recall_sum += recall
actual_ratings = val_data['rating']
hybrid_predictions = np.array(hybrid_predictions)

mae = mean_absolute_error(actual_ratings, hybrid_predictions)
rmse = np.sqrt(mean_squared_error(actual_ratings, hybrid_predictions))

print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", rmse)



Mean Absolute Error (MAE): 1.1830442738216032
Root Mean Squared Error (RMSE): 1.4657327686906139
